Skip to content

块设备驱动:原理与架构

块设备 vs 字符设备

| 特性 | 块设备 | 字符设备 |
| --- | --- | --- |
| 访问单位 | 固定大小的块(512B/4KB) | 字节流 |
| 缓冲 | 有页缓存(Page Cache) | 无 |
| 随机访问 | 支持 | 通常不支持 |
| 典型设备 | 硬盘、SSD、eMMC、ramdisk | 串口、GPIO |

块设备 I/O 路径

用户空间 read()/write()
    │
    ▼ VFS
页缓存(Page Cache)
    │  缓存未命中时
    ▼
通用块层(Generic Block Layer)
    │  bio 结构
    ▼
I/O 调度器(CFQ / deadline / mq-deadline / kyber / BFQ)
    │  request 队列
    ▼
块设备驱动(request_fn / blk-mq)
    │
    ▼
硬件(SATA / NVMe / eMMC / ramdisk)

核心数据结构

bio — 基本 I/O 单元

c
struct bio {
    struct block_device *bi_bdev;
    blk_opf_t            bi_opf;       /* 操作类型:REQ_OP_READ/WRITE */
    sector_t             bi_iter.bi_sector;  /* 起始扇区 */
    unsigned int         bi_iter.bi_size;    /* 总字节数 */
    struct bio_vec      *bi_io_vec;    /* scatter-gather 列表 */
    unsigned short       bi_vcnt;      /* bio_vec 数量 */
    bio_end_io_t        *bi_end_io;    /* 完成回调 */
    void                *bi_private;
};

request — 合并后的 I/O 请求

多个扇区相邻的 bio 会被块层和 I/O 调度器合并成一个 request,从而减少下发给设备的请求数量(对机械硬盘而言,还能减少磁头寻道次数)。

blk-mq 多队列驱动(现代方式)

自 Linux 5.0 起,传统的单队列 request_fn 接口已被移除,所有基于 request 的块设备驱动都必须使用 blk-mq(Multi-Queue Block Layer):

c
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/highmem.h>

/*
 * Sector size in bytes. Recent kernel block headers already define
 * SECTOR_SIZE as (1 << SECTOR_SHIFT), so only define it here when the
 * headers did not; an unconditional #define triggers a macro-redefinition
 * warning.
 */
#ifndef SECTOR_SIZE
#define SECTOR_SIZE   512
#endif
#define DISK_SECTORS  (64 * 1024 * 2)   /* capacity in sectors: 64 MB ramdisk */

/* Per-device state of the myblk RAM-backed block device. */
struct myblk_priv {
    struct gendisk      *disk;          /* disk object registered with the block layer */
    struct blk_mq_tag_set tag_set;      /* blk-mq tag set the disk's queue is built on */
    u8                  *data;          /* ramdisk backing store */
    spinlock_t           lock;          /* NOTE(review): no code visible here takes this lock */
};

/* 处理单个 I/O 请求 */
static blk_status_t myblk_queue_rq(struct blk_mq_hw_ctx *hctx,
                                    const struct blk_mq_queue_data *bd)
{
    struct request *rq = bd->rq;
    struct myblk_priv *priv = hctx->queue->queuedata;
    struct bio_vec bvec;
    struct req_iterator iter;
    sector_t sector = blk_rq_pos(rq);
    blk_status_t status = BLK_STS_OK;

    blk_mq_start_request(rq);

    /* 遍历 request 中的所有 bio_vec */
    rq_for_each_segment(bvec, rq, iter) {
        size_t len = bvec.bv_len;
        void *buf = page_address(bvec.bv_page) + bvec.bv_offset;
        loff_t offset = sector * SECTOR_SIZE;

        if (offset + len > DISK_SECTORS * SECTOR_SIZE) {
            status = BLK_STS_IOERR;
            break;
        }

        if (rq_data_dir(rq) == WRITE)
            memcpy(priv->data + offset, buf, len);
        else
            memcpy(buf, priv->data + offset, len);

        sector += len / SECTOR_SIZE;
    }

    blk_mq_end_request(rq, status);
    return BLK_STS_OK;
}

/* blk-mq operations table: only .queue_rq is set, to myblk_queue_rq. */
static const struct blk_mq_ops myblk_mq_ops = {
    .queue_rq = myblk_queue_rq,
};

/*
 * myblk_init - allocate the ramdisk and register it as block device "myblk".
 *
 * Allocates the private state and the backing buffer, sets up a single
 * hardware queue blk-mq tag set, allocates a gendisk on top of it and
 * registers the disk.
 *
 * Return: 0 on success, or a negative errno; on failure everything that was
 * allocated here is released again.
 *
 * NOTE(review): on success priv is reachable only through
 * disk->private_data / queue->queuedata from this function's point of view;
 * a module exit path would need its own handle to tear the device down.
 */
static int __init myblk_init(void)
{
    struct myblk_priv *priv;
    int ret;

    priv = kzalloc(sizeof(*priv), GFP_KERNEL);
    if (!priv)
        return -ENOMEM;

    /* The lock is part of the private state; never leave it uninitialized. */
    spin_lock_init(&priv->lock);

    /* Backing store for the ramdisk; size computed in size_t. */
    priv->data = vzalloc((size_t)DISK_SECTORS * SECTOR_SIZE);
    if (!priv->data) {
        ret = -ENOMEM;
        goto err_free_priv;
    }

    /* Describe the blk-mq tag set: one hardware queue, 128 tags. */
    priv->tag_set.ops       = &myblk_mq_ops;
    priv->tag_set.nr_hw_queues = 1;
    priv->tag_set.queue_depth  = 128;
    priv->tag_set.numa_node    = NUMA_NO_NODE;
    priv->tag_set.cmd_size     = 0;
    priv->tag_set.flags        = BLK_MQ_F_SHOULD_MERGE;

    ret = blk_mq_alloc_tag_set(&priv->tag_set);
    if (ret)
        goto err_free_data;

    /* Allocate the gendisk; priv is passed as the queue's driver data. */
    priv->disk = blk_mq_alloc_disk(&priv->tag_set, priv);
    if (IS_ERR(priv->disk)) {
        ret = PTR_ERR(priv->disk);
        goto err_free_tags;
    }

    /* Disk parameters. */
    snprintf(priv->disk->disk_name, DISK_NAME_LEN, "myblk");
    /*
     * major == 0 asks add_disk() to allocate a device number dynamically
     * from the extended range. In that mode minors must be 0 as well:
     * add_disk() warns and fails with -EINVAL if major is 0 while minors
     * is non-zero.
     */
    priv->disk->major       = 0;
    priv->disk->first_minor = 0;
    priv->disk->minors      = 0;
    /*
     * NOTE(review): myblk_fops is not defined in the code visible here; it
     * must be provided elsewhere as a struct block_device_operations.
     */
    priv->disk->fops        = &myblk_fops;
    priv->disk->private_data = priv;

    blk_queue_logical_block_size(priv->disk->queue, SECTOR_SIZE);
    set_capacity(priv->disk, DISK_SECTORS);

    /* Register the disk with the block layer. */
    ret = add_disk(priv->disk);
    if (ret)
        goto err_put_disk;

    pr_info("myblk: registered %s, %llu sectors\n",
            priv->disk->disk_name, (u64)DISK_SECTORS);
    return 0;

    /* Unwind in reverse order of acquisition. */
err_put_disk:
    put_disk(priv->disk);
err_free_tags:
    blk_mq_free_tag_set(&priv->tag_set);
err_free_data:
    vfree(priv->data);
err_free_priv:
    kfree(priv);
    return ret;
}

I/O 调度器

bash
# Show the current scheduler
cat /sys/block/sda/queue/scheduler

# Switch the scheduler
echo mq-deadline > /sys/block/sda/queue/scheduler

# Where each scheduler fits
# none        - NVMe SSDs (the hardware has its own queues)
# mq-deadline - general-purpose SSD/HDD, bounds latency
# bfq         - desktops, fair bandwidth sharing
# kyber       - high-performance NVMe

关键调优参数

bash
# Queue depth
cat /sys/block/nvme0n1/queue/nr_requests

# Read-ahead size (KB)
cat /sys/block/sda/queue/read_ahead_kb
echo 256 > /sys/block/sda/queue/read_ahead_kb

# Merge policy
cat /sys/block/sda/queue/nomerges
# 0 = all merges enabled, 1 = only simple one-hit merges are tried, 2 = merging disabled

褚成志的笔记