块设备驱动:原理与架构
块设备 vs 字符设备
| 特性 | 块设备 | 字符设备 |
|---|---|---|
| 访问单位 | 固定大小的块(512B/4KB) | 字节流 |
| 缓冲 | 有页缓存(Page Cache) | 无 |
| 随机访问 | 支持 | 通常不支持 |
| 典型设备 | 硬盘、SSD、eMMC、ramdisk | 串口、GPIO |
块设备 I/O 路径
用户空间 read()/write()
│
▼ VFS
页缓存(Page Cache)
│ 缓存未命中时
▼
通用块层(Generic Block Layer)
│ bio 结构
▼
I/O 调度器(旧单队列:CFQ / deadline;blk-mq:mq-deadline / kyber / BFQ / none)
│ request 队列
▼
块设备驱动(request_fn / blk-mq)
│
▼
硬件(SATA / NVMe / eMMC / ramdisk)
核心数据结构
bio — 基本 I/O 单元
c
struct bio {
struct block_device *bi_bdev;
blk_opf_t bi_opf; /* 操作类型:REQ_OP_READ/WRITE */
sector_t bi_iter.bi_sector; /* 起始扇区 */
unsigned int bi_iter.bi_size; /* 总字节数 */
struct bio_vec *bi_io_vec; /* scatter-gather 列表 */
unsigned short bi_vcnt; /* bio_vec 数量 */
bio_end_io_t *bi_end_io; /* 完成回调 */
void *bi_private;
};request — 合并后的 I/O 请求
多个相邻的 bio 会被 I/O 调度器合并成一个 request,减少磁头移动次数。
blk-mq 多队列驱动(现代方式)
自 Linux 5.0 移除旧的单队列(request_fn)接口之后,所有基于 request 的块设备驱动都必须使用 blk-mq(Multi-Queue Block Layer):
c
#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
/*
 * Bytes per sector. The block layer headers included above may already
 * provide SECTOR_SIZE, so define it only when it is missing; an
 * unconditional #define would be a macro redefinition.
 */
#ifndef SECTOR_SIZE
#define SECTOR_SIZE 512
#endif

/* Ramdisk capacity in sectors: 64 MB = 64 * 1024 KB * 2 sectors per KB. */
#define DISK_SECTORS (64 * 1024 * 2)
/*
 * Per-device state of the myblk ramdisk. One instance is allocated in
 * myblk_init() and handed to the disk and its request queue.
 */
struct myblk_priv {
	u8 *data;			/* ramdisk data area */
	struct gendisk *disk;		/* disk registered with add_disk() */
	struct blk_mq_tag_set tag_set;	/* tag set the disk is allocated from */
	spinlock_t lock;		/* not used by the code in this excerpt */
};
/* 处理单个 I/O 请求 */
static blk_status_t myblk_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct request *rq = bd->rq;
struct myblk_priv *priv = hctx->queue->queuedata;
struct bio_vec bvec;
struct req_iterator iter;
sector_t sector = blk_rq_pos(rq);
blk_status_t status = BLK_STS_OK;
blk_mq_start_request(rq);
/* 遍历 request 中的所有 bio_vec */
rq_for_each_segment(bvec, rq, iter) {
size_t len = bvec.bv_len;
void *buf = page_address(bvec.bv_page) + bvec.bv_offset;
loff_t offset = sector * SECTOR_SIZE;
if (offset + len > DISK_SECTORS * SECTOR_SIZE) {
status = BLK_STS_IOERR;
break;
}
if (rq_data_dir(rq) == WRITE)
memcpy(priv->data + offset, buf, len);
else
memcpy(buf, priv->data + offset, len);
sector += len / SECTOR_SIZE;
}
blk_mq_end_request(rq, status);
return BLK_STS_OK;
}
/* blk-mq operations of myblk: only the request dispatch hook is set. */
static const struct blk_mq_ops myblk_mq_ops = {
	.queue_rq	= myblk_queue_rq,
};
/*
 * myblk_init - allocate the ramdisk and register it as a block device.
 *
 * Allocates the private state and the backing buffer, sets up a blk-mq
 * tag set with one hardware queue, allocates and configures the gendisk
 * and registers it with add_disk(). Each failure undoes the steps that
 * succeeded before it, in reverse order.
 *
 * Return: 0 on success, a negative errno on failure.
 *
 * NOTE(review): myblk_fops is not defined in the code shown here and must
 * be provided elsewhere. priv is kept only in disk->private_data and as
 * the queue data; no exit path is visible, so confirm how teardown gets
 * hold of it.
 */
static int __init myblk_init(void)
{
	struct myblk_priv *priv;
	int ret;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	/* Backing store for the ramdisk contents, zero-filled. */
	priv->data = vzalloc(DISK_SECTORS * SECTOR_SIZE);
	if (!priv->data) {
		ret = -ENOMEM;
		goto err_free_priv;
	}

	/* Describe the blk-mq tag set: one hardware queue, 128 tags. */
	priv->tag_set.ops = &myblk_mq_ops;
	priv->tag_set.nr_hw_queues = 1;
	priv->tag_set.queue_depth = 128;
	priv->tag_set.numa_node = NUMA_NO_NODE;
	priv->tag_set.cmd_size = 0;
	priv->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	ret = blk_mq_alloc_tag_set(&priv->tag_set);
	if (ret)
		goto err_free_data;

	/* Allocate the gendisk; priv becomes the queue's queuedata. */
	priv->disk = blk_mq_alloc_disk(&priv->tag_set, priv);
	if (IS_ERR(priv->disk)) {
		ret = PTR_ERR(priv->disk);
		goto err_free_tag_set;
	}

	/*
	 * major, first_minor and minors are deliberately left at zero so
	 * that add_disk() takes the device number from the extended dev_t
	 * space. Setting minors while major is 0 makes add_disk() warn and
	 * fail with -EINVAL.
	 */
	snprintf(priv->disk->disk_name, DISK_NAME_LEN, "myblk");
	priv->disk->fops = &myblk_fops;
	priv->disk->private_data = priv;
	blk_queue_logical_block_size(priv->disk->queue, SECTOR_SIZE);
	set_capacity(priv->disk, DISK_SECTORS);

	/* Make the disk visible to the rest of the system. */
	ret = add_disk(priv->disk);
	if (ret)
		goto err_put_disk;

	pr_info("myblk: registered %s, %llu sectors\n",
		priv->disk->disk_name, (u64)DISK_SECTORS);
	return 0;

err_put_disk:
	put_disk(priv->disk);
err_free_tag_set:
	blk_mq_free_tag_set(&priv->tag_set);
err_free_data:
	vfree(priv->data);
err_free_priv:
	kfree(priv);
	return ret;
}
I/O 调度器
bash
# 查看当前调度器
cat /sys/block/sda/queue/scheduler
# 切换调度器
echo mq-deadline > /sys/block/sda/queue/scheduler
# 各调度器适用场景
# none — NVMe SSD(硬件自带队列)
# mq-deadline — 通用 SSD/HDD,保证延迟
# bfq — 桌面,公平带宽分配
# kyber — 高性能 NVMe
关键调优参数
bash
# 队列深度
cat /sys/block/nvme0n1/queue/nr_requests
# 预读大小(KB)
cat /sys/block/sda/queue/read_ahead_kb
echo 256 > /sys/block/sda/queue/read_ahead_kb
# 合并策略
cat /sys/block/sda/queue/nomerges
# 0=允许所有合并, 1=只尝试简单的单次命中(one-hit)合并, 2=完全禁止合并