8d1d9506cdd5ce37e02614fb68f3d1ffe7a612a9
usage
cd /home/lian/share/10.1-spdk/spdk
./configure --with-shared
make -j
make
# sometimes dd if=/dev/zero of=/dev/nvme0n1 bs=1M count=10
LD_PRELOAD=./libzvfs.so ./func_test
测试
cd /home/lian/share/10.1-spdk/spdk
export LD_LIBRARY_PATH=/home/lian/share/10.1-spdk/spdk/build/lib:/home/lian/share/10.1-spdk/spdk/dpdk/build/lib:$LD_LIBRARY_PATH
export PATH=/home/lian/share/10.1-spdk/spdk/build/bin:$PATH
./build/bin/spdk_nvme_perf \
-r 'trtype:PCIe traddr:0000:03:00.0' \
-q 1 -o 4096 -w randwrite -t 5
root@ubuntu:/home/lian/share/10.1-spdk/spdk# ./build/bin/spdk_nvme_perf -r 'trtype:PCIe traddr:0000:03:00.0' -q 1 -o 4096 -w randwrite -t 5
Initializing NVMe Controllers
Attached to NVMe Controller at 0000:03:00.0 [15ad:07f0]
Associating PCIE (0000:03:00.0) NSID 1 with lcore 0
Initialization complete. Launching workers.
========================================================
Latency(us)
Device Information : IOPS MiB/s Average min max
PCIE (0000:03:00.0) NSID 1 from core 0: 22097.20 86.32 45.21 21.93 1639.58
========================================================
Total : 22097.20 86.32 45.21 21.93 1639.58
./build/bin/spdk_nvme_perf \
-r 'trtype:PCIe traddr:0000:03:00.0' \
-q 32 -o 4096 -w randwrite -t 5
root@ubuntu:/home/lian/share/10.1-spdk/spdk# ./build/bin/spdk_nvme_perf -r 'trtype:PCIe traddr:0000:03:00.0' -q 32 -o 4096 -w randwrite -t 5
Initializing NVMe Controllers
Attached to NVMe Controller at 0000:03:00.0 [15ad:07f0]
Associating PCIE (0000:03:00.0) NSID 1 with lcore 0
Initialization complete. Launching workers.
========================================================
Latency(us)
Device Information : IOPS MiB/s Average min max
PCIE (0000:03:00.0) NSID 1 from core 0: 80122.94 312.98 399.36 36.31 2225.64
========================================================
Total : 80122.94 312.98 399.36 36.31 2225.64
./build/bin/spdk_nvme_perf \
-r 'trtype:PCIe traddr:0000:03:00.0' \
-q 1 -o 131072 -w write -t 5
root@ubuntu:/home/lian/share/10.1-spdk/spdk# export LD_LIBRARY_PATH=/home/lian/share/10.1-spdk/spdk/build/lib:/home/lian/share/10.1-spdk/spdk/dpdk/build/lib:$LD_LIBRARY_PATH
root@ubuntu:/home/lian/share/10.1-spdk/spdk# export PATH=/home/lian/share/10.1-spdk/spdk/build/bin:$PATH
root@ubuntu:/home/lian/share/10.1-spdk/spdk# ./build/bin/spdk_nvme_perf -r 'trtype:PCIe traddr:0000:03:00.0' -q 1 -o 131072 -w write -t 5
Initializing NVMe Controllers
Attached to NVMe Controller at 0000:03:00.0 [15ad:07f0]
Associating PCIE (0000:03:00.0) NSID 1 with lcore 0
Initialization complete. Launching workers.
========================================================
Latency(us)
Device Information : IOPS MiB/s Average min max
PCIE (0000:03:00.0) NSID 1 from core 0: 14746.80 1843.35 67.79 40.16 4324.96
========================================================
Total : 14746.80 1843.35 67.79 40.16 4324.96
./build/bin/spdk_nvme_perf \
-r 'trtype:PCIe traddr:0000:03:00.0' \
-q 32 -o 131072 -w write -t 5
root@ubuntu:/home/lian/share/10.1-spdk/spdk# ./build/bin/spdk_nvme_perf -r 'trtype:PCIe traddr:0000:03:00.0' -q 32 -o 131072 -w write -t 5
Initializing NVMe Controllers
Attached to NVMe Controller at 0000:03:00.0 [15ad:07f0]
Associating PCIE (0000:03:00.0) NSID 1 with lcore 0
Initialization complete. Launching workers.
========================================================
Latency(us)
Device Information : IOPS MiB/s Average min max
PCIE (0000:03:00.0) NSID 1 from core 0: 21997.40 2749.68 1455.09 96.64 26152.13
========================================================
Total : 21997.40 2749.68 1455.09 96.64 26152.13
系统调用
no O_DIRECT 小块
root@ubuntu:/home/lian/share/10.1-spdk/zvfs# ./func_test
=== test_single_file_perf ===
Path : /tmp/test.dat
IO size : 4 KB
Max file: 2048 MB
Duration: 10 sec
WRITE:
total : 12668.9 MB
time : 10.003 sec
IOPS : 324211 ops/sec
BW : 1266.45 MB/s
READ:
total : 7664.5 MB
time : 10.000 sec
IOPS : 196210 ops/sec
BW : 766.44 MB/s
=== all tests PASSED ===
no O_DIRECT 大块
root@ubuntu:/home/lian/share/10.1-spdk/zvfs# ./func_test
=== test_single_file_perf ===
Path : /tmp/test.dat
IO size : 128 KB
Max file: 2048 MB
Duration: 10 sec
WRITE:
total : 14609.5 MB
time : 10.000 sec
IOPS : 11688 ops/sec
BW : 1460.95 MB/s
READ:
total : 8138.6 MB
time : 10.000 sec
IOPS : 6511 ops/sec
BW : 813.85 MB/s
=== all tests PASSED ===
O_DIRECT 小块
root@ubuntu:/home/lian/share/10.1-spdk/zvfs# ./func_test
=== test_single_file_perf ===
Path : /tmp/test.dat
IO size : 4 KB
Max file: 2048 MB
Duration: 10 sec
WRITE:
total : 434.5 MB
time : 10.000 sec
IOPS : 11122 ops/sec
BW : 43.45 MB/s
READ:
total : 373.8 MB
time : 10.000 sec
IOPS : 9568 ops/sec
BW : 37.38 MB/s
=== all tests PASSED ===
O_DIRECT 大块
root@ubuntu:/home/lian/share/10.1-spdk/zvfs# ./func_test
=== test_single_file_perf ===
Path : /tmp/test.dat
IO size : 128 KB
Max file: 2048 MB
Duration: 10 sec
WRITE:
total : 7245.4 MB
time : 10.000 sec
IOPS : 5796 ops/sec
BW : 724.53 MB/s
READ:
total : 9006.5 MB
time : 10.000 sec
IOPS : 7205 ops/sec
BW : 900.64 MB/s
=== all tests PASSED ===
SPDK
非对齐
root@ubuntu:/home/lian/share/10.1-spdk/zvfs# LD_PRELOAD=./libzvfs.so ./func_test /zvfs
=== test_single_file_perf ===
Path : /zvfs/file.dat
IO size : 128 KB
Max file: 2048 MB
Duration: 10 sec
WRITE:
total : 10304.0 MB
time : 10.000 sec
IOPS : 8243 ops/sec
BW : 1030.40 MB/s
READ:
total : 17788.5 MB
time : 10.000 sec
IOPS : 14231 ops/sec
BW : 1778.85 MB/s
=== all tests PASSED ===
全对齐大块
root@ubuntu:/home/lian/share/10.1-spdk/zvfs# LD_PRELOAD=./libzvfs.so ./func_test /zvfs
=== test_single_file_perf ===
Path : /zvfs/file.dat
IO size : 128 KB
Max file: 2048 MB
Duration: 10 sec
WRITE:
total : 16624.4 MB
time : 10.000 sec
IOPS : 13299 ops/sec
BW : 1662.43 MB/s
READ:
total : 16430.8 MB
time : 10.000 sec
IOPS : 13145 ops/sec
BW : 1643.07 MB/s
=== all tests PASSED ===
全对齐小块
root@ubuntu:/home/lian/share/10.1-spdk/zvfs# LD_PRELOAD=./libzvfs.so ./func_test /zvfs
=== test_single_file_perf ===
Path : /zvfs/file.dat
IO size : 4 KB
Max file: 2048 MB
Duration: 10 sec
WRITE:
total : 944.5 MB
time : 10.000 sec
IOPS : 24179 ops/sec
BW : 94.45 MB/s
READ:
total : 982.8 MB
time : 10.000 sec
IOPS : 25159 ops/sec
BW : 98.28 MB/s
=== all tests PASSED ===
SPDK
- blob_store: blob仓库,管理多个blob对象。
- blob: 存储对象,逻辑上连续,物理上不一定连续。相当于文件。
- cluster: 分配单元,一个 blob 可以由多个 cluster 构成,扩容即分配新的 cluster。相当于文件系统的block group。
- page: IO单元,一个 cluster 等于多个 page。
文件系统
架构设计
| 应用程序
| (POSIX API: open/read/write/close)
| LD_PRELOAD 拦截层
| (简单路径判断和转发到zvfs)
| zvfs 文件系统层
| (blob 操作)
| SPDK Blobstore
| 块设备 (Malloc0)
磁盘布局
BlobStore:
|—— Super Blob(元数据,使用SPDK的Super Blob锚定)
|——超级块
|——目录项/目录日志
|—— Blob 1 (文件A...)
|—— Blob 2 (文件B...)
|—— Blob N (文件C...)
数据结构
Super Blob(元数据)
[超级块]
- magic_number: 0x5A563146 (ZV1F)
- version: 1
[目录项]
- filename[256]: 文件名
- blob_id: 对应的数据blob ID
- file_size: 文件实际大小(字节)
- allocated_clusters: 已分配的cluster数量
- is_valid: 标记是否有效(用于删除)
/* Directory entry (the in-memory directory) */
typedef struct {
char filename[256]; // file name
spdk_blob_id blob_id; // ID of the data blob that backs this file
uint64_t file_size; // logical file size, in bytes
uint32_t allocated_clusters; // number of clusters already allocated
bool is_valid; // false means the entry has been deleted
int32_t open_count; // number of open file handles
} zvfs_dirent_t;
/* Global filesystem structure */
typedef struct zvfs {
struct spdk_blob_store *bs;
struct spdk_io_channel *channel;
struct spdk_blob *super_blob; // blob that holds the directory log
uint64_t io_unit_size; // page size, in bytes
/* Directory */
zvfs_dirent_t *dirents; // directory entry array; e.g. #define ZVFS_MAX_FILES 1024
uint32_t dirent_count; // number of currently valid entries
/* Pseudo-FD table */
struct zvfs_file *fd_table[ZVFS_MAX_FD]; // e.g., #define ZVFS_MAX_FD 64
int fd_base; // starting value for pseudo FDs, e.g. 10000
int openfd_count; // presumably the number of currently open pseudo FDs -- TODO confirm
/* Metadata */
uint32_t magic; // 0x5A563146 (ZV1F)
uint32_t version; // 1
} zvfs_t;
/* Open file handle */
typedef struct zvfs_file {
zvfs_t *fs;
struct spdk_blob *blob;
zvfs_dirent_t *dirent; // points back to the directory entry (file_size / allocated_clusters)
uint64_t current_offset; // current read/write position
int flags; // O_RDONLY / O_RDWR / O_CREAT, etc.
int pseudo_fd;
/* Temporary DMA buffer (optional: one per file, to avoid a malloc on every call) */
void *dma_buf;
uint64_t dma_buf_size;
} zvfs_file_t;
工作流程
mount
通过 hook POSIX API 的方式没有合适的时机显式调用 mount,因此目前(单线程场景下)采用懒加载。
1. [创建块设备]
- spdk_bdev_create_bs_dev_ext
2. [初始化文件系统]
- spdk_bs_init 或者 spdk_bs_load(已有数据时)
- spdk_bs_get_io_unit_size 获取io单元大小(page)
- spdk_bs_alloc_io_channel 分配blobstore的读写入口
3. [读取元数据]
- spdk_bs_get_super_blob 获取 Super Blob ID
- spdk_bs_open_blob 打开 Super Blob
- 读取超级块,校验 magic
- 读取目录项数组,加载到内存 dirents
4. [创建zvfs_t结构体]
- 创建 zvfs_t 结构体
- 填充 bs/channel/super_blob/dirents 等字段
open
O_RDONLY / O_RDWR
1. [文件名查找]
- 遍历 dirents,匹配 filename 且 is_valid=true
- 找不到返回 -ENOENT
2. [打开blob]
- spdk_bs_open_blob(dirent->blob_id)
- dirent->open_count++
- fs->openfd_count++
3. [分配文件句柄]
- 创建 zvfs_file_t,dirent 指针指向目录项
- 分配伪FD,写入 fd_table
4. [返回伪FD]
O_CREAT
1. [文件名查找]
- 遍历 dirents,若 filename 已存在且 is_valid=true,返回 -EEXIST
- 找一个 is_valid=false 的空槽位;没有空槽则追加(dirent_count < ZVFS_MAX_FILES)
2. [创建blob]
- spdk_bs_create_blob → 得到 blob_id
- spdk_bs_open_blob → 得到 blob 句柄
- spdk_blob_resize 初始分配空间
- spdk_blob_sync_md 持久化 cluster 分配
3. [写目录]
- 填充 filename/blob_id/file_size=0/is_valid=true
- dirent->open_count = 1
- fs->openfd_count++
4. [创建文件句柄]
- 创建 zvfs_file_t
- 分配伪FD,写入 fd_table
5. [返回伪FD]
说明:目录变更只写内存,unmount 时统一持久化。
read
读写都以字节为单位,offset / count 单位为字节;根据 io_unit_size 做对齐计算。
1. [参数]
- fd
- buffer
- count
- offset(隐含)
2. [边界检查]
- 实际可读 = min(count, dirent->file_size - current_offset)
- 实际可读为0则返回0
3. [计算Blob位置]
- start_page = current_offset / io_unit_size
- page_offset = current_offset % io_unit_size
- num_pages = (page_offset + 实际可读 + io_unit_size - 1) / io_unit_size
4. [DMA读取]
- 非对齐读(page_offset != 0 || 实际可读字节数不是 io_unit_size 的整数倍)
- 需要DMA临时缓冲区(spdk_dma_zmalloc)
- spdk_blob_io_read(blob, channel, dma_buffer, start_page, num_pages, ...)
- 从 dma_buffer + page_offset 拷贝到用户 buffer
- 对齐
- 仍使用DMA缓冲区执行读取,再拷贝到用户buffer
5. [更新offset]
- current_offset += 实际可读
6. [返回实际读取字节数]
说明:SPDK需要DMA可用的内存,应用提供的用户缓冲区通常不满足要求。即便对齐也不能直接提交给spdk_blob_io_*,应使用DMA缓冲作为跳板;未来通过注册内存池可优化直传。
write
1. [参数]
- fd
- buffer
- count
- offset(隐含)
2. [检查空间是否足够]
- 需要大小 = current_offset + count
- 若超过 allocated_clusters 对应容量:
- spdk_blob_resize 扩容
- spdk_blob_sync_md
- 更新 dirent->allocated_clusters
3. [计算写入位置]
- start_page / page_offset / num_pages(同read)
4. [DMA写入]
- 非对齐写(page_offset != 0 || count 不是 io_unit_size 的整数倍)
- 读取涉及的首尾page到DMA临时缓冲区
- 修改对应位置的数据
- 写回:spdk_blob_io_write(blob, channel, dma_buffer, start_page, num_pages, ...)
- 对齐
- 仍通过DMA缓冲区提交写入
5. [更新状态]
- current_offset += count
- dirent->file_size = max(dirent->file_size, current_offset)
6. [返回写入字节数]
close
1. [关闭Blob]
- spdk_blob_close(file->blob)
- dirent->open_count--
- fs->openfd_count--
- 若 open_count == 0 且 is_valid == false(已unlink):spdk_bs_delete_blob, 清空dirent
- 若 openfd_count == 0 则 unmount
2. [释放缓冲区]
- 释放 dma_buf
- 清除 fd_table[pseudo_fd]
- free(zvfs_file_t)
3. [返回0]
unlink
1. [查找目录项]
- 遍历 dirents,匹配 filename 且 is_valid=true
- 找不到返回 -ENOENT
2. [标记删除]
- dirent->is_valid = false
3. [判断是否立即删除]
- open_count == 0:spdk_bs_delete_blob,清空该槽位
- open_count > 0:延迟,最后一个 close 负责删除
4. [返回0]
unmount
1. [持久化元数据]
- 将超级块和 dirents 写回 Super Blob(目录变更平时只写内存,在此统一落盘)
- spdk_blob_close(super_blob)
2. [关闭channel]
- spdk_bs_free_io_channel
3. [关闭BlobStore]
- spdk_bs_unload
4. [释放FS]
- free(fs)
其他方案
如果不使用 LD_PRELOAD hook,可以使用 FUSE。
FUSE(Filesystem in Userspace)由内核模块和用户态文件系统程序两部分组成:内核模块挂载在某个目录上,对这个目录的访问都会经过它。
内核模块会将请求转发给用户态的文件系统程序,这里的用户态程序可以基于 SPDK 实现。这样就不用逐个处理(hook)其他的 POSIX 操作。
Description
Languages
C
95.5%
Shell
3.8%
Makefile
0.7%