diff --git a/README.md b/README.md index 9a83818..89d28be 100755 --- a/README.md +++ b/README.md @@ -1,640 +1,218 @@ +# ZVFS -## usage -```shell +ZVFS 是一个基于 `SPDK Blobstore` 的轻量级用户态文件系统原型, +通过 `LD_PRELOAD` 拦截常见 POSIX 文件 API,把 `/zvfs` 路径下的文件 I/O 转换为 Blob I/O。 + +目标是让上层应用尽量少改动地复用阻塞式文件接口,同时接近 SPDK 在低队列深度(QD≈1)场景的性能上限。 + +## 1. 项目结构 + +```text +zvfs/ +├── src/ +│ ├── hook/ # POSIX API hook 层(open/read/write/...) +│ ├── fs/ # inode/path/fd 运行时元数据管理 +│ ├── spdk_engine/ # SPDK Blobstore 封装 +│ ├── common/ # 对齐与缓冲区工具函数 +│ ├── config.h # 默认配置(JSON、bdev、xattr key 等) +│ └── Makefile # 产出 libzvfs.so +├── tests/ +│ ├── hook/ # hook API 语义测试 +│ ├── ioengine_test/ # Blob 引擎单元测试 +│ └── Makefile +├── scripts/ # db_bench/hook 测试辅助脚本 +├── spdk/ # SPDK 子模块 +└── README.md +``` + +## 2. 核心架构 + +### 2.1 分层 + +当前实现: + +```text +App (open/read/write/fstat/...) + -> LD_PRELOAD Hook (src/hook) + -> ZVFS Runtime Metadata (src/fs) + -> SPDK Engine (src/spdk_engine) + -> SPDK Blobstore + -> bdev (Malloc/NVMe) +``` + +目标架构(Daemon + IPC): + +```text +App (multi-process, e.g. PostgreSQL) + -> LD_PRELOAD Hook Client + -> IPC (Unix Domain Socket) + -> zvfs daemon + -> metadata manager + -> SPDK worker threads + -> SPDK Blobstore / bdev +``` + +### 2.2 目标架构简版(HOOK 层 + daemon 层) + +- `HOOK 层` + - 拦截 `/zvfs` 路径的 POSIX API 并同步发起 IPC 请求。 + - 维护本地最小状态(如 `fd -> remote_handle_id`)。 + - 对非 `/zvfs` 路径继续透传到 `real_*` syscall(POSIX passthrough)。 +- `daemon 层` + - 独占 SPDK 资源(`spdk_env/blobstore/spdk_thread`)。 + - 统一处理元数据与并发控制(path/inode/handle)。 + - 接收 IPC 请求并执行实际 I/O,返回 POSIX 风格结果与 errno。 + +### 2.3 元数据与数据映射 + +- 文件数据:存储在 SPDK blob 中。 +- 文件到 blob 的映射:写入真实文件的 `xattr`(key: `user.zvfs.blob_id`)。 +- 运行时维护三张表: + - `inode_table`:`blob_id -> inode` + - `path_cache`:`path -> inode` + - `fd_table`:`fd -> open_file` + +### 2.4 当前实现的 I/O 路径要点 + +- `blob_read/blob_write` 统一走按 `io_unit_size` 对齐的 DMA 缓冲。 +- 非对齐写会触发读改写(RMW):先读对齐块,再覆盖局部写回。 +- `readv/writev` 在 hook 层会做聚合,减少多次 I/O 提交。 +- `fsync/fdatasync` 对 zvfs fd 调用 `blob_sync_md`;`sync_file_range` 在 zvfs 路径直接返回成功。 + +## 3. 构建 + +> 下面命令以仓库根目录为 `/home/lian/try/zvfs` 为例。 + +### 3.1 初始化并构建 SPDK + +```bash git submodule update --init --recursive - cd spdk ./scripts/pkgdep.sh ./configure --with-shared -make -j - -make - -# sometimes dd if=/dev/zero of=/dev/nvme0n1 bs=1M count=10 -LD_PRELOAD=./libzvfs.so ./func_test +make -j"$(nproc)" ``` -## 测试 -### 总结 -由于是目标是hook阻塞的API,相当于队列深度为1。 +### 3.2 构建 ZVFS 与测试 -队列深度为1的情况下,spdk测试工具spdk_nvme_perf的测试结果: -1. iosize = 4K:100MiB/s -2. ioszie = 128K:1843MiB/s - -zvfs的测试结果: -1. iosize = 4K:95MiB/s -2. ioszie = 128K:1662MiB/s - -相当于spdk测试工具读写的90%性能。 - -对比系统调用: -1. O_DIRECT - 1. 大块4K:43MiB/s - 2. 小块128K:724MiB/s -2. !O_DIRECT - 1. 大块4K:1460MiB/s - 2. 小块128K:1266MiB/s - -非对齐情况下,写入性能/2,因为需要read-update-write。 - -### spdk_nvme_perf 性能基准测试 -```shell -cd /home/lian/share/10.1-spdk/spdk - -export LD_LIBRARY_PATH=/home/lian/share/10.1-spdk/zvfs/spdk/build/lib:/home/lian/share/10.1-spdk/zvfs/spdk/dpdk/build/lib:$LD_LIBRARY_PATH -export PATH=/home/lian/share/10.1-spdk/zvfs/spdk/build/bin:$PATH - -./build/bin/spdk_nvme_perf \ - -r 'trtype:PCIe traddr:0000:03:00.0' \ - -q 1 -o 4096 -w randwrite -t 5 - -root@ubuntu:/home/lian/share/10.1-spdk/spdk# ./build/bin/spdk_nvme_perf -r 'trtype:PCIe traddr:0000:03:00.0' -q 1 -o 4096 -w randwrite -t 5 -Initializing NVMe Controllers -Attached to NVMe Controller at 0000:03:00.0 [15ad:07f0] -Associating PCIE (0000:03:00.0) NSID 1 with lcore 0 -Initialization complete. Launching workers. -======================================================== - Latency(us) -Device Information : IOPS MiB/s Average min max -PCIE (0000:03:00.0) NSID 1 from core 0: 25765.92 100.65 38.77 16.58 802.32 -======================================================== -Total : 25765.92 100.65 38.77 16.58 802.32 - - -./build/bin/spdk_nvme_perf \ - -r 'trtype:PCIe traddr:0000:03:00.0' \ - -q 32 -o 4096 -w randwrite -t 5 - - -root@ubuntu:/home/lian/share/10.1-spdk/spdk# ./build/bin/spdk_nvme_perf -r 'trtype:PCIe traddr:0000:03:00.0' -q 32 -o 4096 -w randwrite -t 5 -Initializing NVMe Controllers -Attached to NVMe Controller at 0000:03:00.0 [15ad:07f0] -Associating PCIE (0000:03:00.0) NSID 1 with lcore 0 -Initialization complete. Launching workers. -======================================================== - Latency(us) -Device Information : IOPS MiB/s Average min max -PCIE (0000:03:00.0) NSID 1 from core 0: 80122.94 312.98 399.36 36.31 2225.64 -======================================================== -Total : 80122.94 312.98 399.36 36.31 2225.64 - - -./build/bin/spdk_nvme_perf \ - -r 'trtype:PCIe traddr:0000:03:00.0' \ - -q 1 -o 131072 -w write -t 5 - -root@ubuntu:/home/lian/share/10.1-spdk/spdk# ./build/bin/spdk_nvme_perf -r 'trtype:PCIe traddr:0000:03:00.0' -q 1 -o 131072 -w write -t 5 -Initializing NVMe Controllers -Attached to NVMe Controller at 0000:03:00.0 [15ad:07f0] -Associating PCIE (0000:03:00.0) NSID 1 with lcore 0 -Initialization complete. Launching workers. -======================================================== - Latency(us) -Device Information : IOPS MiB/s Average min max -PCIE (0000:03:00.0) NSID 1 from core 0: 14746.80 1843.35 67.79 40.16 4324.96 -======================================================== -Total : 14746.80 1843.35 67.79 40.16 4324.96 - - -./build/bin/spdk_nvme_perf \ - -r 'trtype:PCIe traddr:0000:03:00.0' \ - -q 32 -o 131072 -w write -t 5 - -root@ubuntu:/home/lian/share/10.1-spdk/spdk# ./build/bin/spdk_nvme_perf -r 'trtype:PCIe traddr:0000:03:00.0' -q 32 -o 131072 -w write -t 5 -Initializing NVMe Controllers -Attached to NVMe Controller at 0000:03:00.0 [15ad:07f0] -Associating PCIE (0000:03:00.0) NSID 1 with lcore 0 -Initialization complete. Launching workers. -======================================================== - Latency(us) -Device Information : IOPS MiB/s Average min max -PCIE (0000:03:00.0) NSID 1 from core 0: 21997.40 2749.68 1455.09 96.64 26152.13 -======================================================== -Total : 21997.40 2749.68 1455.09 96.64 26152.13 -``` -### 系统调用 -#### no O_DIRECT 小块 - -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs# ./func_test - -=== test_single_file_perf === -Path : /tmp/test.dat -IO size : 4 KB -Max file: 2048 MB -Duration: 10 sec - -WRITE: - total : 12668.9 MB - time : 10.003 sec - IOPS : 324211 ops/sec - BW : 1266.45 MB/s - -READ: - total : 7664.5 MB - time : 10.000 sec - IOPS : 196210 ops/sec - BW : 766.44 MB/s - -=== all tests PASSED === -``` -#### no O_DIRECT 大块 - -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs# ./func_test - -=== test_single_file_perf === -Path : /tmp/test.dat -IO size : 128 KB -Max file: 2048 MB -Duration: 10 sec - -WRITE: - total : 14609.5 MB - time : 10.000 sec - IOPS : 11688 ops/sec - BW : 1460.95 MB/s - -READ: - total : 8138.6 MB - time : 10.000 sec - IOPS : 6511 ops/sec - BW : 813.85 MB/s - -=== all tests PASSED === +```bash +cd /home/lian/try/zvfs +make -j"$(nproc)" +make test -j"$(nproc)" ``` -#### no O_DIRECT 随机 对齐 大块 -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs/zvfs# ./func_test +产物: -=== test_single_file_random_perf === -Path : /tmp/test.dat -IO size : 128 KB -Range : 2048 MB -Duration: 10 sec +- `src/libzvfs.so` +- `tests/bin/hook_api_test` +- `tests/bin/ioengine_single_blob_test` +- `tests/bin/ioengine_multi_blob_test` +- `tests/bin/ioengine_same_blob_mt_test` -RANDOM WRITE: - total : 8930.8 MB - time : 10.001 sec - IOPS : 7144 ops/sec - BW : 893.01 MB/s +## 4. 运行与验证 -RANDOM READ: - total : 8238.9 MB - time : 10.000 sec - IOPS : 6591 ops/sec - BW : 823.89 MB/s +### 4.1 Hook API 语义测试 -=== all tests PASSED === -``` -#### no O_DIRECT 随机 非对齐 大块 -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs/zvfs# ./func_test - -=== test_single_file_random_perf === -Path : /tmp/test.dat -IO size : 128 KB -Range : 2048 MB -Duration: 10 sec - -RANDOM WRITE: - total : 5964.4 MB - time : 10.000 sec - IOPS : 4771 ops/sec - BW : 596.43 MB/s - -RANDOM READ: - total : 6607.8 MB - time : 10.000 sec - IOPS : 5286 ops/sec - BW : 660.77 MB/s - -=== all tests PASSED === +```bash +mkdir -p /zvfs +cd /home/lian/try/zvfs +LD_PRELOAD=$PWD/src/libzvfs.so ZVFS_TEST_ROOT=/zvfs ./tests/bin/hook_api_test ``` -#### O_DIRECT 小块 -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs# ./func_test +覆盖点包括: -=== test_single_file_perf === -Path : /tmp/test.dat -IO size : 4 KB -Max file: 2048 MB -Duration: 10 sec +- `open/openat/rename/unlink` +- `read/write/pread/pwrite/readv/writev/pwritev` +- `fstat/lseek/ftruncate` +- `fcntl/ioctl(FIONREAD)` +- `fsync/fdatasync` -WRITE: - total : 434.5 MB - time : 10.000 sec - IOPS : 11122 ops/sec - BW : 43.45 MB/s +### 4.2 SPDK 引擎测试 -READ: - total : 373.8 MB - time : 10.000 sec - IOPS : 9568 ops/sec - BW : 37.38 MB/s - -=== all tests PASSED === -``` -#### O_DIRECT 大块 -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs# ./func_test - -=== test_single_file_perf === -Path : /tmp/test.dat -IO size : 128 KB -Max file: 2048 MB -Duration: 10 sec - -WRITE: - total : 7245.4 MB - time : 10.000 sec - IOPS : 5796 ops/sec - BW : 724.53 MB/s - -READ: - total : 9006.5 MB - time : 10.000 sec - IOPS : 7205 ops/sec - BW : 900.64 MB/s - -=== all tests PASSED === +```bash +cd /home/lian/try/zvfs +SPDK_BDEV_NAME=Malloc0 ./tests/bin/ioengine_single_blob_test +SPDK_BDEV_NAME=Malloc0 ./tests/bin/ioengine_multi_blob_test +SPDK_BDEV_NAME=Malloc0 ./tests/bin/ioengine_same_blob_mt_test ``` -### SPDK -#### 非对齐 -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs# LD_PRELOAD=./libzvfs.so ./func_test /zvfs +## 5. 关键环境变量 -=== test_single_file_perf === -Path : /zvfs/file.dat -IO size : 128 KB -Max file: 2048 MB -Duration: 10 sec +- `SPDK_BDEV_NAME`:选择后端 bdev(默认 `Malloc0`)。 +- `ZVFS_BDEV`:`zvfs_ensure_init` 使用的 bdev 名称(未设置时使用 `config.h` 默认值)。 +- `SPDK_JSON_CONFIG`:覆盖默认 SPDK JSON 配置路径。 -WRITE: - total : 10304.0 MB - time : 10.000 sec - IOPS : 8243 ops/sec - BW : 1030.40 MB/s +## 6. 性能说明(仅保留趋势) -READ: - total : 17788.5 MB - time : 10.000 sec - IOPS : 14231 ops/sec - BW : 1778.85 MB/s +`README` 历史压测数据来自旧版本,不能直接当作当前版本结论;但可作为设计趋势参考: -=== all tests PASSED === +- 目标工作负载为阻塞 API,近似 `QD=1`。 +- 旧数据下,ZVFS 在 `QD=1` 时约达到 `spdk_nvme_perf` 的 `90%~95%`。 + - 4K:约 `95 MiB/s` vs `100 MiB/s` + - 128K:约 `1662 MiB/s` vs `1843 MiB/s` +- 相对同机 `O_DIRECT` 路径,旧数据写带宽约有 `2.2x~2.3x` 提升。 +- 非对齐写存在 RMW,吞吐明显下降(旧数据常见接近对齐写的一半)。 + +如果需要用于对外汇报,请重新在当前 commit 与固定硬件环境下复测。 + +## 7. 当前限制 + +- 仅拦截 `/zvfs` 路径。 +- `mmap` 对 zvfs fd 当前返回 `ENOTSUP`(建议上层关闭 mmap 读写)。 +- `dup/dup2/dup3` 对 zvfs fd 当前返回 `ENOTSUP`。 +- `rename` 跨 `/zvfs` 与非 `/zvfs` 路径返回 `EXDEV`。 +- `fallocate(FALLOC_FL_PUNCH_HOLE)` 未实现。 + +## 8. 后续建议 + +- 补齐 mmap 路径(mmap table + 脏页回写)。 +- 完善多线程/高并发下的语义与压测基线。 +- 增加版本化 benchmark 报告,避免 README 中历史数据失真。 + +## 9. Blob Store 血泪教训 + +### Owner Thread 绑定 + +blobstore内部负责并发控制,让所有metadata操作都在一个线程上执行,回调固定绑定给创建blobstore的线程。所以多线程模型下不是send给谁谁就能poll到回调的。 + +正确架构: ``` -#### 全对齐大块 -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs# LD_PRELOAD=./libzvfs.so ./func_test /zvfs +metadata thread + spdk_bs_load() + resize + delete + sync_md -=== test_single_file_perf === -Path : /zvfs/file.dat -IO size : 128 KB -Max file: 2048 MB -Duration: 10 sec +worker thread + blob_io_read + blob_io_write + ``` -WRITE: - total : 16624.4 MB - time : 10.000 sec - IOPS : 13299 ops/sec - BW : 1662.43 MB/s - -READ: - total : 16430.8 MB - time : 10.000 sec - IOPS : 13145 ops/sec - BW : 1643.07 MB/s - -=== all tests PASSED === +### spdk_for_each_channel() Barrier +某些 metadata 操作非常慢: ``` - -#### 全对齐小块 -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs# LD_PRELOAD=./libzvfs.so ./func_test /zvfs - -=== test_single_file_perf === -Path : /zvfs/file.dat -IO size : 4 KB -Max file: 2048 MB -Duration: 10 sec - -WRITE: - total : 944.5 MB - time : 10.000 sec - IOPS : 24179 ops/sec - BW : 94.45 MB/s - -READ: - total : 982.8 MB - time : 10.000 sec - IOPS : 25159 ops/sec - BW : 98.28 MB/s - -=== all tests PASSED === +resize +delete +unload +snapshot ``` +这些操作内部会调用:spdk_for_each_channel() -#### 对齐随机写(大块) -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs/zvfs# LD_PRELOAD=./libzvfs.so ./func_test /zvfs - -=== test_single_file_random_perf === -Path : /zvfs/file.dat -IO size : 128 KB -Range : 2048 MB -Duration: 10 sec - -RANDOM WRITE: - total : 17461.8 MB - time : 10.000 sec - IOPS : 13969 ops/sec - BW : 1746.17 MB/s - -RANDOM READ: - total : 17439.5 MB - time : 10.000 sec - IOPS : 13952 ops/sec - BW : 1743.95 MB/s - -=== all tests PASSED === -``` -#### 非对齐随机写(大块) -```shell -root@ubuntu:/home/lian/share/10.1-spdk/zvfs/zvfs# LD_PRELOAD=./libzvfs.so ./func_test /zvfs - -=== test_single_file_random_perf === -Path : /zvfs/file.dat -IO size : 128 KB -Range : 2048 MB -Duration: 10 sec - -RANDOM WRITE: - total : 7500.2 MB - time : 10.000 sec - IOPS : 6000 ops/sec - BW : 750.02 MB/s - -RANDOM READ: - total : 15143.8 MB - time : 10.000 sec - IOPS : 12115 ops/sec - BW : 1514.35 MB/s - -=== all tests PASSED === -``` - -## SPDK -1. blob_store: blob仓库,管理多个blob对象。 -2. blob: 存储对象,逻辑上连续,物理上不一定连续。相当于文件。 -3. cluster: 分配单元,一个 blob 可以由多个 cluster 构成,扩容即分配新的 cluster。相当于文件系统的block group。 -4. page: IO单元,一个 cluster 等于多个 page。 - -文件系统 - -## 架构设计 -```scss -| 应用程序 - | (POSIX API: open/read/write/close) -| LD_PRELOAD 拦截层 - | (简单路径判断和转发到zvfs) -| zvfs 文件系统层 - | (blob 操作) -| SPDK Blobstore -| 块设备 (Malloc0) -``` - -### 磁盘布局 -```scss -BlobStore: -|—— Super Blob(元数据,使用SPDK的Super Blob锚定) - |——超级块 - |——目录项/目录日志 -|—— Blob 1 (文件A...) -|—— Blob 2 (文件B...) -|—— Blob N (文件C...) -``` - - -### 数据结构 -#### Super Blob(元数据) -```scss -[超级块] -- magic_number: 0x5A563146 (ZV1F) -- version: 1 - -[目录项] -- filename[256]: 文件名 -- blob_id: 对应的数据blob ID -- file_size: 文件实际大小(字节) -- allocated_clusters: 已分配的cluster数量 -- is_valid: 标记是否有效(用于删除) -``` +语义:在所有 io_channel 所属线程执行 callback +类似 ```c -/* 目录项(内存中的目录) */ -typedef struct { - char filename[256]; - spdk_blob_id blob_id; - uint64_t file_size; // 文件逻辑大小(字节) - uint32_t allocated_clusters; // 已分配的cluster数量 - bool is_valid; // false 表示已删除 - int32_t open_count; // 打开的文件句柄数量 -} zvfs_dirent_t; - -/* 文件系统全局结构 */ -typedef struct zvfs { - struct spdk_blob_store *bs; - struct spdk_io_channel *channel; - struct spdk_blob *super_blob; // 承载目录日志的blob - uint64_t io_unit_size; // page大小,单位字节 - - /* 目录 */ - zvfs_dirent_t *dirents; // 目录项数组 #define ZVFS_MAX_FILES 1024 - uint32_t dirent_count; // 当前有效项数 - - /* 伪FD表 */ - struct zvfs_file *fd_table[ZVFS_MAX_FD]; // // e.g., #define ZVFS_MAX_FD 64 - int fd_base; // 伪FD起始值,如10000 - int openfd_count; - - /* 元数据 */ - uint32_t magic; // 0x5A563146 (ZV1F) - uint32_t version; // 1 -} zvfs_t; - -/* 打开的文件句柄 */ -typedef struct zvfs_file { - zvfs_t *fs; - struct spdk_blob *blob; - zvfs_dirent_t *dirent; // 指回目录项 file_size/allocated_clusters - - uint64_t current_offset; // 当前读写位置 - int flags; // O_RDONLY / O_RDWR / O_CREAT 等 - int pseudo_fd; - - /* 临时DMA缓冲区(可选:每个file一个,避免每次malloc) */ - void *dma_buf; - uint64_t dma_buf_size; -} zvfs_file_t; +for each channel: + send_msg(channel->thread) ``` -### 工作流程 -#### mount -hook POSIX API没有很好的调用时机,单线程目前采用懒加载。 -```scss -1. [创建块设备] - - spdk_bdev_create_bs_dev_ext -2. [初始化文件系统] - - spdk_bs_init 或者 spdk_bs_load(已有数据时) - - spdk_bs_get_io_unit_size 获取io单元大小(page) - - spdk_bs_alloc_io_channel 分配blobstore的读写入口 -3. [读取元数据] - - spdk_bs_get_super_blob 获取 Super Blob ID - - spdk_bs_open_blob 打开 Super Blob - - 读取超级块,校验 magic - - 读取目录项数组,加载到内存 dirents -4. [创建zvfs_t结构体] - - 创建 zvfs_t 结构体 - - 填充 bs/channel/super_blob/dirents 等字段 -``` -#### open -##### O_RDONLY / O_RDWR -```scss -1. [文件名查找] - - 遍历 dirents,匹配 filename 且 is_valid=true - - 找不到返回 -ENOENT -2. [打开blob] - - spdk_bs_open_blob(dirent->blob_id) - - dirent->open_count++ - - fs->openfd_count++ -3. [分配文件句柄] - - 创建 zvfs_file_t,dirent 指针指向目录项 - - 分配伪FD,写入 fd_table -5. [返回伪FD] -``` +#### 问题1:持有 Channel 的 Thread 不 poll +如果所属线程不poll,就会卡住。 +#### 问题2:线程退出 Channel 没有释放 +永远卡住 -##### O_CREAT -```scss -1. [文件名查找] - - 遍历 dirents,若 filename 已存在且 is_valid=true,返回 -EEXIST - - 找一个 is_valid=false 的空槽位;没有空槽则追加(dirent_count < max_files) -2. [创建blob] - - spdk_bs_create_blob → 得到 blob_id - - spdk_bs_open_blob → 得到 blob 句柄 - - spdk_blob_resize 初始分配空间 - - spdk_blob_sync_md 持久化 cluster 分配 -3. [写目录] - - 填充 filename/blob_id/file_size=0/is_valid=true - - dirent->open_count = 1 -4. [创建文件句柄] - - 创建 zvfs_file_t - - 分配伪FD,写入 fd_table -5. [返回伪FD] +### IO 操作的回调行为与 metadata 操作不同 +spdk_blob_io_read / spdk_blob_io_write 的回调,是通过传入的 io_channel 投递的,回调回到分配该 channel 的 thread。 -``` -> 说明:目录变更只写内存,unmount 时统一持久化。 - -### read -读写都以字节为单位,offset / count 单位为字节;根据 io_unit_size 做对齐计算。 - -```scss -1. [参数] - - fd - - buffer - - count - - offset(隐含) -2. [边界检查] - - 实际可读 = min(count, dirent->file_size - current_offset) - - 实际可读为0则返回0 -3. [计算Blob位置] - - start_page = current_offset / io_unit_size - - page_offset = current_offset % io_unit_size - - num_pages = (page_offset + 实际可读 + io_unit_size - 1) / io_unit_size -4. [DMA读取] - - 非对齐读(offset != 0 || count 不是整页) - - 需要DMA临时缓冲区(spdk_dma_zmalloc) - - spdk_blob_io_read(blob, channel, dma_buffer, start_page, num_pages, ...) - - 从 dma_buffer + page_offset 拷贝到用户 buffer - - 对齐 - - 仍使用DMA缓冲区执行读取,再拷贝到用户buffer -5. [更新offset] - - current_offset += 实际可读 -6. [返回实际读取字节数] -``` -> 说明:SPDK需要DMA可用的内存,应用提供的用户缓冲区通常不满足要求。即便对齐也不能直接提交给spdk_blob_io_*,应使用DMA缓冲作为跳板;未来通过注册内存池可优化直传。 - -### write -```scss -1. [参数] - - fd - - buffer - - count - - offset(隐含) -2. [检查空间是否足够] - - 需要大小 = current_offset + count - - 若超过 allocated_clusters 对应容量: - - spdk_blob_resize 扩容 - - spdk_blob_sync_md - - 更新 dirent->allocated_clusters -3. [计算写入位置] - - start_page / page_offset / num_pages(同read) -4. [DMA写入] - - 非对齐写(offset != 0 || count 不是整页) - - 读取涉及的首尾page到DMA临时缓冲区 - - 修改对应位置的数据 - - 写回:spdk_blob_io_write(blob, channel, dma_buffer, start_page, num_pages, ...) - - 对齐 - - 仍通过DMA缓冲区提交写入 -5. [更新状态] - - current_offset += count - - dirent->file_size = max(dirent->file_size, current_offset) -6. [返回写入字节数] -``` - - -### close -```scss -1. [关闭Blob] - - spdk_blob_close(file->blob) - - dirent->open_count-- - - fs->openfd_count++ - - 若 open_count == 0 且 is_valid == false(已unlink):spdk_bs_delete_blob, 清空dirent - - 若 openfd_count == 0 则 unmount -2. [释放缓冲区] - - 释放 dma_buf - - 清除 fd_table[pseudo_fd] - - free(zvfs_file_t) -3. [返回0] -``` -### unlink -```scss -1. [查找目录项] - - 遍历 dirents,匹配 filename 且 is_valid=true - - 找不到返回 -ENOENT -2. [标记删除] - - dirent->is_valid = false -3. [判断是否立即删除] - - open_count == 0:spdk_bs_delete_blob,清空该槽位 - - open_count > 0:延迟,最后一个 close 负责删除 -4. [返回0] -``` - -### unmount -```scss -1. [关闭channel] - - spdk_bs_free_io_channel -2. [关闭BlobStore] - - spdk_bs_unload -3. [释放FS] - - free(fs) -``` - - -### 其他方案 -如果不使用`LD_PRELOAD`hook,可以使用FUSE。\ -FUSE是一种内核文件系统程序,挂载在文件目录上,对这个目录的访问,会使用这个文件系统程序。\ -文件系统程序会将请求转发给应用层程序,这里的应用层程序可以是SPDK。这样就不用管其他的操作。 +### 超时任务 +设置超时就免不了超时后回调成功执行,超时后回调仍会触发,存在 UAF 风险 diff --git a/scripts/run_db_bench_zvfs.sh b/scripts/run_db_bench_zvfs.sh index 5522f3a..4977c3f 100755 --- a/scripts/run_db_bench_zvfs.sh +++ b/scripts/run_db_bench_zvfs.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash set -euo pipefail +env -u LD_PRELOAD rm -rf /zvfs/rocksdb_manual || true # ========================= # Manual Config (edit here) @@ -19,9 +20,11 @@ DB_PATH="/zvfs/rocksdb_manual" BENCHMARKS="fillrandom,readrandom" # key数 -NUM=1000000 +# NUM=1000000 +NUM=50000 + # 线程数 -THREADS=1 +THREADS=2 # 随机种子 SEED=1 diff --git a/src/config.h b/src/config.h index fa543a4..283da44 100644 --- a/src/config.h +++ b/src/config.h @@ -25,6 +25,7 @@ // waiter #define WAITER_MAX_TIME 10000000 +#define ZVFS_WAIT_TIME 5000ULL diff --git a/src/hook/zvfs_hook_fd.c b/src/hook/zvfs_hook_fd.c index 782eff4..771d5e6 100644 --- a/src/hook/zvfs_hook_fd.c +++ b/src/hook/zvfs_hook_fd.c @@ -17,6 +17,7 @@ #include #include #include +#include /* ------------------------------------------------------------------ */ /* 内部:open 的核心逻辑(路径已解析为绝对路径) */ @@ -44,7 +45,15 @@ zvfs_open_impl(int real_fd, const char *abspath, int flags, mode_t mode) /* 1. 创建 blob */ handle = blob_create(0); - if (!handle) { errno = EIO; goto fail; } + if (!handle) { + int saved = errno; + if (saved == 0) saved = EIO; + fprintf(stderr, + "[zvfs] create blob failed path=%s flags=0x%x errno=%d(%s)\n", + abspath, flags, saved, strerror(saved)); + errno = saved; + goto fail; + } blob_id = handle->id; /* 2. 把 blob_id 写入真实文件的 xattr */ @@ -80,7 +89,7 @@ zvfs_open_impl(int real_fd, const char *abspath, int flags, mode_t mode) /* path_cache 命中:直接用缓存的 inode,重新 blob_open */ blob_id = inode->blob_id; handle = blob_open(blob_id); - if (!handle) { errno = EIO; goto fail; } + if (!handle) { if (errno == 0) errno = EIO; goto fail; } /* 共享 inode,增加引用 */ atomic_fetch_add(&inode->ref_count, 1); @@ -101,7 +110,7 @@ zvfs_open_impl(int real_fd, const char *abspath, int flags, mode_t mode) } else { /* 全新 inode:需从真实文件 stat 获取 mode/size */ struct stat st; - if (real_fstat(real_fd, &st) < 0) goto fail; + if (zvfs_real_fstat(real_fd, &st) < 0) goto fail; inode = inode_alloc(blob_id, st.st_mode, ZVFS_ITYPE_FILE); if (!inode) { errno = ENOMEM; goto fail; } @@ -117,7 +126,7 @@ zvfs_open_impl(int real_fd, const char *abspath, int flags, mode_t mode) } handle = blob_open(blob_id); - if (!handle) { errno = EIO; goto fail; } + if (!handle) { if (errno == 0) errno = EIO; goto fail; } } } @@ -340,11 +349,14 @@ zvfs_close_impl(int fd) return real_close(fd); } - /* ---- openfile 引用归零:关闭 blob handle --------------------- */ + /* ---- openfile 引用归零:先刷 metadata,再关闭 blob handle ------ */ struct zvfs_inode *inode = of->inode; struct zvfs_blob_handle *handle = of->handle; + int sync_failed = 0; openfile_free(of); + if (blob_sync_md(handle) < 0) + sync_failed = 1; blob_close(handle); /* ---- inode ref_count-- --------------------------------------- */ @@ -391,7 +403,14 @@ zvfs_close_impl(int fd) inode_free(inode); } - return real_close(fd); + int rc = real_close(fd); + if (rc < 0) + return -1; + if (sync_failed) { + errno = EIO; + return -1; + } + return 0; } int diff --git a/src/hook/zvfs_hook_init.c b/src/hook/zvfs_hook_init.c index 3f5a0e6..2fdd607 100644 --- a/src/hook/zvfs_hook_init.c +++ b/src/hook/zvfs_hook_init.c @@ -81,6 +81,15 @@ int (*real_fstatat)(int, const char *, struct stat *, int) = NULL; int (*real_fstatat64)(int, const char *, struct stat64 *, int) = NULL; int (*real_statx)(int, const char *, int, unsigned int, struct statx *) = NULL; +int (*real___xstat)(int, const char *, struct stat *) = NULL; +int (*real___xstat64)(int, const char *, struct stat64 *) = NULL; +int (*real___fxstat)(int, int, struct stat *) = NULL; +int (*real___fxstat64)(int, int, struct stat64 *) = NULL; +int (*real___lxstat)(int, const char *, struct stat *) = NULL; +int (*real___lxstat64)(int, const char *, struct stat64 *) = NULL; +int (*real___fxstatat)(int, int, const char *, struct stat *, int) = NULL; +int (*real___fxstatat64)(int, int, const char *, struct stat64 *, + int) = NULL; /* sync */ int (*real_fsync)(int) = NULL; @@ -116,10 +125,19 @@ int (*real___open64)(const char *, int, ...) = NULL; int (*real___libc_open)(const char *, int, ...) = NULL; ssize_t (*real___read)(int, void *, size_t) = NULL; ssize_t (*real___libc_read)(int, void *, size_t) = NULL; +ssize_t (*real___read_nocancel)(int, void *, size_t) = NULL; +ssize_t (*real___pread64)(int, void *, size_t, off_t) = NULL; +ssize_t (*real___libc_pread)(int, void *, size_t, off_t) = NULL; +ssize_t (*real___pread64_nocancel)(int, void *, size_t, off_t) = NULL; +ssize_t (*real___read_chk)(int, void *, size_t, size_t) = NULL; +ssize_t (*real___pread_chk)(int, void *, size_t, off_t, size_t) = NULL; +ssize_t (*real___pread64_chk)(int, void *, size_t, off_t, size_t) = NULL; ssize_t (*real___write)(int, const void *, size_t) = NULL; ssize_t (*real___libc_write)(int, const void *, size_t) = NULL; int (*real___close)(int) = NULL; int (*real___libc_close)(int) = NULL; +size_t (*real_fread_unlocked)(void *, size_t, size_t, FILE *) = NULL; +size_t (*real_fread)(void *, size_t, size_t, FILE *) = NULL; /* ------------------------------------------------------------------ */ /* dlsym 辅助宏 */ @@ -180,14 +198,14 @@ void zvfs_hook_init(void) LOAD_SYM(real_fallocate, "fallocate"); LOAD_SYM(real_posix_fallocate,"posix_fallocate"); - LOAD_SYM(real_stat, "stat"); - LOAD_SYM(real_stat64, "stat64"); - LOAD_SYM(real_fstat, "fstat"); - LOAD_SYM(real_fstat64, "fstat64"); - LOAD_SYM(real_lstat, "lstat"); - LOAD_SYM(real_lstat64, "lstat64"); - LOAD_SYM(real_fstatat, "fstatat"); - LOAD_SYM(real_fstatat64, "fstatat64"); + LOAD_SYM_OPTIONAL(real_stat, "stat"); + LOAD_SYM_OPTIONAL(real_stat64, "stat64"); + LOAD_SYM_OPTIONAL(real_fstat, "fstat"); + LOAD_SYM_OPTIONAL(real_fstat64, "fstat64"); + LOAD_SYM_OPTIONAL(real_lstat, "lstat"); + LOAD_SYM_OPTIONAL(real_lstat64, "lstat64"); + LOAD_SYM_OPTIONAL(real_fstatat, "fstatat"); + LOAD_SYM_OPTIONAL(real_fstatat64, "fstatat64"); LOAD_SYM(real_fsync, "fsync"); LOAD_SYM(real_fdatasync, "fdatasync"); LOAD_SYM(real_fcntl, "fcntl"); @@ -215,17 +233,110 @@ void zvfs_hook_init(void) LOAD_SYM_OPTIONAL(real___open, "__open"); LOAD_SYM_OPTIONAL(real___open64, "__open64"); LOAD_SYM_OPTIONAL(real___libc_open, "__libc_open"); + LOAD_SYM_OPTIONAL(real___xstat, "__xstat"); + LOAD_SYM_OPTIONAL(real___xstat64, "__xstat64"); + LOAD_SYM_OPTIONAL(real___fxstat, "__fxstat"); + LOAD_SYM_OPTIONAL(real___fxstat64, "__fxstat64"); + LOAD_SYM_OPTIONAL(real___lxstat, "__lxstat"); + LOAD_SYM_OPTIONAL(real___lxstat64, "__lxstat64"); + LOAD_SYM_OPTIONAL(real___fxstatat, "__fxstatat"); + LOAD_SYM_OPTIONAL(real___fxstatat64, "__fxstatat64"); LOAD_SYM_OPTIONAL(real___read, "__read"); LOAD_SYM_OPTIONAL(real___libc_read, "__libc_read"); + LOAD_SYM_OPTIONAL(real___read_nocancel, "__read_nocancel"); + LOAD_SYM_OPTIONAL(real___pread64, "__pread64"); + LOAD_SYM_OPTIONAL(real___libc_pread, "__libc_pread"); + LOAD_SYM_OPTIONAL(real___pread64_nocancel, "__pread64_nocancel"); + LOAD_SYM_OPTIONAL(real___read_chk, "__read_chk"); + LOAD_SYM_OPTIONAL(real___pread_chk, "__pread_chk"); + LOAD_SYM_OPTIONAL(real___pread64_chk, "__pread64_chk"); LOAD_SYM_OPTIONAL(real___write, "__write"); LOAD_SYM_OPTIONAL(real___libc_write, "__libc_write"); LOAD_SYM_OPTIONAL(real___close, "__close"); LOAD_SYM_OPTIONAL(real___libc_close, "__libc_close"); + LOAD_SYM_OPTIONAL(real_fread_unlocked, "fread_unlocked"); + LOAD_SYM_OPTIONAL(real_fread, "fread"); /* 初始化全局 fs 结构 */ zvfs_fs_init(); } +#ifndef _STAT_VER +#define _STAT_VER 0 +#endif + +int +zvfs_real_stat(const char *path, struct stat *buf) +{ + if (real_stat) return real_stat(path, buf); + if (real___xstat) return real___xstat(_STAT_VER, path, buf); + errno = ENOSYS; + return -1; +} + +int +zvfs_real_stat64(const char *path, struct stat64 *buf) +{ + if (real_stat64) return real_stat64(path, buf); + if (real___xstat64) return real___xstat64(_STAT_VER, path, buf); + errno = ENOSYS; + return -1; +} + +int +zvfs_real_fstat(int fd, struct stat *buf) +{ + if (real_fstat) return real_fstat(fd, buf); + if (real___fxstat) return real___fxstat(_STAT_VER, fd, buf); + errno = ENOSYS; + return -1; +} + +int +zvfs_real_fstat64(int fd, struct stat64 *buf) +{ + if (real_fstat64) return real_fstat64(fd, buf); + if (real___fxstat64) return real___fxstat64(_STAT_VER, fd, buf); + errno = ENOSYS; + return -1; +} + +int +zvfs_real_lstat(const char *path, struct stat *buf) +{ + if (real_lstat) return real_lstat(path, buf); + if (real___lxstat) return real___lxstat(_STAT_VER, path, buf); + errno = ENOSYS; + return -1; +} + +int +zvfs_real_lstat64(const char *path, struct stat64 *buf) +{ + if (real_lstat64) return real_lstat64(path, buf); + if (real___lxstat64) return real___lxstat64(_STAT_VER, path, buf); + errno = ENOSYS; + return -1; +} + +int +zvfs_real_fstatat(int dirfd, const char *path, struct stat *buf, int flags) +{ + if (real_fstatat) return real_fstatat(dirfd, path, buf, flags); + if (real___fxstatat) return real___fxstatat(_STAT_VER, dirfd, path, buf, flags); + errno = ENOSYS; + return -1; +} + +int +zvfs_real_fstatat64(int dirfd, const char *path, struct stat64 *buf, int flags) +{ + if (real_fstatat64) return real_fstatat64(dirfd, path, buf, flags); + if (real___fxstatat64) return real___fxstatat64(_STAT_VER, dirfd, path, buf, flags); + errno = ENOSYS; + return -1; +} + /* ------------------------------------------------------------------ */ /* 路径 / fd 判断 */ /* ------------------------------------------------------------------ */ diff --git a/src/hook/zvfs_hook_init.h b/src/hook/zvfs_hook_init.h index c6f9abc..2d97aba 100644 --- a/src/hook/zvfs_hook_init.h +++ b/src/hook/zvfs_hook_init.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "fs/zvfs_sys_init.h" /* @@ -73,6 +74,17 @@ extern int (*real_fstatat)(int dirfd, const char *path, struct stat *buf, int extern int (*real_fstatat64)(int dirfd, const char *path, struct stat64 *buf, int flags); extern int (*real_statx)(int dirfd, const char *path, int flags, unsigned int mask, struct statx *buf); +/* glibc xstat fallback */ +extern int (*real___xstat)(int ver, const char *path, struct stat *buf); +extern int (*real___xstat64)(int ver, const char *path, struct stat64 *buf); +extern int (*real___fxstat)(int ver, int fd, struct stat *buf); +extern int (*real___fxstat64)(int ver, int fd, struct stat64 *buf); +extern int (*real___lxstat)(int ver, const char *path, struct stat *buf); +extern int (*real___lxstat64)(int ver, const char *path, struct stat64 *buf); +extern int (*real___fxstatat)(int ver, int dirfd, const char *path, + struct stat *buf, int flags); +extern int (*real___fxstatat64)(int ver, int dirfd, const char *path, + struct stat64 *buf, int flags); /* sync */ extern int (*real_fsync)(int fd); @@ -109,10 +121,19 @@ extern int (*real___open64)(const char *path, int flags, ...); extern int (*real___libc_open)(const char *path, int flags, ...); extern ssize_t (*real___read)(int fd, void *buf, size_t count); extern ssize_t (*real___libc_read)(int fd, void *buf, size_t count); +extern ssize_t (*real___read_nocancel)(int fd, void *buf, size_t count); +extern ssize_t (*real___pread64)(int fd, void *buf, size_t count, off64_t offset); +extern ssize_t (*real___libc_pread)(int fd, void *buf, size_t count, off64_t offset); +extern ssize_t (*real___pread64_nocancel)(int fd, void *buf, size_t count, off64_t offset); +extern ssize_t (*real___read_chk)(int fd, void *buf, size_t count, size_t buflen); +extern ssize_t (*real___pread_chk)(int fd, void *buf, size_t count, off_t offset, size_t buflen); +extern ssize_t (*real___pread64_chk)(int fd, void *buf, size_t count, off64_t offset, size_t buflen); extern ssize_t (*real___write)(int fd, const void *buf, size_t count); extern ssize_t (*real___libc_write)(int fd, const void *buf, size_t count); extern int (*real___close)(int fd); extern int (*real___libc_close)(int fd); +extern size_t (*real_fread_unlocked)(void *ptr, size_t size, size_t nmemb, FILE *stream); +extern size_t (*real_fread)(void *ptr, size_t size, size_t nmemb, FILE *stream); /* 初始化所有 real_* 指针,在 constructor 中调用 */ void zvfs_hook_init(void); @@ -127,4 +148,14 @@ int zvfs_is_zvfs_fd(int fd); * 成功返回 0,失败返回 -1 并设置 errno。 */ int zvfs_resolve_atpath(int dirfd, const char *path, char *buf, size_t bufsz); + +/* stat wrapper(优先 real_*,fallback 到 __xstat*) */ +int zvfs_real_stat(const char *path, struct stat *buf); +int zvfs_real_stat64(const char *path, struct stat64 *buf); +int zvfs_real_fstat(int fd, struct stat *buf); +int zvfs_real_fstat64(int fd, struct stat64 *buf); +int zvfs_real_lstat(const char *path, struct stat *buf); +int zvfs_real_lstat64(const char *path, struct stat64 *buf); +int zvfs_real_fstatat(int dirfd, const char *path, struct stat *buf, int flags); +int zvfs_real_fstatat64(int dirfd, const char *path, struct stat64 *buf, int flags); #endif // __ZVFS_HOOK_INIT_H__ diff --git a/src/hook/zvfs_hook_rw.c b/src/hook/zvfs_hook_rw.c index 85d03b7..9c37cb1 100644 --- a/src/hook/zvfs_hook_rw.c +++ b/src/hook/zvfs_hook_rw.c @@ -10,9 +10,14 @@ #include "spdk_engine/io_engine.h" #include +#include +#include #include #include #include +#include +#include +#include /* ------------------------------------------------------------------ */ /* 内部:单段 pread / pwrite(不修改 of->offset) */ @@ -212,6 +217,56 @@ get_of(int fd) return of; } +static size_t +zvfs_fread_impl(void *ptr, size_t size, size_t nmemb, FILE *stream, int unlocked) +{ + ZVFS_HOOK_ENTER(); + + int fd = stream ? fileno(stream) : -1; + struct zvfs_open_file *of = NULL; + + if (ZVFS_IN_HOOK() || fd < 0 || !(of = get_of(fd))) { + size_t r = 0; + if (unlocked) { + if (real_fread_unlocked) r = real_fread_unlocked(ptr, size, nmemb, stream); + else if (real_fread) r = real_fread(ptr, size, nmemb, stream); + else errno = ENOSYS; + } else { + if (real_fread) r = real_fread(ptr, size, nmemb, stream); + else if (real_fread_unlocked) r = real_fread_unlocked(ptr, size, nmemb, stream); + else errno = ENOSYS; + } + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + if (size == 0 || nmemb == 0) { + ZVFS_HOOK_LEAVE(); + return 0; + } + if (nmemb > SIZE_MAX / size) { + errno = EOVERFLOW; + ZVFS_HOOK_LEAVE(); + return 0; + } + + size_t total = size * nmemb; + ssize_t n = zvfs_pread_impl(of, ptr, total, of->offset); + if (n > 0) + of->offset += (uint64_t)n; + + /* Keep stdio state machine consistent for callers that check feof/ferror. */ + if (n < 0) { + stream->_flags |= _IO_ERR_SEEN; + } else if ((size_t)n < total) { + stream->_flags |= _IO_EOF_SEEN; + } + ZVFS_HOOK_LEAVE(); + return (n <= 0) ? 0 : ((size_t)n / size); +} + /* ------------------------------------------------------------------ */ /* read */ /* ------------------------------------------------------------------ */ @@ -221,7 +276,7 @@ read(int fd, void *buf, size_t count) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_read(fd, buf, count); ZVFS_HOOK_LEAVE(); @@ -240,6 +295,15 @@ read(int fd, void *buf, size_t count) ssize_t __read(int fd, void *buf, size_t count) { return read(fd, buf, count); } ssize_t __libc_read(int fd, void *buf, size_t count) { return read(fd, buf, count); } +ssize_t __read_nocancel(int fd, void *buf, size_t count) { return read(fd, buf, count); } +ssize_t __read_chk(int fd, void *buf, size_t count, size_t buflen) +{ + if (count > buflen) { + errno = ERANGE; + return -1; + } + return read(fd, buf, count); +} /* ------------------------------------------------------------------ */ /* pread / pread64 */ @@ -250,7 +314,7 @@ pread(int fd, void *buf, size_t count, off_t offset) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_pread(fd, buf, count, offset); ZVFS_HOOK_LEAVE(); @@ -269,6 +333,49 @@ ssize_t pread64(int fd, void *buf, size_t count, off_t offset) return pread(fd, buf, count, offset); } +ssize_t __pread64(int fd, void *buf, size_t count, off_t offset) +{ + return pread(fd, buf, count, offset); +} + +ssize_t __libc_pread(int fd, void *buf, size_t count, off_t offset) +{ + return pread(fd, buf, count, offset); +} + +ssize_t __pread64_nocancel(int fd, void *buf, size_t count, off_t offset) +{ + return pread(fd, buf, count, offset); +} + +ssize_t __pread_chk(int fd, void *buf, size_t count, off_t offset, size_t buflen) +{ + if (count > buflen) { + errno = ERANGE; + return -1; + } + return pread(fd, buf, count, offset); +} + +ssize_t __pread64_chk(int fd, void *buf, size_t count, off_t offset, size_t buflen) +{ + if (count > buflen) { + errno = ERANGE; + return -1; + } + return pread(fd, buf, count, offset); +} + +size_t fread_unlocked(void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + return zvfs_fread_impl(ptr, size, nmemb, stream, 1); +} + +size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + return zvfs_fread_impl(ptr, size, nmemb, stream, 0); +} + /* ------------------------------------------------------------------ */ /* readv / preadv / preadv64 / preadv2 */ /* ------------------------------------------------------------------ */ @@ -278,7 +385,7 @@ readv(int fd, const struct iovec *iov, int iovcnt) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_readv(fd, iov, iovcnt); ZVFS_HOOK_LEAVE(); @@ -300,7 +407,7 @@ preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_preadv(fd, iov, iovcnt, offset); ZVFS_HOOK_LEAVE(); @@ -324,7 +431,7 @@ preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_preadv2 ? real_preadv2(fd, iov, iovcnt, offset, flags) @@ -358,7 +465,7 @@ write(int fd, const void *buf, size_t count) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_write(fd, buf, count); ZVFS_HOOK_LEAVE(); @@ -423,7 +530,7 @@ pwrite(int fd, const void *buf, size_t count, off_t offset) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_pwrite(fd, buf, count, offset); ZVFS_HOOK_LEAVE(); @@ -454,7 +561,7 @@ writev(int fd, const struct iovec *iov, int iovcnt) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_writev(fd, iov, iovcnt); ZVFS_HOOK_LEAVE(); @@ -501,7 +608,7 @@ pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_pwritev(fd, iov, iovcnt, offset); ZVFS_HOOK_LEAVE(); @@ -525,7 +632,7 @@ pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) { ZVFS_HOOK_ENTER(); - struct zvfs_open_file *of; + struct zvfs_open_file *of = NULL; if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { ssize_t r = real_pwritev2 ? real_pwritev2(fd, iov, iovcnt, offset, flags) diff --git a/src/hook/zvfs_hook_rw.h b/src/hook/zvfs_hook_rw.h index 49a942d..ca18484 100644 --- a/src/hook/zvfs_hook_rw.h +++ b/src/hook/zvfs_hook_rw.h @@ -4,6 +4,7 @@ #include #include #include +#include /* * read / write 族。 @@ -46,7 +47,16 @@ ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, /* glibc 别名 */ ssize_t __read(int fd, void *buf, size_t count); ssize_t __libc_read(int fd, void *buf, size_t count); +ssize_t __read_nocancel(int fd, void *buf, size_t count); +ssize_t __pread64(int fd, void *buf, size_t count, off_t offset); +ssize_t __libc_pread(int fd, void *buf, size_t count, off_t offset); +ssize_t __pread64_nocancel(int fd, void *buf, size_t count, off_t offset); +ssize_t __read_chk(int fd, void *buf, size_t count, size_t buflen); +ssize_t __pread_chk(int fd, void *buf, size_t count, off_t offset, size_t buflen); +ssize_t __pread64_chk(int fd, void *buf, size_t count, off_t offset, size_t buflen); ssize_t __write(int fd, const void *buf, size_t count); ssize_t __libc_write(int fd, const void *buf, size_t count); +size_t fread_unlocked(void *ptr, size_t size, size_t nmemb, FILE *stream); +size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream); -#endif // __ZVFS_HOOK_RW_H__ \ No newline at end of file +#endif // __ZVFS_HOOK_RW_H__ diff --git a/src/hook/zvfs_hook_stat.c b/src/hook/zvfs_hook_stat.c index 70be1b5..0c83aa6 100644 --- a/src/hook/zvfs_hook_stat.c +++ b/src/hook/zvfs_hook_stat.c @@ -106,7 +106,7 @@ stat(const char *path, struct stat *buf) ZVFS_HOOK_ENTER(); if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { - int r = real_stat(path, buf); + int r = zvfs_real_stat(path, buf); ZVFS_HOOK_LEAVE(); return r; } @@ -114,7 +114,7 @@ stat(const char *path, struct stat *buf) zvfs_ensure_init(); /* 先透传,拿到完整 stat(mode、ino、dev、nlink 等) */ - if (real_stat(path, buf) < 0) { + if (zvfs_real_stat(path, buf) < 0) { ZVFS_HOOK_LEAVE(); return -1; } @@ -138,14 +138,14 @@ stat64(const char *path, struct stat64 *buf) ZVFS_HOOK_ENTER(); if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { - int r = real_stat64(path, buf); + int r = zvfs_real_stat64(path, buf); ZVFS_HOOK_LEAVE(); return r; } zvfs_ensure_init(); - if (real_stat64(path, buf) < 0) { + if (zvfs_real_stat64(path, buf) < 0) { ZVFS_HOOK_LEAVE(); return -1; } @@ -168,7 +168,7 @@ fstat(int fd, struct stat *buf) ZVFS_HOOK_ENTER(); /* 先透传:拿到 mode/ino/dev/nlink/blksize 等 */ - if (real_fstat(fd, buf) < 0) { + if (zvfs_real_fstat(fd, buf) < 0) { ZVFS_HOOK_LEAVE(); return -1; } @@ -196,7 +196,7 @@ fstat64(int fd, struct stat64 *buf) { ZVFS_HOOK_ENTER(); - if (real_fstat64(fd, buf) < 0) { + if (zvfs_real_fstat64(fd, buf) < 0) { ZVFS_HOOK_LEAVE(); return -1; } @@ -229,14 +229,14 @@ lstat(const char *path, struct stat *buf) ZVFS_HOOK_ENTER(); if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { - int r = real_lstat(path, buf); + int r = zvfs_real_lstat(path, buf); ZVFS_HOOK_LEAVE(); return r; } zvfs_ensure_init(); - if (real_lstat(path, buf) < 0) { + if (zvfs_real_lstat(path, buf) < 0) { ZVFS_HOOK_LEAVE(); return -1; } @@ -255,14 +255,14 @@ lstat64(const char *path, struct stat64 *buf) ZVFS_HOOK_ENTER(); if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { - int r = real_lstat64(path, buf); + int r = zvfs_real_lstat64(path, buf); ZVFS_HOOK_LEAVE(); return r; } zvfs_ensure_init(); - if (real_lstat64(path, buf) < 0) { + if (zvfs_real_lstat64(path, buf) < 0) { ZVFS_HOOK_LEAVE(); return -1; } @@ -292,7 +292,7 @@ fstatat(int dirfd, const char *path, struct stat *buf, int flags) is_zvfs = zvfs_is_zvfs_path(abspath); } - if (real_fstatat(dirfd, path, buf, flags) < 0) { + if (zvfs_real_fstatat(dirfd, path, buf, flags) < 0) { ZVFS_HOOK_LEAVE(); return -1; } @@ -321,7 +321,7 @@ fstatat64(int dirfd, const char *path, struct stat64 *buf, int flags) is_zvfs = zvfs_is_zvfs_path(abspath); } - if (real_fstatat64(dirfd, path, buf, flags) < 0) { + if (zvfs_real_fstatat64(dirfd, path, buf, flags) < 0) { ZVFS_HOOK_LEAVE(); return -1; } diff --git a/src/spdk_engine/io_engine.c b/src/spdk_engine/io_engine.c index ec470c2..b2c25ac 100644 --- a/src/spdk_engine/io_engine.c +++ b/src/spdk_engine/io_engine.c @@ -12,13 +12,14 @@ #include #include #include +#include struct zvfs_spdk_io_engine g_engine = {0}; static int g_engine_init_rc = -EAGAIN; -static pthread_mutex_t g_super_blob_mutex = PTHREAD_MUTEX_INITIALIZER; -static spdk_blob_id g_super_blob_id_cache = SPDK_BLOBID_INVALID; static __thread struct zvfs_tls_ctx tls = {0}; +static pthread_once_t g_tls_cleanup_once = PTHREAD_ONCE_INIT; +static pthread_key_t g_tls_cleanup_key; // 初始化操作上下文 struct json_load_ctx { @@ -54,9 +55,6 @@ struct md_op_ctx { struct { // for delete spdk_blob_id blob_id; } delete; - struct { // for get/set super - spdk_blob_id blob_id; - } super; }; char *op_name; }; @@ -67,9 +65,37 @@ struct io_completion_ctx { int rc; }; +struct md_poller_bootstrap_ctx { + const char *bdev_name; + pthread_mutex_t mu; + pthread_cond_t cv; + bool done; + int rc; +}; + +static uint64_t now_mono_ms(void); +static int open_bdev_and_init_bs(const char *bdev_name); +static void ensure_tls_cleanup_key(void); +static void tls_cleanup_destructor(void *arg); + // metadata poller 线程函数 static void *md_poller_fn(void *arg) { + struct md_poller_bootstrap_ctx *boot = arg; + spdk_set_thread(g_engine.md_thread); + tls.thread = g_engine.md_thread; + + int init_rc = open_bdev_and_init_bs(boot->bdev_name); + pthread_mutex_lock(&boot->mu); + boot->rc = init_rc; + boot->done = true; + pthread_cond_signal(&boot->cv); + pthread_mutex_unlock(&boot->mu); + + if (init_rc != 0) { + return NULL; + } + while (true) { spdk_thread_poll(g_engine.md_thread, 0, 0); usleep(1000); @@ -77,14 +103,21 @@ static void *md_poller_fn(void *arg) { return NULL; } +static uint64_t now_mono_ms(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000ULL + (uint64_t)ts.tv_nsec / 1000000ULL; +} + // 前向声明 static struct spdk_io_channel *get_current_channel(void); static int dispatch_md_op(struct md_op_ctx *ctx); -static int dispatch_md_op_quiet(struct md_op_ctx *ctx); static void md_op_cb(void *arg); -static int open_bdev_and_init_bs(const char *bdev_name); static int load_json_config(void); static int ensure_engine_ready(const char *op); +static int ensure_current_spdk_thread(const char *op); + + // callbacks static void json_app_load_done(int rc, void *arg); @@ -97,8 +130,6 @@ static void blob_sync_md_cb(void *arg, int rc); static void blob_close_cb(void *arg, int rc); static void blob_delete_cb(void *arg, int rc); static void io_completion_cb(void *arg, int rc); -static void blob_get_super_cb(void *arg, spdk_blob_id blobid, int rc); -static void blob_set_super_cb(void *arg, int rc); // op functions on matadata static void blob_create_on_md(struct md_op_ctx *ctx); @@ -107,8 +138,6 @@ static void blob_resize_on_md(struct md_op_ctx *ctx); static void blob_sync_md_on_md(struct md_op_ctx *ctx); static void blob_close_on_md(struct md_op_ctx *ctx); static void blob_delete_on_md(struct md_op_ctx *ctx); -static void blob_get_super_on_md(struct md_op_ctx *ctx); -static void blob_set_super_on_md(struct md_op_ctx *ctx); __attribute__((constructor)) static void preload_init(void) { const char *auto_init = getenv("ZVFS_AUTO_INIT"); @@ -116,7 +145,6 @@ __attribute__((constructor)) static void preload_init(void) { return; } - printf("\n\n auto init \n\n"); const char *bdev_name = getenv("SPDK_BDEV_NAME") ? getenv("SPDK_BDEV_NAME") : ZVFS_BDEV; g_engine_init_rc = io_engine_init(bdev_name); if (g_engine_init_rc != 0) { @@ -125,7 +153,7 @@ __attribute__((constructor)) static void preload_init(void) { } static int wait_done(bool *done_ptr, int *rc_ptr, const char *op) { - int iter = 0; + const uint64_t deadline_ms = now_mono_ms() + ZVFS_WAIT_TIME; while (!*done_ptr) { if (tls.thread) { spdk_thread_poll(tls.thread, 0, 0); @@ -133,7 +161,8 @@ static int wait_done(bool *done_ptr, int *rc_ptr, const char *op) { SPDK_ERRLOG("not init tls.thread\n"); return -EBADE; } - if (++iter > WAITER_MAX_TIME) { + + if (now_mono_ms() >= deadline_ms) { SPDK_ERRLOG("%s timeout\n", op); return -ETIMEDOUT; } @@ -147,15 +176,24 @@ static int wait_done(bool *done_ptr, int *rc_ptr, const char *op) { } static int wait_done_volatile(volatile bool *done_ptr, int *rc_ptr, const char *op) { - int iter = 0; - while (!*done_ptr) { + const uint64_t deadline_ms = now_mono_ms() + ZVFS_WAIT_TIME; + bool logged_no_tls = false; + while (!__atomic_load_n(done_ptr, __ATOMIC_ACQUIRE)) { if (tls.thread) { spdk_thread_poll(tls.thread, 0, 0); - }else{ - SPDK_ERRLOG("not init tls.thread\n"); - return -EBADE; + } else { + /* + * md ops are executed on g_engine.md_thread by md_poller_fn. + * If current worker TLS is not initialized, we still need to wait + * for callback completion; returning early can invalidate stack ctx. + */ + if (!logged_no_tls) { + SPDK_NOTICELOG("%s: tls.thread not initialized, waiting on md thread only\n", op); + logged_no_tls = true; + } + usleep(1000); } - if (++iter > WAITER_MAX_TIME) { + if (now_mono_ms() >= deadline_ms) { SPDK_ERRLOG("%s timeout\n", op); return -ETIMEDOUT; } @@ -168,25 +206,6 @@ static int wait_done_volatile(volatile bool *done_ptr, int *rc_ptr, const char * return 0; } -// no rc error -static int wait_done_volatile_quiet(volatile bool *done_ptr, int *rc_ptr, const char *op) { - int iter = 0; - while (!*done_ptr) { - if (tls.thread) { - spdk_thread_poll(tls.thread, 0, 0); - } else { - SPDK_ERRLOG("not init tls.thread\n"); - return -EBADE; - } - if (++iter > WAITER_MAX_TIME) { - SPDK_ERRLOG("%s timeout\n", op); - return -ETIMEDOUT; - } - } - - return *rc_ptr; -} - int io_engine_init(const char *bdev_name) { if (g_engine_init_rc == 0 && g_engine.bs != NULL && g_engine.md_thread != NULL) { return 0; @@ -239,22 +258,40 @@ int io_engine_init(const char *bdev_name) { return g_engine_init_rc; } - // 起专用 poller pthread for md_thread + struct md_poller_bootstrap_ctx boot = { + .bdev_name = bdev_name, + .done = false, + .rc = 0, + }; + pthread_mutex_init(&boot.mu, NULL); + pthread_cond_init(&boot.cv, NULL); + + // 起专用 poller pthread for md_thread(并在该线程完成 bdev/blobstore 初始化) pthread_t md_poller_tid; - if (pthread_create(&md_poller_tid, NULL, md_poller_fn, NULL) != 0) { + if (pthread_create(&md_poller_tid, NULL, md_poller_fn, &boot) != 0) { SPDK_ERRLOG("pthread_create for md_poller failed\n"); + pthread_cond_destroy(&boot.cv); + pthread_mutex_destroy(&boot.mu); g_engine_init_rc = -1; return g_engine_init_rc; } if (pthread_detach(md_poller_tid) != 0) { SPDK_ERRLOG("pthread_detach for md_poller failed\n"); + pthread_cond_destroy(&boot.cv); + pthread_mutex_destroy(&boot.mu); g_engine_init_rc = -1; return g_engine_init_rc; } - // init bdev/bs - g_super_blob_id_cache = SPDK_BLOBID_INVALID; - int rc = open_bdev_and_init_bs(bdev_name); + pthread_mutex_lock(&boot.mu); + while (!boot.done) { + pthread_cond_wait(&boot.cv, &boot.mu); + } + int rc = boot.rc; + pthread_mutex_unlock(&boot.mu); + pthread_cond_destroy(&boot.cv); + pthread_mutex_destroy(&boot.mu); + if (rc != 0) { g_engine_init_rc = rc; return rc; @@ -283,19 +320,12 @@ static struct spdk_io_channel *get_current_channel(void) { return NULL; } - if (tls.thread) { - spdk_thread_poll(tls.thread, 0, 0); + if (ensure_current_spdk_thread("get_current_channel") != 0) { + return NULL; } - if (!tls.thread) { - char name[32]; - snprintf(name, sizeof(name), "worker_%lu", pthread_self()); - tls.thread = spdk_thread_create(name, NULL); - if (!tls.thread) { - SPDK_ERRLOG("spdk_thread_create failed\n"); - return NULL; - } - spdk_set_thread(tls.thread); + if (tls.thread) { + spdk_thread_poll(tls.thread, 0, 0); } if (!tls.channel) { @@ -308,33 +338,107 @@ static struct spdk_io_channel *get_current_channel(void) { return tls.channel; } +static void put_current_channel(struct spdk_io_channel *ch) { + if (!ch) { + return; + } + spdk_put_io_channel(ch); + if (tls.thread) { + spdk_thread_poll(tls.thread, 0, 0); + } + if (tls.channel == ch) { + tls.channel = NULL; + } +} + +static void ensure_tls_cleanup_key(void) { + (void)pthread_key_create(&g_tls_cleanup_key, tls_cleanup_destructor); +} + +static void tls_cleanup_destructor(void *arg) { + (void)arg; + if (!tls.thread || tls.thread == g_engine.md_thread) { + return; + } + + spdk_set_thread(tls.thread); + + if (tls.channel) { + spdk_put_io_channel(tls.channel); + tls.channel = NULL; + } + + spdk_thread_exit(tls.thread); + const uint64_t deadline_ms = now_mono_ms() + ZVFS_WAIT_TIME; + while (!spdk_thread_is_exited(tls.thread)) { + spdk_thread_poll(tls.thread, 0, 0); + if (now_mono_ms() >= deadline_ms) { + SPDK_ERRLOG("worker tls thread exit timeout\n"); + break; + } + usleep(1000); + } + + if (spdk_thread_is_exited(tls.thread)) { + spdk_thread_destroy(tls.thread); + } + tls.thread = NULL; + pthread_setspecific(g_tls_cleanup_key, NULL); +} + +static int ensure_current_spdk_thread(const char *op) { + pthread_once(&g_tls_cleanup_once, ensure_tls_cleanup_key); + + if (!tls.thread) { + char name[32]; + snprintf(name, sizeof(name), "worker_%lu", (unsigned long)pthread_self()); + tls.thread = spdk_thread_create(name, NULL); + if (!tls.thread) { + SPDK_ERRLOG("%s: spdk_thread_create failed\n", op); + return -ENOMEM; + } + pthread_setspecific(g_tls_cleanup_key, (void *)1); + } + spdk_set_thread(tls.thread); + return 0; +} + // 通用 dispatch md op static int dispatch_md_op(struct md_op_ctx *ctx) { int rc = ensure_engine_ready(ctx->op_name ? ctx->op_name : "dispatch_md_op"); if (rc != 0) { return rc; } - - ctx->done = false; - ctx->rc = 0; - - spdk_thread_send_msg(g_engine.md_thread, md_op_cb, ctx); - - return wait_done_volatile(&ctx->done, &ctx->rc, ctx->op_name); -} - -static int dispatch_md_op_quiet(struct md_op_ctx *ctx) { - int rc = ensure_engine_ready(ctx->op_name ? ctx->op_name : "dispatch_md_op_quiet"); + rc = ensure_current_spdk_thread(ctx->op_name ? ctx->op_name : "dispatch_md_op"); if (rc != 0) { return rc; } - ctx->done = false; - ctx->rc = 0; + struct md_op_ctx *async_ctx = malloc(sizeof(*async_ctx)); + if (!async_ctx) { + return -ENOMEM; + } + *async_ctx = *ctx; + __atomic_store_n(&async_ctx->done, false, __ATOMIC_RELAXED); + async_ctx->rc = 0; - spdk_thread_send_msg(g_engine.md_thread, md_op_cb, ctx); + rc = spdk_thread_send_msg(g_engine.md_thread, md_op_cb, async_ctx); - return wait_done_volatile_quiet(&ctx->done, &ctx->rc, ctx->op_name); + if (rc != 0) { + SPDK_ERRLOG("%s: spdk_thread_send_msg failed: %d\n", async_ctx->op_name, rc); + free(async_ctx); + return rc; + } + + rc = wait_done_volatile(&async_ctx->done, &async_ctx->rc, async_ctx->op_name); + if (rc == -ETIMEDOUT) { + SPDK_ERRLOG("%s timeout; keep async ctx alive to avoid UAF\n", async_ctx->op_name); + return rc; + } + + *ctx = *async_ctx; + free(async_ctx); + return rc; } static int ensure_engine_ready(const char *op) { @@ -438,111 +542,12 @@ static int open_bdev_and_init_bs(const char *bdev_name) { return 0; } -static void blob_get_super_cb(void *arg, spdk_blob_id blobid, int rc) { - struct md_op_ctx *ctx = arg; - ctx->rc = rc; - ctx->super.blob_id = blobid; - ctx->done = true; -} - -static void blob_set_super_cb(void *arg, int rc) { - struct md_op_ctx *ctx = arg; - ctx->rc = rc; - ctx->done = true; -} - -static void blob_get_super_on_md(struct md_op_ctx *ctx) { - spdk_bs_get_super(g_engine.bs, blob_get_super_cb, ctx); -} - -static void blob_set_super_on_md(struct md_op_ctx *ctx) { - spdk_bs_set_super(g_engine.bs, ctx->super.blob_id, blob_set_super_cb, ctx); -} - -static int bs_get_super_id(spdk_blob_id *blob_id) { - struct md_op_ctx ctx = { - .fn = blob_get_super_on_md, - .op_name = "blob get super", - }; - ctx.super.blob_id = SPDK_BLOBID_INVALID; - - int rc = dispatch_md_op_quiet(&ctx); - if (rc != 0) { - return rc; - } - *blob_id = ctx.super.blob_id; - return 0; -} - -static int bs_set_super_id(spdk_blob_id blob_id) { - struct md_op_ctx ctx = { - .fn = blob_set_super_on_md, - .op_name = "blob set super", - }; - ctx.super.blob_id = blob_id; - return dispatch_md_op(&ctx); -} - -struct zvfs_blob_handle *blob_get_super(void) { - pthread_mutex_lock(&g_super_blob_mutex); - - if (g_super_blob_id_cache != SPDK_BLOBID_INVALID) { - struct zvfs_blob_handle *cached = blob_open(g_super_blob_id_cache); - if (cached) { - pthread_mutex_unlock(&g_super_blob_mutex); - return cached; - } - g_super_blob_id_cache = SPDK_BLOBID_INVALID; - } - - spdk_blob_id super_id = SPDK_BLOBID_INVALID; - int rc = bs_get_super_id(&super_id); - if (rc == 0 && super_id != SPDK_BLOBID_INVALID) { - g_super_blob_id_cache = super_id; - struct zvfs_blob_handle *existing = blob_open(super_id); - if (!existing) { - g_super_blob_id_cache = SPDK_BLOBID_INVALID; - } - pthread_mutex_unlock(&g_super_blob_mutex); - return existing; - } - if (rc == 0 && super_id == SPDK_BLOBID_INVALID) { - rc = -ENOENT; - } - - if (rc != -ENOENT) { - SPDK_ERRLOG("spdk_bs_get_super failed: %d\n", rc); - pthread_mutex_unlock(&g_super_blob_mutex); - return NULL; - } - - struct zvfs_blob_handle *created = blob_create(0); - if (!created) { - pthread_mutex_unlock(&g_super_blob_mutex); - return NULL; - } - - rc = bs_set_super_id(created->id); - if (rc != 0) { - spdk_blob_id created_id = created->id; - SPDK_ERRLOG("spdk_bs_set_super failed: %d\n", rc); - blob_close(created); - blob_delete(created_id); - pthread_mutex_unlock(&g_super_blob_mutex); - return NULL; - } - - g_super_blob_id_cache = created->id; - pthread_mutex_unlock(&g_super_blob_mutex); - return created; -} - // blob_create static void blob_create_cb(void *arg, spdk_blob_id blobid, int rc) { struct md_op_ctx *ctx = arg; ctx->rc = rc; ctx->create.blob_id = blobid; - ctx->done = true; + __atomic_store_n(&ctx->done, true, __ATOMIC_RELEASE); } static void blob_create_on_md(struct md_op_ctx *ctx) { @@ -556,13 +561,17 @@ struct zvfs_blob_handle *blob_create(uint64_t size_hint) { if(size_hint == 0) size_hint = g_engine.cluster_size; struct md_op_ctx ctx = {.fn = blob_create_on_md, .create.size_hint = size_hint, .op_name = "blob create"}; int rc = dispatch_md_op(&ctx); - if (rc) return NULL; + if (rc) { + errno = (rc < 0) ? -rc : EIO; + return NULL; + } struct zvfs_blob_handle *handle = blob_open(ctx.create.blob_id); if (handle && size_hint > 0) { rc = blob_resize(handle, size_hint); // 初始 resize if (rc != 0) { SPDK_ERRLOG("blob_resize failed after create: %d\n", rc); + errno = (rc < 0) ? -rc : EIO; blob_close(handle); return NULL; } @@ -570,6 +579,7 @@ struct zvfs_blob_handle *blob_create(uint64_t size_hint) { rc = blob_sync_md(handle); if (rc != 0) { SPDK_ERRLOG("blob_sync_md failed after resize: %d\n", rc); + errno = (rc < 0) ? -rc : EIO; blob_close(handle); return NULL; } @@ -582,7 +592,7 @@ static void blob_open_cb(void *arg, struct spdk_blob *blob, int rc) { struct md_op_ctx *ctx = arg; ctx->rc = rc; ctx->open.blob = blob; - ctx->done = true; + __atomic_store_n(&ctx->done, true, __ATOMIC_RELEASE); } static void blob_open_on_md(struct md_op_ctx *ctx) { @@ -594,7 +604,10 @@ static void blob_open_on_md(struct md_op_ctx *ctx) { struct zvfs_blob_handle *blob_open(uint64_t blob_id) { struct md_op_ctx ctx = {.fn = blob_open_on_md, .open.blob_id = blob_id, .op_name = "blob open"}; int rc = dispatch_md_op(&ctx); - if (rc) return NULL; + if (rc) { + errno = (rc < 0) ? -rc : EIO; + return NULL; + } struct zvfs_blob_handle *handle = malloc(sizeof(*handle)); if (!handle) return NULL; @@ -628,15 +641,18 @@ int blob_write(struct zvfs_blob_handle *handle, uint64_t offset, const void *buf spdk_thread_poll(tls.thread, 0, 0); } + if (len == 0) return 0; + struct spdk_io_channel *ch = get_current_channel(); if (!ch) return -1; - if (len == 0) return 0; + int ret = 0; // 越界检查 if (offset + len > handle->size) { SPDK_ERRLOG("blob_write out of range: offset=%lu len=%zu blob_size=%lu\n", offset, len, handle->size); - return -ERANGE; + ret = -ERANGE; + goto out; } // 计算对齐后的 IO 范围和 dma_buf 内偏移 @@ -646,13 +662,15 @@ int blob_write(struct zvfs_blob_handle *handle, uint64_t offset, const void *buf int rc = zvfs_calc_io_units(offset, len, g_engine.io_unit_size, &lba_off, &lba_len, &buf_off); if (rc != 0) { SPDK_ERRLOG("blob_write calc_io_units failed: %d\n", rc); - return rc; + ret = rc; + goto out; } size_t aligned_bytes = lba_len * g_engine.io_unit_size; if (aligned_bytes > ZVFS_DMA_BUF_SIZE) { SPDK_ERRLOG("blob_write aligned_bytes=%zu exceeds ZVFS_DMA_BUF_SIZE\n", aligned_bytes); - return -ENOSPC; + ret = -ENOSPC; + goto out; } struct io_completion_ctx io_ctx = {.done = false, .rc = 0}; @@ -662,7 +680,10 @@ int blob_write(struct zvfs_blob_handle *handle, uint64_t offset, const void *buf rc = wait_done(&io_ctx.done, &io_ctx.rc, "io_write(read phase)"); - if (rc != 0) return rc; + if (rc != 0) { + ret = rc; + goto out; + } memcpy((uint8_t *)handle->dma_buf + buf_off, buf, len); io_ctx.done = false; @@ -671,9 +692,15 @@ int blob_write(struct zvfs_blob_handle *handle, uint64_t offset, const void *buf spdk_blob_io_write(handle->blob, ch, handle->dma_buf, lba_off, lba_len, io_completion_cb, &io_ctx); rc = wait_done(&io_ctx.done, &io_ctx.rc, "io_write(write phase)"); - if (rc != 0) return rc; + if (rc != 0) { + ret = rc; + goto out; + } - return io_ctx.rc; + ret = io_ctx.rc; +out: + put_current_channel(ch); + return ret; } // blob_read 类似 @@ -681,16 +708,19 @@ int blob_read(struct zvfs_blob_handle *handle, uint64_t offset, void *buf, size_ if (tls.thread) { spdk_thread_poll(tls.thread, 0, 0); } - + + if (len == 0) return 0; + struct spdk_io_channel *ch = get_current_channel(); if (!ch) return -1; - if (len == 0) return 0; - + int ret = 0; + // 越界检查 if (offset + len > handle->size) { SPDK_ERRLOG("blob_read out of range: offset=%lu len=%zu blob_size=%lu\n", offset, len, handle->size); - return -ERANGE; + ret = -ERANGE; + goto out; } @@ -701,14 +731,16 @@ int blob_read(struct zvfs_blob_handle *handle, uint64_t offset, void *buf, size_ int rc = zvfs_calc_io_units(offset, len, g_engine.io_unit_size, &lba_off, &lba_len, &buf_off); if (rc != 0) { SPDK_ERRLOG("io_read offset/len not aligned to io_unit_size=%lu\n", g_engine.io_unit_size); - return rc; + ret = rc; + goto out; } // 读入对齐范围到 dma_buf,再从正确偏移处截取到用户 buf size_t aligned_bytes = lba_len * g_engine.io_unit_size; if (aligned_bytes > ZVFS_DMA_BUF_SIZE) { SPDK_ERRLOG("blob_read aligned_bytes=%zu exceeds ZVFS_DMA_BUF_SIZE\n", aligned_bytes); - return -ENOSPC; + ret = -ENOSPC; + goto out; } struct io_completion_ctx io_ctx = {.done = false, .rc = 0}; @@ -717,17 +749,23 @@ int blob_read(struct zvfs_blob_handle *handle, uint64_t offset, void *buf, size_ io_completion_cb, &io_ctx); rc = wait_done(&io_ctx.done, &io_ctx.rc, "io_read"); - if (rc != 0) return rc; + if (rc != 0) { + ret = rc; + goto out; + } memcpy(buf, (uint8_t *)handle->dma_buf + buf_off, len); - return io_ctx.rc; + ret = io_ctx.rc; +out: + put_current_channel(ch); + return ret; } // blob_resize static void blob_resize_cb(void *arg, int rc) { struct md_op_ctx *ctx = arg; ctx->rc = rc; - ctx->done = true; + __atomic_store_n(&ctx->done, true, __ATOMIC_RELEASE); } static void blob_resize_on_md(struct md_op_ctx *ctx) { @@ -736,7 +774,7 @@ static void blob_resize_on_md(struct md_op_ctx *ctx) { int rc = zvfs_calc_ceil_units(ctx->handle_op.new_size, cluster_size, &new_clusters); if (rc != 0) { ctx->rc = rc; - ctx->done = true; + __atomic_store_n(&ctx->done, true, __ATOMIC_RELEASE); return; } spdk_blob_resize(ctx->handle_op.handle->blob, new_clusters, blob_resize_cb, ctx); @@ -759,7 +797,7 @@ int blob_resize(struct zvfs_blob_handle *handle, uint64_t new_size) { static void blob_sync_md_cb(void *arg, int rc) { struct md_op_ctx *ctx = arg; ctx->rc = rc; - ctx->done = true; + __atomic_store_n(&ctx->done, true, __ATOMIC_RELEASE); } static void blob_sync_md_on_md(struct md_op_ctx *ctx) { @@ -776,7 +814,7 @@ int blob_sync_md(struct zvfs_blob_handle *handle) { static void blob_close_cb(void *arg, int rc) { struct md_op_ctx *ctx = arg; ctx->rc = rc; - ctx->done = true; + __atomic_store_n(&ctx->done, true, __ATOMIC_RELEASE); } static void blob_close_on_md(struct md_op_ctx *ctx) { @@ -798,7 +836,7 @@ int blob_close(struct zvfs_blob_handle *handle) { static void blob_delete_cb(void *arg, int rc) { struct md_op_ctx *ctx = arg; ctx->rc = rc; - ctx->done = true; + __atomic_store_n(&ctx->done, true, __ATOMIC_RELEASE); } static void blob_delete_on_md(struct md_op_ctx *ctx) { diff --git a/src/spdk_engine/io_engine.h b/src/spdk_engine/io_engine.h index e9c4dfc..c5a80d0 100644 --- a/src/spdk_engine/io_engine.h +++ b/src/spdk_engine/io_engine.h @@ -31,7 +31,6 @@ typedef struct zvfs_tls_ctx { int io_engine_init(const char *bdev_name); -struct zvfs_blob_handle *blob_get_super(void); struct zvfs_blob_handle *blob_create(uint64_t size_hint); // 创建并 open,返回 handle struct zvfs_blob_handle *blob_open(uint64_t blob_id); // open 现有 blob,返回 handle int blob_write(struct zvfs_blob_handle *handle, uint64_t offset, const void *buf, size_t len); diff --git a/src/zvfsmalloc.json b/src/zvfsmalloc.json index 10ded9d..69925d7 100755 --- a/src/zvfsmalloc.json +++ b/src/zvfsmalloc.json @@ -7,7 +7,7 @@ "method": "bdev_malloc_create", "params": { "name": "Malloc0", - "num_blocks": 32768, + "num_blocks": 262140, "block_size": 512 } }