From c33a694bd86aa50f8f58e6954b3874a04d28265b Mon Sep 17 00:00:00 2001 From: 1iaan Date: Mon, 2 Mar 2026 14:45:24 +0000 Subject: [PATCH] zvfs: hook db_bench thread 1 complete --- plan.md | 232 --- plan/baseline_commands.md | 98 ++ plan/phase1_validation.md | 168 ++ plan/phase2_validation.md | 82 + plan/plan.md | 249 +++ plan/rocksdb_syscall_matrix.md | 87 ++ rocksdb.md | 13 +- test/Makefile | 3 +- test/test_phase2_posix.c | 98 ++ zvfs/zvfs.c | 443 +++--- zvfs/zvfs.h | 56 +- zvfs/zvfs_hook.c | 2614 +++++++++++++++++++++++++++----- 12 files changed, 3349 insertions(+), 794 deletions(-) delete mode 100644 plan.md create mode 100644 plan/baseline_commands.md create mode 100644 plan/phase1_validation.md create mode 100644 plan/phase2_validation.md create mode 100644 plan/plan.md create mode 100644 plan/rocksdb_syscall_matrix.md create mode 100644 test/test_phase2_posix.c mode change 100755 => 100644 zvfs/zvfs_hook.c diff --git a/plan.md b/plan.md deleted file mode 100644 index b3f8d98..0000000 --- a/plan.md +++ /dev/null @@ -1,232 +0,0 @@ -# ZVFS LD_PRELOAD -> RocksDB 加速实施计划(给 Codex 执行) - -> 目标:基于当前 `zvfs/zvfs.c` + `zvfs/zvfs_hook.c`,从“单线程 + 接口不全”演进到“可支撑 RocksDB 多线程并有性能收益”。 -> -> 约束:**分阶段、可中断、可重入**;每阶段都必须有独立可验证的正确性门槛。 - ---- - -## 0. 执行规则(必须遵守) - -1. 一次只做一个阶段;阶段内通过验收前不得进入下一阶段。 -2. 每阶段结束后必须更新本文“阶段状态”。 -3. 每阶段保留一个“恢复锚点”: - - 可编译的代码状态; - - 一组固定验证命令; - - 一条阶段 commit(建议)。 -4. 出现阻塞时只回滚当前阶段改动,不回滚已完成阶段。 -5. 非 `/zvfs` 路径行为必须始终透传到 libc,不得回归。 - ---- - -## 1. 阶段状态(可重入入口) - -- [ ] Phase 1: 建立基线与 syscall 覆盖清单(RocksDB 实际需求) -- [ ] Phase 2: 修正现有 hook 语义缺口(不改线程模型) -- [ ] Phase 3: 补齐 RocksDB P0 接口集(单线程可跑通) -- [ ] Phase 4: 补齐 RocksDB P1 接口集(目录/锁/截断) -- [ ] Phase 5: 多线程执行模型改造(Worker + 请求队列) -- [ ] Phase 6: 并发正确性与崩溃恢复强化 -- [ ] Phase 7: 面向 RocksDB 的性能优化与验收 - -> 重入方式:中断后先看本区,继续第一个未完成阶段。 - ---- - -## 2. 当前代码关键问题(来自 `zvfs.c`/`zvfs_hook.c`) - -1. 单线程瓶颈:所有请求依赖 `global_thread + waiter` 同步轮询,调用线程直接 `spdk_thread_poll`,不适合并发。 -2. 
hook 覆盖不足:当前仅 `open/read/write/close/unlink/lseek`,RocksDB 常用接口大量缺失(如 `pread/pwrite/fstat/fsync/fdatasync/ftruncate/rename/openat/fcntl` 等)。 -3. 语义缺口:`O_TRUNC/O_APPEND/errno` 等语义不完整;元数据保存与文件操作一致性较弱。 -4. 线程安全缺口:`g_fs`、`fd_table`、`dirent`、`open_count` 等无锁并发访问。 - ---- - -## 3. 分阶段计划 - -## Phase 1: 建立基线与 syscall 覆盖清单(RocksDB 实际需求) - -### 目标 -明确 RocksDB 在当前环境下真实调用了哪些文件接口,得到“必须实现”的优先级列表。 - -### 任务 -1. 编译基线:`make`、`make -C test`。 -2. 跑现有回归:`make run-test`(普通路径和 `/zvfs` 路径各一轮)。 -3. 用 `strace -f` 跑 `db_bench`(或最小 RocksDB workload),导出 syscall 统计。 -4. 产出 `docs/rocksdb-syscall-matrix.md`(P0/P1/P2 分类 + 是否已支持)。 - -### 验收 -- 能给出可复现命令与 syscall 清单。 -- 明确哪些接口是“阻塞 RocksDB 跑通”的 P0。 - -### 中断/重入 -- 产物文件存在:`docs/rocksdb-syscall-matrix.md`。 -- 下一次从该清单继续,不需要重跑全量分析。 - ---- - -## Phase 2: 修正现有 hook 语义缺口(不改线程模型) - -### 目标 -在保持单线程架构不变的前提下,先把已有接口的 POSIX 语义修正到可用状态。 - -### 任务 -1. 修复 `open` 标志位语义:至少覆盖 `O_CREAT/O_EXCL/O_TRUNC/O_APPEND`。 -2. 统一返回值与 `errno`:`zvfs_*` 失败路径映射到标准 errno。 -3. 修复元数据 I/O 基础问题(如加载/保存边界、错误传播、close 使用 real 函数)。 -4. 增加小型语义回归测试(可放 `test/`)。 - -### 验收 -- 现有 `test` 全通过。 -- 新增语义测试通过。 -- 非 `/zvfs` 路径行为无回归。 - -### 中断/重入 -- 保留旧逻辑兼容开关(如宏开关)直到本阶段稳定。 -- 提交后可独立回退,不影响后续接口扩展。 - ---- - -## Phase 3: 补齐 RocksDB P0 接口集(单线程可跑通) - -### 目标 -先实现 RocksDB “必须有才能启动并跑基础 workload” 的接口集合。 - -### P0 接口(优先) -- 打开类:`open64/openat/openat64/__open_2/__open64_2` -- 偏移 I/O:`pread/pread64/pwrite/pwrite64` -- 元数据:`stat/lstat/fstat/fstatat/access` -- 持久化:`fsync/fdatasync` -- 变更:`rename/renameat/unlinkat` - -### 任务 -1. 在 `zvfs_hook.c` 增加 real 函数指针与统一初始化(建议 `pthread_once`)。 -2. 所有新 hook 必须支持“路径过滤 + 非 `/zvfs` 透传”。 -3. 对不支持的语义明确返回 `ENOTSUP/EOPNOTSUPP`,禁止静默成功。 -4. 
增加 `pread/pwrite` 偏移语义测试。 - -### 验收 -- `db_bench` 单线程基础项可跑:`fillseq/fillrandom/readrandom`。 -- `strace` 显示 P0 接口已被正确接管或透传。 - -### 中断/重入 -- 每新增一类 hook 单独 commit(open/io/meta/sync/rename)。 -- 中断后按未完成类别继续,不影响已完成类别。 - ---- - -## Phase 4: 补齐 RocksDB P1 接口集(目录/锁/截断) - -### 目标 -支持 RocksDB 更完整运行路径,尤其是锁文件、目录操作、截断相关语义。 - -### P1 接口 -- `ftruncate/truncate` -- `fcntl`(至少 `F_SETLK/F_SETLKW/F_GETLK/F_UNLCK`) -- `mkdir/rmdir/opendir/readdir/closedir` -- `link/symlink/readlink`(按 strace 结果决定) - -### 任务 -1. 为目录与锁引入最小可用实现(先保证正确,再优化)。 -2. 对暂不支持特性返回明确 errno,不可假成功。 -3. 补充目录/锁语义测试(多进程或多线程最小场景)。 - -### 验收 -- `db_bench --threads=4` 可稳定执行基础 workload。 -- 无明显语义错误(锁冲突、目录丢失、truncate 异常)。 - -### 中断/重入 -- `fcntl` 与目录接口分成两个子里程碑。 -- 任一子里程碑完成即可落盘并停在该点。 - ---- - -## Phase 5: 多线程执行模型改造(Worker + 请求队列) - -### 目标 -把当前“调用线程主动 poll”的模式改为“专用 worker poll + 线程安全请求提交”,解决单线程瓶颈。 - -### 任务 -1. 新增 `zvfs_worker`:专用线程、请求队列、完成通知(cond/futex/eventfd 均可)。 -2. 将 `waiter` 路径替换为 `submit_and_wait` 路径;调用线程不再直接 `spdk_thread_poll`。 -3. 增加并发保护: - - 全局锁:`g_fs/mount/fd_table/dirent`; - - 文件锁:`offset/blob 生命周期`; - - 明确锁顺序避免死锁。 -4. 保留回退开关(例如 `ZVFS_USE_LEGACY_WAITER`)直到压测稳定。 - -### 验收 -- 线程数 4/8 下功能测试稳定,无死锁/崩溃。 -- CPU 火焰图或日志能证明调用线程不再承担 SPDK poll。 - -### 中断/重入 -- 先实现 worker 生命周期,再迁移 read/write,再迁移 open/close。 -- 每迁移一类操作即可独立验证与提交。 - ---- - -## Phase 6: 并发正确性与崩溃恢复强化 - -### 目标 -在多线程基础上补齐一致性:元数据、删除/关闭竞态、异常退出后的可恢复性。 - -### 任务 -1. 元数据持久化改为“原子写入流程”(临时文件 + fsync + rename)。 -2. 修复 `unlink/close/open` 并发竞态(引用计数与删除时机)。 -3. 建立故障注入测试:`kill -9`、中途断电模拟(最小可复现脚本)。 -4. 明确恢复策略与错误可观测日志。 - -### 验收 -- 故障注入后可重新挂载并读到一致元数据。 -- 并发 `open/unlink/close` 压测无崩溃无悬挂。 - -### 中断/重入 -- 先落地元数据原子写入,再处理并发删除,再做故障注入。 - ---- - -## Phase 7: 面向 RocksDB 的性能优化与验收 - -### 目标 -在正确性稳定后进行性能优化,并给出“确实加速 RocksDB”的证据。 - -### 任务 -1. 优化优先级: - - 对齐写 fast-path 与非对齐 RMW 优化; - - DMA buffer 复用/池化; - - 减少全局锁粒度; - - 批量/延迟元数据刷新策略。 -2. 构建统一 benchmark 脚本: - - 对照组 A:不使用 `LD_PRELOAD`; - - 对照组 B:`LD_PRELOAD=./libzvfs.so` + `/zvfs` 路径。 -3. 
指标:吞吐(ops/s)、P99 延迟、CPU 使用率、失败率。 - -### 验收(最终目标) -- 在至少一个 RocksDB workload 上达到可重复的性能提升(建议目标 >= 1.3x,最终以实测为准)。 -- 提供完整报告:命令、环境、结果表、结论与剩余瓶颈。 - -### 中断/重入 -- 每个优化项必须可单独开关,可单独回滚。 - ---- - -## 4. 阶段验收矩阵(执行时打勾) - -- [ ] P1 已产出 syscall matrix 且可复现。 -- [ ] P2 已完成现有语义修复且回归通过。 -- [ ] P3 已实现 P0 接口并单线程跑通 RocksDB。 -- [ ] P4 已实现 P1 接口并多线程稳定运行。 -- [ ] P5 已切换到 worker 并通过并发稳定性测试。 -- [ ] P6 已通过崩溃恢复与并发竞态测试。 -- [ ] P7 已完成性能对照并证明加速收益。 - ---- - -## 5. 每阶段固定输出模板(执行时复用) - -1. 改动清单(文件 + 关键点)。 -2. 验证命令与结果。 -3. 风险/已知问题。 -4. 阶段状态勾选更新。 -5. 下一阶段入口条件是否满足。 - diff --git a/plan/baseline_commands.md b/plan/baseline_commands.md new file mode 100644 index 0000000..49da87c --- /dev/null +++ b/plan/baseline_commands.md @@ -0,0 +1,98 @@ +# Phase 0 - 基线命令与验证记录 + +- 日期: 2026-03-02 +- 主机: `ubuntu`(本地开发机) +- db_bench 路径: `/home/lian/env/rocksdb-test/db_bench` +- LD_PRELOAD 库: `/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so` + +## 1) 成功基线(不启用 LD_PRELOAD) + +### 命令 +```bash +rm -rf /tmp/rdb_phase0_plain && \ +strace -f -qq -o /tmp/phase0_plain.strace \ + -e trace=%file,%desc,fsync,fdatasync,ftruncate,fallocate,pread64,pwrite64,lseek,rename,renameat,renameat2 \ + /home/lian/env/rocksdb-test/db_bench \ + --benchmarks=fillseq \ + --db=/tmp/rdb_phase0_plain \ + --num=5000 \ + --value_size=128 \ + --threads=1 \ + --compression_type=none \ + --stats_interval_seconds=0 +``` + +### 结果摘要 +- 命令退出码: `0` +- db_bench 输出: `fillseq ... 
5000 operations`(成功) +- errno 分布(trace 全局): + - `ENOENT x11` + - `EEXIST x9` +- 关键 syscall(全局计数): + - `openat 91`, `fcntl 82`, `pread64 8`, `fsync 10`, `fdatasync 10`, `ftruncate 5`, `fallocate 5`, `rename 9` +- DB 路径相关 syscall(`/tmp/rdb_phase0_plain`): + - `openat 61`, `mkdir 12`, `access 10`, `rename 9`, `unlink 12`, `rmdir 2` + +## 2) 失败分支(启用 LD_PRELOAD,/zvfs 路径) + +### 命令 +```bash +strace -f -qq -o /tmp/phase0_preload.strace \ + -e trace=%file,%desc,fsync,fdatasync,ftruncate,fallocate,pread64,pwrite64,lseek,rename,renameat,renameat2 \ + env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so \ + /home/lian/env/rocksdb-test/db_bench \ + --benchmarks=fillseq \ + --db=/zvfs/rdb_phase0_preload \ + --num=1000 \ + --value_size=128 \ + --threads=1 \ + --compression_type=none \ + --stats_interval_seconds=0 +``` + +### 结果摘要 +- 命令退出码: `1` +- 关键报错: + - `Cannot use IOVA as 'PA'` + - `Failed to initialize DPDK` + - `open error: ... While mkdir if missing: /zvfs/rdb_phase0_preload: No such file or directory` +- `/zvfs/rdb_phase0_preload` 相关错误码: + - `ENOENT x4` +- 直接证据(trace): + - `openat(..., "/zvfs/rdb_phase0_preload", ... 
O_DIRECTORY) = -1 ENOENT` + - `mkdir("/zvfs/rdb_phase0_preload", 0755) = -1 ENOENT` + +## 3) 提取统计的复用命令 + +### 3.1 syscall 频次(全局) +```bash +sed -E 's/^\[pid +[0-9]+\] +//; s/^[0-9]+ +//' /tmp/phase0_plain.strace \ + | grep -oP '^[a-zA-Z_][a-zA-Z0-9_]*(?=\()' \ + | sort | uniq -c | sort -nr +``` + +### 3.2 errno 分布 +```bash +grep -oP '= -1 [A-Z0-9]+' /tmp/phase0_plain.strace | awk '{print $3}' | sort | uniq -c | sort -nr +grep -oP '= -1 [A-Z0-9]+' /tmp/phase0_preload.strace | awk '{print $3}' | sort | uniq -c | sort -nr +``` + +### 3.3 DB 路径相关 syscall +```bash +grep '/tmp/rdb_phase0_plain' /tmp/phase0_plain.strace \ + | sed -E 's/^\[pid +[0-9]+\] +//; s/^[0-9]+ +//' \ + | grep -oP '^[a-zA-Z_][a-zA-Z0-9_]*(?=\()' \ + | sort | uniq -c | sort -nr +``` + +## 4) Phase0 正确性验证清单(执行记录) + +- [x] 每个“phase2 必须实现 syscall”都在矩阵中给出复现证据或来源。 +- [x] 基线命令可重复执行(成功路径与失败路径各 1 组)。 +- [x] 输出包含 syscall 统计与错误码分布。 +- [x] 验证记录已写入 `plan/baseline_commands.md`。 + +## 5) 已知限制(phase1 前需处理) + +1. 当前环境中 SPDK 初始化失败(IOVA/PA),导致无法在本机直接完成完整 `/zvfs` I/O 链路验证。 +2. phase0 主要覆盖 `fillseq`;phase1/phase2 开始前应补 `readrandom/overwrite` 的 syscall 采样。 diff --git a/plan/phase1_validation.md b/plan/phase1_validation.md new file mode 100644 index 0000000..19ab4be --- /dev/null +++ b/plan/phase1_validation.md @@ -0,0 +1,168 @@ +# Phase 1 验证方案(用户执行) + +> 背景:本轮已完成 phase1 代码改造,但当前代理环境无法在 root + SPDK 运行条件下完成端到端验证。下面步骤请你在本机执行。 + +## 0) 目标 + +验证 3 件事: +1. `zvfs_io_req` 解耦后,`read/write` 旧路径行为不回退。 +2. 新增 `zvfs_pread/zvfs_pwrite` API 能正确处理 offset I/O。 +3. 
构建产物 `libzvfs.so` 可正常编译链接。 + +--- + +## 1) 构建验证 + +```bash +cd /home/lian/share/10.1-spdk/zvfs +make -C zvfs -j +``` + +期望: +- 编译成功,无报错。 +- 生成 `/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so`。 + +--- + +## 2) Phase1 新 API 验证(不依赖 LD_PRELOAD hook) + +### 2.1 生成临时测试程序 + +```bash +cat >/tmp/phase1_api_check.c <<'EOF' +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "zvfs.h" + +static int expect_eq(const char *name, const void *got, const void *exp, size_t n) { + if (memcmp(got, exp, n) != 0) { + fprintf(stderr, "[FAIL] %s mismatch\n", name); + return -1; + } + printf("[OK] %s\n", name); + return 0; +} + +int main(void) { + int rc = 1; + int mounted = 0; + int created = 0; + if (zvfs_env_setup() != 0) { + fprintf(stderr, "zvfs_env_setup failed\n"); + return rc; + } + + zvfs_t *fs = calloc(1, sizeof(*fs)); + zvfs_file_t *file = calloc(1, sizeof(*file)); + zvfs_dirent_t *dirent = calloc(1, sizeof(*dirent)); + if (!fs || !file || !dirent) { + rc = 2; + goto out; + } + + if (!zvfs_mount(fs)) { + fprintf(stderr, "zvfs_mount failed\n"); + rc = 3; + goto out; + } + mounted = 1; + + file->fs = fs; + file->dirent = dirent; + if (!zvfs_create(file)) { + fprintf(stderr, "zvfs_create failed\n"); + rc = 4; + goto out; + } + created = 1; + + /* 验证 pwrite + pread 的 offset 语义 */ + const char *a = "AAAA"; + const char *b = "BBBB"; + if (zvfs_pwrite(file, (const uint8_t *)a, 4, 0) != 4) { rc = 5; goto out; } + if (zvfs_pwrite(file, (const uint8_t *)b, 4, 8) != 4) { rc = 6; goto out; } + + uint8_t got[12] = {0}; + uint8_t exp[12] = {'A','A','A','A',0,0,0,0,'B','B','B','B'}; + if (zvfs_pread(file, got, sizeof(got), 0) != 12) { rc = 7; goto out; } + if (expect_eq("pread/pwrite-hole", got, exp, sizeof(exp)) != 0) { rc = 8; goto out; } + + /* 验证旧 read/write 顺序语义未回退 */ + file->current_offset = 0; + const char *c = "CCCC"; + if (zvfs_write(file, (const uint8_t *)c, 4) != 4) { rc = 9; goto out; } + file->current_offset = 0; + uint8_t got2[4] = {0}; + if (zvfs_read(file, got2, sizeof(got2)) != 4) { rc = 
10; goto out; } + if (expect_eq("read/write-seq", got2, c, 4) != 0) { rc = 11; goto out; } + + rc = 0; + printf("[PASS] phase1_api_check\n"); + +out: + if (created) { + (void)zvfs_close(file); + (void)zvfs_delete(file); + } + if (mounted) { + (void)zvfs_umount(fs); + } + + free(dirent); + free(file); + free(fs); + return rc; +} +EOF +``` + +### 2.2 编译并运行 + +```bash +gcc -O2 -Wall -Wextra -std=gnu11 \ + -I/home/lian/share/10.1-spdk/zvfs/zvfs \ + -I/home/lian/share/10.1-spdk/zvfs/spdk/include \ + -o /tmp/phase1_api_check /tmp/phase1_api_check.c \ + /home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so -ldl + +sudo /tmp/phase1_api_check +``` + +期望: +- 输出包含: + - `[OK] pread/pwrite-hole` + - `[OK] read/write-seq` + - `[PASS] phase1_api_check` + +--- + +## 3) 旧 POSIX 路径回归(LD_PRELOAD) + +```bash +cd /home/lian/share/10.1-spdk/zvfs +make -C test -j + +sudo env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_basic /zvfs +sudo env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_lseek /zvfs +sudo env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_two_files /zvfs +``` + +期望: +- `test_basic`、`test_lseek`、`test_two_files` 通过。 +- 无崩溃、无明显数据错乱。 + +若出现一次性 `zvfs_mount failed`,建议先确保没有残留测试进程,再重跑一次验证命令。 + +--- + +## 4) 验证完成后回填 + +请把验证结果回填到 `plan/plan.md` 的 phase1 区域: +- `### 阶段验收` +- `### 正确性验证方案` + +若任何一项失败,请附上: +1. 命令; +2. 错误输出; +3. 
是否可稳定复现。 diff --git a/plan/phase2_validation.md b/plan/phase2_validation.md new file mode 100644 index 0000000..0e8271e --- /dev/null +++ b/plan/phase2_validation.md @@ -0,0 +1,82 @@ +# Phase 2 验证方案(用户执行) + +目标:验证 phase2 新增 hook 与 POSIX 语义(openat/pread/pwrite/fsync/ftruncate/stat/rename 等)。 + +## 1) 构建 + +```bash +cd /home/lian/share/10.1-spdk/zvfs +make -C zvfs -j +make -C test -j +``` + +## 2) 关键回归(LD_PRELOAD) + +```bash +cd /home/lian/share/10.1-spdk/zvfs +sudo env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_basic /zvfs +sudo env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_lseek /zvfs +sudo env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_two_files /zvfs +sudo env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs +``` + +期望: +- 以上 4 个测试全部 PASSED。 +- `test_phase2_posix` 会覆盖: + - `mkdir/openat/close/rmdir` + - `pread/pwrite` + 稀疏洞校验 + - `fsync/fdatasync` + - `ftruncate` + - `fstat/stat/access` + - `rename` + - `O_EXCL` 与只读 fd 上 `write` 的 errno + +## 3) db_bench 最小验证 + +```bash +cd /home/lian/share/10.1-spdk/zvfs +sudo env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so \ + /home/lian/env/rocksdb-test/db_bench \ + --benchmarks=fillseq \ + --db=/zvfs/rdb_phase2 \ + --num=200000 \ + --value_size=128 \ + --threads=1 \ + --compression_type=none \ + --stats_interval_seconds=5 + + +sudo /home/lian/env/rocksdb-test/db_bench \ + --benchmarks=fillseq \ + --db=/tmp/rdb_native \ + --num=200000 \ + --value_size=128 \ + --threads=1 \ + --compression_type=none \ + --stats_interval_seconds=5 +``` + +期望: +- 不出现 `Function not implemented` +- 不出现 `Bad file descriptor` +- 不出现 `While mkdir if missing` 相关 `ENOENT` + +## 4) 失败场景建议(可选) + +用 `strace` 辅助确认关键 syscall 已被接管: + +```bash +sudo strace -f -qq -o /tmp/phase2_check.strace \ + -e trace=%file,%desc,fsync,fdatasync,ftruncate,pread64,pwrite64,rename,fcntl \ + env 
LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs +``` + +看点: +- `openat("/zvfs/...")` 不再因目录缺失直接失败 +- `rename`/`ftruncate`/`pread64`/`fdatasync` 调用链完整 + +## 5) 回填要求 + +执行后请把结果回填到 `plan/plan.md` 的 phase2 区域: +- `### 阶段验收` +- `### 正确性验证方案` diff --git a/plan/plan.md b/plan/plan.md new file mode 100644 index 0000000..281e69a --- /dev/null +++ b/plan/plan.md @@ -0,0 +1,249 @@ +# ZVFS -> RocksDB (LD_PRELOAD) 分阶段改造计划(可重入) + +本文档是给后续编码代理(我)使用的执行计划。目标是:在不破坏现有功能的前提下,逐步把 `zvfs.c + zvfs_hook.c` 改造成可支撑 RocksDB 的 LD_PRELOAD 存储层。 + +--- + +## 0. 使用规则(可重入协议) + +每次开始新一轮开发时,严格按以下顺序执行: + +1. 读取本文件,定位“当前阶段”和“未完成任务”。 +2. 先做“状态核对”: + - 代码是否已存在同名结构/函数; + - 测试是否已覆盖该阶段验收项; + - 若已完成则跳过,不重复改动(幂等)。 +3. 只推进一个阶段内的最小闭环,不跨阶段大改。 +4. 每完成一个任务,立即更新本文件: + - 勾选完成项; + - 记录关键变更文件; + - 记录剩余风险; + - 记录本阶段“正确性验证”执行结果。 +5. 若发现与计划冲突的现实约束(SPDK限制/接口差异/语义冲突),先在“变更记录”追加说明,再调整后续子任务,不直接删原计划。 +6. 非代码文档统一写入 `plan/` 目录(用户约束)。 + +--- + +## 1. 总目标与约束 + +### 1.1 总目标 +- 支持 RocksDB 关键 I/O 路径(优先 db_bench 可运行并稳定)。 +- 提供正确 POSIX 语义(至少覆盖 RocksDB 依赖子集)。 +- 从单线程串行模型演进到可并行数据面。 +- 保证崩溃后一致性(元数据持久化可恢复)。 + +### 1.2 当前主要问题(已确认) +- 单线程 + busy wait + 全局串行,队列深度近似 1。 +- hook 覆盖不足,缺少 pread/pwrite/fsync/ftruncate/openat/stat 等。 +- I/O 参数耦合在 `zvfs_file_t`,不利于 pread/pwrite 和并发。 +- 全局状态无并发保护(`g_fs/fd_table/dirents/open_count` 等)。 +- open/write/close/unlink errno 与 POSIX 语义不完整。 +- 元数据依赖宿主文本文件,容量小、非原子、不可恢复。 +- 配置路径硬编码,部署迁移性差。 + +### 1.3 强制要求(新增) +- 每个阶段都必须定义“正确性验证方案”,且在阶段结束前执行并记录结果。 +- 未完成正确性验证,不得将该阶段标记为完成。 + +--- + +## 2. 阶段总览 + +- 阶段0:基线与兼容清单(先知道 RocksDB 真实需要什么) +- 阶段1:架构解耦(file vs io_req) +- 阶段2:补齐关键 hook 与 POSIX 语义 +- 阶段3:并发模型升级(控制面/数据面分离) +- 阶段4:元数据持久化重构(super blob / 日志) +- 阶段5:性能与稳定性验收 + +--- + +## 3. 
详细阶段计划 + +## 阶段0:基线与兼容清单 + +### 目标 +- 建立“RocksDB syscall 最小集合”和“当前实现缺口表”。 + +### 任务 +- [x] 用 `strace` 或等效方式采集 `db_bench` syscall(含失败分支)。 +- [x] 形成兼容矩阵:必须实现 / 可降级透传 / 暂不支持。 +- [x] 固化第一批验收用例(最小 db_bench 参数 + 现有单测集合)。 + +### 交付物 +- [x] `plan/rocksdb_syscall_matrix.md` +- [x] `plan/baseline_commands.md` + +### 阶段验收 +- [x] 能明确列出阶段2必须完成的 hook 列表。 + +### 正确性验证方案 +- [x] 验证矩阵中的每个“必须实现 syscall”都有最小复现样例(命令或小程序)。 +- [x] 基线命令可重复执行,输出包含 syscall 统计与错误码分布。 +- [x] 验证结果记录到 `plan/baseline_commands.md`(含日期和环境信息)。 + +--- + +## 阶段1:架构解耦(关键) + +### 目标 +- 把 I/O 请求参数从 `zvfs_file_t` 中拆出,建立请求对象,先打通 pread/pwrite 内核路径。 + +### 任务 +- [x] 新增 `zvfs_io_req`(字段至少:op/buf/len/offset/flags/result/errno/finished)。 +- [x] 重构 `zvfs_read/zvfs_write` 为基于 `zvfs_io_req` 的通用入口。 +- [x] 实现内部 `zvfs_pread_internal/zvfs_pwrite_internal`,不改 hook 先可调用。 +- [x] 移除 `zvfs_file_t` 中仅一次请求有效的临时字段(或标记弃用)。 + +### 交付物 +- [x] `zvfs/zvfs.h` 新结构与接口 +- [x] `zvfs/zvfs.c` 读写路径重构完成 +- [x] `plan/phase1_validation.md`(用户侧验证步骤) + +### 阶段验收 +- [x] 现有 read/write/lseek 测试全通过。 +- [x] 新增 pread/pwrite 单测通过(可先直连接口,不经 hook)。 + +### 正确性验证方案 +- [x] 新旧接口结果一致性校验:同一输入下 `read/write` 与 `pread/pwrite` 数据一致。 +- [x] 边界测试:offset=0、EOF、跨 page、非对齐、空写入。 +- [x] 失败路径测试:非法 fd/参数时返回值与 `errno` 符合预期。 + +--- + +## 阶段2:补齐 hook 与 POSIX 语义 + +### 目标 +- 满足 RocksDB 最小兼容 API 集,保证语义与 errno 正确。 + +### 首批必须实现 +- [x] `pread/pwrite`(及 `pread64/pwrite64` 视平台符号而定) +- [x] `open/open64/openat` +- [x] `fsync/fdatasync` +- [x] `ftruncate` +- [x] `fstat/stat/lstat`(至少满足 RocksDB 元数据查询) +- [x] `rename`(原子替换语义) + +### 语义修复 +- [x] `open` 支持 `O_TRUNC/O_APPEND/O_EXCL` +- [x] 权限检查(`O_RDONLY/O_WRONLY/O_RDWR`) +- [x] 返回值与 `errno` 映射统一(失败路径不可吞错) + +### 交付物 +- [x] `zvfs/zvfs_hook.c`(phase2 hook 覆盖与语义修复) +- [x] `test/test_phase2_posix.c`(phase2 POSIX 回归用例) +- [x] `plan/phase2_validation.md`(用户侧验证步骤) + +### 阶段验收 +- [ ] db_bench 基础 workload 可跑通(单线程先行)。 +- [ ] 不支持的 syscall 必须明确透传且行为可解释。 + +### 正确性验证方案 +- [ ] 每个新增 hook 至少 1 个正例 + 1 个反例(权限/参数/不存在文件)。 +- [ ] `open` 语义验证:`O_TRUNC/O_APPEND/O_EXCL` 
行为与本地文件系统对齐。 +- [ ] `errno` 对照验证:对关键失败场景做预期值断言。 +- [ ] 跑最小 `db_bench`,确认无 “Function not implemented/Bad file descriptor” 类错误。 + +--- + +## 阶段3:并发模型升级 + +### 目标 +- 保持 hook 层阻塞语义,但底层可并行提交处理。 + +### 任务 +- [ ] 设计并实现 `控制面线程 + N个数据面worker`。 +- [ ] 每 worker 使用独立 io_channel。 +- [ ] 引入并发保护:fd_table、dirents、open_count、全局挂载状态。 +- [ ] 修复生命周期竞态(close/unlink 并发、延迟删除)。 + +### 阶段验收 +- [ ] 多线程压测下无崩溃/无明显数据错乱。 +- [ ] QD>1 场景吞吐显著高于阶段2。 + +### 正确性验证方案 +- [ ] 多线程读写一致性校验(文件内容 hash 或区块比对)。 +- [ ] 并发场景稳定性:长时间压测无崩溃、无死锁、无句柄泄漏。 +- [ ] 竞态回归:`close/unlink`、双开同文件、并发 append 场景正确。 + +--- + +## 阶段4:元数据持久化重构 + +### 目标 +- 去掉宿主文本元数据文件,转向 blobstore 内部可恢复元数据。 + +### 任务 +- [ ] 使用 super blob(或单独 metadata blob)管理目录与 inode-like 信息。 +- [ ] 建立日志或 copy-on-write 更新流程,支持崩溃恢复。 +- [ ] 对 `create/unlink/rename/truncate` 实现原子更新策略。 + +### 阶段验收 +- [ ] 强制中断后重启可恢复一致目录和文件大小信息。 +- [ ] 不再依赖固定绝对路径元数据文件。 + +### 正确性验证方案 +- [ ] 故障注入:在 create/write/rename/truncate 中间点中断后重启验证一致性。 +- [ ] 元数据回放验证:目录项数量、文件大小、blob_id 映射正确。 +- [ ] 对比验证:与中断前快照(或日志)比对差异可解释。 + +--- + +## 阶段5:性能与稳定性验收 + +### 目标 +- 形成“可用 + 可解释 + 可回归”的最终版本。 + +### 任务 +- [ ] buffer 管理优化(池化、减少拷贝、减少重复 preread)。 +- [ ] 完整回归:现有单测 + 新增 hook 语义测试 + db_bench 组合。 +- [ ] 输出性能报告(吞吐、延迟、CPU、错误率)。 + +### 阶段验收 +- [ ] 关键 workload 稳定运行,结果可复现。 + +### 正确性验证方案 +- [ ] 全量回归连续执行至少 3 轮,结果一致且无新增失败。 +- [ ] 性能结果包含波动范围(平均值与离散度),可复现实验命令。 +- [ ] 最终发布前执行一次“从空盘到回归完成”的冷启动验证流程。 + +--- + +## 4. 当前阶段状态 + +- 当前阶段:`阶段2` +- 阶段状态:`pending_user_validation` +- 本轮目标:用户按 `plan/phase2_validation.md` 执行 phase2 正确性验证 + +--- + +## 5. 
每轮执行后必须更新的记录 + +## 5.1 变更记录(按时间追加) +- [x] 2026-03-02: 完成 phase0(db_bench syscall 基线 + 失败分支 + 兼容矩阵);文件:`plan/plan.md`、`plan/rocksdb_syscall_matrix.md`、`plan/baseline_commands.md`;风险:当前环境 SPDK 初始化失败(IOVA PA 不可用),需在 phase1 前确定运行环境策略 +- [x] 2026-03-02: 完成 phase1 代码改造(file/io_req 解耦 + 新增 pread/pwrite API);文件:`zvfs/zvfs.h`、`zvfs/zvfs.c`、`plan/plan.md`、`plan/phase1_validation.md`;风险:本轮未在 root+LD_PRELOAD 环境完成端到端验证 +- [x] 2026-03-02: 根据用户反馈修复 phase1 边界问题(preread 失败时将临时缓冲区清零,避免洞区读到脏数据);文件:`zvfs/zvfs.c`、`plan/phase1_validation.md`;风险:仍需用户侧端到端复测确认 +- [x] 2026-03-02: 根据用户复测继续修复 sparse hole 问题(offset>EOF 时清零本次覆盖页内 gap 区间);文件:`zvfs/zvfs.c`、`plan/plan.md`;风险:跨多页大洞写入语义仍需在 phase2 做专项覆盖 +- [x] 2026-03-02: 完成 phase2 代码改造(补齐 openat/pread/pwrite/fsync/ftruncate/stat/rename/fcntl/mkdir/rmdir 等 hook 与关键语义);文件:`zvfs/zvfs_hook.c`、`zvfs/zvfs.h`、`test/test_phase2_posix.c`、`test/Makefile`、`plan/phase2_validation.md`、`plan/plan.md`;风险:尚未在用户环境完成 db_bench 端到端验收 +- [x] 2026-03-02: 根据用户 db_bench 反馈修复目录打开语义(目录允许 O_RDONLY 打开,不再强制要求 O_DIRECTORY);文件:`zvfs/zvfs_hook.c`;风险:仍需用户复测 db_bench +- [x] 2026-03-02: 根据用户 db_bench 反馈补齐 fadvise/fallocate/sync_file_range hook,避免 pseudo-fd 被内核路径误判为 EBADF;文件:`zvfs/zvfs_hook.c`、`zvfs/zvfs.h`;风险:仍需用户复测 db_bench + +## 5.2 风险清单(持续维护) +- [ ] 线程模型改造可能引入 SPDK thread affinity 问题 +- [ ] rename/truncate 语义与 blobstore 能力映射复杂 +- [ ] LD_PRELOAD 多符号拦截顺序可能受 libc/应用实现影响 + +## 5.3 决策记录(ADR-lite) +- [ ] D-001: 为什么选择控制面/数据面分离 +- [ ] D-002: 为什么选择 super blob 元数据格式 +- [ ] D-003: 不支持 syscall 的透传策略与边界 + +--- + +## 6. 
完成定义(DoD) + +满足以下条件才可标记“计划完成”: +- [ ] 阶段0~5全部验收项勾选完成 +- [ ] RocksDB db_bench 至少 3 类 workload 稳定通过 +- [ ] 关键 POSIX 语义测试通过并有失败用例说明 +- [ ] 文档包含部署方式、限制项、回归命令 diff --git a/plan/rocksdb_syscall_matrix.md b/plan/rocksdb_syscall_matrix.md new file mode 100644 index 0000000..7eccf56 --- /dev/null +++ b/plan/rocksdb_syscall_matrix.md @@ -0,0 +1,87 @@ +# Phase 0 - RocksDB syscall 兼容矩阵 + +- 日期: 2026-03-02 +- 采样对象: `/home/lian/env/rocksdb-test/db_bench` +- 当前 LD_PRELOAD: `/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so` +- 说明: 本矩阵基于 phase0 的 `fillseq` 最小工作负载和失败分支采样;用于定义 phase2 必做 hook 范围。 + +## 1) 采样结论(关键) + +### 1.1 成功基线(不启用 LD_PRELOAD,`--db=/tmp/rdb_phase0_plain`) +关键 syscall 计数(来自 strace 汇总): + +| syscall | count | +|---|---:| +| openat | 91 | +| fcntl | 82 | +| read | 56 | +| fstat | 56 | +| getdents64 | 36 | +| stat | 14 | +| mkdir | 12 | +| unlink | 12 | +| access | 11 | +| fsync | 10 | +| fdatasync | 10 | +| rename | 9 | +| pread64 | 8 | +| ftruncate | 5 | +| fallocate | 5 | +| lseek | 1 | + +路径级(仅匹配 DB 路径)高频调用: +- `openat`, `mkdir`, `access`, `rename`, `unlink`, `rmdir` + +### 1.2 失败分支(启用 LD_PRELOAD,`--db=/zvfs/rdb_phase0_preload`) +观察到的关键失败: +- SPDK 初始化失败:`Cannot use IOVA as PA`,随后 `spdk_env_init` 失败。 +- RocksDB 目录创建路径使用 `openat + mkdir`,对 `/zvfs/rdb_phase0_preload` 返回 `ENOENT`。 +- 与 DB 路径相关错误码分布:`ENOENT x4`。 + +结论:仅 hook `open/read/write/close/unlink/lseek` 无法支撑 RocksDB;至少要覆盖目录/元数据/同步/随机读写相关 syscall。 + +## 2) 当前实现 vs RocksDB 需求 + +当前已实现 hook(`zvfs_hook.c`): +- `open`, `read`, `write`, `close`, `unlink`, `lseek` + +当前缺失但在 phase0 中已观测到(或由 RocksDB 常规路径强依赖): +- `openat/openat64`, `mkdir`, `rmdir` +- `pread64/pwrite64`(`pwrite` 在 fillseq 未显式出现,但 RocksDB 通常依赖) +- `fsync/fdatasync` +- `ftruncate/fallocate` +- `fcntl`(文件锁) +- `stat/fstat/lstat/newfstatat`(含 `statx` 兼容策略) +- `rename`(CURRENT / OPTIONS / MANIFEST 原子更新) +- `access/faccessat`, `unlinkat` + +## 3) Phase 2 必做清单(来自 phase0 证据) + +> 这是“必须实现或必须可正确透传”的最小集合。 + +| API/syscall | 证据来源 | 当前状态 | Phase2 目标 | +|---|---|---|---| 
+| open/open64 | 现有 hook + RocksDB 文件创建 | 部分支持 | 补齐 flag 语义(`O_TRUNC/O_APPEND/O_EXCL`) | +| openat/openat2 | phase0 strace 高频 | 缺失 | 必须支持 `/zvfs` 路径 | +| close | 已实现 | 支持 | 保持并修复错误传播 | +| read/write | 已实现 | 支持 | 保持 | +| pread64/pwrite64 | phase0 观测到 pread64 | 缺失 | 必须支持 offset I/O | +| fsync/fdatasync | phase0 高频 | 缺失 | 必须支持(至少语义正确) | +| ftruncate | phase0 观测到 | 缺失 | 必须支持 | +| fallocate | phase0 观测到 | 缺失 | 必须支持或明确降级策略 | +| fcntl(F_SETLK等) | phase0 高频 | 缺失 | 必须支持最小锁语义 | +| stat/fstat/lstat/newfstatat | phase0 高频 | 缺失 | 必须支持最小元数据语义 | +| mkdir/rmdir | phase0 路径级高频 | 缺失 | 必须支持 `/zvfs` 目录层 | +| rename | phase0 路径级高频 | 缺失 | 必须支持原子替换语义 | +| unlink/unlinkat | unlink 已有,unlinkat 缺失 | 部分支持 | 补齐 unlinkat | +| access/faccessat | phase0 路径级高频 | 缺失 | 必须支持或一致透传 | + +## 4) 可降级透传(phase2 可先不接管) + +- `getdents64`, `readlink`, `statfs/fstatfs` 可先透传(前提:不作用于 `/zvfs` 或语义可接受)。 +- 若作用于 `/zvfs` 路径,必须在 phase2 给出明确行为(支持或返回可解释错误)。 + +## 5) 风险与备注 + +1. 当前机器环境下 SPDK 初始化存在 IOVA/PA 限制,影响“真实 I/O 路径”验证,需要在 phase1 前先明确运行环境策略。 +2. 仅凭 `fillseq` 不能覆盖全部 syscall;phase2 开始前建议补 `readrandom/overwrite` 采样以确认 `pwrite64` 等调用比例。 diff --git a/rocksdb.md b/rocksdb.md index 65bd411..8771b17 100644 --- a/rocksdb.md +++ b/rocksdb.md @@ -3,7 +3,16 @@ sudo apt install -y build-essential cmake git \ libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev \ liblz4-dev libzstd-dev librocksdb-dev +mkdir -p ~/env/rocksdb-test +cd ~/env/rocksdb-test +cp ~/env/rocksdb/db_bench . 
- - +./db_bench \ + --benchmarks=fillseq \ + --num=20000000 \ + --value_size=400 \ + --compression_type=none \ + --statistics \ + --stats_interval_seconds=5 \ + --threads=8 ``` \ No newline at end of file diff --git a/test/Makefile b/test/Makefile index 06f7cee..6e01e25 100755 --- a/test/Makefile +++ b/test/Makefile @@ -8,7 +8,8 @@ BINS := $(addprefix $(BIN_DIR)/,$(BIN_NAMES)) RUN_DIR ?= /tmp/zvfs-test RUN_BINS ?= test_basic test_lseek test_dual_open_same_file test_two_files \ test_single_file_perf test_single_file_random_perf \ - test_single_file_random_noaligned_perf test_write_file test_read_delete_file + test_single_file_random_noaligned_perf test_write_file test_read_delete_file \ + test_phase2_posix .PHONY: all clean list run-test diff --git a/test/test_phase2_posix.c b/test/test_phase2_posix.c new file mode 100644 index 0000000..8c4055c --- /dev/null +++ b/test/test_phase2_posix.c @@ -0,0 +1,98 @@ +#include "test_utils.h" +#include + +static int expect_errno(const char *what, int exp) +{ + if (errno != exp) { + fprintf(stderr, "%s: errno=%d expected=%d\n", what, errno, exp); + return -1; + } + return 0; +} + +static int test_phase2(const char *root) +{ + char dir[PATH_MAX]; + char file[PATH_MAX]; + char file2[PATH_MAX]; + struct stat st; + int dfd = -1; + int fd = -1; + + snprintf(dir, sizeof(dir), "%s/p2db", root); + snprintf(file, sizeof(file), "%s/p2db/data.log", root); + snprintf(file2, sizeof(file2), "%s/p2db/data2.log", root); + + (void)unlink(file2); + (void)unlink(file); + (void)rmdir(dir); + + if (mkdir(dir, 0755) != 0) { perror("mkdir"); return 1; } + dfd = open(dir, O_RDONLY | O_DIRECTORY); + if (dfd < 0) { perror("open dir"); return 2; } + + fd = openat(dfd, "data.log", O_CREAT | O_RDWR | O_TRUNC, 0644); + if (fd < 0) { perror("openat create"); return 3; } + + if (write(fd, "ABCD", 4) != 4) { perror("write"); return 4; } + if (pwrite(fd, "XYZ", 3, 8) != 3) { perror("pwrite"); return 5; } + + char buf[16] = {0}; + ssize_t n = pread(fd, buf, 11, 
0); + if (n != 11) { perror("pread"); return 6; } + if (memcmp(buf, "ABCD", 4) != 0 || buf[4] || buf[5] || buf[6] || buf[7] || + memcmp(buf + 8, "XYZ", 3) != 0) { + fprintf(stderr, "pread sparse verify failed\n"); + return 7; + } + + if (fsync(fd) != 0) { perror("fsync"); return 8; } + if (fdatasync(fd) != 0) { perror("fdatasync"); return 9; } + + if (fstat(fd, &st) != 0) { perror("fstat"); return 10; } + if (st.st_size != 11) { + fprintf(stderr, "fstat size=%ld expected=11\n", (long)st.st_size); + return 11; + } + + if (ftruncate(fd, 4) != 0) { perror("ftruncate"); return 12; } + memset(buf, 0, sizeof(buf)); + n = pread(fd, buf, 16, 0); + if (n != 4 || memcmp(buf, "ABCD", 4) != 0) { + fprintf(stderr, "truncate readback failed n=%zd\n", n); + return 13; + } + + if (rename(file, file2) != 0) { perror("rename"); return 14; } + if (access(file, F_OK) == 0 || expect_errno("access old", ENOENT) != 0) { + return 15; + } + if (access(file2, F_OK) != 0) { perror("access new"); return 16; } + + int fd2 = open(file2, O_CREAT | O_EXCL | O_RDWR, 0644); + if (fd2 >= 0 || expect_errno("open excl", EEXIST) != 0) { + if (fd2 >= 0) close(fd2); + return 17; + } + + int rd = open(file2, O_RDONLY); + if (rd < 0) { perror("open rdonly"); return 18; } + if (write(rd, "Q", 1) >= 0 || expect_errno("write rdonly", EBADF) != 0) { + close(rd); + return 19; + } + close(rd); + + close(fd); + close(dfd); + if (unlink(file2) != 0) { perror("unlink"); return 20; } + if (rmdir(dir) != 0) { perror("rmdir"); return 21; } + return 0; +} + +int main(int argc, char **argv) +{ + const char *root = argc >= 2 ? 
argv[1] : "/zvfs"; + int rc = test_phase2(root); + return report_result("test_phase2_posix", rc); +} diff --git a/zvfs/zvfs.c b/zvfs/zvfs.c index 67b67bc..885bbbe 100755 --- a/zvfs/zvfs.c +++ b/zvfs/zvfs.c @@ -35,7 +35,7 @@ void zvfs_spdk_blob_read_cb(void *arg, int bserrno); // write void zvfs_do_write(void *arg); -void zvfs_do_write_io(zvfs_file_t *file); +void zvfs_do_write_io(zvfs_io_req_t *req); void zvfs_spdk_blob_write_preread_cb(void *arg, int bserrno); void zvfs_spdk_blob_write_resize_cb(void *arg, int bserrno); void zvfs_spdk_blob_write_sync_cb(void *arg, int bserrno); @@ -58,6 +58,9 @@ void json_app_load_done(int rc, void *ctx); // unmount void zvfs_do_umount(void *arg); void zvfs_spdk_bs_unload_cb(void *arg, int bserrno); +static int zvfs_submit_io_req(zvfs_io_req_t *req, spdk_msg_fn submit_fn, const char *op_name); +static int zvfs_pread_internal(zvfs_io_req_t *req); +static int zvfs_pwrite_internal(zvfs_io_req_t *req); /* ================================================================== */ /* HELPER */ @@ -68,18 +71,15 @@ static uint64_t zvfs_need_clusters(zvfs_t *fs, uint64_t end_byte) { } /* ---------- 辅助:计算本次 IO 涉及的 LBA 范围 ---------- */ -static void calc_lba_range(zvfs_file_t *file, - uint64_t *out_lba, - uint64_t *out_page_off, - uint64_t *out_lba_count) +static void calc_lba_range(zvfs_io_req_t *req) { - uint64_t io_unit = file->fs->io_unit_size; - uint64_t off = file->current_offset; - uint64_t cnt = file->io_count; + uint64_t io_unit = req->file->fs->io_unit_size; + uint64_t off = req->offset; + uint64_t cnt = req->len; - *out_lba = off / io_unit; - *out_page_off = off % io_unit; - *out_lba_count = (*out_page_off + cnt + io_unit - 1) / io_unit; + req->lba = off / io_unit; + req->page_off = off % io_unit; + req->lba_count = (req->page_off + cnt + io_unit - 1) / io_unit; } /* ---------- 确保 dma_buf 足够大 ---------- */ @@ -124,6 +124,46 @@ bool waiter(struct spdk_thread *thread, spdk_msg_fn start_fn, void *ctx, bool *f return true; } +static int 
zvfs_submit_io_req(zvfs_io_req_t *req, spdk_msg_fn submit_fn, const char *op_name) +{ + if (req == NULL || submit_fn == NULL || global_thread == NULL) { + return -1; + } + + req->op_errno = 0; + req->result = 0; + req->finished = false; + if (req->file != NULL) { + req->file->op_errno = 0; + } + + bool ok = waiter(global_thread, submit_fn, req, &req->finished); + if (!ok) { + SPDK_ERRLOG("%s result: ok=%d\n", op_name, ok); + if (req->file != NULL) { + req->file->op_errno = -EIO; + } + return -1; + } + if (req->op_errno != 0) { + if (req->file != NULL) { + req->file->op_errno = req->op_errno; + } + return -1; + } + return (int)req->result; +} + +static int zvfs_pread_internal(zvfs_io_req_t *req) +{ + return zvfs_submit_io_req(req, zvfs_do_read, "pread"); +} + +static int zvfs_pwrite_internal(zvfs_io_req_t *req) +{ + return zvfs_submit_io_req(req, zvfs_do_write, "pwrite"); +} + /* ================================================================== */ /* MOUNT */ /* ================================================================== */ @@ -393,77 +433,83 @@ void zvfs_spdk_bs_open_blob_cb2(void *arg, struct spdk_blob *blb, int bserrno) { /* READ */ /* ================================================================== */ void zvfs_do_read(void *arg) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL || file->fs == NULL || file->blob == NULL || file->fs->channel == NULL) { - if (file != NULL) { - file->op_errno = -EINVAL; - file->io_count = 0; - file->actual_io_count = 0; - file->finished = true; + zvfs_io_req_t *req = (zvfs_io_req_t *)arg; + zvfs_file_t *file = req ? 
req->file : NULL; + if (req == NULL || file == NULL || file->fs == NULL || file->blob == NULL || + file->fs->channel == NULL || req->buf == NULL) { + if (req != NULL) { + req->op_errno = -EINVAL; + req->result = 0; + req->finished = true; } return; } - file->op_errno = 0; + req->op_errno = 0; - uint64_t io_unit = file->fs->io_unit_size; + if (req->len == 0) { + req->result = 0; + req->finished = true; + return; + } + + uint64_t io_unit = file->fs->io_unit_size; if (io_unit == 0) { - file->op_errno = -EIO; - file->actual_io_count = 0; - file->finished = true; + req->op_errno = -EIO; + req->result = 0; + req->finished = true; return; } - uint64_t offset = file->current_offset; - uint64_t file_sz = file->dirent ? file->dirent->file_size : 0; - /* EOF 检查 */ - if (offset >= file_sz) { + uint64_t file_sz = file->dirent ? file->dirent->file_size : 0; + if (req->offset >= file_sz) { SPDK_DEBUGLOG("read: EOF\n"); - file->io_count = 0; - file->actual_io_count = 0; - file->finished = true; + req->result = 0; + req->finished = true; return; } - /* 截断到文件末尾 */ - if (offset + file->io_count > file_sz){ - file->io_count = file_sz - offset; - } + if (req->offset + req->len > file_sz) { + req->len = file_sz - req->offset; + } + req->result = req->len; - file->actual_io_count = file->io_count; + calc_lba_range(req); - uint64_t lba, page_off, lba_count; - calc_lba_range(file, &lba, &page_off, &lba_count); - - uint64_t buf_need = lba_count * io_unit; - if (ensure_dma_buf(file, buf_need) != 0) { + uint64_t buf_need = req->lba_count * io_unit; + if (ensure_dma_buf(file, buf_need) != 0) { SPDK_ERRLOG("ensure_dma_buf failed\n"); - file->op_errno = -ENOMEM; - file->actual_io_count = 0; - file->finished = true; + req->op_errno = -ENOMEM; + req->result = 0; + req->finished = true; return; } spdk_blob_io_read(file->blob, file->fs->channel, - file->dma_buf, - lba, lba_count, - zvfs_spdk_blob_read_cb, file); + file->dma_buf, + req->lba, req->lba_count, + zvfs_spdk_blob_read_cb, req); } void 
zvfs_spdk_blob_read_cb(void *arg, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; + zvfs_io_req_t *req = (zvfs_io_req_t *)arg; + zvfs_file_t *file = req ? req->file : NULL; + if (req == NULL || file == NULL) { + return; + } if (bserrno) { SPDK_ERRLOG("blob_read error: %d\n", bserrno); - file->op_errno = zvfs_err_from_bserrno(bserrno); - file->io_count = 0; - file->actual_io_count = 0; - file->finished = true; + req->op_errno = zvfs_err_from_bserrno(bserrno); + req->result = 0; + req->finished = true; return; } - file->current_offset += file->io_count; - SPDK_DEBUGLOG("read complete, new offset=%" PRIu64 "\n", file->current_offset); - file->finished = true; + memcpy(req->buf, + (uint8_t *)file->dma_buf + req->page_off, + req->result); + SPDK_DEBUGLOG("read complete, offset=%" PRIu64 " len=%zu\n", req->offset, req->result); + req->finished = true; } /* ================================================================== */ @@ -487,101 +533,106 @@ void zvfs_spdk_blob_read_cb(void *arg, int bserrno) { /* Step 1 : 进入 write,先把覆盖范围内的扇区读出来(read-modify-write) */ void zvfs_do_write(void *arg) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL || file->fs == NULL || file->blob == NULL || file->fs->channel == NULL) { - if (file != NULL) { - file->op_errno = -EINVAL; - file->finished = true; + zvfs_io_req_t *req = (zvfs_io_req_t *)arg; + zvfs_file_t *file = req ? 
req->file : NULL; + if (req == NULL || file == NULL || file->fs == NULL || file->blob == NULL || + file->fs->channel == NULL) { + if (req != NULL) { + req->op_errno = -EINVAL; + req->finished = true; } return; } - if (file->write_staging_buf == NULL && file->io_count != 0) { - file->op_errno = -EINVAL; - file->finished = true; + if (req->buf == NULL && req->len != 0) { + req->op_errno = -EINVAL; + req->finished = true; return; } - if (file->io_count == 0) { - file->finished = true; + if (req->len == 0) { + req->result = 0; + req->finished = true; return; } - file->op_errno = 0; + req->op_errno = 0; uint64_t io_unit = file->fs->io_unit_size; if (io_unit == 0) { - file->op_errno = -EIO; - file->finished = true; + req->op_errno = -EIO; + req->finished = true; return; } - uint64_t lba, page_off, lba_count; - calc_lba_range(file, &lba, &page_off, &lba_count); - uint64_t buf_need = lba_count * io_unit; + calc_lba_range(req); + + uint64_t buf_need = req->lba_count * io_unit; if (ensure_dma_buf(file, buf_need) != 0) { SPDK_ERRLOG("ensure_dma_buf failed\n"); - file->op_errno = -ENOMEM; - file->finished = true; + req->op_errno = -ENOMEM; + req->finished = true; return; } - file->aligned = (file->current_offset % io_unit == 0) && - (file->io_count % io_unit == 0); + req->aligned = (req->offset % io_unit == 0) && + (req->len % io_unit == 0); - // static uint64_t aligned_count = 0; - // static uint64_t unaligned_count = 0; - // if (aligned) { - // aligned_count++; - // } else { - // unaligned_count++; - // } - // if ((aligned_count + unaligned_count) % 1000 == 0) { - // printf("aligned=%lu unaligned=%lu\n", aligned_count, unaligned_count); - // } - - if (file->aligned) { - /* 直接把用户数据拷到 dma_buf,跳过 preread */ - memcpy(file->dma_buf, file->write_staging_buf, file->io_count); - /* 直接进 preread_cb 的后半段逻辑(扩容判断+写) */ - zvfs_spdk_blob_write_preread_cb(file, 0); + if (req->aligned) { + memcpy(file->dma_buf, req->buf, req->len); + zvfs_spdk_blob_write_preread_cb(req, 0); } else { - /* - * 
先把涉及的扇区读出,read 完成后在 preread_cb 里 patch 数据再写。 - * 注意:把用户数据暂存在 file->write_buf / write_count, - * 或者借用 file->io_count(io_count 不变)。 - * 这里我们把用户数据已经由上层调用者拷贝到了 write_staging_buf, - */ - /* 不管是否需要扩容,先 preread */ spdk_blob_io_read(file->blob, file->fs->channel, - file->dma_buf, - lba, lba_count, - zvfs_spdk_blob_write_preread_cb, file); + file->dma_buf, + req->lba, req->lba_count, + zvfs_spdk_blob_write_preread_cb, req); } } /* Step 2 : preread 完成,patch dma_buf,然后决定是否扩容 */ void zvfs_spdk_blob_write_preread_cb(void *arg, int bserrno){ - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL) { + zvfs_io_req_t *req = (zvfs_io_req_t *)arg; + zvfs_file_t *file = req ? req->file : NULL; + if (req == NULL || file == NULL) { + return; + } + + uint64_t io_unit = file->fs->io_unit_size; + if (io_unit == 0) { + req->op_errno = -EIO; + req->finished = true; return; } /* preread 失败也没关系——如果是新分配区域全零即可, - 这里仍然继续(SPDK 对未写过的区域返回全零)。*/ + 这里仍然继续(SPDK 对未写过的区域返回全零)。*/ if (bserrno) { SPDK_DEBUGLOG("preread error %d (may be uninitialized, continue)\n", bserrno); + memset(file->dma_buf, 0, req->lba_count * io_unit); } /* 只有非对齐情况才需要 patch,对齐情况下数据已经在 dma_buf 里了(do_write 里拷好的)*/ - uint64_t io_unit = file->fs->io_unit_size; + if (!req->aligned) { + memcpy((uint8_t *)file->dma_buf + req->page_off, + req->buf, + req->len); + } - if (!file->aligned) { - uint64_t page_off = file->current_offset % io_unit; - memcpy((uint8_t *)file->dma_buf + page_off, - file->write_staging_buf, - file->io_count); + /* + * 稀疏写语义:当写偏移超过旧 EOF 时,gap 区间应当读为 0。 + * 这里至少把本次覆盖到的页内 gap 清零,避免把底层旧数据带入新文件逻辑范围。 + */ + uint64_t old_eof = file->dirent ? file->dirent->file_size : 0; + if (req->offset > old_eof) { + uint64_t page_start = req->lba * io_unit; + uint64_t page_end = page_start + req->lba_count * io_unit; + uint64_t zero_start = old_eof > page_start ? old_eof : page_start; + uint64_t zero_end = req->offset < page_end ? 
req->offset : page_end; + if (zero_end > zero_start) { + memset((uint8_t *)file->dma_buf + (zero_start - page_start), 0, + zero_end - zero_start); + } } /* 判断是否需要扩容 */ - uint64_t end_byte = file->current_offset + file->io_count; + uint64_t end_byte = req->offset + req->len; uint64_t need_clusters = zvfs_need_clusters(file->fs, end_byte); uint64_t cur_clusters = file->dirent ? file->dirent->allocated_clusters : spdk_blob_get_num_clusters(file->blob); @@ -590,39 +641,47 @@ void zvfs_spdk_blob_write_preread_cb(void *arg, int bserrno){ uint64_t free_clusters = spdk_bs_free_cluster_count(file->fs->bs); if (need_clusters - cur_clusters > free_clusters) { SPDK_ERRLOG("no free clusters\n"); - file->op_errno = -ENOSPC; - file->finished = true; + req->op_errno = -ENOSPC; + req->finished = true; return; } spdk_blob_resize(file->blob, need_clusters, - zvfs_spdk_blob_write_resize_cb, file); + zvfs_spdk_blob_write_resize_cb, req); } else { - zvfs_do_write_io(file); + zvfs_do_write_io(req); } } /* Step 3a : resize 完成 → sync */ void zvfs_spdk_blob_write_resize_cb(void *arg, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; + zvfs_io_req_t *req = (zvfs_io_req_t *)arg; + zvfs_file_t *file = req ? req->file : NULL; + if (req == NULL || file == NULL) { + return; + } if (bserrno) { SPDK_ERRLOG("write resize error: %d\n", bserrno); - file->op_errno = zvfs_err_from_bserrno(bserrno); - file->finished = true; + req->op_errno = zvfs_err_from_bserrno(bserrno); + req->finished = true; return; } - spdk_blob_sync_md(file->blob, zvfs_spdk_blob_write_sync_cb, file); + spdk_blob_sync_md(file->blob, zvfs_spdk_blob_write_sync_cb, req); } /* Step 3b : sync 完成 → 真正写 */ void zvfs_spdk_blob_write_sync_cb(void *arg, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; + zvfs_io_req_t *req = (zvfs_io_req_t *)arg; + zvfs_file_t *file = req ? 
req->file : NULL; + if (req == NULL || file == NULL) { + return; + } if (bserrno) { SPDK_ERRLOG("write sync error: %d\n", bserrno); - file->op_errno = zvfs_err_from_bserrno(bserrno); - file->finished = true; + req->op_errno = zvfs_err_from_bserrno(bserrno); + req->finished = true; return; } @@ -631,46 +690,53 @@ void zvfs_spdk_blob_write_sync_cb(void *arg, int bserrno) { (uint32_t)spdk_blob_get_num_clusters(file->blob); } - zvfs_do_write_io(file); + zvfs_do_write_io(req); } /* Step 4 : 实际写入(dma_buf 已经是 patch 后的整扇区数据) */ -void zvfs_do_write_io(zvfs_file_t *file) { - uint64_t io_unit_size = file->fs->io_unit_size; - if (io_unit_size == 0) { - file->op_errno = -EIO; - file->finished = true; +void zvfs_do_write_io(zvfs_io_req_t *req) { + zvfs_file_t *file = req ? req->file : NULL; + if (req == NULL || file == NULL || file->fs == NULL) { return; } - uint64_t lba = file->current_offset / io_unit_size; - uint64_t page_off = file->current_offset % io_unit_size; - uint64_t lba_count = (page_off + file->io_count + io_unit_size - 1) / io_unit_size; + + uint64_t io_unit_size = file->fs->io_unit_size; + if (io_unit_size == 0) { + req->op_errno = -EIO; + req->finished = true; + return; + } + uint64_t lba_count = (req->page_off + req->len + io_unit_size - 1) / io_unit_size; spdk_blob_io_write(file->blob, file->fs->channel, file->dma_buf, - lba, lba_count, - zvfs_spdk_blob_write_cb, file); + req->lba, lba_count, + zvfs_spdk_blob_write_cb, req); } /* Step 5 : 写完成 */ void zvfs_spdk_blob_write_cb(void *arg, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; + zvfs_io_req_t *req = (zvfs_io_req_t *)arg; + zvfs_file_t *file = req ? 
req->file : NULL; + if (req == NULL || file == NULL) { + return; + } if (bserrno) { SPDK_ERRLOG("blob_write error: %d\n", bserrno); - file->op_errno = zvfs_err_from_bserrno(bserrno); - file->finished = true; + req->op_errno = zvfs_err_from_bserrno(bserrno); + req->finished = true; return; } - uint64_t new_end = file->current_offset + file->io_count; + uint64_t new_end = req->offset + req->len; if (file->dirent && new_end > file->dirent->file_size) { file->dirent->file_size = new_end; } - file->current_offset = new_end; + req->result = req->len; - SPDK_DEBUGLOG("write complete, new offset=%" PRIu64 "\n", file->current_offset); - file->finished = true; + SPDK_DEBUGLOG("write complete, offset=%" PRIu64 " len=%zu\n", req->offset, req->result); + req->finished = true; } @@ -921,46 +987,81 @@ int zvfs_read(struct zvfs_file_s *file, uint8_t *buffer, size_t count) { if (count == 0) { return 0; } - file->op_errno = 0; - file->io_count = count; - file->actual_io_count = 0; - file->finished = false; - bool ok = waiter(global_thread, zvfs_do_read, file, &file->finished); - if(!ok) SPDK_ERRLOG("read result: ok=%d\n", ok); - if (!ok || file->op_errno != 0) return -1; - if (file->actual_io_count == 0) return 0; + zvfs_io_req_t req = { + .file = file, + .op = ZVFS_IO_READ, + .buf = buffer, + .len = count, + .offset = file->current_offset, + .flags = 0, + }; - /* - * dma_buf 里存的是从 LBA 边界开始的整扇区数据, - * page_off 是 current_offset(读之前)相对于 LBA 边界的字节偏移。 - * - * current_offset 在 read_cb 里已经 += actual_io_count, - * 所以读之前的 offset = current_offset - actual_io_count。 - */ - - uint64_t pre_offset = file->current_offset - file->actual_io_count; - uint64_t page_off = pre_offset % file->fs->io_unit_size; - - memcpy(buffer, - (uint8_t *)file->dma_buf + page_off, - file->actual_io_count); - - return (int)file->actual_io_count; + int rc = zvfs_pread_internal(&req); + if (rc > 0) { + file->current_offset += (uint64_t)rc; + } + return rc; } // write int zvfs_write(struct zvfs_file_s *file, const 
uint8_t *buffer, size_t count) { if (file == NULL || global_thread == NULL) { return -1; } - file->op_errno = 0; - file->io_count = count; - file->write_staging_buf = buffer; - file->finished = false; - bool ok = waiter(global_thread, zvfs_do_write, file, &file->finished); - if(!ok) SPDK_ERRLOG("write result: ok=%d\n", ok); - return (ok && file->op_errno == 0) ? (int)count : -1; + zvfs_io_req_t req = { + .file = file, + .op = ZVFS_IO_WRITE, + .buf = (uint8_t *)buffer, + .len = count, + .offset = file->current_offset, + .flags = 0, + }; + + int rc = zvfs_pwrite_internal(&req); + if (rc > 0) { + file->current_offset += (uint64_t)rc; + } + return rc; +} + +int zvfs_pread(struct zvfs_file_s *file, uint8_t *buffer, size_t count, uint64_t offset) +{ + if (file == NULL || buffer == NULL || global_thread == NULL) { + return -1; + } + if (count == 0) { + return 0; + } + + zvfs_io_req_t req = { + .file = file, + .op = ZVFS_IO_READ, + .buf = buffer, + .len = count, + .offset = offset, + .flags = 0, + }; + + return zvfs_pread_internal(&req); +} + +int zvfs_pwrite(struct zvfs_file_s *file, const uint8_t *buffer, size_t count, uint64_t offset) +{ + if (file == NULL || global_thread == NULL) { + return -1; + } + + zvfs_io_req_t req = { + .file = file, + .op = ZVFS_IO_WRITE, + .buf = (uint8_t *)buffer, + .len = count, + .offset = offset, + .flags = 0, + }; + + return zvfs_pwrite_internal(&req); } // close int zvfs_close(struct zvfs_file_s *file) { @@ -1032,5 +1133,3 @@ int zvfs_delete(struct zvfs_file_s *file) { // free(fs); // } - - diff --git a/zvfs/zvfs.h b/zvfs/zvfs.h index f244240..66f5d76 100755 --- a/zvfs/zvfs.h +++ b/zvfs/zvfs.h @@ -2,6 +2,9 @@ #define __ZVFS_HOOK_H__ #include +#include +#include +#include #include #include @@ -68,16 +71,34 @@ typedef struct zvfs_file_s { /* 临时DMA缓冲区(可选:每个file一个,避免每次malloc) */ void *dma_buf; uint64_t dma_buf_size; - - size_t actual_io_count; - const uint8_t *write_staging_buf; - int aligned; - size_t io_count; int op_errno; bool finished; 
} zvfs_file_t; +typedef enum { + ZVFS_IO_READ = 0, + ZVFS_IO_WRITE = 1, +} zvfs_io_op_t; + +typedef struct zvfs_io_req_s { + zvfs_file_t *file; + zvfs_io_op_t op; + uint8_t *buf; + size_t len; + uint64_t offset; + int flags; + + size_t result; + int op_errno; + bool finished; + + uint64_t lba; + uint64_t page_off; + uint64_t lba_count; + bool aligned; +} zvfs_io_req_t; + bool waiter(struct spdk_thread *thread, spdk_msg_fn start_fn, void *ctx, bool *finished); int zvfs_env_setup(void); @@ -87,15 +108,40 @@ int zvfs_create(struct zvfs_file_s *file); int zvfs_open(struct zvfs_file_s *file); int zvfs_read(struct zvfs_file_s *file, uint8_t *buffer, size_t count); int zvfs_write(struct zvfs_file_s *file, const uint8_t *buffer, size_t count); +int zvfs_pread(struct zvfs_file_s *file, uint8_t *buffer, size_t count, uint64_t offset); +int zvfs_pwrite(struct zvfs_file_s *file, const uint8_t *buffer, size_t count, uint64_t offset); int zvfs_close(struct zvfs_file_s *file); int zvfs_delete(struct zvfs_file_s *file); /* POSIX hook API(zvfs_hook.c 实现) */ int open(const char *path, int flags, ...); +int open64(const char *path, int flags, ...); +int openat(int dirfd, const char *path, int flags, ...); +int openat64(int dirfd, const char *path, int flags, ...); ssize_t read(int fd, void *buf, size_t count); ssize_t write(int fd, const void *buf, size_t count); +ssize_t pread(int fd, void *buf, size_t count, off_t offset); +ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset); +ssize_t pread64(int fd, void *buf, size_t count, off_t offset); +ssize_t pwrite64(int fd, const void *buf, size_t count, off_t offset); int close(int fd); int unlink(const char *name); +int unlinkat(int dirfd, const char *name, int flags); off_t lseek(int fd, off_t offset, int whence); +int fsync(int fd); +int fdatasync(int fd); +int ftruncate(int fd, off_t length); +int fallocate(int fd, int mode, off_t offset, off_t len); +int posix_fadvise(int fd, off_t offset, off_t len, int advice); +int 
sync_file_range(int fd, off_t offset, off_t nbytes, unsigned int flags); +int mkdir(const char *path, mode_t mode); +int rmdir(const char *path); +int rename(const char *oldpath, const char *newpath); +int access(const char *path, int mode); +int fcntl(int fd, int cmd, ...); +int stat(const char *path, struct stat *st); +int lstat(const char *path, struct stat *st); +int fstat(int fd, struct stat *st); +int fstatat(int dirfd, const char *path, struct stat *st, int flags); #endif diff --git a/zvfs/zvfs_hook.c b/zvfs/zvfs_hook.c old mode 100755 new mode 100644 index 7607b95..2fc211d --- a/zvfs/zvfs_hook.c +++ b/zvfs/zvfs_hook.c @@ -1,232 +1,525 @@ - #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif + #include -#include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "zvfs.h" +#ifndef FALLOC_FL_KEEP_SIZE +#define FALLOC_FL_KEEP_SIZE 0x01 +#endif + /* ------------------------------------------------------------------ */ -/* 全局状态 */ +/* 全局状态 */ /* ------------------------------------------------------------------ */ -static zvfs_t *g_fs = NULL; /* 全局文件系统,NULL 表示未初始化 */ +static zvfs_t *g_fs = NULL; static bool g_mounted = false; static bool g_env_init = false; +static bool g_debug = false; +static bool g_debug_init = false; +static const char *g_debug_filter = NULL; - -/* 元数据文件路径 */ static const char *META_FILE = "/home/lian/share/10.1-spdk/zvfs/zvfs_meta.txt"; -/* 伪 fd 起始值,避免和真实 fd 冲突 */ -#define FD_BASE 10000 +#define FD_BASE 10000 +#define DIRFD_BASE 20000 +#define ZVFS_MAX_DIRFD 64 +#define ZVFS_MAX_DIRS 1024 -/* 只拦截以 /zvfs 开头的路径 */ #define ZVFS_PATH_PREFIX "/zvfs" -static int (*real_open_fn) (const char*, int, ...) 
= NULL; -static ssize_t (*real_read_fn) (int, void*, size_t) = NULL; -static ssize_t (*real_write_fn)(int, const void*, size_t) = NULL; -static int (*real_close_fn)(int) = NULL; -static int (*real_unlink_fn)(const char *name) = NULL; -static off_t (*real_lseek_fn)(int fd, off_t offset, int whence) = NULL; +typedef struct { + bool used; + int flags; + char path[PATH_MAX]; +} zvfs_dirfd_t; + +static zvfs_dirfd_t g_dirfd_table[ZVFS_MAX_DIRFD]; +static char *g_dirs[ZVFS_MAX_DIRS]; +static size_t g_dir_count = 0; + +static int (*real_open_fn)(const char *, int, ...) = NULL; +static int (*real_openat_fn)(int, const char *, int, ...) = NULL; +static DIR * (*real_opendir_fn)(const char *) = NULL; +static DIR * (*real_fdopendir_fn)(int) = NULL; +static struct dirent *(*real_readdir_fn)(DIR *) = NULL; +static struct dirent64 *(*real_readdir64_fn)(DIR *) = NULL; +static int (*real_closedir_fn)(DIR *) = NULL; +static int (*real_dirfd_fn)(DIR *) = NULL; +static FILE * (*real_fopen_fn)(const char *, const char *) = NULL; +static FILE * (*real_fopen64_fn)(const char *, const char *) = NULL; +static FILE * (*real_fdopen_fn)(int, const char *) = NULL; +static ssize_t (*real_read_fn)(int, void *, size_t) = NULL; +static ssize_t (*real_write_fn)(int, const void *, size_t) = NULL; +static ssize_t (*real_pread_fn)(int, void *, size_t, off_t) = NULL; +static ssize_t (*real_pwrite_fn)(int, const void *, size_t, off_t) = NULL; +static int (*real_close_fn)(int) = NULL; +static int (*real_unlink_fn)(const char *) = NULL; +static int (*real_unlinkat_fn)(int, const char *, int) = NULL; +static off_t (*real_lseek_fn)(int, off_t, int) = NULL; +static int (*real_fsync_fn)(int) = NULL; +static int (*real_fdatasync_fn)(int) = NULL; +static int (*real_ftruncate_fn)(int, off_t) = NULL; +static int (*real_fallocate_fn)(int, int, off_t, off_t) = NULL; +static int (*real_posix_fadvise_fn)(int, off_t, off_t, int) = NULL; +static int (*real_sync_file_range_fn)(int, off_t, off_t, unsigned int) = NULL; 
+static int (*real_rename_fn)(const char *, const char *) = NULL; +static int (*real_stat_fn)(const char *, struct stat *) = NULL; +static int (*real_lstat_fn)(const char *, struct stat *) = NULL; +static int (*real_fstat_fn)(int, struct stat *) = NULL; +static int (*real_fstatat_fn)(int, const char *, struct stat *, int) = NULL; +static int (*real_stat64_fn)(const char *, struct stat64 *) = NULL; +static int (*real_lstat64_fn)(const char *, struct stat64 *) = NULL; +static int (*real_fstat64_fn)(int, struct stat64 *) = NULL; +static int (*real_fstatat64_fn)(int, const char *, struct stat64 *, int) = NULL; +static int (*real___xstat64_fn)(int, const char *, struct stat64 *) = NULL; +static int (*real___lxstat64_fn)(int, const char *, struct stat64 *) = NULL; +static int (*real___fxstat64_fn)(int, int, struct stat64 *) = NULL; +static int (*real___fxstatat64_fn)(int, int, const char *, struct stat64 *, int) = NULL; +static int (*real_access_fn)(const char *, int) = NULL; +static int (*real_mkdir_fn)(const char *, mode_t) = NULL; +static int (*real_rmdir_fn)(const char *) = NULL; +static int (*real_fcntl_fn)(int, int, ...) = NULL; + +/* Forward declarations for wrappers used by stdio cookie callbacks. */ +ssize_t read(int fd, void *buf, size_t count); +ssize_t write(int fd, const void *buf, size_t count); +off64_t lseek64(int fd, off64_t offset, int whence); +int close(int fd); +static int zvfs_ensure_mounted(void); + +__attribute__((format(printf, 1, 2))) +static void debug_log(const char *fmt, ...) +{ + va_list ap; + + if (!g_debug) { + return; + } + va_start(ap, fmt); + fprintf(stderr, "[zvfs-hook][pid=%d] ", getpid()); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); +} + +static int real_open_passthrough(const char *path, int flags, mode_t mode, bool has_mode) +{ + if (!real_open_fn) { + errno = ENOSYS; + return -1; + } + return has_mode ? 
real_open_fn(path, flags, mode) : real_open_fn(path, flags); +} + +static int real_openat_passthrough(int dirfd, const char *path, int flags, mode_t mode, bool has_mode) +{ + if (!real_openat_fn) { + errno = ENOSYS; + return -1; + } + return has_mode ? real_openat_fn(dirfd, path, flags, mode) : real_openat_fn(dirfd, path, flags); +} + +static void debug_init_once(void) +{ + const char *debug_env; + const char *filter_env; + + if (g_debug_init) { + return; + } + g_debug_init = true; + + debug_env = getenv("ZVFS_HOOK_DEBUG"); + if (debug_env && debug_env[0] != '\0' && strcmp(debug_env, "0") != 0) { + g_debug = true; + } + filter_env = getenv("ZVFS_HOOK_DEBUG_PATH"); + if (filter_env && filter_env[0] != '\0') { + g_debug_filter = filter_env; + } +} + +static bool debug_path_enabled(const char *path) +{ + debug_init_once(); + if (!g_debug) { + return false; + } + if (!g_debug_filter) { + return true; + } + return path && strstr(path, g_debug_filter) != NULL; +} __attribute__((constructor)) -static void zvfs_preload_init(void) { - real_open_fn = dlsym(RTLD_NEXT, "open"); - real_read_fn = dlsym(RTLD_NEXT, "read"); - real_write_fn = dlsym(RTLD_NEXT, "write"); - real_close_fn = dlsym(RTLD_NEXT, "close"); - real_unlink_fn= dlsym(RTLD_NEXT, "unlink"); - real_lseek_fn = dlsym(RTLD_NEXT, "lseek"); +static void zvfs_preload_init(void) +{ + real_open_fn = dlsym(RTLD_NEXT, "open"); + real_openat_fn = dlsym(RTLD_NEXT, "openat"); + real_opendir_fn = dlsym(RTLD_NEXT, "opendir"); + real_fdopendir_fn = dlsym(RTLD_NEXT, "fdopendir"); + real_readdir_fn = dlsym(RTLD_NEXT, "readdir"); + real_readdir64_fn = dlsym(RTLD_NEXT, "readdir64"); + real_closedir_fn = dlsym(RTLD_NEXT, "closedir"); + real_dirfd_fn = dlsym(RTLD_NEXT, "dirfd"); + real_fopen_fn = dlsym(RTLD_NEXT, "fopen"); + real_fopen64_fn = dlsym(RTLD_NEXT, "fopen64"); + real_fdopen_fn = dlsym(RTLD_NEXT, "fdopen"); + real_read_fn = dlsym(RTLD_NEXT, "read"); + real_write_fn = dlsym(RTLD_NEXT, "write"); + real_pread_fn = 
dlsym(RTLD_NEXT, "pread"); + real_pwrite_fn = dlsym(RTLD_NEXT, "pwrite"); + real_close_fn = dlsym(RTLD_NEXT, "close"); + real_unlink_fn = dlsym(RTLD_NEXT, "unlink"); + real_unlinkat_fn = dlsym(RTLD_NEXT, "unlinkat"); + real_lseek_fn = dlsym(RTLD_NEXT, "lseek"); + real_fsync_fn = dlsym(RTLD_NEXT, "fsync"); + real_fdatasync_fn = dlsym(RTLD_NEXT, "fdatasync"); + real_ftruncate_fn = dlsym(RTLD_NEXT, "ftruncate"); + real_fallocate_fn = dlsym(RTLD_NEXT, "fallocate"); + real_posix_fadvise_fn = dlsym(RTLD_NEXT, "posix_fadvise"); + real_sync_file_range_fn = dlsym(RTLD_NEXT, "sync_file_range"); + real_rename_fn = dlsym(RTLD_NEXT, "rename"); + real_stat_fn = dlsym(RTLD_NEXT, "stat"); + real_lstat_fn = dlsym(RTLD_NEXT, "lstat"); + real_fstat_fn = dlsym(RTLD_NEXT, "fstat"); + real_fstatat_fn = dlsym(RTLD_NEXT, "fstatat"); + real_stat64_fn = dlsym(RTLD_NEXT, "stat64"); + real_lstat64_fn = dlsym(RTLD_NEXT, "lstat64"); + real_fstat64_fn = dlsym(RTLD_NEXT, "fstat64"); + real_fstatat64_fn = dlsym(RTLD_NEXT, "fstatat64"); + real___xstat64_fn = dlsym(RTLD_NEXT, "__xstat64"); + real___lxstat64_fn = dlsym(RTLD_NEXT, "__lxstat64"); + real___fxstat64_fn = dlsym(RTLD_NEXT, "__fxstat64"); + real___fxstatat64_fn = dlsym(RTLD_NEXT, "__fxstatat64"); + real_access_fn = dlsym(RTLD_NEXT, "access"); + real_mkdir_fn = dlsym(RTLD_NEXT, "mkdir"); + real_rmdir_fn = dlsym(RTLD_NEXT, "rmdir"); + real_fcntl_fn = dlsym(RTLD_NEXT, "fcntl"); + debug_init_once(); } -/* 判断路径是否由我们接管 */ -static inline bool is_zvfs_path(const char *path) { - return path && strncmp(path, ZVFS_PATH_PREFIX, sizeof(ZVFS_PATH_PREFIX) - 1) == 0; +/* ------------------------------------------------------------------ */ +/* 路径与目录助手 */ +/* ------------------------------------------------------------------ */ + +static inline bool is_zvfs_path(const char *path) +{ + if (!path || strncmp(path, ZVFS_PATH_PREFIX, sizeof(ZVFS_PATH_PREFIX) - 1) != 0) { + return false; + } + return path[sizeof(ZVFS_PATH_PREFIX) - 1] == '\0' || + 
path[sizeof(ZVFS_PATH_PREFIX) - 1] == '/'; } -/* 判断 fd 是否是我们的伪 fd */ -static inline bool is_zvfs_fd(int fd) { +static inline bool is_zvfs_fd(int fd) +{ return fd >= FD_BASE && fd < FD_BASE + ZVFS_MAX_FD; } -/* ------------------------------------------------------------------ */ -/* 元数据文件 I/O */ -/* ------------------------------------------------------------------ */ +static inline bool is_zvfs_dirfd(int fd) +{ + return fd >= DIRFD_BASE && fd < DIRFD_BASE + ZVFS_MAX_DIRFD; +} -/* - * 格式:每行一条记录,字段用空格分隔 - * filename blob_id file_size allocated_clusters - * - * 例: - * hello.txt 4294967296 26 1 - */ +static int normalize_path(const char *path, char *out, size_t out_sz) +{ + size_t len; + if (!path || !out || out_sz == 0) { + return -1; + } + len = strnlen(path, out_sz); + if (len == 0 || len >= out_sz) { + return -1; + } + memcpy(out, path, len); + out[len] = '\0'; + while (len > 1 && out[len - 1] == '/') { + out[--len] = '\0'; + } + return 0; +} -static int meta_load(zvfs_t *fs) { - int fd = real_open_fn(META_FILE, O_RDONLY); - if (fd < 0) { - /* 文件不存在,当作空目录 */ - fs->dirent_count = 0; +static inline bool is_zvfs_root(const char *path) +{ + return strcmp(path, ZVFS_PATH_PREFIX) == 0; +} + +static int get_parent_dir(const char *path, char *out, size_t out_sz) +{ + char tmp[PATH_MAX]; + char *slash; + size_t n; + + if (normalize_path(path, tmp, sizeof(tmp)) != 0 || is_zvfs_root(tmp)) { + return -1; + } + slash = strrchr(tmp, '/'); + if (!slash || slash == tmp) { + return -1; + } + n = (size_t)(slash - tmp); + if (n >= out_sz) { + return -1; + } + memcpy(out, tmp, n); + out[n] = '\0'; + return 0; +} + +static int dirs_add(const char *path) +{ + char norm[PATH_MAX]; + size_t i; + + if (normalize_path(path, norm, sizeof(norm)) != 0 || !is_zvfs_path(norm)) { + return -1; + } + if (is_zvfs_root(norm)) { return 0; } + for (i = 0; i < g_dir_count; i++) { + if (strcmp(g_dirs[i], norm) == 0) { + return 0; + } + } + if (g_dir_count >= ZVFS_MAX_DIRS) { + return -1; + } + 
g_dirs[g_dir_count] = strdup(norm); + if (!g_dirs[g_dir_count]) { + return -1; + } + g_dir_count++; + return 0; +} - /* 一次性读进来 */ - char buf[4096] = {0}; - ssize_t n = real_read_fn(fd, buf, sizeof(buf) - 1); - close(fd); - if (n <= 0) return 0; +static bool dirs_has_exact(const char *path) +{ + size_t i; + char norm[PATH_MAX]; + if (normalize_path(path, norm, sizeof(norm)) != 0) { + return false; + } + if (is_zvfs_root(norm)) { + return true; + } + for (i = 0; i < g_dir_count; i++) { + if (strcmp(g_dirs[i], norm) == 0) { + return true; + } + } + return false; +} - char *line = buf; - while (*line) { - /* 找行尾 */ - char *nl = strchr(line, '\n'); - if (nl) *nl = '\0'; +static bool dirs_has_children(const char *path) +{ + size_t i; + size_t plen = strlen(path); + for (i = 0; i < g_dir_count; i++) { + if (strncmp(g_dirs[i], path, plen) == 0 && g_dirs[i][plen] == '/') { + return true; + } + } + if (g_fs) { + uint32_t j; + for (j = 0; j < g_fs->dirent_count; j++) { + zvfs_dirent_t *d = g_fs->dirents[j]; + if (!d || !d->is_valid) { + continue; + } + if (strncmp(d->filename, path, plen) == 0 && d->filename[plen] == '/') { + return true; + } + } + } + return false; +} - if (*line == '\0') { - line = nl ? 
nl + 1 : line + strlen(line); +static bool dirs_exists(const char *path) +{ + size_t plen; + uint32_t i; + + if (!path || !is_zvfs_path(path)) { + return false; + } + if (dirs_has_exact(path)) { + return true; + } + if (!g_fs) { + return false; + } + + plen = strlen(path); + for (i = 0; i < g_fs->dirent_count; i++) { + zvfs_dirent_t *d = g_fs->dirents[i]; + if (!d || !d->is_valid) { continue; } - - if (fs->dirent_count >= ZVFS_MAX_FILES) break; - - zvfs_dirent_t *d = calloc(1, sizeof(zvfs_dirent_t)); - if (!d) break; - - int ret = sscanf(line, "%255s %"PRIu64" %"PRIu64" %"PRIu64, - d->filename, - &d->blob_id, - &d->file_size, - &d->allocated_clusters); - if (ret == 4) { - d->is_valid = true; - d->open_count = 0; - fs->dirents[fs->dirent_count++] = d; - } else { - free(d); + if (strncmp(d->filename, path, plen) == 0 && d->filename[plen] == '/') { + return true; } - - line = nl ? nl + 1 : line + strlen(line); } - - return 0; + return false; } -static int meta_save(zvfs_t *fs) { - int fd = real_open_fn (META_FILE, O_WRONLY | O_CREAT | O_TRUNC, 0644); - if (fd < 0) return -1; - - for (uint32_t i = 0; i < fs->dirent_count; i++) { - zvfs_dirent_t *d = fs->dirents[i]; - if (!d || !d->is_valid) continue; - - char line[512]; - int len = snprintf(line, sizeof(line), "%s %"PRIu64" %"PRIu64" %"PRIu64"\n", - d->filename, d->blob_id, d->file_size, d->allocated_clusters); - real_write_fn(fd, line, len); - } - - real_close_fn(fd); - return 0; -} - - -/* ------------------------------------------------------------------ */ -/* 初始化(第一次 open 时调用) */ -/* ------------------------------------------------------------------ */ -// 退出的时候调用 save 和 unmount -static void zvfs_atexit(void) { - if (!g_mounted || !g_fs) return; - SPDK_NOTICELOG("umount\n"); - meta_save(g_fs); - zvfs_umount(g_fs); -} - -static int zvfs_ensure_mounted(void) { - if (g_mounted) return 0; - - g_fs = calloc(1, sizeof(zvfs_t)); - if (!g_fs) return -1; - - g_fs->fd_base = FD_BASE; - - /* 加载元数据 */ - if (meta_load(g_fs) != 
0) { - free(g_fs); - g_fs = NULL; +static int dirs_remove(const char *path) +{ + size_t i; + char norm[PATH_MAX]; + if (normalize_path(path, norm, sizeof(norm)) != 0 || is_zvfs_root(norm)) { return -1; } - - /* 初始化 SPDK 环境并 mount */ - if (!g_env_init) { - if( zvfs_env_setup() != 0) { - free(g_fs); - g_fs = NULL; - return -1; + for (i = 0; i < g_dir_count; i++) { + if (strcmp(g_dirs[i], norm) == 0) { + free(g_dirs[i]); + g_dirs[i] = g_dirs[g_dir_count - 1]; + g_dirs[g_dir_count - 1] = NULL; + g_dir_count--; + return 0; } - g_env_init = true; } - - if (!zvfs_mount(g_fs)) { - zvfs_umount(g_fs); - free(g_fs); - g_fs = NULL; - return -1; - } - - g_mounted = true; - atexit(zvfs_atexit); - SPDK_NOTICELOG("mount\n"); - return 0; + return -1; } +static void dirs_reset(void) +{ + size_t i; + for (i = 0; i < g_dir_count; i++) { + free(g_dirs[i]); + g_dirs[i] = NULL; + } + g_dir_count = 0; + (void)dirs_add(ZVFS_PATH_PREFIX); +} -/* ------------------------------------------------------------------ */ -/* 目录查找 / 分配 */ -/* ------------------------------------------------------------------ */ - -static zvfs_dirent_t *dirent_find(const char *filename) { - for (uint32_t i = 0; i < g_fs->dirent_count; i++) { +static void dirs_rebuild_from_files(void) +{ + uint32_t i; + char tmp[PATH_MAX]; + for (i = 0; i < g_fs->dirent_count; i++) { zvfs_dirent_t *d = g_fs->dirents[i]; - if (d && d->is_valid && strcmp(d->filename, filename) == 0) + char *p; + if (!d || !d->is_valid) { + continue; + } + if (normalize_path(d->filename, tmp, sizeof(tmp)) != 0) { + continue; + } + p = strrchr(tmp, '/'); + while (p && strcmp(tmp, ZVFS_PATH_PREFIX) != 0) { + *p = '\0'; + (void)dirs_add(tmp); + p = strrchr(tmp, '/'); + } + } +} + +static int join_dir_path(const char *dir, const char *name, char *out, size_t out_sz) +{ + int n; + if (!dir || !name || !out) { + return -1; + } + if (name[0] == '/') { + return normalize_path(name, out, out_sz); + } + n = snprintf(out, out_sz, "%s/%s", dir, name); + if (n <= 0 
|| (size_t)n >= out_sz) { + return -1; + } + return normalize_path(out, out, out_sz); +} + +static bool can_read(const zvfs_file_t *file) +{ + int mode = file->flags & O_ACCMODE; + return mode != O_WRONLY; +} + +static bool can_write(const zvfs_file_t *file) +{ + int mode = file->flags & O_ACCMODE; + return mode != O_RDONLY; +} + +/* ------------------------------------------------------------------ */ +/* 目录项/FD 辅助 */ +/* ------------------------------------------------------------------ */ + +static zvfs_dirent_t *dirent_find(const char *filename) +{ + uint32_t i; + for (i = 0; i < g_fs->dirent_count; i++) { + zvfs_dirent_t *d = g_fs->dirents[i]; + if (d && d->is_valid && strcmp(d->filename, filename) == 0) { return d; + } } return NULL; } -static zvfs_dirent_t *dirent_alloc(const char *filename) { - if (g_fs->dirent_count >= ZVFS_MAX_FILES) return NULL; - - zvfs_dirent_t *d = calloc(1, sizeof(zvfs_dirent_t)); - if (!d) return NULL; - - strncpy(d->filename, filename, sizeof(d->filename) - 1); - d->is_valid = true; - d->open_count = 0; - d->file_size = 0; - d->allocated_clusters = 0; - d->blob_id = 0; - +static zvfs_dirent_t *dirent_alloc(const char *filename) +{ + zvfs_dirent_t *d; + if (g_fs->dirent_count >= ZVFS_MAX_FILES) { + return NULL; + } + if (strlen(filename) >= 256) { + errno = ENAMETOOLONG; + return NULL; + } + d = calloc(1, sizeof(*d)); + if (!d) { + return NULL; + } + memcpy(d->filename, filename, strlen(filename) + 1); + d->is_valid = true; g_fs->dirents[g_fs->dirent_count++] = d; return d; } +static void dirent_remove(zvfs_dirent_t *d) +{ + uint32_t i; + if (!d) { + return; + } + for (i = 0; i < g_fs->dirent_count; i++) { + if (g_fs->dirents[i] == d) { + free(d); + g_fs->dirents[i] = g_fs->dirents[g_fs->dirent_count - 1]; + g_fs->dirents[g_fs->dirent_count - 1] = NULL; + g_fs->dirent_count--; + return; + } + } +} -/* ------------------------------------------------------------------ */ -/* fd 表管理 */ -/* 
------------------------------------------------------------------ */ - -/* 分配一个空闲 slot,返回伪 fd;失败返回 -1 */ -static int fd_alloc(zvfs_file_t *file) { - for (int i = 0; i < ZVFS_MAX_FD; i++) { - if (g_fs->fd_table[i] == NULL) { +static int fd_alloc(zvfs_file_t *file) +{ + int i; + for (i = 0; i < ZVFS_MAX_FD; i++) { + if (!g_fs->fd_table[i]) { g_fs->fd_table[i] = file; file->pseudo_fd = FD_BASE + i; g_fs->openfd_count++; @@ -236,270 +529,1090 @@ static int fd_alloc(zvfs_file_t *file) { return -1; } -/* 释放 slot */ -static void fd_free(int pseudo_fd) { +static void fd_free(int pseudo_fd) +{ int idx = pseudo_fd - FD_BASE; - if (idx < 0 || idx >= ZVFS_MAX_FD) return; - g_fs->fd_table[idx] = NULL; - g_fs->openfd_count--; + if (idx < 0 || idx >= ZVFS_MAX_FD) { + return; + } + if (g_fs && g_fs->fd_table[idx]) { + g_fs->fd_table[idx] = NULL; + g_fs->openfd_count--; + } } -/* 通过伪 fd 查找 file */ -static zvfs_file_t *fd_lookup(int pseudo_fd) { +static zvfs_file_t *fd_lookup(int pseudo_fd) +{ int idx = pseudo_fd - FD_BASE; - if (idx < 0 || idx >= ZVFS_MAX_FD) return NULL; + if (!g_fs || idx < 0 || idx >= ZVFS_MAX_FD) { + return NULL; + } return g_fs->fd_table[idx]; } -/* ------------------------------------------------------------------ */ -/* POSIX hook */ -/* ------------------------------------------------------------------ */ - -/* ------------------------------------------------------------------ */ -/* POSIX hook: open */ -/* ------------------------------------------------------------------ */ -/** - * O_RDONLY - * O_WRONLY - * O_RDWR - * O_CREAT - */ -int open(const char *path, int flags, ...) 
{ - if (!is_zvfs_path(path)) { - mode_t mode = 0; - if (flags & O_CREAT) { - va_list ap; - va_start(ap, flags); - mode = va_arg(ap, mode_t); - va_end(ap); +static int dirfd_alloc(const char *path, int flags) +{ + int i; + for (i = 0; i < ZVFS_MAX_DIRFD; i++) { + if (!g_dirfd_table[i].used) { + g_dirfd_table[i].used = true; + g_dirfd_table[i].flags = flags; + snprintf(g_dirfd_table[i].path, sizeof(g_dirfd_table[i].path), "%s", path); + return DIRFD_BASE + i; } - return real_open_fn(path, flags, mode); } + return -1; +} - /* 确保 fs 已经 mount */ +static void dirfd_free(int fd) +{ + int idx = fd - DIRFD_BASE; + if (idx < 0 || idx >= ZVFS_MAX_DIRFD) { + return; + } + g_dirfd_table[idx].used = false; + g_dirfd_table[idx].flags = 0; + g_dirfd_table[idx].path[0] = '\0'; +} + +static zvfs_dirfd_t *dirfd_lookup(int fd) +{ + int idx = fd - DIRFD_BASE; + if (idx < 0 || idx >= ZVFS_MAX_DIRFD || !g_dirfd_table[idx].used) { + return NULL; + } + return &g_dirfd_table[idx]; +} + +static const char *debug_fd_path(int fd) +{ + if (is_zvfs_fd(fd)) { + zvfs_file_t *file = fd_lookup(fd); + if (file && file->dirent) { + return file->dirent->filename; + } + } else if (is_zvfs_dirfd(fd)) { + zvfs_dirfd_t *d = dirfd_lookup(fd); + if (d) { + return d->path; + } + } + return NULL; +} + +static bool debug_fd_enabled(int fd) +{ + return debug_path_enabled(debug_fd_path(fd)); +} + +typedef struct { + char name[NAME_MAX + 1]; + unsigned char type; +} zvfs_dir_item_t; + +typedef struct { + uint64_t magic; + int pseudo_fd; + size_t pos; + size_t count; + zvfs_dir_item_t *items; + struct dirent ent; + struct dirent64 ent64; +} zvfs_dir_stream_t; + +#define ZVFS_DIR_STREAM_MAGIC 0x5a56465344495231ULL + +static bool extract_direct_child(const char *parent, const char *path, char *name_out, size_t out_sz) +{ + size_t plen; + const char *start; + const char *slash; + size_t nlen; + + if (!parent || !path || !name_out || out_sz == 0) { + return false; + } + plen = strlen(parent); + if (strncmp(path, 
parent, plen) != 0) { + return false; + } + if (path[plen] != '/') { + return false; + } + start = path + plen + 1; + if (*start == '\0') { + return false; + } + slash = strchr(start, '/'); + if (slash) { + return false; + } + nlen = strlen(start); + if (nlen == 0 || nlen >= out_sz) { + return false; + } + memcpy(name_out, start, nlen + 1); + return true; +} + +static bool add_dir_item(zvfs_dir_item_t **items, size_t *count, const char *name, unsigned char type) +{ + size_t i; + zvfs_dir_item_t *tmp; + + if (!items || !count || !name) { + return false; + } + for (i = 0; i < *count; i++) { + if (strcmp((*items)[i].name, name) == 0) { + if ((*items)[i].type != DT_DIR && type == DT_DIR) { + (*items)[i].type = DT_DIR; + } + return true; + } + } + tmp = realloc(*items, (*count + 1) * sizeof(**items)); + if (!tmp) { + return false; + } + *items = tmp; + snprintf((*items)[*count].name, sizeof((*items)[*count].name), "%s", name); + (*items)[*count].type = type; + (*count)++; + return true; +} + +static int build_dir_items(const char *path, zvfs_dir_item_t **items_out, size_t *count_out) +{ + size_t i; + uint32_t j; + char name[NAME_MAX + 1]; + zvfs_dir_item_t *items = NULL; + size_t count = 0; + + if (!items_out || !count_out) { + errno = EINVAL; + return -1; + } if (zvfs_ensure_mounted() != 0) { errno = EIO; return -1; } - - /* 查找 dirent */ - zvfs_dirent_t *dirent = dirent_find(path); - - if (!dirent) { - /* 文件不存在 */ - if (!(flags & O_CREAT)) { - errno = ENOENT; - return -1; - } - /* 新建 dirent */ - dirent = dirent_alloc(path); - if (!dirent) { - errno = ENOMEM; - return -1; - } - meta_save(g_fs); + if (!dirs_exists(path)) { + errno = ENOENT; + return -1; } - /* 创建 file 句柄 */ - zvfs_file_t *file = calloc(1, sizeof(zvfs_file_t)); - if (!file) { + if (!add_dir_item(&items, &count, ".", DT_DIR) || + !add_dir_item(&items, &count, "..", DT_DIR)) { + free(items); errno = ENOMEM; return -1; } - file->fs = g_fs; - file->dirent = dirent; - file->flags = flags; - 
file->current_offset = 0; - - int ok; - if (dirent->blob_id == 0) { - /* 新文件:create blob,open 时不 resize,write 时按需扩容 */ - ok = zvfs_create(file); /* 内部 create → open → resize → alloc dma_buf */ - SPDK_DEBUGLOG("create: %ld\n", file->blob_id); - /* 把新分配的 blob_id 写回 dirent */ - dirent->blob_id = file->blob_id; - } else { - /* 已有文件:直接 open 已有 blob */ - file->blob_id = dirent->blob_id; - SPDK_DEBUGLOG("open: %ld\n", file->blob_id); - ok = zvfs_open(file); + for (i = 0; i < g_dir_count; i++) { + if (!g_dirs[i] || is_zvfs_root(g_dirs[i])) { + continue; + } + if (extract_direct_child(path, g_dirs[i], name, sizeof(name))) { + if (!add_dir_item(&items, &count, name, DT_DIR)) { + free(items); + errno = ENOMEM; + return -1; + } + } } + if (g_fs) { + for (j = 0; j < g_fs->dirent_count; j++) { + zvfs_dirent_t *d = g_fs->dirents[j]; + if (!d || !d->is_valid) { + continue; + } + if (extract_direct_child(path, d->filename, name, sizeof(name))) { + if (!add_dir_item(&items, &count, name, DT_REG)) { + free(items); + errno = ENOMEM; + return -1; + } + } + } + } + + *items_out = items; + *count_out = count; + return 0; +} + +static inline bool is_zvfs_dirstream(DIR *dirp) +{ + zvfs_dir_stream_t *s = (zvfs_dir_stream_t *)(void *)dirp; + return s && s->magic == ZVFS_DIR_STREAM_MAGIC; +} + +static DIR *create_zvfs_dirstream(const char *path, int pseudo_fd) +{ + zvfs_dir_stream_t *s; + + s = calloc(1, sizeof(*s)); + if (!s) { + errno = ENOMEM; + return NULL; + } + if (build_dir_items(path, &s->items, &s->count) != 0) { + free(s); + return NULL; + } + s->magic = ZVFS_DIR_STREAM_MAGIC; + s->pseudo_fd = pseudo_fd; + return (DIR *)(void *)s; +} + +typedef struct { + int fd; +} zvfs_stdio_cookie_t; + +static ssize_t zvfs_cookie_read(void *c, char *buf, size_t size) +{ + zvfs_stdio_cookie_t *ck = (zvfs_stdio_cookie_t *)c; + return read(ck->fd, buf, size); +} + +static ssize_t zvfs_cookie_write(void *c, const char *buf, size_t size) +{ + zvfs_stdio_cookie_t *ck = (zvfs_stdio_cookie_t *)c; + 
return write(ck->fd, buf, size); +} + +static int zvfs_cookie_seek(void *c, off64_t *off, int whence) +{ + zvfs_stdio_cookie_t *ck = (zvfs_stdio_cookie_t *)c; + off64_t rc = lseek64(ck->fd, *off, whence); + if (rc < 0) { + return -1; + } + *off = rc; + return 0; +} + +static int zvfs_cookie_close(void *c) +{ + zvfs_stdio_cookie_t *ck = (zvfs_stdio_cookie_t *)c; + int rc = close(ck->fd); + free(ck); + return rc; +} + +/* ------------------------------------------------------------------ */ +/* 元数据文件 I/O */ +/* ------------------------------------------------------------------ */ + +static int meta_load(zvfs_t *fs) +{ + int fd = real_open_passthrough(META_FILE, O_RDONLY, 0, false); + char buf[4096] = {0}; + char *line; + if (fd < 0) { + fs->dirent_count = 0; + return 0; + } + if (real_read_fn(fd, buf, sizeof(buf) - 1) <= 0) { + (void)real_close_fn(fd); + return 0; + } + (void)real_close_fn(fd); + + line = buf; + while (*line) { + char *nl = strchr(line, '\n'); + zvfs_dirent_t *d; + int ret; + if (nl) { + *nl = '\0'; + } + if (*line == '\0') { + line = nl ? nl + 1 : line + strlen(line); + continue; + } + if (fs->dirent_count >= ZVFS_MAX_FILES) { + break; + } + d = calloc(1, sizeof(*d)); + if (!d) { + break; + } + ret = sscanf(line, "%255s %" PRIu64 " %" PRIu64 " %" PRIu64, + d->filename, &d->blob_id, &d->file_size, &d->allocated_clusters); + if (ret == 4) { + d->is_valid = true; + fs->dirents[fs->dirent_count++] = d; + } else { + free(d); + } + line = nl ? 
nl + 1 : line + strlen(line); + } + return 0; +} + +static int meta_save(zvfs_t *fs) +{ + uint32_t i; + int fd = real_open_passthrough(META_FILE, O_WRONLY | O_CREAT | O_TRUNC, 0644, true); + if (fd < 0) { + return -1; + } + for (i = 0; i < fs->dirent_count; i++) { + zvfs_dirent_t *d = fs->dirents[i]; + char line[512]; + int len; + if (!d || !d->is_valid) { + continue; + } + len = snprintf(line, sizeof(line), "%s %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", + d->filename, d->blob_id, d->file_size, d->allocated_clusters); + if (len > 0) { + (void)real_write_fn(fd, line, (size_t)len); + } + } + (void)real_close_fn(fd); + return 0; +} + +/* ------------------------------------------------------------------ */ +/* 初始化 */ +/* ------------------------------------------------------------------ */ + +static void zvfs_atexit(void) +{ + if (!g_mounted || !g_fs) { + dirs_reset(); + return; + } + (void)meta_save(g_fs); + (void)zvfs_umount(g_fs); + dirs_reset(); +} + +static int zvfs_ensure_mounted(void) +{ + if (g_mounted) { + return 0; + } + + g_fs = calloc(1, sizeof(*g_fs)); + if (!g_fs) { + return -1; + } + g_fs->fd_base = FD_BASE; + dirs_reset(); + + if (meta_load(g_fs) != 0) { + free(g_fs); + g_fs = NULL; + return -1; + } + dirs_rebuild_from_files(); + + if (!g_env_init) { + if (zvfs_env_setup() != 0) { + free(g_fs); + g_fs = NULL; + return -1; + } + g_env_init = true; + } + if (!zvfs_mount(g_fs)) { + (void)zvfs_umount(g_fs); + free(g_fs); + g_fs = NULL; + return -1; + } + g_mounted = true; + atexit(zvfs_atexit); + return 0; +} + +/* ------------------------------------------------------------------ */ +/* stat helpers */ +/* ------------------------------------------------------------------ */ + +static uint64_t path_hash(const char *s) +{ + uint64_t h = 1469598103934665603ULL; + while (*s) { + h ^= (unsigned char)*s++; + h *= 1099511628211ULL; + } + return h; +} + +static void fill_stat(struct stat *st, mode_t mode, off_t size, uint64_t ino) +{ + time_t now = time(NULL); + 
memset(st, 0, sizeof(*st)); + st->st_mode = mode; + st->st_nlink = S_ISDIR(mode) ? 2 : 1; + st->st_uid = getuid(); + st->st_gid = getgid(); + st->st_size = size; + st->st_blksize = 4096; + st->st_blocks = (blkcnt_t)((size + 511) / 512); + st->st_ino = (ino_t)ino; + st->st_atime = now; + st->st_mtime = now; + st->st_ctime = now; +} + +static void fill_stat64(struct stat64 *st, mode_t mode, off64_t size, uint64_t ino) +{ + time_t now = time(NULL); + memset(st, 0, sizeof(*st)); + st->st_mode = mode; + st->st_nlink = S_ISDIR(mode) ? 2 : 1; + st->st_uid = getuid(); + st->st_gid = getgid(); + st->st_size = size; + st->st_blksize = 4096; + st->st_blocks = (blkcnt64_t)((size + 511) / 512); + st->st_ino = (ino64_t)ino; + st->st_atime = now; + st->st_mtime = now; + st->st_ctime = now; +} + +/* ------------------------------------------------------------------ */ +/* open helpers */ +/* ------------------------------------------------------------------ */ + +static int open_zvfs_file(const char *path, int flags) +{ + char norm[PATH_MAX]; + char parent[PATH_MAX]; + zvfs_dirent_t *dirent; + zvfs_file_t *file; + bool created = false; + int ok; + int fd; + + if (zvfs_ensure_mounted() != 0) { + if (debug_path_enabled(path)) { + debug_log("open_zvfs_file path=%s flags=0x%x mount failed", path, flags); + } + errno = EIO; + return -1; + } + if (normalize_path(path, norm, sizeof(norm)) != 0) { + if (debug_path_enabled(path)) { + debug_log("open_zvfs_file path=%s flags=0x%x normalize failed", path, flags); + } + errno = ENOENT; + return -1; + } + + if (dirs_exists(norm)) { + int accmode = flags & O_ACCMODE; + if (accmode != O_RDONLY || (flags & (O_CREAT | O_TRUNC))) { + errno = EISDIR; + return -1; + } + fd = dirfd_alloc(norm, flags); + if (fd < 0) { + errno = EMFILE; + return -1; + } + if (debug_path_enabled(norm)) { + debug_log("open_zvfs_file path=%s flags=0x%x -> dirfd=%d", norm, flags, fd); + } + return fd; + } + if (flags & O_DIRECTORY) { + errno = ENOTDIR; + return -1; + } + if 
(get_parent_dir(norm, parent, sizeof(parent)) != 0 || !dirs_exists(parent)) { + errno = ENOENT; + return -1; + } + + dirent = dirent_find(norm); + if (!dirent) { + if (!(flags & O_CREAT)) { + errno = ENOENT; + return -1; + } + dirent = dirent_alloc(norm); + if (!dirent) { + errno = ENOMEM; + return -1; + } + created = true; + } else { + if ((flags & O_CREAT) && (flags & O_EXCL)) { + errno = EEXIST; + return -1; + } + } + + file = calloc(1, sizeof(*file)); + if (!file) { + errno = ENOMEM; + return -1; + } + file->fs = g_fs; + file->dirent = dirent; + file->flags = flags; + + if (dirent->blob_id == 0) { + ok = zvfs_create(file); + if (ok) { + dirent->blob_id = file->blob_id; + } + } else { + file->blob_id = dirent->blob_id; + ok = zvfs_open(file); + } if (!ok) { free(file); errno = EIO; return -1; } - /* 分配伪 fd */ - int fd = fd_alloc(file); + if ((flags & O_TRUNC) && can_write(file)) { + dirent->file_size = 0; + file->current_offset = 0; + } else if (flags & O_APPEND) { + file->current_offset = dirent->file_size; + } + + fd = fd_alloc(file); if (fd < 0) { - zvfs_close(file); + (void)zvfs_close(file); free(file); errno = EMFILE; return -1; } - dirent->open_count++; + if (created || (flags & O_TRUNC)) { + (void)meta_save(g_fs); + } + if (debug_path_enabled(norm)) { + debug_log("open_zvfs_file path=%s flags=0x%x created=%d blob=%" PRIu64 " -> fd=%d", + norm, flags, created ? 
1 : 0, file->blob_id, fd); + } return fd; } - -/* ------------------------------------------------------------------ */ -/* POSIX hook: read */ -/* ------------------------------------------------------------------ */ -ssize_t read(int fd, void *buf, size_t count) { - if (!is_zvfs_fd(fd)) { - return real_read_fn(fd, buf, count); - } - - zvfs_file_t *file = fd_lookup(fd); - if (!file) { - errno = EBADF; +static int resolve_path_at(int dirfd, const char *path, char *resolved, size_t sz) +{ + if (!path || !resolved || sz == 0) { + errno = EINVAL; return -1; } - - if (!(file->flags & O_RDWR) && (file->flags & O_WRONLY)) { - errno = EBADF; - return -1; - } - - return zvfs_read(file, (uint8_t *)buf, count); -} - - -/* ------------------------------------------------------------------ */ -/* POSIX hook: write */ -/* ------------------------------------------------------------------ */ - -ssize_t write(int fd, const void *buf, size_t count) { - if (!is_zvfs_fd(fd)) { - return real_write_fn(fd, buf, count); - } - - zvfs_file_t *file = fd_lookup(fd); - if (!file) { - errno = EBADF; - return -1; - } - - return zvfs_write(file, (const uint8_t *)buf, count); -} - - -/* ------------------------------------------------------------------ */ -/* POSIX hook: close */ -/* ------------------------------------------------------------------ */ - -int close(int fd) { - if (!is_zvfs_fd(fd)) { - return real_close_fn(fd); - } - - zvfs_file_t *file = fd_lookup(fd); - if (!file) { - errno = EBADF; - return -1; - } - - zvfs_dirent_t *dirent = file->dirent; - - /* 关闭 blob */ - zvfs_close(file); - - /* 释放 fd slot */ - fd_free(fd); - - /* 更新引用计数 */ - if (dirent) { - dirent->open_count--; - if(dirent->open_count == 0 && !dirent->is_valid){ - zvfs_delete(file); - /* 从 dirents 数组中移除 */ - for (uint32_t i = 0; i < g_fs->dirent_count; i++) { - if (g_fs->dirents[i] == dirent) { - free(dirent); - g_fs->dirents[i] = g_fs->dirents[--g_fs->dirent_count]; - g_fs->dirents[g_fs->dirent_count] = NULL; - break; 
- } - } - meta_save(g_fs); + if (path[0] == '/') { + if (normalize_path(path, resolved, sz) != 0) { + errno = ENOENT; + return -1; } + return 0; } - - free(file); - return 0; + if (dirfd == AT_FDCWD) { + return -1; + } + if (is_zvfs_fd(dirfd)) { + errno = ENOTDIR; + return -1; + } + if (is_zvfs_dirfd(dirfd)) { + zvfs_dirfd_t *d = dirfd_lookup(dirfd); + if (!d || join_dir_path(d->path, path, resolved, sz) != 0) { + errno = ENOENT; + return -1; + } + return 0; + } + return -1; } /* ------------------------------------------------------------------ */ -/* POSIX hook: unlink */ +/* POSIX hooks */ /* ------------------------------------------------------------------ */ -int unlink(const char *name) { + +int __open_2(const char *path, int flags) +{ + return open(path, flags); +} + +int __open64_2(const char *path, int flags) +{ + return open64(path, flags); +} + +int __openat_2(int dirfd, const char *path, int flags) +{ + return openat(dirfd, path, flags); +} + +int __openat64_2(int dirfd, const char *path, int flags) +{ + return openat64(dirfd, path, flags); +} + +DIR *opendir(const char *name) +{ + DIR *dirp; + int fd; + if (!is_zvfs_path(name)) { - return real_unlink_fn(name); - } - - if (zvfs_ensure_mounted() != 0) { - errno = EIO; - return -1; - } - - zvfs_dirent_t *dirent = dirent_find(name); - if (!dirent) { - errno = ENOENT; - return -1; - } - - if (dirent->open_count > 0) { - /* 还有人打开着,延迟删除:标记无效,等最后一次 close 时再 delete blob */ - dirent->is_valid = false; - } else { - /* 没人打开,直接删除 blob */ - zvfs_file_t tmp = {0}; - tmp.fs = g_fs; - tmp.dirent = dirent; - tmp.blob_id = dirent->blob_id; - zvfs_delete(&tmp); - - /* 从 dirents 数组中移除 */ - for (uint32_t i = 0; i < g_fs->dirent_count; i++) { - if (g_fs->dirents[i] == dirent) { - free(dirent); - g_fs->dirents[i] = g_fs->dirents[--g_fs->dirent_count]; - g_fs->dirents[g_fs->dirent_count] = NULL; - break; - } + if (!real_opendir_fn) { + errno = ENOSYS; + return NULL; } - - meta_save(g_fs); + return real_opendir_fn(name); } + 
fd = open_zvfs_file(name, O_RDONLY | O_DIRECTORY); + if (fd < 0) { + return NULL; + } + dirp = create_zvfs_dirstream(name, fd); + if (!dirp) { + (void)close(fd); + return NULL; + } + if (debug_path_enabled(name)) { + debug_log("opendir path=%s -> dirp=%p fd=%d", name, (void *)dirp, fd); + } + return dirp; +} + +DIR *fdopendir(int fd) +{ + DIR *dirp; + zvfs_dirfd_t *d; + + if (!is_zvfs_dirfd(fd)) { + if (!real_fdopendir_fn) { + errno = ENOSYS; + return NULL; + } + return real_fdopendir_fn(fd); + } + d = dirfd_lookup(fd); + if (!d) { + errno = EBADF; + return NULL; + } + dirp = create_zvfs_dirstream(d->path, fd); + if (debug_fd_enabled(fd)) { + debug_log("fdopendir fd=%d path=%s -> dirp=%p", fd, d->path, (void *)dirp); + } + return dirp; +} + +struct dirent *readdir(DIR *dirp) +{ + zvfs_dir_stream_t *s; + + if (!is_zvfs_dirstream(dirp)) { + return real_readdir_fn ? real_readdir_fn(dirp) : NULL; + } + s = (zvfs_dir_stream_t *)(void *)dirp; + if (s->pos >= s->count) { + return NULL; + } + memset(&s->ent, 0, sizeof(s->ent)); + s->ent.d_ino = (ino_t)path_hash(s->items[s->pos].name); + s->ent.d_off = (off_t)(s->pos + 1); + s->ent.d_reclen = (unsigned short)sizeof(struct dirent); + s->ent.d_type = s->items[s->pos].type; + snprintf(s->ent.d_name, sizeof(s->ent.d_name), "%s", s->items[s->pos].name); + s->pos++; + return &s->ent; +} + +struct dirent64 *readdir64(DIR *dirp) +{ + zvfs_dir_stream_t *s; + + if (!is_zvfs_dirstream(dirp)) { + return real_readdir64_fn ? 
real_readdir64_fn(dirp) : (struct dirent64 *)readdir(dirp); + } + s = (zvfs_dir_stream_t *)(void *)dirp; + if (s->pos >= s->count) { + return NULL; + } + memset(&s->ent64, 0, sizeof(s->ent64)); + s->ent64.d_ino = (ino64_t)path_hash(s->items[s->pos].name); + s->ent64.d_off = (off64_t)(s->pos + 1); + s->ent64.d_reclen = (unsigned short)sizeof(struct dirent64); + s->ent64.d_type = s->items[s->pos].type; + snprintf(s->ent64.d_name, sizeof(s->ent64.d_name), "%s", s->items[s->pos].name); + s->pos++; + return &s->ent64; +} + +int closedir(DIR *dirp) +{ + zvfs_dir_stream_t *s; + + if (!is_zvfs_dirstream(dirp)) { + return real_closedir_fn ? real_closedir_fn(dirp) : -1; + } + s = (zvfs_dir_stream_t *)(void *)dirp; + if (s->pseudo_fd >= 0) { + (void)close(s->pseudo_fd); + s->pseudo_fd = -1; + } + free(s->items); + s->items = NULL; + s->magic = 0; + free(s); return 0; } -/* ------------------------------------------------------------------ */ -/* POSIX hook: unlink */ -/* ------------------------------------------------------------------ */ -/** - * SEEK_SET - * SEEK_CUR - * SEEK_END - */ -off_t lseek(int fd, off_t offset, int whence){ - if (!is_zvfs_fd(fd)) { - return real_lseek_fn(fd, offset, whence); +int dirfd(DIR *dirp) +{ + zvfs_dir_stream_t *s; + + if (!is_zvfs_dirstream(dirp)) { + return real_dirfd_fn ? real_dirfd_fn(dirp) : -1; + } + s = (zvfs_dir_stream_t *)(void *)dirp; + if (s->pseudo_fd < 0) { + errno = EINVAL; + return -1; + } + return s->pseudo_fd; +} + +int open(const char *path, int flags, ...) +{ + mode_t mode = 0; + bool has_mode = (flags & O_CREAT) != 0; + if (has_mode) { + va_list ap; + va_start(ap, flags); + mode = va_arg(ap, mode_t); + va_end(ap); + } + if (!is_zvfs_path(path)) { + return real_open_passthrough(path, flags, mode, has_mode); + } + if (debug_path_enabled(path)) { + debug_log("open path=%s flags=0x%x", path, flags); + } + return open_zvfs_file(path, flags); +} + +int open64(const char *path, int flags, ...) 
+{ + mode_t mode = 0; + bool has_mode = (flags & O_CREAT) != 0; + if (has_mode) { + va_list ap; + va_start(ap, flags); + mode = va_arg(ap, mode_t); + va_end(ap); + } + if (!is_zvfs_path(path)) { + return real_open_passthrough(path, flags, mode, has_mode); + } + if (debug_path_enabled(path)) { + debug_log("open64 path=%s flags=0x%x", path, flags); + } + return open_zvfs_file(path, flags); +} + +int openat(int dirfd, const char *path, int flags, ...) +{ + mode_t mode = 0; + bool has_mode = (flags & O_CREAT) != 0; + char resolved[PATH_MAX]; + + if (has_mode) { + va_list ap; + va_start(ap, flags); + mode = va_arg(ap, mode_t); + va_end(ap); } - zvfs_file_t *file = fd_lookup(fd); - if (!file) { errno = EBADF; return -1; } + if (path[0] == '/' && is_zvfs_path(path)) { + if (debug_path_enabled(path)) { + debug_log("openat dirfd=%d path=%s flags=0x%x (abs)", dirfd, path, flags); + } + return open_zvfs_file(path, flags); + } + if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) { + if (debug_path_enabled(resolved)) { + debug_log("openat dirfd=%d path=%s resolved=%s flags=0x%x", dirfd, path, resolved, flags); + } + return open_zvfs_file(resolved, flags); + } + return real_openat_passthrough(dirfd, path, flags, mode, has_mode); +} - off_t new_offset; - uint64_t file_size = file->dirent ? file->dirent->file_size : 0; +int openat64(int dirfd, const char *path, int flags, ...) 
+{ + mode_t mode = 0; + bool has_mode = (flags & O_CREAT) != 0; + char resolved[PATH_MAX]; + + if (has_mode) { + va_list ap; + va_start(ap, flags); + mode = va_arg(ap, mode_t); + va_end(ap); + } + + if (path[0] == '/' && is_zvfs_path(path)) { + if (debug_path_enabled(path)) { + debug_log("openat64 dirfd=%d path=%s flags=0x%x (abs)", dirfd, path, flags); + } + return open_zvfs_file(path, flags); + } + if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) { + if (debug_path_enabled(resolved)) { + debug_log("openat64 dirfd=%d path=%s resolved=%s flags=0x%x", dirfd, path, resolved, flags); + } + return open_zvfs_file(resolved, flags); + } + return real_openat_passthrough(dirfd, path, flags, mode, has_mode); +} + +FILE *fopen(const char *path, const char *mode) +{ + FILE *ret; + int saved_errno = 0; + + if (!real_fopen_fn) { + errno = ENOSYS; + return NULL; + } + ret = real_fopen_fn(path, mode); + if (!ret) { + saved_errno = errno; + } + if (debug_path_enabled(path)) { + debug_log("fopen path=%s mode=%s -> %p errno=%d", path, mode, (void *)ret, saved_errno); + } + if (!ret) { + errno = saved_errno; + } + return ret; +} + +FILE *fopen64(const char *path, const char *mode) +{ + FILE *ret; + int saved_errno = 0; + + if (!real_fopen64_fn) { + return fopen(path, mode); + } + ret = real_fopen64_fn(path, mode); + if (!ret) { + saved_errno = errno; + } + if (debug_path_enabled(path)) { + debug_log("fopen64 path=%s mode=%s -> %p errno=%d", path, mode, (void *)ret, saved_errno); + } + if (!ret) { + errno = saved_errno; + } + return ret; +} + +FILE *fdopen(int fd, const char *mode) +{ + FILE *ret; + int saved_errno = 0; + + if (!is_zvfs_fd(fd)) { + if (!real_fdopen_fn) { + errno = ENOSYS; + return NULL; + } + ret = real_fdopen_fn(fd, mode); + if (!ret) { + saved_errno = errno; + } + if (debug_fd_enabled(fd)) { + const char *path = debug_fd_path(fd); + debug_log("fdopen(real) fd=%d path=%s mode=%s -> %p errno=%d", + fd, path ? 
path : "?", mode, (void *)ret, saved_errno); + } + if (!ret) { + errno = saved_errno; + } + return ret; + } - switch (whence) { + cookie_io_functions_t io = {0}; + zvfs_stdio_cookie_t *cookie; + + cookie = calloc(1, sizeof(*cookie)); + if (!cookie) { + errno = ENOMEM; + return NULL; + } + cookie->fd = fd; + io.read = zvfs_cookie_read; + io.write = zvfs_cookie_write; + io.seek = zvfs_cookie_seek; + io.close = zvfs_cookie_close; + + ret = fopencookie(cookie, mode, io); + if (!ret) { + saved_errno = errno; + free(cookie); + } + + if (debug_fd_enabled(fd)) { + const char *path = debug_fd_path(fd); + debug_log("fdopen(cookie) fd=%d path=%s mode=%s -> %p errno=%d", + fd, path ? path : "?", mode, (void *)ret, saved_errno); + } + if (!ret) { + errno = saved_errno; + } + return ret; + } +} + +ssize_t read(int fd, void *buf, size_t count) +{ + zvfs_file_t *file; + int rc; + const char *path; + + if (!is_zvfs_fd(fd)) { + return real_read_fn ? real_read_fn(fd, buf, count) : -1; + } + path = debug_fd_path(fd); + file = fd_lookup(fd); + if (!file || !can_read(file)) { + if (debug_fd_enabled(fd)) { + debug_log("read fd=%d path=%s count=%zu -> EBADF", fd, path ? path : "?", count); + } + errno = EBADF; + return -1; + } + rc = zvfs_read(file, (uint8_t *)buf, count); + if (rc < 0) { + errno = file->op_errno ? -file->op_errno : EIO; + } + if (debug_fd_enabled(fd) && (rc < 0 || count <= 64)) { + debug_log("read fd=%d path=%s count=%zu -> rc=%d errno=%d", fd, path ? path : "?", count, rc, rc < 0 ? errno : 0); + } + return rc; +} + +ssize_t write(int fd, const void *buf, size_t count) +{ + zvfs_file_t *file; + int rc; + + if (!is_zvfs_fd(fd)) { + return real_write_fn ? real_write_fn(fd, buf, count) : -1; + } + file = fd_lookup(fd); + if (!file || !can_write(file)) { + errno = EBADF; + return -1; + } + if (file->flags & O_APPEND) { + file->current_offset = file->dirent ? 
file->dirent->file_size : file->current_offset;
+    }
+    rc = zvfs_write(file, (const uint8_t *)buf, count);
+    if (rc < 0) {
+        errno = file->op_errno ? -file->op_errno : EIO;
+    }
+    return rc;
+}
+/* pread(2) hook: zvfs fds are served by zvfs_pread(); every other fd goes to the real pread. */
+ssize_t pread(int fd, void *buf, size_t count, off_t offset)
+{
+    zvfs_file_t *file;
+    int rc;
+    const char *path;
+    if (!is_zvfs_fd(fd)) {
+        return real_pread_fn ? real_pread_fn(fd, buf, count, offset) : -1;
+    }
+    if (offset < 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    path = debug_fd_path(fd);
+    file = fd_lookup(fd);
+    if (!file || !can_read(file)) {
+        if (debug_fd_enabled(fd)) {
+            debug_log("pread fd=%d path=%s count=%zu off=%lld -> EBADF",
+                      fd, path ? path : "?", count, (long long)offset);
+        }
+        errno = EBADF;
+        return -1;
+    }
+    rc = zvfs_pread(file, (uint8_t *)buf, count, (uint64_t)offset);
+    if (rc < 0) {
+        errno = file->op_errno ? -file->op_errno : EIO;
+    }
+    if (debug_fd_enabled(fd) && (rc < 0 || count <= 64)) {
+        debug_log("pread fd=%d path=%s count=%zu off=%lld -> rc=%d errno=%d",
+                  fd, path ? path : "?", count, (long long)offset, rc, rc < 0 ? errno : 0);
+    }
+    return rc;
+}
+/* pwrite(2) hook: zvfs fds are served by zvfs_pwrite(); every other fd goes to the real pwrite. */
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+    zvfs_file_t *file;
+    int rc;
+    if (!is_zvfs_fd(fd)) {
+        return real_pwrite_fn ? real_pwrite_fn(fd, buf, count, offset) : -1;
+    }
+    if (offset < 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    file = fd_lookup(fd);
+    if (!file || !can_write(file)) {
+        errno = EBADF;
+        return -1;
+    }
+    rc = zvfs_pwrite(file, (const uint8_t *)buf, count, (uint64_t)offset);
+    if (rc < 0) {
+        errno = file->op_errno ? -file->op_errno : EIO;
+    }
+    return rc;
+}
+/* 64-bit-offset alias: forwards unchanged to the pread() hook. */
+ssize_t pread64(int fd, void *buf, size_t count, off_t offset)
+{
+    return pread(fd, buf, count, offset);
+}
+/* 64-bit-offset alias: forwards unchanged to the pwrite() hook. */
+ssize_t pwrite64(int fd, const void *buf, size_t count, off_t offset)
+{
+    return pwrite(fd, buf, count, offset);
+}
+/* lseek(2) hook: updates current_offset of a zvfs file; every other fd goes to the real lseek. */
+off_t lseek(int fd, off_t offset, int whence)
+{
+    zvfs_file_t *file;
+    off_t new_offset;
+    uint64_t file_size;
+
+    if (!is_zvfs_fd(fd)) {
+        return real_lseek_fn ? real_lseek_fn(fd, offset, whence) : -1;
+    }
+    file = fd_lookup(fd);
+    if (!file) {
+        errno = EBADF;
+        return -1;
+    }
+    file_size = file->dirent ? file->dirent->file_size : 0;
+    switch (whence) {
     case SEEK_SET:
         new_offset = offset;
         break;
@@ -509,17 +1622,787 @@ off_t lseek(int fd, off_t offset, int whence){
     case SEEK_END:
         new_offset = (off_t)file_size + offset;
         break;
-
     default:
         errno = EINVAL;
         return -1;
     }
-
     if (new_offset < 0) {
         errno = EINVAL;
         return -1;
     }
-
     file->current_offset = (uint64_t)new_offset;
+    if (debug_fd_enabled(fd)) {
+        const char *path = debug_fd_path(fd);
+        debug_log("lseek fd=%d path=%s off=%lld whence=%d -> %lld",
+                  fd, path ? path : "?", (long long)offset, whence, (long long)new_offset);
+    }
     return new_offset;
-}
\ No newline at end of file
+}
+
+/* close(2) hook: releases zvfs file/dir fds; deletes the blob when an invalidated file's last fd closes. */
+int close(int fd)
+{
+    zvfs_file_t *file;
+    zvfs_dirent_t *dirent;
+    char log_path[PATH_MAX];
+    bool log_enabled = false;
+
+    if (!is_zvfs_fd(fd) && !is_zvfs_dirfd(fd)) {
+        return real_close_fn ? real_close_fn(fd) : -1;
+    }
+    if (is_zvfs_dirfd(fd)) {
+        if (debug_fd_enabled(fd)) {
+            const char *path = debug_fd_path(fd);
+            debug_log("close dirfd=%d path=%s", fd, path ? path : "?");
+        }
+        dirfd_free(fd);
+        return 0;
+    }
+
+    file = fd_lookup(fd);
+    if (!file) {
+        errno = EBADF;
+        return -1;
+    }
+    dirent = file->dirent;
+    if (dirent && debug_path_enabled(dirent->filename)) {
+        snprintf(log_path, sizeof(log_path), "%s", dirent->filename);
+        log_enabled = true;
+    }
+
+    if (!zvfs_close(file)) {
+        errno = file->op_errno ? -file->op_errno : EIO;
+        return -1;
+    }
+    fd_free(fd);
+
+    if (dirent) {
+        dirent->open_count--;
+        if (dirent->open_count == 0 && !dirent->is_valid) {
+            if (!zvfs_delete(file)) {
+                int err = file->op_errno ? -file->op_errno : EIO;
+                free(file); /* fd_free() already ran, so this path must not leak the handle */
+                errno = err;
+                return -1;
+            }
+            dirent_remove(dirent);
+            (void)meta_save(g_fs);
+        }
+    }
+    free(file);
+    if (log_enabled) {
+        debug_log("close fd=%d path=%s", fd, log_path);
+    }
+    return 0;
+}
+
+/* unlink(2) hook: deletes a zvfs file now, or only marks it invalid while it still has open fds. */
+int unlink(const char *name)
+{
+    char norm[PATH_MAX];
+    zvfs_dirent_t *d;
+    if (!is_zvfs_path(name)) {
+        return real_unlink_fn ? real_unlink_fn(name) : -1;
+    }
+    if (zvfs_ensure_mounted() != 0 || normalize_path(name, norm, sizeof(norm)) != 0) {
+        errno = EIO;
+        return -1;
+    }
+    if (dirs_exists(norm)) {
+        errno = EISDIR;
+        return -1;
+    }
+    d = dirent_find(norm);
+    if (!d) {
+        errno = ENOENT;
+        return -1;
+    }
+    if (d->open_count > 0) {
+        d->is_valid = false;
+        return 0;
+    }
+
+    {
+        zvfs_file_t tmp = {0};
+        tmp.fs = g_fs;
+        tmp.dirent = d;
+        tmp.blob_id = d->blob_id;
+        if (!zvfs_delete(&tmp)) {
+            errno = tmp.op_errno ? -tmp.op_errno : EIO;
+            return -1;
+        }
+    }
+    dirent_remove(d);
+    (void)meta_save(g_fs);
+    return 0;
+}
+
+/* unlinkat(2) hook: zvfs paths go to the rmdir()/unlink() hooks, everything else to the real unlinkat. */
+int unlinkat(int dirfd, const char *path, int flags)
+{
+    char resolved[PATH_MAX];
+
+    if (flags & AT_REMOVEDIR) {
+        if (path[0] == '/' && is_zvfs_path(path)) {
+            return rmdir(path);
+        }
+        if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) {
+            return rmdir(resolved);
+        }
+    } else {
+        if (path[0] == '/' && is_zvfs_path(path)) {
+            return unlink(path);
+        }
+        if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) {
+            return unlink(resolved);
+        }
+    }
+    return real_unlinkat_fn ? real_unlinkat_fn(dirfd, path, flags) : -1;
+}
+
+/* fsync(2) hook: returns success for zvfs fds without doing any work; other fds use the real fsync. */
+int fsync(int fd)
+{
+    if (is_zvfs_fd(fd) || is_zvfs_dirfd(fd)) {
+        return 0;
+    }
+    return real_fsync_fn ? real_fsync_fn(fd) : -1;
+}
+
+/* fdatasync(2) hook: returns success for zvfs fds without doing any work; other fds use the real one. */
+int fdatasync(int fd)
+{
+    if (is_zvfs_fd(fd) || is_zvfs_dirfd(fd)) {
+        return 0;
+    }
+    return real_fdatasync_fn ? real_fdatasync_fn(fd) : -1;
+}
+
+/* ftruncate(2) hook: sets file_size of a zvfs file; growing first writes one zero byte at length - 1. */
+int ftruncate(int fd, off_t length)
+{
+    zvfs_file_t *file;
+    if (!is_zvfs_fd(fd)) {
+        return real_ftruncate_fn ? real_ftruncate_fn(fd, length) : -1;
+    }
+    if (length < 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    file = fd_lookup(fd);
+    if (!file || !file->dirent || !can_write(file)) {
+        errno = EBADF;
+        return -1;
+    }
+
+    if ((uint64_t)length > file->dirent->file_size && length > 0) {
+        uint8_t zero = 0;
+        if (zvfs_pwrite(file, &zero, 1, (uint64_t)length - 1) != 1) {
+            errno = file->op_errno ? -file->op_errno : EIO;
+            return -1;
+        }
+    }
+    file->dirent->file_size = (uint64_t)length;
+    if (file->current_offset > (uint64_t)length) {
+        file->current_offset = (uint64_t)length;
+    }
+    (void)meta_save(g_fs);
+    return 0;
+}
+
+/* fallocate(2) hook: mode 0 extends a zvfs file with a one-byte write at the range end; KEEP_SIZE does nothing. */
+int fallocate(int fd, int mode, off_t offset, off_t len)
+{
+    zvfs_file_t *file;
+    uint8_t zero = 0;
+    uint64_t end;
+    bool keep_size;
+
+    if (!is_zvfs_fd(fd)) {
+        return real_fallocate_fn ? real_fallocate_fn(fd, mode, offset, len) : -1;
+    }
+    if (offset < 0 || len < 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    file = fd_lookup(fd);
+    if (!file || !file->dirent || !can_write(file)) {
+        errno = EBADF;
+        return -1;
+    }
+
+    /* Minimal support: mode=0 or KEEP_SIZE only. */
+    keep_size = (mode & FALLOC_FL_KEEP_SIZE) != 0;
+    if (mode & ~FALLOC_FL_KEEP_SIZE) {
+        errno = EOPNOTSUPP;
+        return -1;
+    }
+    if (len == 0) {
+        return 0;
+    }
+
+    end = (uint64_t)offset + (uint64_t)len;
+    if (!keep_size) {
+        if (end > file->dirent->file_size) {
+            if (zvfs_pwrite(file, &zero, 1, end - 1) != 1) {
+                errno = file->op_errno ? -file->op_errno : EIO;
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
+/* posix_fadvise(2) hook: advice is ignored for zvfs fds (returns 0); other fds use the real function. */
+int posix_fadvise(int fd, off_t offset, off_t len, int advice)
+{
+    (void)offset;
+    (void)len;
+    (void)advice;
+    if (is_zvfs_fd(fd) || is_zvfs_dirfd(fd)) {
+        return 0;
+    }
+    if (!real_posix_fadvise_fn) {
+        return 0;
+    }
+    return real_posix_fadvise_fn(fd, offset, len, advice);
+}
+
+/* 64-bit-offset alias: forwards unchanged to the posix_fadvise() hook. */
+int posix_fadvise64(int fd, off_t offset, off_t len, int advice)
+{
+    return posix_fadvise(fd, offset, len, advice);
+}
+
+/* sync_file_range(2) hook: returns 0 for zvfs fds; other fds use the real function (0 if it is missing). */
+int sync_file_range(int fd, off_t offset, off_t nbytes, unsigned int flags)
+{
+    (void)offset;
+    (void)nbytes;
+    (void)flags;
+    if (is_zvfs_fd(fd) || is_zvfs_dirfd(fd)) {
+        return 0;
+    }
+    return real_sync_file_range_fn ? real_sync_file_range_fn(fd, offset, nbytes, flags) : 0;
+}
+
+/* 64-bit-offset alias: forwards unchanged to the fallocate() hook. */
+int fallocate64(int fd, int mode, off_t offset, off_t len)
+{
+    return fallocate(fd, mode, offset, len);
+}
+
+/* mkdir(2) hook: registers a zvfs directory with dirs_add(); mode is ignored and the parent must exist. */
+int mkdir(const char *path, mode_t mode)
+{
+    char norm[PATH_MAX];
+    char parent[PATH_MAX];
+    (void)mode;
+
+    if (!is_zvfs_path(path)) {
+        return real_mkdir_fn ? real_mkdir_fn(path, mode) : -1;
+    }
+    if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) {
+        errno = EIO;
+        return -1;
+    }
+    if (dirs_exists(norm) || dirent_find(norm)) {
+        errno = EEXIST;
+        return -1;
+    }
+    if (get_parent_dir(norm, parent, sizeof(parent)) != 0 || !dirs_exists(parent)) {
+        errno = ENOENT;
+        return -1;
+    }
+    if (dirs_add(norm) != 0) {
+        errno = ENOSPC;
+        return -1;
+    }
+    return 0;
+}
+
+/* rmdir(2) hook: removes a zvfs directory; the root yields EBUSY, a dir with children ENOTEMPTY. */
+int rmdir(const char *path)
+{
+    char norm[PATH_MAX];
+    if (!is_zvfs_path(path)) {
+        return real_rmdir_fn ? real_rmdir_fn(path) : -1;
+    }
+    if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) {
+        errno = EIO;
+        return -1;
+    }
+    if (is_zvfs_root(norm)) {
+        errno = EBUSY;
+        return -1;
+    }
+    if (!dirs_has_exact(norm)) {
+        errno = ENOENT;
+        return -1;
+    }
+    if (dirs_has_children(norm)) {
+        errno = ENOTEMPTY;
+        return -1;
+    }
+    if (dirs_remove(norm) != 0) {
+        errno = ENOENT;
+        return -1;
+    }
+    return 0;
+}
+
+/* rename(2) hook: renames a zvfs file (dirs get EISDIR), replacing an existing target unless it is open (EBUSY). */
+int rename(const char *oldpath, const char *newpath)
+{
+    char old_norm[PATH_MAX];
+    char new_norm[PATH_MAX];
+    char parent[PATH_MAX];
+    zvfs_dirent_t *src;
+    zvfs_dirent_t *dst;
+
+    if (!is_zvfs_path(oldpath) && !is_zvfs_path(newpath)) {
+        return real_rename_fn ? real_rename_fn(oldpath, newpath) : -1;
+    }
+    if (!is_zvfs_path(oldpath) || !is_zvfs_path(newpath)) {
+        errno = EXDEV;
+        return -1;
+    }
+    if (zvfs_ensure_mounted() != 0 ||
+        normalize_path(oldpath, old_norm, sizeof(old_norm)) != 0 ||
+        normalize_path(newpath, new_norm, sizeof(new_norm)) != 0) {
+        errno = EIO;
+        return -1;
+    }
+    if (dirs_exists(old_norm) || dirs_exists(new_norm)) {
+        errno = EISDIR;
+        return -1;
+    }
+    if (get_parent_dir(new_norm, parent, sizeof(parent)) != 0 || !dirs_exists(parent)) {
+        errno = ENOENT;
+        return -1;
+    }
+
+    src = dirent_find(old_norm);
+    if (!src) {
+        errno = ENOENT;
+        return -1;
+    }
+    dst = dirent_find(new_norm);
+    /* Renaming a file onto itself must not delete it: skip the replace step when dst is src. */
+    if (dst && dst != src) {
+        if (dst->open_count > 0) {
+            errno = EBUSY;
+            return -1;
+        }
+        {
+            zvfs_file_t tmp = {0};
+            tmp.fs = g_fs;
+            tmp.dirent = dst;
+            tmp.blob_id = dst->blob_id;
+            if (!zvfs_delete(&tmp)) {
+                errno = tmp.op_errno ? -tmp.op_errno : EIO;
+                return -1;
+            }
+        }
+        dirent_remove(dst);
+    }
+    strncpy(src->filename, new_norm, sizeof(src->filename) - 1);
+    src->filename[sizeof(src->filename) - 1] = '\0';
+    (void)meta_save(g_fs);
+    return 0;
+}
+
+/* access(2) hook: zvfs paths only get an existence check; the requested mode bits are not evaluated. */
+int access(const char *path, int mode)
+{
+    char norm[PATH_MAX];
+    (void)mode;
+    if (!is_zvfs_path(path)) {
+        return real_access_fn ? real_access_fn(path, mode) : -1;
+    }
+    if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) {
+        errno = EIO;
+        return -1;
+    }
+    if (dirs_exists(norm) || dirent_find(norm)) {
+        return 0;
+    }
+    errno = ENOENT;
+    return -1;
+}
+
+/* fcntl(2) hook: answers F_GETFD/F_SETFD/F_GETFL/F_SETFL and lock cmds for zvfs fds; other fds pass through. */
+int fcntl(int fd, int cmd, ...)
+{
+    va_list ap;
+    uintptr_t arg = 0;
+    bool has_arg = false;
+
+    switch (cmd) {
+    case F_DUPFD:
+    case F_DUPFD_CLOEXEC:
+    case F_SETFD:
+    case F_SETFL:
+    case F_SETLK:
+    case F_SETLKW:
+    case F_GETLK:
+        has_arg = true;
+        break;
+    default:
+        break;
+    }
+
+    va_start(ap, cmd);
+    if (has_arg) {
+        arg = va_arg(ap, uintptr_t);
+    }
+    va_end(ap);
+
+    if (!is_zvfs_fd(fd) && !is_zvfs_dirfd(fd)) {
+        if (!real_fcntl_fn) {
+            errno = ENOSYS;
+            return -1;
+        }
+        return has_arg ? real_fcntl_fn(fd, cmd, arg) : real_fcntl_fn(fd, cmd);
+    }
+
+    if (debug_fd_enabled(fd)) {
+        const char *path = debug_fd_path(fd);
+        debug_log("fcntl fd=%d path=%s cmd=%d has_arg=%d", fd, path ? path : "?", cmd, has_arg ? 1 : 0);
+    }
+
+    switch (cmd) {
+    case F_GETFD:
+        return 0;
+    case F_SETFD:
+        return 0;
+    case F_GETFL:
+        if (is_zvfs_fd(fd)) {
+            zvfs_file_t *file = fd_lookup(fd);
+            if (!file) {
+                errno = EBADF;
+                return -1;
+            }
+            return file->flags;
+        }
+        if (is_zvfs_dirfd(fd)) {
+            zvfs_dirfd_t *d = dirfd_lookup(fd);
+            if (!d) {
+                errno = EBADF;
+                return -1;
+            }
+            return d->flags;
+        }
+        errno = EBADF;
+        return -1;
+    case F_SETFL:
+        if (is_zvfs_fd(fd)) {
+            zvfs_file_t *file = fd_lookup(fd);
+            if (!file) {
+                errno = EBADF;
+                return -1;
+            }
+            file->flags = (file->flags & O_ACCMODE) | ((int)arg & ~O_ACCMODE);
+            return 0;
+        }
+        return 0;
+    case F_SETLK:
+    case F_SETLKW:
+        return 0;
+    case F_GETLK:
+        if ((void *)arg) {
+            struct flock *lk = (struct flock *)(void *)arg;
+            lk->l_type = F_UNLCK;
+        }
+        return 0;
+    default:
+        if (debug_fd_enabled(fd)) {
+            const char *path = debug_fd_path(fd);
+            debug_log("fcntl fd=%d path=%s cmd=%d -> EOPNOTSUPP", fd, path ? path : "?", cmd);
+        }
+        errno = EOPNOTSUPP;
+        return -1;
+    }
+}
+
+/* fcntl64 alias: re-reads the optional argument for the same cmd set and forwards to the fcntl() hook. */
+int fcntl64(int fd, int cmd, ...)
+{
+    va_list ap;
+    uintptr_t arg = 0;
+    bool has_arg = false;
+
+    switch (cmd) {
+    case F_DUPFD:
+    case F_DUPFD_CLOEXEC:
+    case F_SETFD:
+    case F_SETFL:
+    case F_SETLK:
+    case F_SETLKW:
+    case F_GETLK:
+        has_arg = true;
+        break;
+    default:
+        break;
+    }
+
+    va_start(ap, cmd);
+    if (has_arg) {
+        arg = va_arg(ap, uintptr_t);
+    }
+    va_end(ap);
+
+    if (has_arg) {
+        return fcntl(fd, cmd, arg);
+    }
+    return fcntl(fd, cmd);
+}
+
+/* stat(2) hook: fills directory or regular-file attributes for zvfs paths through fill_stat(). */
+int stat(const char *path, struct stat *st)
+{
+    char norm[PATH_MAX];
+    zvfs_dirent_t *d;
+
+    if (!is_zvfs_path(path)) {
+        return real_stat_fn ? real_stat_fn(path, st) : -1;
+    }
+    if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) {
+        errno = EIO;
+        return -1;
+    }
+
+    if (dirs_exists(norm)) {
+        fill_stat(st, S_IFDIR | 0755, 4096, path_hash(norm));
+        return 0;
+    }
+    d = dirent_find(norm);
+    if (d) {
+        fill_stat(st, S_IFREG | 0644, (off_t)d->file_size, d->blob_id);
+        return 0;
+    }
+    errno = ENOENT;
+    return -1;
+}
+
+/* lstat(2) hook: zvfs paths are handled exactly like stat(); other paths go to the real lstat. */
+int lstat(const char *path, struct stat *st)
+{
+    if (!is_zvfs_path(path)) {
+        return real_lstat_fn ? real_lstat_fn(path, st) : -1;
+    }
+    return stat(path, st);
+}
+
+/* fstat(2) hook: fills attributes for zvfs directory fds and file fds; other fds go to the real fstat. */
+int fstat(int fd, struct stat *st)
+{
+    if (!is_zvfs_fd(fd) && !is_zvfs_dirfd(fd)) {
+        return real_fstat_fn ? real_fstat_fn(fd, st) : -1;
+    }
+    if (is_zvfs_dirfd(fd)) {
+        zvfs_dirfd_t *d = dirfd_lookup(fd);
+        if (!d) {
+            errno = EBADF;
+            return -1;
+        }
+        fill_stat(st, S_IFDIR | 0755, 4096, path_hash(d->path));
+        return 0;
+    }
+
+    {
+        zvfs_file_t *file = fd_lookup(fd);
+        if (!file || !file->dirent) {
+            errno = EBADF;
+            return -1;
+        }
+        fill_stat(st, S_IFREG | 0644, (off_t)file->dirent->file_size, file->dirent->blob_id);
+        return 0;
+    }
+}
+
+/* fstatat(2) hook: uses stat() when the absolute or dirfd-resolved path is a zvfs path; flags are ignored there. */
+int fstatat(int dirfd, const char *path, struct stat *st, int flags)
+{
+    char resolved[PATH_MAX];
+    (void)flags;
+    if (path[0] == '/' && is_zvfs_path(path)) {
+        return stat(path, st);
+    }
+    if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) {
+        return stat(resolved, st);
+    }
+    return real_fstatat_fn ? real_fstatat_fn(dirfd, path, st, flags) : -1;
+}
+
+/* glibc versioned stat wrappers: the version argument is dropped and the plain hooks are called. */
+int __xstat(int ver, const char *path, struct stat *st)
+{
+    (void)ver;
+    return stat(path, st);
+}
+
+int __lxstat(int ver, const char *path, struct stat *st)
+{
+    (void)ver;
+    return lstat(path, st);
+}
+
+int __fxstat(int ver, int fd, struct stat *st)
+{
+    (void)ver;
+    return fstat(fd, st);
+}
+
+int __fxstatat(int ver, int dirfd, const char *path, struct stat *st, int flags)
+{
+    (void)ver;
+    return fstatat(dirfd, path, st, flags);
+}
+
+/* 64-bit-offset alias: forwards to the lseek() hook, casting the offset to off_t and the result back. */
+off64_t lseek64(int fd, off64_t offset, int whence)
+{
+    return (off64_t)lseek(fd, (off_t)offset, whence);
+}
+
+/* stat64 hook: struct stat64 counterpart of stat(); non-zvfs paths prefer real stat64, then real stat. */
+int stat64(const char *path, struct stat64 *st)
+{
+    char norm[PATH_MAX];
+    zvfs_dirent_t *d;
+
+    if (!is_zvfs_path(path)) {
+        if (real_stat64_fn) {
+            return real_stat64_fn(path, st);
+        }
+        return real_stat_fn ? real_stat_fn(path, (struct stat *)(void *)st) : -1;
+    }
+    if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) {
+        errno = EIO;
+        return -1;
+    }
+
+    if (dirs_exists(norm)) {
+        fill_stat64(st, S_IFDIR | 0755, 4096, path_hash(norm));
+        return 0;
+    }
+    d = dirent_find(norm);
+    if (d) {
+        fill_stat64(st, S_IFREG | 0644, (off64_t)d->file_size, d->blob_id);
+        return 0;
+    }
+    errno = ENOENT;
+    return -1;
+}
+
+/* lstat64 hook: zvfs paths are handled by stat64(); other paths prefer real lstat64, then real lstat. */
+int lstat64(const char *path, struct stat64 *st)
+{
+    if (!is_zvfs_path(path)) {
+        if (real_lstat64_fn) {
+            return real_lstat64_fn(path, st);
+        }
+        return real_lstat_fn ? real_lstat_fn(path, (struct stat *)(void *)st) : -1;
+    }
+    return stat64(path, st);
+}
+
+/* fstat64 hook: struct stat64 counterpart of fstat(); non-zvfs fds prefer real fstat64, then real fstat. */
+int fstat64(int fd, struct stat64 *st)
+{
+    if (!is_zvfs_fd(fd) && !is_zvfs_dirfd(fd)) {
+        if (real_fstat64_fn) {
+            return real_fstat64_fn(fd, st);
+        }
+        return real_fstat_fn ? real_fstat_fn(fd, (struct stat *)(void *)st) : -1;
+    }
+    if (is_zvfs_dirfd(fd)) {
+        zvfs_dirfd_t *d = dirfd_lookup(fd);
+        if (!d) {
+            errno = EBADF;
+            return -1;
+        }
+        fill_stat64(st, S_IFDIR | 0755, 4096, path_hash(d->path));
+        return 0;
+    }
+
+    {
+        zvfs_file_t *file = fd_lookup(fd);
+        if (!file || !file->dirent) {
+            errno = EBADF;
+            return -1;
+        }
+        fill_stat64(st, S_IFREG | 0644, (off64_t)file->dirent->file_size, file->dirent->blob_id);
+        return 0;
+    }
+}
+
+/* fstatat64 hook: uses stat64() for zvfs paths; otherwise prefers real fstatat64, then real fstatat. */
+int fstatat64(int dirfd, const char *path, struct stat64 *st, int flags)
+{
+    char resolved[PATH_MAX];
+    (void)flags;
+    if (path[0] == '/' && is_zvfs_path(path)) {
+        return stat64(path, st);
+    }
+    if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) {
+        return stat64(resolved, st);
+    }
+    if (real_fstatat64_fn) {
+        return real_fstatat64_fn(dirfd, path, st, flags);
+    }
+    return real_fstatat_fn ? real_fstatat_fn(dirfd, path, (struct stat *)(void *)st, flags) : -1;
+}
+
+/* glibc versioned entry: zvfs paths use stat64(); others prefer real __xstat64, else fall back to stat64(). */
+int __xstat64(int ver, const char *path, struct stat64 *st)
+{
+    if (is_zvfs_path(path)) {
+        return stat64(path, st);
+    }
+    if (real___xstat64_fn) {
+        return real___xstat64_fn(ver, path, st);
+    }
+    return stat64(path, st);
+}
+
+/* glibc versioned entry: zvfs paths use lstat64(); others prefer real __lxstat64, else fall back to lstat64(). */
+int __lxstat64(int ver, const char *path, struct stat64 *st)
+{
+    if (is_zvfs_path(path)) {
+        return lstat64(path, st);
+    }
+    if (real___lxstat64_fn) {
+        return real___lxstat64_fn(ver, path, st);
+    }
+    return lstat64(path, st);
+}
+
+/* glibc versioned entry: zvfs fds use fstat64(); others prefer real __fxstat64, else fall back to fstat64(). */
+int __fxstat64(int ver, int fd, struct stat64 *st)
+{
+    if (is_zvfs_fd(fd) || is_zvfs_dirfd(fd)) {
+        return fstat64(fd, st);
+    }
+    if (real___fxstat64_fn) {
+        return real___fxstat64_fn(ver, fd, st);
+    }
+    return fstat64(fd, st);
+}
+
+/* glibc versioned entry: zvfs paths use fstatat64(); others prefer real __fxstatat64, else fstatat64(). */
+int __fxstatat64(int ver, int dirfd, const char *path, struct stat64 *st, int flags)
+{
+    char resolved[PATH_MAX];
+    if (path[0] == '/' && is_zvfs_path(path)) {
+        return fstatat64(dirfd, path, st, flags);
+    }
+    if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) {
+        return fstatat64(dirfd, path, st, flags);
+    }
+    if (real___fxstatat64_fn) {
+        return real___fxstatat64_fn(ver, dirfd, path, st, flags);
+    }
+    return fstatat64(dirfd, path, st, flags);
+}