diff --git a/.gitignore b/.gitignore index 9ab0591..3422ec7 100755 --- a/.gitignore +++ b/.gitignore @@ -2,8 +2,5 @@ *.d *.so -test/bin - -zvfs/func_test -zvfs_meta.txt -zvfs/zvfs_meta.txt \ No newline at end of file +codex/ +tests/bin diff --git a/Makefile b/Makefile index 6a94de9..503ba83 100755 --- a/Makefile +++ b/Makefile @@ -3,14 +3,11 @@ all: zvfs zvfs: - $(MAKE) -C zvfs + $(MAKE) -C src test: - $(MAKE) -C test - -run-test: - $(MAKE) -C test run-test + $(MAKE) -C tests clean: - $(MAKE) -C zvfs clean - $(MAKE) -C test clean + $(MAKE) -C src clean + $(MAKE) -C tests clean diff --git a/fio/readme.md b/fio/readme.md deleted file mode 100644 index 4759e5d..0000000 --- a/fio/readme.md +++ /dev/null @@ -1,9 +0,0 @@ -```shell - -sudo apt install fio - -fio sample.fio -LD_PRELOAD=../zvfs/libzvfs.so fio zvfs.fio -``` - - diff --git a/fio/sample.fio b/fio/sample.fio deleted file mode 100644 index 2136b86..0000000 --- a/fio/sample.fio +++ /dev/null @@ -1,14 +0,0 @@ -[global] -ioengine=sync # 同步 I/O -direct=0 # 使用内核页缓存,测试系统调用性能 -bs=128k # 块大小 -size=1G # 每个文件大小 -numjobs=2 # 并发线程数 -runtime=60 # 测试运行时间,秒 -time_based=1 -rw=randrw # 随机读写 -rwmixread=70 # 70% 读,30% 写 -group_reporting=1 # 汇总报告 - -[test_syscall] -filename=/tmp/fio_sample_testfile \ No newline at end of file diff --git a/fio/zvfs.fio b/fio/zvfs.fio deleted file mode 100644 index 40f2ef8..0000000 --- a/fio/zvfs.fio +++ /dev/null @@ -1,14 +0,0 @@ -[global] -ioengine=sync -direct=0 -bs=128k -size=1G -numjobs=2 -runtime=60 -time_based=1 -rw=randrw -rwmixread=70 -group_reporting=1 - -[test_hook] -filename=/zvfs/fio_zvfs_testfile \ No newline at end of file diff --git a/plan/codexplan.md b/plan/codexplan.md deleted file mode 100644 index 2c94c5d..0000000 --- a/plan/codexplan.md +++ /dev/null @@ -1,194 +0,0 @@ -# ZVFS 高性能框架设计(修订版) - -## 0. 
当前实现进展(2026-03-03) - -- 已落地: - - stale blob 自愈(open/create/unlink/rename 路径) - - hook 层小写合并(per-fd writeback buffer,默认 128KB)+ 关键系统调用前 flush -- 仍待重点优化: - - 小块读路径仍是“同步提交 + 单请求往返 + 拷贝返回”,延迟和吞吐偏弱 - -## 1. 现状代码中的关键问题(先于方案) - -基于 `zvfs.c`、`zvfs.h`、`zvfs_hook.c`,当前主要瓶颈和风险如下: - -1. **单全局执行上下文串行化** - - 所有 IO 都通过 `global_thread` + `waiter()` 同步等待,天然把多线程请求串到一个 SPDK thread。 - - `zvfs_t` 里只有一个 `channel`,读写都走这一个 channel,无法利用多核并行。 - -2. **等待模型是忙轮询,CPU 成本高** - - `waiter()` 用紧循环 `spdk_thread_poll()`,没有阻塞等待/退避策略。 - - 在高并发小 IO 下,系统容易进入“高 CPU + 低有效 QD”。 - -3. **全局元数据无并发保护** - - `dirents/fd_table/g_dirs/g_dirfd_table/open_count/file_size` 读写没有统一锁。 - - hook 层是多线程入口,当前实现有明显竞态和可见性问题。 - -4. **持久化与语义不完整** - - `fsync/fdatasync/sync_file_range` 对 zvfs fd 基本直接返回 0,和数据库预期不一致。 - - `meta_load()` 只读固定 4KB 文本,规模稍大就截断;`meta_save()` 也无崩溃一致性保证。 - -5. **数据路径的放大和额外开销** - - 小块随机写依赖 read-modify-write;无写回缓存、无批量提交、无 IO 合并。 - - per-file `dma_buf` 增长时可能反复 realloc,缺少池化和复用策略。 - -6. **可扩展性不足** - - `dirent_find/fd_alloc` 等是线性扫描。 - - 元数据、目录结构、fd 分配都偏“单点共享结构”,随着文件数/线程数增长会抖动。 - ---- - -## 2. 对 userplan.md 的补全与修正 - -`plan/userplan.md` 的方向(TLS + per-thread channel + 缩小全局锁)是正确的,但有几个需要补全的点: - -1. **“每个 pthread 一个 spdk_thread”要可配置** - - 对 MySQL 这类线程数可能很大的进程,严格 1:1 会导致线程对象和 channel 爆炸。 - - 建议改为:默认“线程绑定 worker 池(N:M)”,支持配置成 1:1 调试模式。 - -2. **需要明确“文件句柄跨线程访问”的所有权规则** - - 同一 fd 可能被不同 pthread 使用,必须定义 offset、cache、flush 的同步策略。 - -3. **batch poll 需要配套“提交队列 + 背压 + 超时”** - - 仅有 `pending_queue` 不够,必须定义入队失败/队列满/超时处理。 - -4. **必须补上 fsync/fdatasync 的严格语义** - - 尤其面向数据库:fsync 成功后应保证数据页 + 必要元数据已持久化。 - -5. **元数据持久化需要从“文本快照”升级为“日志+检查点”** - - 否则崩溃恢复和规模都不可靠。 - ---- - -## 3. 
新框架设计(面向高性能与可重入改造) - -### 3.1 分层与职责 - -- **Control Plane(全局)** - - 管理 mount/unmount、命名空间、inode 元数据、fd 表、恢复日志。 - - 低频操作(open/create/unlink/rename/mkdir/rmdir)在此层处理。 - -- **Data Plane(worker)** - - 处理 read/pread/write/pwrite/fsync 的数据 IO。 - - 每个 worker 持有:`spdk_thread + io_channel + submission_queue + completion_queue`。 - -- **Persistence Plane(元数据持久化)** - - 元数据 WAL(append-only)+ 周期 checkpoint。 - - 保障崩溃恢复和 fsync 语义。 - -### 3.2 全局运行时结构 - -```c -typedef struct { - // init/mount 生命周期 - pthread_once_t init_once; - pthread_mutex_t mount_mu; - _Atomic int mount_state; // UNINIT/INITING/READY/FAILED/STOPPING - - // core spdk objects - struct spdk_blob_store *bs; - struct spdk_bs_dev *bs_dev; - - // metadata indexes - pthread_rwlock_t inode_rwlock; - inode_table_t *inode_by_path; // hash map: path -> inode - inode_table_t *inode_by_blobid; // hash map: blobid -> inode - - pthread_rwlock_t fd_rwlock; - fd_table_t *fd_table; // pseudo fd -> file handle - - // durability - meta_journal_t *journal; // WAL + checkpoint - - // worker routing - worker_pool_t *workers; // configurable N workers -} zvfs_runtime_t; -``` - -### 3.3 worker 模型(建议默认 N:M,可切 1:1) - -- 默认:`worker_count = min(online_cpu, ZVFS_IO_WORKERS)`。 -- 线程第一次进入时做 TLS 绑定:`pthread_id -> worker_id`(固定绑定,减少迁移)。 -- 每个 worker 独占一个 io_channel,避免全局 channel 争用。 -- 等待机制:优先 `eventfd/futex + poll` 混合,避免纯忙轮询。 - -> 说明:若用户确认线程数有限,可配置 `ZVFS_WORKER_MODE=THREAD_LOCAL` 切 1:1,以追求极致低延迟。 - -### 3.4 元数据模型 - -- `inode`(文件级共享对象) - - `blob_id, logical_size, allocated_clusters, link/open_ref, flags` - - 每 inode 一把细粒度锁(mutex/spin + 原子字段)。 -- `file handle`(open 实例) - - `inode*`, `flags`, `current_offset`, `handle-local state`。 -- 路径索引与 blob 索引用哈希表替代线性数组。 -- 目录树从 `g_dirs[]` 升级为前缀树或 hash+parent 索引,避免全表扫描。 - -### 3.5 IO 路径设计 - -#### Read/Pread -- 快路径:命中页缓存(clean/dirty)直接拷贝。 -- 慢路径:提交到绑定 worker。 -- 对齐大读支持直接 DMA 到用户对齐缓冲(满足约束时)。 - -#### Write/Pwrite -- 小块随机写:写入 per-inode 页缓存(4KB 粒度),标记 dirty。 -- 大块或顺序写:绕过缓存直写(或写穿策略),减少二次拷贝。 -- 扩容策略:按 chunk 预分配(例如 
1~8MB)减少 `resize + sync_md` 频率。 -- flush 策略: - - 后台刷脏(阈值/时间) - - 前台 fsync 强制刷 - - 合并连续页为 writev/batch IO - -### 3.6 fsync/fdatasync 语义(数据库场景) - -- `fdatasync(fd)`: - 1) 刷新该 fd 对应 inode 的脏数据页; - 2) 若发生扩容,确保 blob 元数据同步完成; - 3) 返回前确认提交完成。 -- `fsync(fd)`: - - 在 `fdatasync` 基础上,额外保证需要的命名空间/元数据日志落盘(如 size、rename 可见性)。 - -### 3.7 崩溃一致性与恢复 - -- `meta_journal.log`(append-only,带 magic/version/CRC/seq)。 -- 操作记录:`CREATE/UNLINK/RENAME/TRUNCATE/SIZE_UPDATE/ALLOC_UPDATE`。 -- 启动恢复:`checkpoint -> replay WAL`。 -- 周期 checkpoint(按时间或日志大小触发),避免恢复时间无限增长。 - -### 3.8 锁策略与死锁规约 - -- 固定锁顺序:`fd_table lock -> inode lock -> journal lock`。 -- IO 快路径不拿全局写锁。 -- 元数据读多写少:读写锁 + inode 细粒度锁组合。 - -### 3.9 可观测与调优 - -- 统计项(至少): - - read/write IOPS、带宽、P50/P99 延迟 - - cache hit ratio、dirty page 数 - - flush 次数、merge 比例、resize 次数 - - queue depth、排队延迟 -- debug 开关: - - `ZVFS_TRACE_IO=1` - - `ZVFS_TRACE_META=1` - - `ZVFS_WORKER_MODE`, `ZVFS_IO_WORKERS` - ---- - -## 4. 关键行为约束(必须保持) - -1. POSIX 语义不回退:`openat/rename/unlink/ftruncate/fstat/fsync` 的错误码与行为保持一致或更严格。 -2. 在无 root 环境下可跑功能测试(至少支持 Malloc bdev 或已有可用 SPDK 配置)。 -3. 旧接口兼容:外部仍通过 `LD_PRELOAD=.../libzvfs.so` 使用。 -4. 改造过程可分阶段落地,任何阶段都可独立编译、回归、继续下一阶段。 - ---- - -## 5. 性能目标(建议) - -- 与当前实现相比: - - 多线程随机写 IOPS 提升 >= 2x(4~16 线程场景) - - P99 延迟下降 >= 30% - - CPU busy-poll 占比显著下降(可通过 perf/top 观测) -- `test_single_file_perf`、`test_single_file_random_perf` 在同配置下持续稳定,无明显长尾抖动。 diff --git a/plan/plan.md b/plan/plan.md deleted file mode 100644 index 3789e7a..0000000 --- a/plan/plan.md +++ /dev/null @@ -1,216 +0,0 @@ -# ZVFS 分阶段改造计划(可重入,用户验收版) - -> 目标:把当前实现改造成可并发扩展、高性能且语义完整的架构。 -> 约束:我无法使用 root,所有阶段验收由你执行。 - -## 通用约定(所有阶段) - -- 建议先记录基线:`git rev-parse --short HEAD`。 -- 每阶段都保持“可编译 + 可回归”。 -- 每阶段完成后打一个里程碑 tag(例如 `phase1-done`),中断后可从最近 tag 继续。 -- 验收命令默认: - -```bash -make -C zvfs -j4 -make -C test -j4 -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs -``` - -## 已落地变更(2026-03-03) - -1. 
**stale blob 自愈修复(已完成)** - - `open(O_CREAT)` 遇到元数据引用失效 blob 时自动重建并回写元数据。 - - `unlink/close(rename 覆盖)` 删除失效 blob 时容忍 `ENOENT/EINVAL`,避免误报 `EIO`。 - -2. **小块写合并(已完成)** - - hook 层新增 per-fd writeback buffer(默认 128KB),连续小写先合并再 `pwrite`。 - - 在 `read/pread/lseek/fsync/fdatasync/close/ftruncate/fallocate/unlink/rename/sync_file_range` 前补齐 flush,保证可见性。 - -3. **当前观察** - - 小块写已提升,但小块读仍偏低;读优化作为后续阶段重点。 - ---- - -## Phase 0:基线与护栏 - -### 要做的事情 -1. 固化当前行为基线:功能、性能、CPU 占用。 -2. 在代码中加入轻量统计框架(计数器/延迟桶/开关),不改变行为。 -3. 增加最小并发回归入口(并行跑现有测试)。 - -### 用户验收 -```bash -make -C zvfs -j4 && make -C test -j4 -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_perf /zvfs -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_perf /zvfs -``` - -### 通过标准 -- 功能测试通过。 -- 有一份可复用的“基线性能记录”(IOPS/BW/延迟)。 - -### 可重入说明 -- 仅增量加观测代码,可重复执行,不影响后续阶段。 - ---- - -## Phase 1:全局运行时与并发安全 - -### 要做的事情 -1. 引入 `zvfs_runtime_t`,统一管理 mount/init 状态与全局资源。 -2. 用 `pthread_once + mount mutex` 保护初始化/挂载过程。 -3. 给 inode/path/fd/dirs 操作补齐锁(rwlock + 细粒度 mutex)。 -4. 保持接口不变:`open/read/write/...` 行为兼容。 - -### 用户验收 -```bash -make -C zvfs -j4 && make -C test -j4 -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs -for i in $(seq 1 8); do - env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_dual_open_same_file /zvfs & -done -wait -``` - -### 通过标准 -- 无崩溃/死锁。 -- 并发场景不出现随机 EBADF/ENOENT/元数据错乱。 - -### 可重入说明 -- 锁与 runtime 框架可独立提交;若中断,重新进入本阶段不会破坏状态。 - ---- - -## Phase 2:Worker 化 IO 通路(替换单 global_thread) - -### 要做的事情 -1. 实现 worker 池(默认 N:M,支持配置 1:1)。 -2. 每 worker 持有独立 `spdk_thread + io_channel`。 -3. read/write/pread/pwrite 路径改为“提交到绑定 worker 执行”。 -4. 
保留同步 POSIX 语义,但去掉全局单线程瓶颈。 - -### 用户验收 -```bash -make -C zvfs -j4 && make -C test -j4 -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs -for i in $(seq 1 4); do - env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_perf /zvfs & -done -wait -``` - -### 通过标准 -- 功能与 Phase 1 一致。 -- 并发压测吞吐明显高于基线(目标 >= 1.5x,先达成趋势)。 - -### 可重入说明 -- worker 与旧路径可通过编译开关共存,出现问题可快速切回旧路径继续调试。 - ---- - -## Phase 3:完成等待机制与批处理 - -### 要做的事情 -1. 用“提交队列 + 完成通知”替换纯 busy-poll `waiter`。 -2. 增加批量 poll 与背压(队列满、超时、错误传播)。 -3. 补齐延迟与队列深度指标,定位长尾。 -4. 引入读路径流水线(允许并发 in-flight read),把有效 QD 从 1 提升到可配置值。 - -### 用户验收 -```bash -make -C zvfs -j4 && make -C test -j4 -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_perf /zvfs -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_perf /zvfs -``` - -### 通过标准 -- 在同等负载下 CPU 空转显著下降。 -- P99 延迟较 Phase 2 收敛(无明显长尾恶化)。 - -### 可重入说明 -- 队列与等待层可单独演进;可先只替换 read,再替换 write。 - ---- - -## Phase 4:页缓存与写回合并 - -### 要做的事情 -1. 引入 per-inode 4KB 页缓存(dirty/clean 状态)。 -2. 小写走 cache + 延迟刷盘,大写/顺序写支持直写或批量写。 -3. 引入 flush 策略:阈值、定时、fsync 强制。 -4. 缩减 `resize + sync_md` 频率(chunk 预分配)。 -5. 读性能专项: - - 增加顺序读 readahead(如 128KB~1MB 窗口自适应)。 - - 对齐读支持“直接读到用户缓冲”快路径,减少一次 memcpy。 - - 引入 clean page cache(读热点复用,避免重复 blob read)。 - -### 用户验收 -```bash -make -C zvfs -j4 && make -C test -j4 -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_noaligned_perf /zvfs -``` - -### 通过标准 -- 功能语义不回退(truncate/sparse/rename/fstat 通过)。 -- 小块随机写吞吐继续提升,写放大降低。 - -### 可重入说明 -- cache 可先只支持 write-through,再切 write-back;两步都可单独验收。 - ---- - -## Phase 5:元数据日志化与 fsync 语义闭环 - -### 要做的事情 -1. `meta_save/load` 从文本快照升级为 WAL + checkpoint(带 CRC/版本)。 -2. 明确并实现 `fdatasync/fsync` 语义: - - fdatasync 保证数据持久化; - - fsync 额外保证必要元数据持久化。 -3. 
补齐崩溃恢复流程(checkpoint + replay)。 - -### 用户验收 -```bash -make -C zvfs -j4 && make -C test -j4 -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs -# 建议补充一次“异常退出后重启读取”的恢复验证(手工执行) -``` - -### 通过标准 -- 重启后目录项与文件大小不丢失、不错乱。 -- 数据库关键路径(fsync/fdatasync)语义满足预期。 - -### 可重入说明 -- WAL 与 checkpoint 支持并存迁移;可先双写验证,再切主读路径。 - ---- - -## Phase 6:性能收敛与上线门槛 - -### 要做的事情 -1. 清理临时开关,保留必要调优参数。 -2. 整理性能报告(与 Phase 0 基线对比)。 -3. 做最终回归矩阵(功能 + 并发 + 性能 + 恢复)。 - -### 用户验收 -```bash -make -C zvfs -j4 && make -C test -j4 -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so make -C test run-test -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_perf /zvfs -env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_perf /zvfs -``` - -### 通过标准 -- 全量功能测试通过。 -- 多线程性能达到 `codexplan.md` 目标(或给出量化偏差与原因)。 - -### 可重入说明 -- 本阶段仅收敛与验收,不引入架构性变更;可反复执行直到指标稳定。 - ---- - -## 附:root 权限与运行建议 - -- 若 NVMe/SPDK 环境需要 root,请在你本机按现有流程执行验收。 -- 若希望无 root 回归,建议补一个 `Malloc` bdev 的 JSON 配置,并将 bdev 名改为可配置(环境变量优先)。 diff --git a/plan/userplan.md b/plan/userplan.md deleted file mode 100644 index a5e567c..0000000 --- a/plan/userplan.md +++ /dev/null @@ -1,108 +0,0 @@ -### 架构目标 -- 通过 LD_PRELOAD hook POSIX 文件操作(open/read/write/pread/pwrite/close/fsync 等),将 MySQL 的数据文件 IO 重定向到 SPDK Blobstore。 -- 最大化性能:绕过内核、利用多核并发、低延迟、小块写合并。 -- 核心原则:**每个 pthread 拥有独立的 SPDK 执行上下文**,全局共享底层存储资源。 - -### 全局资源(进程级别,唯一一份) -- `zvfs_t *g_fs`:文件系统实例,包含: - - `struct spdk_blob_store *bs`:全局 Blobstore(通过 spdk_bs_load/init 创建)。 - - bdev(Nvme0n1 或 Malloc0,通过 JSON 配置加载)。 - - 全局元数据:dirents 数组(zvfs_dirent_t *[])、fd_table(zvfs_file_t *[])、openfd_count。 - - 保护全局元数据的锁:pthread_rwlock_t g_meta_lock(读多写少场景)。 -- 全局初始化标志:`bool g_mounted`、`bool g_env_inited`。 -- pthread_key_t 用于线程本地存储:`g_thread_local_key`(带 destructor)。 - -### 线程本地资源(每个 pthread 独占一份,通过 TLS 实现) -每个 pthread 拥有以下私有状态,存储在结构体 `thread_local_zvfs_t` 中: - -```c -typedef struct { - struct 
spdk_thread *thread; // 本线程专属的 SPDK thread - struct spdk_io_channel *channel; // 本线程专属的 IO channel(绑定到 g_fs->bs) - TAILQ_HEAD(, io_ctx) pending_queue; // 本线程的 pending IO 队列,用于 batch poll - // 可选扩展: - // struct dma_buf_pool *dma_pool; // per-thread DMA buf 复用池 - // struct page_cache *local_cache; // 如果需要 per-thread cache -} thread_local_zvfs_t; -``` - -- **创建时机**:lazy(第一次 IO 时调用 `get_thread_local()`)。 -- **存储方式**:通过 `pthread_setspecific(g_thread_local_key, tl)` 绑定到当前 pthread。 -- **销毁时机**:pthread 退出时,TLS destructor 自动调用: - - spdk_bs_free_io_channel(channel) - - spdk_thread_exit + poll until exited + spdk_thread_destroy - -### 核心函数:get_thread_local() -```c -thread_local_zvfs_t *get_thread_local(void) { - // 确保 key 已创建(只执行一次) - pthread_once(&g_key_once, init_thread_key); - - thread_local_zvfs_t *tl = pthread_getspecific(g_thread_local_key); - if (tl == NULL) { - tl = calloc(1, sizeof(*tl)); - tl->thread = spdk_thread_create("zvfs_worker", NULL); - tl->channel = spdk_bs_alloc_io_channel(g_fs->bs); - TAILQ_INIT(&tl->pending_queue); - pthread_setspecific(g_thread_local_key, tl); - } - return tl; -} -``` - -### 工作流程(每个 pthread 独立执行) -1. **线程首次进入 IO 操作** - - 调用 `get_thread_local()` → 创建并绑定 thread + channel。 - - 如果 !g_mounted → 调用 zvfs_ensure_mounted()(使用当前 thread 进行 poll 完成 mount)。 - -2. **元数据操作(open/unlink/mkdir/rmdir/rename 等)** - - 加读锁(g_meta_lock)检查/修改全局 dirents、dirs、fd_table。 - - 创建/查找 zvfs_file_t,调用 zvfs_create/zvfs_open(使用当前 thread 同步等待)。 - - 分配伪 fd,记录到全局 fd_table。 - - 释放锁。 - -3. **读操作(read/pread)** - - 获取当前 tl = get_thread_local()。 - - spdk_set_thread(tl->thread)。 - - 如果小读 + cache hit → 直接 memcpy 返回。 - - 否则:创建 io_ctx,加入 tl->pending_queue。 - - 调用 spdk_blob_io_read(..., tl->channel, ...)。 - - 执行 batch_poll(tl, my_ctx): - - while (!my_ctx->done) spdk_thread_poll(tl->thread, 0, 0); - - 从 dma_buf 拷贝到用户 buf。 - -4. 
**写操作(write/pwrite)** - - 获取 tl。 - - spdk_set_thread(tl->thread)。 - - 如果小写 → patch per-file page cache(dirty),标记 dirty,返回(延迟写)。 - - 如果 cache 满或大写 → flush dirty pages(batch spdk_blob_io_writev,用 tl->channel)。 - - 创建 io_ctx → 加入 pending_queue → submit write → batch_poll。 - -5. **fsync** - - flush per-file dirty cache(batch writev + spdk_blob_sync_md)。 - - 使用当前 tl->thread poll 等待完成。 - -6. **close** - - fsync(flush cache)。 - - zvfs_close(用当前 tl->thread 同步)。 - - 释放 fd(加锁更新全局 fd_table)。 - -### 性能关键机制 -- **独立 poll**:每个 pthread 用自己的 spdk_thread 独立 poll,无跨线程消息。 -- **batch poll**:一个 poll 循环可完成多个 pending IO,提升有效 QD。 -- **page cache**:per-file 4K dirty pages(hashmap),合并小写,减少 write amplification。 -- **channel per-thread**:避免全局 channel 争用,每个线程独立提交 IO。 -- **最小全局锁**:只在元数据修改时短时加锁(rwlock),IO 操作无锁。 - -### 资源所有权总结表 - -| 资源类型 | 所有权 | 数量 | 创建时机 | 销毁时机 | -|----------------------|--------------|------------|------------------------|------------------------------| -| bdev | 全局 | 1 | zvfs_ensure_mounted | zvfs_umount | -| blobstore (bs) | 全局 | 1 | zvfs_ensure_mounted | zvfs_umount | -| zvfs_t / g_fs | 全局 | 1 | zvfs_ensure_mounted | zvfs_umount + free | -| dirents / fd_table | 全局 | 1 | meta_load | zvfs_umount + free | -| spdk_thread | per-pthread | = pthread 数 | 首次 get_thread_local | pthread 退出(destructor) | -| io_channel | per-pthread | = pthread 数 | 首次 get_thread_local | pthread 退出(destructor) | -| pending_queue | per-pthread | = pthread 数 | 首次 get_thread_local | pthread 退出 | -| page cache | per-file | per open fd| open 时 lazy | close 时 flush + free | diff --git a/scripts/run_db_bench.sh b/scripts/run_db_bench.sh new file mode 100755 index 0000000..33ef972 --- /dev/null +++ b/scripts/run_db_bench.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ========================= +# Manual Config (edit here) +# ========================= +# 可执行文件路径 +DB_BENCH_BIN="/home/lian/env/rocksdb-test/db_bench" +# RocksDB 数据目录 +DB_PATH="/tmp/rocksdb_manual" + +# 测试类型 sets: +# - "fillseq" +# - 
"fillrandom" +# - "readseq" +# - "readrandom" +# - "overwrite" +# - "fillrandom,readrandom" +BENCHMARKS="fillrandom,readrandom" + +# key数 +NUM=1000000 +# 线程数 +THREADS=1 +# 随机种子 +SEED=1 + +# key大小 +KEY_SIZE=16 +# value大小 +VALUE_SIZE=400 +# SST block大小 +BLOCK_SIZE=4096 + +# block cache 大小 +CACHE_SIZE=$((512 * 1024 * 1024)) # bytes +# memtable 大小 +WRITE_BUFFER_SIZE=$((64 * 1024 * 1024)) # bytes +# memtable 个数 +MAX_WRITE_BUFFER_NUMBER=4 +# L1文件目标大小 +TARGET_FILE_SIZE_BASE=$((64 * 1024 * 1024)) # bytes +# 可打开文件数 +OPEN_FILES=-1 + +# 后台 并行度 +MAX_BACKGROUND_JOBS=4 +# 压缩算法 +COMPRESSION_TYPE="none" +# 开启WAL +DISABLE_WAL=1 +SYNC=0 + +# direct I/O +USE_DIRECT_READS=0 +USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION=0 + +# mmap I/O +USE_MMAP_READS=0 +USE_MMAP_WRITES=0 + +# 统计 +STATISTICS=0 +# 统计打印 +STATS_INTERVAL_SECONDS=5 +# 直方图 +HISTOGRAM=0 + +# ========================= +# Run +# ========================= +if [[ ! -x "$DB_BENCH_BIN" ]]; then + echo "db_bench not found or not executable: $DB_BENCH_BIN" >&2 + exit 1 +fi + +echo "== db_bench manual run ==" +echo "DB_BENCH_BIN=$DB_BENCH_BIN" +echo "DB_PATH=$DB_PATH" +echo "BENCHMARKS=$BENCHMARKS" +echo "NUM=$NUM THREADS=$THREADS" +echo "KEY_SIZE=$KEY_SIZE VALUE_SIZE=$VALUE_SIZE BLOCK_SIZE=$BLOCK_SIZE" +echo "CACHE_SIZE=$CACHE_SIZE WRITE_BUFFER_SIZE=$WRITE_BUFFER_SIZE" +echo "MAX_WRITE_BUFFER_NUMBER=$MAX_WRITE_BUFFER_NUMBER TARGET_FILE_SIZE_BASE=$TARGET_FILE_SIZE_BASE" +echo "OPEN_FILES=$OPEN_FILES MAX_BACKGROUND_JOBS=$MAX_BACKGROUND_JOBS" +echo "COMPRESSION_TYPE=$COMPRESSION_TYPE DISABLE_WAL=$DISABLE_WAL SYNC=$SYNC" +echo "USE_DIRECT_READS=$USE_DIRECT_READS USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION=$USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION" +echo "USE_MMAP_READS=$USE_MMAP_READS USE_MMAP_WRITES=$USE_MMAP_WRITES" +echo "STATISTICS=$STATISTICS STATS_INTERVAL_SECONDS=$STATS_INTERVAL_SECONDS HISTOGRAM=$HISTOGRAM" +echo + +exec "$DB_BENCH_BIN" \ + --db="$DB_PATH" \ + --benchmarks="$BENCHMARKS" \ + --num="$NUM" \ + --threads="$THREADS" \ + 
--seed="$SEED" \ + --key_size="$KEY_SIZE" \ + --value_size="$VALUE_SIZE" \ + --block_size="$BLOCK_SIZE" \ + --cache_size="$CACHE_SIZE" \ + --write_buffer_size="$WRITE_BUFFER_SIZE" \ + --max_write_buffer_number="$MAX_WRITE_BUFFER_NUMBER" \ + --target_file_size_base="$TARGET_FILE_SIZE_BASE" \ + --open_files="$OPEN_FILES" \ + --max_background_jobs="$MAX_BACKGROUND_JOBS" \ + --compression_type="$COMPRESSION_TYPE" \ + --disable_wal="$DISABLE_WAL" \ + --sync="$SYNC" \ + --use_direct_reads="$USE_DIRECT_READS" \ + --use_direct_io_for_flush_and_compaction="$USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION" \ + --mmap_read="$USE_MMAP_READS" \ + --mmap_write="$USE_MMAP_WRITES" \ + --statistics="$STATISTICS" \ + --stats_interval_seconds="$STATS_INTERVAL_SECONDS" \ + --histogram="$HISTOGRAM" diff --git a/scripts/run_db_bench_zvfs.sh b/scripts/run_db_bench_zvfs.sh new file mode 100755 index 0000000..5522f3a --- /dev/null +++ b/scripts/run_db_bench_zvfs.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ========================= +# Manual Config (edit here) +# ========================= +# 可执行文件路径 +DB_BENCH_BIN="/home/lian/env/rocksdb-test/db_bench" +# RocksDB 数据目录 +DB_PATH="/zvfs/rocksdb_manual" + +# 测试类型 sets: +# - "fillseq" +# - "fillrandom" +# - "readseq" +# - "readrandom" +# - "overwrite" +# - "fillrandom,readrandom" +BENCHMARKS="fillrandom,readrandom" + +# key数 +NUM=1000000 +# 线程数 +THREADS=1 +# 随机种子 +SEED=1 + +# key大小 +KEY_SIZE=16 +# value大小 +VALUE_SIZE=400 +# SST block大小 +BLOCK_SIZE=4096 + +# block cache 大小 +CACHE_SIZE=$((512 * 1024 * 1024)) # bytes +# memtable 大小 +WRITE_BUFFER_SIZE=$((64 * 1024 * 1024)) # bytes +# memtable 个数 +MAX_WRITE_BUFFER_NUMBER=4 +# L1文件目标大小 +TARGET_FILE_SIZE_BASE=$((64 * 1024 * 1024)) # bytes +# 可打开文件数 +OPEN_FILES=-1 + +# 后台 并行度 +MAX_BACKGROUND_JOBS=4 +# 压缩算法 +COMPRESSION_TYPE="none" +# 开启WAL +DISABLE_WAL=1 +SYNC=0 + +# direct I/O +USE_DIRECT_READS=0 +USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION=0 + +# mmap I/O +USE_MMAP_READS=0 +USE_MMAP_WRITES=0 + 
+# 统计 +STATISTICS=0 +# 统计打印 +STATS_INTERVAL_SECONDS=5 +# 直方图 +HISTOGRAM=0 + +# ========================= +# Run +# ========================= +if [[ ! -x "$DB_BENCH_BIN" ]]; then + echo "db_bench not found or not executable: $DB_BENCH_BIN" >&2 + exit 1 +fi + +echo "== db_bench manual run ==" +echo "DB_BENCH_BIN=$DB_BENCH_BIN" +echo "DB_PATH=$DB_PATH" +echo "BENCHMARKS=$BENCHMARKS" +echo "NUM=$NUM THREADS=$THREADS" +echo "KEY_SIZE=$KEY_SIZE VALUE_SIZE=$VALUE_SIZE BLOCK_SIZE=$BLOCK_SIZE" +echo "CACHE_SIZE=$CACHE_SIZE WRITE_BUFFER_SIZE=$WRITE_BUFFER_SIZE" +echo "MAX_WRITE_BUFFER_NUMBER=$MAX_WRITE_BUFFER_NUMBER TARGET_FILE_SIZE_BASE=$TARGET_FILE_SIZE_BASE" +echo "OPEN_FILES=$OPEN_FILES MAX_BACKGROUND_JOBS=$MAX_BACKGROUND_JOBS" +echo "COMPRESSION_TYPE=$COMPRESSION_TYPE DISABLE_WAL=$DISABLE_WAL SYNC=$SYNC" +echo "USE_DIRECT_READS=$USE_DIRECT_READS USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION=$USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION" +echo "USE_MMAP_READS=$USE_MMAP_READS USE_MMAP_WRITES=$USE_MMAP_WRITES" +echo "STATISTICS=$STATISTICS STATS_INTERVAL_SECONDS=$STATS_INTERVAL_SECONDS HISTOGRAM=$HISTOGRAM" +echo + +exec "$DB_BENCH_BIN" \ + --db="$DB_PATH" \ + --benchmarks="$BENCHMARKS" \ + --num="$NUM" \ + --threads="$THREADS" \ + --seed="$SEED" \ + --key_size="$KEY_SIZE" \ + --value_size="$VALUE_SIZE" \ + --block_size="$BLOCK_SIZE" \ + --cache_size="$CACHE_SIZE" \ + --write_buffer_size="$WRITE_BUFFER_SIZE" \ + --max_write_buffer_number="$MAX_WRITE_BUFFER_NUMBER" \ + --target_file_size_base="$TARGET_FILE_SIZE_BASE" \ + --open_files="$OPEN_FILES" \ + --max_background_jobs="$MAX_BACKGROUND_JOBS" \ + --compression_type="$COMPRESSION_TYPE" \ + --disable_wal="$DISABLE_WAL" \ + --sync="$SYNC" \ + --use_direct_reads="$USE_DIRECT_READS" \ + --use_direct_io_for_flush_and_compaction="$USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION" \ + --mmap_read="$USE_MMAP_READS" \ + --mmap_write="$USE_MMAP_WRITES" \ + --statistics="$STATISTICS" \ + --stats_interval_seconds="$STATS_INTERVAL_SECONDS" \ + 
--histogram="$HISTOGRAM" diff --git a/scripts/run_test_hook_api.sh b/scripts/run_test_hook_api.sh new file mode 100644 index 0000000..4751c99 --- /dev/null +++ b/scripts/run_test_hook_api.sh @@ -0,0 +1 @@ +LD_PRELOAD=/home/lian/try/zvfs/src/libzvfs.so ZVFS_TEST_ROOT=/zvfs /home/lian/try/zvfs/tests/bin/hook_api_test \ No newline at end of file diff --git a/zvfs/Makefile b/src/Makefile similarity index 63% rename from zvfs/Makefile rename to src/Makefile index 986e76a..fb9af0c 100755 --- a/zvfs/Makefile +++ b/src/Makefile @@ -10,11 +10,29 @@ include $(SPDK_ROOT_DIR)/mk/spdk.app_vars.mk LIBZVFS := libzvfs.so -C_SRCS := zvfs.c zvfs_hook.c +C_SRCS := \ + common/utils.c \ + spdk_engine/io_engine.c \ + fs/zvfs.c \ + fs/zvfs_inode.c \ + fs/zvfs_path_entry.c \ + fs/zvfs_open_file.c \ + fs/zvfs_sys_init.c \ + hook/zvfs_hook_init.c \ + hook/zvfs_hook_fd.c \ + hook/zvfs_hook_rw.c \ + hook/zvfs_hook_seek.c \ + hook/zvfs_hook_stat.c \ + hook/zvfs_hook_sync.c \ + hook/zvfs_hook_fcntl.c \ + hook/zvfs_hook_dir.c \ + hook/zvfs_hook_mmap.c \ + SPDK_LIB_LIST = $(ALL_MODULES_LIST) event event_bdev LIBS += $(SPDK_LIB_LINKER_ARGS) +CFLAGS += -I$(abspath $(CURDIR)) LDFLAGS += -shared -rdynamic -Wl,-z,nodelete -Wl,--disable-new-dtags \ -Wl,-rpath,$(SPDK_ROOT_DIR)/build/lib \ -Wl,-rpath,$(SPDK_ROOT_DIR)/dpdk/build/lib diff --git a/src/common/uthash.h b/src/common/uthash.h new file mode 100644 index 0000000..07ae09e --- /dev/null +++ b/src/common/uthash.h @@ -0,0 +1,1137 @@ +/* +Copyright (c) 2003-2025, Troy D. Hanson https://troydhanson.github.io/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef UTHASH_H +#define UTHASH_H + +#define UTHASH_VERSION 2.3.0 + +#include <string.h> /* memcmp, memset, strlen */ +#include <stddef.h> /* ptrdiff_t */ +#include <stdlib.h> /* exit */ + +#if defined(HASH_NO_STDINT) && HASH_NO_STDINT +/* The user doesn't have <stdint.h>, and must figure out their own way + to provide definitions for uint8_t and uint32_t. */ +#else +#include <stdint.h> /* uint8_t, uint32_t */ +#endif + +/* These macros use decltype or the earlier __typeof GNU extension. + As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + when compiling c++ source) this code uses whatever method is needed + or, for VS2008 where neither is available, uses casting workarounds. 
*/ +#if !defined(DECLTYPE) && !defined(NO_DECLTYPE) +#if defined(_MSC_VER) /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define DECLTYPE(x) (decltype(x)) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#endif +#elif defined(__MCST__) /* Elbrus C Compiler */ +#define DECLTYPE(x) (__typeof(x)) +#elif defined(__BORLANDC__) || defined(__ICCARM__) || defined(__LCC__) || defined(__WATCOMC__) +#define NO_DECLTYPE +#else /* GNU, Sun and other compilers */ +#define DECLTYPE(x) (__typeof(x)) +#endif +#endif + +#ifdef NO_DECLTYPE +#define DECLTYPE(x) +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + char **_da_dst = (char**)(&(dst)); \ + *_da_dst = (char*)(src); \ +} while (0) +#else +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + (dst) = DECLTYPE(dst)(src); \ +} while (0) +#endif + +#ifndef uthash_malloc +#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ +#endif +#ifndef uthash_free +#define uthash_free(ptr,sz) free(ptr) /* free fcn */ +#endif +#ifndef uthash_bzero +#define uthash_bzero(a,n) memset(a,'\0',n) +#endif +#ifndef uthash_strlen +#define uthash_strlen(s) strlen(s) +#endif + +#ifndef HASH_FUNCTION +#define HASH_FUNCTION(keyptr,keylen,hashv) HASH_JEN(keyptr, keylen, hashv) +#endif + +#ifndef HASH_KEYCMP +#define HASH_KEYCMP(a,b,n) memcmp(a,b,n) +#endif + +#ifndef uthash_noexpand_fyi +#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ +#endif +#ifndef uthash_expand_fyi +#define uthash_expand_fyi(tbl) /* can be defined to log expands */ +#endif + +#ifndef HASH_NONFATAL_OOM +#define HASH_NONFATAL_OOM 0 +#endif + +#if HASH_NONFATAL_OOM +/* malloc failures can be recovered from */ + +#ifndef uthash_nonfatal_oom +#define uthash_nonfatal_oom(obj) do {} while (0) /* non-fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) do { (oomed) = 1; } while (0) +#define IF_HASH_NONFATAL_OOM(x) x + +#else +/* malloc failures result in lost memory, hash tables are unusable */ + 
+#ifndef uthash_fatal +#define uthash_fatal(msg) exit(-1) /* fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) uthash_fatal("out of memory") +#define IF_HASH_NONFATAL_OOM(x) + +#endif + +/* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS 32U /* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS_LOG2 5U /* lg2 of initial number of buckets */ +#define HASH_BKT_CAPACITY_THRESH 10U /* expand when bucket count reaches */ + +/* calculate the element whose hash handle address is hhp */ +#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho))) +/* calculate the hash handle from element address elp */ +#define HH_FROM_ELMT(tbl,elp) ((UT_hash_handle*)(void*)(((char*)(elp)) + ((tbl)->hho))) + +#define HASH_ROLLBACK_BKT(hh, head, itemptrhh) \ +do { \ + struct UT_hash_handle *_hd_hh_item = (itemptrhh); \ + unsigned _hd_bkt; \ + HASH_TO_BKT(_hd_hh_item->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + (head)->hh.tbl->buckets[_hd_bkt].count++; \ + _hd_hh_item->hh_next = NULL; \ + _hd_hh_item->hh_prev = NULL; \ +} while (0) + +#define HASH_VALUE(keyptr,keylen,hashv) \ +do { \ + HASH_FUNCTION(keyptr, keylen, hashv); \ +} while (0) + +#define HASH_FIND_BYHASHVALUE(hh,head,keyptr,keylen,hashval,out) \ +do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_bkt; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _hf_bkt); \ + if (HASH_BLOOM_TEST((head)->hh.tbl, hashval)) { \ + HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ], keyptr, keylen, hashval, out); \ + } \ + } \ +} while (0) + +#define HASH_FIND(hh,head,keyptr,keylen,out) \ +do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_hashv; \ + HASH_VALUE(keyptr, keylen, _hf_hashv); \ + HASH_FIND_BYHASHVALUE(hh, head, keyptr, keylen, _hf_hashv, out); \ + } \ +} while (0) + +#ifdef HASH_BLOOM +#define HASH_BLOOM_BITLEN (1UL << HASH_BLOOM) +#define HASH_BLOOM_BYTELEN (HASH_BLOOM_BITLEN/8UL) + (((HASH_BLOOM_BITLEN%8UL)!=0UL) ? 
1UL : 0UL) +#define HASH_BLOOM_MAKE(tbl,oomed) \ +do { \ + (tbl)->bloom_nbits = HASH_BLOOM; \ + (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \ + if (!(tbl)->bloom_bv) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ + (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ + } \ +} while (0) + +#define HASH_BLOOM_FREE(tbl) \ +do { \ + uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ +} while (0) + +#define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8U] |= (1U << ((idx)%8U))) +#define HASH_BLOOM_BITTEST(bv,idx) ((bv[(idx)/8U] & (1U << ((idx)%8U))) != 0) + +#define HASH_BLOOM_ADD(tbl,hashv) \ + HASH_BLOOM_BITSET((tbl)->bloom_bv, ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#define HASH_BLOOM_TEST(tbl,hashv) \ + HASH_BLOOM_BITTEST((tbl)->bloom_bv, ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#else +#define HASH_BLOOM_MAKE(tbl,oomed) +#define HASH_BLOOM_FREE(tbl) +#define HASH_BLOOM_ADD(tbl,hashv) +#define HASH_BLOOM_TEST(tbl,hashv) 1 +#define HASH_BLOOM_BYTELEN 0U +#endif + +#define HASH_MAKE_TABLE(hh,head,oomed) \ +do { \ + (head)->hh.tbl = (UT_hash_table*)uthash_malloc(sizeof(UT_hash_table)); \ + if (!(head)->hh.tbl) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head)->hh.tbl->tail = &((head)->hh); \ + (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ + (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ + (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head); \ + (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc( \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + (head)->hh.tbl->signature = HASH_SIGNATURE; \ + if (!(head)->hh.tbl->buckets) { \ + HASH_RECORD_OOM(oomed); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + } else { \ + uthash_bzero((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + 
HASH_BLOOM_MAKE((head)->hh.tbl, oomed); \ + IF_HASH_NONFATAL_OOM( \ + if (oomed) { \ + uthash_free((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + } \ + ) \ + } \ + } \ +} while (0) + +#define HASH_REPLACE_BYHASHVALUE_INORDER(hh,head,fieldname,keylen_in,hashval,add,replaced,cmpfcn) \ +do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), keylen_in, hashval, add, cmpfcn); \ +} while (0) + +#define HASH_REPLACE_BYHASHVALUE(hh,head,fieldname,keylen_in,hashval,add,replaced) \ +do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, add); \ +} while (0) + +#define HASH_REPLACE(hh,head,fieldname,keylen_in,add,replaced) \ +do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE(hh, head, fieldname, keylen_in, _hr_hashv, add, replaced); \ +} while (0) + +#define HASH_REPLACE_INORDER(hh,head,fieldname,keylen_in,add,replaced,cmpfcn) \ +do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, _hr_hashv, add, replaced, cmpfcn); \ +} while (0) + +#define HASH_APPEND_LIST(hh, head, add) \ +do { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ + (head)->hh.tbl->tail->next = (add); \ + (head)->hh.tbl->tail = &((add)->hh); \ +} while (0) + +#define HASH_AKBI_INNER_LOOP(hh,head,add,cmpfcn) \ +do { \ + do { \ + if (cmpfcn(DECLTYPE(head)(_hs_iter), add) > 0) { \ + break; \ + } \ + } 
while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ +} while (0) + +#ifdef NO_DECLTYPE +#undef HASH_AKBI_INNER_LOOP +#define HASH_AKBI_INNER_LOOP(hh,head,add,cmpfcn) \ +do { \ + char *_hs_saved_head = (char*)(head); \ + do { \ + DECLTYPE_ASSIGN(head, _hs_iter); \ + if (cmpfcn(head, add) > 0) { \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + break; \ + } \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + } while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ +} while (0) +#endif + +#if HASH_NONFATAL_OOM + +#define HASH_ADD_TO_TABLE(hh,head,keyptr,keylen_in,hashval,add,oomed) \ +do { \ + if (!(oomed)) { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + if (oomed) { \ + HASH_ROLLBACK_BKT(hh, head, &(add)->hh); \ + HASH_DELETE_HH(hh, head, &(add)->hh); \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } else { \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ + } \ + } else { \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } \ +} while (0) + +#else + +#define HASH_ADD_TO_TABLE(hh,head,keyptr,keylen_in,hashval,add,oomed) \ +do { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ +} while (0) + +#endif + + +#define HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh,head,keyptr,keylen_in,hashval,add,cmpfcn) \ +do { \ + IF_HASH_NONFATAL_OOM( int _ha_oomed = 0; ) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (char*) (keyptr); \ + (add)->hh.keylen = (unsigned) (keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + 
IF_HASH_NONFATAL_OOM( if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( } ) \ + } else { \ + void *_hs_iter = (head); \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn); \ + if (_hs_iter) { \ + (add)->hh.next = _hs_iter; \ + if (((add)->hh.prev = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev)) { \ + HH_FROM_ELMT((head)->hh.tbl, (add)->hh.prev)->next = (add); \ + } else { \ + (head) = (add); \ + } \ + HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev = (add); \ + } else { \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE_INORDER"); \ +} while (0) + +#define HASH_ADD_KEYPTR_INORDER(hh,head,keyptr,keylen_in,add,cmpfcn) \ +do { \ + unsigned _hs_hashv; \ + HASH_VALUE(keyptr, keylen_in, _hs_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, keyptr, keylen_in, _hs_hashv, add, cmpfcn); \ +} while (0) + +#define HASH_ADD_BYHASHVALUE_INORDER(hh,head,fieldname,keylen_in,hashval,add,cmpfcn) \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), keylen_in, hashval, add, cmpfcn) + +#define HASH_ADD_INORDER(hh,head,fieldname,keylen_in,add,cmpfcn) \ + HASH_ADD_KEYPTR_INORDER(hh, head, &((add)->fieldname), keylen_in, add, cmpfcn) + +#define HASH_ADD_KEYPTR_BYHASHVALUE(hh,head,keyptr,keylen_in,hashval,add) \ +do { \ + IF_HASH_NONFATAL_OOM( int _ha_oomed = 0; ) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (const void*) (keyptr); \ + (add)->hh.keylen = (unsigned) (keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM( if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( } ) \ + } else { \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, 
"HASH_ADD_KEYPTR_BYHASHVALUE"); \ +} while (0) + +#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \ +do { \ + unsigned _ha_hashv; \ + HASH_VALUE(keyptr, keylen_in, _ha_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, keyptr, keylen_in, _ha_hashv, add); \ +} while (0) + +#define HASH_ADD_BYHASHVALUE(hh,head,fieldname,keylen_in,hashval,add) \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, add) + +#define HASH_ADD(hh,head,fieldname,keylen_in,add) \ + HASH_ADD_KEYPTR(hh, head, &((add)->fieldname), keylen_in, add) + +#define HASH_TO_BKT(hashv,num_bkts,bkt) \ +do { \ + bkt = ((hashv) & ((num_bkts) - 1U)); \ +} while (0) + +/* delete "delptr" from the hash table. + * "the usual" patch-up process for the app-order doubly-linked-list. + * The use of _hd_hh_del below deserves special explanation. + * These used to be expressed using (delptr) but that led to a bug + * if someone used the same symbol for the head and deletee, like + * HASH_DELETE(hh,users,users); + * We want that to work, but by changing the head (users) below + * we were forfeiting our ability to further refer to the deletee (users) + * in the patch-up process. Solution: use scratch space to + * copy the deletee pointer, then the latter references are via that + * scratch pointer rather than through the repointed (users) symbol. 
+ */ +#define HASH_DELETE(hh,head,delptr) \ + HASH_DELETE_HH(hh, head, &(delptr)->hh) + +#define HASH_DELETE_HH(hh,head,delptrhh) \ +do { \ + const struct UT_hash_handle *_hd_hh_del = (delptrhh); \ + if ((_hd_hh_del->prev == NULL) && (_hd_hh_del->next == NULL)) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } else { \ + unsigned _hd_bkt; \ + if (_hd_hh_del == (head)->hh.tbl->tail) { \ + (head)->hh.tbl->tail = HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev); \ + } \ + if (_hd_hh_del->prev != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev)->next = _hd_hh_del->next; \ + } else { \ + DECLTYPE_ASSIGN(head, _hd_hh_del->next); \ + } \ + if (_hd_hh_del->next != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->next)->prev = _hd_hh_del->prev; \ + } \ + HASH_TO_BKT(_hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + HASH_DEL_IN_BKT((head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ + (head)->hh.tbl->num_items--; \ + } \ + HASH_FSCK(hh, head, "HASH_DELETE_HH"); \ +} while (0) + +/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ +#define HASH_FIND_STR(head,findstr,out) \ +do { \ + unsigned _uthash_hfstr_keylen = (unsigned)uthash_strlen(findstr); \ + HASH_FIND(hh, head, findstr, _uthash_hfstr_keylen, out); \ +} while (0) +#define HASH_ADD_STR(head,strfield,add) \ +do { \ + unsigned _uthash_hastr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_ADD(hh, head, strfield[0], _uthash_hastr_keylen, add); \ +} while (0) +#define HASH_REPLACE_STR(head,strfield,add,replaced) \ +do { \ + unsigned _uthash_hrstr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_REPLACE(hh, head, strfield[0], _uthash_hrstr_keylen, add, replaced); \ +} while (0) +#define HASH_FIND_INT(head,findint,out) \ + HASH_FIND(hh,head,findint,sizeof(int),out) +#define HASH_ADD_INT(head,intfield,add) 
\ + HASH_ADD(hh,head,intfield,sizeof(int),add) +#define HASH_REPLACE_INT(head,intfield,add,replaced) \ + HASH_REPLACE(hh,head,intfield,sizeof(int),add,replaced) +#define HASH_FIND_PTR(head,findptr,out) \ + HASH_FIND(hh,head,findptr,sizeof(void *),out) +#define HASH_ADD_PTR(head,ptrfield,add) \ + HASH_ADD(hh,head,ptrfield,sizeof(void *),add) +#define HASH_REPLACE_PTR(head,ptrfield,add,replaced) \ + HASH_REPLACE(hh,head,ptrfield,sizeof(void *),add,replaced) +#define HASH_DEL(head,delptr) \ + HASH_DELETE(hh,head,delptr) + +/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined. + * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined. + */ +#ifdef HASH_DEBUG +#include /* fprintf, stderr */ +#define HASH_OOPS(...) do { fprintf(stderr, __VA_ARGS__); exit(-1); } while (0) +#define HASH_FSCK(hh,head,where) \ +do { \ + struct UT_hash_handle *_thh; \ + if (head) { \ + unsigned _bkt_i; \ + unsigned _count = 0; \ + char *_prev; \ + for (_bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; ++_bkt_i) { \ + unsigned _bkt_count = 0; \ + _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ + _prev = NULL; \ + while (_thh) { \ + if (_prev != (char*)(_thh->hh_prev)) { \ + HASH_OOPS("%s: invalid hh_prev %p, actual %p\n", \ + (where), (void*)_thh->hh_prev, (void*)_prev); \ + } \ + _bkt_count++; \ + _prev = (char*)(_thh); \ + _thh = _thh->hh_next; \ + } \ + _count += _bkt_count; \ + if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ + HASH_OOPS("%s: invalid bucket count %u, actual %u\n", \ + (where), (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ + } \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("%s: invalid hh item count %u, actual %u\n", \ + (where), (head)->hh.tbl->num_items, _count); \ + } \ + _count = 0; \ + _prev = NULL; \ + _thh = &(head)->hh; \ + while (_thh) { \ + _count++; \ + if (_prev != (char*)_thh->prev) { \ + HASH_OOPS("%s: invalid prev %p, actual %p\n", \ + (where), (void*)_thh->prev, 
/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at
 * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx
 * (archive link: https://archive.is/Ivcan )
 */
/* SAX hash: for every key byte, xor the running hash with
 * (hash << 5) + (hash >> 2) + byte.
 *
 *   key    - pointer to the key bytes (read as unsigned char)
 *   keylen - number of bytes to hash; evaluated exactly once and converted
 *            to unsigned, matching HASH_BER / HASH_JEN in this header
 *   hashv  - unsigned lvalue that receives the result (reset to 0 first)
 *
 * The length is copied into a local so that an expression argument such as
 * `cond ? a : b` cannot be re-parsed inside the loop condition and is not
 * re-evaluated on every iteration.
 */
#define HASH_SAX(key,keylen,hashv)                                               \
do {                                                                             \
  unsigned _sx_i;                                                                \
  const unsigned _sx_len = (unsigned)(keylen);                                   \
  const unsigned char *_hs_key = (const unsigned char*)(key);                    \
  (hashv) = 0;                                                                   \
  for (_sx_i = 0; _sx_i < _sx_len; _sx_i++) {                                    \
    (hashv) ^= ((hashv) << 5) + ((hashv) >> 2) + _hs_key[_sx_i];                 \
  }                                                                              \
} while (0)
/* One-at-a-time style hash: each key byte is added to the running hash, which
 * is then mixed with (hash << 10) and (hash >> 6); three further shift/mix
 * steps are applied after the last byte.
 *
 *   key    - pointer to the key bytes (read as unsigned char)
 *   keylen - number of bytes to hash; evaluated exactly once and converted
 *            to unsigned
 *   hashv  - unsigned lvalue that receives the result (reset to 0 first)
 *
 * The length is captured in a local so an expression argument cannot be
 * mis-parsed inside the loop condition or evaluated more than once.
 */
#define HASH_OAT(key,keylen,hashv)                                               \
do {                                                                             \
  unsigned _ho_i;                                                                \
  const unsigned _ho_len = (unsigned)(keylen);                                   \
  const unsigned char *_ho_key = (const unsigned char*)(key);                    \
  (hashv) = 0;                                                                   \
  for (_ho_i = 0; _ho_i < _ho_len; _ho_i++) {                                    \
    (hashv) += _ho_key[_ho_i];                                                   \
    (hashv) += ((hashv) << 10);                                                  \
    (hashv) ^= ((hashv) >> 6);                                                   \
  }                                                                              \
  (hashv) += ((hashv) << 3);                                                     \
  (hashv) ^= ((hashv) >> 11);                                                    \
  (hashv) += ((hashv) << 15);                                                    \
} while (0)
\ + case 6: _hj_j += ( (unsigned)_hj_key[5] << 8 ); /* FALLTHROUGH */ \ + case 5: _hj_j += _hj_key[4]; /* FALLTHROUGH */ \ + case 4: _hj_i += ( (unsigned)_hj_key[3] << 24 ); /* FALLTHROUGH */ \ + case 3: _hj_i += ( (unsigned)_hj_key[2] << 16 ); /* FALLTHROUGH */ \ + case 2: _hj_i += ( (unsigned)_hj_key[1] << 8 ); /* FALLTHROUGH */ \ + case 1: _hj_i += _hj_key[0]; /* FALLTHROUGH */ \ + default: ; \ + } \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ +} while (0) + +/* The Paul Hsieh hash function */ +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif +#define HASH_SFH(key,keylen,hashv) \ +do { \ + unsigned const char *_sfh_key=(unsigned const char*)(key); \ + uint32_t _sfh_tmp, _sfh_len = (uint32_t)keylen; \ + \ + unsigned _sfh_rem = _sfh_len & 3U; \ + _sfh_len >>= 2; \ + hashv = 0xcafebabeu; \ + \ + /* Main loop */ \ + for (;_sfh_len > 0U; _sfh_len--) { \ + hashv += get16bits (_sfh_key); \ + _sfh_tmp = ((uint32_t)(get16bits (_sfh_key+2)) << 11) ^ hashv; \ + hashv = (hashv << 16) ^ _sfh_tmp; \ + _sfh_key += 2U*sizeof (uint16_t); \ + hashv += hashv >> 11; \ + } \ + \ + /* Handle end cases */ \ + switch (_sfh_rem) { \ + case 3: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 16; \ + hashv ^= (uint32_t)(_sfh_key[sizeof (uint16_t)]) << 18; \ + hashv += hashv >> 11; \ + break; \ + case 2: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 11; \ + hashv += hashv >> 17; \ + break; \ + case 1: hashv += *_sfh_key; \ + hashv ^= hashv << 10; \ + hashv += hashv >> 1; \ + break; \ + default: ; \ + } \ + \ + /* Force "avalanching" of final 127 bits */ \ + hashv ^= hashv << 3; \ + hashv += hashv >> 5; \ + hashv ^= hashv << 4; \ + hashv += hashv >> 17; \ + hashv ^= 
hashv << 25; \ + hashv += hashv >> 6; \ +} while (0) + +/* iterate over items in a known bucket to find desired item */ +#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,hashval,out) \ +do { \ + if ((head).hh_head != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (head).hh_head)); \ + } else { \ + (out) = NULL; \ + } \ + while ((out) != NULL) { \ + if ((out)->hh.hashv == (hashval) && (out)->hh.keylen == (keylen_in)) { \ + if (HASH_KEYCMP((out)->hh.key, keyptr, keylen_in) == 0) { \ + break; \ + } \ + } \ + if ((out)->hh.hh_next != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (out)->hh.hh_next)); \ + } else { \ + (out) = NULL; \ + } \ + } \ +} while (0) + +/* add an item to a bucket */ +#define HASH_ADD_TO_BKT(head,hh,addhh,oomed) \ +do { \ + UT_hash_bucket *_ha_head = &(head); \ + _ha_head->count++; \ + (addhh)->hh_next = _ha_head->hh_head; \ + (addhh)->hh_prev = NULL; \ + if (_ha_head->hh_head != NULL) { \ + _ha_head->hh_head->hh_prev = (addhh); \ + } \ + _ha_head->hh_head = (addhh); \ + if ((_ha_head->count >= ((_ha_head->expand_mult + 1U) * HASH_BKT_CAPACITY_THRESH)) \ + && !(addhh)->tbl->noexpand) { \ + HASH_EXPAND_BUCKETS(addhh,(addhh)->tbl, oomed); \ + IF_HASH_NONFATAL_OOM( \ + if (oomed) { \ + HASH_DEL_IN_BKT(head,addhh); \ + } \ + ) \ + } \ +} while (0) + +/* remove an item from a given bucket */ +#define HASH_DEL_IN_BKT(head,delhh) \ +do { \ + UT_hash_bucket *_hd_head = &(head); \ + _hd_head->count--; \ + if (_hd_head->hh_head == (delhh)) { \ + _hd_head->hh_head = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_prev) { \ + (delhh)->hh_prev->hh_next = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_next) { \ + (delhh)->hh_next->hh_prev = (delhh)->hh_prev; \ + } \ +} while (0) + +/* Bucket expansion has the effect of doubling the number of buckets + * and redistributing the items into the new buckets. 
Ideally the + * items will distribute more or less evenly into the new buckets + * (the extent to which this is true is a measure of the quality of + * the hash function as it applies to the key domain). + * + * With the items distributed into more buckets, the chain length + * (item count) in each bucket is reduced. Thus by expanding buckets + * the hash keeps a bound on the chain length. This bounded chain + * length is the essence of how a hash provides constant time lookup. + * + * The calculation of tbl->ideal_chain_maxlen below deserves some + * explanation. First, keep in mind that we're calculating the ideal + * maximum chain length based on the *new* (doubled) bucket count. + * In fractions this is just n/b (n=number of items,b=new num buckets). + * Since the ideal chain length is an integer, we want to calculate + * ceil(n/b). We don't depend on floating point arithmetic in this + * hash, so to calculate ceil(n/b) with integers we could write + * + * ceil(n/b) = (n/b) + ((n%b)?1:0) + * + * and in fact a previous version of this hash did just that. + * But now we have improved things a bit by recognizing that b is + * always a power of two. We keep its base 2 log handy (call it lb), + * so now we can write this with a bit shift and logical AND: + * + * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) + * + */ +#define HASH_EXPAND_BUCKETS(hh,tbl,oomed) \ +do { \ + unsigned _he_bkt; \ + unsigned _he_bkt_i; \ + struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ + UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ + _he_new_buckets = (UT_hash_bucket*)uthash_malloc( \ + sizeof(struct UT_hash_bucket) * (tbl)->num_buckets * 2U); \ + if (!_he_new_buckets) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero(_he_new_buckets, \ + sizeof(struct UT_hash_bucket) * (tbl)->num_buckets * 2U); \ + (tbl)->ideal_chain_maxlen = \ + ((tbl)->num_items >> ((tbl)->log2_num_buckets+1U)) + \ + ((((tbl)->num_items & (((tbl)->num_buckets*2U)-1U)) != 0U) ? 
1U : 0U); \ + (tbl)->nonideal_items = 0; \ + for (_he_bkt_i = 0; _he_bkt_i < (tbl)->num_buckets; _he_bkt_i++) { \ + _he_thh = (tbl)->buckets[ _he_bkt_i ].hh_head; \ + while (_he_thh != NULL) { \ + _he_hh_nxt = _he_thh->hh_next; \ + HASH_TO_BKT(_he_thh->hashv, (tbl)->num_buckets * 2U, _he_bkt); \ + _he_newbkt = &(_he_new_buckets[_he_bkt]); \ + if (++(_he_newbkt->count) > (tbl)->ideal_chain_maxlen) { \ + (tbl)->nonideal_items++; \ + if (_he_newbkt->count > _he_newbkt->expand_mult * (tbl)->ideal_chain_maxlen) { \ + _he_newbkt->expand_mult++; \ + } \ + } \ + _he_thh->hh_prev = NULL; \ + _he_thh->hh_next = _he_newbkt->hh_head; \ + if (_he_newbkt->hh_head != NULL) { \ + _he_newbkt->hh_head->hh_prev = _he_thh; \ + } \ + _he_newbkt->hh_head = _he_thh; \ + _he_thh = _he_hh_nxt; \ + } \ + } \ + uthash_free((tbl)->buckets, (tbl)->num_buckets * sizeof(struct UT_hash_bucket)); \ + (tbl)->num_buckets *= 2U; \ + (tbl)->log2_num_buckets++; \ + (tbl)->buckets = _he_new_buckets; \ + (tbl)->ineff_expands = ((tbl)->nonideal_items > ((tbl)->num_items >> 1)) ? \ + ((tbl)->ineff_expands+1U) : 0U; \ + if ((tbl)->ineff_expands > 1U) { \ + (tbl)->noexpand = 1; \ + uthash_noexpand_fyi(tbl); \ + } \ + uthash_expand_fyi(tbl); \ + } \ +} while (0) + + +/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ +/* Note that HASH_SORT assumes the hash handle name to be hh. + * HASH_SRT was added to allow the hash handle name to be passed in. 
*/ +#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn) +#define HASH_SRT(hh,head,cmpfcn) \ +do { \ + unsigned _hs_i; \ + unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize; \ + struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ + if (head != NULL) { \ + _hs_insize = 1; \ + _hs_looping = 1; \ + _hs_list = &((head)->hh); \ + while (_hs_looping != 0U) { \ + _hs_p = _hs_list; \ + _hs_list = NULL; \ + _hs_tail = NULL; \ + _hs_nmerges = 0; \ + while (_hs_p != NULL) { \ + _hs_nmerges++; \ + _hs_q = _hs_p; \ + _hs_psize = 0; \ + for (_hs_i = 0; _hs_i < _hs_insize; ++_hs_i) { \ + _hs_psize++; \ + _hs_q = ((_hs_q->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) : NULL); \ + if (_hs_q == NULL) { \ + break; \ + } \ + } \ + _hs_qsize = _hs_insize; \ + while ((_hs_psize != 0U) || ((_hs_qsize != 0U) && (_hs_q != NULL))) { \ + if (_hs_psize == 0U) { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) : NULL); \ + _hs_qsize--; \ + } else if ((_hs_qsize == 0U) || (_hs_q == NULL)) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) : NULL); \ + } \ + _hs_psize--; \ + } else if ((cmpfcn( \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, _hs_p)), \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, _hs_q)) \ + )) <= 0) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) : NULL); \ + } \ + _hs_psize--; \ + } else { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) : NULL); \ + _hs_qsize--; \ + } \ + if ( _hs_tail != NULL ) { \ + _hs_tail->next = ((_hs_e != NULL) ? \ + ELMT_FROM_HH((head)->hh.tbl, _hs_e) : NULL); \ + } else { \ + _hs_list = _hs_e; \ + } \ + if (_hs_e != NULL) { \ + _hs_e->prev = ((_hs_tail != NULL) ? 
\ + ELMT_FROM_HH((head)->hh.tbl, _hs_tail) : NULL); \ + } \ + _hs_tail = _hs_e; \ + } \ + _hs_p = _hs_q; \ + } \ + if (_hs_tail != NULL) { \ + _hs_tail->next = NULL; \ + } \ + if (_hs_nmerges <= 1U) { \ + _hs_looping = 0; \ + (head)->hh.tbl->tail = _hs_tail; \ + DECLTYPE_ASSIGN(head, ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ + } \ + _hs_insize *= 2U; \ + } \ + HASH_FSCK(hh, head, "HASH_SRT"); \ + } \ +} while (0) + +/* This function selects items from one hash into another hash. + * The end result is that the selected items have dual presence + * in both hashes. There is no copy of the items made; rather + * they are added into the new hash through a secondary hash + * hash handle that must be present in the structure. */ +#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ +do { \ + unsigned _src_bkt, _dst_bkt; \ + void *_last_elt = NULL, *_elt; \ + UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL; \ + ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); \ + if ((src) != NULL) { \ + for (_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) { \ + for (_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ + _src_hh != NULL; \ + _src_hh = _src_hh->hh_next) { \ + _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ + if (cond(_elt)) { \ + IF_HASH_NONFATAL_OOM( int _hs_oomed = 0; ) \ + _dst_hh = (UT_hash_handle*)(void*)(((char*)_elt) + _dst_hho); \ + _dst_hh->key = _src_hh->key; \ + _dst_hh->keylen = _src_hh->keylen; \ + _dst_hh->hashv = _src_hh->hashv; \ + _dst_hh->prev = _last_elt; \ + _dst_hh->next = NULL; \ + if (_last_elt_hh != NULL) { \ + _last_elt_hh->next = _elt; \ + } \ + if ((dst) == NULL) { \ + DECLTYPE_ASSIGN(dst, _elt); \ + HASH_MAKE_TABLE(hh_dst, dst, _hs_oomed); \ + IF_HASH_NONFATAL_OOM( \ + if (_hs_oomed) { \ + uthash_nonfatal_oom(_elt); \ + (dst) = NULL; \ + continue; \ + } \ + ) \ + } else { \ + _dst_hh->tbl = (dst)->hh_dst.tbl; \ + } \ + HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ + 
HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt], hh_dst, _dst_hh, _hs_oomed); \ + (dst)->hh_dst.tbl->num_items++; \ + IF_HASH_NONFATAL_OOM( \ + if (_hs_oomed) { \ + HASH_ROLLBACK_BKT(hh_dst, dst, _dst_hh); \ + HASH_DELETE_HH(hh_dst, dst, _dst_hh); \ + _dst_hh->tbl = NULL; \ + uthash_nonfatal_oom(_elt); \ + continue; \ + } \ + ) \ + HASH_BLOOM_ADD(_dst_hh->tbl, _dst_hh->hashv); \ + _last_elt = _elt; \ + _last_elt_hh = _dst_hh; \ + } \ + } \ + } \ + } \ + HASH_FSCK(hh_dst, dst, "HASH_SELECT"); \ +} while (0) + +#define HASH_CLEAR(hh,head) \ +do { \ + if ((head) != NULL) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } \ +} while (0) + +#define HASH_OVERHEAD(hh,head) \ + (((head) != NULL) ? ( \ + (size_t)(((head)->hh.tbl->num_items * sizeof(UT_hash_handle)) + \ + ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket)) + \ + sizeof(UT_hash_table) + \ + (HASH_BLOOM_BYTELEN))) : 0U) + +#ifdef NO_DECLTYPE +#define HASH_ITER(hh,head,el,tmp) \ +for(((el)=(head)), ((*(char**)(&(tmp)))=(char*)((head!=NULL)?(head)->hh.next:NULL)); \ + (el) != NULL; ((el)=(tmp)), ((*(char**)(&(tmp)))=(char*)((tmp!=NULL)?(tmp)->hh.next:NULL))) +#else +#define HASH_ITER(hh,head,el,tmp) \ +for(((el)=(head)), ((tmp)=DECLTYPE(el)((head!=NULL)?(head)->hh.next:NULL)); \ + (el) != NULL; ((el)=(tmp)), ((tmp)=DECLTYPE(el)((tmp!=NULL)?(tmp)->hh.next:NULL))) +#endif + +/* obtain a count of items in the hash */ +#define HASH_COUNT(head) HASH_CNT(hh,head) +#define HASH_CNT(hh,head) ((head != NULL)?((head)->hh.tbl->num_items):0U) + +typedef struct UT_hash_bucket { + struct UT_hash_handle *hh_head; + unsigned count; + + /* expand_mult is normally set to 0. In this situation, the max chain length + * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. 
(If + * the bucket's chain exceeds this length, bucket expansion is triggered). + * However, setting expand_mult to a non-zero value delays bucket expansion + * (that would be triggered by additions to this particular bucket) + * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH. + * (The multiplier is simply expand_mult+1). The whole idea of this + * multiplier is to reduce bucket expansions, since they are expensive, in + * situations where we know that a particular bucket tends to be overused. + * It is better to let its chain length grow to a longer yet-still-bounded + * value, than to do an O(n) bucket expansion too often. + */ + unsigned expand_mult; + +} UT_hash_bucket; + +/* random signature used only to find hash tables in external analysis */ +#define HASH_SIGNATURE 0xa0111fe1u +#define HASH_BLOOM_SIGNATURE 0xb12220f2u + +typedef struct UT_hash_table { + UT_hash_bucket *buckets; + unsigned num_buckets, log2_num_buckets; + unsigned num_items; + struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ + ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element */ + + /* in an ideal situation (all buckets used equally), no bucket would have + * more than ceil(#items/#buckets) items. that's the ideal chain length. */ + unsigned ideal_chain_maxlen; + + /* nonideal_items is the number of items in the hash whose chain position + * exceeds the ideal chain maxlen. these items pay the penalty for an uneven + * hash distribution; reaching them in a chain traversal takes >ideal steps */ + unsigned nonideal_items; + + /* ineffective expands occur when a bucket doubling was performed, but + * afterward, more than half the items in the hash had nonideal chain + * positions. If this happens on two consecutive expansions we inhibit any + * further expansion, as it's not helping; this happens when the hash + * function isn't a good fit for the key domain. 
/*
 * Convert a byte range into the io_unit-aligned range that fully covers it.
 *
 *   offset_bytes    first byte of the request
 *   len_bytes       length of the request in bytes
 *   io_unit_size    size of one io unit in bytes (must be non-zero)
 *   unit_offset     out: index of the first io unit touched
 *   unit_len        out: number of io units touched
 *   buf_offset_out  out: offset of the requested data inside the aligned
 *                   range (offset_bytes minus the aligned-down start)
 *
 * Returns 0 on success, -EINVAL if any output pointer is NULL or
 * io_unit_size is 0, and -EOVERFLOW if offset_bytes + len_bytes does not fit
 * in 64 bits. A zero-length request succeeds with all outputs set to 0.
 * Outputs are left untouched on error.
 */
int zvfs_calc_io_units(uint64_t offset_bytes,
                       size_t len_bytes,
                       uint32_t io_unit_size,
                       uint64_t *unit_offset,
                       uint64_t *unit_len,
                       uint32_t *buf_offset_out) {
    if (!unit_offset || !unit_len || !buf_offset_out || io_unit_size == 0) {
        return -EINVAL;
    }
    if (len_bytes == 0) {
        *unit_offset = 0;
        *unit_len = 0;
        *buf_offset_out = 0;
        return 0;
    }

    /* Reject ranges whose end would wrap around the 64-bit byte space. */
    if ((uint64_t)len_bytes > UINT64_MAX - offset_bytes) {
        return -EOVERFLOW;
    }

    /* Work with the index of the last byte instead of rounding the end up:
     * "end + io_unit_size - 1" can itself wrap near UINT64_MAX, while
     * (end - 1) / io_unit_size cannot. len_bytes > 0 guarantees end >= 1. */
    uint64_t end_bytes = offset_bytes + (uint64_t)len_bytes;
    uint64_t first_unit = offset_bytes / io_unit_size;
    uint64_t last_unit = (end_bytes - 1) / io_unit_size;

    *unit_offset = first_unit;
    *unit_len = last_unit - first_unit + 1;
    /* The remainder is < io_unit_size, so it always fits in 32 bits. */
    *buf_offset_out = (uint32_t)(offset_bytes % io_unit_size);

    return 0;
}
zvfs_calc_ceil_units(uint64_t bytes, + uint64_t unit_size, + uint64_t *units_out) { + if (!units_out || unit_size == 0) { + return -EINVAL; + } + + *units_out = bytes / unit_size; + if ((bytes % unit_size) != 0) { + if (*units_out == UINT64_MAX) { + return -EOVERFLOW; + } + (*units_out)++; + } + return 0; +} + +int buf_init(zvfs_buf_t *b, size_t initial) +{ + b->data = malloc(initial); + if (!b->data) return -1; + b->cap = initial; + b->len = 0; + return 0; +} + +void buf_free(zvfs_buf_t *b) +{ + free(b->data); + b->data = NULL; + b->len = b->cap = 0; +} + +/* + * 确保缓冲区还有 need 字节可用,不够则 realloc 两倍。 + */ +int buf_reserve(zvfs_buf_t *b, size_t need) +{ + if (b->len + need <= b->cap) return 0; + + size_t new_cap = b->cap * 2; + while (new_cap < b->len + need) new_cap *= 2; + + uint8_t *p = realloc(b->data, new_cap); + if (!p) return -1; + b->data = p; + b->cap = new_cap; + return 0; +} + +int buf_append(zvfs_buf_t *b, const void *src, size_t n) +{ + if (buf_reserve(b, n) != 0) return -1; + memcpy(b->data + b->len, src, n); + b->len += n; + return 0; +} diff --git a/src/common/utils.h b/src/common/utils.h new file mode 100644 index 0000000..1d63023 --- /dev/null +++ b/src/common/utils.h @@ -0,0 +1,29 @@ +#ifndef __ZVFS_COMMON_UTILS_H__ +#define __ZVFS_COMMON_UTILS_H__ + +#include +#include + +int zvfs_calc_io_units(uint64_t offset_bytes, + size_t len_bytes, + uint32_t io_unit_size, + uint64_t *unit_offset, + uint64_t *unit_len, + uint32_t *buf_offset_out); + +int zvfs_calc_ceil_units(uint64_t bytes, + uint64_t unit_size, + uint64_t *units_out); + +typedef struct { + uint8_t *data; + size_t cap; + size_t len; +} zvfs_buf_t; + +int buf_init(zvfs_buf_t *b, size_t initial); +void buf_free(zvfs_buf_t *b); +int buf_reserve(zvfs_buf_t *b, size_t need); +int buf_append(zvfs_buf_t *b, const void *src, size_t n); + +#endif // __ZVFS_COMMON_UTILS_H__ diff --git a/src/config.h b/src/config.h new file mode 100644 index 0000000..fa543a4 --- /dev/null +++ b/src/config.h @@ -0,0 +1,32 
@@ +#ifndef __ZVFS_CONFIG_H__ +#define __ZVFS_CONFIG_H__ + +/** + * ZVFS + */ +#define ZVFS_XATTR_BLOB_ID "user.zvfs.blob_id" + +/** + * SPDK + */ +// dev +#define SPDK_JSON_PATH "/home/lian/try/zvfs/src/zvfsmalloc.json" +// #define ZVFS_BDEV "Nvme0n1" +#ifndef ZVFS_BDEV +#define ZVFS_BDEV "Malloc0" +#endif + +// super blob +#define ZVFS_SB_MAGIC UINT64_C(0x5A5646535F534200) /* "ZVFS_SB\0" */ +#define ZVFS_SB_VERSION UINT32_C(1) + +// dma +#define ZVFS_DMA_BUF_SIZE (1024 * 1024) + +// waiter +#define WAITER_MAX_TIME 10000000 + + + + +#endif // __ZVFS_CONFIG_H__ diff --git a/zvfs/fio_script/bdev.fio b/src/fio_script/bdev.fio similarity index 100% rename from zvfs/fio_script/bdev.fio rename to src/fio_script/bdev.fio diff --git a/zvfs/fio_script/io_uring.fio b/src/fio_script/io_uring.fio similarity index 100% rename from zvfs/fio_script/io_uring.fio rename to src/fio_script/io_uring.fio diff --git a/zvfs/fio_script/kingfs b/src/fio_script/kingfs similarity index 100% rename from zvfs/fio_script/kingfs rename to src/fio_script/kingfs diff --git a/zvfs/fio_script/libaio.fio b/src/fio_script/libaio.fio similarity index 100% rename from zvfs/fio_script/libaio.fio rename to src/fio_script/libaio.fio diff --git a/zvfs/fio_script/nvme.fio b/src/fio_script/nvme.fio similarity index 100% rename from zvfs/fio_script/nvme.fio rename to src/fio_script/nvme.fio diff --git a/zvfs/fio_script/psync.fio b/src/fio_script/psync.fio similarity index 100% rename from zvfs/fio_script/psync.fio rename to src/fio_script/psync.fio diff --git a/zvfs/fio_script/zvfs.json b/src/fio_script/zvfs.json similarity index 100% rename from zvfs/fio_script/zvfs.json rename to src/fio_script/zvfs.json diff --git a/src/fs/zvfs.c b/src/fs/zvfs.c new file mode 100644 index 0000000..d9306fe --- /dev/null +++ b/src/fs/zvfs.c @@ -0,0 +1,103 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "config.h" +#include "common/utils.h" +#include "fs/zvfs.h" +#include "fs/zvfs_inode.h" +#include 
"fs/zvfs_path_entry.h" +#include "fs/zvfs_open_file.h" + +#include +#include +struct zvfs_fs g_fs = {0}; + +/* ------------------------------------------------------------------ */ +/* init / destroy */ +/* ------------------------------------------------------------------ */ + +int zvfs_fs_init(void) { + memset(&g_fs, 0, sizeof(g_fs)); + + if (pthread_mutex_init(&g_fs.inode_mu, NULL) != 0) goto fail_inode; + if (pthread_mutex_init(&g_fs.path_mu, NULL) != 0) goto fail_path; + if (pthread_mutex_init(&g_fs.fd_mu, NULL) != 0) goto fail_fd; + + return 0; + +fail_fd: + pthread_mutex_destroy(&g_fs.path_mu); +fail_path: + pthread_mutex_destroy(&g_fs.inode_mu); +fail_inode: + return -1; +} + +/** + * 销毁 fd_table:每个 openfile 只释放结构体内存, + * blob_close / inode 引用计数的清理应由上层在进程退出前完成。 + * 这里做"强制兜底"清理,避免内存泄漏。 + */ +int zvfs_fs_destroy(void) { + + pthread_mutex_lock(&g_fs.fd_mu); + { + struct zvfs_open_file *of, *tmp_of; + HASH_ITER(hh, g_fs.fd_table, of, tmp_of) { + HASH_DEL(g_fs.fd_table, of); + openfile_free(of); + } + } + pthread_mutex_unlock(&g_fs.fd_mu); + + /* 销毁 path_cache */ + pthread_mutex_lock(&g_fs.path_mu); + { + struct zvfs_path_entry *pe, *tmp_pe; + HASH_ITER(hh, g_fs.path_cache, pe, tmp_pe) { + HASH_DEL(g_fs.path_cache, pe); + free(pe->path); + free(pe); + } + } + pthread_mutex_unlock(&g_fs.path_mu); + + /* 销毁 inode_table */ + pthread_mutex_lock(&g_fs.inode_mu); + { + struct zvfs_inode *in, *tmp_in; + HASH_ITER(hh, g_fs.inode_table, in, tmp_in) { + HASH_DEL(g_fs.inode_table, in); + inode_free(in); + } + } + pthread_mutex_unlock(&g_fs.inode_mu); + + pthread_mutex_destroy(&g_fs.fd_mu); + pthread_mutex_destroy(&g_fs.path_mu); + pthread_mutex_destroy(&g_fs.inode_mu); + + return 0; +} + +/* ------------------------------------------------------------------ */ +/* xattr helpers */ +/* ------------------------------------------------------------------ */ + +int zvfs_xattr_write_blob_id(int fd, uint64_t blob_id) +{ + if (fsetxattr(fd, ZVFS_XATTR_BLOB_ID, &blob_id, 
sizeof(blob_id), 0) < 0) + return -1; + return 0; +} + +int zvfs_xattr_read_blob_id(int fd, uint64_t *blob_id_out) +{ + ssize_t ret = fgetxattr(fd, ZVFS_XATTR_BLOB_ID, blob_id_out, sizeof(uint64_t)); + if (ret != sizeof(uint64_t)) { + if (ret >= 0) + errno = EIO; /* 长度不对,视为损坏 */ + return -1; + } + return 0; +} \ No newline at end of file diff --git a/src/fs/zvfs.h b/src/fs/zvfs.h new file mode 100644 index 0000000..024ee62 --- /dev/null +++ b/src/fs/zvfs.h @@ -0,0 +1,35 @@ +#ifndef __ZVFS_FS_GLOBAL_H__ +#define __ZVFS_FS_GLOBAL_H__ + +#include +#include +#include + +struct zvfs_inode; +struct zvfs_path_entry; +struct zvfs_open_file; + +struct zvfs_fs { + struct zvfs_inode *inode_table; /* blob_id → inode */ + struct zvfs_path_entry *path_cache; /* path → inode(运行时缓存)*/ + struct zvfs_open_file *fd_table; /* fd → openfile */ + + pthread_mutex_t inode_mu; + pthread_mutex_t path_mu; + pthread_mutex_t fd_mu; +}; + +struct strace { + + +}; + +extern struct zvfs_fs g_fs; + +int zvfs_fs_init(void); +int zvfs_fs_destroy(void); + +int zvfs_xattr_write_blob_id(int fd, uint64_t blob_id); +int zvfs_xattr_read_blob_id(int fd, uint64_t *blob_id_out); + +#endif // __ZVFS_FS_GLOBAL_H__ diff --git a/src/fs/zvfs_inode.c b/src/fs/zvfs_inode.c new file mode 100644 index 0000000..b0198e2 --- /dev/null +++ b/src/fs/zvfs_inode.c @@ -0,0 +1,83 @@ +#include "zvfs_inode.h" +#include "zvfs.h" + +#include +#include +#include +#include +#include +#include + +/* ------------------------------------------------------------------ */ +/* alloc / free */ +/* ------------------------------------------------------------------ */ + +struct zvfs_inode *inode_alloc(uint64_t blob_id, mode_t mode, zvfs_itype_t itype) { + struct zvfs_inode *in = calloc(1, sizeof(*in)); + if (!in) + return NULL; + + in->blob_id = blob_id; + in->logical_size = 0; + in->itype = itype; + in->mode = mode; + in->uid = getuid(); + in->gid = getgid(); + + time_t now = time(NULL); + in->atime = now; + in->mtime = now; + in->deleted 
= false; + + atomic_init(&in->ref_count, 1); + pthread_mutex_init(&in->mu, NULL); + + return in; +} + +void inode_free(struct zvfs_inode *inode){ + if (!inode) + return; + pthread_mutex_destroy(&inode->mu); + free(inode); +} + + +/* ------------------------------------------------------------------ */ +/* hash table operations (调用方持有 g_fs.inode_mu) */ +/* ------------------------------------------------------------------ */ + +void inode_insert(struct zvfs_inode *inode){ + HASH_ADD(hh, g_fs.inode_table, blob_id, sizeof(uint64_t), inode); +} + +struct zvfs_inode *inode_lookup(uint64_t blob_id) { + struct zvfs_inode *in = NULL; + HASH_FIND(hh, g_fs.inode_table, &blob_id, sizeof(uint64_t), in); + return in; +} + +void inode_remove(uint64_t blob_id) { + struct zvfs_inode *in = inode_lookup(blob_id); + if (in) + HASH_DELETE(hh, g_fs.inode_table, in); +} + +/* ------------------------------------------------------------------ */ +/* size / timestamp helpers (调用方持有 inode->mu) */ +/* ------------------------------------------------------------------ */ + +void inode_update_size(struct zvfs_inode *inode, int real_fd, uint64_t new_size) { + inode->logical_size = new_size; + if (real_fd >= 0) + ftruncate(real_fd, (off_t)new_size); /* 同步 st_size,忽略错误 */ +} + +void inode_touch_atime(struct zvfs_inode *inode) { + inode->atime = time(NULL); +} + +void inode_touch_mtime(struct zvfs_inode *inode) +{ + inode->mtime = time(NULL); +} \ No newline at end of file diff --git a/src/fs/zvfs_inode.h b/src/fs/zvfs_inode.h new file mode 100644 index 0000000..bc4334e --- /dev/null +++ b/src/fs/zvfs_inode.h @@ -0,0 +1,58 @@ +#ifndef __ZVFS_INODE_H__ +#define __ZVFS_INODE_H__ + +#include "common/uthash.h" + +#include +#include +#include +#include +#include +#include + +typedef enum { + ZVFS_ITYPE_FILE = 0, + ZVFS_ITYPE_DIR = 1, +} zvfs_itype_t; + +struct zvfs_inode { + uint64_t blob_id; + uint64_t logical_size; // 和真实文件 st_size 保持同步 + zvfs_itype_t itype; // FILE only,DIR 不进这张表 + + mode_t mode; 
// 权限 + uid_t uid; // + gid_t gid; + time_t atime, mtime; + + atomic_int ref_count; + pthread_mutex_t mu; // 护 logical_size、append_offset 等更新 + bool deleted; + + UT_hash_handle hh; +}; + +// 分配并初始化一个 inode,不插入全局表 +struct zvfs_inode *inode_alloc(uint64_t blob_id, mode_t mode, zvfs_itype_t itype); + +// 释放 inode 内存(调用前确保 ref_count == 0) +void inode_free(struct zvfs_inode *inode); + +// 插入全局表(需持有 inode_mu) +void inode_insert(struct zvfs_inode *inode); + +// 按 blob_id 查找(需持有 inode_mu) +struct zvfs_inode *inode_lookup(uint64_t blob_id); + +// 从全局表移除(需持有 inode_mu,不释放内存) +void inode_remove(uint64_t blob_id); + +// 更新 logical_size,同时负责调用 ftruncate 同步 st_size +// 需持有 inode->mu +void inode_update_size(struct zvfs_inode *inode, int real_fd, uint64_t new_size); + +// 更新时间戳(需持有 inode->mu) +void inode_touch_atime(struct zvfs_inode *inode); +void inode_touch_mtime(struct zvfs_inode *inode); + +#endif // __ZVFS_INODE_H__ \ No newline at end of file diff --git a/src/fs/zvfs_open_file.c b/src/fs/zvfs_open_file.c new file mode 100644 index 0000000..178910c --- /dev/null +++ b/src/fs/zvfs_open_file.c @@ -0,0 +1,97 @@ +#include "zvfs_open_file.h" +#include "zvfs_inode.h" +#include "zvfs.h" + +#include +#include +#include +#include +#include + +/* ------------------------------------------------------------------ */ +/* alloc / free */ +/* ------------------------------------------------------------------ */ + +struct zvfs_open_file *openfile_alloc(int fd, + struct zvfs_inode *inode, + int flags, + struct zvfs_blob_handle *handle) +{ + struct zvfs_open_file *of = calloc(1, sizeof(*of)); + if (!of) + return NULL; + + of->fd = fd; + of->inode = inode; + of->handle = handle; + of->flags = flags; + of->fd_flags = 0; + of->offset = 0; + atomic_init(&of->ref_count, 1); + + return of; +} + +void openfile_free(struct zvfs_open_file *of) +{ + free(of); +} + +/* ------------------------------------------------------------------ */ +/* hash table operations (调用方持有 g_fs.fd_mu) */ +/* 
------------------------------------------------------------------ */
+
+/* Add an open file to the global fd table, keyed by its fd. */
+void openfile_insert(struct zvfs_open_file *of)
+{
+    HASH_ADD_INT(g_fs.fd_table, fd, of);
+}
+
+/* Find the open file registered for fd; returns NULL when there is none. */
+struct zvfs_open_file *openfile_lookup(int fd)
+{
+    struct zvfs_open_file *of = NULL;
+    HASH_FIND_INT(g_fs.fd_table, &fd, of);
+    return of;
+}
+
+/* Unlink the entry for fd from the global table; the memory is not freed. */
+void openfile_remove(int fd)
+{
+    struct zvfs_open_file *of = openfile_lookup(fd);
+    if (of)
+        HASH_DEL(g_fs.fd_table, of);
+}
+
+/* ------------------------------------------------------------------ */
+/* lseek (caller holds of->inode->mu) */
+/* ------------------------------------------------------------------ */
+
+/*
+ * Move of->offset according to whence/offset and return the new position.
+ * On error returns (uint64_t)-1 with errno set and leaves of->offset alone:
+ *   EINVAL    - unknown whence, or the result would be negative
+ *   EOVERFLOW - the result does not fit in a non-negative int64_t
+ */
+uint64_t openfile_seek(struct zvfs_open_file *of, int64_t offset, int whence)
+{
+    uint64_t base;
+
+    switch (whence) {
+    case SEEK_SET:
+        base = 0;
+        break;
+
+    case SEEK_CUR:
+        base = of->offset;
+        break;
+
+    case SEEK_END:
+        /* the caller holds the inode lock, so logical_size is stable here */
+        base = of->inode->logical_size;
+        break;
+
+    default:
+        errno = EINVAL;
+        return (uint64_t)-1;
+    }
+
+    /* Check before adding: signed overflow is undefined behaviour in C. */
+    if (base > (uint64_t)INT64_MAX ||
+        (offset > 0 && (int64_t)base > INT64_MAX - offset)) {
+        errno = EOVERFLOW;
+        return (uint64_t)-1;
+    }
+
+    int64_t new_off = (int64_t)base + offset;
+    if (new_off < 0) {
+        errno = EINVAL;
+        return (uint64_t)-1;
+    }
+
+    of->offset = (uint64_t)new_off;
+    return of->offset;
+}
\ No newline at end of file
diff --git a/src/fs/zvfs_open_file.h b/src/fs/zvfs_open_file.h
new file mode 100644
index 0000000..de47c1e
--- /dev/null
+++ b/src/fs/zvfs_open_file.h
@@ -0,0 +1,48 @@
+#ifndef __ZVFS_OPEN_FILE_H__
+#define __ZVFS_OPEN_FILE_H__
+
+#include "common/uthash.h"
+#include "spdk_engine/io_engine.h"
+#include
+#include
+
+#ifndef SPDK_BLOB_ID_DEFINED
+typedef uint64_t spdk_blob_id;
+#define SPDK_BLOB_ID_DEFINED
+#endif
+
+struct zvfs_open_file {
+    int fd; // key, 1:1 with the real fd
+    struct zvfs_inode *inode;
+    struct zvfs_blob_handle *handle;
+
+    int flags;
+    int fd_flags;
+
+    uint64_t offset; // current position when not in APPEND mode
+    atomic_int ref_count; // used by dup / close
+
+    UT_hash_handle hh;
+};
+
+// Allocate an openfile without inserting it into the global table; ref_count starts at 1
+struct zvfs_open_file *openfile_alloc(int fd, struct zvfs_inode *inode,
+                                      int
flags, struct zvfs_blob_handle *handle); + +// 释放内存(调用前确保 ref_count == 0,不负责 blob_close) +void openfile_free(struct zvfs_open_file *of); + +// 插入全局表(需持有 fd_mu) +void openfile_insert(struct zvfs_open_file *of); + +// 按 fd 查找(需持有 fd_mu) +struct zvfs_open_file *openfile_lookup(int fd); + +// 从全局表移除(需持有 fd_mu,不释放内存) +void openfile_remove(int fd); + +// lseek 语义:返回新 offset,出错返回 (uint64_t)-1 +// 需持有 of->inode->mu(读 logical_size) +uint64_t openfile_seek(struct zvfs_open_file *of, int64_t offset, int whence); + +#endif // __ZVFS_OPEN_FILE_H__ \ No newline at end of file diff --git a/src/fs/zvfs_path_entry.c b/src/fs/zvfs_path_entry.c new file mode 100644 index 0000000..10c037c --- /dev/null +++ b/src/fs/zvfs_path_entry.c @@ -0,0 +1,82 @@ +#include "zvfs_path_entry.h" +#include "zvfs.h" + +#include +#include +#include + +/* ------------------------------------------------------------------ */ +/* internal helper */ +/* ------------------------------------------------------------------ */ + +static struct zvfs_path_entry *_path_find(const char *path) +{ + struct zvfs_path_entry *e = NULL; + HASH_FIND_STR(g_fs.path_cache, path, e); + return e; +} + +/* ------------------------------------------------------------------ */ +/* public API (调用方持有 g_fs.path_mu) */ +/* ------------------------------------------------------------------ */ + +int path_cache_insert(const char *path, struct zvfs_inode *inode) +{ + if (_path_find(path)) + return -EEXIST; + + struct zvfs_path_entry *e = calloc(1, sizeof(*e)); + if (!e) + return -ENOMEM; + + e->path = strdup(path); + if (!e->path) { + free(e); + return -ENOMEM; + } + e->inode = inode; + + HASH_ADD_STR(g_fs.path_cache, path, e); + return 0; +} + +struct zvfs_path_entry *path_cache_lookup(const char *path) +{ + return _path_find(path); +} + +void path_cache_remove(const char *path) +{ + struct zvfs_path_entry *e = _path_find(path); + if (!e) + return; + HASH_DEL(g_fs.path_cache, e); + free(e->path); + free(e); +} + +int 
path_cache_rename(const char *old_path, const char *new_path)
+{
+    /*
+     * Re-key the cache entry for old_path to new_path, replacing any entry
+     * already stored under new_path (inode references are handled by the
+     * caller). Returns 0 on success, -1 when old_path is not cached or the
+     * new key cannot be allocated; on failure the cache is left unchanged.
+     */
+    struct zvfs_path_entry *old_e = _path_find(old_path);
+    if (!old_e)
+        return -1;
+
+    struct zvfs_path_entry *new_e = _path_find(new_path);
+    if (new_e == old_e)
+        return 0;   /* same entry: nothing to move, and it must not be freed */
+
+    /* Allocate the new key before touching the table so failure is harmless. */
+    char *new_key = strdup(new_path);
+    if (!new_key)
+        return -1;
+
+    /* Drop the entry being overwritten, if any. */
+    if (new_e) {
+        HASH_DEL(g_fs.path_cache, new_e);
+        free(new_e->path);
+        free(new_e);
+    }
+
+    /* Swap the key: unlink, replace the key string, link again. */
+    HASH_DEL(g_fs.path_cache, old_e);
+    free(old_e->path);
+    old_e->path = new_key;
+    HASH_ADD_STR(g_fs.path_cache, path, old_e);
+    return 0;
+}
\ No newline at end of file
diff --git a/src/fs/zvfs_path_entry.h b/src/fs/zvfs_path_entry.h
new file mode 100644
index 0000000..cc6e145
--- /dev/null
+++ b/src/fs/zvfs_path_entry.h
@@ -0,0 +1,30 @@
+#ifndef __ZVFS_PATH_ENTRY_H__
+#define __ZVFS_PATH_ENTRY_H__
+
+#include "common/uthash.h"
+#include
+#include
+
+struct zvfs_path_entry {
+    char *path; // key
+    struct zvfs_inode *inode;
+
+    UT_hash_handle hh;
+};
+
+
+// Insert into the cache; path is strdup'ed internally, inode->ref_count is not modified here
+// Caller must hold path_mu
+int path_cache_insert(const char *path, struct zvfs_inode *inode);
+
+// Look up; returns NULL on a miss (caller must hold path_mu)
+struct zvfs_path_entry *path_cache_lookup(const char *path);
+
+// Remove and free the entry (the inode is not freed; caller must hold path_mu)
+void path_cache_remove(const char *path);
+
+// rename: replace the key in one step (caller must hold path_mu)
+int path_cache_rename(const char *old_path, const char *new_path);
+
+
+#endif // __ZVFS_PATH_ENTRY_H__
\ No newline at end of file
diff --git a/src/fs/zvfs_sys_init.c b/src/fs/zvfs_sys_init.c
new file mode 100644
index 0000000..6f9375d
--- /dev/null
+++ b/src/fs/zvfs_sys_init.c
@@ -0,0 +1,38 @@
+// zvfs_sysinit.c
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "config.h"
+#include "zvfs_sys_init.h"
+#include "fs/zvfs.h" // zvfs_fs_init
+#include "spdk_engine/io_engine.h"
+
+#include
+#include
+#include
+
+static pthread_once_t _init_once = PTHREAD_ONCE_INIT;
+static int _init_ok = 0;
+
+static void
+do_init(void)
+{
+
const char *bdev = getenv("ZVFS_BDEV"); + if (!bdev) { + bdev = ZVFS_BDEV; + fprintf(stderr, "[zvfs] ZVFS_BDEV not set, set as (%s)\n", ZVFS_BDEV); + } + + if (io_engine_init(bdev) != 0) { + fprintf(stderr, "[zvfs] FATAL: io_engine_init(%s) failed\n", bdev); + abort(); + } + + _init_ok = 1; +} + +void +zvfs_ensure_init(void) +{ + pthread_once(&_init_once, do_init); +} \ No newline at end of file diff --git a/src/fs/zvfs_sys_init.h b/src/fs/zvfs_sys_init.h new file mode 100644 index 0000000..f076eed --- /dev/null +++ b/src/fs/zvfs_sys_init.h @@ -0,0 +1,15 @@ +// zvfs_sysinit.h +#ifndef __ZVFS_SYSINIT_H__ +#define __ZVFS_SYSINIT_H__ + +/* + * 确保 io_engine 已初始化。 + * 第一次被调用时执行初始化,后续调用直接返回。 + * 线程安全:内部用 pthread_once 保证只初始化一次。 + * + * 调用时机:第一次 open("/zvfs/...") 时触发。 + * 此时 main() 已经开始执行,SPDK 所需的运行环境已就绪。 + */ +void zvfs_ensure_init(void); + +#endif \ No newline at end of file diff --git a/src/hook/zvfs_hook.h b/src/hook/zvfs_hook.h new file mode 100644 index 0000000..a741875 --- /dev/null +++ b/src/hook/zvfs_hook.h @@ -0,0 +1,14 @@ +#ifndef __ZVFS_HOOK_H__ +#define __ZVFS_HOOK_H__ + +#include "zvfs_hook_init.h" +#include "zvfs_hook_fd.h" +#include "zvfs_hook_rw.h" +#include "zvfs_hook_seek.h" +#include "zvfs_hook_stat.h" +#include "zvfs_hook_sync.h" +#include "zvfs_hook_fcntl.h" +#include "zvfs_hook_dir.h" +#include "zvfs_hook_mmap.h" + +#endif // __ZVFS_HOOK_H__ \ No newline at end of file diff --git a/src/hook/zvfs_hook_dir.c b/src/hook/zvfs_hook_dir.c new file mode 100644 index 0000000..58e0a48 --- /dev/null +++ b/src/hook/zvfs_hook_dir.c @@ -0,0 +1,276 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "zvfs_hook_dir.h" +#include "zvfs_hook_init.h" +#include "zvfs_hook_reentrant.h" +#include "fs/zvfs.h" +#include "fs/zvfs_inode.h" +#include "fs/zvfs_path_entry.h" + +/* SPDK io engine - blob_delete 声明 */ +#include "../spdk_engine/io_engine.h" + +#include +#include +#include +#include /* RENAME_EXCHANGE, RENAME_NOREPLACE */ + +/* 
------------------------------------------------------------------ */ +/* 内部:执行 unlink 的 zvfs 侧清理 */ +/* ------------------------------------------------------------------ */ + +/* + * zvfs_unlink_path - 对一个确认属于 zvfs 的绝对路径执行清理。 + * + * 调用时机:real_unlink* 已成功返回之后。 + * + * 逻辑: + * 1. 持 path_mu 查 path_cache + * 2. 找到 → 持 inode_mu 查 inode + * 3. 持 inode->mu 检查 ref_count + * - ref_count == 0:直接 blob_delete,inode_remove,inode_free,path_cache_remove + * - ref_count > 0:标记 deleted = true,path_cache_remove + * (inode 和 blob 的清理推迟到 close 路径中 ref_count 归零时) + */ +static void +zvfs_unlink_path(const char *abspath) +{ + /* --- 查 path_cache -------------------------------------------- */ + pthread_mutex_lock(&g_fs.path_mu); + struct zvfs_path_entry *pe = path_cache_lookup(abspath); + if (!pe) { + /* + * 不在缓存里:该文件可能从未被 open 过(没有 inode 对象)。 + * 无内存状态需要清理,直接返回。 + * blob 也不存在(文件从未被 zvfs open 创建),所以安全。 + */ + pthread_mutex_unlock(&g_fs.path_mu); + return; + } + struct zvfs_inode *inode = pe->inode; + pthread_mutex_unlock(&g_fs.path_mu); + + /* --- 持 inode->mu 决策 ---------------------------------------- */ + pthread_mutex_lock(&inode->mu); + int ref = atomic_load(&inode->ref_count); + + if (ref == 0) { + /* + * 没有 fd 打开:立即清理。 + * 顺序:blob_delete → inode_remove(出全局表)→ path_cache_remove + * → inode_free(释放内存) + */ + pthread_mutex_unlock(&inode->mu); + + blob_delete(inode->blob_id); + + pthread_mutex_lock(&g_fs.inode_mu); + inode_remove(inode->blob_id); + pthread_mutex_unlock(&g_fs.inode_mu); + + pthread_mutex_lock(&g_fs.path_mu); + path_cache_remove(abspath); + pthread_mutex_unlock(&g_fs.path_mu); + + inode_free(inode); + + } else { + /* + * 还有 fd 打开:Unix 延迟删除语义。 + * 标记 deleted,让 close 路径在 ref_count 归零时负责 blob_delete。 + * 同时把 path 从缓存里摘掉(路径已从目录树消失)。 + */ + inode->deleted = true; + pthread_mutex_unlock(&inode->mu); + + pthread_mutex_lock(&g_fs.path_mu); + path_cache_remove(abspath); + pthread_mutex_unlock(&g_fs.path_mu); + } +} + +/* 
------------------------------------------------------------------ */ +/* unlink */ +/* ------------------------------------------------------------------ */ + +int +unlink(const char *path) +{ + ZVFS_HOOK_ENTER(); + int ret; + + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { + ret = real_unlink(path); + ZVFS_HOOK_LEAVE(); + return ret; + } + + zvfs_ensure_init(); + + /* 先让真实 FS 删除文件(xattr 随之消失) */ + ret = real_unlink(path); + if (ret == 0) + zvfs_unlink_path(path); + + ZVFS_HOOK_LEAVE(); + return ret; +} + +/* ------------------------------------------------------------------ */ +/* unlinkat */ +/* ------------------------------------------------------------------ */ + +int +unlinkat(int dirfd, const char *path, int flags) +{ + ZVFS_HOOK_ENTER(); + int ret; + + /* + * AT_REMOVEDIR:rmdir 语义,目录由真实 FS 管理,直接透传。 + */ + if (flags & AT_REMOVEDIR) { + ret = real_unlinkat(dirfd, path, flags); + ZVFS_HOOK_LEAVE(); + return ret; + } + + /* 解析绝对路径,判断是否属于 zvfs */ + char abspath[PATH_MAX]; + if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; /* errno already set */ + } + + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(abspath)) { + ret = real_unlinkat(dirfd, path, flags); + ZVFS_HOOK_LEAVE(); + return ret; + } + + zvfs_ensure_init(); + + ret = real_unlinkat(dirfd, path, flags); + if (ret == 0) + zvfs_unlink_path(abspath); + + ZVFS_HOOK_LEAVE(); + return ret; +} + +/* ------------------------------------------------------------------ */ +/* 内部:执行 rename 的 zvfs 侧缓存更新 */ +/* ------------------------------------------------------------------ */ + +/* + * zvfs_rename_paths - 在 real_rename* 成功后更新 path_cache。 + * + * 如果 newpath 原本也在缓存里(覆盖式 rename),其 inode 需要先做 + * unlink 清理(与 zvfs_unlink_path 逻辑相同)。 + */ +static void +zvfs_rename_paths(const char *oldabs, const char *newabs) +{ + /* 处理 newpath 被覆盖的情况 */ + pthread_mutex_lock(&g_fs.path_mu); + struct zvfs_path_entry *victim = path_cache_lookup(newabs); + 
pthread_mutex_unlock(&g_fs.path_mu); + + if (victim) { + /* + * newpath 是 zvfs 文件且已经在缓存里: + * real_rename 已经把它从磁盘上删掉了, + * 走和 unlink 一样的延迟/立即 blob_delete 逻辑。 + */ + zvfs_unlink_path(newabs); + } + + /* 把 oldpath 的缓存条目 rename 到 newpath */ + pthread_mutex_lock(&g_fs.path_mu); + path_cache_rename(oldabs, newabs); + pthread_mutex_unlock(&g_fs.path_mu); +} + +/* ------------------------------------------------------------------ */ +/* rename */ +/* ------------------------------------------------------------------ */ + +int +rename(const char *oldpath, const char *newpath) +{ + ZVFS_HOOK_ENTER(); + int ret; + + int old_is_zvfs = zvfs_is_zvfs_path(oldpath); + int new_is_zvfs = zvfs_is_zvfs_path(newpath); + + if (ZVFS_IN_HOOK() || (!old_is_zvfs && !new_is_zvfs)) { + ret = real_rename(oldpath, newpath); + ZVFS_HOOK_LEAVE(); + return ret; + } + + /* + * 跨域 rename(一个在 /zvfs 一个不在):不支持,返回 EXDEV。 + * 和跨文件系统 rename 的语义一致。 + */ + if (old_is_zvfs != new_is_zvfs) { + errno = EXDEV; + ZVFS_HOOK_LEAVE(); + return -1; + } + + zvfs_ensure_init(); + + ret = real_rename(oldpath, newpath); + if (ret == 0) + zvfs_rename_paths(oldpath, newpath); + + ZVFS_HOOK_LEAVE(); + return ret; +} + +/* ------------------------------------------------------------------ */ +/* renameat */ +/* ------------------------------------------------------------------ */ + +int +renameat(int olddirfd, const char *oldpath, + int newdirfd, const char *newpath) +{ + ZVFS_HOOK_ENTER(); + int ret; + + char oldabs[PATH_MAX], newabs[PATH_MAX]; + + if (zvfs_resolve_atpath(olddirfd, oldpath, oldabs, sizeof(oldabs)) < 0 || + zvfs_resolve_atpath(newdirfd, newpath, newabs, sizeof(newabs)) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + int old_is_zvfs = zvfs_is_zvfs_path(oldabs); + int new_is_zvfs = zvfs_is_zvfs_path(newabs); + + if (ZVFS_IN_HOOK() || (!old_is_zvfs && !new_is_zvfs)) { + ret = real_renameat(olddirfd, oldpath, newdirfd, newpath); + ZVFS_HOOK_LEAVE(); + return ret; + } + + if (old_is_zvfs != new_is_zvfs) { + 
errno = EXDEV; + ZVFS_HOOK_LEAVE(); + return -1; + } + + zvfs_ensure_init(); + + ret = real_renameat(olddirfd, oldpath, newdirfd, newpath); + if (ret == 0) + zvfs_rename_paths(oldabs, newabs); + + ZVFS_HOOK_LEAVE(); + return ret; +} diff --git a/src/hook/zvfs_hook_dir.h b/src/hook/zvfs_hook_dir.h new file mode 100644 index 0000000..ab6ebad --- /dev/null +++ b/src/hook/zvfs_hook_dir.h @@ -0,0 +1,32 @@ +#ifndef __ZVFS_HOOK_DIR_H__ +#define __ZVFS_HOOK_DIR_H__ + +#include + +/* + * 目录操作 hook。 + * + * mkdir / rmdir / opendir / readdir / getdents64 全部透传,不 hook。 + * 只需要感知路径变化的操作才进这里: + * + * unlink / unlinkat + * - 真实文件由 real_unlink 删除 + * - 若路径在 path_cache 中: + * 若 ref_count == 0:blob_delete + inode_remove + path_cache_remove + * 若 ref_count > 0:标记 inode->deleted = true, + * ref_count 归零时(close 路径)再 blob_delete + * + * rename / renameat / renameat2 + * - 真实文件由 real_rename* 移动(xattr 跟随文件,不需要重写) + * - path_cache_rename 更新内存缓存 + * - renameat2 RENAME_EXCHANGE 返回 ENOTSUP + */ + +int unlink(const char *path); +int unlinkat(int dirfd, const char *path, int flags); + +int rename(const char *oldpath, const char *newpath); +int renameat(int olddirfd, const char *oldpath, + int newdirfd, const char *newpath); + +#endif // __ZVFS_HOOK_DIR_H__ \ No newline at end of file diff --git a/src/hook/zvfs_hook_fcntl.c b/src/hook/zvfs_hook_fcntl.c new file mode 100644 index 0000000..14646b9 --- /dev/null +++ b/src/hook/zvfs_hook_fcntl.c @@ -0,0 +1,230 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "zvfs_hook_fcntl.h" +#include "zvfs_hook_init.h" +#include "zvfs_hook_fd.h" /* dup/dup2 路径 */ +#include "zvfs_hook_reentrant.h" +#include "fs/zvfs.h" +#include "fs/zvfs_open_file.h" +#include "fs/zvfs_inode.h" + +#include +#include +#include +#include +#include +#include +#include + +/* ------------------------------------------------------------------ */ +/* 内部:fcntl 核心逻辑(已确认是 zvfs fd) */ +/* ------------------------------------------------------------------ */ + +static int 
+zvfs_fcntl_impl(int fd, int cmd, va_list ap) +{ + pthread_mutex_lock(&g_fs.fd_mu); + struct zvfs_open_file *of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + + if (!of) { errno = EBADF; return -1; } + + switch (cmd) { + + /* ---- 文件状态 flags ------------------------------------------ */ + case F_GETFL: + return of->flags; + + case F_SETFL: { + int newfl = va_arg(ap, int); + /* + * 只允许修改可变位:O_APPEND、O_NONBLOCK、O_ASYNC。 + * O_RDONLY / O_WRONLY / O_RDWR 是 open 时决定的,不能改。 + * 同步给真实 fd,保持内核状态一致(影响 real_read/write)。 + */ + int mutable_mask = O_APPEND | O_NONBLOCK | O_ASYNC; + of->flags = (of->flags & ~mutable_mask) | (newfl & mutable_mask); + /* + * 也透传给真实 fd——虽然真实 fd 上的读写被我们拦截了, + * 但 O_NONBLOCK 可能影响 pipe / socket 等透传路径。 + */ + real_fcntl(fd, F_SETFL, of->flags); + return 0; + } + + /* ---- fd flags(FD_CLOEXEC)----------------------------------- */ + case F_GETFD: + return of->fd_flags; + + case F_SETFD: { + int fdfl = va_arg(ap, int); + of->fd_flags = fdfl; + /* 同步给真实 fd */ + real_fcntl(fd, F_SETFD, fdfl); + return 0; + } + + /* ---- dup 类 -------------------------------------------------- */ + case F_DUPFD: + case F_DUPFD_CLOEXEC: { + (void)va_arg(ap, int); + errno = ENOTSUP; + return -1; + } + + /* ---- 文件锁(不实现,假装无锁)-------------------------------- */ + case F_GETLK: { + struct flock *fl = va_arg(ap, struct flock *); + if (!fl) { errno = EFAULT; return -1; } + fl->l_type = F_UNLCK; /* 假装没有任何锁 */ + return 0; + } + + case F_SETLK: + case F_SETLKW: + (void)va_arg(ap, struct flock *); + return 0; /* 假装加锁成功 */ + + /* ---- 其他 cmd:透传给内核(同时维护真实 fd 状态)-------------- */ + default: { + /* + * 取出可变参数作为 void* 透传。 + * 大多数 fcntl cmd 的第三个参数是 long 或指针, + * 用 void* 接收足够覆盖所有平台(64-bit)。 + */ + void *arg = va_arg(ap, void *); + return real_fcntl(fd, cmd, arg); + } + + } /* switch */ +} + +/* ------------------------------------------------------------------ */ +/* fcntl */ +/* ------------------------------------------------------------------ */ + +int +fcntl(int fd, int 
cmd, ...) +{ + ZVFS_HOOK_ENTER(); + + va_list ap; + va_start(ap, cmd); + + int ret; + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(fd)) { + /* + * 非 zvfs fd:透传。 + * va_list 转发需要用 vfprintf 风格,但 fcntl 没有标准的 + * va_list 版本。用 void* 提取第三参数再透传。 + */ + void *arg = va_arg(ap, void *); + ret = real_fcntl(fd, cmd, arg); + va_end(ap); + ZVFS_HOOK_LEAVE(); + return ret; + } + + zvfs_ensure_init(); + + ret = zvfs_fcntl_impl(fd, cmd, ap); + va_end(ap); + ZVFS_HOOK_LEAVE(); + return ret; +} + +int +fcntl64(int fd, int cmd, ...) +{ + /* + * fcntl64 是 glibc 在 32-bit 系统上的 large-file 变体, + * 语义与 fcntl 相同,直接转发。 + */ + va_list ap; + va_start(ap, cmd); + void *arg = va_arg(ap, void *); + va_end(ap); + + ZVFS_HOOK_ENTER(); + int ret; + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(fd)) { + ret = real_fcntl64 ? real_fcntl64(fd, cmd, arg) + : real_fcntl(fd, cmd, arg); + ZVFS_HOOK_LEAVE(); + return ret; + } + + zvfs_ensure_init(); + + va_list ap2; + va_start(ap2, cmd); + ret = zvfs_fcntl_impl(fd, cmd, ap2); + va_end(ap2); + + ZVFS_HOOK_LEAVE(); + return ret; +} + +/* ------------------------------------------------------------------ */ +/* ioctl */ +/* ------------------------------------------------------------------ */ + +int +ioctl(int fd, unsigned long request, ...) 
+{ + ZVFS_HOOK_ENTER(); + + va_list ap; + va_start(ap, request); + void *arg = va_arg(ap, void *); + va_end(ap); + + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(fd)) { + int ret = real_ioctl(fd, request, arg); + ZVFS_HOOK_LEAVE(); + return ret; + } + + zvfs_ensure_init(); + + int ret = -1; + + switch (request) { + + case FIONREAD: { + /* + * 返回当前可读字节数 = logical_size - cur_offset。 + * 结果写入 arg(int*)。 + */ + pthread_mutex_lock(&g_fs.fd_mu); + struct zvfs_open_file *of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + + if (!of) { errno = EBADF; ret = -1; break; } + + pthread_mutex_lock(&of->inode->mu); + uint64_t size = of->inode->logical_size; + pthread_mutex_unlock(&of->inode->mu); + + uint64_t off = of->offset; + int avail = (off < size) ? (int)(size - off) : 0; + if (arg) *(int *)arg = avail; + ret = 0; + break; + } + + default: + /* + * 其他 ioctl:zvfs 文件不是块设备/字符设备, + * 绝大多数 ioctl 语义不适用,返回 ENOTTY。 + * 若将来需要支持特定 ioctl 在此扩展。 + */ + errno = ENOTTY; + ret = -1; + break; + } + + ZVFS_HOOK_LEAVE(); + return ret; +} diff --git a/src/hook/zvfs_hook_fcntl.h b/src/hook/zvfs_hook_fcntl.h new file mode 100644 index 0000000..28594b3 --- /dev/null +++ b/src/hook/zvfs_hook_fcntl.h @@ -0,0 +1,27 @@ +#ifndef __ZVFS_HOOK_FCNTL_H__ +#define __ZVFS_HOOK_FCNTL_H__ + +/* + * fcntl cmd 处理策略: + * + * F_GETFL → 返回 of->flags + * F_SETFL → 更新 of->flags(只允许改 O_APPEND / O_NONBLOCK) + * F_GETFD → 返回 of->fd_flags + * F_SETFD → 更新 of->fd_flags(FD_CLOEXEC) + * F_DUPFD → 等价于 dup,分配 >= arg 的最小可用 fd,走 dup 路径 + * F_DUPFD_CLOEXEC → 同上,同时设 FD_CLOEXEC + * F_GETLK → 不实现文件锁,返回 l_type = F_UNLCK(假装没有锁) + * F_SETLK → 直接返回 0(假装成功) + * F_SETLKW → 直接返回 0(假装成功,不阻塞) + * 其他 cmd → 透传给 real_fcntl(同时透传给内核,保持真实 fd 状态同步) + * + * ioctl cmd 处理策略: + * FIONREAD → 返回 logical_size - cur_offset(可读字节数) + * 其他 → 透传,或对 zvfs fd 返回 ENOTTY + */ + +int fcntl(int fd, int cmd, ...); +int fcntl64(int fd, int cmd, ...); +int ioctl(int fd, unsigned long request, ...); + +#endif // __ZVFS_HOOK_FCNTL_H__ \ No newline at end of file 
diff --git a/src/hook/zvfs_hook_fd.c b/src/hook/zvfs_hook_fd.c
new file mode 100644
index 0000000..782eff4
--- /dev/null
+++ b/src/hook/zvfs_hook_fd.c
@@ -0,0 +1,549 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "zvfs_hook_fd.h"
+#include "zvfs_hook_init.h"
+#include "zvfs_hook_reentrant.h"
+#include "fs/zvfs.h"
+#include "fs/zvfs_inode.h"
+#include "fs/zvfs_path_entry.h"
+#include "fs/zvfs_open_file.h"
+#include "spdk_engine/io_engine.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <limits.h>   /* INT_MAX, PATH_MAX */
+#include <unistd.h>   /* sysconf(_SC_OPEN_MAX) used by close_range */
+
+/* Fallback for libc headers that predate close_range(2). */
+#ifndef CLOSE_RANGE_CLOEXEC
+#define CLOSE_RANGE_CLOEXEC (1U << 2)
+#endif
+
+/* ------------------------------------------------------------------ */
+/* Internal: inode reference handling shared by open and close         */
+/* ------------------------------------------------------------------ */
+
+/*
+ * zvfs_inode_put - drop one reference on an inode.
+ *
+ * When the last reference goes away the inode is torn down:
+ *   - a blob whose inode is marked deleted is removed (deferred
+ *     blob_delete);
+ *   - the inode leaves the global inode table;
+ *   - for a non-deleted inode the path_cache entry pointing at it is
+ *     purged so that no dangling pointer is left behind (the unlink path
+ *     is expected to have removed it already for deleted inodes);
+ *   - the inode itself is freed.
+ *
+ * Must be called without inode_mu / path_mu / inode->mu held.
+ */
+static void
+zvfs_inode_put(struct zvfs_inode *inode)
+{
+    if (atomic_fetch_sub(&inode->ref_count, 1) - 1 != 0)
+        return;
+
+    bool do_delete = false;
+    pthread_mutex_lock(&inode->mu);
+    do_delete = inode->deleted;
+    pthread_mutex_unlock(&inode->mu);
+
+    if (do_delete)
+        blob_delete(inode->blob_id);
+
+    pthread_mutex_lock(&g_fs.inode_mu);
+    inode_remove(inode->blob_id);
+    pthread_mutex_unlock(&g_fs.inode_mu);
+
+    if (!do_delete) {
+        /*
+         * One inode maps to one path (hard links are not supported), so
+         * the scan stops at the first entry that references this inode.
+         */
+        pthread_mutex_lock(&g_fs.path_mu);
+        struct zvfs_path_entry *pe, *tmp; (void)tmp;
+        HASH_ITER(hh, g_fs.path_cache, pe, tmp) {
+            if (pe->inode == inode) {
+                HASH_DEL(g_fs.path_cache, pe);
+                free(pe->path);
+                free(pe);
+                break;
+            }
+        }
+        pthread_mutex_unlock(&g_fs.path_mu);
+    }
+
+    inode_free(inode);
+}
+
+/* ------------------------------------------------------------------ */
+/* Internal: core of open (path already known to belong to zvfs)       */
+/* ------------------------------------------------------------------ */
+
+/**
+ * zvfs_open_impl - attach an already opened real fd to its zvfs blob.
+ *
+ * real_fd : descriptor returned by real_open*(); used for the blob-id
+ *           xattr, fstat and ftruncate.
+ * abspath : path supplied by the caller; used as the path_cache key.
+ * flags   : flags passed to open().
+ * mode    : permission bits passed with O_CREAT (0 falls back to 0666).
+ *
+ * Behaviour:
+ *   - O_CREAT and the file carries no blob id yet: create a blob, store
+ *     its id in the xattr, reset the real file size to 0 and publish a
+ *     new inode.
+ *   - O_CREAT and the file already carries a blob id (O_CREAT without
+ *     O_EXCL on an existing file): the existing blob is opened; it is
+ *     never replaced by a fresh one.
+ *   - no O_CREAT and no blob id: the file is not managed by zvfs and
+ *     real_fd is returned without any wrapping.
+ *
+ * Returns real_fd on success (the fd handed to the application), -1 with
+ * errno set on failure. On failure the caller must real_close(real_fd).
+ * Every failure path releases the blob handle and the inode reference
+ * taken by this call; a blob that was created but never recorded in the
+ * xattr is deleted again.
+ */
+static int
+zvfs_open_impl(int real_fd, const char *abspath, int flags, mode_t mode)
+{
+    struct zvfs_inode *inode = NULL;
+    struct zvfs_blob_handle *handle = NULL;
+    struct zvfs_open_file *of = NULL;
+    uint64_t blob_id = 0;
+    bool have_blob_id = false;  /* blob_id was read from the xattr       */
+    bool blob_orphan = false;   /* blob created, xattr not yet written   */
+    bool ref_held = false;      /* this call owns one inode reference    */
+    int saved_errno;
+
+    /*
+     * O_CREAT also opens files that exist already. Probe the xattr first
+     * so that an existing blob is reused instead of being clobbered.
+     */
+    if (flags & O_CREAT)
+        have_blob_id = !(zvfs_xattr_read_blob_id(real_fd, &blob_id) < 0);
+
+    if ((flags & O_CREAT) && !have_blob_id) {
+        /* ---- creation path --------------------------------------- */
+
+        /* 1. create the blob */
+        handle = blob_create(0);
+        if (!handle) { errno = EIO; goto fail; }
+        blob_id = handle->id;
+        blob_orphan = true;
+
+        /* 2. record the blob id in the xattr of the real file */
+        if (zvfs_xattr_write_blob_id(real_fd, blob_id) < 0) goto fail;
+        blob_orphan = false;
+
+        /* 3. logical_size = 0, keep st_size at 0 as well */
+        if (real_ftruncate(real_fd, 0) < 0) goto fail;
+
+        /* 4. allocate the inode; the new inode's reference is ours */
+        inode = inode_alloc(blob_id, mode ? mode : 0666, ZVFS_ITYPE_FILE);
+        if (!inode) { errno = ENOMEM; goto fail; }
+        ref_held = true;
+
+        /* 5. publish it in the global inode table */
+        pthread_mutex_lock(&g_fs.inode_mu);
+        inode_insert(inode);
+        pthread_mutex_unlock(&g_fs.inode_mu);
+
+        /* 6. publish it in the path_cache */
+        pthread_mutex_lock(&g_fs.path_mu);
+        path_cache_insert(abspath, inode);
+        pthread_mutex_unlock(&g_fs.path_mu);
+
+    } else {
+        /* ---- open of an existing file ----------------------------- */
+
+        /*
+         * 1. path_cache first: a hit means another fd has this path open.
+         *    The reference is taken while path_mu is still held.
+         */
+        pthread_mutex_lock(&g_fs.path_mu);
+        struct zvfs_path_entry *pe = path_cache_lookup(abspath);
+        if (pe) {
+            inode = pe->inode;
+            atomic_fetch_add(&inode->ref_count, 1);
+            ref_held = true;
+        }
+        pthread_mutex_unlock(&g_fs.path_mu);
+
+        if (inode) {
+            /* cache hit: share the inode, open a fresh blob handle */
+            blob_id = inode->blob_id;
+        } else {
+            /* miss: read the blob id from the xattr (first open here) */
+            if (!have_blob_id &&
+                zvfs_xattr_read_blob_id(real_fd, &blob_id) < 0) {
+                /* no xattr: not a zvfs-managed file, plain pass-through */
+                return real_fd;
+            }
+
+            /* the inode may be open already under an uncached path */
+            pthread_mutex_lock(&g_fs.inode_mu);
+            inode = inode_lookup(blob_id);
+            if (inode) {
+                atomic_fetch_add(&inode->ref_count, 1);
+                ref_held = true;
+            }
+            pthread_mutex_unlock(&g_fs.inode_mu);
+
+            if (!inode) {
+                /* brand-new inode: mode and size come from the real file */
+                struct stat st;
+                if (real_fstat(real_fd, &st) < 0) goto fail;
+
+                inode = inode_alloc(blob_id, st.st_mode, ZVFS_ITYPE_FILE);
+                if (!inode) { errno = ENOMEM; goto fail; }
+                ref_held = true;
+                inode->logical_size = (uint64_t)st.st_size;
+
+                pthread_mutex_lock(&g_fs.inode_mu);
+                inode_insert(inode);
+                pthread_mutex_unlock(&g_fs.inode_mu);
+
+                pthread_mutex_lock(&g_fs.path_mu);
+                path_cache_insert(abspath, inode);
+                pthread_mutex_unlock(&g_fs.path_mu);
+            }
+        }
+
+        handle = blob_open(blob_id);
+        if (!handle) { errno = EIO; goto fail; }
+
+        /*
+         * O_TRUNC: real_open*() has already cut the real file to 0 bytes;
+         * keep the cached logical size in step with st_size.
+         * NOTE(review): blob space is not released here - confirm whether
+         * the truncate hook does more than update logical_size.
+         */
+        if (flags & O_TRUNC) {
+            pthread_mutex_lock(&inode->mu);
+            inode->logical_size = 0;
+            pthread_mutex_unlock(&inode->mu);
+        }
+    }
+
+    /* ---- allocate the openfile and register it in the fd table ----- */
+    of = openfile_alloc(real_fd, inode, flags, handle);
+    if (!of) { errno = ENOMEM; goto fail; }
+
+    pthread_mutex_lock(&g_fs.fd_mu);
+    openfile_insert(of);
+    pthread_mutex_unlock(&g_fs.fd_mu);
+
+    return real_fd;
+
+fail:
+    saved_errno = errno;
+    if (handle)
+        blob_close(handle);
+    /* drops only our own reference; a shared inode stays alive */
+    if (ref_held)
+        zvfs_inode_put(inode);
+    /* a blob nothing refers to would otherwise leak */
+    if (blob_orphan)
+        blob_delete(blob_id);
+    errno = saved_errno;
+    return -1;
+}
+
+/* ------------------------------------------------------------------ */
+/* open                                                                */
+/* ------------------------------------------------------------------ */
+
+/*
+ * open - hook for open(2).
+ *
+ * Re-entrant calls and paths outside the zvfs mount go to real_open().
+ * For zvfs paths the real file is opened first and then attached to its
+ * blob by zvfs_open_impl(); if that fails the real fd is closed again and
+ * errno from zvfs_open_impl() is preserved.
+ */
+int
+open(const char *path, int flags, ...)
+{
+    ZVFS_HOOK_ENTER();
+
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list ap;
+        va_start(ap, flags);
+        mode = (mode_t)va_arg(ap, unsigned int);
+        va_end(ap);
+    }
+
+    int ret;
+    if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) {
+        ret = real_open(path, flags, mode);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    zvfs_ensure_init();
+
+    /* let the real FS create / open the file first (yields real_fd) */
+    int real_fd = real_open(path, flags, mode);
+    if (real_fd < 0) { ZVFS_HOOK_LEAVE(); return -1; }
+
+    ret = zvfs_open_impl(real_fd, path, flags, mode);
+    if (ret < 0) {
+        int saved = errno;
+        real_close(real_fd);
+        errno = saved;
+    }
+
+    ZVFS_HOOK_LEAVE();
+    return ret;
+}
+
+/* open64 - same as open() with O_LARGEFILE added. */
+int open64(const char *path, int flags, ...)
+{
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list ap; va_start(ap, flags);
+        mode = (mode_t)va_arg(ap, unsigned int);
+        va_end(ap);
+    }
+    return open(path, flags | O_LARGEFILE, mode);
+}
+
+/* ------------------------------------------------------------------ */
+/* openat                                                              */
+/* ------------------------------------------------------------------ */
+
+/*
+ * openat - hook for openat(2).
+ *
+ * Re-entrant calls are forwarded to real_openat() before any path
+ * resolution, so internal opens cannot fail because of a resolution
+ * error. Otherwise dirfd + path is resolved to an absolute path to decide
+ * whether the target lies under the zvfs mount; a resolution failure
+ * returns -1 with errno from zvfs_resolve_atpath().
+ */
+int
+openat(int dirfd, const char *path, int flags, ...)
+{
+    ZVFS_HOOK_ENTER();
+
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list ap; va_start(ap, flags);
+        mode = (mode_t)va_arg(ap, unsigned int);
+        va_end(ap);
+    }
+
+    int ret;
+    if (ZVFS_IN_HOOK()) {
+        ret = real_openat(dirfd, path, flags, mode);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    /* resolve to an absolute path to decide whether zvfs owns it */
+    char abspath[PATH_MAX];
+    if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) < 0) {
+        ZVFS_HOOK_LEAVE();
+        return -1;
+    }
+
+    if (!zvfs_is_zvfs_path(abspath)) {
+        ret = real_openat(dirfd, path, flags, mode);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    zvfs_ensure_init();
+
+    int real_fd = real_openat(dirfd, path, flags, mode);
+    if (real_fd < 0) { ZVFS_HOOK_LEAVE(); return -1; }
+
+    ret = zvfs_open_impl(real_fd, abspath, flags, mode);
+    if (ret < 0) {
+        int saved = errno;
+        real_close(real_fd);
+        errno = saved;
+    }
+
+    ZVFS_HOOK_LEAVE();
+    return ret;
+}
+
+/* openat64 - same as openat() with O_LARGEFILE added. */
+int openat64(int dirfd, const char *path, int flags, ...)
+{
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list ap; va_start(ap, flags);
+        mode = (mode_t)va_arg(ap, unsigned int);
+        va_end(ap);
+    }
+    return openat(dirfd, path, flags | O_LARGEFILE, mode);
+}
+
+/* ------------------------------------------------------------------ */
+/* creat                                                               */
+/* ------------------------------------------------------------------ */
+
+/* creat - open(path, O_CREAT | O_WRONLY | O_TRUNC, mode). */
+int creat(const char *path, mode_t mode)
+{
+    return open(path, O_CREAT | O_WRONLY | O_TRUNC, mode);
+}
+
+/* creat64 - creat() with O_LARGEFILE added. */
+int creat64(const char *path, mode_t mode)
+{
+    return open(path, O_CREAT | O_WRONLY | O_TRUNC | O_LARGEFILE, mode);
+}
+
+/* ------------------------------------------------------------------ */
+/* glibc aliases (forward to the hooks above)                          */
+/* ------------------------------------------------------------------ */
+
+int __open(const char *path, int flags, ...)
+{
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list ap; va_start(ap, flags);
+        mode = (mode_t)va_arg(ap, unsigned int);
+        va_end(ap);
+    }
+    return open(path, flags, mode);
+}
+
+int __open64(const char *path, int flags, ...)
+{
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list ap; va_start(ap, flags);
+        mode = (mode_t)va_arg(ap, unsigned int);
+        va_end(ap);
+    }
+    return open64(path, flags, mode);
+}
+
+int __libc_open(const char *path, int flags, ...)
+{
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list ap; va_start(ap, flags);
+        mode = (mode_t)va_arg(ap, unsigned int);
+        va_end(ap);
+    }
+    return open(path, flags, mode);
+}
+
+/* ------------------------------------------------------------------ */
+/* close                                                               */
+/* ------------------------------------------------------------------ */
+
+/*
+ * zvfs_close_impl - close a zvfs fd.
+ *
+ * Takes fd_mu itself (the caller must NOT hold it) to look the openfile
+ * up and drop one reference; the lock is released before the blob handle
+ * and the inode are dealt with.
+ *
+ * Returns the result of real_close(fd), or -1 with errno = EBADF when fd
+ * is not in the fd table.
+ */
+static int
+zvfs_close_impl(int fd)
+{
+    /* under fd_mu: find the openfile and unlink it on the last reference */
+    pthread_mutex_lock(&g_fs.fd_mu);
+    struct zvfs_open_file *of = openfile_lookup(fd);
+    if (!of) {
+        pthread_mutex_unlock(&g_fs.fd_mu);
+        errno = EBADF;
+        return -1;
+    }
+    int new_ref = atomic_fetch_sub(&of->ref_count, 1) - 1;
+    if (new_ref == 0)
+        openfile_remove(fd);
+    pthread_mutex_unlock(&g_fs.fd_mu);
+
+    if (new_ref > 0) {
+        /*
+         * Other dup'ed fds still share this openfile: close only the
+         * real fd and leave blob and inode alone.
+         */
+        return real_close(fd);
+    }
+
+    /* ---- last openfile reference: close the blob handle ------------ */
+    struct zvfs_inode *inode = of->inode;
+    struct zvfs_blob_handle *handle = of->handle;
+    openfile_free(of);
+
+    blob_close(handle);
+
+    /* ---- drop the inode reference (tears it down when it was last) -- */
+    zvfs_inode_put(inode);
+
+    return real_close(fd);
+}
+
+/*
+ * close - hook for close(2): zvfs fds go through zvfs_close_impl(),
+ * everything else (and re-entrant calls) through real_close().
+ */
+int
+close(int fd)
+{
+    ZVFS_HOOK_ENTER();
+
+    int ret;
+    int is_zvfs_fd = (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(fd));
+    if (!is_zvfs_fd) {
+        ret = real_close(fd);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    zvfs_ensure_init();
+
+    ret = zvfs_close_impl(fd);
+    ZVFS_HOOK_LEAVE();
+    return ret;
+}
+
+int __close(int fd) { return close(fd); }
+int __libc_close(int fd) { return close(fd); }
+
+/* ------------------------------------------------------------------ */
+/* close_range                                                         */
+/* ------------------------------------------------------------------ */
+
+/*
+ * close_range - hook for close_range(2).
+ *
+ * zvfs fds inside [first, last] are closed through zvfs_close_impl() so
+ * that blob and inode bookkeeping runs; the remaining descriptors are
+ * left to real_close_range(), or closed one by one when libc does not
+ * provide it.
+ *
+ * - CLOSE_RANGE_CLOEXEC closes nothing, so the request is forwarded
+ *   untouched and zvfs state is kept.
+ * - The scan for zvfs fds is capped at the descriptor limit reported by
+ *   sysconf(_SC_OPEN_MAX), so close_range(3, ~0U, 0) terminates.
+ *   Descriptors numbered above the current limit (possible only if the
+ *   limit was lowered after they were opened) are not scanned.
+ *
+ * Returns 0 on success, -1 if any step failed.
+ */
+int
+close_range(unsigned int first, unsigned int last, int flags)
+{
+    ZVFS_HOOK_ENTER();
+
+    if (ZVFS_IN_HOOK()) {
+        int ret = real_close_range ? real_close_range(first, last, flags)
+                                   : (errno = ENOSYS, -1);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    if (first > last) {
+        errno = EINVAL;
+        ZVFS_HOOK_LEAVE();
+        return -1;
+    }
+
+    if ((unsigned int)flags & CLOSE_RANGE_CLOEXEC) {
+        /* only the close-on-exec flag changes: nothing to tear down */
+        int ret = real_close_range ? real_close_range(first, last, flags)
+                                   : (errno = ENOSYS, -1);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    /* highest descriptor number worth scanning */
+    unsigned int scan_last = last;
+    long open_max = sysconf(_SC_OPEN_MAX);
+    if (open_max > 0 && (unsigned long)(open_max - 1) < scan_last)
+        scan_last = (unsigned int)(open_max - 1);
+    if (scan_last > (unsigned int)INT_MAX)
+        scan_last = (unsigned int)INT_MAX;
+
+    int any_err = 0;
+    int inited = 0;
+    for (unsigned int fd = first; fd <= scan_last; fd++) {
+        if (zvfs_is_zvfs_fd((int)fd)) {
+            if (!inited) {
+                zvfs_ensure_init();
+                inited = 1;
+            }
+            if (zvfs_close_impl((int)fd) < 0) any_err = 1;
+        }
+    }
+
+    /* the kernel handles every remaining (non-zvfs) descriptor */
+    if (real_close_range) {
+        if (real_close_range(first, last, flags) < 0 && !any_err)
+            any_err = 1;
+    } else {
+        /* fallback: close the non-zvfs descriptors one at a time */
+        for (unsigned int fd = first; fd <= scan_last; fd++) {
+            if (!zvfs_is_zvfs_fd((int)fd))
+                real_close((int)fd);
+        }
+    }
+
+    ZVFS_HOOK_LEAVE();
+    return any_err ? -1 : 0;
+}
+
+/* ------------------------------------------------------------------ */
+/* dup                                                                 */
+/* ------------------------------------------------------------------ */
+
+/*
+ * dup - hook for dup(2). Non-zvfs fds and re-entrant calls are forwarded
+ * to real_dup(); a zvfs fd fails with ENOTSUP.
+ */
+int
+dup(int oldfd)
+{
+    ZVFS_HOOK_ENTER();
+
+    int is_zvfs_fd = (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(oldfd));
+    if (!is_zvfs_fd) {
+        int ret = real_dup(oldfd);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    /*
+     * dup on a zvfs fd is not supported in this version. Fail explicitly
+     * with ENOTSUP rather than expose wrong offset semantics.
+     */
+    zvfs_ensure_init();
+    errno = ENOTSUP;
+    ZVFS_HOOK_LEAVE();
+    return -1;
+}
+
+/* ------------------------------------------------------------------ */
+/* dup2                                                                */
+/* ------------------------------------------------------------------ */
+
+/*
+ * dup2 - hook for dup2(2). Non-zvfs oldfd and re-entrant calls are
+ * forwarded to real_dup2(); for a zvfs oldfd, oldfd == newfd returns
+ * oldfd and anything else fails with ENOTSUP.
+ */
+int
+dup2(int oldfd, int newfd)
+{
+    ZVFS_HOOK_ENTER();
+
+    int is_zvfs_fd = (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(oldfd));
+    if (!is_zvfs_fd) {
+        int ret = real_dup2(oldfd, newfd);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    /* POSIX: dup2(oldfd, oldfd) on a valid fd simply returns oldfd. */
+    if (oldfd == newfd) {
+        ZVFS_HOOK_LEAVE();
+        return oldfd;
+    }
+
+    zvfs_ensure_init();
+    errno = ENOTSUP;
+    ZVFS_HOOK_LEAVE();
+    return -1;
+}
+
+/*
------------------------------------------------------------------ */
+/* dup3
+ *
+ * Hook for dup3(2). When ZVFS_IN_HOOK() is true or oldfd is not a zvfs
+ * fd the call is forwarded to real_dup3(). For a zvfs oldfd nothing is
+ * duplicated: oldfd == newfd fails with EINVAL, every other request
+ * fails with ENOTSUP.
+ */
+/* ------------------------------------------------------------------ */
+
+int
+dup3(int oldfd, int newfd, int flags)
+{
+    ZVFS_HOOK_ENTER();
+
+    int is_zvfs_fd = (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(oldfd));
+    if (!is_zvfs_fd) {
+        int ret = real_dup3(oldfd, newfd, flags);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    if (oldfd == newfd) {
+        errno = EINVAL;
+        ZVFS_HOOK_LEAVE();
+        return -1;
+    }
+
+    zvfs_ensure_init();
+    errno = ENOTSUP;
+    ZVFS_HOOK_LEAVE();
+    return -1;
+}
diff --git a/src/hook/zvfs_hook_fd.h b/src/hook/zvfs_hook_fd.h
new file mode 100644
index 0000000..bbb18a4
--- /dev/null
+++ b/src/hook/zvfs_hook_fd.h
@@ -0,0 +1,51 @@
+#ifndef __ZVFS_HOOK_FD_H__
+#define __ZVFS_HOOK_FD_H__
+
+#include
+#include
+#include
+
+/**
+ * open / creat:
+ *   zvfs path + O_CREAT   -> blob_create + xattr write + inode_alloc + openfile_alloc
+ *   zvfs path, no O_CREAT -> xattr read of the blob id + blob_open
+ *                            + inode_alloc (when not cached) + openfile_alloc
+ *   non-zvfs path         -> pass-through
+ *
+ * close:
+ *   zvfs fd     -> openfile ref_count--
+ *                  at zero: blob_close, then inode ref_count--
+ *                  (at zero: blob_delete if inode->deleted, otherwise the
+ *                  path_cache entry is purged; inode_free)
+ *                  real_close
+ *   non-zvfs fd -> pass-through
+ *
+ * dup / dup2 / dup3:
+ *   zvfs fd     -> not supported by the current implementation: the call
+ *                  fails with ENOTSUP and no fd is created. Exceptions:
+ *                  dup2(fd, fd) returns fd, dup3(fd, fd, ...) fails with
+ *                  EINVAL.
+ *   non-zvfs fd -> pass-through
+ */
+
+/* open family */
+int open(const char *path, int flags, ...);
+int open64(const char *path, int flags, ...);
+int openat(int dirfd, const char *path, int flags, ...);
+int openat64(int dirfd, const char *path, int flags, ...);
+int creat(const char *path, mode_t mode);
+int creat64(const char *path, mode_t mode);
+
+/* close family */
+int close(int fd);
+int close_range(unsigned int first, unsigned int last, int flags);
+
+/* dup family */
+int dup(int oldfd);
+int dup2(int oldfd, int newfd);
+int dup3(int oldfd, int newfd, int flags);
+
+/* glibc-internal aliases (they forward to the open/close hooks) */
+int __open(const char *path, int
flags, ...);
+int __open64(const char *path, int flags, ...);
+int __libc_open(const char *path, int flags, ...);
+int __close(int fd);
+int __libc_close(int fd);
+
+#endif // __ZVFS_HOOK_FD_H__
\ No newline at end of file
diff --git a/src/hook/zvfs_hook_init.c b/src/hook/zvfs_hook_init.c
new file mode 100644
index 0000000..3f5a0e6
--- /dev/null
+++ b/src/hook/zvfs_hook_init.c
@@ -0,0 +1,298 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "zvfs_hook_init.h"
+#include "zvfs_hook_reentrant.h"
+#include "fs/zvfs.h"
+#include "fs/zvfs_open_file.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* ------------------------------------------------------------------ */
+/* Definition of the thread-local re-entrancy counter */
+/* ------------------------------------------------------------------ */
+__thread int _zvfs_hook_depth = 0;
+
+/* ------------------------------------------------------------------ */
+/* zvfs mount point */
+/* ------------------------------------------------------------------ */
+#define ZVFS_MOUNT_PREFIX "/zvfs"
+#define ZVFS_MOUNT_PREFIX_LEN 5 /* strlen("/zvfs") */
+
+/* ------------------------------------------------------------------ */
+/* Definitions of the real_* function pointers.
+ * All start out NULL and are filled in by zvfs_hook_init() via
+ * dlsym(RTLD_NEXT, ...). */
+/* ------------------------------------------------------------------ */
+
+/* open / close / dup */
+int (*real_open)(const char *, int, ...) = NULL;
+int (*real_open64)(const char *, int, ...) = NULL;
+int (*real_openat)(int, const char *, int, ...) = NULL;
+int (*real_openat64)(int, const char *, int, ...) = NULL;
+int (*real_creat)(const char *, mode_t) = NULL;
+int (*real_creat64)(const char *, mode_t) = NULL;
+int (*real_close)(int) = NULL;
+int (*real_close_range)(unsigned, unsigned, unsigned) = NULL;
+int (*real_dup)(int) = NULL;
+int (*real_dup2)(int, int) = NULL;
+int (*real_dup3)(int, int, int) = NULL;
+
+/* read */
+ssize_t (*real_read)(int, void *, size_t) = NULL;
+ssize_t (*real_pread)(int, void *, size_t, off_t) = NULL;
+ssize_t (*real_pread64)(int, void *, size_t, off_t) = NULL;
+ssize_t (*real_readv)(int, const struct iovec *, int) = NULL;
+ssize_t (*real_preadv)(int, const struct iovec *, int, off_t) = NULL;
+ssize_t (*real_preadv64)(int, const struct iovec *, int, off_t) = NULL;
+ssize_t (*real_preadv2)(int, const struct iovec *, int, off_t, int) = NULL;
+
+/* write */
+ssize_t (*real_write)(int, const void *, size_t) = NULL;
+ssize_t (*real_pwrite)(int, const void *, size_t, off_t) = NULL;
+ssize_t (*real_pwrite64)(int, const void *, size_t, off_t) = NULL;
+ssize_t (*real_writev)(int, const struct iovec *, int) = NULL;
+ssize_t (*real_pwritev)(int, const struct iovec *, int, off_t) = NULL;
+ssize_t (*real_pwritev64)(int, const struct iovec *, int, off_t) = NULL;
+ssize_t (*real_pwritev2)(int, const struct iovec *, int, off_t, int) = NULL;
+
+/* lseek / truncate / fallocate */
+off_t (*real_lseek)(int, off_t, int) = NULL;
+off_t (*real_lseek64)(int, off_t, int) = NULL;
+int (*real_truncate)(const char *, off_t) = NULL;
+int (*real_truncate64)(const char *, off_t) = NULL;
+int (*real_ftruncate)(int, off_t) = NULL;
+int (*real_ftruncate64)(int, off_t) = NULL;
+int (*real_fallocate)(int, int, off_t, off_t) = NULL;
+int (*real_posix_fallocate)(int, off_t, off_t) = NULL;
+
+/* stat */
+int (*real_stat)(const char *, struct stat *) = NULL;
+int (*real_stat64)(const char *, struct stat64 *) = NULL;
+int (*real_fstat)(int, struct stat *) = NULL;
+int (*real_fstat64)(int, struct stat64 *) = NULL;
+int (*real_lstat)(const char *, struct stat *) = NULL;
+int (*real_lstat64)(const char *, struct stat64 *) = NULL;
+int (*real_fstatat)(int, const char *, struct stat *, int) = NULL;
+int (*real_fstatat64)(int, const char *, struct stat64 *, int) = NULL;
+int (*real_statx)(int, const char *, int, unsigned int,
+                  struct statx *) = NULL;
+
+/* sync */
+int (*real_fsync)(int) = NULL;
+int (*real_fdatasync)(int) = NULL;
+int (*real_sync_file_range)(int, off_t, off_t, unsigned int) = NULL;
+
+/* fcntl / ioctl */
+int (*real_fcntl)(int, int, ...) = NULL;
+int (*real_fcntl64)(int, int, ...) = NULL;
+int (*real_ioctl)(int, unsigned long, ...) = NULL;
+
+/* directory-level operations */
+int (*real_unlink)(const char *) = NULL;
+int (*real_unlinkat)(int, const char *, int) = NULL;
+int (*real_rename)(const char *, const char *) = NULL;
+int (*real_renameat)(int, const char *, int, const char *) = NULL;
+int (*real_renameat2)(int, const char *, int, const char *,
+                      unsigned int) = NULL;
+
+/* mmap */
+void *(*real_mmap)(void *, size_t, int, int, int, off_t) = NULL;
+void *(*real_mmap64)(void *, size_t, int, int, int, off_t) = NULL;
+int (*real_munmap)(void *, size_t) = NULL;
+int (*real_msync)(void *, size_t, int) = NULL;
+
+/* fork */
+pid_t (*real_fork)(void) = NULL;
+pid_t (*real_vfork)(void) = NULL;
+
+/* glibc aliases */
+int (*real___open)(const char *, int, ...) = NULL;
+int (*real___open64)(const char *, int, ...) = NULL;
+int (*real___libc_open)(const char *, int, ...) = NULL;
+ssize_t (*real___read)(int, void *, size_t) = NULL;
+ssize_t (*real___libc_read)(int, void *, size_t) = NULL;
+ssize_t (*real___write)(int, const void *, size_t) = NULL;
+ssize_t (*real___libc_write)(int, const void *, size_t) = NULL;
+int (*real___close)(int) = NULL;
+int (*real___libc_close)(int) = NULL;
+
+/* ------------------------------------------------------------------ */
+/* dlsym helper macros */
+/* ------------------------------------------------------------------ */
+
+/*
+ * A missing symbol is not fatal: some glibc-internal aliases do not exist
+ * on every distribution. The pointer stays NULL, so a hook has to check
+ * for NULL before calling through it and fall back otherwise.
+ *
+ * LOAD_SYM prints a warning to stderr when the lookup fails;
+ * LOAD_SYM_OPTIONAL stays silent.
+ */
+#define LOAD_SYM(var, name) \
+    do { \
+        (var) = dlsym(RTLD_NEXT, (name)); \
+        if (!(var)) \
+            fprintf(stderr, "[zvfs] WARNING: dlsym(%s) = NULL\n", (name)); \
+    } while (0)
+
+#define LOAD_SYM_OPTIONAL(var, name) \
+    do { (var) = dlsym(RTLD_NEXT, (name)); } while (0)
+
+/* ------------------------------------------------------------------ */
+/* Initialisation.
+ * zvfs_hook_init() is marked as a constructor; it resolves every real_*
+ * pointer with dlsym(RTLD_NEXT, ...) and then calls zvfs_fs_init(). */
+/* ------------------------------------------------------------------ */
+
+__attribute__((constructor))
+void zvfs_hook_init(void)
+{
+    /* symbols that are expected to exist (a warning is printed if not) */
+    LOAD_SYM(real_open, "open");
+    LOAD_SYM(real_open64, "open64");
+    LOAD_SYM(real_openat, "openat");
+    LOAD_SYM(real_openat64, "openat64");
+    LOAD_SYM(real_creat, "creat");
+    LOAD_SYM(real_creat64, "creat64");
+    LOAD_SYM(real_close, "close");
+    LOAD_SYM(real_dup, "dup");
+    LOAD_SYM(real_dup2, "dup2");
+    LOAD_SYM(real_dup3, "dup3");
+
+    LOAD_SYM(real_read, "read");
+    LOAD_SYM(real_pread, "pread");
+    LOAD_SYM(real_pread64, "pread64");
+    LOAD_SYM(real_readv, "readv");
+    LOAD_SYM(real_preadv, "preadv");
+    LOAD_SYM(real_preadv64, "preadv64");
+    LOAD_SYM(real_write, "write");
+    LOAD_SYM(real_pwrite, "pwrite");
+    LOAD_SYM(real_pwrite64, "pwrite64");
+    LOAD_SYM(real_writev, "writev");
+    LOAD_SYM(real_pwritev, "pwritev");
+    LOAD_SYM(real_pwritev64, "pwritev64");
+
+    LOAD_SYM(real_lseek, "lseek");
+    LOAD_SYM(real_lseek64, "lseek64");
+    LOAD_SYM(real_truncate, "truncate");
+    LOAD_SYM(real_truncate64, "truncate64");
+    LOAD_SYM(real_ftruncate, "ftruncate");
+    LOAD_SYM(real_ftruncate64, "ftruncate64");
+    LOAD_SYM(real_fallocate, "fallocate");
+    LOAD_SYM(real_posix_fallocate,"posix_fallocate");
+
+    LOAD_SYM(real_stat, "stat");
+    LOAD_SYM(real_stat64, "stat64");
+    LOAD_SYM(real_fstat, "fstat");
+    LOAD_SYM(real_fstat64, "fstat64");
+    LOAD_SYM(real_lstat, "lstat");
+    LOAD_SYM(real_lstat64, "lstat64");
+    LOAD_SYM(real_fstatat, "fstatat");
+    LOAD_SYM(real_fstatat64, "fstatat64");
+    LOAD_SYM(real_fsync, "fsync");
+    LOAD_SYM(real_fdatasync, "fdatasync");
+    LOAD_SYM(real_fcntl, "fcntl");
+    LOAD_SYM(real_fcntl64, "fcntl64");
+    LOAD_SYM(real_ioctl, "ioctl");
+
+    LOAD_SYM(real_unlink, "unlink");
+    LOAD_SYM(real_unlinkat, "unlinkat");
+    LOAD_SYM(real_rename, "rename");
+    LOAD_SYM(real_renameat, "renameat");
+    LOAD_SYM(real_mmap, "mmap");
+    LOAD_SYM(real_mmap64, "mmap64");
+    LOAD_SYM(real_munmap, "munmap");
+    LOAD_SYM(real_msync, "msync");
+    LOAD_SYM(real_fork, "fork");
+    LOAD_SYM(real_vfork, "vfork");
+
+    /* optional symbols (e.g. glibc-internal aliases): may be absent, no warning */
+    LOAD_SYM_OPTIONAL(real_close_range, "close_range");
+    LOAD_SYM_OPTIONAL(real_preadv2, "preadv2");
+    LOAD_SYM_OPTIONAL(real_pwritev2, "pwritev2");
+    LOAD_SYM_OPTIONAL(real_statx, "statx");
+    LOAD_SYM_OPTIONAL(real_sync_file_range,"sync_file_range");
+    LOAD_SYM_OPTIONAL(real_renameat2, "renameat2");
+    LOAD_SYM_OPTIONAL(real___open, "__open");
+    LOAD_SYM_OPTIONAL(real___open64, "__open64");
+    LOAD_SYM_OPTIONAL(real___libc_open, "__libc_open");
+    LOAD_SYM_OPTIONAL(real___read, "__read");
+    LOAD_SYM_OPTIONAL(real___libc_read, "__libc_read");
+    LOAD_SYM_OPTIONAL(real___write, "__write");
+    LOAD_SYM_OPTIONAL(real___libc_write, "__libc_write");
+    LOAD_SYM_OPTIONAL(real___close, "__close");
+    LOAD_SYM_OPTIONAL(real___libc_close, "__libc_close");
+
+    /* initialise the global fs structure */
+    zvfs_fs_init();
+}
+
+/* ------------------------------------------------------------------ */
+/* Path / fd classification */
+/* ------------------------------------------------------------------ */
+
+/*
+ * zvfs_is_zvfs_path - tell whether a path lies under the zvfs mount.
+ *
+ * Returns 1 when path starts with ZVFS_MOUNT_PREFIX and the prefix is
+ * followed by '/' or the end of the string, 0 otherwise (including for a
+ * NULL path). The comparison is purely textual: the path is neither
+ * normalised nor resolved.
+ */
+int
+zvfs_is_zvfs_path(const char *path)
+{
+    if (!path)
+        return 0;
+    /* must begin with /zvfs and continue with '/' or end right there */
+    if (strncmp(path, ZVFS_MOUNT_PREFIX, ZVFS_MOUNT_PREFIX_LEN) != 0)
+        return 0;
+    char next = path[ZVFS_MOUNT_PREFIX_LEN];
+    return (next == '/' || next == '\0');
+}
+
+/*
+ * zvfs_is_zvfs_fd - tell whether fd is registered in the zvfs fd table.
+ *
+ * Takes g_fs.fd_mu for the lookup. Returns 1 when an openfile exists for
+ * fd, 0 otherwise (negative fds are rejected without a lookup).
+ */
+int
+zvfs_is_zvfs_fd(int fd)
+{
+    if (fd < 0)
+        return 0;
+    pthread_mutex_lock(&g_fs.fd_mu);
+    struct zvfs_open_file *of = openfile_lookup(fd);
+    pthread_mutex_unlock(&g_fs.fd_mu);
+    return (of != NULL);
+}
+
+/* ------------------------------------------------------------------ */
+/* dirfd + relative path -> absolute path */
+/* ------------------------------------------------------------------ */
+
+/*
+ * zvfs_resolve_atpath - build the absolute path named by (dirfd, path).
+ *
+ * dirfd : AT_FDCWD or an open directory descriptor; ignored when path is
+ *         absolute.
+ * path  : absolute or relative path; NULL or "" yields the base directory
+ *         itself.
+ * buf   : receives the NUL-terminated result.
+ * bufsz : size of buf in bytes.
+ *
+ * The base directory is the current working directory for AT_FDCWD and
+ * the target of /proc/self/fd/<dirfd> otherwise. Exactly one '/' is put
+ * between base and path, so a base of "/" gives "/name" rather than
+ * "//name". "." and ".." components are not collapsed.
+ *
+ * Returns 0 on success, -1 with errno set on failure: EINVAL for a
+ * missing or empty buffer, ENAMETOOLONG when the result (or the link
+ * target of dirfd) does not fit, otherwise the errno of getcwd() or
+ * readlink().
+ */
+int
+zvfs_resolve_atpath(int dirfd, const char *path, char *buf, size_t bufsz)
+{
+    if (!buf || bufsz == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    size_t path_len = path ? strlen(path) : 0;
+
+    /* absolute path: copy it as is */
+    if (path_len > 0 && path[0] == '/') {
+        if (path_len >= bufsz) {
+            errno = ENAMETOOLONG;
+            return -1;
+        }
+        memcpy(buf, path, path_len + 1);
+        return 0;
+    }
+
+    /* AT_FDCWD: relative to the current working directory */
+    if (dirfd == AT_FDCWD) {
+        if (!getcwd(buf, bufsz)) return -1;
+    } else {
+        /* read the directory's absolute path from /proc/self/fd/<dirfd> */
+        char proc_path[64];
+        snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", dirfd);
+        ssize_t len = readlink(proc_path, buf, bufsz - 1);
+        if (len < 0) return -1;
+        /* readlink() truncates silently: a full buffer cannot be trusted */
+        if ((size_t)len >= bufsz - 1) {
+            errno = ENAMETOOLONG;
+            return -1;
+        }
+        buf[len] = '\0';
+    }
+
+    if (path_len == 0)
+        return 0;
+
+    /* append path, reusing a trailing '/' of the base directory */
+    size_t dir_len = strlen(buf);
+    if (dir_len > 0 && buf[dir_len - 1] == '/')
+        dir_len--;
+    if (dir_len + 1 + path_len >= bufsz) {
+        errno = ENAMETOOLONG;
+        return -1;
+    }
+    buf[dir_len] = '/';
+    memcpy(buf + dir_len + 1, path, path_len + 1);
+    return 0;
+}
diff --git a/src/hook/zvfs_hook_init.h b/src/hook/zvfs_hook_init.h
new file mode 100644
index 0000000..c6f9abc
--- /dev/null
+++ b/src/hook/zvfs_hook_init.h
@@ -0,0 +1,130 @@
+#ifndef __ZVFS_HOOK_INIT_H__
+#define __ZVFS_HOOK_INIT_H__
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "fs/zvfs_sys_init.h"
+
+/*
+ * All pointers to the original libc functions are declared here.
+ * Naming rule: real_<funcname>.
+ * They are initialised with dlsym(RTLD_NEXT, "funcname") from an
+ * __attribute__((constructor)) function.
+ */
+
+/* open family */
+extern int (*real_open)(const char *path, int flags, ...);
+extern int (*real_open64)(const char *path, int flags, ...);
+extern int (*real_openat)(int dirfd, const char *path, int flags, ...);
+extern int (*real_openat64)(int dirfd, const char *path, int flags, ...);
+extern int (*real_creat)(const char *path, mode_t mode);
+extern int (*real_creat64)(const char *path, mode_t mode);
+
+/* close family */
+extern int (*real_close)(int fd);
+extern int (*real_close_range)(unsigned int first, unsigned int last, unsigned int flags);
+
+/* dup family */
+extern int (*real_dup)(int oldfd);
+extern int (*real_dup2)(int oldfd, int newfd);
+extern int (*real_dup3)(int oldfd, int newfd, int flags);
+
+/* read family */
+extern ssize_t (*real_read)(int fd, void *buf, size_t count);
+extern ssize_t (*real_pread)(int fd, void *buf, size_t count, off_t offset);
+extern ssize_t (*real_pread64)(int fd, void *buf, size_t count, off64_t offset);
+extern ssize_t (*real_readv)(int fd, const struct iovec *iov, int iovcnt);
+extern ssize_t (*real_preadv)(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+extern ssize_t (*real_preadv64)(int fd, const struct iovec *iov, int iovcnt, off64_t offset);
+extern ssize_t (*real_preadv2)(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+
+/* write family */
+extern
ssize_t (*real_write)(int fd, const void *buf, size_t count);
+extern ssize_t (*real_pwrite)(int fd, const void *buf, size_t count, off_t offset);
+extern ssize_t (*real_pwrite64)(int fd, const void *buf, size_t count, off64_t offset);
+extern ssize_t (*real_writev)(int fd, const struct iovec *iov, int iovcnt);
+extern ssize_t (*real_pwritev)(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+extern ssize_t (*real_pwritev64)(int fd, const struct iovec *iov, int iovcnt, off64_t offset);
+extern ssize_t (*real_pwritev2)(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
+
+/* lseek */
+extern off_t (*real_lseek)(int fd, off_t offset, int whence);
+extern off64_t (*real_lseek64)(int fd, off64_t offset, int whence);
+
+/* truncate / fallocate */
+extern int (*real_truncate)(const char *path, off_t length);
+extern int (*real_truncate64)(const char *path, off64_t length);
+extern int (*real_ftruncate)(int fd, off_t length);
+extern int (*real_ftruncate64)(int fd, off64_t length);
+extern int (*real_fallocate)(int fd, int mode, off_t offset, off_t len);
+extern int (*real_posix_fallocate)(int fd, off_t offset, off_t len);
+
+/* stat family */
+extern int (*real_stat)(const char *path, struct stat *buf);
+extern int (*real_stat64)(const char *path, struct stat64 *buf);
+extern int (*real_fstat)(int fd, struct stat *buf);
+extern int (*real_fstat64)(int fd, struct stat64 *buf);
+extern int (*real_lstat)(const char *path, struct stat *buf);
+extern int (*real_lstat64)(const char *path, struct stat64 *buf);
+extern int (*real_fstatat)(int dirfd, const char *path, struct stat *buf, int flags);
+extern int (*real_fstatat64)(int dirfd, const char *path, struct stat64 *buf, int flags);
+extern int (*real_statx)(int dirfd, const char *path, int flags,
+                         unsigned int mask, struct statx *buf);
+
+/* sync */
+extern int (*real_fsync)(int fd);
+extern int (*real_fdatasync)(int fd);
+extern int (*real_sync_file_range)(int fd, off64_t offset, off64_t nbytes, unsigned int flags);
+
+/* fcntl / ioctl */
+extern int (*real_fcntl)(int fd, int cmd, ...);
+extern int (*real_fcntl64)(int fd, int cmd, ...);
+extern int (*real_ioctl)(int fd, unsigned long request, ...);
+
+/* directory-aware operations */
+extern int (*real_unlink)(const char *path);
+extern int (*real_unlinkat)(int dirfd, const char *path, int flags);
+extern int (*real_rename)(const char *oldpath, const char *newpath);
+extern int (*real_renameat)(int olddirfd, const char *oldpath,
+                            int newdirfd, const char *newpath);
+extern int (*real_renameat2)(int olddirfd, const char *oldpath,
+                             int newdirfd, const char *newpath,
+                             unsigned int flags);
+
+/* mmap family (reserved for future use) */
+extern void *(*real_mmap)(void *addr, size_t length, int prot, int flags,
+                          int fd, off_t offset);
+extern void *(*real_mmap64)(void *addr, size_t length, int prot, int flags,
+                            int fd, off64_t offset);
+extern int (*real_munmap)(void *addr, size_t length);
+extern int (*real_msync)(void *addr, size_t length, int flags);
+
+
+/* glibc-internal aliases */
+extern int (*real___open)(const char *path, int flags, ...);
+extern int (*real___open64)(const char *path, int flags, ...);
+extern int (*real___libc_open)(const char *path, int flags, ...);
+extern ssize_t (*real___read)(int fd, void *buf, size_t count);
+extern ssize_t (*real___libc_read)(int fd, void *buf, size_t count);
+extern ssize_t (*real___write)(int fd, const void *buf, size_t count);
+extern ssize_t (*real___libc_write)(int fd, const void *buf, size_t count);
+extern int (*real___close)(int fd);
+extern int (*real___libc_close)(int fd);
+
+/* Initialises every real_* pointer; invoked as a constructor. */
+void zvfs_hook_init(void);
+
+/* Tell whether a path / fd falls under zvfs control (1 = yes, 0 = no). */
+int zvfs_is_zvfs_path(const char *path);
+int zvfs_is_zvfs_fd(int fd);
+
+/*
+ * Resolve dirfd + relative path to an absolute path written to buf
+ * (capacity bufsz bytes).
+ * dirfd == AT_FDCWD means "relative to the current working directory".
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int zvfs_resolve_atpath(int dirfd, const char *path, char *buf, size_t bufsz);
+#endif // __ZVFS_HOOK_INIT_H__
diff --git
a/src/hook/zvfs_hook_mmap.c b/src/hook/zvfs_hook_mmap.c
new file mode 100644
index 0000000..1cda26f
--- /dev/null
+++ b/src/hook/zvfs_hook_mmap.c
@@ -0,0 +1,85 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "zvfs_hook_mmap.h"
+#include "zvfs_hook_init.h"
+#include "zvfs_hook_reentrant.h"
+
+#include
+
+/* ------------------------------------------------------------------ */
+/* mmap / mmap64 */
+/* ------------------------------------------------------------------ */
+
+void *
+mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
+{
+    ZVFS_HOOK_ENTER();
+
+    void *ret;
+
+    /*
+     * Pass straight through to real_mmap when any of these holds:
+     *   - we are already inside another hook (ZVFS_IN_HOOK()),
+     *   - MAP_ANONYMOUS is set (no fd is associated with the mapping),
+     *   - fd is not a zvfs fd.
+     * Only a zvfs fd falls through to the ENOTSUP path below.
+     */
+    if (ZVFS_IN_HOOK() || (flags & MAP_ANONYMOUS) || !zvfs_is_zvfs_fd(fd)) {
+        ret = real_mmap(addr, length, prot, flags, fd, offset);
+        ZVFS_HOOK_LEAVE();
+        return ret;
+    }
+
+    zvfs_ensure_init();
+
+    /* zvfs fd: mmap is not supported yet; fail with MAP_FAILED / ENOTSUP. */
+    errno = ENOTSUP;
+    ZVFS_HOOK_LEAVE();
+    return MAP_FAILED;
+}
+
+void *
+mmap64(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
+{
+    /*
+     * On 64-bit systems mmap64 is equivalent to mmap (off_t is already
+     * 64-bit), so simply forward to the mmap hook above.
+     *
+     * NOTE(review): zvfs_hook_init.h declares real_mmap64 with an off64_t
+     * offset, while this definition takes off_t and never calls
+     * real_mmap64. That only matches where off_t is 64 bits wide --
+     * confirm the supported targets.
+     */
+    return mmap(addr, length, prot, flags, fd, offset);
+}
+
+/* ------------------------------------------------------------------ */
+/* munmap */
+/* ------------------------------------------------------------------ */
+
+int
+munmap(void *addr, size_t length)
+{
+    ZVFS_HOOK_ENTER();
+    /*
+     * mmap on a zvfs fd never succeeds, so there is no zvfs mapping to
+     * handle here. Pass straight through to real_munmap.
+     *
+     * future: look up mmap_table; on a hit, write back with blob_write
+     * before passing through.
+     */
+    int r = real_munmap(addr, length);
+    ZVFS_HOOK_LEAVE();
+    return r;
+}
+
+/* ------------------------------------------------------------------ */
+/* msync */
+/* ------------------------------------------------------------------ */
+
+int
+msync(void *addr, size_t length, int flags)
+{
+    ZVFS_HOOK_ENTER();
+    /*
+     * Same as munmap: there are currently no zvfs mappings, so pass
+     * straight through to real_msync.
+     *
+     * future: look up mmap_table; on a hit, blob_write the matching range.
+     */
+    int r = real_msync(addr, length, flags);
+
ZVFS_HOOK_LEAVE(); + return r; +} diff --git a/src/hook/zvfs_hook_mmap.h b/src/hook/zvfs_hook_mmap.h new file mode 100644 index 0000000..b49ec48 --- /dev/null +++ b/src/hook/zvfs_hook_mmap.h @@ -0,0 +1,34 @@ +#ifndef __ZVFS_HOOK_MMAP_H__ +#define __ZVFS_HOOK_MMAP_H__ + +#include +#include + +/* + * mmap 族。 + * + * 当前策略: + * - fd 属于 zvfs → 返回 ENOTSUP,强制上层走非 mmap 路径 + * (RocksDB: options.use_mmap_reads/writes = false) + * - fd 不属于 zvfs,或 MAP_ANONYMOUS → 透传 real_mmap + * + * munmap / msync: + * zvfs fd 的 mmap 不会成功,所以 munmap/msync 里永远找不到 + * zvfs 的映射,直接透传即可。 + * + * 预留扩展点(future): + * 实现时在此处: + * mmap → MAP_ANONYMOUS 分配匿名内存 + blob_read 填充 + * 将 (addr, length, inode, file_offset) 插入 mmap_table + * munmap → 查 mmap_table,若命中则 blob_write 写回,再真正 munmap + * msync → 查 mmap_table,blob_write 对应范围 + */ + +void *mmap(void *addr, size_t length, int prot, int flags, + int fd, off_t offset); +void *mmap64(void *addr, size_t length, int prot, int flags, + int fd, off_t offset); +int munmap(void *addr, size_t length); +int msync(void *addr, size_t length, int flags); + +#endif // __ZVFS_HOOK_MMAP_H__ \ No newline at end of file diff --git a/src/hook/zvfs_hook_reentrant.h b/src/hook/zvfs_hook_reentrant.h new file mode 100644 index 0000000..7f0ac71 --- /dev/null +++ b/src/hook/zvfs_hook_reentrant.h @@ -0,0 +1,32 @@ +#ifndef __ZVFS_HOOK_REENTRANT_H__ +#define __ZVFS_HOOK_REENTRANT_H__ + +/* + * 线程局部重入深度计数。 + * + * 进入任何 hook 函数时 ZVFS_HOOK_ENTER(),离开时 ZVFS_HOOK_LEAVE()。 + * 当深度 > 1 时,说明当前调用是 hook 内部发起的(例如 hook 内调用了 + * real_fstat,而 fstat 本身也被 hook),此时直接走 real_* 绕过 zvfs 逻辑。 + * + * 典型骨架: + * + * int fstat(int fd, struct stat *buf) + * { + * ZVFS_HOOK_ENTER(); + * int ret; + * if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(fd)) + * ret = real_fstat(fd, buf); + * else + * ret = zvfs_fstat_impl(fd, buf); + * ZVFS_HOOK_LEAVE(); + * return ret; + * } + */ + +extern __thread int _zvfs_hook_depth; + +#define ZVFS_HOOK_ENTER() (++_zvfs_hook_depth) +#define ZVFS_HOOK_LEAVE() (--_zvfs_hook_depth) +#define 
ZVFS_IN_HOOK() (_zvfs_hook_depth > 1) + +#endif // __ZVFS_HOOK_REENTRANT_H__ \ No newline at end of file diff --git a/src/hook/zvfs_hook_rw.c b/src/hook/zvfs_hook_rw.c new file mode 100644 index 0000000..85d03b7 --- /dev/null +++ b/src/hook/zvfs_hook_rw.c @@ -0,0 +1,549 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "zvfs_hook_rw.h" +#include "zvfs_hook_init.h" +#include "zvfs_hook_reentrant.h" +#include "fs/zvfs.h" +#include "fs/zvfs_open_file.h" +#include "fs/zvfs_inode.h" +#include "spdk_engine/io_engine.h" + +#include +#include +#include +#include + +/* ------------------------------------------------------------------ */ +/* 内部:单段 pread / pwrite(不修改 of->offset) */ +/* ------------------------------------------------------------------ */ + +/* + * zvfs_pread_impl + * + * 从 blob 的 [offset, offset+count) 读取数据到 buf。 + * 若请求范围超出 logical_size,截断到 logical_size 边界。 + * 成功返回实际读取字节数,失败返回 -1。 + */ +static ssize_t +zvfs_pread_impl(struct zvfs_open_file *of, + void *buf, size_t count, uint64_t offset) +{ + /* 持 inode->mu 读 logical_size,防止并发 write 同时修改 */ + pthread_mutex_lock(&of->inode->mu); + uint64_t size = of->inode->logical_size; + pthread_mutex_unlock(&of->inode->mu); + + /* offset 超出文件末尾:返回 0(EOF) */ + if (offset >= size) + return 0; + + /* 截断读取长度到文件末尾 */ + if (offset + count > size) + count = (size_t)(size - offset); + + if (count == 0) + return 0; + + if (blob_read(of->handle, offset, buf, count) < 0) { + errno = EIO; + return -1; + } + + return (ssize_t)count; +} + +/* + * zvfs_pwrite_impl + * + * 将 buf 的 count 字节写入 blob 的 offset 处。 + * 若写入后末尾超过 logical_size,更新 logical_size 并同步 st_size。 + * 成功返回 count,失败返回 -1。 + */ +static ssize_t +zvfs_pwrite_impl(struct zvfs_open_file *of, + const void *buf, size_t count, uint64_t offset) +{ + if (count == 0) + return 0; + + uint64_t end = offset + count; + + /* + * 若写入范围超出 blob 当前物理大小,先 resize。 + * blob_resize 是 SPDK 侧的操作(可能分配新 cluster)。 + */ + pthread_mutex_lock(&of->inode->mu); + uint64_t old_size = 
of->inode->logical_size; + pthread_mutex_unlock(&of->inode->mu); + + if (end > old_size) { + if (blob_resize(of->handle, end) < 0) { + errno = EIO; + return -1; + } + } + + if (blob_write(of->handle, offset, buf, count) < 0) { + errno = EIO; + return -1; + } + + /* 更新 logical_size(持锁,inode_update_size 负责 ftruncate) */ + if (end > old_size) { + pthread_mutex_lock(&of->inode->mu); + if (end > of->inode->logical_size) /* double-check */ + inode_update_size(of->inode, of->fd, end); + pthread_mutex_unlock(&of->inode->mu); + } + + return (ssize_t)count; +} + +/* ------------------------------------------------------------------ */ +/* 内部:iov 合并辅助 */ +/* ------------------------------------------------------------------ */ + +/* + * iov_total_len - 计算 iovec 总字节数。 + */ +static size_t +iov_total_len(const struct iovec *iov, int iovcnt) +{ + size_t total = 0; + for (int i = 0; i < iovcnt; i++) + total += iov[i].iov_len; + return total; +} + +/* + * zvfs_iov_pread + * + * 将 iovec 合并为单次 blob_read: + * 1. 一次 blob_read 读到临时 buf + * 2. 
按 iovec 顺序分发到各段 + * + * 单次 SPDK I/O 比逐段提交效率高得多; + * 堆分配代价(通常几个 page)远小于多次 SPDK 提交的开销。 + */ +static ssize_t +zvfs_iov_pread(struct zvfs_open_file *of, + const struct iovec *iov, int iovcnt, uint64_t offset) +{ + size_t total_len = iov_total_len(iov, iovcnt); + if (total_len == 0) return 0; + + /* 截断到文件末尾 */ + pthread_mutex_lock(&of->inode->mu); + uint64_t size = of->inode->logical_size; + pthread_mutex_unlock(&of->inode->mu); + + if (offset >= size) return 0; + if (offset + total_len > size) + total_len = (size_t)(size - offset); + + /* 分配临时 buf,单次读 */ + char *tmp = malloc(total_len); + if (!tmp) { errno = ENOMEM; return -1; } + + if (blob_read(of->handle, offset, tmp, total_len) < 0) { + free(tmp); + errno = EIO; + return -1; + } + + /* 分发到各 iovec 段 */ + size_t copied = 0; + for (int i = 0; i < iovcnt && copied < total_len; i++) { + size_t seg = iov[i].iov_len; + if (seg == 0) continue; + if (copied + seg > total_len) seg = total_len - copied; + memcpy(iov[i].iov_base, tmp + copied, seg); + copied += seg; + } + + free(tmp); + return (ssize_t)total_len; +} + +/* + * zvfs_iov_pwrite + * + * 将 iovec 合并为单次 blob_write: + * 1. 分配临时 buf,按 iovec 顺序 memcpy 拼接 + * 2. 
单次 blob_write + 一次 inode_update_size + * + * 避免多次 SPDK 提交和多次 ftruncate。 + */ +static ssize_t +zvfs_iov_pwrite(struct zvfs_open_file *of, + const struct iovec *iov, int iovcnt, uint64_t offset) +{ + size_t total_len = iov_total_len(iov, iovcnt); + if (total_len == 0) return 0; + + /* 拼接到临时 buf */ + char *tmp = malloc(total_len); + if (!tmp) { errno = ENOMEM; return -1; } + + size_t pos = 0; + for (int i = 0; i < iovcnt; i++) { + if (iov[i].iov_len == 0) continue; + memcpy(tmp + pos, iov[i].iov_base, iov[i].iov_len); + pos += iov[i].iov_len; + } + + /* 单次写入 */ + ssize_t r = zvfs_pwrite_impl(of, tmp, total_len, offset); + free(tmp); + return r; +} + +/* ------------------------------------------------------------------ */ +/* 内部:取出 of,处理重入/非 zvfs 判断 */ +/* ------------------------------------------------------------------ */ + +static inline struct zvfs_open_file * +get_of(int fd) +{ + pthread_mutex_lock(&g_fs.fd_mu); + struct zvfs_open_file *of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + return of; +} + +/* ------------------------------------------------------------------ */ +/* read */ +/* ------------------------------------------------------------------ */ + +ssize_t +read(int fd, void *buf, size_t count) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of; + if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { + ssize_t r = real_read(fd, buf, count); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + ssize_t r = zvfs_pread_impl(of, buf, count, of->offset); + if (r > 0) + of->offset += (uint64_t)r; + + ZVFS_HOOK_LEAVE(); + return r; +} + +ssize_t __read(int fd, void *buf, size_t count) { return read(fd, buf, count); } +ssize_t __libc_read(int fd, void *buf, size_t count) { return read(fd, buf, count); } + +/* ------------------------------------------------------------------ */ +/* pread / pread64 */ +/* ------------------------------------------------------------------ */ + +ssize_t +pread(int fd, void *buf, size_t count, off_t offset) 
+{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of; + if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { + ssize_t r = real_pread(fd, buf, count, offset); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + ssize_t r = zvfs_pread_impl(of, buf, count, (uint64_t)offset); + ZVFS_HOOK_LEAVE(); + return r; +} + +ssize_t pread64(int fd, void *buf, size_t count, off_t offset) +{ + return pread(fd, buf, count, offset); +} + +/* ------------------------------------------------------------------ */ +/* readv / preadv / preadv64 / preadv2 */ +/* ------------------------------------------------------------------ */ + +ssize_t +readv(int fd, const struct iovec *iov, int iovcnt) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of; + if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { + ssize_t r = real_readv(fd, iov, iovcnt); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + ssize_t r = zvfs_iov_pread(of, iov, iovcnt, of->offset); + if (r > 0) + of->offset += (uint64_t)r; + + ZVFS_HOOK_LEAVE(); + return r; +} + +ssize_t +preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of; + if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { + ssize_t r = real_preadv(fd, iov, iovcnt, offset); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + ssize_t r = zvfs_iov_pread(of, iov, iovcnt, (uint64_t)offset); + ZVFS_HOOK_LEAVE(); + return r; +} + +ssize_t preadv64(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + return preadv(fd, iov, iovcnt, offset); +} + +ssize_t +preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of; + if (ZVFS_IN_HOOK() || !(of = get_of(fd))) { + ssize_t r = real_preadv2 + ? 
real_preadv2(fd, iov, iovcnt, offset, flags)
+                  : (errno = ENOSYS, (ssize_t)-1);
+        ZVFS_HOOK_LEAVE();
+        return r;
+    }
+
+    zvfs_ensure_init();
+
+    /*
+     * RWF_NOWAIT: zvfs has no notion of non-blocking I/O and blob_read
+     * always returns synchronously, so the flag is ignored and the call
+     * is handled like a plain preadv.
+     * RWF_HIPRI / RWF_DSYNC / RWF_SYNC: likewise ignored.
+     */
+    uint64_t off = (offset == (off_t)-1) ? of->offset : (uint64_t)offset;
+    ssize_t r = zvfs_iov_pread(of, iov, iovcnt, off);
+    if (offset == (off_t)-1 && r > 0)
+        of->offset += (uint64_t)r;
+
+    ZVFS_HOOK_LEAVE();
+    return r;
+}
+
+/* ------------------------------------------------------------------ */
+/* write */
+/* ------------------------------------------------------------------ */
+
+/*
+ * zvfs_append_impl
+ *
+ * Append `count` bytes from `buf` at the current end of file.
+ *
+ * inode->mu is held across the whole sequence (read logical_size,
+ * blob_resize, blob_write, inode_update_size), so two O_APPEND writers
+ * on the same inode can never pick the same offset and overwrite each
+ * other. On success the fd's file offset is moved to the new end of
+ * file, as POSIX requires after an append write.
+ *
+ * NOTE(review): assumes blob_resize / blob_write never take inode->mu
+ * themselves (they only receive the blob handle) -- confirm.
+ *
+ * Returns count on success, 0 when count == 0; on failure returns -1
+ * with errno = EFBIG (offset would overflow) or EIO (blob I/O failed).
+ */
+static ssize_t
+zvfs_append_impl(struct zvfs_open_file *of, const void *buf, size_t count)
+{
+    if (count == 0)
+        return 0;
+
+    pthread_mutex_lock(&of->inode->mu);
+
+    uint64_t write_off = of->inode->logical_size;
+    uint64_t end = write_off + count;
+    if (end < write_off) {              /* uint64_t wrap-around */
+        pthread_mutex_unlock(&of->inode->mu);
+        errno = EFBIG;
+        return -1;
+    }
+
+    if (blob_resize(of->handle, end) < 0 ||
+        blob_write(of->handle, write_off, buf, count) < 0) {
+        pthread_mutex_unlock(&of->inode->mu);
+        errno = EIO;                    /* set after unlock so it survives */
+        return -1;
+    }
+
+    /* count > 0 and the lock is held, so end is strictly past the old size. */
+    inode_update_size(of->inode, of->fd, end);
+    of->offset = end;
+
+    pthread_mutex_unlock(&of->inode->mu);
+    return (ssize_t)count;
+}
+
+/*
+ * zvfs_iov_append
+ *
+ * O_APPEND variant of zvfs_iov_pwrite: coalesce the iovec into one
+ * temporary buffer (outside the lock), then append it atomically with
+ * zvfs_append_impl.
+ *
+ * Returns the number of bytes appended, or -1 with errno set
+ * (ENOMEM, or whatever zvfs_append_impl reported).
+ */
+static ssize_t
+zvfs_iov_append(struct zvfs_open_file *of, const struct iovec *iov, int iovcnt)
+{
+    size_t total_len = 0;
+    for (int i = 0; i < iovcnt; i++)
+        total_len += iov[i].iov_len;
+    if (total_len == 0)
+        return 0;
+
+    char *tmp = malloc(total_len);
+    if (!tmp) { errno = ENOMEM; return -1; }
+
+    size_t pos = 0;
+    for (int i = 0; i < iovcnt; i++) {
+        if (iov[i].iov_len == 0) continue;
+        memcpy(tmp + pos, iov[i].iov_base, iov[i].iov_len);
+        pos += iov[i].iov_len;
+    }
+
+    ssize_t r = zvfs_append_impl(of, tmp, total_len);
+
+    /* free() may clobber errno on older libcs; keep the I/O error. */
+    int saved_errno = errno;
+    free(tmp);
+    errno = saved_errno;
+    return r;
+}
+
+/*
+ * write - hook for write(2).
+ *
+ * Non-zvfs fds and re-entrant calls are passed to real_write.
+ * For a zvfs fd:
+ *   - O_APPEND: append atomically at logical_size (zvfs_append_impl).
+ *   - otherwise: write at of->offset and advance it by the bytes written.
+ */
+ssize_t
+write(int fd, const void *buf, size_t count)
+{
+    ZVFS_HOOK_ENTER();
+
+    struct zvfs_open_file *of;
+    if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
+        ssize_t r = real_write(fd, buf, count);
+        ZVFS_HOOK_LEAVE();
+        return r;
+    }
+
+    zvfs_ensure_init();
+
+    ssize_t r;
+    if (of->flags & O_APPEND) {
+        r = zvfs_append_impl(of, buf, count);
+    } else {
+        r = zvfs_pwrite_impl(of, buf, count, of->offset);
+        if (r > 0)
+            of->offset += (uint64_t)r;
+    }
+
+    ZVFS_HOOK_LEAVE();
+    return r;
+}
+
+ssize_t __write(int fd, const void *buf, size_t count) { return write(fd, buf, count); }
+ssize_t __libc_write(int fd, const void *buf, size_t count) { return write(fd, buf, count); }
+
+/* ------------------------------------------------------------------ */
+/* pwrite / pwrite64 */
+/* ------------------------------------------------------------------ */
+
+/*
+ * pwrite - hook for pwrite(2).
+ *
+ * Writes at the caller-supplied offset and never touches of->offset.
+ * A negative offset on a zvfs fd fails with EINVAL instead of being
+ * reinterpreted as a huge unsigned offset.
+ */
+ssize_t
+pwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+    ZVFS_HOOK_ENTER();
+
+    struct zvfs_open_file *of;
+    if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
+        ssize_t r = real_pwrite(fd, buf, count, offset);
+        ZVFS_HOOK_LEAVE();
+        return r;
+    }
+
+    zvfs_ensure_init();
+
+    if (offset < 0) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; }
+
+    /*
+     * POSIX: pwrite ignores O_APPEND and always writes at the given offset.
+     */
+    ssize_t r = zvfs_pwrite_impl(of, buf, count, (uint64_t)offset);
+    ZVFS_HOOK_LEAVE();
+    return r;
+}
+
+ssize_t pwrite64(int fd, const void *buf, size_t count, off_t offset)
+{
+    return pwrite(fd, buf, count, offset);
+}
+
+/* ------------------------------------------------------------------ */
+/* writev / pwritev / pwritev64 / pwritev2 */
+/* ------------------------------------------------------------------ */
+
+/*
+ * writev - hook for writev(2).
+ *
+ * Non-zvfs fds and re-entrant calls are passed to real_writev.
+ * For a zvfs fd:
+ *   - O_APPEND: same atomic append sequence as write(), via
+ *     zvfs_iov_append.
+ *   - otherwise: coalesced write at of->offset, which is then advanced
+ *     by the bytes written.
+ */
+ssize_t
+writev(int fd, const struct iovec *iov, int iovcnt)
+{
+    ZVFS_HOOK_ENTER();
+
+    struct zvfs_open_file *of;
+    if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
+        ssize_t r = real_writev(fd, iov, iovcnt);
+        ZVFS_HOOK_LEAVE();
+        return r;
+    }
+
+    zvfs_ensure_init();
+
+    ssize_t r;
+    if (of->flags & O_APPEND) {
+        r = zvfs_iov_append(of, iov, iovcnt);
+    } else {
+        r = zvfs_iov_pwrite(of, iov, iovcnt, of->offset);
+        if (r > 0) of->offset += (uint64_t)r;
+    }
+
+    ZVFS_HOOK_LEAVE();
+    return r;
+}
+
+/*
+ * pwritev - hook for pwritev(2).
+ *
+ * Coalesced write at the caller-supplied offset; of->offset is not
+ * modified. A negative offset on a zvfs fd fails with EINVAL.
+ */
+ssize_t
+pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+    ZVFS_HOOK_ENTER();
+
+    struct zvfs_open_file *of;
+    if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
+        ssize_t r = real_pwritev(fd, iov, iovcnt, offset);
+        ZVFS_HOOK_LEAVE();
+        return r;
+    }
+
+    zvfs_ensure_init();
+
+    if (offset < 0) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; }
+
+    ssize_t r = zvfs_iov_pwrite(of, iov, iovcnt, (uint64_t)offset);
+    ZVFS_HOOK_LEAVE();
+    return r;
+}
+
+ssize_t pwritev64(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+    return pwritev(fd, iov, iovcnt, offset);
+}
+
+ssize_t
+pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)
+{
+    ZVFS_HOOK_ENTER();
+
+    struct zvfs_open_file *of;
+    if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
+        ssize_t r = real_pwritev2
+                  ? real_pwritev2(fd, iov, iovcnt, offset, flags)
+                  : (errno = ENOSYS, (ssize_t)-1);
+        ZVFS_HOOK_LEAVE();
+        return r;
+    }
+
+    zvfs_ensure_init();
+
+    /* flags (RWF_SYNC / RWF_DSYNC / RWF_APPEND, ...):
+     * zvfs has no write buffer and every write is synchronous, so the
+     * flags are ignored.
+     * offset == -1: use and then update of->offset. */
+    uint64_t off = (offset == (off_t)-1) ?
of->offset : (uint64_t)offset;
+    ssize_t r = zvfs_iov_pwrite(of, iov, iovcnt, off);
+    if (offset == (off_t)-1 && r > 0)
+        of->offset += (uint64_t)r;
+
+    ZVFS_HOOK_LEAVE();
+    return r;
+}
diff --git a/src/hook/zvfs_hook_rw.h b/src/hook/zvfs_hook_rw.h
new file mode 100644
index 0000000..49a942d
--- /dev/null
+++ b/src/hook/zvfs_hook_rw.h
@@ -0,0 +1,52 @@
+#ifndef __ZVFS_HOOK_RW_H__
+#define __ZVFS_HOOK_RW_H__
+
+#include
+#include
+#include
+
+/*
+ * read / write family.
+ *
+ * The scalar variants are built on two static helpers in zvfs_hook_rw.c.
+ * Both take the open-file object, not a raw fd:
+ *     zvfs_pread_impl (of, buf, count, offset)
+ *     zvfs_pwrite_impl(of, buf, count, offset)
+ *
+ * offset semantics:
+ *   - pread/pwrite family: use the offset passed in; of->offset is not
+ *     modified
+ *   - read/write family:   use of->offset and advance it by the number of
+ *     bytes transferred
+ *   - preadv2/pwritev2:    offset == -1 behaves like the read/write family
+ *   - O_APPEND write:      the write offset is inode->logical_size, read
+ *     under inode->mu before each write
+ *
+ * iov family (readv/writev/preadv/pwritev):
+ *   The iovec is NOT processed segment by segment. zvfs_iov_pread issues
+ *   one blob_read into a temporary heap buffer and scatters it into the
+ *   segments; zvfs_iov_pwrite gathers the segments into a temporary heap
+ *   buffer and issues one zvfs_pwrite_impl call.
+ *   This way no scatter/gather support is needed in the SPDK layer.
+ *   If the SPDK layer gains SGL support later, this layer can be replaced.
+ */
+
+/* read family */
+ssize_t read(int fd, void *buf, size_t count);
+ssize_t pread(int fd, void *buf, size_t count, off_t offset);
+ssize_t pread64(int fd, void *buf, size_t count, off_t offset);
+ssize_t readv(int fd, const struct iovec *iov, int iovcnt);
+ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t preadv64(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset,
+                int flags);
+
+/* write family */
+ssize_t write(int fd, const void *buf, size_t count);
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset);
+ssize_t pwrite64(int fd, const void *buf, size_t count, off_t offset);
+ssize_t writev(int fd, const struct iovec *iov, int iovcnt);
+ssize_t pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t pwritev64(int fd, const struct iovec *iov, int iovcnt, off_t offset);
+ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset,
+                 int flags);
+
+/* glibc aliases */
+ssize_t __read(int fd, void *buf,
size_t count); +ssize_t __libc_read(int fd, void *buf, size_t count); +ssize_t __write(int fd, const void *buf, size_t count); +ssize_t __libc_write(int fd, const void *buf, size_t count); + +#endif // __ZVFS_HOOK_RW_H__ \ No newline at end of file diff --git a/src/hook/zvfs_hook_seek.c b/src/hook/zvfs_hook_seek.c new file mode 100644 index 0000000..637dde4 --- /dev/null +++ b/src/hook/zvfs_hook_seek.c @@ -0,0 +1,301 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "zvfs_hook_seek.h" +#include "zvfs_hook_init.h" +#include "zvfs_hook_reentrant.h" +#include "fs/zvfs.h" +#include "fs/zvfs_open_file.h" +#include "fs/zvfs_inode.h" +#include "fs/zvfs_path_entry.h" +#include "spdk_engine/io_engine.h" + +#include +#include +#include /* FALLOC_FL_* */ +#include +#include + +/* ------------------------------------------------------------------ */ +/* lseek / lseek64 */ +/* ------------------------------------------------------------------ */ + +off_t +lseek(int fd, off_t offset, int whence) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of = NULL; + if (!ZVFS_IN_HOOK()) { + pthread_mutex_lock(&g_fs.fd_mu); + of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + } + + if (!of) { + off_t r = real_lseek(fd, offset, whence); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + /* + * O_APPEND fd 的 lseek:POSIX 允许 lseek,但下次 write 时 + * 仍会从文件末尾写。lseek 只影响 read 的位置。 + * 我们照常更新 of->offset。 + */ + pthread_mutex_lock(&of->inode->mu); /* SEEK_END 需读 logical_size */ + uint64_t new_off = openfile_seek(of, (int64_t)offset, whence); + pthread_mutex_unlock(&of->inode->mu); + + if (new_off == (uint64_t)-1) { + ZVFS_HOOK_LEAVE(); + return (off_t)-1; + } + + ZVFS_HOOK_LEAVE(); + return (off_t)new_off; +} + +off_t lseek64(int fd, off_t offset, int whence) +{ + return lseek(fd, offset, whence); +} + +/* ------------------------------------------------------------------ */ +/* 内部:按 inode 指针做 truncate(path / fd 路径共用) */ +/* 
------------------------------------------------------------------ */
+
+
+/*
+ * zvfs_truncate_inode_with_handle - resize the blob behind `inode` to
+ * `new_size` and record the new logical size.
+ *
+ * The blob handle is borrowed from any open file in g_fs.fd_table that
+ * points at this inode. If no such open file exists, the blob is opened
+ * temporarily with blob_open(inode->blob_id) and closed again after the
+ * resize. blob_resize is skipped when new_size equals the current
+ * logical_size; inode_update_size(inode, real_fd, new_size) is always
+ * called, under inode->mu.
+ *
+ * Returns 0 on success, -1 with errno = EIO if blob_open or blob_resize
+ * fails.
+ *
+ * NOTE(review): the borrowed handle is used after fd_mu has been
+ * released; presumably the owning fd cannot be closed concurrently --
+ * confirm.
+ */
+static int
+zvfs_truncate_inode_with_handle(struct zvfs_inode *inode,
+                                int real_fd, uint64_t new_size)
+{
+    /* Find an open file in fd_table that refers to this inode; take its handle. */
+    struct zvfs_blob_handle *handle = NULL;
+    pthread_mutex_lock(&g_fs.fd_mu);
+    struct zvfs_open_file *of, *tmp;
+    HASH_ITER(hh, g_fs.fd_table, of, tmp) {
+        (void)tmp;
+        if (of->inode == inode) {
+            handle = of->handle;
+            break;
+        }
+    }
+    pthread_mutex_unlock(&g_fs.fd_mu);
+
+    pthread_mutex_lock(&inode->mu);
+    uint64_t old_size = inode->logical_size;
+    pthread_mutex_unlock(&inode->mu);
+
+    if (new_size != old_size && handle) {
+        if (blob_resize(handle, new_size) < 0) {
+            errno = EIO;
+            return -1;
+        }
+    } else if (new_size != old_size && !handle) {
+        /*
+         * The file is not open anywhere: open the blob temporarily.
+         * This is the case where truncate(path, ...) is called while no
+         * fd refers to the file.
+         */
+        handle = blob_open(inode->blob_id);
+        if (!handle) { errno = EIO; return -1; }
+        int rc = blob_resize(handle, new_size);
+        blob_close(handle);
+        if (rc < 0) { errno = EIO; return -1; }
+    }
+
+    pthread_mutex_lock(&inode->mu);
+    inode_update_size(inode, real_fd, new_size);
+    pthread_mutex_unlock(&inode->mu);
+
+    return 0;
+}
+
+/* ------------------------------------------------------------------ */
+/* ftruncate / ftruncate64 */
+/* ------------------------------------------------------------------ */
+
+int
+ftruncate(int fd, off_t length)
+{
+    ZVFS_HOOK_ENTER();
+
+    struct zvfs_open_file *of = NULL;
+    if (!ZVFS_IN_HOOK()) {
+        pthread_mutex_lock(&g_fs.fd_mu);
+        of = openfile_lookup(fd);
+        pthread_mutex_unlock(&g_fs.fd_mu);
+    }
+
+    if (!of) {
+        int r = real_ftruncate(fd, length);
+        ZVFS_HOOK_LEAVE();
+        return r;
+    }
+
+    zvfs_ensure_init();
+
+    if (length < 0) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; }
+
+    int r = zvfs_truncate_inode_with_handle(of->inode, fd, (uint64_t)length);
+
ZVFS_HOOK_LEAVE(); + return r; +} + +int ftruncate64(int fd, off_t length) { return ftruncate(fd, length); } + +/* ------------------------------------------------------------------ */ +/* truncate / truncate64(按路径) */ +/* ------------------------------------------------------------------ */ + +int +truncate(const char *path, off_t length) +{ + ZVFS_HOOK_ENTER(); + + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { + int r = real_truncate(path, length); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + if (length < 0) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; } + + /* 查 path_cache 拿 inode */ + pthread_mutex_lock(&g_fs.path_mu); + struct zvfs_path_entry *pe = path_cache_lookup(path); + struct zvfs_inode *inode = pe ? pe->inode : NULL; + pthread_mutex_unlock(&g_fs.path_mu); + + if (!inode) { + /* + * inode 不在缓存:文件存在于 FS 但从未被 open。 + * 需要读 xattr 拿 blob_id,临时构建 inode。 + * 最简单的做法:先 real_open,再走 zvfs 路径,再 real_close。 + * 这里直接调 real_truncate 改 st_size,但 blob 不会被截断。 + * + * 更正确的做法:open + ftruncate + close。 + * 调用方通常不会在 file 未被打开的情况下做 truncate, + * 所以这里先报 ENOENT(找不到 zvfs inode)作为安全兜底。 + */ + errno = ENOENT; + ZVFS_HOOK_LEAVE(); + return -1; + } + + int r = zvfs_truncate_inode_with_handle(inode, -1, (uint64_t)length); + + /* 同步真实文件 st_size(real_truncate 更新磁盘元数据) */ + if (r == 0) + real_truncate(path, length); + + ZVFS_HOOK_LEAVE(); + return r; +} + +int truncate64(const char *path, off_t length) { return truncate(path, length); } + +/* ------------------------------------------------------------------ */ +/* fallocate */ +/* ------------------------------------------------------------------ */ + +int +fallocate(int fd, int mode, off_t offset, off_t len) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of = NULL; + if (!ZVFS_IN_HOOK()) { + pthread_mutex_lock(&g_fs.fd_mu); + of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + } + + if (!of) { + int r = real_fallocate(fd, mode, offset, len); + ZVFS_HOOK_LEAVE(); + return r; + } + + 
zvfs_ensure_init(); + + if (offset < 0 || len <= 0) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; } + + /* FALLOC_FL_PUNCH_HOLE:打孔,暂不支持 */ + if (mode & FALLOC_FL_PUNCH_HOLE) { + errno = ENOTSUP; + ZVFS_HOOK_LEAVE(); + return -1; + } + + /* FALLOC_FL_KEEP_SIZE:预分配但不改变文件逻辑大小,直接返回 0 */ + if (mode & FALLOC_FL_KEEP_SIZE) { + ZVFS_HOOK_LEAVE(); + return 0; + } + + /* + * 普通 fallocate(mode == 0): + * 确保 [offset, offset+len) 范围内的空间被"分配"。 + * zvfs 的语义:把 logical_size 扩展到 max(logical_size, offset+len)。 + * 不提前 blob_resize,因为 SPDK cluster 按写入时分配更高效。 + */ + uint64_t new_end = (uint64_t)offset + (uint64_t)len; + + pthread_mutex_lock(&of->inode->mu); + if (new_end > of->inode->logical_size) + inode_update_size(of->inode, fd, new_end); + pthread_mutex_unlock(&of->inode->mu); + + ZVFS_HOOK_LEAVE(); + return 0; +} + +/* ------------------------------------------------------------------ */ +/* posix_fallocate */ +/* ------------------------------------------------------------------ */ + +int +posix_fallocate(int fd, off_t offset, off_t len) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of = NULL; + if (!ZVFS_IN_HOOK()) { + pthread_mutex_lock(&g_fs.fd_mu); + of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + } + + if (!of) { + int r = real_posix_fallocate(fd, offset, len); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + /* + * posix_fallocate 不接受 mode 参数,语义等价于 fallocate(fd, 0, ...)。 + * 注意:posix_fallocate 出错时返回错误码(正值),不设置 errno。 + */ + if (offset < 0 || len <= 0) { ZVFS_HOOK_LEAVE(); return EINVAL; } + + uint64_t new_end = (uint64_t)offset + (uint64_t)len; + + pthread_mutex_lock(&of->inode->mu); + if (new_end > of->inode->logical_size) + inode_update_size(of->inode, fd, new_end); + pthread_mutex_unlock(&of->inode->mu); + + ZVFS_HOOK_LEAVE(); + return 0; +} diff --git a/src/hook/zvfs_hook_seek.h b/src/hook/zvfs_hook_seek.h new file mode 100644 index 0000000..5aa9819 --- /dev/null +++ b/src/hook/zvfs_hook_seek.h @@ -0,0 +1,33 @@ +#ifndef 
__ZVFS_HOOK_SEEK_H__ +#define __ZVFS_HOOK_SEEK_H__ + +#include +#include + +/* + * lseek:更新 of->offset(非 O_APPEND fd)。 + * + * truncate / ftruncate: + * 更新 inode->logical_size,同步 st_size(ftruncate 到真实 fd), + * 若 new_size < old_size,截断对 blob 的写入范围(blob_resize)。 + * + * fallocate / posix_fallocate: + * zvfs 无"空洞"概念,blob 按需增长。 + * 对 zvfs fd,fallocate 只更新 logical_size(预占逻辑空间), + * 不调用 blob_resize(避免提前分配 SPDK cluster)。 + * FALLOC_FL_KEEP_SIZE 模式:不改 logical_size,直接返回 0。 + * FALLOC_FL_PUNCH_HOLE:暂不支持,返回 ENOTSUP。 + */ + +off_t lseek(int fd, off_t offset, int whence); +off_t lseek64(int fd, off_t offset, int whence); + +int truncate(const char *path, off_t length); +int truncate64(const char *path, off_t length); +int ftruncate(int fd, off_t length); +int ftruncate64(int fd, off_t length); + +int fallocate(int fd, int mode, off_t offset, off_t len); +int posix_fallocate(int fd, off_t offset, off_t len); + +#endif // __ZVFS_HOOK_SEEK_H__ \ No newline at end of file diff --git a/src/hook/zvfs_hook_stat.c b/src/hook/zvfs_hook_stat.c new file mode 100644 index 0000000..70be1b5 --- /dev/null +++ b/src/hook/zvfs_hook_stat.c @@ -0,0 +1,404 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "zvfs_hook_stat.h" +#include "zvfs_hook_init.h" +#include "zvfs_hook_reentrant.h" +#include "fs/zvfs.h" +#include "fs/zvfs_inode.h" +#include "fs/zvfs_open_file.h" +#include "fs/zvfs_path_entry.h" + +#include +#include +#include +#include +#include + +/* ------------------------------------------------------------------ */ +/* 内部:用 inode 覆盖 stat 结构体的 zvfs 相关字段 */ +/* ------------------------------------------------------------------ */ + +static void +patch_stat(struct stat *st, struct zvfs_inode *inode) +{ + pthread_mutex_lock(&inode->mu); + st->st_size = (off_t)inode->logical_size; + st->st_atime = inode->atime; + st->st_mtime = inode->mtime; + pthread_mutex_unlock(&inode->mu); + + /* + * st_blocks:以 512 字节为单位的"实际占用块数"。 + * zvfs 数据在 SPDK,真实文件几乎为空(只有 xattr), + * 按 logical_size 
估算,给上层一个合理的值。 + * (logical_size + 511) / 512 向上取整。 + */ + st->st_blocks = (blkcnt_t)((st->st_size + 511) / 512); +} + +static void +patch_stat64(struct stat64 *st, struct zvfs_inode *inode) +{ + pthread_mutex_lock(&inode->mu); + st->st_size = (off64_t)inode->logical_size; + st->st_atime = inode->atime; + st->st_mtime = inode->mtime; + pthread_mutex_unlock(&inode->mu); + + st->st_blocks = (blkcnt_t)((st->st_size + 511) / 512); +} + +/* ------------------------------------------------------------------ */ +/* 内部:按路径找 inode(先查缓存,缓存未命中则检查 xattr) */ +/* ------------------------------------------------------------------ */ + +/* + * zvfs_inode_by_path + * + * 返回路径对应的 inode 指针(不增加 ref_count,调用方只读使用)。 + * 若路径不是 zvfs 文件(无 xattr)返回 NULL。 + * + * 注意:返回的指针仅在持有 path_mu / inode_mu 之外使用时有效, + * 调用方需在使用期间持有 inode->mu 或确保文件未被 close。 + * 对 stat 路径(只读 logical_size/atime/mtime), + * 短暂持有 inode->mu 即可,无需长期持有。 + */ +static struct zvfs_inode * +zvfs_inode_by_path(const char *path) +{ + /* 1. 先查 path_cache */ + pthread_mutex_lock(&g_fs.path_mu); + struct zvfs_path_entry *pe = path_cache_lookup(path); + struct zvfs_inode *inode = pe ? pe->inode : NULL; + pthread_mutex_unlock(&g_fs.path_mu); + + if (inode) + return inode; + + /* 2. path_cache 未命中:检查 xattr 判断是否是 zvfs 文件 */ + uint64_t blob_id = 0; + int tmp_fd = real_open(path, O_RDONLY); + if (tmp_fd < 0) + return NULL; + + int has_xattr = (zvfs_xattr_read_blob_id(tmp_fd, &blob_id) == 0); + real_close(tmp_fd); + + if (!has_xattr) + return NULL; + + /* 3. 
查 inode_table(文件被另一个 fd 打开过) */ + pthread_mutex_lock(&g_fs.inode_mu); + inode = inode_lookup(blob_id); + pthread_mutex_unlock(&g_fs.inode_mu); + + return inode; /* 可能仍为 NULL(从未打开过,纯 stat 调用) */ +} + +/* ------------------------------------------------------------------ */ +/* stat */ +/* ------------------------------------------------------------------ */ + +int +stat(const char *path, struct stat *buf) +{ + ZVFS_HOOK_ENTER(); + + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { + int r = real_stat(path, buf); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + /* 先透传,拿到完整 stat(mode、ino、dev、nlink 等) */ + if (real_stat(path, buf) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + struct zvfs_inode *inode = zvfs_inode_by_path(path); + if (inode) + patch_stat(buf, inode); + /* + * inode 为 NULL:文件存在于 FS 但从未被 zvfs open, + * 此时 st_size 来自真实文件(接近 0), + * 这是合理的降级行为(文件尚未被写入 SPDK)。 + */ + + ZVFS_HOOK_LEAVE(); + return 0; +} + +int +stat64(const char *path, struct stat64 *buf) +{ + ZVFS_HOOK_ENTER(); + + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { + int r = real_stat64(path, buf); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + if (real_stat64(path, buf) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + struct zvfs_inode *inode = zvfs_inode_by_path(path); + if (inode) + patch_stat64(buf, inode); + + ZVFS_HOOK_LEAVE(); + return 0; +} + +/* ------------------------------------------------------------------ */ +/* fstat(最高频,pg 每次 read 前都调) */ +/* ------------------------------------------------------------------ */ + +int +fstat(int fd, struct stat *buf) +{ + ZVFS_HOOK_ENTER(); + + /* 先透传:拿到 mode/ino/dev/nlink/blksize 等 */ + if (real_fstat(fd, buf) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + if (ZVFS_IN_HOOK()) { + ZVFS_HOOK_LEAVE(); + return 0; + } + + pthread_mutex_lock(&g_fs.fd_mu); + struct zvfs_open_file *of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + + if (of) { + zvfs_ensure_init(); + patch_stat(buf, of->inode); + 
} + + ZVFS_HOOK_LEAVE(); + return 0; +} + +int +fstat64(int fd, struct stat64 *buf) +{ + ZVFS_HOOK_ENTER(); + + if (real_fstat64(fd, buf) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + if (ZVFS_IN_HOOK()) { + ZVFS_HOOK_LEAVE(); + return 0; + } + + pthread_mutex_lock(&g_fs.fd_mu); + struct zvfs_open_file *of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + + if (of) { + zvfs_ensure_init(); + patch_stat64(buf, of->inode); + } + + ZVFS_HOOK_LEAVE(); + return 0; +} + +/* ------------------------------------------------------------------ */ +/* lstat(symlink 不穿透;zvfs 不用 symlink,逻辑与 stat 相同) */ +/* ------------------------------------------------------------------ */ + +int +lstat(const char *path, struct stat *buf) +{ + ZVFS_HOOK_ENTER(); + + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { + int r = real_lstat(path, buf); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + if (real_lstat(path, buf) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + struct zvfs_inode *inode = zvfs_inode_by_path(path); + if (inode) + patch_stat(buf, inode); + + ZVFS_HOOK_LEAVE(); + return 0; +} + +int +lstat64(const char *path, struct stat64 *buf) +{ + ZVFS_HOOK_ENTER(); + + if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) { + int r = real_lstat64(path, buf); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + if (real_lstat64(path, buf) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + struct zvfs_inode *inode = zvfs_inode_by_path(path); + if (inode) + patch_stat64(buf, inode); + + ZVFS_HOOK_LEAVE(); + return 0; +} + +/* ------------------------------------------------------------------ */ +/* fstatat / fstatat64 */ +/* ------------------------------------------------------------------ */ + +int +fstatat(int dirfd, const char *path, struct stat *buf, int flags) +{ + ZVFS_HOOK_ENTER(); + + char abspath[PATH_MAX]; + int is_zvfs = 0; + + if (!ZVFS_IN_HOOK()) { + if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) == 0) + is_zvfs = 
zvfs_is_zvfs_path(abspath); + } + + if (real_fstatat(dirfd, path, buf, flags) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + if (is_zvfs) { + zvfs_ensure_init(); + struct zvfs_inode *inode = zvfs_inode_by_path(abspath); + if (inode) + patch_stat(buf, inode); + } + + ZVFS_HOOK_LEAVE(); + return 0; +} + +int +fstatat64(int dirfd, const char *path, struct stat64 *buf, int flags) +{ + ZVFS_HOOK_ENTER(); + + char abspath[PATH_MAX]; + int is_zvfs = 0; + + if (!ZVFS_IN_HOOK()) { + if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) == 0) + is_zvfs = zvfs_is_zvfs_path(abspath); + } + + if (real_fstatat64(dirfd, path, buf, flags) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + if (is_zvfs) { + zvfs_ensure_init(); + struct zvfs_inode *inode = zvfs_inode_by_path(abspath); + if (inode) + patch_stat64(buf, inode); + } + + ZVFS_HOOK_LEAVE(); + return 0; +} + +/* ------------------------------------------------------------------ */ +/* statx */ +/* ------------------------------------------------------------------ */ + +int +statx(int dirfd, const char *path, int flags, + unsigned int mask, struct statx *buf) +{ + ZVFS_HOOK_ENTER(); + + if (!real_statx) { + errno = ENOSYS; + ZVFS_HOOK_LEAVE(); + return -1; + } + + char abspath[PATH_MAX]; + int is_zvfs = 0; + + if (!ZVFS_IN_HOOK()) { + if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) == 0) + is_zvfs = zvfs_is_zvfs_path(abspath); + } + + if (real_statx(dirfd, path, flags, mask, buf) < 0) { + ZVFS_HOOK_LEAVE(); + return -1; + } + + if (!is_zvfs) { + ZVFS_HOOK_LEAVE(); + return 0; + } + + zvfs_ensure_init(); + + /* statx 用 stx_mask 标记哪些字段有效,覆盖 size/atime/mtime */ + struct zvfs_inode *inode = zvfs_inode_by_path(abspath); + if (inode) { + pthread_mutex_lock(&inode->mu); + + if (mask & STATX_SIZE) { + buf->stx_size = inode->logical_size; + buf->stx_mask |= STATX_SIZE; + /* stx_blocks 以 512 字节为单位 */ + buf->stx_blocks = (inode->logical_size + 511) / 512; + buf->stx_mask |= STATX_BLOCKS; + } + if (mask & 
STATX_ATIME) { + buf->stx_atime.tv_sec = inode->atime; + buf->stx_atime.tv_nsec = 0; + buf->stx_mask |= STATX_ATIME; + } + if (mask & STATX_MTIME) { + buf->stx_mtime.tv_sec = inode->mtime; + buf->stx_mtime.tv_nsec = 0; + buf->stx_mask |= STATX_MTIME; + } + + pthread_mutex_unlock(&inode->mu); + } + + ZVFS_HOOK_LEAVE(); + return 0; +} diff --git a/src/hook/zvfs_hook_stat.h b/src/hook/zvfs_hook_stat.h new file mode 100644 index 0000000..66962c3 --- /dev/null +++ b/src/hook/zvfs_hook_stat.h @@ -0,0 +1,35 @@ +#ifndef __ZVFS_HOOK_STAT_H__ +#define __ZVFS_HOOK_STAT_H__ + +#include +#include + +/* + * Hooks for the stat family. + * + * Core strategy: + * For zvfs files, call through to real_stat* to obtain most fields + * (ino, dev, nlink, mode, uid, gid, blksize, ...), + * then override only the following fields: + * st_size ← inode->logical_size + * st_atime ← inode->atime + * st_mtime ← inode->mtime + * st_blocks ← (st_size + 511) / 512, estimated from logical_size + * (the real file is nearly empty because it only carries an xattr). + * Upper layers (postgres/rocksdb) use st_size to determine the file size; it is the key field. + * + * For non-zvfs files: pass through unchanged. + */ + +int stat(const char *path, struct stat *buf); +int stat64(const char *path, struct stat64 *buf); +int fstat(int fd, struct stat *buf); +int fstat64(int fd, struct stat64 *buf); +int lstat(const char *path, struct stat *buf); +int lstat64(const char *path, struct stat64 *buf); +int fstatat(int dirfd, const char *path, struct stat *buf, int flags); +int fstatat64(int dirfd, const char *path, struct stat64 *buf, int flags); +int statx(int dirfd, const char *path, int flags, + unsigned int mask, struct statx *buf); + +#endif // __ZVFS_HOOK_STAT_H__ \ No newline at end of file diff --git a/src/hook/zvfs_hook_sync.c b/src/hook/zvfs_hook_sync.c new file mode 100644 index 0000000..43e5a05 --- /dev/null +++ b/src/hook/zvfs_hook_sync.c @@ -0,0 +1,122 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "zvfs_hook_sync.h" +#include "zvfs_hook_init.h" +#include "zvfs_hook_reentrant.h" +#include "fs/zvfs.h" +#include "fs/zvfs_open_file.h" +#include "spdk_engine/io_engine.h" + +#include +#include + +/*
------------------------------------------------------------------ */ +/* fsync */ +/* ------------------------------------------------------------------ */ + +int +fsync(int fd) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of = NULL; + if (!ZVFS_IN_HOOK()) { + pthread_mutex_lock(&g_fs.fd_mu); + of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + } + + if (!of) { + int r = real_fsync(fd); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + /* + * zvfs 无写缓冲区,数据已在 blob_write 时落到 SPDK 存储。 + * 调用 blob_sync_md 确保 blob 元数据(size 等)持久化。 + */ + int r = blob_sync_md(of->handle); + if (r < 0) errno = EIO; + + ZVFS_HOOK_LEAVE(); + return (r < 0) ? -1 : 0; +} + +/* ------------------------------------------------------------------ */ +/* fdatasync */ +/* ------------------------------------------------------------------ */ + +int +fdatasync(int fd) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of = NULL; + if (!ZVFS_IN_HOOK()) { + pthread_mutex_lock(&g_fs.fd_mu); + of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + } + + if (!of) { + int r = real_fdatasync(fd); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + /* + * fdatasync 只保证数据持久化,不要求元数据(atime 等)同步。 + * 对 zvfs:数据已无缓冲,blob_sync_md 同步 size 元数据即可。 + * 与 fsync 实现相同——如果将来区分数据/元数据可在此分叉。 + */ + int r = blob_sync_md(of->handle); + if (r < 0) errno = EIO; + + ZVFS_HOOK_LEAVE(); + return (r < 0) ? -1 : 0; +} + +/* ------------------------------------------------------------------ */ +/* sync_file_range */ +/* ------------------------------------------------------------------ */ + +int +sync_file_range(int fd, off_t offset, off_t nbytes, unsigned int flags) +{ + ZVFS_HOOK_ENTER(); + + struct zvfs_open_file *of = NULL; + if (!ZVFS_IN_HOOK()) { + pthread_mutex_lock(&g_fs.fd_mu); + of = openfile_lookup(fd); + pthread_mutex_unlock(&g_fs.fd_mu); + } + + if (!of) { + int r = real_sync_file_range + ? 
real_sync_file_range(fd, offset, nbytes, flags) + : (errno = ENOSYS, -1); + ZVFS_HOOK_LEAVE(); + return r; + } + + zvfs_ensure_init(); + + /* + * The PostgreSQL checkpointer uses this call to flush dirty pages by range. + * zvfs has no page cache and data is written through, so return 0 directly. + * TODO: argument validation matching the kernel is NOT implemented yet: + * offset < 0 or nbytes < 0 should fail with EINVAL, + * illegal bits in flags should fail with EINVAL. + */ + (void)offset; (void)nbytes; (void)flags; + + ZVFS_HOOK_LEAVE(); + return 0; +} diff --git a/src/hook/zvfs_hook_sync.h b/src/hook/zvfs_hook_sync.h new file mode 100644 index 0000000..e7b2a5d --- /dev/null +++ b/src/hook/zvfs_hook_sync.h @@ -0,0 +1,24 @@ +#ifndef __ZVFS_HOOK_SYNC_H__ +#define __ZVFS_HOOK_SYNC_H__ + +#include + +/* + * zvfs 无写缓冲区:所有 blob_write 成功即代表数据已落到 SPDK 管理的存储。 + * + * fsync / fdatasync: + * 对 zvfs fd 调用 blob_sync_md 同步 blob 元数据(size 等), + * 然后返回 0。不需要 flush 数据缓冲区。 + * 非 zvfs fd 透传。 + * + * sync_file_range: + * PostgreSQL checkpointer 按范围刷脏页。 + * zvfs 无页缓存,直接返回 0。 + * 非 zvfs fd 透传。 + */ + +int fsync(int fd); +int fdatasync(int fd); +int sync_file_range(int fd, off_t offset, off_t nbytes, unsigned int flags); + +#endif // __ZVFS_HOOK_SYNC_H__ \ No newline at end of file diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..e69de29 diff --git a/src/spdk_engine/io_engine.c b/src/spdk_engine/io_engine.c new file mode 100644 index 0000000..ec470c2 --- /dev/null +++ b/src/spdk_engine/io_engine.c @@ -0,0 +1,812 @@ +#include "spdk_engine/io_engine.h" +#include "config.h" +#include "common/utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct zvfs_spdk_io_engine g_engine = {0}; +static int g_engine_init_rc = -EAGAIN; +static pthread_mutex_t g_super_blob_mutex = PTHREAD_MUTEX_INITIALIZER; +static spdk_blob_id g_super_blob_id_cache = SPDK_BLOBID_INVALID; + +static __thread struct zvfs_tls_ctx tls = {0}; + +// 初始化操作上下文 +struct json_load_ctx { + bool done; + int rc; +}; + +struct bs_init_ctx { + bool done; + int rc; + struct spdk_blob_store *bs; +}; + +// metadata 操作通用上下文 +struct
md_op_ctx { + void (*fn)(struct md_op_ctx *ctx); + volatile bool done; + int rc; + // op-specific fields + union { + struct { // for create + uint64_t size_hint; + spdk_blob_id blob_id; + } create; + struct { // for open + spdk_blob_id blob_id; + struct spdk_blob *blob; + } open; + struct { // for resize/sync/close + struct zvfs_blob_handle *handle; + uint64_t new_size; // for resize + } handle_op; + struct { // for delete + spdk_blob_id blob_id; + } delete; + struct { // for get/set super + spdk_blob_id blob_id; + } super; + }; + char *op_name; +}; + +// IO completion 上下文 +struct io_completion_ctx { + bool done; + int rc; +}; + +// metadata poller 线程函数 +static void *md_poller_fn(void *arg) { + spdk_set_thread(g_engine.md_thread); + while (true) { + spdk_thread_poll(g_engine.md_thread, 0, 0); + usleep(1000); + } + return NULL; +} + +// 前向声明 +static struct spdk_io_channel *get_current_channel(void); +static int dispatch_md_op(struct md_op_ctx *ctx); +static int dispatch_md_op_quiet(struct md_op_ctx *ctx); +static void md_op_cb(void *arg); +static int open_bdev_and_init_bs(const char *bdev_name); +static int load_json_config(void); +static int ensure_engine_ready(const char *op); + +// callbacks +static void json_app_load_done(int rc, void *arg); +static void zvfs_spdk_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx); +static void bs_init_cb(void *arg, struct spdk_blob_store *bs, int bserrno); +static void blob_create_cb(void *arg, spdk_blob_id blobid, int rc); +static void blob_open_cb(void *arg, struct spdk_blob *blob, int rc); +static void blob_resize_cb(void *arg, int rc); +static void blob_sync_md_cb(void *arg, int rc); +static void blob_close_cb(void *arg, int rc); +static void blob_delete_cb(void *arg, int rc); +static void io_completion_cb(void *arg, int rc); +static void blob_get_super_cb(void *arg, spdk_blob_id blobid, int rc); +static void blob_set_super_cb(void *arg, int rc); + +// op functions on metadata +static
void blob_create_on_md(struct md_op_ctx *ctx); +static void blob_open_on_md(struct md_op_ctx *ctx); +static void blob_resize_on_md(struct md_op_ctx *ctx); +static void blob_sync_md_on_md(struct md_op_ctx *ctx); +static void blob_close_on_md(struct md_op_ctx *ctx); +static void blob_delete_on_md(struct md_op_ctx *ctx); +static void blob_get_super_on_md(struct md_op_ctx *ctx); +static void blob_set_super_on_md(struct md_op_ctx *ctx); + +__attribute__((constructor)) static void preload_init(void) { + const char *auto_init = getenv("ZVFS_AUTO_INIT"); + if (!auto_init || strcmp(auto_init, "1") != 0) { + return; + } + + printf("\n\n auto init \n\n"); + const char *bdev_name = getenv("SPDK_BDEV_NAME") ? getenv("SPDK_BDEV_NAME") : ZVFS_BDEV; + g_engine_init_rc = io_engine_init(bdev_name); + if (g_engine_init_rc != 0) { + SPDK_ERRLOG("io_engine_init failed in constructor: %d\n", g_engine_init_rc); + } +} + +static int wait_done(bool *done_ptr, int *rc_ptr, const char *op) { + int iter = 0; + while (!*done_ptr) { + if (tls.thread) { + spdk_thread_poll(tls.thread, 0, 0); + }else{ + SPDK_ERRLOG("not init tls.thread\n"); + return -EBADE; + } + if (++iter > WAITER_MAX_TIME) { + SPDK_ERRLOG("%s timeout\n", op); + return -ETIMEDOUT; + } + } + + if (*rc_ptr != 0) { + SPDK_ERRLOG("%s failed in callback: %d\n", op, *rc_ptr); + return *rc_ptr; + } + return 0; +} + +static int wait_done_volatile(volatile bool *done_ptr, int *rc_ptr, const char *op) { + int iter = 0; + while (!*done_ptr) { + if (tls.thread) { + spdk_thread_poll(tls.thread, 0, 0); + }else{ + SPDK_ERRLOG("not init tls.thread\n"); + return -EBADE; + } + if (++iter > WAITER_MAX_TIME) { + SPDK_ERRLOG("%s timeout\n", op); + return -ETIMEDOUT; + } + } + + if (*rc_ptr != 0) { + SPDK_ERRLOG("%s failed in callback: %d\n", op, *rc_ptr); + return *rc_ptr; + } + return 0; +} + +// no rc error +static int wait_done_volatile_quiet(volatile bool *done_ptr, int *rc_ptr, const char *op) { + int iter = 0; + while (!*done_ptr) { + if 
(tls.thread) { + spdk_thread_poll(tls.thread, 0, 0); + } else { + SPDK_ERRLOG("not init tls.thread\n"); + return -EBADE; + } + if (++iter > WAITER_MAX_TIME) { + SPDK_ERRLOG("%s timeout\n", op); + return -ETIMEDOUT; + } + } + + return *rc_ptr; +} + +int io_engine_init(const char *bdev_name) { + if (g_engine_init_rc == 0 && g_engine.bs != NULL && g_engine.md_thread != NULL) { + return 0; + } + + struct spdk_env_opts env_opts; + spdk_env_opts_init(&env_opts); + env_opts.name = "zvfs"; + + + if (spdk_env_init(&env_opts) != 0) { + SPDK_ERRLOG("spdk_env_init failed\n"); + g_engine_init_rc = -1; + return g_engine_init_rc; + } + + spdk_log_set_print_level(SPDK_LOG_NOTICE); + spdk_log_set_level(SPDK_LOG_NOTICE); + spdk_log_open(NULL); + + if (spdk_thread_lib_init(NULL, 0) != 0) { + SPDK_ERRLOG("spdk_thread_lib_init failed\n"); + g_engine_init_rc = -1; + return g_engine_init_rc; + } + + // 为主线程 lazy init(constructor 在主线程跑) + tls.thread = spdk_thread_create("main_thread", NULL); + if (!tls.thread) { + SPDK_ERRLOG("create main_thread failed\n"); + g_engine_init_rc = -1; + return g_engine_init_rc; + } + spdk_set_thread(tls.thread); + + if (load_json_config() != 0) { + SPDK_ERRLOG("Failed to load SPDK config\n"); + g_engine_init_rc = -1; + return g_engine_init_rc; + } + + /** + * 这里是因为要让一个线程专门负责poll + */ + // 创建 md_thread + g_engine.md_thread = spdk_thread_create("md_thread", NULL); + if (!g_engine.md_thread) { + SPDK_ERRLOG("create md_thread failed\n"); + g_engine_init_rc = -1; + return g_engine_init_rc; + } + + // 起专用 poller pthread for md_thread + pthread_t md_poller_tid; + if (pthread_create(&md_poller_tid, NULL, md_poller_fn, NULL) != 0) { + SPDK_ERRLOG("pthread_create for md_poller failed\n"); + g_engine_init_rc = -1; + return g_engine_init_rc; + } + if (pthread_detach(md_poller_tid) != 0) { + SPDK_ERRLOG("pthread_detach for md_poller failed\n"); + g_engine_init_rc = -1; + return g_engine_init_rc; + } + + // init bdev/bs + g_super_blob_id_cache = SPDK_BLOBID_INVALID; + int 
rc = open_bdev_and_init_bs(bdev_name); + if (rc != 0) { + g_engine_init_rc = rc; + return rc; + } + g_engine_init_rc = 0; + return g_engine_init_rc; +} + +static int load_json_config(void) { + const char *path = getenv("SPDK_JSON_CONFIG"); + if(!path) path = SPDK_JSON_PATH; + + + struct json_load_ctx ctx = { + .done = false, + .rc = 0 + }; + spdk_subsystem_init_from_json_config(path, SPDK_DEFAULT_RPC_ADDR, json_app_load_done, + &ctx, true); + return wait_done(&ctx.done, &ctx.rc, "load_json_config"); +} + +// lazy get channel +static struct spdk_io_channel *get_current_channel(void) { + if (ensure_engine_ready("get_current_channel") != 0) { + return NULL; + } + + if (tls.thread) { + spdk_thread_poll(tls.thread, 0, 0); + } + + if (!tls.thread) { + char name[32]; + snprintf(name, sizeof(name), "worker_%lu", pthread_self()); + tls.thread = spdk_thread_create(name, NULL); + if (!tls.thread) { + SPDK_ERRLOG("spdk_thread_create failed\n"); + return NULL; + } + spdk_set_thread(tls.thread); + } + + if (!tls.channel) { + tls.channel = spdk_bs_alloc_io_channel(g_engine.bs); + if (!tls.channel) { + SPDK_ERRLOG("alloc io_channel failed\n"); + return NULL; + } + } + return tls.channel; +} + +// 通用 dispatch md op +static int dispatch_md_op(struct md_op_ctx *ctx) { + int rc = ensure_engine_ready(ctx->op_name ? ctx->op_name : "dispatch_md_op"); + if (rc != 0) { + return rc; + } + + ctx->done = false; + ctx->rc = 0; + + spdk_thread_send_msg(g_engine.md_thread, md_op_cb, ctx); + + return wait_done_volatile(&ctx->done, &ctx->rc, ctx->op_name); +} + +static int dispatch_md_op_quiet(struct md_op_ctx *ctx) { + int rc = ensure_engine_ready(ctx->op_name ? 
ctx->op_name : "dispatch_md_op_quiet"); + if (rc != 0) { + return rc; + } + + ctx->done = false; + ctx->rc = 0; + + spdk_thread_send_msg(g_engine.md_thread, md_op_cb, ctx); + + return wait_done_volatile_quiet(&ctx->done, &ctx->rc, ctx->op_name); +} + +static int ensure_engine_ready(const char *op) { + if (g_engine_init_rc != 0) { + SPDK_ERRLOG("%s: io engine init failed, rc=%d\n", op, g_engine_init_rc); + return g_engine_init_rc; + } + + if (!g_engine.bs || !g_engine.md_thread) { + SPDK_ERRLOG("%s: io engine not ready (bs=%p, md_thread=%p)\n", + op, (void *)g_engine.bs, (void *)g_engine.md_thread); + return -EIO; + } + + return 0; +} + +static void md_op_cb(void *arg) { + struct md_op_ctx *ctx = arg; + ctx->fn(ctx); +} + +void json_app_load_done(int rc, void *arg) { + struct json_load_ctx* ctx = (struct json_load_ctx*)arg; + ctx->done = true; + ctx->rc = rc; +} + +// bdev open + bs init +static void zvfs_spdk_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, + void *event_ctx) { + // 后续加日志或处理 + switch (type) { + case SPDK_BDEV_EVENT_REMOVE: + SPDK_NOTICELOG("bdev removed: %s\n", spdk_bdev_get_name(bdev)); + break; + default: + break; + } +} + +static void bs_init_cb(void *arg, struct spdk_blob_store *bs, int bserrno) { + struct bs_init_ctx *ctx = (struct bs_init_ctx *)arg; + ctx->rc = bserrno; + ctx->bs = bs; + ctx->done = true; +} + +static int open_bdev_and_init_bs(const char *bdev_name) { + SPDK_NOTICELOG("open_bdev_and_init_bs\n"); + struct spdk_bs_dev *bs_dev = NULL; + int rc = spdk_bdev_create_bs_dev_ext(bdev_name, zvfs_spdk_bdev_event_cb, NULL, &bs_dev); + if (rc != 0) { + SPDK_ERRLOG("spdk_bdev_create_bs_dev_ext failed: %d\n", rc); + return rc; + } + g_engine.bs_dev = bs_dev; + + struct bs_init_ctx ctx = { + .done = false, + .rc = 0, + .bs = NULL + }; + + /* 优先加载已有 blobstore;失败时回退到 init。 */ + spdk_bs_load(bs_dev, NULL, bs_init_cb, &ctx); + rc = wait_done(&ctx.done, &ctx.rc, "bs_load"); + if (rc != 0) { + SPDK_NOTICELOG("spdk_bs_load 
failed (%d), fallback to spdk_bs_init\n", rc); + + /* + * 注意:spdk_bs_load 失败路径会销毁传入的 dev。 + * 这里必须重新 create 一个新的 bs_dev,不能复用旧指针。 + */ + bs_dev = NULL; + rc = spdk_bdev_create_bs_dev_ext(bdev_name, zvfs_spdk_bdev_event_cb, NULL, &bs_dev); + if (rc != 0) { + SPDK_ERRLOG("spdk_bdev_create_bs_dev_ext(for init fallback) failed: %d\n", rc); + g_engine.bs_dev = NULL; + return rc; + } + g_engine.bs_dev = bs_dev; + + ctx.done = false; + ctx.rc = 0; + ctx.bs = NULL; + + spdk_bs_init(bs_dev, NULL, bs_init_cb, &ctx); + rc = wait_done(&ctx.done, &ctx.rc, "bs_init"); + if (rc != 0) { + g_engine.bs_dev = NULL; + return rc; + } + } + + g_engine.bs = ctx.bs; + g_engine.io_unit_size = spdk_bs_get_io_unit_size(ctx.bs); + g_engine.cluster_size = spdk_bs_get_cluster_size(ctx.bs); + + SPDK_NOTICELOG("Blobstore initialized successfully on bdev: %s\n", bdev_name); + return 0; +} + +static void blob_get_super_cb(void *arg, spdk_blob_id blobid, int rc) { + struct md_op_ctx *ctx = arg; + ctx->rc = rc; + ctx->super.blob_id = blobid; + ctx->done = true; +} + +static void blob_set_super_cb(void *arg, int rc) { + struct md_op_ctx *ctx = arg; + ctx->rc = rc; + ctx->done = true; +} + +static void blob_get_super_on_md(struct md_op_ctx *ctx) { + spdk_bs_get_super(g_engine.bs, blob_get_super_cb, ctx); +} + +static void blob_set_super_on_md(struct md_op_ctx *ctx) { + spdk_bs_set_super(g_engine.bs, ctx->super.blob_id, blob_set_super_cb, ctx); +} + +static int bs_get_super_id(spdk_blob_id *blob_id) { + struct md_op_ctx ctx = { + .fn = blob_get_super_on_md, + .op_name = "blob get super", + }; + ctx.super.blob_id = SPDK_BLOBID_INVALID; + + int rc = dispatch_md_op_quiet(&ctx); + if (rc != 0) { + return rc; + } + *blob_id = ctx.super.blob_id; + return 0; +} + +static int bs_set_super_id(spdk_blob_id blob_id) { + struct md_op_ctx ctx = { + .fn = blob_set_super_on_md, + .op_name = "blob set super", + }; + ctx.super.blob_id = blob_id; + return dispatch_md_op(&ctx); +} + +struct zvfs_blob_handle 
*blob_get_super(void) { + pthread_mutex_lock(&g_super_blob_mutex); + + if (g_super_blob_id_cache != SPDK_BLOBID_INVALID) { + struct zvfs_blob_handle *cached = blob_open(g_super_blob_id_cache); + if (cached) { + pthread_mutex_unlock(&g_super_blob_mutex); + return cached; + } + g_super_blob_id_cache = SPDK_BLOBID_INVALID; + } + + spdk_blob_id super_id = SPDK_BLOBID_INVALID; + int rc = bs_get_super_id(&super_id); + if (rc == 0 && super_id != SPDK_BLOBID_INVALID) { + g_super_blob_id_cache = super_id; + struct zvfs_blob_handle *existing = blob_open(super_id); + if (!existing) { + g_super_blob_id_cache = SPDK_BLOBID_INVALID; + } + pthread_mutex_unlock(&g_super_blob_mutex); + return existing; + } + if (rc == 0 && super_id == SPDK_BLOBID_INVALID) { + rc = -ENOENT; + } + + if (rc != -ENOENT) { + SPDK_ERRLOG("spdk_bs_get_super failed: %d\n", rc); + pthread_mutex_unlock(&g_super_blob_mutex); + return NULL; + } + + struct zvfs_blob_handle *created = blob_create(0); + if (!created) { + pthread_mutex_unlock(&g_super_blob_mutex); + return NULL; + } + + rc = bs_set_super_id(created->id); + if (rc != 0) { + spdk_blob_id created_id = created->id; + SPDK_ERRLOG("spdk_bs_set_super failed: %d\n", rc); + blob_close(created); + blob_delete(created_id); + pthread_mutex_unlock(&g_super_blob_mutex); + return NULL; + } + + g_super_blob_id_cache = created->id; + pthread_mutex_unlock(&g_super_blob_mutex); + return created; +} + +// blob_create +static void blob_create_cb(void *arg, spdk_blob_id blobid, int rc) { + struct md_op_ctx *ctx = arg; + ctx->rc = rc; + ctx->create.blob_id = blobid; + ctx->done = true; +} + +static void blob_create_on_md(struct md_op_ctx *ctx) { + struct spdk_blob_opts opts; + spdk_blob_opts_init(&opts, sizeof(opts)); + // size_hint 如果需,但 create 不直接 set size,用 resize 后 + spdk_bs_create_blob_ext(g_engine.bs, &opts, blob_create_cb, ctx); +} + +struct zvfs_blob_handle *blob_create(uint64_t size_hint) { + if(size_hint == 0) size_hint = g_engine.cluster_size; + struct 
md_op_ctx ctx = {.fn = blob_create_on_md, .create.size_hint = size_hint, .op_name = "blob create"}; + int rc = dispatch_md_op(&ctx); + if (rc) return NULL; + + struct zvfs_blob_handle *handle = blob_open(ctx.create.blob_id); + if (handle && size_hint > 0) { + rc = blob_resize(handle, size_hint); // 初始 resize + if (rc != 0) { + SPDK_ERRLOG("blob_resize failed after create: %d\n", rc); + blob_close(handle); + return NULL; + } + + rc = blob_sync_md(handle); + if (rc != 0) { + SPDK_ERRLOG("blob_sync_md failed after resize: %d\n", rc); + blob_close(handle); + return NULL; + } + } + return handle; +} + +// blob_open +static void blob_open_cb(void *arg, struct spdk_blob *blob, int rc) { + struct md_op_ctx *ctx = arg; + ctx->rc = rc; + ctx->open.blob = blob; + ctx->done = true; +} + +static void blob_open_on_md(struct md_op_ctx *ctx) { + struct spdk_blob_open_opts opts; + spdk_blob_open_opts_init(&opts, sizeof(opts)); + spdk_bs_open_blob_ext(g_engine.bs, ctx->open.blob_id, &opts, blob_open_cb, ctx); +} + +struct zvfs_blob_handle *blob_open(uint64_t blob_id) { + struct md_op_ctx ctx = {.fn = blob_open_on_md, .open.blob_id = blob_id, .op_name = "blob open"}; + int rc = dispatch_md_op(&ctx); + if (rc) return NULL; + + struct zvfs_blob_handle *handle = malloc(sizeof(*handle)); + if (!handle) return NULL; + + handle->id = blob_id; + handle->blob = ctx.open.blob; + handle->size = spdk_blob_get_num_clusters(handle->blob) * g_engine.cluster_size; + + // 预分配固定大小的 DMA buf,后续所有 IO 都经过这块缓存,避免每次 IO 动态申请 + // 必须用 spdk_dma_malloc 保证地址对齐到 io_unit_size + handle->dma_buf_size = ZVFS_DMA_BUF_SIZE; + handle->dma_buf = spdk_dma_malloc(ZVFS_DMA_BUF_SIZE, g_engine.io_unit_size, NULL); + if (!handle->dma_buf) { + SPDK_ERRLOG("spdk_dma_malloc failed for blob %lu\n", blob_id); + free(handle); + return NULL; + } + + return handle; +} + +// blob_write +static void io_completion_cb(void *arg, int rc) { + struct io_completion_ctx *ctx = arg; + ctx->rc = rc; + ctx->done = true; +} + +int 
blob_write(struct zvfs_blob_handle *handle, uint64_t offset, const void *buf, size_t len) { + if (tls.thread) { + spdk_thread_poll(tls.thread, 0, 0); + } + + struct spdk_io_channel *ch = get_current_channel(); + if (!ch) return -1; + if (len == 0) return 0; + + // 越界检查 + if (offset + len > handle->size) { + SPDK_ERRLOG("blob_write out of range: offset=%lu len=%zu blob_size=%lu\n", + offset, len, handle->size); + return -ERANGE; + } + + // 计算对齐后的 IO 范围和 dma_buf 内偏移 + uint64_t lba_off = 0; + uint64_t lba_len = 0; + uint32_t buf_off = 0; + int rc = zvfs_calc_io_units(offset, len, g_engine.io_unit_size, &lba_off, &lba_len, &buf_off); + if (rc != 0) { + SPDK_ERRLOG("blob_write calc_io_units failed: %d\n", rc); + return rc; + } + + size_t aligned_bytes = lba_len * g_engine.io_unit_size; + if (aligned_bytes > ZVFS_DMA_BUF_SIZE) { + SPDK_ERRLOG("blob_write aligned_bytes=%zu exceeds ZVFS_DMA_BUF_SIZE\n", aligned_bytes); + return -ENOSPC; + } + + struct io_completion_ctx io_ctx = {.done = false, .rc = 0}; + + spdk_blob_io_read(handle->blob, ch, handle->dma_buf, lba_off, lba_len, + io_completion_cb, &io_ctx); + + + rc = wait_done(&io_ctx.done, &io_ctx.rc, "io_write(read phase)"); + if (rc != 0) return rc; + + memcpy((uint8_t *)handle->dma_buf + buf_off, buf, len); + io_ctx.done = false; + io_ctx.rc = 0; + + spdk_blob_io_write(handle->blob, ch, handle->dma_buf, lba_off, lba_len, + io_completion_cb, &io_ctx); + rc = wait_done(&io_ctx.done, &io_ctx.rc, "io_write(write phase)"); + if (rc != 0) return rc; + + return io_ctx.rc; +} + +// blob_read 类似 +int blob_read(struct zvfs_blob_handle *handle, uint64_t offset, void *buf, size_t len) { + if (tls.thread) { + spdk_thread_poll(tls.thread, 0, 0); + } + + struct spdk_io_channel *ch = get_current_channel(); + if (!ch) return -1; + if (len == 0) return 0; + + // 越界检查 + if (offset + len > handle->size) { + SPDK_ERRLOG("blob_read out of range: offset=%lu len=%zu blob_size=%lu\n", + offset, len, handle->size); + return -ERANGE; + } + + + 
// 计算对齐后的 IO 范围和 dma_buf 内偏移 + uint64_t lba_off = 0; + uint64_t lba_len = 0; + uint32_t buf_off = 0; + int rc = zvfs_calc_io_units(offset, len, g_engine.io_unit_size, &lba_off, &lba_len, &buf_off); + if (rc != 0) { + SPDK_ERRLOG("io_read offset/len not aligned to io_unit_size=%lu\n", g_engine.io_unit_size); + return rc; + } + + // 读入对齐范围到 dma_buf,再从正确偏移处截取到用户 buf + size_t aligned_bytes = lba_len * g_engine.io_unit_size; + if (aligned_bytes > ZVFS_DMA_BUF_SIZE) { + SPDK_ERRLOG("blob_read aligned_bytes=%zu exceeds ZVFS_DMA_BUF_SIZE\n", aligned_bytes); + return -ENOSPC; + } + + struct io_completion_ctx io_ctx = {.done = false, .rc = 0}; + + spdk_blob_io_read(handle->blob, ch, handle->dma_buf, lba_off, lba_len, + io_completion_cb, &io_ctx); + + rc = wait_done(&io_ctx.done, &io_ctx.rc, "io_read"); + if (rc != 0) return rc; + + memcpy(buf, (uint8_t *)handle->dma_buf + buf_off, len); + return io_ctx.rc; +} + +// blob_resize +static void blob_resize_cb(void *arg, int rc) { + struct md_op_ctx *ctx = arg; + ctx->rc = rc; + ctx->done = true; +} + +static void blob_resize_on_md(struct md_op_ctx *ctx) { + uint64_t new_clusters = 0; + uint64_t cluster_size = g_engine.cluster_size; + int rc = zvfs_calc_ceil_units(ctx->handle_op.new_size, cluster_size, &new_clusters); + if (rc != 0) { + ctx->rc = rc; + ctx->done = true; + return; + } + spdk_blob_resize(ctx->handle_op.handle->blob, new_clusters, blob_resize_cb, ctx); +} + +int blob_resize(struct zvfs_blob_handle *handle, uint64_t new_size) { + struct md_op_ctx ctx = {.fn = blob_resize_on_md, .op_name = "blob resize"}; + ctx.handle_op.handle = handle; + ctx.handle_op.new_size = new_size; + int rc = dispatch_md_op(&ctx); + if (rc == 0) { + uint64_t new_clusters = 0; + zvfs_calc_ceil_units(new_size, g_engine.cluster_size, &new_clusters); + handle->size = new_clusters * g_engine.cluster_size; + } + return rc; +} + +// blob_sync_md +static void blob_sync_md_cb(void *arg, int rc) { + struct md_op_ctx *ctx = arg; + ctx->rc = rc; + 
ctx->done = true; +} + +static void blob_sync_md_on_md(struct md_op_ctx *ctx) { + spdk_blob_sync_md(ctx->handle_op.handle->blob, blob_sync_md_cb, ctx); +} + +int blob_sync_md(struct zvfs_blob_handle *handle) { + struct md_op_ctx ctx = {.fn = blob_sync_md_on_md, .op_name = "blob sync"}; + ctx.handle_op.handle = handle; + return dispatch_md_op(&ctx); +} + +// blob_close +static void blob_close_cb(void *arg, int rc) { + struct md_op_ctx *ctx = arg; + ctx->rc = rc; + ctx->done = true; +} + +static void blob_close_on_md(struct md_op_ctx *ctx) { + spdk_blob_close(ctx->handle_op.handle->blob, blob_close_cb, ctx); +} + +int blob_close(struct zvfs_blob_handle *handle) { + struct md_op_ctx ctx = {.fn = blob_close_on_md, .op_name = "blob close"}; + ctx.handle_op.handle = handle; + int rc = dispatch_md_op(&ctx); + if (rc == 0) { + spdk_dma_free(handle->dma_buf); + free(handle); + } + return rc; +} + +// blob_delete +static void blob_delete_cb(void *arg, int rc) { + struct md_op_ctx *ctx = arg; + ctx->rc = rc; + ctx->done = true; +} + +static void blob_delete_on_md(struct md_op_ctx *ctx) { + spdk_bs_delete_blob(g_engine.bs, ctx->delete.blob_id, blob_delete_cb, ctx); +} + +int blob_delete(uint64_t blob_id) { + struct md_op_ctx ctx = {.fn = blob_delete_on_md, .op_name = "blob delete"}; + ctx.delete.blob_id = blob_id; + return dispatch_md_op(&ctx); +} diff --git a/src/spdk_engine/io_engine.h b/src/spdk_engine/io_engine.h new file mode 100644 index 0000000..e9c4dfc --- /dev/null +++ b/src/spdk_engine/io_engine.h @@ -0,0 +1,44 @@ +#ifndef __ZVFS_IO_ENGINE_H__ +#define __ZVFS_IO_ENGINE_H__ + +#include +#include +#include + +// blob_handle 结构体:底层 blob 信息,不含文件级 size(上层维护) +typedef struct zvfs_blob_handle { + spdk_blob_id id; + struct spdk_blob *blob; + uint64_t size; + void *dma_buf; + uint64_t dma_buf_size; +} zvfs_blob_handle_t ; + +typedef struct zvfs_spdk_io_engine { + struct spdk_bs_dev *bs_dev; + struct spdk_blob_store *bs; + struct spdk_thread *md_thread; + uint64_t 
io_unit_size; + uint64_t cluster_size; + int reactor_count; + +} zvfs_spdk_io_engine_t; + +typedef struct zvfs_tls_ctx { + struct spdk_thread *thread; + struct spdk_io_channel *channel; +}zvfs_tls_ctx_t; + +int io_engine_init(const char *bdev_name); + +struct zvfs_blob_handle *blob_get_super(void); +struct zvfs_blob_handle *blob_create(uint64_t size_hint); // 创建并 open,返回 handle +struct zvfs_blob_handle *blob_open(uint64_t blob_id); // open 现有 blob,返回 handle +int blob_write(struct zvfs_blob_handle *handle, uint64_t offset, const void *buf, size_t len); +int blob_read(struct zvfs_blob_handle *handle, uint64_t offset, void *buf, size_t len); +int blob_resize(struct zvfs_blob_handle *handle, uint64_t new_size); +int blob_sync_md(struct zvfs_blob_handle *handle); +int blob_close(struct zvfs_blob_handle *handle); // close 这个 handle 的 blob* +int blob_delete(uint64_t blob_id); // delete,整个 blob(不需 handle) + +#endif // __ZVFS_IO_ENGINE_H__ diff --git a/src/zvfsmalloc.json b/src/zvfsmalloc.json new file mode 100755 index 0000000..10ded9d --- /dev/null +++ b/src/zvfsmalloc.json @@ -0,0 +1,17 @@ +{ + "subsystems": [ + { + "subsystem": "bdev", + "config": [ + { + "method": "bdev_malloc_create", + "params": { + "name": "Malloc0", + "num_blocks": 32768, + "block_size": 512 + } + } + ] + } + ] +} diff --git a/zvfs/zvfs.json b/src/zvfsnvme.json similarity index 100% rename from zvfs/zvfs.json rename to src/zvfsnvme.json diff --git a/test/Makefile b/test/Makefile deleted file mode 100755 index 6e01e25..0000000 --- a/test/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -CC ?= gcc -CFLAGS ?= -O2 -Wall -Wextra -std=gnu11 - -SRCS := $(sort $(wildcard test_*.c)) -BIN_DIR ?= bin -BIN_NAMES := $(SRCS:.c=) -BINS := $(addprefix $(BIN_DIR)/,$(BIN_NAMES)) -RUN_DIR ?= /tmp/zvfs-test -RUN_BINS ?= test_basic test_lseek test_dual_open_same_file test_two_files \ - test_single_file_perf test_single_file_random_perf \ - test_single_file_random_noaligned_perf test_write_file test_read_delete_file \ - 
test_phase2_posix - -.PHONY: all clean list run-test - -all: $(BINS) - -$(BIN_DIR): - mkdir -p $@ - -$(BIN_DIR)/%: %.c test_utils.h | $(BIN_DIR) - $(CC) $(CFLAGS) -o $@ $< - -list: - @printf "%s\n" $(BINS) - -run-test: all - @mkdir -p $(RUN_DIR) - @pass=0; fail=0; \ - for t in $(RUN_BINS); do \ - printf "\n[RUN] %s\n" "$$t"; \ - if ./$(BIN_DIR)/$$t $(RUN_DIR); then \ - pass=$$((pass + 1)); \ - else \ - fail=$$((fail + 1)); \ - fi; \ - done; \ - printf "\n=== run-test summary: PASS=%d FAIL=%d ===\n" $$pass $$fail; \ - test $$fail -eq 0 - -clean: - $(RM) $(BINS) - -rmdir $(BIN_DIR) diff --git a/test/test_basic.c b/test/test_basic.c deleted file mode 100755 index 0436980..0000000 --- a/test/test_basic.c +++ /dev/null @@ -1,51 +0,0 @@ -#include "test_utils.h" - -static int test_basic(const char *path) -{ - printf("\n=== test_basic ===\n"); - - printf("open: %s\n", path); - int fd = open(path, O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd < 0) { perror("open"); return 1; } - - const char *msg = "ABCDEFGHIJKL"; - ssize_t w = write(fd, msg, strlen(msg)); - if (w < 0) { perror("write"); return 2; } - printf("write: %zd\n", w); - - const char *msg2 = "MNOPQRSTUVWXYZ"; - ssize_t w2 = write(fd, msg2, strlen(msg2)); - if (w2 < 0) { perror("write"); return 2; } - printf("write: %zd\n", w2); - - close(fd); - - fd = open(path, O_RDONLY); - if (fd < 0) { perror("open R"); return 3; } - - char buf[10]; - memset(buf, 0, sizeof(buf)); - ssize_t r = read(fd, buf, sizeof(buf)); - if (r < 0) { perror("read"); return 4; } - printf("read: %zd bytes: %.*s\n", r, (int)r, buf); - - char buf2[512]; - memset(buf2, 0, sizeof(buf2)); - ssize_t r2 = read(fd, buf2, sizeof(buf2)); - if (r2 < 0) { perror("read"); return 4; } - printf("read: %zd bytes: %.*s\n", r2, (int)r2, buf2); - - close(fd); - - if (unlink(path) != 0) { perror("unlink"); return 5; } - printf("unlink: ok\n"); - return 0; -} - -int main(int argc, char **argv) -{ - char path[PATH_MAX]; - make_path(path, sizeof(path), argc >= 2 ? 
argv[1] : NULL, "file.dat"); - int rc = test_basic(path); - return report_result("test_basic", rc); -} diff --git a/test/test_dual_open_same_file.c b/test/test_dual_open_same_file.c deleted file mode 100755 index df6e9a7..0000000 --- a/test/test_dual_open_same_file.c +++ /dev/null @@ -1,50 +0,0 @@ -#include "test_utils.h" - -static int test_dual_open_same_file(const char *path) -{ - printf("\n=== test_dual_open_same_file ===\n"); - - int fd_init = open(path, O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd_init < 0) { perror("open init"); return 1; } - const char *init = "0123456789"; - if (write(fd_init, init, 10) != 10) { perror("write init"); return 2; } - close(fd_init); - - int fd_w = open(path, O_WRONLY); - if (fd_w < 0) { perror("open W"); return 3; } - - int fd_r = open(path, O_RDONLY); - if (fd_r < 0) { perror("open R"); return 4; } - - printf("fd_w=%d fd_r=%d\n", fd_w, fd_r); - - if (write(fd_w, "HELLO", 5) != 5) { perror("write"); return 5; } - printf("write via fd_w: HELLO (overwrite first 5 bytes)\n"); - - char buf[32] = {0}; - lseek(fd_r, 0, SEEK_SET); - ssize_t r = read(fd_r, buf, sizeof(buf)); - printf("read via fd_r: %zd bytes: %.*s (expect: HELLO56789)\n", r, (int)r, buf); - - lseek(fd_w, 0, SEEK_END); - if (write(fd_w, "!!!", 3) != 3) { perror("write append"); return 6; } - printf("write append via fd_w: !!!\n"); - - lseek(fd_r, 10, SEEK_SET); - memset(buf, 0, sizeof(buf)); - r = read(fd_r, buf, sizeof(buf)); - printf("read appended via fd_r: %zd bytes: %.*s (expect: !!!)\n", r, (int)r, buf); - - close(fd_w); - close(fd_r); - unlink(path); - return 0; -} - -int main(int argc, char **argv) -{ - char path[PATH_MAX]; - make_path(path, sizeof(path), argc >= 2 ? 
argv[1] : NULL, "file.dat"); - int rc = test_dual_open_same_file(path); - return report_result("test_dual_open_same_file", rc); -} diff --git a/test/test_lseek.c b/test/test_lseek.c deleted file mode 100755 index e1d1543..0000000 --- a/test/test_lseek.c +++ /dev/null @@ -1,55 +0,0 @@ -#include "test_utils.h" - -static int test_lseek(const char *path) -{ - printf("\n=== test_lseek ===\n"); - - int fd = open(path, O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd < 0) { perror("open"); return 1; } - - const char *alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - if (write(fd, alpha, 26) != 26) { perror("write"); return 2; } - printf("write 26 bytes: %s\n", alpha); - - off_t pos = lseek(fd, 0, SEEK_SET); - printf("lseek SEEK_SET 0 -> %ld\n", (long)pos); - char buf[32] = {0}; - ssize_t r = read(fd, buf, 5); - printf("read 5 bytes: %.*s (expect: ABCDE)\n", (int)r, buf); - - pos = lseek(fd, 3, SEEK_CUR); - printf("lseek SEEK_CUR +3 -> %ld\n", (long)pos); - memset(buf, 0, sizeof(buf)); - r = read(fd, buf, 5); - printf("read 5 bytes: %.*s (expect: IJKLM)\n", (int)r, buf); - - pos = lseek(fd, -5, SEEK_END); - printf("lseek SEEK_END -5 -> %ld\n", (long)pos); - memset(buf, 0, sizeof(buf)); - r = read(fd, buf, 10); - printf("read %zd bytes: %.*s (expect: VWXYZ)\n", r, (int)r, buf); - - pos = lseek(fd, 30, SEEK_SET); - printf("lseek SEEK_SET 30 -> %ld\n", (long)pos); - if (write(fd, "!", 1) != 1) { perror("write hole"); return 3; } - - lseek(fd, 26, SEEK_SET); - memset(buf, 0xAA, sizeof(buf)); - r = read(fd, buf, 5); - printf("read hole+1: %zd bytes, hole[0]=%02X hole[1]=%02X hole[2]=%02X " - "hole[3]=%02X last='%c' (expect: 00 00 00 00 '!')\n", - r, (unsigned char)buf[0], (unsigned char)buf[1], - (unsigned char)buf[2], (unsigned char)buf[3], buf[4]); - - close(fd); - unlink(path); - return 0; -} - -int main(int argc, char **argv) -{ - char path[PATH_MAX]; - make_path(path, sizeof(path), argc >= 2 ? 
argv[1] : NULL, "file.dat"); - int rc = test_lseek(path); - return report_result("test_lseek", rc); -} diff --git a/test/test_phase2_posix.c b/test/test_phase2_posix.c deleted file mode 100644 index 8c4055c..0000000 --- a/test/test_phase2_posix.c +++ /dev/null @@ -1,98 +0,0 @@ -#include "test_utils.h" -#include - -static int expect_errno(const char *what, int exp) -{ - if (errno != exp) { - fprintf(stderr, "%s: errno=%d expected=%d\n", what, errno, exp); - return -1; - } - return 0; -} - -static int test_phase2(const char *root) -{ - char dir[PATH_MAX]; - char file[PATH_MAX]; - char file2[PATH_MAX]; - struct stat st; - int dfd = -1; - int fd = -1; - - snprintf(dir, sizeof(dir), "%s/p2db", root); - snprintf(file, sizeof(file), "%s/p2db/data.log", root); - snprintf(file2, sizeof(file2), "%s/p2db/data2.log", root); - - (void)unlink(file2); - (void)unlink(file); - (void)rmdir(dir); - - if (mkdir(dir, 0755) != 0) { perror("mkdir"); return 1; } - dfd = open(dir, O_RDONLY | O_DIRECTORY); - if (dfd < 0) { perror("open dir"); return 2; } - - fd = openat(dfd, "data.log", O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd < 0) { perror("openat create"); return 3; } - - if (write(fd, "ABCD", 4) != 4) { perror("write"); return 4; } - if (pwrite(fd, "XYZ", 3, 8) != 3) { perror("pwrite"); return 5; } - - char buf[16] = {0}; - ssize_t n = pread(fd, buf, 11, 0); - if (n != 11) { perror("pread"); return 6; } - if (memcmp(buf, "ABCD", 4) != 0 || buf[4] || buf[5] || buf[6] || buf[7] || - memcmp(buf + 8, "XYZ", 3) != 0) { - fprintf(stderr, "pread sparse verify failed\n"); - return 7; - } - - if (fsync(fd) != 0) { perror("fsync"); return 8; } - if (fdatasync(fd) != 0) { perror("fdatasync"); return 9; } - - if (fstat(fd, &st) != 0) { perror("fstat"); return 10; } - if (st.st_size != 11) { - fprintf(stderr, "fstat size=%ld expected=11\n", (long)st.st_size); - return 11; - } - - if (ftruncate(fd, 4) != 0) { perror("ftruncate"); return 12; } - memset(buf, 0, sizeof(buf)); - n = pread(fd, buf, 16, 
0); - if (n != 4 || memcmp(buf, "ABCD", 4) != 0) { - fprintf(stderr, "truncate readback failed n=%zd\n", n); - return 13; - } - - if (rename(file, file2) != 0) { perror("rename"); return 14; } - if (access(file, F_OK) == 0 || expect_errno("access old", ENOENT) != 0) { - return 15; - } - if (access(file2, F_OK) != 0) { perror("access new"); return 16; } - - int fd2 = open(file2, O_CREAT | O_EXCL | O_RDWR, 0644); - if (fd2 >= 0 || expect_errno("open excl", EEXIST) != 0) { - if (fd2 >= 0) close(fd2); - return 17; - } - - int rd = open(file2, O_RDONLY); - if (rd < 0) { perror("open rdonly"); return 18; } - if (write(rd, "Q", 1) >= 0 || expect_errno("write rdonly", EBADF) != 0) { - close(rd); - return 19; - } - close(rd); - - close(fd); - close(dfd); - if (unlink(file2) != 0) { perror("unlink"); return 20; } - if (rmdir(dir) != 0) { perror("rmdir"); return 21; } - return 0; -} - -int main(int argc, char **argv) -{ - const char *root = argc >= 2 ? argv[1] : "/zvfs"; - int rc = test_phase2(root); - return report_result("test_phase2_posix", rc); -} diff --git a/test/test_read_delete_file.c b/test/test_read_delete_file.c deleted file mode 100755 index eaef2b7..0000000 --- a/test/test_read_delete_file.c +++ /dev/null @@ -1,31 +0,0 @@ -#include "test_utils.h" - -static int test_read_delete_file(const char *path) -{ - printf("\n=== test_read_delete_file ===\n"); - - int fd = open(path, O_RDONLY); - if (fd < 0) { perror("open"); return 1; } - printf("open: %s fd=%d\n", path, fd); - - char buf[256] = {0}; - ssize_t r = read(fd, buf, sizeof(buf)); - if (r < 0) { perror("read"); close(fd); return 2; } - printf("read: %zd bytes: %.*s\n", r, (int)r, buf); - - close(fd); - printf("close: ok\n"); - - if (unlink(path) != 0) { perror("unlink"); return 3; } - printf("unlink: ok\n"); - - return 0; -} - -int main(int argc, char **argv) -{ - char path[PATH_MAX]; - make_path(path, sizeof(path), argc >= 2 ? 
argv[1] : NULL, "file.dat"); - int rc = test_read_delete_file(path); - return report_result("test_read_delete_file", rc); -} diff --git a/test/test_single_file_perf.c b/test/test_single_file_perf.c deleted file mode 100755 index eceae4a..0000000 --- a/test/test_single_file_perf.c +++ /dev/null @@ -1,93 +0,0 @@ -#include "test_utils.h" - -static int test_single_file_perf(const char *path) -{ - size_t io_size = 128 * 1024; - // size_t io_size = 4096; - size_t max_size = 2ULL * 1024 * 1024 * 1024; - size_t max_count = max_size / io_size; - int test_sec = 10; - int direct = 0; - - printf("\n=== test_single_file_perf ===\n"); - printf("Path : %s\n", path); - printf("IO size : %zu KB\n", io_size / 1024); - printf("Max file: %zu MB\n", max_size / 1024 / 1024); - printf("Duration: %d sec\n", test_sec); - - unlink(path); - char *buf = aligned_alloc(4096, io_size); - if (!buf) { perror("aligned_alloc"); return 1; } - memset(buf, 'A', io_size); - - struct timespec t1, t2, now; - - int fd = open(path, O_CREAT | O_RDWR | direct, 0644); - if (fd < 0) { perror("open write"); free(buf); return 1; } - - clock_gettime(CLOCK_MONOTONIC, &t1); - size_t wcount = 0; - size_t wpos = 0; - do { - if (wpos >= max_count) { - lseek(fd, 0, SEEK_SET); - wpos = 0; - } - if (write(fd, buf, io_size) != (ssize_t)io_size) { - perror("write"); - close(fd); - free(buf); - return 2; - } - wcount++; - wpos++; - clock_gettime(CLOCK_MONOTONIC, &now); - } while (time_diff_sec(t1, now) < test_sec); - clock_gettime(CLOCK_MONOTONIC, &t2); - close(fd); - - double wsec = time_diff_sec(t1, t2); - double wmb = (double)(wcount * io_size) / (1024.0 * 1024.0); - printf("\nWRITE:\n"); - printf(" total : %.1f MB\n", wmb); - printf(" time : %.3f sec\n", wsec); - printf(" IOPS : %.0f ops/sec\n", wcount / wsec); - printf(" BW : %.2f MB/s\n", wmb / wsec); - - fd = open(path, O_RDONLY | direct); - if (fd < 0) { perror("open read"); free(buf); return 3; } - - clock_gettime(CLOCK_MONOTONIC, &t1); - size_t rcount = 0; - do { - 
ssize_t r = read(fd, buf, io_size); - if (r <= 0) { - lseek(fd, 0, SEEK_SET); - continue; - } - rcount++; - clock_gettime(CLOCK_MONOTONIC, &now); - } while (time_diff_sec(t1, now) < test_sec); - clock_gettime(CLOCK_MONOTONIC, &t2); - close(fd); - - double rsec = time_diff_sec(t1, t2); - double rmb = (double)(rcount * io_size) / (1024.0 * 1024.0); - printf("\nREAD:\n"); - printf(" total : %.1f MB\n", rmb); - printf(" time : %.3f sec\n", rsec); - printf(" IOPS : %.0f ops/sec\n", rcount / rsec); - printf(" BW : %.2f MB/s\n", rmb / rsec); - - unlink(path); - free(buf); - return 0; -} - -int main(int argc, char **argv) -{ - char path[PATH_MAX]; - make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat"); - int rc = test_single_file_perf(path); - return report_result("test_single_file_perf", rc); -} diff --git a/test/test_single_file_random_noaligned_perf.c b/test/test_single_file_random_noaligned_perf.c deleted file mode 100755 index ed5d23a..0000000 --- a/test/test_single_file_random_noaligned_perf.c +++ /dev/null @@ -1,116 +0,0 @@ -#include "test_utils.h" - -static int test_single_file_random_noaligned_perf(const char *path) -{ - size_t io_size = 128 * 1024; - size_t max_size = 2ULL * 1024 * 1024 * 1024; - int test_sec = 10; - int direct = 0; - - printf("\n=== test_single_file_random_noaligned_perf ===\n"); - printf("Path : %s\n", path); - printf("IO size : %zu KB\n", io_size / 1024); - printf("Range : %zu MB\n", max_size / 1024 / 1024); - printf("Duration: %d sec\n", test_sec); - - srand(0x1234); - - char *buf = aligned_alloc(4096, io_size); - if (!buf) { perror("aligned_alloc"); return 1; } - memset(buf, 'A', io_size); - - struct timespec t1, t2, now; - - unlink(path); - - int fd = open(path, O_CREAT | O_RDWR | direct, 0644); - if (fd < 0) { perror("open rand write"); free(buf); return 2; } - - clock_gettime(CLOCK_MONOTONIC, &t1); - - size_t wcount = 0; - do { - off_t offset = (off_t)(rand() % (max_size - io_size)); - - if (lseek(fd, offset, SEEK_SET) < 
0) { - perror("lseek rand write"); - close(fd); - free(buf); - return 3; - } - - if (write(fd, buf, io_size) != (ssize_t)io_size) { - perror("rand write"); - close(fd); - free(buf); - return 4; - } - - wcount++; - clock_gettime(CLOCK_MONOTONIC, &now); - } while (time_diff_sec(t1, now) < test_sec); - - clock_gettime(CLOCK_MONOTONIC, &t2); - close(fd); - - double wsec = time_diff_sec(t1, t2); - double wmb = (double)wcount * io_size / 1024 / 1024; - - printf("\nRANDOM WRITE:\n"); - printf(" total : %.1f MB\n", wmb); - printf(" time : %.3f sec\n", wsec); - printf(" IOPS : %.0f ops/sec\n", wcount / wsec); - printf(" BW : %.2f MB/s\n", wmb / wsec); - - fd = open(path, O_RDONLY | direct); - if (fd < 0) { perror("open rand read"); free(buf); return 5; } - - clock_gettime(CLOCK_MONOTONIC, &t1); - - size_t rcount = 0; - do { - off_t offset = (off_t)(rand() % (max_size - io_size)); - - if (lseek(fd, offset, SEEK_SET) < 0) { - perror("lseek rand read"); - close(fd); - free(buf); - return 6; - } - - ssize_t r = read(fd, buf, io_size); - if (r < 0) { - perror("rand read"); - close(fd); - free(buf); - return 7; - } - - rcount++; - clock_gettime(CLOCK_MONOTONIC, &now); - } while (time_diff_sec(t1, now) < test_sec); - - clock_gettime(CLOCK_MONOTONIC, &t2); - close(fd); - - double rsec = time_diff_sec(t1, t2); - double rmb = (double)rcount * io_size / 1024 / 1024; - - printf("\nRANDOM READ:\n"); - printf(" total : %.1f MB\n", rmb); - printf(" time : %.3f sec\n", rsec); - printf(" IOPS : %.0f ops/sec\n", rcount / rsec); - printf(" BW : %.2f MB/s\n", rmb / rsec); - - unlink(path); - free(buf); - return 0; -} - -int main(int argc, char **argv) -{ - char path[PATH_MAX]; - make_path(path, sizeof(path), argc >= 2 ? 
argv[1] : NULL, "file.dat"); - int rc = test_single_file_random_noaligned_perf(path); - return report_result("test_single_file_random_noaligned_perf", rc); -} diff --git a/test/test_single_file_random_perf.c b/test/test_single_file_random_perf.c deleted file mode 100755 index a4b0731..0000000 --- a/test/test_single_file_random_perf.c +++ /dev/null @@ -1,119 +0,0 @@ -#include "test_utils.h" - -static int test_single_file_random_perf(const char *path) -{ - size_t io_size = 128 * 1024; - size_t max_size = 2ULL * 1024 * 1024 * 1024; - size_t max_count = max_size / io_size; - int test_sec = 10; - int direct = 0; - - printf("\n=== test_single_file_random_perf ===\n"); - printf("Path : %s\n", path); - printf("IO size : %zu KB\n", io_size / 1024); - printf("Range : %zu MB\n", max_size / 1024 / 1024); - printf("Duration: %d sec\n", test_sec); - - srand(0x1234); - - char *buf = aligned_alloc(4096, io_size); - if (!buf) { perror("aligned_alloc"); return 1; } - memset(buf, 'A', io_size); - - struct timespec t1, t2, now; - - unlink(path); - - int fd = open(path, O_CREAT | O_RDWR | direct, 0644); - if (fd < 0) { perror("open rand write"); free(buf); return 2; } - - clock_gettime(CLOCK_MONOTONIC, &t1); - - size_t wcount = 0; - do { - size_t blk = rand() % max_count; - off_t offset = (off_t)blk * io_size; - - if (lseek(fd, offset, SEEK_SET) < 0) { - perror("lseek rand write"); - close(fd); - free(buf); - return 3; - } - - if (write(fd, buf, io_size) != (ssize_t)io_size) { - perror("rand write"); - close(fd); - free(buf); - return 4; - } - - wcount++; - clock_gettime(CLOCK_MONOTONIC, &now); - } while (time_diff_sec(t1, now) < test_sec); - - clock_gettime(CLOCK_MONOTONIC, &t2); - close(fd); - - double wsec = time_diff_sec(t1, t2); - double wmb = (double)wcount * io_size / 1024 / 1024; - - printf("\nRANDOM WRITE:\n"); - printf(" total : %.1f MB\n", wmb); - printf(" time : %.3f sec\n", wsec); - printf(" IOPS : %.0f ops/sec\n", wcount / wsec); - printf(" BW : %.2f MB/s\n", wmb / wsec); 
- - fd = open(path, O_RDONLY | direct); - if (fd < 0) { perror("open rand read"); free(buf); return 5; } - - clock_gettime(CLOCK_MONOTONIC, &t1); - - size_t rcount = 0; - do { - size_t blk = rand() % max_count; - off_t offset = (off_t)blk * io_size; - - if (lseek(fd, offset, SEEK_SET) < 0) { - perror("lseek rand read"); - close(fd); - free(buf); - return 6; - } - - ssize_t r = read(fd, buf, io_size); - if (r < 0) { - perror("rand read"); - close(fd); - free(buf); - return 7; - } - - rcount++; - clock_gettime(CLOCK_MONOTONIC, &now); - } while (time_diff_sec(t1, now) < test_sec); - - clock_gettime(CLOCK_MONOTONIC, &t2); - close(fd); - - double rsec = time_diff_sec(t1, t2); - double rmb = (double)rcount * io_size / 1024 / 1024; - - printf("\nRANDOM READ:\n"); - printf(" total : %.1f MB\n", rmb); - printf(" time : %.3f sec\n", rsec); - printf(" IOPS : %.0f ops/sec\n", rcount / rsec); - printf(" BW : %.2f MB/s\n", rmb / rsec); - - unlink(path); - free(buf); - return 0; -} - -int main(int argc, char **argv) -{ - char path[PATH_MAX]; - make_path(path, sizeof(path), argc >= 2 ? 
argv[1] : NULL, "file.dat"); - int rc = test_single_file_random_perf(path); - return report_result("test_single_file_random_perf", rc); -} diff --git a/test/test_two_files.c b/test/test_two_files.c deleted file mode 100755 index 45980d2..0000000 --- a/test/test_two_files.c +++ /dev/null @@ -1,78 +0,0 @@ -#include "test_utils.h" - -static int test_two_files(const char *path_a, const char *path_b) -{ - printf("\n=== test_two_files ===\n"); - - int fd_a = open(path_a, O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd_a < 0) { perror("open A"); return 1; } - - int fd_b = open(path_b, O_CREAT | O_RDWR | O_TRUNC, 0644); - if (fd_b < 0) { perror("open B"); return 2; } - - printf("fd_a=%d fd_b=%d\n", fd_a, fd_b); - - const char *data_a = "File-A: Hello World!"; - const char *data_b = "File-B: Goodbye World!"; - if (write(fd_a, data_a, strlen(data_a)) < 0) { perror("write A"); return 3; } - if (write(fd_b, data_b, strlen(data_b)) < 0) { perror("write B"); return 4; } - printf("write A: %s\n", data_a); - printf("write B: %s\n", data_b); - - lseek(fd_a, 0, SEEK_SET); - lseek(fd_b, 0, SEEK_SET); - - char buf_a[64] = {0}; - char buf_b[64] = {0}; - ssize_t r_a = read(fd_a, buf_a, sizeof(buf_a)); - ssize_t r_b = read(fd_b, buf_b, sizeof(buf_b)); - - printf("read A: %zd bytes: %.*s\n", r_a, (int)r_a, buf_a); - printf("read B: %zd bytes: %.*s\n", r_b, (int)r_b, buf_b); - - int ok = 1; - if (strncmp(buf_a, data_a, strlen(data_a)) != 0) { - printf("FAIL: A content mismatch!\n"); - ok = 0; - } - if (strncmp(buf_b, data_b, strlen(data_b)) != 0) { - printf("FAIL: B content mismatch!\n"); - ok = 0; - } - if (ok) { - printf("PASS: both files read back correctly\n"); - } - - lseek(fd_a, 0, SEEK_END); - if (write(fd_a, "[A-TAIL]", 8) != 8) { perror("append A"); return 5; } - - lseek(fd_b, 8, SEEK_SET); - if (write(fd_b, "Hi! 
", 7) != 7) { perror("overwrite B"); return 6; } - - lseek(fd_a, 0, SEEK_SET); - lseek(fd_b, 0, SEEK_SET); - memset(buf_a, 0, sizeof(buf_a)); - memset(buf_b, 0, sizeof(buf_b)); - r_a = read(fd_a, buf_a, sizeof(buf_a)); - r_b = read(fd_b, buf_b, sizeof(buf_b)); - printf("after cross-write:\n"); - printf(" A: %.*s\n", (int)r_a, buf_a); - printf(" B: %.*s\n", (int)r_b, buf_b); - - close(fd_a); - close(fd_b); - unlink(path_a); - unlink(path_b); - return 0; -} - -int main(int argc, char **argv) -{ - char path_a[PATH_MAX]; - char path_b[PATH_MAX]; - const char *dir = argc >= 2 ? argv[1] : NULL; - make_path(path_a, sizeof(path_a), dir, "file_a.dat"); - make_path(path_b, sizeof(path_b), dir, "file_b.dat"); - int rc = test_two_files(path_a, path_b); - return report_result("test_two_files", rc); -} diff --git a/test/test_utils.h b/test/test_utils.h deleted file mode 100755 index bd9363b..0000000 --- a/test/test_utils.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef TEST_UTILS_H -#define TEST_UTILS_H - -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef PATH_MAX -#define PATH_MAX 4096 -#endif - -static inline double time_diff_sec(struct timespec a, struct timespec b) -{ - return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1000000000.0; -} - -static inline void make_path(char *out, size_t out_sz, const char *dir, const char *name) -{ - if (dir && dir[0] != 0) { - snprintf(out, out_sz, "%s/%s", dir, name); - } else { - snprintf(out, out_sz, "/tmp/%s", name); - } -} - -static inline int report_result(const char *name, int rc) -{ - printf("\n=== %s %s ===\n", name, rc == 0 ? 
"PASSED" : "FAILED"); - return rc; -} - -#endif diff --git a/test/test_write_file.c b/test/test_write_file.c deleted file mode 100755 index ab67daf..0000000 --- a/test/test_write_file.c +++ /dev/null @@ -1,27 +0,0 @@ -#include "test_utils.h" - -static int test_write_file(const char *path) -{ - printf("\n=== test_write_file ===\n"); - - int fd = open(path, O_CREAT | O_RDWR, 0644); - if (fd < 0) { perror("open"); return 1; } - printf("open: %s fd=%d\n", path, fd); - - const char *msg = "Hello, zvfs!"; - ssize_t w = write(fd, msg, strlen(msg)); - if (w < 0) { perror("write"); close(fd); return 2; } - printf("write: %zd bytes: %s\n", w, msg); - - close(fd); - printf("close: ok\n"); - return 0; -} - -int main(int argc, char **argv) -{ - char path[PATH_MAX]; - make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat"); - int rc = test_write_file(path); - return report_result("test_write_file", rc); -} diff --git a/tests/Makefile b/tests/Makefile new file mode 100644 index 0000000..765f5f2 --- /dev/null +++ b/tests/Makefile @@ -0,0 +1,13 @@ +SUBDIRS := ioengine_test hook + +.PHONY: all clean $(SUBDIRS) + +all: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ + +clean: + for dir in $(SUBDIRS); do \ + $(MAKE) -C $$dir clean; \ + done diff --git a/tests/hook/Makefile b/tests/hook/Makefile new file mode 100644 index 0000000..c7ab9a1 --- /dev/null +++ b/tests/hook/Makefile @@ -0,0 +1,8 @@ + +BIN_DIR := $(abspath $(CURDIR)/../bin) + +all: + gcc -g -o $(BIN_DIR)/hook_api_test hook_api_test.c + +clean: + rm -rf $(BIN_DIR)/hook_api_test diff --git a/tests/hook/hook_api_test.c b/tests/hook/hook_api_test.c new file mode 100644 index 0000000..84e8a8e --- /dev/null +++ b/tests/hook/hook_api_test.c @@ -0,0 +1,322 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ASSERT_TRUE(cond, fmt, ...) 
\ + do { \ + if (!(cond)) { \ + fprintf(stderr, "[FAIL] %s:%d " fmt "\n", __func__, __LINE__, \ + ##__VA_ARGS__); \ + return -1; \ + } \ + } while (0) + +#define ASSERT_SYS_OK(expr) \ + do { \ + if ((expr) < 0) { \ + fprintf(stderr, "[FAIL] %s:%d %s: %s\n", __func__, __LINE__, \ + #expr, strerror(errno)); \ + return -1; \ + } \ + } while (0) + +static int +join_path(char *out, size_t out_sz, const char *dir, const char *name) +{ + int n = snprintf(out, out_sz, "%s/%s", dir, name); + if (n < 0 || (size_t)n >= out_sz) { + errno = ENAMETOOLONG; + return -1; + } + return 0; +} + +static int +test_basic_rw_seek_stat(const char *workdir) +{ + char path[PATH_MAX]; + ASSERT_SYS_OK(join_path(path, sizeof(path), workdir, "basic_rw.txt")); + + int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0644); + ASSERT_SYS_OK(fd); + + const char *init = "abcdef"; + ssize_t nr = write(fd, init, 6); + ASSERT_TRUE(nr == 6, "write expected 6, got %zd", nr); + + off_t off = lseek(fd, 0, SEEK_SET); + ASSERT_TRUE(off == 0, "lseek expected 0, got %lld", (long long)off); + + char buf[16] = {0}; + nr = read(fd, buf, 6); + ASSERT_TRUE(nr == 6, "read expected 6, got %zd", nr); + ASSERT_TRUE(memcmp(buf, "abcdef", 6) == 0, "read content mismatch"); + + nr = pwrite(fd, "XYZ", 3, 3); + ASSERT_TRUE(nr == 3, "pwrite expected 3, got %zd", nr); + + memset(buf, 0, sizeof(buf)); + nr = pread(fd, buf, 6, 0); + ASSERT_TRUE(nr == 6, "pread expected 6, got %zd", nr); + ASSERT_TRUE(memcmp(buf, "abcXYZ", 6) == 0, "pread content mismatch"); + + struct stat st; + ASSERT_SYS_OK(fstat(fd, &st)); + ASSERT_TRUE(st.st_size == 6, "fstat size expected 6, got %lld", + (long long)st.st_size); + + ASSERT_SYS_OK(ftruncate(fd, 4)); + ASSERT_SYS_OK(fstat(fd, &st)); + ASSERT_TRUE(st.st_size == 4, "ftruncate size expected 4, got %lld", + (long long)st.st_size); + + ASSERT_SYS_OK(fdatasync(fd)); + ASSERT_SYS_OK(fsync(fd)); + ASSERT_SYS_OK(close(fd)); + ASSERT_SYS_OK(unlink(path)); + return 0; +} + +static int 
+test_openat_rename_unlink(const char *workdir) +{ + char subdir[PATH_MAX]; + ASSERT_SYS_OK(join_path(subdir, sizeof(subdir), workdir, "openat_dir")); + + ASSERT_SYS_OK(mkdir(subdir, 0755)); + + int dfd = open(subdir, O_RDONLY | O_DIRECTORY); + ASSERT_SYS_OK(dfd); + + int fd = openat(dfd, "a.txt", O_CREAT | O_TRUNC | O_RDWR, 0644); + ASSERT_SYS_OK(fd); + + ssize_t nr = write(fd, "hello", 5); + ASSERT_TRUE(nr == 5, "write expected 5, got %zd", nr); + ASSERT_SYS_OK(close(fd)); + + struct stat st; + ASSERT_SYS_OK(fstatat(dfd, "a.txt", &st, 0)); + ASSERT_TRUE(st.st_size == 5, "fstatat size expected 5, got %lld", + (long long)st.st_size); + + ASSERT_SYS_OK(renameat(dfd, "a.txt", dfd, "b.txt")); + ASSERT_SYS_OK(fstatat(dfd, "b.txt", &st, 0)); + ASSERT_TRUE(st.st_size == 5, "renamed file size expected 5, got %lld", + (long long)st.st_size); + + ASSERT_SYS_OK(unlinkat(dfd, "b.txt", 0)); + + errno = 0; + ASSERT_TRUE(fstatat(dfd, "b.txt", &st, 0) == -1 && errno == ENOENT, + "fstatat after unlink should be ENOENT"); + + ASSERT_SYS_OK(close(dfd)); + ASSERT_SYS_OK(rmdir(subdir)); + return 0; +} + +static int +test_dup_fcntl_ioctl(const char *workdir) +{ + char path[PATH_MAX]; + ASSERT_SYS_OK(join_path(path, sizeof(path), workdir, "dup_fcntl.txt")); + + int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0644); + ASSERT_SYS_OK(fd); + + ASSERT_TRUE(write(fd, "0123456789", 10) == 10, "write expected 10 bytes"); + ASSERT_SYS_OK(lseek(fd, 0, SEEK_SET)); + + int fd2 = dup(fd); + bool dup_supported = true; + if (fd2 < 0 && (errno == ENOTSUP || errno == EOPNOTSUPP || errno == ENOSYS)) { + dup_supported = false; + fprintf(stderr, "[INFO] dup on this backend is unsupported, skip shared-offset check\n"); + } else { + ASSERT_SYS_OK(fd2); + } + + char buf[4] = {0}; + if (dup_supported) { + ASSERT_TRUE(read(fd, buf, 2) == 2, "read(fd) expected 2 bytes"); + ASSERT_TRUE(memcmp(buf, "01", 2) == 0, "first read mismatch"); + + memset(buf, 0, sizeof(buf)); + ASSERT_TRUE(read(fd2, buf, 2) == 2, 
"read(fd2) expected 2 bytes"); + ASSERT_TRUE(memcmp(buf, "23", 2) == 0, + "dup offset should be shared, expected \"23\""); + } + + int fd_flags = fcntl(fd, F_GETFD); + ASSERT_SYS_OK(fd_flags); + ASSERT_SYS_OK(fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC)); + + int fd_flags_after = fcntl(fd, F_GETFD); + ASSERT_SYS_OK(fd_flags_after); + ASSERT_TRUE((fd_flags_after & FD_CLOEXEC) != 0, + "FD_CLOEXEC should be set"); + + int file_flags = fcntl(fd, F_GETFL); + ASSERT_SYS_OK(file_flags); + ASSERT_SYS_OK(fcntl(fd, F_SETFL, file_flags | O_APPEND)); + + int file_flags_after = fcntl(fd, F_GETFL); + ASSERT_SYS_OK(file_flags_after); + ASSERT_TRUE((file_flags_after & O_APPEND) != 0, "O_APPEND should be set"); + + int avail = -1; + ASSERT_SYS_OK(ioctl(dup_supported ? fd2 : fd, FIONREAD, &avail)); + if (dup_supported) { + ASSERT_TRUE(avail == 6, "FIONREAD expected 6, got %d", avail); + ASSERT_SYS_OK(close(fd2)); + } else { + ASSERT_TRUE(avail == 10, "FIONREAD expected 10, got %d", avail); + } + + ASSERT_SYS_OK(close(fd)); + ASSERT_SYS_OK(unlink(path)); + return 0; +} + +static int +test_readv_writev_pwritev(const char *workdir) +{ + char path[PATH_MAX]; + ASSERT_SYS_OK(join_path(path, sizeof(path), workdir, "iov.txt")); + + int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0644); + ASSERT_SYS_OK(fd); + + struct iovec wiov[3]; + wiov[0].iov_base = "ab"; + wiov[0].iov_len = 2; + wiov[1].iov_base = "cd"; + wiov[1].iov_len = 2; + wiov[2].iov_base = "ef"; + wiov[2].iov_len = 2; + + ssize_t nr = writev(fd, wiov, 3); + ASSERT_TRUE(nr == 6, "writev expected 6, got %zd", nr); + + ASSERT_SYS_OK(lseek(fd, 0, SEEK_SET)); + + char a[2] = {0}, b[2] = {0}, c[2] = {0}; + struct iovec riov[3]; + riov[0].iov_base = a; + riov[0].iov_len = 2; + riov[1].iov_base = b; + riov[1].iov_len = 2; + riov[2].iov_base = c; + riov[2].iov_len = 2; + + nr = readv(fd, riov, 3); + ASSERT_TRUE(nr == 6, "readv expected 6, got %zd", nr); + ASSERT_TRUE(memcmp(a, "ab", 2) == 0 && + memcmp(b, "cd", 2) == 0 && + memcmp(c, "ef", 
2) == 0, "readv content mismatch"); + + struct iovec pwiov[2]; + pwiov[0].iov_base = "12"; + pwiov[0].iov_len = 2; + pwiov[1].iov_base = "34"; + pwiov[1].iov_len = 2; + nr = pwritev(fd, pwiov, 2, 1); + ASSERT_TRUE(nr == 4, "pwritev expected 4, got %zd", nr); + + char out[8] = {0}; + nr = pread(fd, out, 6, 0); + ASSERT_TRUE(nr == 6, "pread expected 6, got %zd", nr); + ASSERT_TRUE(memcmp(out, "a1234f", 6) == 0, "pwritev content mismatch"); + + ASSERT_SYS_OK(close(fd)); + ASSERT_SYS_OK(unlink(path)); + return 0; +} + +typedef int (*test_fn)(const char *workdir); + +struct test_case { + const char *name; + test_fn fn; +}; + +static int +run_test(const struct test_case *tc, const char *workdir) +{ + int rc = tc->fn(workdir); + if (rc == 0) { + printf("[PASS] %s\n", tc->name); + return 0; + } + printf("[FAIL] %s\n", tc->name); + return -1; +} + +int +main(void) +{ + const char *base = getenv("ZVFS_TEST_ROOT"); + if (!base || base[0] == '\0') + base = "/tmp"; + + char workdir[PATH_MAX]; + int n = snprintf(workdir, sizeof(workdir), "%s/zvfs-hook-api-XXXXXX", base); + if (n < 0 || (size_t)n >= sizeof(workdir)) { + fprintf(stderr, "workdir template too long\n"); + return 2; + } + + if (!mkdtemp(workdir)) { + fprintf(stderr, "mkdtemp(%s) failed: %s\n", workdir, strerror(errno)); + return 2; + } + + printf("workdir=%s\n", workdir); + printf("hint: set ZVFS_TEST_ROOT=/zvfs when validating LD_PRELOAD hook path.\n"); + + struct test_case tests[] = { + {"basic_rw_seek_stat", test_basic_rw_seek_stat}, + {"openat_rename_unlink", test_openat_rename_unlink}, + {"dup_fcntl_ioctl", test_dup_fcntl_ioctl}, + {"readv_writev_pwritev", test_readv_writev_pwritev}, + }; + + int failed = 0; + for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i) { + if (run_test(&tests[i], workdir) != 0) + failed++; + } + + const char *keep = getenv("ZVFS_TEST_KEEP"); + if (!keep || strcmp(keep, "1") != 0) { + if (rmdir(workdir) < 0) { + fprintf(stderr, + "warning: failed to remove workdir %s: %s\n", + 
workdir, strerror(errno)); + } + } else { + printf("kept workdir=%s\n", workdir); + } + + if (failed == 0) { + printf("ALL TESTS PASSED\n"); + return 0; + } + + printf("FAILED=%d\n", failed); + return 1; +} diff --git a/tests/ioengine_test/Makefile b/tests/ioengine_test/Makefile new file mode 100644 index 0000000..d47dc0d --- /dev/null +++ b/tests/ioengine_test/Makefile @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: BSD-3-Clause + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../spdk) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk +include $(SPDK_ROOT_DIR)/mk/spdk.app_vars.mk + +# 输出目录 +BIN_DIR := $(abspath $(CURDIR)/../bin) + +TEST_BINS := \ + ioengine_single_blob_test \ + ioengine_multi_blob_test \ + ioengine_same_blob_mt_test + +COMMON_SRCS := \ + test_common.c \ + ../../src/spdk_engine/io_engine.c \ + ../../src/common/utils.c + +SPDK_LIB_LIST = $(ALL_MODULES_LIST) event event_bdev +LIBS += $(SPDK_LIB_LINKER_ARGS) + +CFLAGS += -I$(abspath $(CURDIR)/../../src) -I$(CURDIR) + +.PHONY: all clean +all: $(BIN_DIR) $(addprefix $(BIN_DIR)/,$(TEST_BINS)) + +# 创建 bin 目录 +$(BIN_DIR): + mkdir -p $(BIN_DIR) + +$(BIN_DIR)/ioengine_single_blob_test: ioengine_single_blob_test.c $(COMMON_SRCS) $(SPDK_LIB_FILES) $(ENV_LIBS) + $(CC) $(CFLAGS) -o $@ $< $(COMMON_SRCS) $(LDFLAGS) $(LIBS) $(ENV_LDFLAGS) $(SYS_LIBS) + +$(BIN_DIR)/ioengine_multi_blob_test: ioengine_multi_blob_test.c $(COMMON_SRCS) $(SPDK_LIB_FILES) $(ENV_LIBS) + $(CC) $(CFLAGS) -o $@ $< $(COMMON_SRCS) $(LDFLAGS) $(LIBS) $(ENV_LDFLAGS) $(SYS_LIBS) + +$(BIN_DIR)/ioengine_same_blob_mt_test: ioengine_same_blob_mt_test.c $(COMMON_SRCS) $(SPDK_LIB_FILES) $(ENV_LIBS) + $(CC) $(CFLAGS) -o $@ $< $(COMMON_SRCS) $(LDFLAGS) $(LIBS) $(ENV_LDFLAGS) $(SYS_LIBS) + +clean: + rm -f $(addprefix $(BIN_DIR)/,$(TEST_BINS)) \ No newline at end of file diff --git a/tests/ioengine_test/ioengine_multi_blob_test.c b/tests/ioengine_test/ioengine_multi_blob_test.c new file mode 100644 index 0000000..9d8118f --- 
/dev/null +++ b/tests/ioengine_test/ioengine_multi_blob_test.c @@ -0,0 +1,106 @@ +#include +#include +#include +#include + +#include "spdk_engine/io_engine.h" +#include "test_common.h" + +#define MULTI_BLOB_COUNT 3 + +int main(void) { + int rc = 0; + const char *bdev_name = getenv("SPDK_BDEV_NAME"); + struct zvfs_blob_handle *handles[MULTI_BLOB_COUNT] = {0}; + uint64_t ids[MULTI_BLOB_COUNT] = {0}; + uint64_t cluster = 0; + void *wbuf = NULL; + void *rbuf = NULL; + int i = 0; + + if (!bdev_name) { + bdev_name = "Malloc0"; + } + if (io_engine_init(bdev_name) != 0) { + fprintf(stderr, "TEST2: io_engine_init failed (bdev=%s)\n", bdev_name); + return 1; + } + + printf("[TEST2] single thread / multi blob\n"); + + handles[0] = blob_create(0); + if (!handles[0]) { + fprintf(stderr, "TEST2: create first blob failed\n"); + return 1; + } + ids[0] = handles[0]->id; + cluster = handles[0]->size; + if (cluster == 0) { + fprintf(stderr, "TEST2: invalid cluster size\n"); + rc = 1; + goto out; + } + if (blob_resize(handles[0], cluster * 2) != 0) { + fprintf(stderr, "TEST2: resize first blob failed\n"); + rc = 1; + goto out; + } + + for (i = 1; i < MULTI_BLOB_COUNT; i++) { + handles[i] = blob_create(cluster * 2); + if (!handles[i]) { + fprintf(stderr, "TEST2: create blob %d failed\n", i); + rc = 1; + goto out; + } + ids[i] = handles[i]->id; + } + + if (alloc_aligned_buf(&wbuf, cluster) != 0 || alloc_aligned_buf(&rbuf, cluster) != 0) { + fprintf(stderr, "TEST2: alloc aligned buffer failed\n"); + rc = 1; + goto out; + } + + for (i = 0; i < MULTI_BLOB_COUNT; i++) { + fill_pattern((uint8_t *)wbuf, cluster, (uint8_t)(0x20 + i)); + memset(rbuf, 0, cluster); + + if (blob_write(handles[i], 0, wbuf, cluster) != 0) { + fprintf(stderr, "TEST2: blob_write[%d] failed\n", i); + rc = 1; + goto out; + } + if (blob_read(handles[i], 0, rbuf, cluster) != 0) { + fprintf(stderr, "TEST2: blob_read[%d] failed\n", i); + rc = 1; + goto out; + } + if (memcmp(wbuf, rbuf, cluster) != 0) { + fprintf(stderr, 
"TEST2: blob[%d] readback mismatch\n", i); + rc = 1; + goto out; + } + } + +out: + for (i = 0; i < MULTI_BLOB_COUNT; i++) { + if (handles[i]) { + (void)blob_close(handles[i]); + } + } + for (i = 0; i < MULTI_BLOB_COUNT; i++) { + if (ids[i] != 0) { + (void)blob_delete(ids[i]); + } + } + free(wbuf); + free(rbuf); + + if (rc == 0) { + printf("[TEST2] PASS\n"); + return 0; + } + printf("[TEST2] FAIL\n"); + return 1; +} diff --git a/tests/ioengine_test/ioengine_same_blob_mt_test.c b/tests/ioengine_test/ioengine_same_blob_mt_test.c new file mode 100644 index 0000000..4754778 --- /dev/null +++ b/tests/ioengine_test/ioengine_same_blob_mt_test.c @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include + +#include "spdk_engine/io_engine.h" +#include "test_common.h" + +#define THREAD_COUNT 4 + +struct mt_case_arg { + struct zvfs_blob_handle *handle; + uint64_t cluster_size; + uint64_t offset; + uint8_t seed; + pthread_barrier_t *barrier; + int rc; +}; + +static void *mt_case_worker(void *arg) { + struct mt_case_arg *ctx = (struct mt_case_arg *)arg; + void *wbuf = NULL; + void *rbuf = NULL; + + if (alloc_aligned_buf(&wbuf, ctx->cluster_size) != 0 || + alloc_aligned_buf(&rbuf, ctx->cluster_size) != 0) { + free(wbuf); + free(rbuf); + ctx->rc = 1; + return NULL; + } + + fill_pattern((uint8_t *)wbuf, ctx->cluster_size, ctx->seed); + (void)pthread_barrier_wait(ctx->barrier); + + if (blob_write(ctx->handle, ctx->offset, wbuf, ctx->cluster_size) != 0) { + ctx->rc = 1; + goto out; + } + if (blob_read(ctx->handle, ctx->offset, rbuf, ctx->cluster_size) != 0) { + ctx->rc = 1; + goto out; + } + if (memcmp(wbuf, rbuf, ctx->cluster_size) != 0) { + ctx->rc = 1; + goto out; + } + + ctx->rc = 0; + +out: + free(wbuf); + free(rbuf); + return NULL; +} + +int main(void) { + int rc = 0; + const char *bdev_name = getenv("SPDK_BDEV_NAME"); + int i = 0; + struct zvfs_blob_handle *h = NULL; + uint64_t blob_id = 0; + uint64_t cluster = 0; + pthread_t tids[THREAD_COUNT]; + struct mt_case_arg 
args[THREAD_COUNT]; + pthread_barrier_t barrier; + int barrier_inited = 0; + + if (!bdev_name) { + bdev_name = "Malloc0"; + } + if (io_engine_init(bdev_name) != 0) { + fprintf(stderr, "TEST3: io_engine_init failed (bdev=%s)\n", bdev_name); + return 1; + } + + printf("[TEST3] multi thread / same blob\n"); + + h = blob_create(0); + if (!h) { + fprintf(stderr, "TEST3: blob_create failed\n"); + return 1; + } + blob_id = h->id; + cluster = h->size; + if (cluster == 0) { + fprintf(stderr, "TEST3: invalid cluster size\n"); + rc = 1; + goto out; + } + if (blob_resize(h, cluster * THREAD_COUNT) != 0) { + fprintf(stderr, "TEST3: blob_resize failed\n"); + rc = 1; + goto out; + } + + if (pthread_barrier_init(&barrier, NULL, THREAD_COUNT) != 0) { + fprintf(stderr, "TEST3: barrier init failed\n"); + rc = 1; + goto out; + } + barrier_inited = 1; + + for (i = 0; i < THREAD_COUNT; i++) { + args[i].handle = h; + args[i].cluster_size = cluster; + args[i].offset = cluster * (uint64_t)i; + args[i].seed = (uint8_t)(0x40 + i); + args[i].barrier = &barrier; + args[i].rc = 1; + if (pthread_create(&tids[i], NULL, mt_case_worker, &args[i]) != 0) { + fprintf(stderr, "TEST3: pthread_create[%d] failed\n", i); + rc = 1; + while (--i >= 0) { + pthread_join(tids[i], NULL); + } + goto out; + } + } + + for (i = 0; i < THREAD_COUNT; i++) { + pthread_join(tids[i], NULL); + if (args[i].rc != 0) { + fprintf(stderr, "TEST3: worker[%d] failed\n", i); + rc = 1; + } + } + +out: + if (barrier_inited) { + (void)pthread_barrier_destroy(&barrier); + } + if (h) { + (void)blob_close(h); + } + if (blob_id != 0) { + (void)blob_delete(blob_id); + } + + if (rc == 0) { + printf("[TEST3] PASS\n"); + return 0; + } + printf("[TEST3] FAIL\n"); + return 1; +} diff --git a/tests/ioengine_test/ioengine_single_blob_test.c b/tests/ioengine_test/ioengine_single_blob_test.c new file mode 100644 index 0000000..e2070ef --- /dev/null +++ b/tests/ioengine_test/ioengine_single_blob_test.c @@ -0,0 +1,136 @@ +#include +#include 
+#include +#include + +#include "spdk_engine/io_engine.h" +#include "test_common.h" + +int main(void) { + int rc = 0; + const char *bdev_name = getenv("SPDK_BDEV_NAME"); + struct zvfs_blob_handle *h = NULL; + struct zvfs_blob_handle *reopen = NULL; + uint64_t blob_id = 0; + uint64_t cluster = 0; + void *wbuf = NULL; + void *rbuf = NULL; + + if (!bdev_name) { + bdev_name = "Malloc0"; + } + if (io_engine_init(bdev_name) != 0) { + fprintf(stderr, "TEST1: io_engine_init failed (bdev=%s)\n", bdev_name); + return 1; + } + + printf("[TEST1] single thread / single blob\n"); + + h = blob_create(0); + if (!h) { + fprintf(stderr, "TEST1: blob_create failed\n"); + return 1; + } + blob_id = h->id; + cluster = h->size; + if (cluster == 0) { + fprintf(stderr, "TEST1: invalid cluster size\n"); + rc = 1; + goto out; + } + + rc = blob_resize(h, cluster * 2); + if (rc != 0) { + fprintf(stderr, "TEST1: blob_resize failed: %d\n", rc); + rc = 1; + goto out; + } + + rc = alloc_aligned_buf(&wbuf, cluster); + if (rc != 0) { + fprintf(stderr, "TEST1: alloc write buf failed: %d\n", rc); + rc = 1; + goto out; + } + rc = alloc_aligned_buf(&rbuf, cluster); + if (rc != 0) { + fprintf(stderr, "TEST1: alloc read buf failed: %d\n", rc); + rc = 1; + goto out; + } + fill_pattern((uint8_t *)wbuf, cluster, 0x11); + + rc = blob_write(h, 0, wbuf, cluster); + if (rc != 0) { + fprintf(stderr, "TEST1: blob_write failed: %d\n", rc); + rc = 1; + goto out; + } + + rc = blob_read(h, 0, rbuf, cluster); + if (rc != 0) { + fprintf(stderr, "TEST1: blob_read failed: %d\n", rc); + rc = 1; + goto out; + } + if (memcmp(wbuf, rbuf, cluster) != 0) { + fprintf(stderr, "TEST1: readback mismatch\n"); + rc = 1; + goto out; + } + + rc = blob_sync_md(h); + if (rc != 0) { + fprintf(stderr, "TEST1: blob_sync_md failed: %d\n", rc); + rc = 1; + goto out; + } + + rc = blob_close(h); + if (rc != 0) { + fprintf(stderr, "TEST1: blob_close failed: %d\n", rc); + rc = 1; + goto out; + } + h = NULL; + + reopen = blob_open(blob_id); + if 
(!reopen) { + fprintf(stderr, "TEST1: blob_open(reopen) failed\n"); + rc = 1; + goto out; + } + + memset(rbuf, 0, cluster); + rc = blob_read(reopen, 0, rbuf, cluster); + if (rc != 0) { + fprintf(stderr, "TEST1: reopen blob_read failed: %d\n", rc); + rc = 1; + goto out; + } + if (memcmp(wbuf, rbuf, cluster) != 0) { + fprintf(stderr, "TEST1: reopen readback mismatch\n"); + rc = 1; + goto out; + } + +out: + if (reopen) { + (void)blob_close(reopen); + } + if (h) { + (void)blob_close(h); + } + if (blob_id != 0) { + (void)blob_delete(blob_id); + } + free(wbuf); + free(rbuf); + + if (rc == 0) { + printf("[TEST1] PASS\n"); + return 0; + } + printf("[TEST1] FAIL\n"); + return 1; +} diff --git a/tests/ioengine_test/test_common.c b/tests/ioengine_test/test_common.c new file mode 100644 index 0000000..12336be --- /dev/null +++ b/tests/ioengine_test/test_common.c @@ -0,0 +1,20 @@ +#include "test_common.h" + +#include +#include + +int alloc_aligned_buf(void **buf, size_t len) { + int rc = posix_memalign(buf, 4096, len); + if (rc != 0) { + return -rc; + } + memset(*buf, 0, len); + return 0; +} + +void fill_pattern(uint8_t *buf, size_t len, uint8_t seed) { + size_t i = 0; + for (i = 0; i < len; i++) { + buf[i] = (uint8_t)(seed + (uint8_t)i); + } +} diff --git a/tests/ioengine_test/test_common.h b/tests/ioengine_test/test_common.h new file mode 100644 index 0000000..c2f18af --- /dev/null +++ b/tests/ioengine_test/test_common.h @@ -0,0 +1,10 @@ +#ifndef __IOENGINE_TEST_COMMON_H__ +#define __IOENGINE_TEST_COMMON_H__ + +#include +#include + +int alloc_aligned_buf(void **buf, size_t len); +void fill_pattern(uint8_t *buf, size_t len, uint8_t seed); + +#endif // __IOENGINE_TEST_COMMON_H__ diff --git a/zvfs/zvfs.c b/zvfs/zvfs.c deleted file mode 100755 index 885bbbe..0000000 --- a/zvfs/zvfs.c +++ /dev/null @@ -1,1135 +0,0 @@ - - -#include "zvfs.h" -#include -#undef SPDK_DEBUGLOG -#define SPDK_DEBUGLOG(...) 
do {} while(0) - -#define ZVFS_BDEV "Nvme0n1" -#ifndef ZVFS_BDEV -#define ZVFS_BDEV "Malloc0" -#endif - -struct spdk_thread *global_thread = NULL; -const char *json_file = "/home/lian/share/10.1-spdk/zvfs/zvfs/zvfs.json"; - -// mount -void zvfs_do_mount(void *arg); -void zvfs_spdk_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx); -void zvfs_spdk_bs_load_cb(void *arg, struct spdk_blob_store *bs, int bserrno); -void zvfs_spdk_bs_init_cb(void *arg, struct spdk_blob_store *bs, int bserrno); - -// create -void zvfs_do_create(void *arg); -void zvfs_spdk_bs_create_blob_cb(void *arg, spdk_blob_id blobid, int bserrno); -void zvfs_spdk_bs_open_blob_cb(void *arg, struct spdk_blob *blb, int bserrno); -void zvfs_spdk_blob_resize_cb(void *arg, int bserrno); -void zvfs_spdk_blob_sync_cb(void *arg, int bserrno); -// open -void zvfs_do_open(void *arg); -void zvfs_spdk_bs_open_blob_cb2(void *arg, struct spdk_blob *blb, int bserrno); - -// read -void zvfs_do_read(void *arg); -void zvfs_spdk_blob_read_cb(void *arg, int bserrno); - -// write -void zvfs_do_write(void *arg); -void zvfs_do_write_io(zvfs_io_req_t *req); -void zvfs_spdk_blob_write_preread_cb(void *arg, int bserrno); -void zvfs_spdk_blob_write_resize_cb(void *arg, int bserrno); -void zvfs_spdk_blob_write_sync_cb(void *arg, int bserrno); -void zvfs_spdk_blob_write_cb(void *arg, int bserrno); - -// close -void zvfs_do_close(void *arg); -void zvfs_spdk_blob_close_cb(void *arg, int bserrno); -void zvfs_spdk_blob_open_fail_close_cb(void *arg, int bserrno); - -// delete -void zvfs_do_delete(void *arg); -void zvfs_spdk_blob_delete_cb(void *arg, int bserrno); - -// setup -void zvfs_json_load_fn(void *arg); -void json_app_load_done(int rc, void *ctx); - - -// unmount -void zvfs_do_umount(void *arg); -void zvfs_spdk_bs_unload_cb(void *arg, int bserrno); -static int zvfs_submit_io_req(zvfs_io_req_t *req, spdk_msg_fn submit_fn, const char *op_name); -static int zvfs_pread_internal(zvfs_io_req_t 
*req); -static int zvfs_pwrite_internal(zvfs_io_req_t *req); - -/* ================================================================== */ -/* HELPER */ -/* ================================================================== */ -static uint64_t zvfs_need_clusters(zvfs_t *fs, uint64_t end_byte) { - uint64_t cluster_size = spdk_bs_get_cluster_size(fs->bs); - return (end_byte + cluster_size - 1) / cluster_size; -} - -/* ---------- 辅助:计算本次 IO 涉及的 LBA 范围 ---------- */ -static void calc_lba_range(zvfs_io_req_t *req) -{ - uint64_t io_unit = req->file->fs->io_unit_size; - uint64_t off = req->offset; - uint64_t cnt = req->len; - - req->lba = off / io_unit; - req->page_off = off % io_unit; - req->lba_count = (req->page_off + cnt + io_unit - 1) / io_unit; -} - -/* ---------- 确保 dma_buf 足够大 ---------- */ -static int ensure_dma_buf(zvfs_file_t *file, uint64_t need_bytes) -{ - if (file->dma_buf && file->dma_buf_size >= need_bytes) return 0; - - if (file->dma_buf) spdk_free(file->dma_buf); - - file->dma_buf = spdk_malloc(need_bytes, 0x1000, NULL, - SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); - if (!file->dma_buf) { file->dma_buf_size = 0; return -1; } - - file->dma_buf_size = need_bytes; - return 0; -} - -static inline int zvfs_err_from_bserrno(int bserrno) -{ - return bserrno != 0 ? 
bserrno : -EIO; -} - -// waiter -bool waiter(struct spdk_thread *thread, spdk_msg_fn start_fn, void *ctx, bool *finished) { - if (thread == NULL || start_fn == NULL || finished == NULL) { - return false; - } - - spdk_thread_send_msg(thread, start_fn, ctx); - - int waiter_count = 0; - - do { - spdk_thread_poll(thread, 0, 0); - waiter_count ++; - } while(!(*finished) && waiter_count < WAITER_MAX_TIME); - - if (!(*finished) && waiter_count >= WAITER_MAX_TIME) { - return false; // timeout - } - - return true; -} - -static int zvfs_submit_io_req(zvfs_io_req_t *req, spdk_msg_fn submit_fn, const char *op_name) -{ - if (req == NULL || submit_fn == NULL || global_thread == NULL) { - return -1; - } - - req->op_errno = 0; - req->result = 0; - req->finished = false; - if (req->file != NULL) { - req->file->op_errno = 0; - } - - bool ok = waiter(global_thread, submit_fn, req, &req->finished); - if (!ok) { - SPDK_ERRLOG("%s result: ok=%d\n", op_name, ok); - if (req->file != NULL) { - req->file->op_errno = -EIO; - } - return -1; - } - if (req->op_errno != 0) { - if (req->file != NULL) { - req->file->op_errno = req->op_errno; - } - return -1; - } - return (int)req->result; -} - -static int zvfs_pread_internal(zvfs_io_req_t *req) -{ - return zvfs_submit_io_req(req, zvfs_do_read, "pread"); -} - -static int zvfs_pwrite_internal(zvfs_io_req_t *req) -{ - return zvfs_submit_io_req(req, zvfs_do_write, "pwrite"); -} - -/* ================================================================== */ -/* MOUNT */ -/* ================================================================== */ -void zvfs_do_mount(void *arg) { - zvfs_t *fs = (zvfs_t*)arg; - struct spdk_bs_dev *bs_dev = NULL; - if (fs == NULL) { - return; - } - fs->op_errno = 0; - - // SPDK_DEBUGLOG("=== Listing ALL bdevs after JSON load ===\n"); - // struct spdk_bdev *bdev = spdk_bdev_first(); - // while (bdev) { - // SPDK_DEBUGLOG("Found bdev: [%s] product: %s\n", - // spdk_bdev_get_name(bdev), - // spdk_bdev_get_product_name(bdev)); - // 
bdev = spdk_bdev_next(bdev); - // } - // SPDK_DEBUGLOG("---------------------------------\n"); - // SPDK_DEBUGLOG("Trying to open: %s\n", ZVFS_BDEV); - - int rc = spdk_bdev_create_bs_dev_ext(ZVFS_BDEV, zvfs_spdk_bdev_event_cb, NULL, &bs_dev); - if (rc != 0) { - SPDK_ERRLOG("=== bdev_open FAILED rc=%d (probably still not registered) ===\n", rc); - fs->op_errno = rc; - fs->finished = true; - return; - } - - fs->bs_dev = bs_dev; - fs->bs_dev_owned = true; - spdk_bs_load(bs_dev, NULL, zvfs_spdk_bs_load_cb, fs); - fs->bs_dev_owned = false; -} - -void zvfs_spdk_bs_load_cb(void *arg, struct spdk_blob_store *bs, int bserrno) { - zvfs_t *fs = (zvfs_t*)arg; - if (fs == NULL) { - return; - } - if (bserrno == 0 && bs == NULL) { - fs->op_errno = -EIO; - fs->finished = true; - return; - } - - if (bserrno != 0) { - SPDK_DEBUGLOG("load failed, new device, re-create bs_dev and init\n"); - - struct spdk_bs_dev *bs_dev = NULL; - int rc = spdk_bdev_create_bs_dev_ext(ZVFS_BDEV, zvfs_spdk_bdev_event_cb, NULL, &bs_dev); - if (rc != 0) { - SPDK_ERRLOG("re-create bs_dev failed\n"); - fs->op_errno = rc; - fs->finished = true; - return; - } - fs->bs_dev = bs_dev; - fs->bs_dev_owned = true; - - spdk_bs_init(fs->bs_dev, NULL, zvfs_spdk_bs_init_cb, fs); - fs->bs_dev_owned = false; - return; - } - - uint64_t io_unit_size = spdk_bs_get_io_unit_size(bs); - SPDK_DEBUGLOG("io_unit_size : %"PRIu64"\n", io_unit_size); - SPDK_NOTICELOG("io_unit_size=%lu\n", io_unit_size); - - fs->io_unit_size = io_unit_size; - fs->bs = bs; - fs->channel = spdk_bs_alloc_io_channel(fs->bs); - if (fs->channel == NULL) { - fs->op_errno = -ENOMEM; - fs->finished = true; - return ; - } - - fs->finished = true; -} - - -void zvfs_spdk_bs_init_cb(void *arg, struct spdk_blob_store *bs, int bserrno) { - zvfs_t *fs = (zvfs_t*)arg; - if (fs == NULL) { - return; - } - if (bserrno != 0 || bs == NULL) { - fs->op_errno = zvfs_err_from_bserrno(bserrno); - fs->finished = true; - return; - } - - uint64_t io_unit_size = 
spdk_bs_get_io_unit_size(bs); - SPDK_DEBUGLOG("io_unit_size : %"PRIu64"\n", io_unit_size); - SPDK_NOTICELOG("io_unit_size=%lu\n", io_unit_size); - - fs->io_unit_size = io_unit_size; - fs->bs = bs; - fs->channel = spdk_bs_alloc_io_channel(fs->bs); - if (fs->channel == NULL) { - fs->op_errno = -ENOMEM; - fs->finished = true; - return ; - } - - fs->finished = true; -} - -void zvfs_spdk_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, - void *event_ctx) { -} - -/* ================================================================== */ -/* CREATE */ -/* ================================================================== */ -void zvfs_do_create(void *arg) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL || file->fs == NULL || file->fs->bs == NULL) { - if (file != NULL) { - file->op_errno = -EINVAL; - file->finished = true; - } - return; - } - file->op_errno = 0; - - spdk_bs_create_blob(file->fs->bs, zvfs_spdk_bs_create_blob_cb, file); -} - -void zvfs_spdk_bs_create_blob_cb(void *arg, spdk_blob_id blobid, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL) { - return; - } - if (bserrno != 0) { - file->op_errno = zvfs_err_from_bserrno(bserrno); - file->finished = true; - return; - } - - file->blob_id = blobid; - SPDK_DEBUGLOG("create blobid : %"PRIu64"\n", blobid); - - spdk_bs_open_blob(file->fs->bs, blobid, zvfs_spdk_bs_open_blob_cb, file); -} - -void zvfs_spdk_bs_open_blob_cb(void *arg, struct spdk_blob *blb, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (bserrno) { - SPDK_ERRLOG("load blob error: %d\n", bserrno); - file->op_errno = zvfs_err_from_bserrno(bserrno); - file->finished = true; - return; - } - if (blb == NULL) { - file->op_errno = -EIO; - file->finished = true; - return; - } - - file->blob = blb; - - uint64_t free_cluster = spdk_bs_free_cluster_count(file->fs->bs); - if(free_cluster == 0){ - SPDK_ERRLOG("no free cluster: %d\n", bserrno); - file->op_errno = -ENOSPC; - file->finished = 
true; - return ; - } - - spdk_blob_resize(blb, 1, zvfs_spdk_blob_resize_cb, file); -} - -void zvfs_spdk_blob_resize_cb(void *arg, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL) { - return; - } - if (bserrno != 0) { - file->op_errno = zvfs_err_from_bserrno(bserrno); - file->finished = true; - return; - } - - uint64_t total = spdk_blob_get_num_clusters(file->blob); - SPDK_DEBUGLOG("resize blob :%"PRIu64"\n", total); - - if (file->dirent) { - file->dirent->allocated_clusters = total; - } - - spdk_blob_sync_md(file->blob, zvfs_spdk_blob_sync_cb, file); -} - -void zvfs_spdk_blob_sync_cb(void *arg, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL) { - return; - } - if (bserrno != 0) { - file->op_errno = zvfs_err_from_bserrno(bserrno); - file->finished = true; - return; - } - - file->dma_buf = spdk_malloc(BUFFER_SIZE, 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); - if (file->dma_buf == NULL) { - SPDK_ERRLOG("spdk_malloc failed\n"); - file->op_errno = -ENOMEM; - spdk_blob_close(file->blob, zvfs_spdk_blob_open_fail_close_cb, file); - return ; - } - - file->dma_buf_size = BUFFER_SIZE; - file->finished = true; -} - -/* ================================================================== */ -/* OPEN */ -/* ================================================================== */ -void zvfs_do_open(void *arg) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL || file->fs == NULL || file->fs->bs == NULL) { - if (file != NULL) { - file->op_errno = -EINVAL; - file->finished = true; - } - return; - } - file->op_errno = 0; - spdk_bs_open_blob(file->fs->bs, file->blob_id, zvfs_spdk_bs_open_blob_cb2, file); -} - -void zvfs_spdk_bs_open_blob_cb2(void *arg, struct spdk_blob *blb, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; - - if (bserrno) { - SPDK_ERRLOG("load blob error: %d\n", bserrno); - file->op_errno = zvfs_err_from_bserrno(bserrno); - file->finished = true; - return; - } - if (blb == NULL) { - 
file->op_errno = -EIO; - file->finished = true; - return; - } - - file->blob = blb; - - file->dma_buf = spdk_malloc(BUFFER_SIZE, 0x1000, NULL, - SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); - if (!file->dma_buf) { - SPDK_ERRLOG("spdk_malloc failed\n"); - file->op_errno = -ENOMEM; - spdk_blob_close(file->blob, zvfs_spdk_blob_open_fail_close_cb, file); - return; - } - - file->dma_buf_size = BUFFER_SIZE; - file->finished = true; -} - -/* ================================================================== */ -/* READ */ -/* ================================================================== */ -void zvfs_do_read(void *arg) { - zvfs_io_req_t *req = (zvfs_io_req_t *)arg; - zvfs_file_t *file = req ? req->file : NULL; - if (req == NULL || file == NULL || file->fs == NULL || file->blob == NULL || - file->fs->channel == NULL || req->buf == NULL) { - if (req != NULL) { - req->op_errno = -EINVAL; - req->result = 0; - req->finished = true; - } - return; - } - req->op_errno = 0; - - if (req->len == 0) { - req->result = 0; - req->finished = true; - return; - } - - uint64_t io_unit = file->fs->io_unit_size; - if (io_unit == 0) { - req->op_errno = -EIO; - req->result = 0; - req->finished = true; - return; - } - - uint64_t file_sz = file->dirent ? 
file->dirent->file_size : 0; - if (req->offset >= file_sz) { - SPDK_DEBUGLOG("read: EOF\n"); - req->result = 0; - req->finished = true; - return; - } - - if (req->offset + req->len > file_sz) { - req->len = file_sz - req->offset; - } - req->result = req->len; - - calc_lba_range(req); - - uint64_t buf_need = req->lba_count * io_unit; - if (ensure_dma_buf(file, buf_need) != 0) { - SPDK_ERRLOG("ensure_dma_buf failed\n"); - req->op_errno = -ENOMEM; - req->result = 0; - req->finished = true; - return; - } - - spdk_blob_io_read(file->blob, file->fs->channel, - file->dma_buf, - req->lba, req->lba_count, - zvfs_spdk_blob_read_cb, req); -} - -void zvfs_spdk_blob_read_cb(void *arg, int bserrno) { - zvfs_io_req_t *req = (zvfs_io_req_t *)arg; - zvfs_file_t *file = req ? req->file : NULL; - if (req == NULL || file == NULL) { - return; - } - - if (bserrno) { - SPDK_ERRLOG("blob_read error: %d\n", bserrno); - req->op_errno = zvfs_err_from_bserrno(bserrno); - req->result = 0; - req->finished = true; - return; - } - - memcpy(req->buf, - (uint8_t *)file->dma_buf + req->page_off, - req->result); - SPDK_DEBUGLOG("read complete, offset=%" PRIu64 " len=%zu\n", req->offset, req->result); - req->finished = true; -} - -/* ================================================================== */ -/* WRITE */ -/* ================================================================== */ -/** - * 1. write 的 callback 链 - * zvfs_do_write - * └─→ 先用 spdk_blob_io_read 读出覆盖范围内的扇区 - * └─→ zvfs_spdk_blob_write_preread_cb - * (在 dma_buf 里 patch 新数据) - * ├─(需扩容)─→ spdk_blob_resize - * │ └─→ zvfs_spdk_blob_write_resize_cb - * │ └─→ spdk_blob_sync_md - * │ └─→ zvfs_spdk_blob_write_sync_cb - * │ └─→ zvfs_do_write_io - * │ └─→ zvfs_spdk_blob_write_cb - * └─(不需扩容)─→ zvfs_do_write_io - * └─→ zvfs_spdk_blob_write_cb - */ - -/* Step 1 : 进入 write,先把覆盖范围内的扇区读出来(read-modify-write) */ -void zvfs_do_write(void *arg) { - zvfs_io_req_t *req = (zvfs_io_req_t *)arg; - zvfs_file_t *file = req ? 
req->file : NULL; - if (req == NULL || file == NULL || file->fs == NULL || file->blob == NULL || - file->fs->channel == NULL) { - if (req != NULL) { - req->op_errno = -EINVAL; - req->finished = true; - } - return; - } - if (req->buf == NULL && req->len != 0) { - req->op_errno = -EINVAL; - req->finished = true; - return; - } - if (req->len == 0) { - req->result = 0; - req->finished = true; - return; - } - req->op_errno = 0; - - uint64_t io_unit = file->fs->io_unit_size; - if (io_unit == 0) { - req->op_errno = -EIO; - req->finished = true; - return; - } - - calc_lba_range(req); - - uint64_t buf_need = req->lba_count * io_unit; - if (ensure_dma_buf(file, buf_need) != 0) { - SPDK_ERRLOG("ensure_dma_buf failed\n"); - req->op_errno = -ENOMEM; - req->finished = true; - return; - } - - req->aligned = (req->offset % io_unit == 0) && - (req->len % io_unit == 0); - - if (req->aligned) { - memcpy(file->dma_buf, req->buf, req->len); - zvfs_spdk_blob_write_preread_cb(req, 0); - } else { - spdk_blob_io_read(file->blob, file->fs->channel, - file->dma_buf, - req->lba, req->lba_count, - zvfs_spdk_blob_write_preread_cb, req); - } -} - -/* Step 2 : preread 完成,patch dma_buf,然后决定是否扩容 */ -void zvfs_spdk_blob_write_preread_cb(void *arg, int bserrno){ - zvfs_io_req_t *req = (zvfs_io_req_t *)arg; - zvfs_file_t *file = req ? 
req->file : NULL; - if (req == NULL || file == NULL) { - return; - } - - uint64_t io_unit = file->fs->io_unit_size; - if (io_unit == 0) { - req->op_errno = -EIO; - req->finished = true; - return; - } - - /* preread 失败也没关系——如果是新分配区域全零即可, - 这里仍然继续(SPDK 对未写过的区域返回全零)。*/ - if (bserrno) { - SPDK_DEBUGLOG("preread error %d (may be uninitialized, continue)\n", bserrno); - memset(file->dma_buf, 0, req->lba_count * io_unit); - } - - /* 只有非对齐情况才需要 patch,对齐情况下数据已经在 dma_buf 里了(do_write 里拷好的)*/ - if (!req->aligned) { - memcpy((uint8_t *)file->dma_buf + req->page_off, - req->buf, - req->len); - } - - /* - * 稀疏写语义:当写偏移超过旧 EOF 时,gap 区间应当读为 0。 - * 这里至少把本次覆盖到的页内 gap 清零,避免把底层旧数据带入新文件逻辑范围。 - */ - uint64_t old_eof = file->dirent ? file->dirent->file_size : 0; - if (req->offset > old_eof) { - uint64_t page_start = req->lba * io_unit; - uint64_t page_end = page_start + req->lba_count * io_unit; - uint64_t zero_start = old_eof > page_start ? old_eof : page_start; - uint64_t zero_end = req->offset < page_end ? req->offset : page_end; - if (zero_end > zero_start) { - memset((uint8_t *)file->dma_buf + (zero_start - page_start), 0, - zero_end - zero_start); - } - } - - /* 判断是否需要扩容 */ - uint64_t end_byte = req->offset + req->len; - uint64_t need_clusters = zvfs_need_clusters(file->fs, end_byte); - uint64_t cur_clusters = file->dirent ? file->dirent->allocated_clusters - : spdk_blob_get_num_clusters(file->blob); - - if (need_clusters > cur_clusters) { - uint64_t free_clusters = spdk_bs_free_cluster_count(file->fs->bs); - if (need_clusters - cur_clusters > free_clusters) { - SPDK_ERRLOG("no free clusters\n"); - req->op_errno = -ENOSPC; - req->finished = true; - return; - } - spdk_blob_resize(file->blob, need_clusters, - zvfs_spdk_blob_write_resize_cb, req); - } else { - zvfs_do_write_io(req); - } -} - -/* Step 3a : resize 完成 → sync */ -void zvfs_spdk_blob_write_resize_cb(void *arg, int bserrno) { - zvfs_io_req_t *req = (zvfs_io_req_t *)arg; - zvfs_file_t *file = req ? 
req->file : NULL; - if (req == NULL || file == NULL) { - return; - } - - if (bserrno) { - SPDK_ERRLOG("write resize error: %d\n", bserrno); - req->op_errno = zvfs_err_from_bserrno(bserrno); - req->finished = true; - return; - } - - spdk_blob_sync_md(file->blob, zvfs_spdk_blob_write_sync_cb, req); -} - -/* Step 3b : sync 完成 → 真正写 */ -void zvfs_spdk_blob_write_sync_cb(void *arg, int bserrno) { - zvfs_io_req_t *req = (zvfs_io_req_t *)arg; - zvfs_file_t *file = req ? req->file : NULL; - if (req == NULL || file == NULL) { - return; - } - - if (bserrno) { - SPDK_ERRLOG("write sync error: %d\n", bserrno); - req->op_errno = zvfs_err_from_bserrno(bserrno); - req->finished = true; - return; - } - - if (file->dirent) { - file->dirent->allocated_clusters = - (uint32_t)spdk_blob_get_num_clusters(file->blob); - } - - zvfs_do_write_io(req); -} - -/* Step 4 : 实际写入(dma_buf 已经是 patch 后的整扇区数据) */ -void zvfs_do_write_io(zvfs_io_req_t *req) { - zvfs_file_t *file = req ? req->file : NULL; - if (req == NULL || file == NULL || file->fs == NULL) { - return; - } - - uint64_t io_unit_size = file->fs->io_unit_size; - if (io_unit_size == 0) { - req->op_errno = -EIO; - req->finished = true; - return; - } - uint64_t lba_count = (req->page_off + req->len + io_unit_size - 1) / io_unit_size; - - spdk_blob_io_write(file->blob, file->fs->channel, - file->dma_buf, - req->lba, lba_count, - zvfs_spdk_blob_write_cb, req); -} - -/* Step 5 : 写完成 */ -void zvfs_spdk_blob_write_cb(void *arg, int bserrno) { - zvfs_io_req_t *req = (zvfs_io_req_t *)arg; - zvfs_file_t *file = req ? 
req->file : NULL; - if (req == NULL || file == NULL) { - return; - } - - if (bserrno) { - SPDK_ERRLOG("blob_write error: %d\n", bserrno); - req->op_errno = zvfs_err_from_bserrno(bserrno); - req->finished = true; - return; - } - - uint64_t new_end = req->offset + req->len; - if (file->dirent && new_end > file->dirent->file_size) { - file->dirent->file_size = new_end; - } - req->result = req->len; - - SPDK_DEBUGLOG("write complete, offset=%" PRIu64 " len=%zu\n", req->offset, req->result); - req->finished = true; -} - - -/* ================================================================== */ -/* CLOSE */ -/* ================================================================== */ -void zvfs_do_close(void *arg) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL) { - return; - } - file->op_errno = 0; - if (file->blob == NULL) { - file->finished = true; - return; - } - spdk_blob_close(file->blob, zvfs_spdk_blob_close_cb, file); -} - -void zvfs_spdk_blob_open_fail_close_cb(void *arg, int bserrno) -{ - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL) { - return; - } - - if (bserrno) { - SPDK_ERRLOG("blob_close after open/create failure error: %d\n", bserrno); - if (file->op_errno == 0) { - file->op_errno = zvfs_err_from_bserrno(bserrno); - } - } - - file->blob = NULL; - file->finished = true; -} - -void zvfs_spdk_blob_close_cb(void *arg, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; - - if (bserrno) { - SPDK_ERRLOG("blob_close error: %d\n", bserrno); - file->op_errno = zvfs_err_from_bserrno(bserrno); - } - spdk_free(file->dma_buf); - file->dma_buf = NULL; - file->blob = NULL; - file->current_offset = 0; - - file->finished = true; -} - -/* ================================================================== */ -/* DELETE */ -/* ================================================================== */ -void zvfs_do_delete(void *arg) { - zvfs_file_t *file = (zvfs_file_t *)arg; - if (file == NULL || file->fs == NULL || file->fs->bs == NULL) { - if 
(file != NULL) { - file->op_errno = -EINVAL; - file->finished = true; - } - return; - } - file->op_errno = 0; - spdk_bs_delete_blob(file->fs->bs, file->blob_id, zvfs_spdk_blob_delete_cb, file); -} - -void zvfs_spdk_blob_delete_cb(void *arg, int bserrno) { - zvfs_file_t *file = (zvfs_file_t *)arg; - - if (bserrno) { - SPDK_ERRLOG("blob_delete error: %d\n", bserrno); - file->op_errno = zvfs_err_from_bserrno(bserrno); - } - - file->finished = true; -} - -/* ================================================================== */ -/* UNMOUNT */ -/* ================================================================== */ -void zvfs_do_umount(void *arg) { - - zvfs_t *fs = (zvfs_t *)arg; - if (fs == NULL) { - return; - } - fs->op_errno = 0; - - if (fs->bs) { - if (fs->channel) { - spdk_bs_free_io_channel(fs->channel); - fs->channel = NULL; - } - spdk_bs_unload(fs->bs, zvfs_spdk_bs_unload_cb, fs); - return; - } - if (fs->bs_dev && fs->bs_dev_owned) { - fs->bs_dev->destroy(fs->bs_dev); - fs->bs_dev = NULL; - fs->bs_dev_owned = false; - } - fs->finished = true; -} - -void zvfs_spdk_bs_unload_cb(void *arg, int bserrno) { - - zvfs_t *fs = (zvfs_t *)arg; - if (fs == NULL) { - return; - } - if (bserrno != 0) { - fs->op_errno = zvfs_err_from_bserrno(bserrno); - } - fs->bs = NULL; - fs->bs_dev = NULL; - fs->bs_dev_owned = false; - fs->finished = true; -} - -// setup -// zvfs.json -int zvfs_env_setup(void) { - struct spdk_env_opts opts; - spdk_env_opts_init(&opts); - opts.name = "zvfs"; - - int rc = spdk_env_init(&opts); - if (rc != 0) { - return -1; - } - - spdk_log_set_print_level(SPDK_LOG_ERROR); - spdk_log_set_level(SPDK_LOG_NOTICE); - spdk_log_open(NULL); - - int rc2 = spdk_thread_lib_init(NULL, 0); - if (rc2 != 0) { - SPDK_ERRLOG("spdk_thread_lib_init failed\n"); - return -1; - } - - global_thread = spdk_thread_create("global", NULL); - if (global_thread == NULL) { - SPDK_ERRLOG("spdk_thread_create failed\n"); - return -1; - } - spdk_set_thread(global_thread); - - bool done = 
false; - if (!waiter(global_thread, zvfs_json_load_fn, &done, &done) || !done) { - SPDK_ERRLOG("json load waiter timeout\n"); - return -1; - } - - int retry = 0; - while (retry < 200) { // 最多等 20 秒 - spdk_thread_poll(global_thread, 0, 0); - if (spdk_bdev_get_by_name(ZVFS_BDEV) != NULL) { - SPDK_DEBUGLOG("bdev %s ready!\n", ZVFS_BDEV); - break; - } - usleep(100 * 1000); // 100ms - retry++; - } - - if (spdk_bdev_get_by_name(ZVFS_BDEV) == NULL) { - SPDK_ERRLOG("bdev %s not found after 20s timeout!\n", ZVFS_BDEV); - return -1; - } - - - SPDK_DEBUGLOG("zvfs_env_setup complete\n"); - return 0; -} - -void zvfs_json_load_fn(void *arg) { - spdk_subsystem_init_from_json_config(json_file, SPDK_DEFAULT_RPC_ADDR, json_app_load_done, - arg, true); - -} - -void json_app_load_done(int rc, void *ctx) { - bool *done = ctx; - if (rc != 0) { - SPDK_ERRLOG("JSON config load FAILED! rc=%d\n", rc); - } - // 不要 sleep!直接标记完成,让外部 waiter 去轮询 - *done = true; -} - - - - - -// filesystem -// load -int zvfs_mount(struct zvfs_s *fs) { - if (fs == NULL || global_thread == NULL) { - return 0; - } - fs->op_errno = 0; - fs->finished = false; - bool ok = waiter(global_thread, zvfs_do_mount, fs, &fs->finished); - if(!ok) SPDK_ERRLOG("mount result: ok=%d\n", ok); - return ok && fs->op_errno == 0; -} -// unload -int zvfs_umount(struct zvfs_s *fs) { - if (fs == NULL || global_thread == NULL) { - return 0; - } - fs->op_errno = 0; - fs->finished = false; - bool ok = waiter(global_thread, zvfs_do_umount, fs, &fs->finished); - if(!ok) SPDK_ERRLOG("umount result: ok=%d\n", ok); - return ok && fs->op_errno == 0; -} -// file -// create -int zvfs_create(struct zvfs_file_s *file) { - if (file == NULL || global_thread == NULL) { - return 0; - } - file->op_errno = 0; - file->finished = false; - bool ok = waiter(global_thread, zvfs_do_create, file, &file->finished); - if(!ok) SPDK_ERRLOG("create result: ok=%d\n", ok); - return ok && file->op_errno == 0; -} -// open -int zvfs_open(struct zvfs_file_s *file) { - if 
(file == NULL || global_thread == NULL) { - return 0; - } - file->op_errno = 0; - file->finished = false; - bool ok = waiter(global_thread, zvfs_do_open, file, &file->finished); - if(!ok) SPDK_ERRLOG("open result: ok=%d\n", ok); - return ok && file->op_errno == 0; -} -// read -int zvfs_read(struct zvfs_file_s *file, uint8_t *buffer, size_t count) { - if (file == NULL || buffer == NULL || global_thread == NULL) { - return -1; - } - if (count == 0) { - return 0; - } - - zvfs_io_req_t req = { - .file = file, - .op = ZVFS_IO_READ, - .buf = buffer, - .len = count, - .offset = file->current_offset, - .flags = 0, - }; - - int rc = zvfs_pread_internal(&req); - if (rc > 0) { - file->current_offset += (uint64_t)rc; - } - return rc; -} -// write -int zvfs_write(struct zvfs_file_s *file, const uint8_t *buffer, size_t count) { - if (file == NULL || global_thread == NULL) { - return -1; - } - - zvfs_io_req_t req = { - .file = file, - .op = ZVFS_IO_WRITE, - .buf = (uint8_t *)buffer, - .len = count, - .offset = file->current_offset, - .flags = 0, - }; - - int rc = zvfs_pwrite_internal(&req); - if (rc > 0) { - file->current_offset += (uint64_t)rc; - } - return rc; -} - -int zvfs_pread(struct zvfs_file_s *file, uint8_t *buffer, size_t count, uint64_t offset) -{ - if (file == NULL || buffer == NULL || global_thread == NULL) { - return -1; - } - if (count == 0) { - return 0; - } - - zvfs_io_req_t req = { - .file = file, - .op = ZVFS_IO_READ, - .buf = buffer, - .len = count, - .offset = offset, - .flags = 0, - }; - - return zvfs_pread_internal(&req); -} - -int zvfs_pwrite(struct zvfs_file_s *file, const uint8_t *buffer, size_t count, uint64_t offset) -{ - if (file == NULL || global_thread == NULL) { - return -1; - } - - zvfs_io_req_t req = { - .file = file, - .op = ZVFS_IO_WRITE, - .buf = (uint8_t *)buffer, - .len = count, - .offset = offset, - .flags = 0, - }; - - return zvfs_pwrite_internal(&req); -} -// close -int zvfs_close(struct zvfs_file_s *file) { - if (file == NULL || 
global_thread == NULL) { - return 0; - } - file->op_errno = 0; - file->finished = false; - bool ok = waiter(global_thread, zvfs_do_close, file, &file->finished); - if(!ok) SPDK_ERRLOG("close result: ok=%d\n", ok); - return ok && file->op_errno == 0; -} -// delete -int zvfs_delete(struct zvfs_file_s *file) { - if (file == NULL || global_thread == NULL) { - return 0; - } - file->op_errno = 0; - file->finished = false; - bool ok = waiter(global_thread, zvfs_do_delete, file, &file->finished); - if(!ok) SPDK_ERRLOG("delete result: ok=%d\n", ok); - return ok && file->op_errno == 0; -} - -// int main(int argc, char *argv[]) { - -// if (zvfs_env_setup()) { -// return -1; -// } - -// SPDK_NOTICELOG("zvfs_env_setup success\n"); - -// SPDK_NOTICELOG("\n\n zvfs mount start \n\n"); -// zvfs_t *fs = calloc(1, sizeof(zvfs_t)); -// zvfs_mount(fs); - - -// SPDK_NOTICELOG("\n\n zvfs open start \n\n"); -// zvfs_file_t *file = calloc(1, sizeof(zvfs_file_t)); -// file->fs = fs; -// zvfs_dirent_t *dirent = calloc(1, sizeof(zvfs_dirent_t)); -// file->dirent = dirent; -// zvfs_create(file); - -// SPDK_NOTICELOG("\n\n zvfs write start \n\n"); -// char *buffer = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; -// zvfs_write(file, buffer, strlen(buffer)); -// char *buffer2 = "abcdefghijklmnopqrstuvwxyz"; -// zvfs_write(file, buffer2, strlen(buffer2)); - - -// SPDK_NOTICELOG("\n\n zvfs read start \n\n"); -// file->current_offset = 0; -// char rbuffer[BUFFER_SIZE] = {0}; -// int n = zvfs_read(file, rbuffer, BUFFER_SIZE); -// SPDK_NOTICELOG("READ BUFFER:%d, %s\n", n, rbuffer); - -// SPDK_NOTICELOG("\n\n zvfs close start \n\n"); -// zvfs_close(file); - -// SPDK_NOTICELOG("\n\n zvfs delete start \n\n"); -// zvfs_delete(file); - -// free(dirent); -// free(file); - -// SPDK_NOTICELOG("\n\n zvfs umount start \n\n"); -// zvfs_umount(fs); -// free(fs); - -// } diff --git a/zvfs/zvfs.h b/zvfs/zvfs.h deleted file mode 100755 index 4b5e8ac..0000000 --- a/zvfs/zvfs.h +++ /dev/null @@ -1,154 +0,0 @@ -#ifndef __ZVFS_HOOK_H__ 
-#define __ZVFS_HOOK_H__ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#define ZVFS_MAX_FILES 1024 -#define ZVFS_MAX_FD 64 -#define BUFFER_SIZE (1024*8) - -extern const char *json_file; -extern struct spdk_thread *global_thread; -static const int WAITER_MAX_TIME = 10000000; - -/* 目录项(内存中的目录) */ -typedef struct { - char filename[256]; - spdk_blob_id blob_id; - uint64_t file_size; // 文件逻辑大小(字节) - uint64_t allocated_clusters; // 已分配的cluster数量 - bool is_valid; // false 表示已删除 - int32_t open_count; // 打开的文件句柄数量 -} zvfs_dirent_t; - -/* 文件系统全局结构 */ -typedef struct zvfs_s { - struct spdk_bs_dev *bs_dev; - struct spdk_blob_store *bs; - struct spdk_io_channel *channel; - struct spdk_blob *super_blob; // 承载目录日志的blob - uint64_t io_unit_size; // page大小,单位字节 - - /* 目录 */ - zvfs_dirent_t *dirents[ZVFS_MAX_FILES]; // 目录项数组 #define ZVFS_MAX_FILES 1024 - uint32_t dirent_count; // 当前有效项数 - - /* 伪FD表 */ - struct zvfs_file_s *fd_table[ZVFS_MAX_FD]; // // e.g., #define ZVFS_MAX_FD 64 - int fd_base; // 伪FD起始值,如1000 - int openfd_count; - - /* 元数据 */ - uint32_t magic; // 0x5A563146 (ZV1F) - uint32_t version; // 1 - - bool bs_dev_owned; - int op_errno; - - bool finished; -} zvfs_t; - -/* 打开的文件句柄 */ -typedef struct zvfs_file_s { - zvfs_t *fs; - spdk_blob_id blob_id; - struct spdk_blob *blob; - zvfs_dirent_t *dirent; // 指回目录项 file_size/allocated_clusters - - uint64_t current_offset; // 当前读写位置 - int flags; // O_RDONLY / O_RDWR / O_CREAT 等 - int pseudo_fd; - - /* 临时DMA缓冲区(可选:每个file一个,避免每次malloc) */ - void *dma_buf; - uint64_t dma_buf_size; - - /* Small-write coalescing buffer in hook layer. 
*/ - uint8_t *wb_buf; - uint64_t wb_base; - size_t wb_len; - size_t wb_cap; - bool wb_valid; - - int op_errno; - bool finished; -} zvfs_file_t; - -typedef enum { - ZVFS_IO_READ = 0, - ZVFS_IO_WRITE = 1, -} zvfs_io_op_t; - -typedef struct zvfs_io_req_s { - zvfs_file_t *file; - zvfs_io_op_t op; - uint8_t *buf; - size_t len; - uint64_t offset; - int flags; - - size_t result; - int op_errno; - bool finished; - - uint64_t lba; - uint64_t page_off; - uint64_t lba_count; - bool aligned; -} zvfs_io_req_t; - -bool waiter(struct spdk_thread *thread, spdk_msg_fn start_fn, void *ctx, bool *finished); - -int zvfs_env_setup(void); -int zvfs_mount(struct zvfs_s *fs); -int zvfs_umount(struct zvfs_s *fs); -int zvfs_create(struct zvfs_file_s *file); -int zvfs_open(struct zvfs_file_s *file); -int zvfs_read(struct zvfs_file_s *file, uint8_t *buffer, size_t count); -int zvfs_write(struct zvfs_file_s *file, const uint8_t *buffer, size_t count); -int zvfs_pread(struct zvfs_file_s *file, uint8_t *buffer, size_t count, uint64_t offset); -int zvfs_pwrite(struct zvfs_file_s *file, const uint8_t *buffer, size_t count, uint64_t offset); -int zvfs_close(struct zvfs_file_s *file); -int zvfs_delete(struct zvfs_file_s *file); - -/* POSIX hook API(zvfs_hook.c 实现) */ -int open(const char *path, int flags, ...); -int open64(const char *path, int flags, ...); -int openat(int dirfd, const char *path, int flags, ...); -int openat64(int dirfd, const char *path, int flags, ...); -ssize_t read(int fd, void *buf, size_t count); -ssize_t write(int fd, const void *buf, size_t count); -ssize_t pread(int fd, void *buf, size_t count, off_t offset); -ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset); -ssize_t pread64(int fd, void *buf, size_t count, off_t offset); -ssize_t pwrite64(int fd, const void *buf, size_t count, off_t offset); -int close(int fd); -int unlink(const char *name); -int unlinkat(int dirfd, const char *name, int flags); -off_t lseek(int fd, off_t offset, int whence); -int 
fsync(int fd); -int fdatasync(int fd); -int ftruncate(int fd, off_t length); -int fallocate(int fd, int mode, off_t offset, off_t len); -int posix_fadvise(int fd, off_t offset, off_t len, int advice); -int sync_file_range(int fd, off_t offset, off_t nbytes, unsigned int flags); -int mkdir(const char *path, mode_t mode); -int rmdir(const char *path); -int rename(const char *oldpath, const char *newpath); -int access(const char *path, int mode); -int fcntl(int fd, int cmd, ...); -int stat(const char *path, struct stat *st); -int lstat(const char *path, struct stat *st); -int fstat(int fd, struct stat *st); -int fstatat(int dirfd, const char *path, struct stat *st, int flags); - -#endif diff --git a/zvfs/zvfs_hook.c b/zvfs/zvfs_hook.c deleted file mode 100644 index 9af7023..0000000 --- a/zvfs/zvfs_hook.c +++ /dev/null @@ -1,2576 +0,0 @@ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zvfs.h" - -#ifndef FALLOC_FL_KEEP_SIZE -#define FALLOC_FL_KEEP_SIZE 0x01 -#endif - -/* ------------------------------------------------------------------ */ -/* 全局状态 */ -/* ------------------------------------------------------------------ */ - -static zvfs_t *g_fs = NULL; -static bool g_mounted = false; -static bool g_env_init = false; -static bool g_debug = false; -static bool g_debug_init = false; -static const char *g_debug_filter = NULL; - -static const char *META_FILE = "/home/lian/share/10.1-spdk/zvfs/zvfs_meta.txt"; - -#define FD_BASE 10000 -#define DIRFD_BASE 20000 -#define ZVFS_MAX_DIRFD 64 -#define ZVFS_MAX_DIRS 1024 - -#define ZVFS_PATH_PREFIX "/zvfs" -#define ZVFS_WB_CAP (128 * 1024) - -typedef struct { - bool used; - int flags; - char path[PATH_MAX]; -} zvfs_dirfd_t; - -static zvfs_dirfd_t g_dirfd_table[ZVFS_MAX_DIRFD]; -static char *g_dirs[ZVFS_MAX_DIRS]; -static size_t g_dir_count = 0; - -static int 
(*real_open_fn)(const char *, int, ...) = NULL; -static int (*real_openat_fn)(int, const char *, int, ...) = NULL; -static DIR * (*real_opendir_fn)(const char *) = NULL; -static DIR * (*real_fdopendir_fn)(int) = NULL; -static struct dirent *(*real_readdir_fn)(DIR *) = NULL; -static struct dirent64 *(*real_readdir64_fn)(DIR *) = NULL; -static int (*real_closedir_fn)(DIR *) = NULL; -static int (*real_dirfd_fn)(DIR *) = NULL; -static FILE * (*real_fopen_fn)(const char *, const char *) = NULL; -static FILE * (*real_fopen64_fn)(const char *, const char *) = NULL; -static FILE * (*real_fdopen_fn)(int, const char *) = NULL; -static ssize_t (*real_read_fn)(int, void *, size_t) = NULL; -static ssize_t (*real_write_fn)(int, const void *, size_t) = NULL; -static ssize_t (*real_pread_fn)(int, void *, size_t, off_t) = NULL; -static ssize_t (*real_pwrite_fn)(int, const void *, size_t, off_t) = NULL; -static int (*real_close_fn)(int) = NULL; -static int (*real_unlink_fn)(const char *) = NULL; -static int (*real_unlinkat_fn)(int, const char *, int) = NULL; -static off_t (*real_lseek_fn)(int, off_t, int) = NULL; -static int (*real_fsync_fn)(int) = NULL; -static int (*real_fdatasync_fn)(int) = NULL; -static int (*real_ftruncate_fn)(int, off_t) = NULL; -static int (*real_fallocate_fn)(int, int, off_t, off_t) = NULL; -static int (*real_posix_fadvise_fn)(int, off_t, off_t, int) = NULL; -static int (*real_sync_file_range_fn)(int, off_t, off_t, unsigned int) = NULL; -static int (*real_rename_fn)(const char *, const char *) = NULL; -static int (*real_stat_fn)(const char *, struct stat *) = NULL; -static int (*real_lstat_fn)(const char *, struct stat *) = NULL; -static int (*real_fstat_fn)(int, struct stat *) = NULL; -static int (*real_fstatat_fn)(int, const char *, struct stat *, int) = NULL; -static int (*real_stat64_fn)(const char *, struct stat64 *) = NULL; -static int (*real_lstat64_fn)(const char *, struct stat64 *) = NULL; -static int (*real_fstat64_fn)(int, struct stat64 *) = NULL; 
-static int (*real_fstatat64_fn)(int, const char *, struct stat64 *, int) = NULL; -static int (*real___xstat64_fn)(int, const char *, struct stat64 *) = NULL; -static int (*real___lxstat64_fn)(int, const char *, struct stat64 *) = NULL; -static int (*real___fxstat64_fn)(int, int, struct stat64 *) = NULL; -static int (*real___fxstatat64_fn)(int, int, const char *, struct stat64 *, int) = NULL; -static int (*real_access_fn)(const char *, int) = NULL; -static int (*real_mkdir_fn)(const char *, mode_t) = NULL; -static int (*real_rmdir_fn)(const char *) = NULL; -static int (*real_fcntl_fn)(int, int, ...) = NULL; - -/* Forward declarations for wrappers used by stdio cookie callbacks. */ -ssize_t read(int fd, void *buf, size_t count); -ssize_t write(int fd, const void *buf, size_t count); -off64_t lseek64(int fd, off64_t offset, int whence); -int close(int fd); -static int zvfs_ensure_mounted(void); - -__attribute__((format(printf, 1, 2))) -static void debug_log(const char *fmt, ...) -{ - va_list ap; - - if (!g_debug) { - return; - } - va_start(ap, fmt); - fprintf(stderr, "[zvfs-hook][pid=%d] ", getpid()); - vfprintf(stderr, fmt, ap); - fprintf(stderr, "\n"); - va_end(ap); -} - -static int real_open_passthrough(const char *path, int flags, mode_t mode, bool has_mode) -{ - if (!real_open_fn) { - errno = ENOSYS; - return -1; - } - return has_mode ? real_open_fn(path, flags, mode) : real_open_fn(path, flags); -} - -static int real_openat_passthrough(int dirfd, const char *path, int flags, mode_t mode, bool has_mode) -{ - if (!real_openat_fn) { - errno = ENOSYS; - return -1; - } - return has_mode ? 
real_openat_fn(dirfd, path, flags, mode) : real_openat_fn(dirfd, path, flags); -} - -static void debug_init_once(void) -{ - const char *debug_env; - const char *filter_env; - - if (g_debug_init) { - return; - } - g_debug_init = true; - - debug_env = getenv("ZVFS_HOOK_DEBUG"); - if (debug_env && debug_env[0] != '\0' && strcmp(debug_env, "0") != 0) { - g_debug = true; - } - filter_env = getenv("ZVFS_HOOK_DEBUG_PATH"); - if (filter_env && filter_env[0] != '\0') { - g_debug_filter = filter_env; - } -} - -static bool debug_path_enabled(const char *path) -{ - debug_init_once(); - if (!g_debug) { - return false; - } - if (!g_debug_filter) { - return true; - } - return path && strstr(path, g_debug_filter) != NULL; -} - -__attribute__((constructor)) -static void zvfs_preload_init(void) -{ - real_open_fn = dlsym(RTLD_NEXT, "open"); - real_openat_fn = dlsym(RTLD_NEXT, "openat"); - real_opendir_fn = dlsym(RTLD_NEXT, "opendir"); - real_fdopendir_fn = dlsym(RTLD_NEXT, "fdopendir"); - real_readdir_fn = dlsym(RTLD_NEXT, "readdir"); - real_readdir64_fn = dlsym(RTLD_NEXT, "readdir64"); - real_closedir_fn = dlsym(RTLD_NEXT, "closedir"); - real_dirfd_fn = dlsym(RTLD_NEXT, "dirfd"); - real_fopen_fn = dlsym(RTLD_NEXT, "fopen"); - real_fopen64_fn = dlsym(RTLD_NEXT, "fopen64"); - real_fdopen_fn = dlsym(RTLD_NEXT, "fdopen"); - real_read_fn = dlsym(RTLD_NEXT, "read"); - real_write_fn = dlsym(RTLD_NEXT, "write"); - real_pread_fn = dlsym(RTLD_NEXT, "pread"); - real_pwrite_fn = dlsym(RTLD_NEXT, "pwrite"); - real_close_fn = dlsym(RTLD_NEXT, "close"); - real_unlink_fn = dlsym(RTLD_NEXT, "unlink"); - real_unlinkat_fn = dlsym(RTLD_NEXT, "unlinkat"); - real_lseek_fn = dlsym(RTLD_NEXT, "lseek"); - real_fsync_fn = dlsym(RTLD_NEXT, "fsync"); - real_fdatasync_fn = dlsym(RTLD_NEXT, "fdatasync"); - real_ftruncate_fn = dlsym(RTLD_NEXT, "ftruncate"); - real_fallocate_fn = dlsym(RTLD_NEXT, "fallocate"); - real_posix_fadvise_fn = dlsym(RTLD_NEXT, "posix_fadvise"); - real_sync_file_range_fn = dlsym(RTLD_NEXT, 
"sync_file_range"); - real_rename_fn = dlsym(RTLD_NEXT, "rename"); - real_stat_fn = dlsym(RTLD_NEXT, "stat"); - real_lstat_fn = dlsym(RTLD_NEXT, "lstat"); - real_fstat_fn = dlsym(RTLD_NEXT, "fstat"); - real_fstatat_fn = dlsym(RTLD_NEXT, "fstatat"); - real_stat64_fn = dlsym(RTLD_NEXT, "stat64"); - real_lstat64_fn = dlsym(RTLD_NEXT, "lstat64"); - real_fstat64_fn = dlsym(RTLD_NEXT, "fstat64"); - real_fstatat64_fn = dlsym(RTLD_NEXT, "fstatat64"); - real___xstat64_fn = dlsym(RTLD_NEXT, "__xstat64"); - real___lxstat64_fn = dlsym(RTLD_NEXT, "__lxstat64"); - real___fxstat64_fn = dlsym(RTLD_NEXT, "__fxstat64"); - real___fxstatat64_fn = dlsym(RTLD_NEXT, "__fxstatat64"); - real_access_fn = dlsym(RTLD_NEXT, "access"); - real_mkdir_fn = dlsym(RTLD_NEXT, "mkdir"); - real_rmdir_fn = dlsym(RTLD_NEXT, "rmdir"); - real_fcntl_fn = dlsym(RTLD_NEXT, "fcntl"); - debug_init_once(); -} - -/* ------------------------------------------------------------------ */ -/* 路径与目录助手 */ -/* ------------------------------------------------------------------ */ - -static inline bool is_zvfs_path(const char *path) -{ - if (!path || strncmp(path, ZVFS_PATH_PREFIX, sizeof(ZVFS_PATH_PREFIX) - 1) != 0) { - return false; - } - return path[sizeof(ZVFS_PATH_PREFIX) - 1] == '\0' || - path[sizeof(ZVFS_PATH_PREFIX) - 1] == '/'; -} - -static inline bool is_zvfs_fd(int fd) -{ - return fd >= FD_BASE && fd < FD_BASE + ZVFS_MAX_FD; -} - -static inline bool is_zvfs_dirfd(int fd) -{ - return fd >= DIRFD_BASE && fd < DIRFD_BASE + ZVFS_MAX_DIRFD; -} - -static int normalize_path(const char *path, char *out, size_t out_sz) -{ - size_t len; - if (!path || !out || out_sz == 0) { - return -1; - } - len = strnlen(path, out_sz); - if (len == 0 || len >= out_sz) { - return -1; - } - memcpy(out, path, len); - out[len] = '\0'; - while (len > 1 && out[len - 1] == '/') { - out[--len] = '\0'; - } - return 0; -} - -static inline bool is_zvfs_root(const char *path) -{ - return strcmp(path, ZVFS_PATH_PREFIX) == 0; -} - -static int 
get_parent_dir(const char *path, char *out, size_t out_sz) -{ - char tmp[PATH_MAX]; - char *slash; - size_t n; - - if (normalize_path(path, tmp, sizeof(tmp)) != 0 || is_zvfs_root(tmp)) { - return -1; - } - slash = strrchr(tmp, '/'); - if (!slash || slash == tmp) { - return -1; - } - n = (size_t)(slash - tmp); - if (n >= out_sz) { - return -1; - } - memcpy(out, tmp, n); - out[n] = '\0'; - return 0; -} - -static int dirs_add(const char *path) -{ - char norm[PATH_MAX]; - size_t i; - - if (normalize_path(path, norm, sizeof(norm)) != 0 || !is_zvfs_path(norm)) { - return -1; - } - if (is_zvfs_root(norm)) { - return 0; - } - for (i = 0; i < g_dir_count; i++) { - if (strcmp(g_dirs[i], norm) == 0) { - return 0; - } - } - if (g_dir_count >= ZVFS_MAX_DIRS) { - return -1; - } - g_dirs[g_dir_count] = strdup(norm); - if (!g_dirs[g_dir_count]) { - return -1; - } - g_dir_count++; - return 0; -} - -static bool dirs_has_exact(const char *path) -{ - size_t i; - char norm[PATH_MAX]; - if (normalize_path(path, norm, sizeof(norm)) != 0) { - return false; - } - if (is_zvfs_root(norm)) { - return true; - } - for (i = 0; i < g_dir_count; i++) { - if (strcmp(g_dirs[i], norm) == 0) { - return true; - } - } - return false; -} - -static bool dirs_has_children(const char *path) -{ - size_t i; - size_t plen = strlen(path); - for (i = 0; i < g_dir_count; i++) { - if (strncmp(g_dirs[i], path, plen) == 0 && g_dirs[i][plen] == '/') { - return true; - } - } - if (g_fs) { - uint32_t j; - for (j = 0; j < g_fs->dirent_count; j++) { - zvfs_dirent_t *d = g_fs->dirents[j]; - if (!d || !d->is_valid) { - continue; - } - if (strncmp(d->filename, path, plen) == 0 && d->filename[plen] == '/') { - return true; - } - } - } - return false; -} - -static bool dirs_exists(const char *path) -{ - size_t plen; - uint32_t i; - - if (!path || !is_zvfs_path(path)) { - return false; - } - if (dirs_has_exact(path)) { - return true; - } - if (!g_fs) { - return false; - } - - plen = strlen(path); - for (i = 0; i < 
g_fs->dirent_count; i++) { - zvfs_dirent_t *d = g_fs->dirents[i]; - if (!d || !d->is_valid) { - continue; - } - if (strncmp(d->filename, path, plen) == 0 && d->filename[plen] == '/') { - return true; - } - } - return false; -} - -static int dirs_remove(const char *path) -{ - size_t i; - char norm[PATH_MAX]; - if (normalize_path(path, norm, sizeof(norm)) != 0 || is_zvfs_root(norm)) { - return -1; - } - for (i = 0; i < g_dir_count; i++) { - if (strcmp(g_dirs[i], norm) == 0) { - free(g_dirs[i]); - g_dirs[i] = g_dirs[g_dir_count - 1]; - g_dirs[g_dir_count - 1] = NULL; - g_dir_count--; - return 0; - } - } - return -1; -} - -static void dirs_reset(void) -{ - size_t i; - for (i = 0; i < g_dir_count; i++) { - free(g_dirs[i]); - g_dirs[i] = NULL; - } - g_dir_count = 0; - (void)dirs_add(ZVFS_PATH_PREFIX); -} - -static void dirs_rebuild_from_files(void) -{ - uint32_t i; - char tmp[PATH_MAX]; - for (i = 0; i < g_fs->dirent_count; i++) { - zvfs_dirent_t *d = g_fs->dirents[i]; - char *p; - if (!d || !d->is_valid) { - continue; - } - if (normalize_path(d->filename, tmp, sizeof(tmp)) != 0) { - continue; - } - p = strrchr(tmp, '/'); - while (p && strcmp(tmp, ZVFS_PATH_PREFIX) != 0) { - *p = '\0'; - (void)dirs_add(tmp); - p = strrchr(tmp, '/'); - } - } -} - -static int join_dir_path(const char *dir, const char *name, char *out, size_t out_sz) -{ - int n; - if (!dir || !name || !out) { - return -1; - } - if (name[0] == '/') { - return normalize_path(name, out, out_sz); - } - n = snprintf(out, out_sz, "%s/%s", dir, name); - if (n <= 0 || (size_t)n >= out_sz) { - return -1; - } - return normalize_path(out, out, out_sz); -} - -static bool can_read(const zvfs_file_t *file) -{ - int mode = file->flags & O_ACCMODE; - return mode != O_WRONLY; -} - -static bool can_write(const zvfs_file_t *file) -{ - int mode = file->flags & O_ACCMODE; - return mode != O_RDONLY; -} - -static int ensure_writeback_buf(zvfs_file_t *file, size_t need) -{ - uint8_t *p; - size_t cap = ZVFS_WB_CAP; - - if (!file) { 
- return -1; - } - if (file->wb_buf && file->wb_cap >= need) { - return 0; - } - while (cap < need) { - cap <<= 1; - } - p = realloc(file->wb_buf, cap); - if (!p) { - return -1; - } - file->wb_buf = p; - file->wb_cap = cap; - return 0; -} - -static int flush_file_wb(zvfs_file_t *file) -{ - int rc; - - if (!file || !file->wb_valid || file->wb_len == 0) { - return 0; - } - rc = zvfs_pwrite(file, file->wb_buf, file->wb_len, file->wb_base); - if (rc != (int)file->wb_len) { - if (rc >= 0) { - file->op_errno = -EIO; - } - return -1; - } - file->wb_valid = false; - file->wb_len = 0; - return 0; -} - -static int flush_dirent_wb(zvfs_dirent_t *dirent) -{ - int i; - - if (!g_fs || !dirent) { - return 0; - } - for (i = 0; i < ZVFS_MAX_FD; i++) { - zvfs_file_t *f = g_fs->fd_table[i]; - if (!f || f->dirent != dirent) { - continue; - } - if (flush_file_wb(f) != 0) { - return -1; - } - } - return 0; -} - -/* ------------------------------------------------------------------ */ -/* 目录项/FD 辅助 */ -/* ------------------------------------------------------------------ */ - -static zvfs_dirent_t *dirent_find(const char *filename) -{ - uint32_t i; - for (i = 0; i < g_fs->dirent_count; i++) { - zvfs_dirent_t *d = g_fs->dirents[i]; - if (d && d->is_valid && strcmp(d->filename, filename) == 0) { - return d; - } - } - return NULL; -} - -static zvfs_dirent_t *dirent_alloc(const char *filename) -{ - zvfs_dirent_t *d; - if (g_fs->dirent_count >= ZVFS_MAX_FILES) { - return NULL; - } - if (strlen(filename) >= 256) { - errno = ENAMETOOLONG; - return NULL; - } - d = calloc(1, sizeof(*d)); - if (!d) { - return NULL; - } - memcpy(d->filename, filename, strlen(filename) + 1); - d->is_valid = true; - g_fs->dirents[g_fs->dirent_count++] = d; - return d; -} - -static void dirent_remove(zvfs_dirent_t *d) -{ - uint32_t i; - if (!d) { - return; - } - for (i = 0; i < g_fs->dirent_count; i++) { - if (g_fs->dirents[i] == d) { - free(d); - g_fs->dirents[i] = g_fs->dirents[g_fs->dirent_count - 1]; - 
g_fs->dirents[g_fs->dirent_count - 1] = NULL; - g_fs->dirent_count--; - return; - } - } -} - -static int fd_alloc(zvfs_file_t *file) -{ - int i; - for (i = 0; i < ZVFS_MAX_FD; i++) { - if (!g_fs->fd_table[i]) { - g_fs->fd_table[i] = file; - file->pseudo_fd = FD_BASE + i; - g_fs->openfd_count++; - return file->pseudo_fd; - } - } - return -1; -} - -static void fd_free(int pseudo_fd) -{ - int idx = pseudo_fd - FD_BASE; - if (idx < 0 || idx >= ZVFS_MAX_FD) { - return; - } - if (g_fs && g_fs->fd_table[idx]) { - g_fs->fd_table[idx] = NULL; - g_fs->openfd_count--; - } -} - -static zvfs_file_t *fd_lookup(int pseudo_fd) -{ - int idx = pseudo_fd - FD_BASE; - if (!g_fs || idx < 0 || idx >= ZVFS_MAX_FD) { - return NULL; - } - return g_fs->fd_table[idx]; -} - -static int dirfd_alloc(const char *path, int flags) -{ - int i; - for (i = 0; i < ZVFS_MAX_DIRFD; i++) { - if (!g_dirfd_table[i].used) { - g_dirfd_table[i].used = true; - g_dirfd_table[i].flags = flags; - snprintf(g_dirfd_table[i].path, sizeof(g_dirfd_table[i].path), "%s", path); - return DIRFD_BASE + i; - } - } - return -1; -} - -static void dirfd_free(int fd) -{ - int idx = fd - DIRFD_BASE; - if (idx < 0 || idx >= ZVFS_MAX_DIRFD) { - return; - } - g_dirfd_table[idx].used = false; - g_dirfd_table[idx].flags = 0; - g_dirfd_table[idx].path[0] = '\0'; -} - -static zvfs_dirfd_t *dirfd_lookup(int fd) -{ - int idx = fd - DIRFD_BASE; - if (idx < 0 || idx >= ZVFS_MAX_DIRFD || !g_dirfd_table[idx].used) { - return NULL; - } - return &g_dirfd_table[idx]; -} - -static const char *debug_fd_path(int fd) -{ - if (is_zvfs_fd(fd)) { - zvfs_file_t *file = fd_lookup(fd); - if (file && file->dirent) { - return file->dirent->filename; - } - } else if (is_zvfs_dirfd(fd)) { - zvfs_dirfd_t *d = dirfd_lookup(fd); - if (d) { - return d->path; - } - } - return NULL; -} - -static bool debug_fd_enabled(int fd) -{ - return debug_path_enabled(debug_fd_path(fd)); -} - -typedef struct { - char name[NAME_MAX + 1]; - unsigned char type; -} 
zvfs_dir_item_t; - -typedef struct { - uint64_t magic; - int pseudo_fd; - size_t pos; - size_t count; - zvfs_dir_item_t *items; - struct dirent ent; - struct dirent64 ent64; -} zvfs_dir_stream_t; - -#define ZVFS_DIR_STREAM_MAGIC 0x5a56465344495231ULL - -static bool extract_direct_child(const char *parent, const char *path, char *name_out, size_t out_sz) -{ - size_t plen; - const char *start; - const char *slash; - size_t nlen; - - if (!parent || !path || !name_out || out_sz == 0) { - return false; - } - plen = strlen(parent); - if (strncmp(path, parent, plen) != 0) { - return false; - } - if (path[plen] != '/') { - return false; - } - start = path + plen + 1; - if (*start == '\0') { - return false; - } - slash = strchr(start, '/'); - if (slash) { - return false; - } - nlen = strlen(start); - if (nlen == 0 || nlen >= out_sz) { - return false; - } - memcpy(name_out, start, nlen + 1); - return true; -} - -static bool add_dir_item(zvfs_dir_item_t **items, size_t *count, const char *name, unsigned char type) -{ - size_t i; - zvfs_dir_item_t *tmp; - - if (!items || !count || !name) { - return false; - } - for (i = 0; i < *count; i++) { - if (strcmp((*items)[i].name, name) == 0) { - if ((*items)[i].type != DT_DIR && type == DT_DIR) { - (*items)[i].type = DT_DIR; - } - return true; - } - } - tmp = realloc(*items, (*count + 1) * sizeof(**items)); - if (!tmp) { - return false; - } - *items = tmp; - snprintf((*items)[*count].name, sizeof((*items)[*count].name), "%s", name); - (*items)[*count].type = type; - (*count)++; - return true; -} - -static int build_dir_items(const char *path, zvfs_dir_item_t **items_out, size_t *count_out) -{ - size_t i; - uint32_t j; - char name[NAME_MAX + 1]; - zvfs_dir_item_t *items = NULL; - size_t count = 0; - - if (!items_out || !count_out) { - errno = EINVAL; - return -1; - } - if (zvfs_ensure_mounted() != 0) { - errno = EIO; - return -1; - } - if (!dirs_exists(path)) { - errno = ENOENT; - return -1; - } - - if (!add_dir_item(&items, &count, 
".", DT_DIR) || - !add_dir_item(&items, &count, "..", DT_DIR)) { - free(items); - errno = ENOMEM; - return -1; - } - - for (i = 0; i < g_dir_count; i++) { - if (!g_dirs[i] || is_zvfs_root(g_dirs[i])) { - continue; - } - if (extract_direct_child(path, g_dirs[i], name, sizeof(name))) { - if (!add_dir_item(&items, &count, name, DT_DIR)) { - free(items); - errno = ENOMEM; - return -1; - } - } - } - - if (g_fs) { - for (j = 0; j < g_fs->dirent_count; j++) { - zvfs_dirent_t *d = g_fs->dirents[j]; - if (!d || !d->is_valid) { - continue; - } - if (extract_direct_child(path, d->filename, name, sizeof(name))) { - if (!add_dir_item(&items, &count, name, DT_REG)) { - free(items); - errno = ENOMEM; - return -1; - } - } - } - } - - *items_out = items; - *count_out = count; - return 0; -} - -static inline bool is_zvfs_dirstream(DIR *dirp) -{ - zvfs_dir_stream_t *s = (zvfs_dir_stream_t *)(void *)dirp; - return s && s->magic == ZVFS_DIR_STREAM_MAGIC; -} - -static DIR *create_zvfs_dirstream(const char *path, int pseudo_fd) -{ - zvfs_dir_stream_t *s; - - s = calloc(1, sizeof(*s)); - if (!s) { - errno = ENOMEM; - return NULL; - } - if (build_dir_items(path, &s->items, &s->count) != 0) { - free(s); - return NULL; - } - s->magic = ZVFS_DIR_STREAM_MAGIC; - s->pseudo_fd = pseudo_fd; - return (DIR *)(void *)s; -} - -typedef struct { - int fd; -} zvfs_stdio_cookie_t; - -static ssize_t zvfs_cookie_read(void *c, char *buf, size_t size) -{ - zvfs_stdio_cookie_t *ck = (zvfs_stdio_cookie_t *)c; - return read(ck->fd, buf, size); -} - -static ssize_t zvfs_cookie_write(void *c, const char *buf, size_t size) -{ - zvfs_stdio_cookie_t *ck = (zvfs_stdio_cookie_t *)c; - return write(ck->fd, buf, size); -} - -static int zvfs_cookie_seek(void *c, off64_t *off, int whence) -{ - zvfs_stdio_cookie_t *ck = (zvfs_stdio_cookie_t *)c; - off64_t rc = lseek64(ck->fd, *off, whence); - if (rc < 0) { - return -1; - } - *off = rc; - return 0; -} - -static int zvfs_cookie_close(void *c) -{ - zvfs_stdio_cookie_t *ck = 
(zvfs_stdio_cookie_t *)c; - int rc = close(ck->fd); - free(ck); - return rc; -} - -/* ------------------------------------------------------------------ */ -/* 元数据文件 I/O */ -/* ------------------------------------------------------------------ */ - -static int meta_load(zvfs_t *fs) -{ - int fd = real_open_passthrough(META_FILE, O_RDONLY, 0, false); - char buf[4096] = {0}; - char *line; - if (fd < 0) { - fs->dirent_count = 0; - return 0; - } - if (real_read_fn(fd, buf, sizeof(buf) - 1) <= 0) { - (void)real_close_fn(fd); - return 0; - } - (void)real_close_fn(fd); - - line = buf; - while (*line) { - char *nl = strchr(line, '\n'); - zvfs_dirent_t *d; - int ret; - if (nl) { - *nl = '\0'; - } - if (*line == '\0') { - line = nl ? nl + 1 : line + strlen(line); - continue; - } - if (fs->dirent_count >= ZVFS_MAX_FILES) { - break; - } - d = calloc(1, sizeof(*d)); - if (!d) { - break; - } - ret = sscanf(line, "%255s %" PRIu64 " %" PRIu64 " %" PRIu64, - d->filename, &d->blob_id, &d->file_size, &d->allocated_clusters); - if (ret == 4) { - d->is_valid = true; - fs->dirents[fs->dirent_count++] = d; - } else { - free(d); - } - line = nl ? 
nl + 1 : line + strlen(line); - } - return 0; -} - -static int meta_save(zvfs_t *fs) -{ - uint32_t i; - int fd = real_open_passthrough(META_FILE, O_WRONLY | O_CREAT | O_TRUNC, 0644, true); - if (fd < 0) { - return -1; - } - for (i = 0; i < fs->dirent_count; i++) { - zvfs_dirent_t *d = fs->dirents[i]; - char line[512]; - int len; - if (!d || !d->is_valid) { - continue; - } - len = snprintf(line, sizeof(line), "%s %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", - d->filename, d->blob_id, d->file_size, d->allocated_clusters); - if (len > 0) { - (void)real_write_fn(fd, line, (size_t)len); - } - } - (void)real_close_fn(fd); - return 0; -} - -/* ------------------------------------------------------------------ */ -/* 初始化 */ -/* ------------------------------------------------------------------ */ - -static void zvfs_atexit(void) -{ - if (!g_mounted || !g_fs) { - dirs_reset(); - return; - } - (void)meta_save(g_fs); - (void)zvfs_umount(g_fs); - dirs_reset(); -} - -static int zvfs_ensure_mounted(void) -{ - if (g_mounted) { - return 0; - } - - g_fs = calloc(1, sizeof(*g_fs)); - if (!g_fs) { - return -1; - } - g_fs->fd_base = FD_BASE; - dirs_reset(); - - if (meta_load(g_fs) != 0) { - free(g_fs); - g_fs = NULL; - return -1; - } - dirs_rebuild_from_files(); - - if (!g_env_init) { - if (zvfs_env_setup() != 0) { - free(g_fs); - g_fs = NULL; - return -1; - } - g_env_init = true; - } - if (!zvfs_mount(g_fs)) { - (void)zvfs_umount(g_fs); - free(g_fs); - g_fs = NULL; - return -1; - } - g_mounted = true; - atexit(zvfs_atexit); - return 0; -} - -/* ------------------------------------------------------------------ */ -/* stat helpers */ -/* ------------------------------------------------------------------ */ - -static uint64_t path_hash(const char *s) -{ - uint64_t h = 1469598103934665603ULL; - while (*s) { - h ^= (unsigned char)*s++; - h *= 1099511628211ULL; - } - return h; -} - -static void fill_stat(struct stat *st, mode_t mode, off_t size, uint64_t ino) -{ - time_t now = time(NULL); - 
memset(st, 0, sizeof(*st)); - st->st_mode = mode; - st->st_nlink = S_ISDIR(mode) ? 2 : 1; - st->st_uid = getuid(); - st->st_gid = getgid(); - st->st_size = size; - st->st_blksize = 4096; - st->st_blocks = (blkcnt_t)((size + 511) / 512); - st->st_ino = (ino_t)ino; - st->st_atime = now; - st->st_mtime = now; - st->st_ctime = now; -} - -static void fill_stat64(struct stat64 *st, mode_t mode, off64_t size, uint64_t ino) -{ - time_t now = time(NULL); - memset(st, 0, sizeof(*st)); - st->st_mode = mode; - st->st_nlink = S_ISDIR(mode) ? 2 : 1; - st->st_uid = getuid(); - st->st_gid = getgid(); - st->st_size = size; - st->st_blksize = 4096; - st->st_blocks = (blkcnt64_t)((size + 511) / 512); - st->st_ino = (ino64_t)ino; - st->st_atime = now; - st->st_mtime = now; - st->st_ctime = now; -} - -static bool is_stale_blob_op_errno(int op_errno) -{ - int e = op_errno; - if (e < 0) { - e = -e; - } - return e == ENOENT || e == EINVAL; -} - -/* ------------------------------------------------------------------ */ -/* open helpers */ -/* ------------------------------------------------------------------ */ - -static int open_zvfs_file(const char *path, int flags) -{ - char norm[PATH_MAX]; - char parent[PATH_MAX]; - zvfs_dirent_t *dirent; - zvfs_file_t *file; - bool created = false; - bool stale_repaired = false; - int ok; - int fd; - - if (zvfs_ensure_mounted() != 0) { - if (debug_path_enabled(path)) { - debug_log("open_zvfs_file path=%s flags=0x%x mount failed", path, flags); - } - errno = EIO; - return -1; - } - if (normalize_path(path, norm, sizeof(norm)) != 0) { - if (debug_path_enabled(path)) { - debug_log("open_zvfs_file path=%s flags=0x%x normalize failed", path, flags); - } - errno = ENOENT; - return -1; - } - - if (dirs_exists(norm)) { - int accmode = flags & O_ACCMODE; - if (accmode != O_RDONLY || (flags & (O_CREAT | O_TRUNC))) { - errno = EISDIR; - return -1; - } - fd = dirfd_alloc(norm, flags); - if (fd < 0) { - errno = EMFILE; - return -1; - } - if 
(debug_path_enabled(norm)) { - debug_log("open_zvfs_file path=%s flags=0x%x -> dirfd=%d", norm, flags, fd); - } - return fd; - } - if (flags & O_DIRECTORY) { - errno = ENOTDIR; - return -1; - } - if (get_parent_dir(norm, parent, sizeof(parent)) != 0 || !dirs_exists(parent)) { - errno = ENOENT; - return -1; - } - - dirent = dirent_find(norm); - if (!dirent) { - if (!(flags & O_CREAT)) { - errno = ENOENT; - return -1; - } - dirent = dirent_alloc(norm); - if (!dirent) { - errno = ENOMEM; - return -1; - } - created = true; - } else { - if ((flags & O_CREAT) && (flags & O_EXCL)) { - errno = EEXIST; - return -1; - } - } - - file = calloc(1, sizeof(*file)); - if (!file) { - errno = ENOMEM; - return -1; - } - file->fs = g_fs; - file->dirent = dirent; - file->flags = flags; - - if (dirent->blob_id == 0) { - ok = zvfs_create(file); - if (ok) { - dirent->blob_id = file->blob_id; - } - } else { - file->blob_id = dirent->blob_id; - ok = zvfs_open(file); - if (!ok && (flags & O_CREAT) && is_stale_blob_op_errno(file->op_errno)) { - /* Metadata may refer to a blob that no longer exists in Blobstore. */ - dirent->blob_id = 0; - dirent->file_size = 0; - dirent->allocated_clusters = 0; - file->blob_id = 0; - file->op_errno = 0; - file->finished = false; - ok = zvfs_create(file); - if (ok) { - dirent->blob_id = file->blob_id; - stale_repaired = true; - } - } - } - if (!ok) { - int op_errno = file->op_errno; - free(file); - errno = is_stale_blob_op_errno(op_errno) ? 
ENOENT : EIO; - return -1; - } - - if ((flags & O_TRUNC) && can_write(file)) { - dirent->file_size = 0; - file->current_offset = 0; - } else if (flags & O_APPEND) { - file->current_offset = dirent->file_size; - } - - fd = fd_alloc(file); - if (fd < 0) { - (void)zvfs_close(file); - free(file); - errno = EMFILE; - return -1; - } - dirent->open_count++; - if (created || stale_repaired || (flags & O_TRUNC)) { - (void)meta_save(g_fs); - } - if (debug_path_enabled(norm)) { - debug_log("open_zvfs_file path=%s flags=0x%x created=%d blob=%" PRIu64 " -> fd=%d", - norm, flags, created ? 1 : 0, file->blob_id, fd); - } - return fd; -} - -static int resolve_path_at(int dirfd, const char *path, char *resolved, size_t sz) -{ - if (!path || !resolved || sz == 0) { - errno = EINVAL; - return -1; - } - if (path[0] == '/') { - if (normalize_path(path, resolved, sz) != 0) { - errno = ENOENT; - return -1; - } - return 0; - } - if (dirfd == AT_FDCWD) { - return -1; - } - if (is_zvfs_fd(dirfd)) { - errno = ENOTDIR; - return -1; - } - if (is_zvfs_dirfd(dirfd)) { - zvfs_dirfd_t *d = dirfd_lookup(dirfd); - if (!d || join_dir_path(d->path, path, resolved, sz) != 0) { - errno = ENOENT; - return -1; - } - return 0; - } - return -1; -} - -/* ------------------------------------------------------------------ */ -/* POSIX hooks */ -/* ------------------------------------------------------------------ */ - -int __open_2(const char *path, int flags) -{ - return open(path, flags); -} - -int __open64_2(const char *path, int flags) -{ - return open64(path, flags); -} - -int __openat_2(int dirfd, const char *path, int flags) -{ - return openat(dirfd, path, flags); -} - -int __openat64_2(int dirfd, const char *path, int flags) -{ - return openat64(dirfd, path, flags); -} - -DIR *opendir(const char *name) -{ - DIR *dirp; - int fd; - - if (!is_zvfs_path(name)) { - if (!real_opendir_fn) { - errno = ENOSYS; - return NULL; - } - return real_opendir_fn(name); - } - - fd = open_zvfs_file(name, O_RDONLY | 
O_DIRECTORY); - if (fd < 0) { - return NULL; - } - dirp = create_zvfs_dirstream(name, fd); - if (!dirp) { - (void)close(fd); - return NULL; - } - if (debug_path_enabled(name)) { - debug_log("opendir path=%s -> dirp=%p fd=%d", name, (void *)dirp, fd); - } - return dirp; -} - -DIR *fdopendir(int fd) -{ - DIR *dirp; - zvfs_dirfd_t *d; - - if (!is_zvfs_dirfd(fd)) { - if (!real_fdopendir_fn) { - errno = ENOSYS; - return NULL; - } - return real_fdopendir_fn(fd); - } - d = dirfd_lookup(fd); - if (!d) { - errno = EBADF; - return NULL; - } - dirp = create_zvfs_dirstream(d->path, fd); - if (debug_fd_enabled(fd)) { - debug_log("fdopendir fd=%d path=%s -> dirp=%p", fd, d->path, (void *)dirp); - } - return dirp; -} - -struct dirent *readdir(DIR *dirp) -{ - zvfs_dir_stream_t *s; - - if (!is_zvfs_dirstream(dirp)) { - return real_readdir_fn ? real_readdir_fn(dirp) : NULL; - } - s = (zvfs_dir_stream_t *)(void *)dirp; - if (s->pos >= s->count) { - return NULL; - } - memset(&s->ent, 0, sizeof(s->ent)); - s->ent.d_ino = (ino_t)path_hash(s->items[s->pos].name); - s->ent.d_off = (off_t)(s->pos + 1); - s->ent.d_reclen = (unsigned short)sizeof(struct dirent); - s->ent.d_type = s->items[s->pos].type; - snprintf(s->ent.d_name, sizeof(s->ent.d_name), "%s", s->items[s->pos].name); - s->pos++; - return &s->ent; -} - -struct dirent64 *readdir64(DIR *dirp) -{ - zvfs_dir_stream_t *s; - - if (!is_zvfs_dirstream(dirp)) { - return real_readdir64_fn ? 
real_readdir64_fn(dirp) : (struct dirent64 *)readdir(dirp); - } - s = (zvfs_dir_stream_t *)(void *)dirp; - if (s->pos >= s->count) { - return NULL; - } - memset(&s->ent64, 0, sizeof(s->ent64)); - s->ent64.d_ino = (ino64_t)path_hash(s->items[s->pos].name); - s->ent64.d_off = (off64_t)(s->pos + 1); - s->ent64.d_reclen = (unsigned short)sizeof(struct dirent64); - s->ent64.d_type = s->items[s->pos].type; - snprintf(s->ent64.d_name, sizeof(s->ent64.d_name), "%s", s->items[s->pos].name); - s->pos++; - return &s->ent64; -} - -int closedir(DIR *dirp) -{ - zvfs_dir_stream_t *s; - - if (!is_zvfs_dirstream(dirp)) { - return real_closedir_fn ? real_closedir_fn(dirp) : -1; - } - s = (zvfs_dir_stream_t *)(void *)dirp; - if (s->pseudo_fd >= 0) { - (void)close(s->pseudo_fd); - s->pseudo_fd = -1; - } - free(s->items); - s->items = NULL; - s->magic = 0; - free(s); - return 0; -} - -int dirfd(DIR *dirp) -{ - zvfs_dir_stream_t *s; - - if (!is_zvfs_dirstream(dirp)) { - return real_dirfd_fn ? real_dirfd_fn(dirp) : -1; - } - s = (zvfs_dir_stream_t *)(void *)dirp; - if (s->pseudo_fd < 0) { - errno = EINVAL; - return -1; - } - return s->pseudo_fd; -} - -int open(const char *path, int flags, ...) -{ - mode_t mode = 0; - bool has_mode = (flags & O_CREAT) != 0; - if (has_mode) { - va_list ap; - va_start(ap, flags); - mode = va_arg(ap, mode_t); - va_end(ap); - } - if (!is_zvfs_path(path)) { - return real_open_passthrough(path, flags, mode, has_mode); - } - if (debug_path_enabled(path)) { - debug_log("open path=%s flags=0x%x", path, flags); - } - return open_zvfs_file(path, flags); -} - -int open64(const char *path, int flags, ...) 
-{ - mode_t mode = 0; - bool has_mode = (flags & O_CREAT) != 0; - if (has_mode) { - va_list ap; - va_start(ap, flags); - mode = va_arg(ap, mode_t); - va_end(ap); - } - if (!is_zvfs_path(path)) { - return real_open_passthrough(path, flags, mode, has_mode); - } - if (debug_path_enabled(path)) { - debug_log("open64 path=%s flags=0x%x", path, flags); - } - return open_zvfs_file(path, flags); -} - -int openat(int dirfd, const char *path, int flags, ...) -{ - mode_t mode = 0; - bool has_mode = (flags & O_CREAT) != 0; - char resolved[PATH_MAX]; - - if (has_mode) { - va_list ap; - va_start(ap, flags); - mode = va_arg(ap, mode_t); - va_end(ap); - } - - if (path[0] == '/' && is_zvfs_path(path)) { - if (debug_path_enabled(path)) { - debug_log("openat dirfd=%d path=%s flags=0x%x (abs)", dirfd, path, flags); - } - return open_zvfs_file(path, flags); - } - if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) { - if (debug_path_enabled(resolved)) { - debug_log("openat dirfd=%d path=%s resolved=%s flags=0x%x", dirfd, path, resolved, flags); - } - return open_zvfs_file(resolved, flags); - } - return real_openat_passthrough(dirfd, path, flags, mode, has_mode); -} - -int openat64(int dirfd, const char *path, int flags, ...) 
-{ - mode_t mode = 0; - bool has_mode = (flags & O_CREAT) != 0; - char resolved[PATH_MAX]; - - if (has_mode) { - va_list ap; - va_start(ap, flags); - mode = va_arg(ap, mode_t); - va_end(ap); - } - - if (path[0] == '/' && is_zvfs_path(path)) { - if (debug_path_enabled(path)) { - debug_log("openat64 dirfd=%d path=%s flags=0x%x (abs)", dirfd, path, flags); - } - return open_zvfs_file(path, flags); - } - if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) { - if (debug_path_enabled(resolved)) { - debug_log("openat64 dirfd=%d path=%s resolved=%s flags=0x%x", dirfd, path, resolved, flags); - } - return open_zvfs_file(resolved, flags); - } - return real_openat_passthrough(dirfd, path, flags, mode, has_mode); -} - -FILE *fopen(const char *path, const char *mode) -{ - FILE *ret; - int saved_errno = 0; - - if (!real_fopen_fn) { - errno = ENOSYS; - return NULL; - } - ret = real_fopen_fn(path, mode); - if (!ret) { - saved_errno = errno; - } - if (debug_path_enabled(path)) { - debug_log("fopen path=%s mode=%s -> %p errno=%d", path, mode, (void *)ret, saved_errno); - } - if (!ret) { - errno = saved_errno; - } - return ret; -} - -FILE *fopen64(const char *path, const char *mode) -{ - FILE *ret; - int saved_errno = 0; - - if (!real_fopen64_fn) { - return fopen(path, mode); - } - ret = real_fopen64_fn(path, mode); - if (!ret) { - saved_errno = errno; - } - if (debug_path_enabled(path)) { - debug_log("fopen64 path=%s mode=%s -> %p errno=%d", path, mode, (void *)ret, saved_errno); - } - if (!ret) { - errno = saved_errno; - } - return ret; -} - -FILE *fdopen(int fd, const char *mode) -{ - FILE *ret; - int saved_errno = 0; - - if (!is_zvfs_fd(fd)) { - if (!real_fdopen_fn) { - errno = ENOSYS; - return NULL; - } - ret = real_fdopen_fn(fd, mode); - if (!ret) { - saved_errno = errno; - } - if (debug_fd_enabled(fd)) { - const char *path = debug_fd_path(fd); - debug_log("fdopen(real) fd=%d path=%s mode=%s -> %p errno=%d", - fd, path ? 
path : "?", mode, (void *)ret, saved_errno); - } - if (!ret) { - errno = saved_errno; - } - return ret; - } - - { - cookie_io_functions_t io = {0}; - zvfs_stdio_cookie_t *cookie; - - cookie = calloc(1, sizeof(*cookie)); - if (!cookie) { - errno = ENOMEM; - return NULL; - } - cookie->fd = fd; - io.read = zvfs_cookie_read; - io.write = zvfs_cookie_write; - io.seek = zvfs_cookie_seek; - io.close = zvfs_cookie_close; - - ret = fopencookie(cookie, mode, io); - if (!ret) { - saved_errno = errno; - free(cookie); - } - - if (debug_fd_enabled(fd)) { - const char *path = debug_fd_path(fd); - debug_log("fdopen(cookie) fd=%d path=%s mode=%s -> %p errno=%d", - fd, path ? path : "?", mode, (void *)ret, saved_errno); - } - if (!ret) { - errno = saved_errno; - } - return ret; - } -} - -ssize_t read(int fd, void *buf, size_t count) -{ - zvfs_file_t *file; - int rc; - const char *path; - - if (!is_zvfs_fd(fd)) { - return real_read_fn ? real_read_fn(fd, buf, count) : -1; - } - path = debug_fd_path(fd); - file = fd_lookup(fd); - if (!file || !can_read(file)) { - if (debug_fd_enabled(fd)) { - debug_log("read fd=%d path=%s count=%zu -> EBADF", fd, path ? path : "?", count); - } - errno = EBADF; - return -1; - } - if (flush_dirent_wb(file->dirent) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - rc = zvfs_read(file, (uint8_t *)buf, count); - if (rc < 0) { - errno = file->op_errno ? -file->op_errno : EIO; - } - if (debug_fd_enabled(fd) && (rc < 0 || count <= 64)) { - debug_log("read fd=%d path=%s count=%zu -> rc=%d errno=%d", fd, path ? path : "?", count, rc, rc < 0 ? errno : 0); - } - return rc; -} - -ssize_t write(int fd, const void *buf, size_t count) -{ - zvfs_file_t *file; - int rc; - uint64_t off; - - if (!is_zvfs_fd(fd)) { - return real_write_fn ? real_write_fn(fd, buf, count) : -1; - } - file = fd_lookup(fd); - if (!file || !can_write(file)) { - errno = EBADF; - return -1; - } - if (file->flags & O_APPEND) { - file->current_offset = file->dirent ? 
file->dirent->file_size : file->current_offset; - } - if (count == 0) { - return 0; - } - - off = file->current_offset; - if (count <= ZVFS_WB_CAP) { - if (!file->wb_valid || - off != file->wb_base + file->wb_len || - file->wb_len + count > file->wb_cap) { - if (flush_file_wb(file) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - if (ensure_writeback_buf(file, count) != 0) { - errno = ENOMEM; - return -1; - } - file->wb_valid = true; - file->wb_base = off; - file->wb_len = 0; - } - memcpy(file->wb_buf + file->wb_len, buf, count); - file->wb_len += count; - file->current_offset += count; - if (file->dirent && file->current_offset > file->dirent->file_size) { - file->dirent->file_size = file->current_offset; - } - return (ssize_t)count; - } - - if (flush_file_wb(file) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - rc = zvfs_write(file, (const uint8_t *)buf, count); - if (rc < 0) { - errno = file->op_errno ? -file->op_errno : EIO; - } - return rc; -} - -ssize_t pread(int fd, void *buf, size_t count, off_t offset) -{ - zvfs_file_t *file; - int rc; - const char *path; - if (!is_zvfs_fd(fd)) { - return real_pread_fn ? real_pread_fn(fd, buf, count, offset) : -1; - } - if (offset < 0) { - errno = EINVAL; - return -1; - } - path = debug_fd_path(fd); - file = fd_lookup(fd); - if (!file || !can_read(file)) { - if (debug_fd_enabled(fd)) { - debug_log("pread fd=%d path=%s count=%zu off=%lld -> EBADF", - fd, path ? path : "?", count, (long long)offset); - } - errno = EBADF; - return -1; - } - if (flush_dirent_wb(file->dirent) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - rc = zvfs_pread(file, (uint8_t *)buf, count, (uint64_t)offset); - if (rc < 0) { - errno = file->op_errno ? -file->op_errno : EIO; - } - if (debug_fd_enabled(fd) && (rc < 0 || count <= 64)) { - debug_log("pread fd=%d path=%s count=%zu off=%lld -> rc=%d errno=%d", - fd, path ? path : "?", count, (long long)offset, rc, rc < 0 ? 
errno : 0); - } - return rc; -} - -ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) -{ - zvfs_file_t *file; - int rc; - if (!is_zvfs_fd(fd)) { - return real_pwrite_fn ? real_pwrite_fn(fd, buf, count, offset) : -1; - } - if (offset < 0) { - errno = EINVAL; - return -1; - } - file = fd_lookup(fd); - if (!file || !can_write(file)) { - errno = EBADF; - return -1; - } - if (flush_dirent_wb(file->dirent) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - rc = zvfs_pwrite(file, (const uint8_t *)buf, count, (uint64_t)offset); - if (rc < 0) { - errno = file->op_errno ? -file->op_errno : EIO; - } - return rc; -} - -ssize_t pread64(int fd, void *buf, size_t count, off_t offset) -{ - return pread(fd, buf, count, offset); -} - -ssize_t pwrite64(int fd, const void *buf, size_t count, off_t offset) -{ - return pwrite(fd, buf, count, offset); -} - -off_t lseek(int fd, off_t offset, int whence) -{ - zvfs_file_t *file; - off_t new_offset; - uint64_t file_size; - - if (!is_zvfs_fd(fd)) { - return real_lseek_fn ? real_lseek_fn(fd, offset, whence) : -1; - } - file = fd_lookup(fd); - if (!file) { - errno = EBADF; - return -1; - } - if (flush_file_wb(file) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - file_size = file->dirent ? file->dirent->file_size : 0; - switch (whence) { - case SEEK_SET: - new_offset = offset; - break; - case SEEK_CUR: - new_offset = (off_t)file->current_offset + offset; - break; - case SEEK_END: - new_offset = (off_t)file_size + offset; - break; - default: - errno = EINVAL; - return -1; - } - if (new_offset < 0) { - errno = EINVAL; - return -1; - } - file->current_offset = (uint64_t)new_offset; - if (debug_fd_enabled(fd)) { - const char *path = debug_fd_path(fd); - debug_log("lseek fd=%d path=%s off=%lld whence=%d -> %lld", - fd, path ? 
path : "?", (long long)offset, whence, (long long)new_offset); - } - return new_offset; -} - -int close(int fd) -{ - zvfs_file_t *file; - zvfs_dirent_t *dirent; - char log_path[PATH_MAX]; - bool log_enabled = false; - - if (!is_zvfs_fd(fd) && !is_zvfs_dirfd(fd)) { - return real_close_fn ? real_close_fn(fd) : -1; - } - if (is_zvfs_dirfd(fd)) { - if (debug_fd_enabled(fd)) { - const char *path = debug_fd_path(fd); - debug_log("close dirfd=%d path=%s", fd, path ? path : "?"); - } - dirfd_free(fd); - return 0; - } - - file = fd_lookup(fd); - if (!file) { - errno = EBADF; - return -1; - } - dirent = file->dirent; - if (dirent && debug_path_enabled(dirent->filename)) { - snprintf(log_path, sizeof(log_path), "%s", dirent->filename); - log_enabled = true; - } - - if (flush_file_wb(file) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - if (!zvfs_close(file)) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - fd_free(fd); - - if (dirent) { - dirent->open_count--; - if (dirent->open_count == 0 && !dirent->is_valid) { - if (!zvfs_delete(file) && !is_stale_blob_op_errno(file->op_errno)) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - dirent_remove(dirent); - (void)meta_save(g_fs); - } - } - free(file->wb_buf); - file->wb_buf = NULL; - file->wb_cap = 0; - free(file); - if (log_enabled) { - debug_log("close fd=%d path=%s", fd, log_path); - } - return 0; -} - -int unlink(const char *name) -{ - char norm[PATH_MAX]; - zvfs_dirent_t *d; - if (!is_zvfs_path(name)) { - return real_unlink_fn ? 
real_unlink_fn(name) : -1; - } - if (zvfs_ensure_mounted() != 0 || normalize_path(name, norm, sizeof(norm)) != 0) { - errno = EIO; - return -1; - } - if (dirs_exists(norm)) { - errno = EISDIR; - return -1; - } - d = dirent_find(norm); - if (!d) { - errno = ENOENT; - return -1; - } - if (d->open_count > 0) { - d->is_valid = false; - return 0; - } - if (flush_dirent_wb(d) != 0) { - errno = EIO; - return -1; - } - - if (d->blob_id != 0) { - zvfs_file_t tmp = {0}; - tmp.fs = g_fs; - tmp.dirent = d; - tmp.blob_id = d->blob_id; - if (!zvfs_delete(&tmp) && !is_stale_blob_op_errno(tmp.op_errno)) { - errno = tmp.op_errno ? -tmp.op_errno : EIO; - return -1; - } - } - dirent_remove(d); - (void)meta_save(g_fs); - return 0; -} - -int unlinkat(int dirfd, const char *path, int flags) -{ - char resolved[PATH_MAX]; - - if (flags & AT_REMOVEDIR) { - if (path[0] == '/' && is_zvfs_path(path)) { - return rmdir(path); - } - if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) { - return rmdir(resolved); - } - } else { - if (path[0] == '/' && is_zvfs_path(path)) { - return unlink(path); - } - if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) { - return unlink(resolved); - } - } - return real_unlinkat_fn ? real_unlinkat_fn(dirfd, path, flags) : -1; -} - -int fsync(int fd) -{ - if (is_zvfs_fd(fd)) { - zvfs_file_t *file = fd_lookup(fd); - if (!file) { - errno = EBADF; - return -1; - } - if (flush_dirent_wb(file->dirent) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - return 0; - } - if (is_zvfs_dirfd(fd)) { - return 0; - } - return real_fsync_fn ? real_fsync_fn(fd) : -1; -} - -int fdatasync(int fd) -{ - if (is_zvfs_fd(fd)) { - zvfs_file_t *file = fd_lookup(fd); - if (!file) { - errno = EBADF; - return -1; - } - if (flush_dirent_wb(file->dirent) != 0) { - errno = file->op_errno ? 
-file->op_errno : EIO; - return -1; - } - return 0; - } - if (is_zvfs_dirfd(fd)) { - return 0; - } - return real_fdatasync_fn ? real_fdatasync_fn(fd) : -1; -} - -int ftruncate(int fd, off_t length) -{ - zvfs_file_t *file; - if (!is_zvfs_fd(fd)) { - return real_ftruncate_fn ? real_ftruncate_fn(fd, length) : -1; - } - if (length < 0) { - errno = EINVAL; - return -1; - } - file = fd_lookup(fd); - if (!file || !can_write(file)) { - errno = EBADF; - return -1; - } - if (flush_dirent_wb(file->dirent) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - - if ((uint64_t)length > file->dirent->file_size && length > 0) { - uint8_t zero = 0; - if (zvfs_pwrite(file, &zero, 1, (uint64_t)length - 1) != 1) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - } - file->dirent->file_size = (uint64_t)length; - if (file->current_offset > (uint64_t)length) { - file->current_offset = (uint64_t)length; - } - (void)meta_save(g_fs); - return 0; -} - -int fallocate(int fd, int mode, off_t offset, off_t len) -{ - zvfs_file_t *file; - uint8_t zero = 0; - uint64_t end; - bool keep_size; - - if (!is_zvfs_fd(fd)) { - return real_fallocate_fn ? real_fallocate_fn(fd, mode, offset, len) : -1; - } - if (offset < 0 || len < 0) { - errno = EINVAL; - return -1; - } - file = fd_lookup(fd); - if (!file || !can_write(file)) { - errno = EBADF; - return -1; - } - if (flush_dirent_wb(file->dirent) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - - /* Minimal support: mode=0 or KEEP_SIZE only. */ - keep_size = (mode & FALLOC_FL_KEEP_SIZE) != 0; - if (mode & ~FALLOC_FL_KEEP_SIZE) { - errno = EOPNOTSUPP; - return -1; - } - if (len == 0) { - return 0; - } - - end = (uint64_t)offset + (uint64_t)len; - if (!keep_size) { - if (end > file->dirent->file_size) { - if (zvfs_pwrite(file, &zero, 1, end - 1) != 1) { - errno = file->op_errno ? 
-file->op_errno : EIO; - return -1; - } - } - } - return 0; -} - -int posix_fadvise(int fd, off_t offset, off_t len, int advice) -{ - (void)offset; - (void)len; - (void)advice; - if (is_zvfs_fd(fd) || is_zvfs_dirfd(fd)) { - return 0; - } - if (!real_posix_fadvise_fn) { - return 0; - } - return real_posix_fadvise_fn(fd, offset, len, advice); -} - -int posix_fadvise64(int fd, off_t offset, off_t len, int advice) -{ - return posix_fadvise(fd, offset, len, advice); -} - -int sync_file_range(int fd, off_t offset, off_t nbytes, unsigned int flags) -{ - (void)offset; - (void)nbytes; - (void)flags; - if (is_zvfs_fd(fd)) { - zvfs_file_t *file = fd_lookup(fd); - if (!file) { - errno = EBADF; - return -1; - } - if (flush_dirent_wb(file->dirent) != 0) { - errno = file->op_errno ? -file->op_errno : EIO; - return -1; - } - return 0; - } - if (is_zvfs_dirfd(fd)) { - return 0; - } - return real_sync_file_range_fn ? real_sync_file_range_fn(fd, offset, nbytes, flags) : 0; -} - -int fallocate64(int fd, int mode, off_t offset, off_t len) -{ - return fallocate(fd, mode, offset, len); -} - -int mkdir(const char *path, mode_t mode) -{ - char norm[PATH_MAX]; - char parent[PATH_MAX]; - (void)mode; - - if (!is_zvfs_path(path)) { - return real_mkdir_fn ? real_mkdir_fn(path, mode) : -1; - } - if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) { - errno = EIO; - return -1; - } - if (dirs_exists(norm) || dirent_find(norm)) { - errno = EEXIST; - return -1; - } - if (get_parent_dir(norm, parent, sizeof(parent)) != 0 || !dirs_exists(parent)) { - errno = ENOENT; - return -1; - } - if (dirs_add(norm) != 0) { - errno = ENOSPC; - return -1; - } - return 0; -} - -int rmdir(const char *path) -{ - char norm[PATH_MAX]; - if (!is_zvfs_path(path)) { - return real_rmdir_fn ? 
real_rmdir_fn(path) : -1; - } - if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) { - errno = EIO; - return -1; - } - if (is_zvfs_root(norm)) { - errno = EBUSY; - return -1; - } - if (!dirs_has_exact(norm)) { - errno = ENOENT; - return -1; - } - if (dirs_has_children(norm)) { - errno = ENOTEMPTY; - return -1; - } - if (dirs_remove(norm) != 0) { - errno = ENOENT; - return -1; - } - return 0; -} - -int rename(const char *oldpath, const char *newpath) -{ - char old_norm[PATH_MAX]; - char new_norm[PATH_MAX]; - char parent[PATH_MAX]; - zvfs_dirent_t *src; - zvfs_dirent_t *dst; - - if (!is_zvfs_path(oldpath) && !is_zvfs_path(newpath)) { - return real_rename_fn ? real_rename_fn(oldpath, newpath) : -1; - } - if (!is_zvfs_path(oldpath) || !is_zvfs_path(newpath)) { - errno = EXDEV; - return -1; - } - if (zvfs_ensure_mounted() != 0 || - normalize_path(oldpath, old_norm, sizeof(old_norm)) != 0 || - normalize_path(newpath, new_norm, sizeof(new_norm)) != 0) { - errno = EIO; - return -1; - } - if (dirs_exists(old_norm) || dirs_exists(new_norm)) { - errno = EISDIR; - return -1; - } - if (get_parent_dir(new_norm, parent, sizeof(parent)) != 0 || !dirs_exists(parent)) { - errno = ENOENT; - return -1; - } - - src = dirent_find(old_norm); - if (!src) { - errno = ENOENT; - return -1; - } - if (flush_dirent_wb(src) != 0) { - errno = EIO; - return -1; - } - dst = dirent_find(new_norm); - if (dst) { - if (flush_dirent_wb(dst) != 0) { - errno = EIO; - return -1; - } - if (dst->open_count > 0) { - errno = EBUSY; - return -1; - } - if (dst->blob_id != 0) { - zvfs_file_t tmp = {0}; - tmp.fs = g_fs; - tmp.dirent = dst; - tmp.blob_id = dst->blob_id; - if (!zvfs_delete(&tmp) && !is_stale_blob_op_errno(tmp.op_errno)) { - errno = tmp.op_errno ? 
-tmp.op_errno : EIO; - return -1; - } - } - dirent_remove(dst); - } - strncpy(src->filename, new_norm, sizeof(src->filename) - 1); - src->filename[sizeof(src->filename) - 1] = '\0'; - (void)meta_save(g_fs); - return 0; -} - -int access(const char *path, int mode) -{ - char norm[PATH_MAX]; - (void)mode; - if (!is_zvfs_path(path)) { - return real_access_fn ? real_access_fn(path, mode) : -1; - } - if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) { - errno = EIO; - return -1; - } - if (dirs_exists(norm) || dirent_find(norm)) { - return 0; - } - errno = ENOENT; - return -1; -} - -int fcntl(int fd, int cmd, ...) -{ - va_list ap; - uintptr_t arg = 0; - bool has_arg = false; - - switch (cmd) { - case F_DUPFD: - case F_DUPFD_CLOEXEC: - case F_SETFD: - case F_SETFL: - case F_SETLK: - case F_SETLKW: - case F_GETLK: - has_arg = true; - break; - default: - break; - } - - va_start(ap, cmd); - if (has_arg) { - arg = va_arg(ap, uintptr_t); - } - va_end(ap); - - if (!is_zvfs_fd(fd) && !is_zvfs_dirfd(fd)) { - if (!real_fcntl_fn) { - errno = ENOSYS; - return -1; - } - return has_arg ? real_fcntl_fn(fd, cmd, arg) : real_fcntl_fn(fd, cmd); - } - - if (debug_fd_enabled(fd)) { - const char *path = debug_fd_path(fd); - debug_log("fcntl fd=%d path=%s cmd=%d has_arg=%d", fd, path ? path : "?", cmd, has_arg ? 
1 : 0); - } - - switch (cmd) { - case F_GETFD: - return 0; - case F_SETFD: - return 0; - case F_GETFL: - if (is_zvfs_fd(fd)) { - zvfs_file_t *file = fd_lookup(fd); - if (!file) { - errno = EBADF; - return -1; - } - return file->flags; - } - if (is_zvfs_dirfd(fd)) { - zvfs_dirfd_t *d = dirfd_lookup(fd); - if (!d) { - errno = EBADF; - return -1; - } - return d->flags; - } - errno = EBADF; - return -1; - case F_SETFL: - if (is_zvfs_fd(fd)) { - zvfs_file_t *file = fd_lookup(fd); - if (!file) { - errno = EBADF; - return -1; - } - file->flags = (file->flags & O_ACCMODE) | ((int)arg & ~O_ACCMODE); - return 0; - } - return 0; - case F_SETLK: - case F_SETLKW: - return 0; - case F_GETLK: - if ((void *)arg) { - struct flock *lk = (struct flock *)(void *)arg; - lk->l_type = F_UNLCK; - } - return 0; - default: - if (debug_fd_enabled(fd)) { - const char *path = debug_fd_path(fd); - debug_log("fcntl fd=%d path=%s cmd=%d -> EOPNOTSUPP", fd, path ? path : "?", cmd); - } - errno = EOPNOTSUPP; - return -1; - } -} - -int fcntl64(int fd, int cmd, ...) -{ - va_list ap; - uintptr_t arg = 0; - bool has_arg = false; - - switch (cmd) { - case F_DUPFD: - case F_DUPFD_CLOEXEC: - case F_SETFD: - case F_SETFL: - case F_SETLK: - case F_SETLKW: - case F_GETLK: - has_arg = true; - break; - default: - break; - } - - va_start(ap, cmd); - if (has_arg) { - arg = va_arg(ap, uintptr_t); - } - va_end(ap); - - if (has_arg) { - return fcntl(fd, cmd, arg); - } - return fcntl(fd, cmd); -} - -int stat(const char *path, struct stat *st) -{ - char norm[PATH_MAX]; - zvfs_dirent_t *d; - - if (!is_zvfs_path(path)) { - return real_stat_fn ? 
real_stat_fn(path, st) : -1; - } - if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) { - errno = EIO; - return -1; - } - - if (dirs_exists(norm)) { - fill_stat(st, S_IFDIR | 0755, 4096, path_hash(norm)); - return 0; - } - d = dirent_find(norm); - if (d) { - fill_stat(st, S_IFREG | 0644, (off_t)d->file_size, d->blob_id); - return 0; - } - errno = ENOENT; - return -1; -} - -int lstat(const char *path, struct stat *st) -{ - if (!is_zvfs_path(path)) { - return real_lstat_fn ? real_lstat_fn(path, st) : -1; - } - return stat(path, st); -} - -int fstat(int fd, struct stat *st) -{ - if (!is_zvfs_fd(fd) && !is_zvfs_dirfd(fd)) { - return real_fstat_fn ? real_fstat_fn(fd, st) : -1; - } - if (is_zvfs_dirfd(fd)) { - zvfs_dirfd_t *d = dirfd_lookup(fd); - if (!d) { - errno = EBADF; - return -1; - } - fill_stat(st, S_IFDIR | 0755, 4096, path_hash(d->path)); - return 0; - } - - { - zvfs_file_t *file = fd_lookup(fd); - if (!file || !file->dirent) { - errno = EBADF; - return -1; - } - fill_stat(st, S_IFREG | 0644, (off_t)file->dirent->file_size, file->dirent->blob_id); - return 0; - } -} - -int fstatat(int dirfd, const char *path, struct stat *st, int flags) -{ - char resolved[PATH_MAX]; - (void)flags; - if (path[0] == '/' && is_zvfs_path(path)) { - return stat(path, st); - } - if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) { - return stat(resolved, st); - } - return real_fstatat_fn ? 
real_fstatat_fn(dirfd, path, st, flags) : -1; -} - -/* glibc versioned stat wrappers */ -int __xstat(int ver, const char *path, struct stat *st) -{ - (void)ver; - return stat(path, st); -} - -int __lxstat(int ver, const char *path, struct stat *st) -{ - (void)ver; - return lstat(path, st); -} - -int __fxstat(int ver, int fd, struct stat *st) -{ - (void)ver; - return fstat(fd, st); -} - -int __fxstatat(int ver, int dirfd, const char *path, struct stat *st, int flags) -{ - (void)ver; - return fstatat(dirfd, path, st, flags); -} - -off64_t lseek64(int fd, off64_t offset, int whence) -{ - return (off64_t)lseek(fd, (off_t)offset, whence); -} - -int stat64(const char *path, struct stat64 *st) -{ - char norm[PATH_MAX]; - zvfs_dirent_t *d; - - if (!is_zvfs_path(path)) { - if (real_stat64_fn) { - return real_stat64_fn(path, st); - } - return real_stat_fn ? real_stat_fn(path, (struct stat *)(void *)st) : -1; - } - if (zvfs_ensure_mounted() != 0 || normalize_path(path, norm, sizeof(norm)) != 0) { - errno = EIO; - return -1; - } - - if (dirs_exists(norm)) { - fill_stat64(st, S_IFDIR | 0755, 4096, path_hash(norm)); - return 0; - } - d = dirent_find(norm); - if (d) { - fill_stat64(st, S_IFREG | 0644, (off64_t)d->file_size, d->blob_id); - return 0; - } - errno = ENOENT; - return -1; -} - -int lstat64(const char *path, struct stat64 *st) -{ - if (!is_zvfs_path(path)) { - if (real_lstat64_fn) { - return real_lstat64_fn(path, st); - } - return real_lstat_fn ? real_lstat_fn(path, (struct stat *)(void *)st) : -1; - } - return stat64(path, st); -} - -int fstat64(int fd, struct stat64 *st) -{ - if (!is_zvfs_fd(fd) && !is_zvfs_dirfd(fd)) { - if (real_fstat64_fn) { - return real_fstat64_fn(fd, st); - } - return real_fstat_fn ? 
real_fstat_fn(fd, (struct stat *)(void *)st) : -1; - } - if (is_zvfs_dirfd(fd)) { - zvfs_dirfd_t *d = dirfd_lookup(fd); - if (!d) { - errno = EBADF; - return -1; - } - fill_stat64(st, S_IFDIR | 0755, 4096, path_hash(d->path)); - return 0; - } - - { - zvfs_file_t *file = fd_lookup(fd); - if (!file || !file->dirent) { - errno = EBADF; - return -1; - } - fill_stat64(st, S_IFREG | 0644, (off64_t)file->dirent->file_size, file->dirent->blob_id); - return 0; - } -} - -int fstatat64(int dirfd, const char *path, struct stat64 *st, int flags) -{ - char resolved[PATH_MAX]; - (void)flags; - if (path[0] == '/' && is_zvfs_path(path)) { - return stat64(path, st); - } - if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) { - return stat64(resolved, st); - } - if (real_fstatat64_fn) { - return real_fstatat64_fn(dirfd, path, st, flags); - } - return real_fstatat_fn ? real_fstatat_fn(dirfd, path, (struct stat *)(void *)st, flags) : -1; -} - -int __xstat64(int ver, const char *path, struct stat64 *st) -{ - if (is_zvfs_path(path)) { - return stat64(path, st); - } - if (real___xstat64_fn) { - return real___xstat64_fn(ver, path, st); - } - return stat64(path, st); -} - -int __lxstat64(int ver, const char *path, struct stat64 *st) -{ - if (is_zvfs_path(path)) { - return lstat64(path, st); - } - if (real___lxstat64_fn) { - return real___lxstat64_fn(ver, path, st); - } - return lstat64(path, st); -} - -int __fxstat64(int ver, int fd, struct stat64 *st) -{ - if (is_zvfs_fd(fd) || is_zvfs_dirfd(fd)) { - return fstat64(fd, st); - } - if (real___fxstat64_fn) { - return real___fxstat64_fn(ver, fd, st); - } - return fstat64(fd, st); -} - -int __fxstatat64(int ver, int dirfd, const char *path, struct stat64 *st, int flags) -{ - char resolved[PATH_MAX]; - if (path[0] == '/' && is_zvfs_path(path)) { - return fstatat64(dirfd, path, st, flags); - } - if (resolve_path_at(dirfd, path, resolved, sizeof(resolved)) == 0 && is_zvfs_path(resolved)) { - return 
fstatat64(dirfd, path, st, flags); - } - if (real___fxstatat64_fn) { - return real___fxstatat64_fn(ver, dirfd, path, st, flags); - } - return fstatat64(dirfd, path, st, flags); -}