rebuild
This commit is contained in:
7
.gitignore
vendored
7
.gitignore
vendored
@@ -2,8 +2,5 @@
|
||||
*.d
|
||||
*.so
|
||||
|
||||
test/bin
|
||||
|
||||
zvfs/func_test
|
||||
zvfs_meta.txt
|
||||
zvfs/zvfs_meta.txt
|
||||
codex/
|
||||
tests/bin
|
||||
|
||||
11
Makefile
11
Makefile
@@ -3,14 +3,11 @@
|
||||
all: zvfs
|
||||
|
||||
zvfs:
|
||||
$(MAKE) -C zvfs
|
||||
$(MAKE) -C src
|
||||
|
||||
test:
|
||||
$(MAKE) -C test
|
||||
|
||||
run-test:
|
||||
$(MAKE) -C test run-test
|
||||
$(MAKE) -C tests
|
||||
|
||||
clean:
|
||||
$(MAKE) -C zvfs clean
|
||||
$(MAKE) -C test clean
|
||||
$(MAKE) -C src clean
|
||||
$(MAKE) -C tests clean
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
```shell
|
||||
|
||||
sudo apt install fio
|
||||
|
||||
fio sample.fio
|
||||
LD_PRELOAD=../zvfs/libzvfs.so fio zvfs.fio
|
||||
```
|
||||
|
||||
|
||||
@@ -1,14 +0,0 @@
|
||||
[global]
|
||||
ioengine=sync # 同步 I/O
|
||||
direct=0 # 使用内核页缓存,测试系统调用性能
|
||||
bs=128k # 块大小
|
||||
size=1G # 每个文件大小
|
||||
numjobs=2 # 并发线程数
|
||||
runtime=60 # 测试运行时间,秒
|
||||
time_based=1
|
||||
rw=randrw # 随机读写
|
||||
rwmixread=70 # 70% 读,30% 写
|
||||
group_reporting=1 # 汇总报告
|
||||
|
||||
[test_syscall]
|
||||
filename=/tmp/fio_sample_testfile
|
||||
14
fio/zvfs.fio
14
fio/zvfs.fio
@@ -1,14 +0,0 @@
|
||||
[global]
|
||||
ioengine=sync
|
||||
direct=0
|
||||
bs=128k
|
||||
size=1G
|
||||
numjobs=2
|
||||
runtime=60
|
||||
time_based=1
|
||||
rw=randrw
|
||||
rwmixread=70
|
||||
group_reporting=1
|
||||
|
||||
[test_hook]
|
||||
filename=/zvfs/fio_zvfs_testfile
|
||||
@@ -1,194 +0,0 @@
|
||||
# ZVFS 高性能框架设计(修订版)
|
||||
|
||||
## 0. 当前实现进展(2026-03-03)
|
||||
|
||||
- 已落地:
|
||||
- stale blob 自愈(open/create/unlink/rename 路径)
|
||||
- hook 层小写合并(per-fd writeback buffer,默认 128KB)+ 关键系统调用前 flush
|
||||
- 仍待重点优化:
|
||||
- 小块读路径仍是“同步提交 + 单请求往返 + 拷贝返回”,延迟和吞吐偏弱
|
||||
|
||||
## 1. 现状代码中的关键问题(先于方案)
|
||||
|
||||
基于 `zvfs.c`、`zvfs.h`、`zvfs_hook.c`,当前主要瓶颈和风险如下:
|
||||
|
||||
1. **单全局执行上下文串行化**
|
||||
- 所有 IO 都通过 `global_thread` + `waiter()` 同步等待,天然把多线程请求串到一个 SPDK thread。
|
||||
- `zvfs_t` 里只有一个 `channel`,读写都走这一个 channel,无法利用多核并行。
|
||||
|
||||
2. **等待模型是忙轮询,CPU 成本高**
|
||||
- `waiter()` 用紧循环 `spdk_thread_poll()`,没有阻塞等待/退避策略。
|
||||
- 在高并发小 IO 下,系统容易进入“高 CPU + 低有效 QD”。
|
||||
|
||||
3. **全局元数据无并发保护**
|
||||
- `dirents/fd_table/g_dirs/g_dirfd_table/open_count/file_size` 读写没有统一锁。
|
||||
- hook 层是多线程入口,当前实现有明显竞态和可见性问题。
|
||||
|
||||
4. **持久化与语义不完整**
|
||||
- `fsync/fdatasync/sync_file_range` 对 zvfs fd 基本直接返回 0,和数据库预期不一致。
|
||||
- `meta_load()` 只读固定 4KB 文本,规模稍大就截断;`meta_save()` 也无崩溃一致性保证。
|
||||
|
||||
5. **数据路径的放大和额外开销**
|
||||
- 小块随机写依赖 read-modify-write;无写回缓存、无批量提交、无 IO 合并。
|
||||
- per-file `dma_buf` 增长时可能反复 realloc,缺少池化和复用策略。
|
||||
|
||||
6. **可扩展性不足**
|
||||
- `dirent_find/fd_alloc` 等是线性扫描。
|
||||
- 元数据、目录结构、fd 分配都偏“单点共享结构”,随着文件数/线程数增长会抖动。
|
||||
|
||||
---
|
||||
|
||||
## 2. 对 userplan.md 的补全与修正
|
||||
|
||||
`plan/userplan.md` 的方向(TLS + per-thread channel + 缩小全局锁)是正确的,但有几个需要补全的点:
|
||||
|
||||
1. **“每个 pthread 一个 spdk_thread”要可配置**
|
||||
- 对 MySQL 这类线程数可能很大的进程,严格 1:1 会导致线程对象和 channel 爆炸。
|
||||
- 建议改为:默认“线程绑定 worker 池(N:M)”,支持配置成 1:1 调试模式。
|
||||
|
||||
2. **需要明确“文件句柄跨线程访问”的所有权规则**
|
||||
- 同一 fd 可能被不同 pthread 使用,必须定义 offset、cache、flush 的同步策略。
|
||||
|
||||
3. **batch poll 需要配套“提交队列 + 背压 + 超时”**
|
||||
- 仅有 `pending_queue` 不够,必须定义入队失败/队列满/超时处理。
|
||||
|
||||
4. **必须补上 fsync/fdatasync 的严格语义**
|
||||
- 尤其面向数据库:fsync 成功后应保证数据页 + 必要元数据已持久化。
|
||||
|
||||
5. **元数据持久化需要从“文本快照”升级为“日志+检查点”**
|
||||
- 否则崩溃恢复和规模都不可靠。
|
||||
|
||||
---
|
||||
|
||||
## 3. 新框架设计(面向高性能与可重入改造)
|
||||
|
||||
### 3.1 分层与职责
|
||||
|
||||
- **Control Plane(全局)**
|
||||
- 管理 mount/unmount、命名空间、inode 元数据、fd 表、恢复日志。
|
||||
- 低频操作(open/create/unlink/rename/mkdir/rmdir)在此层处理。
|
||||
|
||||
- **Data Plane(worker)**
|
||||
- 处理 read/pread/write/pwrite/fsync 的数据 IO。
|
||||
- 每个 worker 持有:`spdk_thread + io_channel + submission_queue + completion_queue`。
|
||||
|
||||
- **Persistence Plane(元数据持久化)**
|
||||
- 元数据 WAL(append-only)+ 周期 checkpoint。
|
||||
- 保障崩溃恢复和 fsync 语义。
|
||||
|
||||
### 3.2 全局运行时结构
|
||||
|
||||
```c
|
||||
typedef struct {
|
||||
// init/mount 生命周期
|
||||
pthread_once_t init_once;
|
||||
pthread_mutex_t mount_mu;
|
||||
_Atomic int mount_state; // UNINIT/INITING/READY/FAILED/STOPPING
|
||||
|
||||
// core spdk objects
|
||||
struct spdk_blob_store *bs;
|
||||
struct spdk_bs_dev *bs_dev;
|
||||
|
||||
// metadata indexes
|
||||
pthread_rwlock_t inode_rwlock;
|
||||
inode_table_t *inode_by_path; // hash map: path -> inode
|
||||
inode_table_t *inode_by_blobid; // hash map: blobid -> inode
|
||||
|
||||
pthread_rwlock_t fd_rwlock;
|
||||
fd_table_t *fd_table; // pseudo fd -> file handle
|
||||
|
||||
// durability
|
||||
meta_journal_t *journal; // WAL + checkpoint
|
||||
|
||||
// worker routing
|
||||
worker_pool_t *workers; // configurable N workers
|
||||
} zvfs_runtime_t;
|
||||
```
|
||||
|
||||
### 3.3 worker 模型(建议默认 N:M,可切 1:1)
|
||||
|
||||
- 默认:`worker_count = min(online_cpu, ZVFS_IO_WORKERS)`。
|
||||
- 线程第一次进入时做 TLS 绑定:`pthread_id -> worker_id`(固定绑定,减少迁移)。
|
||||
- 每个 worker 独占一个 io_channel,避免全局 channel 争用。
|
||||
- 等待机制:优先 `eventfd/futex + poll` 混合,避免纯忙轮询。
|
||||
|
||||
> 说明:若用户确认线程数有限,可配置 `ZVFS_WORKER_MODE=THREAD_LOCAL` 切 1:1,以追求极致低延迟。
|
||||
|
||||
### 3.4 元数据模型
|
||||
|
||||
- `inode`(文件级共享对象)
|
||||
- `blob_id, logical_size, allocated_clusters, link/open_ref, flags`
|
||||
- 每 inode 一把细粒度锁(mutex/spin + 原子字段)。
|
||||
- `file handle`(open 实例)
|
||||
- `inode*`, `flags`, `current_offset`, `handle-local state`。
|
||||
- 路径索引与 blob 索引用哈希表替代线性数组。
|
||||
- 目录树从 `g_dirs[]` 升级为前缀树或 hash+parent 索引,避免全表扫描。
|
||||
|
||||
### 3.5 IO 路径设计
|
||||
|
||||
#### Read/Pread
|
||||
- 快路径:命中页缓存(clean/dirty)直接拷贝。
|
||||
- 慢路径:提交到绑定 worker。
|
||||
- 对齐大读支持直接 DMA 到用户对齐缓冲(满足约束时)。
|
||||
|
||||
#### Write/Pwrite
|
||||
- 小块随机写:写入 per-inode 页缓存(4KB 粒度),标记 dirty。
|
||||
- 大块或顺序写:绕过缓存直写(或写穿策略),减少二次拷贝。
|
||||
- 扩容策略:按 chunk 预分配(例如 1~8MB)减少 `resize + sync_md` 频率。
|
||||
- flush 策略:
|
||||
- 后台刷脏(阈值/时间)
|
||||
- 前台 fsync 强制刷
|
||||
- 合并连续页为 writev/batch IO
|
||||
|
||||
### 3.6 fsync/fdatasync 语义(数据库场景)
|
||||
|
||||
- `fdatasync(fd)`:
|
||||
1) 刷新该 fd 对应 inode 的脏数据页;
|
||||
2) 若发生扩容,确保 blob 元数据同步完成;
|
||||
3) 返回前确认提交完成。
|
||||
- `fsync(fd)`:
|
||||
- 在 `fdatasync` 基础上,额外保证需要的命名空间/元数据日志落盘(如 size、rename 可见性)。
|
||||
|
||||
### 3.7 崩溃一致性与恢复
|
||||
|
||||
- `meta_journal.log`(append-only,带 magic/version/CRC/seq)。
|
||||
- 操作记录:`CREATE/UNLINK/RENAME/TRUNCATE/SIZE_UPDATE/ALLOC_UPDATE`。
|
||||
- 启动恢复:`checkpoint -> replay WAL`。
|
||||
- 周期 checkpoint(按时间或日志大小触发),避免恢复时间无限增长。
|
||||
|
||||
### 3.8 锁策略与死锁规约
|
||||
|
||||
- 固定锁顺序:`fd_table lock -> inode lock -> journal lock`。
|
||||
- IO 快路径不拿全局写锁。
|
||||
- 元数据读多写少:读写锁 + inode 细粒度锁组合。
|
||||
|
||||
### 3.9 可观测与调优
|
||||
|
||||
- 统计项(至少):
|
||||
- read/write IOPS、带宽、P50/P99 延迟
|
||||
- cache hit ratio、dirty page 数
|
||||
- flush 次数、merge 比例、resize 次数
|
||||
- queue depth、排队延迟
|
||||
- debug 开关:
|
||||
- `ZVFS_TRACE_IO=1`
|
||||
- `ZVFS_TRACE_META=1`
|
||||
- `ZVFS_WORKER_MODE`, `ZVFS_IO_WORKERS`
|
||||
|
||||
---
|
||||
|
||||
## 4. 关键行为约束(必须保持)
|
||||
|
||||
1. POSIX 语义不回退:`openat/rename/unlink/ftruncate/fstat/fsync` 的错误码与行为保持一致或更严格。
|
||||
2. 在无 root 环境下可跑功能测试(至少支持 Malloc bdev 或已有可用 SPDK 配置)。
|
||||
3. 旧接口兼容:外部仍通过 `LD_PRELOAD=.../libzvfs.so` 使用。
|
||||
4. 改造过程可分阶段落地,任何阶段都可独立编译、回归、继续下一阶段。
|
||||
|
||||
---
|
||||
|
||||
## 5. 性能目标(建议)
|
||||
|
||||
- 与当前实现相比:
|
||||
- 多线程随机写 IOPS 提升 >= 2x(4~16 线程场景)
|
||||
- P99 延迟下降 >= 30%
|
||||
- CPU busy-poll 占比显著下降(可通过 perf/top 观测)
|
||||
- `test_single_file_perf`、`test_single_file_random_perf` 在同配置下持续稳定,无明显长尾抖动。
|
||||
216
plan/plan.md
216
plan/plan.md
@@ -1,216 +0,0 @@
|
||||
# ZVFS 分阶段改造计划(可重入,用户验收版)
|
||||
|
||||
> 目标:把当前实现改造成可并发扩展、高性能且语义完整的架构。
|
||||
> 约束:我无法使用 root,所有阶段验收由你执行。
|
||||
|
||||
## 通用约定(所有阶段)
|
||||
|
||||
- 建议先记录基线:`git rev-parse --short HEAD`。
|
||||
- 每阶段都保持“可编译 + 可回归”。
|
||||
- 每阶段完成后打一个里程碑 tag(例如 `phase1-done`),中断后可从最近 tag 继续。
|
||||
- 验收命令默认:
|
||||
|
||||
```bash
|
||||
make -C zvfs -j4
|
||||
make -C test -j4
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs
|
||||
```
|
||||
|
||||
## 已落地变更(2026-03-03)
|
||||
|
||||
1. **stale blob 自愈修复(已完成)**
|
||||
- `open(O_CREAT)` 遇到元数据引用失效 blob 时自动重建并回写元数据。
|
||||
- `unlink/close(rename 覆盖)` 删除失效 blob 时容忍 `ENOENT/EINVAL`,避免误报 `EIO`。
|
||||
|
||||
2. **小块写合并(已完成)**
|
||||
- hook 层新增 per-fd writeback buffer(默认 128KB),连续小写先合并再 `pwrite`。
|
||||
- 在 `read/pread/lseek/fsync/fdatasync/close/ftruncate/fallocate/unlink/rename/sync_file_range` 前补齐 flush,保证可见性。
|
||||
|
||||
3. **当前观察**
|
||||
- 小块写已提升,但小块读仍偏低;读优化作为后续阶段重点。
|
||||
|
||||
---
|
||||
|
||||
## Phase 0:基线与护栏
|
||||
|
||||
### 要做的事情
|
||||
1. 固化当前行为基线:功能、性能、CPU 占用。
|
||||
2. 在代码中加入轻量统计框架(计数器/延迟桶/开关),不改变行为。
|
||||
3. 增加最小并发回归入口(并行跑现有测试)。
|
||||
|
||||
### 用户验收
|
||||
```bash
|
||||
make -C zvfs -j4 && make -C test -j4
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_perf /zvfs
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_perf /zvfs
|
||||
```
|
||||
|
||||
### 通过标准
|
||||
- 功能测试通过。
|
||||
- 有一份可复用的“基线性能记录”(IOPS/BW/延迟)。
|
||||
|
||||
### 可重入说明
|
||||
- 仅增量加观测代码,可重复执行,不影响后续阶段。
|
||||
|
||||
---
|
||||
|
||||
## Phase 1:全局运行时与并发安全
|
||||
|
||||
### 要做的事情
|
||||
1. 引入 `zvfs_runtime_t`,统一管理 mount/init 状态与全局资源。
|
||||
2. 用 `pthread_once + mount mutex` 保护初始化/挂载过程。
|
||||
3. 给 inode/path/fd/dirs 操作补齐锁(rwlock + 细粒度 mutex)。
|
||||
4. 保持接口不变:`open/read/write/...` 行为兼容。
|
||||
|
||||
### 用户验收
|
||||
```bash
|
||||
make -C zvfs -j4 && make -C test -j4
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs
|
||||
for i in $(seq 1 8); do
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_dual_open_same_file /zvfs &
|
||||
done
|
||||
wait
|
||||
```
|
||||
|
||||
### 通过标准
|
||||
- 无崩溃/死锁。
|
||||
- 并发场景不出现随机 EBADF/ENOENT/元数据错乱。
|
||||
|
||||
### 可重入说明
|
||||
- 锁与 runtime 框架可独立提交;若中断,重新进入本阶段不会破坏状态。
|
||||
|
||||
---
|
||||
|
||||
## Phase 2:Worker 化 IO 通路(替换单 global_thread)
|
||||
|
||||
### 要做的事情
|
||||
1. 实现 worker 池(默认 N:M,支持配置 1:1)。
|
||||
2. 每 worker 持有独立 `spdk_thread + io_channel`。
|
||||
3. read/write/pread/pwrite 路径改为“提交到绑定 worker 执行”。
|
||||
4. 保留同步 POSIX 语义,但去掉全局单线程瓶颈。
|
||||
|
||||
### 用户验收
|
||||
```bash
|
||||
make -C zvfs -j4 && make -C test -j4
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs
|
||||
for i in $(seq 1 4); do
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_perf /zvfs &
|
||||
done
|
||||
wait
|
||||
```
|
||||
|
||||
### 通过标准
|
||||
- 功能与 Phase 1 一致。
|
||||
- 并发压测吞吐明显高于基线(目标 >= 1.5x,先达成趋势)。
|
||||
|
||||
### 可重入说明
|
||||
- worker 与旧路径可通过编译开关共存,出现问题可快速切回旧路径继续调试。
|
||||
|
||||
---
|
||||
|
||||
## Phase 3:完成等待机制与批处理
|
||||
|
||||
### 要做的事情
|
||||
1. 用“提交队列 + 完成通知”替换纯 busy-poll `waiter`。
|
||||
2. 增加批量 poll 与背压(队列满、超时、错误传播)。
|
||||
3. 补齐延迟与队列深度指标,定位长尾。
|
||||
4. 引入读路径流水线(允许并发 in-flight read),把有效 QD 从 1 提升到可配置值。
|
||||
|
||||
### 用户验收
|
||||
```bash
|
||||
make -C zvfs -j4 && make -C test -j4
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_perf /zvfs
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_perf /zvfs
|
||||
```
|
||||
|
||||
### 通过标准
|
||||
- 在同等负载下 CPU 空转显著下降。
|
||||
- P99 延迟较 Phase 2 收敛(无明显长尾恶化)。
|
||||
|
||||
### 可重入说明
|
||||
- 队列与等待层可单独演进;可先只替换 read,再替换 write。
|
||||
|
||||
---
|
||||
|
||||
## Phase 4:页缓存与写回合并
|
||||
|
||||
### 要做的事情
|
||||
1. 引入 per-inode 4KB 页缓存(dirty/clean 状态)。
|
||||
2. 小写走 cache + 延迟刷盘,大写/顺序写支持直写或批量写。
|
||||
3. 引入 flush 策略:阈值、定时、fsync 强制。
|
||||
4. 缩减 `resize + sync_md` 频率(chunk 预分配)。
|
||||
5. 读性能专项:
|
||||
- 增加顺序读 readahead(如 128KB~1MB 窗口自适应)。
|
||||
- 对齐读支持“直接读到用户缓冲”快路径,减少一次 memcpy。
|
||||
- 引入 clean page cache(读热点复用,避免重复 blob read)。
|
||||
|
||||
### 用户验收
|
||||
```bash
|
||||
make -C zvfs -j4 && make -C test -j4
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_noaligned_perf /zvfs
|
||||
```
|
||||
|
||||
### 通过标准
|
||||
- 功能语义不回退(truncate/sparse/rename/fstat 通过)。
|
||||
- 小块随机写吞吐继续提升,写放大降低。
|
||||
|
||||
### 可重入说明
|
||||
- cache 可先只支持 write-through,再切 write-back;两步都可单独验收。
|
||||
|
||||
---
|
||||
|
||||
## Phase 5:元数据日志化与 fsync 语义闭环
|
||||
|
||||
### 要做的事情
|
||||
1. `meta_save/load` 从文本快照升级为 WAL + checkpoint(带 CRC/版本)。
|
||||
2. 明确并实现 `fdatasync/fsync` 语义:
|
||||
- fdatasync 保证数据持久化;
|
||||
- fsync 额外保证必要元数据持久化。
|
||||
3. 补齐崩溃恢复流程(checkpoint + replay)。
|
||||
|
||||
### 用户验收
|
||||
```bash
|
||||
make -C zvfs -j4 && make -C test -j4
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_phase2_posix /zvfs
|
||||
# 建议补充一次“异常退出后重启读取”的恢复验证(手工执行)
|
||||
```
|
||||
|
||||
### 通过标准
|
||||
- 重启后目录项与文件大小不丢失、不错乱。
|
||||
- 数据库关键路径(fsync/fdatasync)语义满足预期。
|
||||
|
||||
### 可重入说明
|
||||
- WAL 与 checkpoint 支持并存迁移;可先双写验证,再切主读路径。
|
||||
|
||||
---
|
||||
|
||||
## Phase 6:性能收敛与上线门槛
|
||||
|
||||
### 要做的事情
|
||||
1. 清理临时开关,保留必要调优参数。
|
||||
2. 整理性能报告(与 Phase 0 基线对比)。
|
||||
3. 做最终回归矩阵(功能 + 并发 + 性能 + 恢复)。
|
||||
|
||||
### 用户验收
|
||||
```bash
|
||||
make -C zvfs -j4 && make -C test -j4
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so make -C test run-test
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_perf /zvfs
|
||||
env LD_PRELOAD=/home/lian/share/10.1-spdk/zvfs/zvfs/libzvfs.so ./test/bin/test_single_file_random_perf /zvfs
|
||||
```
|
||||
|
||||
### 通过标准
|
||||
- 全量功能测试通过。
|
||||
- 多线程性能达到 `codexplan.md` 目标(或给出量化偏差与原因)。
|
||||
|
||||
### 可重入说明
|
||||
- 本阶段仅收敛与验收,不引入架构性变更;可反复执行直到指标稳定。
|
||||
|
||||
---
|
||||
|
||||
## 附:root 权限与运行建议
|
||||
|
||||
- 若 NVMe/SPDK 环境需要 root,请在你本机按现有流程执行验收。
|
||||
- 若希望无 root 回归,建议补一个 `Malloc` bdev 的 JSON 配置,并将 bdev 名改为可配置(环境变量优先)。
|
||||
108
plan/userplan.md
108
plan/userplan.md
@@ -1,108 +0,0 @@
|
||||
### 架构目标
|
||||
- 通过 LD_PRELOAD hook POSIX 文件操作(open/read/write/pread/pwrite/close/fsync 等),将 MySQL 的数据文件 IO 重定向到 SPDK Blobstore。
|
||||
- 最大化性能:绕过内核、利用多核并发、低延迟、小块写合并。
|
||||
- 核心原则:**每个 pthread 拥有独立的 SPDK 执行上下文**,全局共享底层存储资源。
|
||||
|
||||
### 全局资源(进程级别,唯一一份)
|
||||
- `zvfs_t *g_fs`:文件系统实例,包含:
|
||||
- `struct spdk_blob_store *bs`:全局 Blobstore(通过 spdk_bs_load/init 创建)。
|
||||
- bdev(Nvme0n1 或 Malloc0,通过 JSON 配置加载)。
|
||||
- 全局元数据:dirents 数组(zvfs_dirent_t *[])、fd_table(zvfs_file_t *[])、openfd_count。
|
||||
- 保护全局元数据的锁:pthread_rwlock_t g_meta_lock(读多写少场景)。
|
||||
- 全局初始化标志:`bool g_mounted`、`bool g_env_inited`。
|
||||
- pthread_key_t 用于线程本地存储:`g_thread_local_key`(带 destructor)。
|
||||
|
||||
### 线程本地资源(每个 pthread 独占一份,通过 TLS 实现)
|
||||
每个 pthread 拥有以下私有状态,存储在结构体 `thread_local_zvfs_t` 中:
|
||||
|
||||
```c
|
||||
typedef struct {
|
||||
struct spdk_thread *thread; // 本线程专属的 SPDK thread
|
||||
struct spdk_io_channel *channel; // 本线程专属的 IO channel(绑定到 g_fs->bs)
|
||||
TAILQ_HEAD(, io_ctx) pending_queue; // 本线程的 pending IO 队列,用于 batch poll
|
||||
// 可选扩展:
|
||||
// struct dma_buf_pool *dma_pool; // per-thread DMA buf 复用池
|
||||
// struct page_cache *local_cache; // 如果需要 per-thread cache
|
||||
} thread_local_zvfs_t;
|
||||
```
|
||||
|
||||
- **创建时机**:lazy(第一次 IO 时调用 `get_thread_local()`)。
|
||||
- **存储方式**:通过 `pthread_setspecific(g_thread_local_key, tl)` 绑定到当前 pthread。
|
||||
- **销毁时机**:pthread 退出时,TLS destructor 自动调用:
|
||||
- spdk_bs_free_io_channel(channel)
|
||||
- spdk_thread_exit + poll until exited + spdk_thread_destroy
|
||||
|
||||
### 核心函数:get_thread_local()
|
||||
```c
|
||||
thread_local_zvfs_t *get_thread_local(void) {
|
||||
// 确保 key 已创建(只执行一次)
|
||||
pthread_once(&g_key_once, init_thread_key);
|
||||
|
||||
thread_local_zvfs_t *tl = pthread_getspecific(g_thread_local_key);
|
||||
if (tl == NULL) {
|
||||
tl = calloc(1, sizeof(*tl));
|
||||
tl->thread = spdk_thread_create("zvfs_worker", NULL);
|
||||
tl->channel = spdk_bs_alloc_io_channel(g_fs->bs);
|
||||
TAILQ_INIT(&tl->pending_queue);
|
||||
pthread_setspecific(g_thread_local_key, tl);
|
||||
}
|
||||
return tl;
|
||||
}
|
||||
```
|
||||
|
||||
### 工作流程(每个 pthread 独立执行)
|
||||
1. **线程首次进入 IO 操作**
|
||||
- 调用 `get_thread_local()` → 创建并绑定 thread + channel。
|
||||
- 如果 !g_mounted → 调用 zvfs_ensure_mounted()(使用当前 thread 进行 poll 完成 mount)。
|
||||
|
||||
2. **元数据操作(open/unlink/mkdir/rmdir/rename 等)**
|
||||
- 加锁(g_meta_lock):只读检查时加读锁,修改全局 dirents、dirs、fd_table 时加写锁。
|
||||
- 创建/查找 zvfs_file_t,调用 zvfs_create/zvfs_open(使用当前 thread 同步等待)。
|
||||
- 分配伪 fd,记录到全局 fd_table。
|
||||
- 释放锁。
|
||||
|
||||
3. **读操作(read/pread)**
|
||||
- 获取当前 tl = get_thread_local()。
|
||||
- spdk_set_thread(tl->thread)。
|
||||
- 如果小读 + cache hit → 直接 memcpy 返回。
|
||||
- 否则:创建 io_ctx,加入 tl->pending_queue。
|
||||
- 调用 spdk_blob_io_read(..., tl->channel, ...)。
|
||||
- 执行 batch_poll(tl, my_ctx):
|
||||
- while (!my_ctx->done) spdk_thread_poll(tl->thread, 0, 0);
|
||||
- 从 dma_buf 拷贝到用户 buf。
|
||||
|
||||
4. **写操作(write/pwrite)**
|
||||
- 获取 tl。
|
||||
- spdk_set_thread(tl->thread)。
|
||||
- 如果小写 → patch per-file page cache(dirty),标记 dirty,返回(延迟写)。
|
||||
- 如果 cache 满或大写 → flush dirty pages(batch spdk_blob_io_writev,用 tl->channel)。
|
||||
- 创建 io_ctx → 加入 pending_queue → submit write → batch_poll。
|
||||
|
||||
5. **fsync**
|
||||
- flush per-file dirty cache(batch writev + spdk_blob_sync_md)。
|
||||
- 使用当前 tl->thread poll 等待完成。
|
||||
|
||||
6. **close**
|
||||
- fsync(flush cache)。
|
||||
- zvfs_close(用当前 tl->thread 同步)。
|
||||
- 释放 fd(加锁更新全局 fd_table)。
|
||||
|
||||
### 性能关键机制
|
||||
- **独立 poll**:每个 pthread 用自己的 spdk_thread 独立 poll,无跨线程消息。
|
||||
- **batch poll**:一个 poll 循环可完成多个 pending IO,提升有效 QD。
|
||||
- **page cache**:per-file 4K dirty pages(hashmap),合并小写,减少 write amplification。
|
||||
- **channel per-thread**:避免全局 channel 争用,每个线程独立提交 IO。
|
||||
- **最小全局锁**:只在元数据修改时短时加锁(rwlock),IO 操作无锁。
|
||||
|
||||
### 资源所有权总结表
|
||||
|
||||
| 资源类型 | 所有权 | 数量 | 创建时机 | 销毁时机 |
|
||||
|----------------------|--------------|------------|------------------------|------------------------------|
|
||||
| bdev | 全局 | 1 | zvfs_ensure_mounted | zvfs_umount |
|
||||
| blobstore (bs) | 全局 | 1 | zvfs_ensure_mounted | zvfs_umount |
|
||||
| zvfs_t / g_fs | 全局 | 1 | zvfs_ensure_mounted | zvfs_umount + free |
|
||||
| dirents / fd_table | 全局 | 1 | meta_load | zvfs_umount + free |
|
||||
| spdk_thread | per-pthread | = pthread 数 | 首次 get_thread_local | pthread 退出(destructor) |
|
||||
| io_channel | per-pthread | = pthread 数 | 首次 get_thread_local | pthread 退出(destructor) |
|
||||
| pending_queue | per-pthread | = pthread 数 | 首次 get_thread_local | pthread 退出 |
|
||||
| page cache | per-file | per open fd| open 时 lazy | close 时 flush + free |
|
||||
116
scripts/run_db_bench.sh
Executable file
116
scripts/run_db_bench.sh
Executable file
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# =========================
|
||||
# Manual Config (edit here)
|
||||
# =========================
|
||||
# 可执行文件路径
|
||||
DB_BENCH_BIN="/home/lian/env/rocksdb-test/db_bench"
|
||||
# RocksDB 数据目录
|
||||
DB_PATH="/tmp/rocksdb_manual"
|
||||
|
||||
# 测试类型 sets:
|
||||
# - "fillseq"
|
||||
# - "fillrandom"
|
||||
# - "readseq"
|
||||
# - "readrandom"
|
||||
# - "overwrite"
|
||||
# - "fillrandom,readrandom"
|
||||
BENCHMARKS="fillrandom,readrandom"
|
||||
|
||||
# key数
|
||||
NUM=1000000
|
||||
# 线程数
|
||||
THREADS=1
|
||||
# 随机种子
|
||||
SEED=1
|
||||
|
||||
# key大小
|
||||
KEY_SIZE=16
|
||||
# value大小
|
||||
VALUE_SIZE=400
|
||||
# SST block大小
|
||||
BLOCK_SIZE=4096
|
||||
|
||||
# block cache 大小
|
||||
CACHE_SIZE=$((512 * 1024 * 1024)) # bytes
|
||||
# memtable 大小
|
||||
WRITE_BUFFER_SIZE=$((64 * 1024 * 1024)) # bytes
|
||||
# memtable 个数
|
||||
MAX_WRITE_BUFFER_NUMBER=4
|
||||
# L1文件目标大小
|
||||
TARGET_FILE_SIZE_BASE=$((64 * 1024 * 1024)) # bytes
|
||||
# 可打开文件数
|
||||
OPEN_FILES=-1
|
||||
|
||||
# 后台 并行度
|
||||
MAX_BACKGROUND_JOBS=4
|
||||
# 压缩算法
|
||||
COMPRESSION_TYPE="none"
|
||||
# 关闭 WAL(1 = 禁用 WAL,0 = 启用 WAL)
|
||||
DISABLE_WAL=1
|
||||
SYNC=0
|
||||
|
||||
# direct I/O
|
||||
USE_DIRECT_READS=0
|
||||
USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION=0
|
||||
|
||||
# mmap I/O
|
||||
USE_MMAP_READS=0
|
||||
USE_MMAP_WRITES=0
|
||||
|
||||
# 统计
|
||||
STATISTICS=0
|
||||
# 统计打印
|
||||
STATS_INTERVAL_SECONDS=5
|
||||
# 直方图
|
||||
HISTOGRAM=0
|
||||
|
||||
# =========================
|
||||
# Run
|
||||
# =========================
|
||||
if [[ ! -x "$DB_BENCH_BIN" ]]; then
|
||||
echo "db_bench not found or not executable: $DB_BENCH_BIN" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "== db_bench manual run =="
|
||||
echo "DB_BENCH_BIN=$DB_BENCH_BIN"
|
||||
echo "DB_PATH=$DB_PATH"
|
||||
echo "BENCHMARKS=$BENCHMARKS"
|
||||
echo "NUM=$NUM THREADS=$THREADS"
|
||||
echo "KEY_SIZE=$KEY_SIZE VALUE_SIZE=$VALUE_SIZE BLOCK_SIZE=$BLOCK_SIZE"
|
||||
echo "CACHE_SIZE=$CACHE_SIZE WRITE_BUFFER_SIZE=$WRITE_BUFFER_SIZE"
|
||||
echo "MAX_WRITE_BUFFER_NUMBER=$MAX_WRITE_BUFFER_NUMBER TARGET_FILE_SIZE_BASE=$TARGET_FILE_SIZE_BASE"
|
||||
echo "OPEN_FILES=$OPEN_FILES MAX_BACKGROUND_JOBS=$MAX_BACKGROUND_JOBS"
|
||||
echo "COMPRESSION_TYPE=$COMPRESSION_TYPE DISABLE_WAL=$DISABLE_WAL SYNC=$SYNC"
|
||||
echo "USE_DIRECT_READS=$USE_DIRECT_READS USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION=$USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION"
|
||||
echo "USE_MMAP_READS=$USE_MMAP_READS USE_MMAP_WRITES=$USE_MMAP_WRITES"
|
||||
echo "STATISTICS=$STATISTICS STATS_INTERVAL_SECONDS=$STATS_INTERVAL_SECONDS HISTOGRAM=$HISTOGRAM"
|
||||
echo
|
||||
|
||||
exec "$DB_BENCH_BIN" \
|
||||
--db="$DB_PATH" \
|
||||
--benchmarks="$BENCHMARKS" \
|
||||
--num="$NUM" \
|
||||
--threads="$THREADS" \
|
||||
--seed="$SEED" \
|
||||
--key_size="$KEY_SIZE" \
|
||||
--value_size="$VALUE_SIZE" \
|
||||
--block_size="$BLOCK_SIZE" \
|
||||
--cache_size="$CACHE_SIZE" \
|
||||
--write_buffer_size="$WRITE_BUFFER_SIZE" \
|
||||
--max_write_buffer_number="$MAX_WRITE_BUFFER_NUMBER" \
|
||||
--target_file_size_base="$TARGET_FILE_SIZE_BASE" \
|
||||
--open_files="$OPEN_FILES" \
|
||||
--max_background_jobs="$MAX_BACKGROUND_JOBS" \
|
||||
--compression_type="$COMPRESSION_TYPE" \
|
||||
--disable_wal="$DISABLE_WAL" \
|
||||
--sync="$SYNC" \
|
||||
--use_direct_reads="$USE_DIRECT_READS" \
|
||||
--use_direct_io_for_flush_and_compaction="$USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION" \
|
||||
--mmap_read="$USE_MMAP_READS" \
|
||||
--mmap_write="$USE_MMAP_WRITES" \
|
||||
--statistics="$STATISTICS" \
|
||||
--stats_interval_seconds="$STATS_INTERVAL_SECONDS" \
|
||||
--histogram="$HISTOGRAM"
|
||||
116
scripts/run_db_bench_zvfs.sh
Executable file
116
scripts/run_db_bench_zvfs.sh
Executable file
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# =========================
|
||||
# Manual Config (edit here)
|
||||
# =========================
|
||||
# 可执行文件路径
|
||||
DB_BENCH_BIN="/home/lian/env/rocksdb-test/db_bench"
|
||||
# RocksDB 数据目录
|
||||
DB_PATH="/zvfs/rocksdb_manual"
|
||||
|
||||
# 测试类型 sets:
|
||||
# - "fillseq"
|
||||
# - "fillrandom"
|
||||
# - "readseq"
|
||||
# - "readrandom"
|
||||
# - "overwrite"
|
||||
# - "fillrandom,readrandom"
|
||||
BENCHMARKS="fillrandom,readrandom"
|
||||
|
||||
# key数
|
||||
NUM=1000000
|
||||
# 线程数
|
||||
THREADS=1
|
||||
# 随机种子
|
||||
SEED=1
|
||||
|
||||
# key大小
|
||||
KEY_SIZE=16
|
||||
# value大小
|
||||
VALUE_SIZE=400
|
||||
# SST block大小
|
||||
BLOCK_SIZE=4096
|
||||
|
||||
# block cache 大小
|
||||
CACHE_SIZE=$((512 * 1024 * 1024)) # bytes
|
||||
# memtable 大小
|
||||
WRITE_BUFFER_SIZE=$((64 * 1024 * 1024)) # bytes
|
||||
# memtable 个数
|
||||
MAX_WRITE_BUFFER_NUMBER=4
|
||||
# L1文件目标大小
|
||||
TARGET_FILE_SIZE_BASE=$((64 * 1024 * 1024)) # bytes
|
||||
# 可打开文件数
|
||||
OPEN_FILES=-1
|
||||
|
||||
# 后台 并行度
|
||||
MAX_BACKGROUND_JOBS=4
|
||||
# 压缩算法
|
||||
COMPRESSION_TYPE="none"
|
||||
# 关闭 WAL(1 = 禁用 WAL,0 = 启用 WAL)
|
||||
DISABLE_WAL=1
|
||||
SYNC=0
|
||||
|
||||
# direct I/O
|
||||
USE_DIRECT_READS=0
|
||||
USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION=0
|
||||
|
||||
# mmap I/O
|
||||
USE_MMAP_READS=0
|
||||
USE_MMAP_WRITES=0
|
||||
|
||||
# 统计
|
||||
STATISTICS=0
|
||||
# 统计打印
|
||||
STATS_INTERVAL_SECONDS=5
|
||||
# 直方图
|
||||
HISTOGRAM=0
|
||||
|
||||
# =========================
|
||||
# Run
|
||||
# =========================
|
||||
if [[ ! -x "$DB_BENCH_BIN" ]]; then
|
||||
echo "db_bench not found or not executable: $DB_BENCH_BIN" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "== db_bench manual run =="
|
||||
echo "DB_BENCH_BIN=$DB_BENCH_BIN"
|
||||
echo "DB_PATH=$DB_PATH"
|
||||
echo "BENCHMARKS=$BENCHMARKS"
|
||||
echo "NUM=$NUM THREADS=$THREADS"
|
||||
echo "KEY_SIZE=$KEY_SIZE VALUE_SIZE=$VALUE_SIZE BLOCK_SIZE=$BLOCK_SIZE"
|
||||
echo "CACHE_SIZE=$CACHE_SIZE WRITE_BUFFER_SIZE=$WRITE_BUFFER_SIZE"
|
||||
echo "MAX_WRITE_BUFFER_NUMBER=$MAX_WRITE_BUFFER_NUMBER TARGET_FILE_SIZE_BASE=$TARGET_FILE_SIZE_BASE"
|
||||
echo "OPEN_FILES=$OPEN_FILES MAX_BACKGROUND_JOBS=$MAX_BACKGROUND_JOBS"
|
||||
echo "COMPRESSION_TYPE=$COMPRESSION_TYPE DISABLE_WAL=$DISABLE_WAL SYNC=$SYNC"
|
||||
echo "USE_DIRECT_READS=$USE_DIRECT_READS USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION=$USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION"
|
||||
echo "USE_MMAP_READS=$USE_MMAP_READS USE_MMAP_WRITES=$USE_MMAP_WRITES"
|
||||
echo "STATISTICS=$STATISTICS STATS_INTERVAL_SECONDS=$STATS_INTERVAL_SECONDS HISTOGRAM=$HISTOGRAM"
|
||||
echo
|
||||
|
||||
exec "$DB_BENCH_BIN" \
|
||||
--db="$DB_PATH" \
|
||||
--benchmarks="$BENCHMARKS" \
|
||||
--num="$NUM" \
|
||||
--threads="$THREADS" \
|
||||
--seed="$SEED" \
|
||||
--key_size="$KEY_SIZE" \
|
||||
--value_size="$VALUE_SIZE" \
|
||||
--block_size="$BLOCK_SIZE" \
|
||||
--cache_size="$CACHE_SIZE" \
|
||||
--write_buffer_size="$WRITE_BUFFER_SIZE" \
|
||||
--max_write_buffer_number="$MAX_WRITE_BUFFER_NUMBER" \
|
||||
--target_file_size_base="$TARGET_FILE_SIZE_BASE" \
|
||||
--open_files="$OPEN_FILES" \
|
||||
--max_background_jobs="$MAX_BACKGROUND_JOBS" \
|
||||
--compression_type="$COMPRESSION_TYPE" \
|
||||
--disable_wal="$DISABLE_WAL" \
|
||||
--sync="$SYNC" \
|
||||
--use_direct_reads="$USE_DIRECT_READS" \
|
||||
--use_direct_io_for_flush_and_compaction="$USE_DIRECT_IO_FOR_FLUSH_AND_COMPACTION" \
|
||||
--mmap_read="$USE_MMAP_READS" \
|
||||
--mmap_write="$USE_MMAP_WRITES" \
|
||||
--statistics="$STATISTICS" \
|
||||
--stats_interval_seconds="$STATS_INTERVAL_SECONDS" \
|
||||
--histogram="$HISTOGRAM"
|
||||
1
scripts/run_test_hook_api.sh
Normal file
1
scripts/run_test_hook_api.sh
Normal file
@@ -0,0 +1 @@
|
||||
LD_PRELOAD=/home/lian/try/zvfs/src/libzvfs.so ZVFS_TEST_ROOT=/zvfs /home/lian/try/zvfs/tests/bin/hook_api_test
|
||||
@@ -10,11 +10,29 @@ include $(SPDK_ROOT_DIR)/mk/spdk.app_vars.mk
|
||||
|
||||
LIBZVFS := libzvfs.so
|
||||
|
||||
C_SRCS := zvfs.c zvfs_hook.c
|
||||
C_SRCS := \
|
||||
common/utils.c \
|
||||
spdk_engine/io_engine.c \
|
||||
fs/zvfs.c \
|
||||
fs/zvfs_inode.c \
|
||||
fs/zvfs_path_entry.c \
|
||||
fs/zvfs_open_file.c \
|
||||
fs/zvfs_sys_init.c \
|
||||
hook/zvfs_hook_init.c \
|
||||
hook/zvfs_hook_fd.c \
|
||||
hook/zvfs_hook_rw.c \
|
||||
hook/zvfs_hook_seek.c \
|
||||
hook/zvfs_hook_stat.c \
|
||||
hook/zvfs_hook_sync.c \
|
||||
hook/zvfs_hook_fcntl.c \
|
||||
hook/zvfs_hook_dir.c \
|
||||
hook/zvfs_hook_mmap.c \
|
||||
|
||||
|
||||
SPDK_LIB_LIST = $(ALL_MODULES_LIST) event event_bdev
|
||||
|
||||
LIBS += $(SPDK_LIB_LINKER_ARGS)
|
||||
CFLAGS += -I$(abspath $(CURDIR))
|
||||
LDFLAGS += -shared -rdynamic -Wl,-z,nodelete -Wl,--disable-new-dtags \
|
||||
-Wl,-rpath,$(SPDK_ROOT_DIR)/build/lib \
|
||||
-Wl,-rpath,$(SPDK_ROOT_DIR)/dpdk/build/lib
|
||||
1137
src/common/uthash.h
Normal file
1137
src/common/uthash.h
Normal file
File diff suppressed because it is too large
Load Diff
93
src/common/utils.c
Normal file
93
src/common/utils.c
Normal file
@@ -0,0 +1,93 @@
|
||||
#include "utils.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
 * Convert a byte range [offset_bytes, offset_bytes + len_bytes) into the
 * smallest range of whole io_units that covers it.
 *
 * offset_bytes   - start of the request, in bytes
 * len_bytes      - length of the request, in bytes
 * io_unit_size   - size of one io_unit in bytes, must be non-zero
 * unit_offset    - out: index of the first io_unit touched
 * unit_len       - out: number of io_units touched
 * buf_offset_out - out: byte offset of the requested data inside the
 *                  first io_unit (always < io_unit_size)
 *
 * Returns 0 on success, -EINVAL for a NULL output pointer or a zero
 * io_unit_size, and -EOVERFLOW when offset_bytes + len_bytes does not fit
 * in 64 bits. The outputs are left untouched on error. A zero-length
 * request succeeds with all three outputs set to 0.
 */
int zvfs_calc_io_units(uint64_t offset_bytes,
                       size_t len_bytes,
                       uint32_t io_unit_size,
                       uint64_t *unit_offset,
                       uint64_t *unit_len,
                       uint32_t *buf_offset_out) {
    if (!unit_offset || !unit_len || !buf_offset_out || io_unit_size == 0) {
        return -EINVAL;
    }
    if (len_bytes == 0) {
        *unit_offset = 0;
        *unit_len = 0;
        *buf_offset_out = 0;
        return 0;
    }

    /* Refuse ranges whose end is not representable instead of wrapping. */
    if ((uint64_t)len_bytes > UINT64_MAX - offset_bytes) {
        return -EOVERFLOW;
    }
    uint64_t end_bytes = offset_bytes + (uint64_t)len_bytes;

    /* Round the start down to an io_unit boundary. */
    uint64_t first_unit = offset_bytes / io_unit_size;

    /*
     * Round the end up to an io_unit boundary. Using quotient + remainder
     * avoids the "end + unit - 1" form, which wraps near UINT64_MAX.
     */
    uint64_t end_unit = end_bytes / io_unit_size;
    if ((end_bytes % io_unit_size) != 0) {
        end_unit++;
    }

    *unit_offset = first_unit;
    *unit_len = end_unit - first_unit;
    /* Where the caller's data starts inside the aligned buffer. */
    *buf_offset_out = (uint32_t)(offset_bytes % io_unit_size);

    return 0;
}
|
||||
|
||||
/*
 * Compute ceil(bytes / unit_size) and store it in *units_out.
 *
 * Returns 0 on success and -EINVAL when units_out is NULL or unit_size is
 * zero; *units_out is not written on error.
 *
 * The round-up cannot overflow: a non-zero remainder implies
 * unit_size >= 2, so the quotient is at most UINT64_MAX / 2.
 */
int zvfs_calc_ceil_units(uint64_t bytes,
                         uint64_t unit_size,
                         uint64_t *units_out) {
    if (!units_out || unit_size == 0) {
        return -EINVAL;
    }

    uint64_t units = bytes / unit_size;
    if ((bytes % unit_size) != 0) {
        units++;    /* partial trailing unit counts as a whole one */
    }

    *units_out = units;
    return 0;
}
|
||||
|
||||
/*
 * Initialise *b as an empty buffer with room for at least `initial` bytes.
 *
 * A request for 0 bytes is rounded up to 1: malloc(0) is allowed to return
 * NULL, which would be misreported as an allocation failure, and a zero
 * capacity cannot grow by doubling.
 *
 * Returns 0 on success, -1 if the allocation fails. On failure *b is left
 * in the empty state (data == NULL, cap == 0, len == 0).
 */
int buf_init(zvfs_buf_t *b, size_t initial)
{
    size_t cap = initial ? initial : 1;

    b->len = 0;
    b->cap = 0;
    b->data = malloc(cap);
    if (!b->data) return -1;

    b->cap = cap;
    return 0;
}
|
||||
|
||||
/*
 * Release the storage owned by *b and reset it to the empty state
 * (data == NULL, cap == 0, len == 0). The zvfs_buf_t itself is not freed.
 */
void buf_free(zvfs_buf_t *b)
{
    free(b->data);      /* free(NULL) is a no-op */

    b->data = NULL;
    b->cap = 0;
    b->len = 0;
}
|
||||
|
||||
/*
|
||||
* 确保缓冲区还有 need 字节可用,不够则 realloc 两倍。
|
||||
*/
|
||||
int buf_reserve(zvfs_buf_t *b, size_t need)
|
||||
{
|
||||
if (b->len + need <= b->cap) return 0;
|
||||
|
||||
size_t new_cap = b->cap * 2;
|
||||
while (new_cap < b->len + need) new_cap *= 2;
|
||||
|
||||
uint8_t *p = realloc(b->data, new_cap);
|
||||
if (!p) return -1;
|
||||
b->data = p;
|
||||
b->cap = new_cap;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Append n bytes from src to the end of *b, growing the buffer if needed.
 *
 * Returns 0 on success and -1 if the buffer could not be grown, in which
 * case *b is unchanged. Appending zero bytes succeeds without touching
 * src, so a NULL src is acceptable when n == 0.
 */
int buf_append(zvfs_buf_t *b, const void *src, size_t n)
{
    /* memcpy with a NULL pointer is undefined even for a zero length. */
    if (n == 0) return 0;

    if (buf_reserve(b, n) != 0) return -1;

    memcpy(b->data + b->len, src, n);
    b->len += n;
    return 0;
}
|
||||
29
src/common/utils.h
Normal file
29
src/common/utils.h
Normal file
@@ -0,0 +1,29 @@
|
||||
#ifndef __ZVFS_COMMON_UTILS_H__
|
||||
#define __ZVFS_COMMON_UTILS_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
int zvfs_calc_io_units(uint64_t offset_bytes,
|
||||
size_t len_bytes,
|
||||
uint32_t io_unit_size,
|
||||
uint64_t *unit_offset,
|
||||
uint64_t *unit_len,
|
||||
uint32_t *buf_offset_out);
|
||||
|
||||
int zvfs_calc_ceil_units(uint64_t bytes,
|
||||
uint64_t unit_size,
|
||||
uint64_t *units_out);
|
||||
|
||||
typedef struct {
|
||||
uint8_t *data;
|
||||
size_t cap;
|
||||
size_t len;
|
||||
} zvfs_buf_t;
|
||||
|
||||
int buf_init(zvfs_buf_t *b, size_t initial);
|
||||
void buf_free(zvfs_buf_t *b);
|
||||
int buf_reserve(zvfs_buf_t *b, size_t need);
|
||||
int buf_append(zvfs_buf_t *b, const void *src, size_t n);
|
||||
|
||||
#endif // __ZVFS_COMMON_UTILS_H__
|
||||
32
src/config.h
Normal file
32
src/config.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef __ZVFS_CONFIG_H__
|
||||
#define __ZVFS_CONFIG_H__
|
||||
|
||||
/**
|
||||
* ZVFS
|
||||
*/
|
||||
#define ZVFS_XATTR_BLOB_ID "user.zvfs.blob_id"
|
||||
|
||||
/**
|
||||
* SPDK
|
||||
*/
|
||||
// dev
|
||||
/*
 * Path of the SPDK JSON configuration file. The default is a
 * developer-local absolute path; override it at build time with
 * -DSPDK_JSON_PATH='"/path/to/config.json"'.
 */
#ifndef SPDK_JSON_PATH
#define SPDK_JSON_PATH "/home/lian/try/zvfs/src/zvfsmalloc.json"
#endif
|
||||
// #define ZVFS_BDEV "Nvme0n1"
|
||||
#ifndef ZVFS_BDEV
|
||||
#define ZVFS_BDEV "Malloc0"
|
||||
#endif
|
||||
|
||||
// super blob
|
||||
#define ZVFS_SB_MAGIC UINT64_C(0x5A5646535F534200) /* "ZVFS_SB\0" */
|
||||
#define ZVFS_SB_VERSION UINT32_C(1)
|
||||
|
||||
// dma
|
||||
#define ZVFS_DMA_BUF_SIZE (1024 * 1024)
|
||||
|
||||
// waiter
|
||||
#define WAITER_MAX_TIME 10000000
|
||||
|
||||
|
||||
|
||||
|
||||
#endif // __ZVFS_CONFIG_H__
|
||||
103
src/fs/zvfs.c
Normal file
103
src/fs/zvfs.c
Normal file
@@ -0,0 +1,103 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "config.h"
|
||||
#include "common/utils.h"
|
||||
#include "fs/zvfs.h"
|
||||
#include "fs/zvfs_inode.h"
|
||||
#include "fs/zvfs_path_entry.h"
|
||||
#include "fs/zvfs_open_file.h"
|
||||
|
||||
#include <sys/xattr.h>
|
||||
#include <sys/types.h>
|
||||
struct zvfs_fs g_fs = {0};
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* init / destroy */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int zvfs_fs_init(void) {
|
||||
memset(&g_fs, 0, sizeof(g_fs));
|
||||
|
||||
if (pthread_mutex_init(&g_fs.inode_mu, NULL) != 0) goto fail_inode;
|
||||
if (pthread_mutex_init(&g_fs.path_mu, NULL) != 0) goto fail_path;
|
||||
if (pthread_mutex_init(&g_fs.fd_mu, NULL) != 0) goto fail_fd;
|
||||
|
||||
return 0;
|
||||
|
||||
fail_fd:
|
||||
pthread_mutex_destroy(&g_fs.path_mu);
|
||||
fail_path:
|
||||
pthread_mutex_destroy(&g_fs.inode_mu);
|
||||
fail_inode:
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* 销毁 fd_table:每个 openfile 只释放结构体内存,
|
||||
* blob_close / inode 引用计数的清理应由上层在进程退出前完成。
|
||||
* 这里做"强制兜底"清理,避免内存泄漏。
|
||||
*/
|
||||
int zvfs_fs_destroy(void) {
|
||||
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
{
|
||||
struct zvfs_open_file *of, *tmp_of;
|
||||
HASH_ITER(hh, g_fs.fd_table, of, tmp_of) {
|
||||
HASH_DEL(g_fs.fd_table, of);
|
||||
openfile_free(of);
|
||||
}
|
||||
}
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
|
||||
/* 销毁 path_cache */
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
{
|
||||
struct zvfs_path_entry *pe, *tmp_pe;
|
||||
HASH_ITER(hh, g_fs.path_cache, pe, tmp_pe) {
|
||||
HASH_DEL(g_fs.path_cache, pe);
|
||||
free(pe->path);
|
||||
free(pe);
|
||||
}
|
||||
}
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
|
||||
/* 销毁 inode_table */
|
||||
pthread_mutex_lock(&g_fs.inode_mu);
|
||||
{
|
||||
struct zvfs_inode *in, *tmp_in;
|
||||
HASH_ITER(hh, g_fs.inode_table, in, tmp_in) {
|
||||
HASH_DEL(g_fs.inode_table, in);
|
||||
inode_free(in);
|
||||
}
|
||||
}
|
||||
pthread_mutex_unlock(&g_fs.inode_mu);
|
||||
|
||||
pthread_mutex_destroy(&g_fs.fd_mu);
|
||||
pthread_mutex_destroy(&g_fs.path_mu);
|
||||
pthread_mutex_destroy(&g_fs.inode_mu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* xattr helpers */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int zvfs_xattr_write_blob_id(int fd, uint64_t blob_id)
|
||||
{
|
||||
if (fsetxattr(fd, ZVFS_XATTR_BLOB_ID, &blob_id, sizeof(blob_id), 0) < 0)
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int zvfs_xattr_read_blob_id(int fd, uint64_t *blob_id_out)
|
||||
{
|
||||
ssize_t ret = fgetxattr(fd, ZVFS_XATTR_BLOB_ID, blob_id_out, sizeof(uint64_t));
|
||||
if (ret != sizeof(uint64_t)) {
|
||||
if (ret >= 0)
|
||||
errno = EIO; /* 长度不对,视为损坏 */
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
35
src/fs/zvfs.h
Normal file
35
src/fs/zvfs.h
Normal file
@@ -0,0 +1,35 @@
|
||||
#ifndef __ZVFS_FS_GLOBAL_H__
|
||||
#define __ZVFS_FS_GLOBAL_H__
|
||||
|
||||
#include <pthread.h>
|
||||
#include <stdatomic.h>
|
||||
#include <stdint.h>
|
||||
|
||||
struct zvfs_inode;
|
||||
struct zvfs_path_entry;
|
||||
struct zvfs_open_file;
|
||||
|
||||
/* Process-wide filesystem state: three hash tables, each with its own lock. */
struct zvfs_fs {
    struct zvfs_inode      *inode_table;   /* blob_id -> inode */
    struct zvfs_path_entry *path_cache;    /* path -> inode (runtime cache) */
    struct zvfs_open_file  *fd_table;      /* fd -> open file */

    pthread_mutex_t inode_mu;              /* lock for inode_table */
    pthread_mutex_t path_mu;               /* lock for path_cache */
    pthread_mutex_t fd_mu;                 /* lock for fd_table */
};
|
||||
|
||||
/*
 * Placeholder with no members yet.
 * NOTE(review): an empty struct is a GNU C extension rather than ISO C;
 * confirm whether this type is still needed.
 */
struct strace {
};
|
||||
|
||||
extern struct zvfs_fs g_fs;
|
||||
|
||||
int zvfs_fs_init(void);
|
||||
int zvfs_fs_destroy(void);
|
||||
|
||||
int zvfs_xattr_write_blob_id(int fd, uint64_t blob_id);
|
||||
int zvfs_xattr_read_blob_id(int fd, uint64_t *blob_id_out);
|
||||
|
||||
#endif // __ZVFS_FS_GLOBAL_H__
|
||||
83
src/fs/zvfs_inode.c
Normal file
83
src/fs/zvfs_inode.c
Normal file
@@ -0,0 +1,83 @@
|
||||
#include "zvfs_inode.h"
|
||||
#include "zvfs.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#include <errno.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* alloc / free */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * Allocate and initialise an inode. The inode is not inserted into the
 * global table.
 *
 * The new inode has logical_size 0, the caller's uid/gid, atime and mtime
 * set to the current time, deleted == false and ref_count == 1.
 *
 * Returns NULL if the allocation or the mutex initialisation fails.
 */
struct zvfs_inode *inode_alloc(uint64_t blob_id, mode_t mode, zvfs_itype_t itype) {
    struct zvfs_inode *in = calloc(1, sizeof(*in));
    if (!in)
        return NULL;

    /*
     * Create the lock first: if that fails the only thing to undo is the
     * allocation, and callers never receive an inode with an unusable mutex.
     */
    if (pthread_mutex_init(&in->mu, NULL) != 0) {
        free(in);
        return NULL;
    }

    in->blob_id = blob_id;
    in->logical_size = 0;
    in->itype = itype;
    in->mode = mode;
    in->uid = getuid();
    in->gid = getgid();

    time_t now = time(NULL);
    in->atime = now;
    in->mtime = now;
    in->deleted = false;

    atomic_init(&in->ref_count, 1);

    return in;
}
|
||||
|
||||
void inode_free(struct zvfs_inode *inode){
|
||||
if (!inode)
|
||||
return;
|
||||
pthread_mutex_destroy(&inode->mu);
|
||||
free(inode);
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* hash table operations (调用方持有 g_fs.inode_mu) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
void inode_insert(struct zvfs_inode *inode){
|
||||
HASH_ADD(hh, g_fs.inode_table, blob_id, sizeof(uint64_t), inode);
|
||||
}
|
||||
|
||||
struct zvfs_inode *inode_lookup(uint64_t blob_id) {
|
||||
struct zvfs_inode *in = NULL;
|
||||
HASH_FIND(hh, g_fs.inode_table, &blob_id, sizeof(uint64_t), in);
|
||||
return in;
|
||||
}
|
||||
|
||||
void inode_remove(uint64_t blob_id) {
|
||||
struct zvfs_inode *in = inode_lookup(blob_id);
|
||||
if (in)
|
||||
HASH_DELETE(hh, g_fs.inode_table, in);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* size / timestamp helpers (调用方持有 inode->mu) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
void inode_update_size(struct zvfs_inode *inode, int real_fd, uint64_t new_size) {
|
||||
inode->logical_size = new_size;
|
||||
if (real_fd >= 0)
|
||||
ftruncate(real_fd, (off_t)new_size); /* 同步 st_size,忽略错误 */
|
||||
}
|
||||
|
||||
void inode_touch_atime(struct zvfs_inode *inode) {
|
||||
inode->atime = time(NULL);
|
||||
}
|
||||
|
||||
void inode_touch_mtime(struct zvfs_inode *inode)
|
||||
{
|
||||
inode->mtime = time(NULL);
|
||||
}
|
||||
58
src/fs/zvfs_inode.h
Normal file
58
src/fs/zvfs_inode.h
Normal file
@@ -0,0 +1,58 @@
|
||||
#ifndef __ZVFS_INODE_H__
|
||||
#define __ZVFS_INODE_H__
|
||||
|
||||
#include "common/uthash.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <time.h>
|
||||
#include <pthread.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
/* Kind of object an inode describes. */
typedef enum {
    ZVFS_ITYPE_FILE = 0,   /* regular file */
    ZVFS_ITYPE_DIR  = 1,   /* directory */
} zvfs_itype_t;
|
||||
|
||||
struct zvfs_inode {
|
||||
uint64_t blob_id;
|
||||
uint64_t logical_size; // 和真实文件 st_size 保持同步
|
||||
zvfs_itype_t itype; // FILE only,DIR 不进这张表
|
||||
|
||||
mode_t mode; // 权限
|
||||
uid_t uid; //
|
||||
gid_t gid;
|
||||
time_t atime, mtime;
|
||||
|
||||
atomic_int ref_count;
|
||||
pthread_mutex_t mu; // 护 logical_size、append_offset 等更新
|
||||
bool deleted;
|
||||
|
||||
UT_hash_handle hh;
|
||||
};
|
||||
|
||||
// 分配并初始化一个 inode,不插入全局表
|
||||
struct zvfs_inode *inode_alloc(uint64_t blob_id, mode_t mode, zvfs_itype_t itype);
|
||||
|
||||
// 释放 inode 内存(调用前确保 ref_count == 0)
|
||||
void inode_free(struct zvfs_inode *inode);
|
||||
|
||||
// 插入全局表(需持有 inode_mu)
|
||||
void inode_insert(struct zvfs_inode *inode);
|
||||
|
||||
// 按 blob_id 查找(需持有 inode_mu)
|
||||
struct zvfs_inode *inode_lookup(uint64_t blob_id);
|
||||
|
||||
// 从全局表移除(需持有 inode_mu,不释放内存)
|
||||
void inode_remove(uint64_t blob_id);
|
||||
|
||||
// 更新 logical_size,同时负责调用 ftruncate 同步 st_size
|
||||
// 需持有 inode->mu
|
||||
void inode_update_size(struct zvfs_inode *inode, int real_fd, uint64_t new_size);
|
||||
|
||||
// 更新时间戳(需持有 inode->mu)
|
||||
void inode_touch_atime(struct zvfs_inode *inode);
|
||||
void inode_touch_mtime(struct zvfs_inode *inode);
|
||||
|
||||
#endif // __ZVFS_INODE_H__
|
||||
97
src/fs/zvfs_open_file.c
Normal file
97
src/fs/zvfs_open_file.c
Normal file
@@ -0,0 +1,97 @@
|
||||
#include "zvfs_open_file.h"
|
||||
#include "zvfs_inode.h"
|
||||
#include "zvfs.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* alloc / free */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
struct zvfs_open_file *openfile_alloc(int fd,
|
||||
struct zvfs_inode *inode,
|
||||
int flags,
|
||||
struct zvfs_blob_handle *handle)
|
||||
{
|
||||
struct zvfs_open_file *of = calloc(1, sizeof(*of));
|
||||
if (!of)
|
||||
return NULL;
|
||||
|
||||
of->fd = fd;
|
||||
of->inode = inode;
|
||||
of->handle = handle;
|
||||
of->flags = flags;
|
||||
of->fd_flags = 0;
|
||||
of->offset = 0;
|
||||
atomic_init(&of->ref_count, 1);
|
||||
|
||||
return of;
|
||||
}
|
||||
|
||||
/* Release the memory of an open-file record; nothing else is cleaned up. */
void openfile_free(struct zvfs_open_file *of)
{
    free(of);
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* hash table operations (调用方持有 g_fs.fd_mu) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
void openfile_insert(struct zvfs_open_file *of)
|
||||
{
|
||||
HASH_ADD_INT(g_fs.fd_table, fd, of);
|
||||
}
|
||||
|
||||
struct zvfs_open_file *openfile_lookup(int fd)
|
||||
{
|
||||
struct zvfs_open_file *of = NULL;
|
||||
HASH_FIND_INT(g_fs.fd_table, &fd, of);
|
||||
return of;
|
||||
}
|
||||
|
||||
void openfile_remove(int fd)
|
||||
{
|
||||
struct zvfs_open_file *of = openfile_lookup(fd);
|
||||
if (of)
|
||||
HASH_DEL(g_fs.fd_table, of);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* lseek (调用方持有 of->inode->mu) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
uint64_t openfile_seek(struct zvfs_open_file *of, int64_t offset, int whence)
|
||||
{
|
||||
int64_t new_off;
|
||||
|
||||
switch (whence) {
|
||||
case SEEK_SET:
|
||||
new_off = offset;
|
||||
break;
|
||||
|
||||
case SEEK_CUR:
|
||||
new_off = (int64_t)of->offset + offset;
|
||||
break;
|
||||
|
||||
case SEEK_END:
|
||||
/* logical_size 由调用方在持锁状态下保证可见 */
|
||||
new_off = (int64_t)of->inode->logical_size + offset;
|
||||
break;
|
||||
|
||||
default:
|
||||
errno = EINVAL;
|
||||
return (uint64_t)-1;
|
||||
}
|
||||
|
||||
if (new_off < 0) {
|
||||
errno = EINVAL;
|
||||
return (uint64_t)-1;
|
||||
}
|
||||
|
||||
of->offset = (uint64_t)new_off;
|
||||
return of->offset;
|
||||
}
|
||||
48
src/fs/zvfs_open_file.h
Normal file
48
src/fs/zvfs_open_file.h
Normal file
@@ -0,0 +1,48 @@
|
||||
#ifndef __ZVFS_OPEN_FILE_H__
|
||||
#define __ZVFS_OPEN_FILE_H__
|
||||
|
||||
#include "common/uthash.h"
|
||||
#include "spdk_engine/io_engine.h"
|
||||
#include <stdatomic.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifndef SPDK_BLOB_ID_DEFINED
|
||||
typedef uint64_t spdk_blob_id;
|
||||
#define SPDK_BLOB_ID_DEFINED
|
||||
#endif
|
||||
|
||||
struct zvfs_open_file {
|
||||
int fd; // key,和真实 fd 1:1
|
||||
struct zvfs_inode *inode;
|
||||
struct zvfs_blob_handle *handle;
|
||||
|
||||
int flags;
|
||||
int fd_flags;
|
||||
|
||||
uint64_t offset; // 非 APPEND 模式的当前位置
|
||||
atomic_int ref_count; // dup / close 用
|
||||
|
||||
UT_hash_handle hh;
|
||||
};
|
||||
|
||||
// 分配 openfile,不插入全局表,ref_count 初始为 1
|
||||
struct zvfs_open_file *openfile_alloc(int fd, struct zvfs_inode *inode,
|
||||
int flags, struct zvfs_blob_handle *handle);
|
||||
|
||||
// 释放内存(调用前确保 ref_count == 0,不负责 blob_close)
|
||||
void openfile_free(struct zvfs_open_file *of);
|
||||
|
||||
// 插入全局表(需持有 fd_mu)
|
||||
void openfile_insert(struct zvfs_open_file *of);
|
||||
|
||||
// 按 fd 查找(需持有 fd_mu)
|
||||
struct zvfs_open_file *openfile_lookup(int fd);
|
||||
|
||||
// 从全局表移除(需持有 fd_mu,不释放内存)
|
||||
void openfile_remove(int fd);
|
||||
|
||||
// lseek 语义:返回新 offset,出错返回 (uint64_t)-1
|
||||
// 需持有 of->inode->mu(读 logical_size)
|
||||
uint64_t openfile_seek(struct zvfs_open_file *of, int64_t offset, int whence);
|
||||
|
||||
#endif // __ZVFS_OPEN_FILE_H__
|
||||
82
src/fs/zvfs_path_entry.c
Normal file
82
src/fs/zvfs_path_entry.c
Normal file
@@ -0,0 +1,82 @@
|
||||
#include "zvfs_path_entry.h"
|
||||
#include "zvfs.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* internal helper */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
static struct zvfs_path_entry *_path_find(const char *path)
|
||||
{
|
||||
struct zvfs_path_entry *e = NULL;
|
||||
HASH_FIND_STR(g_fs.path_cache, path, e);
|
||||
return e;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* public API (调用方持有 g_fs.path_mu) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int path_cache_insert(const char *path, struct zvfs_inode *inode)
|
||||
{
|
||||
if (_path_find(path))
|
||||
return -EEXIST;
|
||||
|
||||
struct zvfs_path_entry *e = calloc(1, sizeof(*e));
|
||||
if (!e)
|
||||
return -ENOMEM;
|
||||
|
||||
e->path = strdup(path);
|
||||
if (!e->path) {
|
||||
free(e);
|
||||
return -ENOMEM;
|
||||
}
|
||||
e->inode = inode;
|
||||
|
||||
HASH_ADD_STR(g_fs.path_cache, path, e);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Public lookup: returns the cache entry for `path`, or NULL on a miss. */
struct zvfs_path_entry *path_cache_lookup(const char *path)
{
    struct zvfs_path_entry *hit = _path_find(path);

    return hit;
}
|
||||
|
||||
void path_cache_remove(const char *path)
|
||||
{
|
||||
struct zvfs_path_entry *e = _path_find(path);
|
||||
if (!e)
|
||||
return;
|
||||
HASH_DEL(g_fs.path_cache, e);
|
||||
free(e->path);
|
||||
free(e);
|
||||
}
|
||||
|
||||
int path_cache_rename(const char *old_path, const char *new_path)
|
||||
{
|
||||
struct zvfs_path_entry *old_e = _path_find(old_path);
|
||||
if (!old_e)
|
||||
return -1;
|
||||
|
||||
/* 若 new_path 已存在,先清掉旧 entry(inode 引用由上层处理) */
|
||||
struct zvfs_path_entry *new_e = _path_find(new_path);
|
||||
if (new_e) {
|
||||
HASH_DEL(g_fs.path_cache, new_e);
|
||||
free(new_e->path);
|
||||
free(new_e);
|
||||
}
|
||||
|
||||
/* 替换 key:从表中删除,修改 key 字符串,重新插入 */
|
||||
HASH_DEL(g_fs.path_cache, old_e);
|
||||
free(old_e->path);
|
||||
old_e->path = strdup(new_path);
|
||||
if (!old_e->path) {
|
||||
free(old_e);
|
||||
return -1;
|
||||
}
|
||||
HASH_ADD_STR(g_fs.path_cache, path, old_e);
|
||||
return 0;
|
||||
}
|
||||
30
src/fs/zvfs_path_entry.h
Normal file
30
src/fs/zvfs_path_entry.h
Normal file
@@ -0,0 +1,30 @@
|
||||
#ifndef __ZVFS_PATH_ENTRY_H__
|
||||
#define __ZVFS_PATH_ENTRY_H__
|
||||
|
||||
#include "common/uthash.h"
|
||||
#include <stdatomic.h>
|
||||
#include <stdint.h>
|
||||
|
||||
struct zvfs_path_entry {
|
||||
char *path; // key
|
||||
struct zvfs_inode *inode;
|
||||
|
||||
UT_hash_handle hh;
|
||||
};
|
||||
|
||||
|
||||
// 插入缓存,path 内部 strdup,inode->ref_count 不在此处修改
|
||||
// 需持有 path_mu
|
||||
int path_cache_insert(const char *path, struct zvfs_inode *inode);
|
||||
|
||||
// 查找,未命中返回 NULL(需持有 path_mu)
|
||||
struct zvfs_path_entry *path_cache_lookup(const char *path);
|
||||
|
||||
// 移除并释放 entry(不释放 inode,需持有 path_mu)
|
||||
void path_cache_remove(const char *path);
|
||||
|
||||
// rename:原子替换 key(需持有 path_mu)
|
||||
int path_cache_rename(const char *old_path, const char *new_path);
|
||||
|
||||
|
||||
#endif // __ZVFS_PATH_ENTRY_H__
|
||||
38
src/fs/zvfs_sys_init.c
Normal file
38
src/fs/zvfs_sys_init.c
Normal file
@@ -0,0 +1,38 @@
|
||||
// zvfs_sysinit.c
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "config.h"
|
||||
#include "zvfs_sys_init.h"
|
||||
#include "fs/zvfs.h" // zvfs_fs_init
|
||||
#include "spdk_engine/io_engine.h"
|
||||
|
||||
#include <pthread.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
static pthread_once_t _init_once = PTHREAD_ONCE_INIT;
|
||||
static int _init_ok = 0;
|
||||
|
||||
static void
|
||||
do_init(void)
|
||||
{
|
||||
const char *bdev = getenv("ZVFS_BDEV");
|
||||
if (!bdev) {
|
||||
bdev = ZVFS_BDEV;
|
||||
fprintf(stderr, "[zvfs] ZVFS_BDEV not set, set as (%s)\n", ZVFS_BDEV);
|
||||
}
|
||||
|
||||
if (io_engine_init(bdev) != 0) {
|
||||
fprintf(stderr, "[zvfs] FATAL: io_engine_init(%s) failed\n", bdev);
|
||||
abort();
|
||||
}
|
||||
|
||||
_init_ok = 1;
|
||||
}
|
||||
|
||||
void
|
||||
zvfs_ensure_init(void)
|
||||
{
|
||||
pthread_once(&_init_once, do_init);
|
||||
}
|
||||
15
src/fs/zvfs_sys_init.h
Normal file
15
src/fs/zvfs_sys_init.h
Normal file
@@ -0,0 +1,15 @@
|
||||
// zvfs_sysinit.h
|
||||
#ifndef __ZVFS_SYSINIT_H__
|
||||
#define __ZVFS_SYSINIT_H__
|
||||
|
||||
/*
|
||||
* 确保 io_engine 已初始化。
|
||||
* 第一次被调用时执行初始化,后续调用直接返回。
|
||||
* 线程安全:内部用 pthread_once 保证只初始化一次。
|
||||
*
|
||||
* 调用时机:第一次 open("/zvfs/...") 时触发。
|
||||
* 此时 main() 已经开始执行,SPDK 所需的运行环境已就绪。
|
||||
*/
|
||||
void zvfs_ensure_init(void);
|
||||
|
||||
#endif
|
||||
14
src/hook/zvfs_hook.h
Normal file
14
src/hook/zvfs_hook.h
Normal file
@@ -0,0 +1,14 @@
|
||||
#ifndef __ZVFS_HOOK_H__
|
||||
#define __ZVFS_HOOK_H__
|
||||
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_fd.h"
|
||||
#include "zvfs_hook_rw.h"
|
||||
#include "zvfs_hook_seek.h"
|
||||
#include "zvfs_hook_stat.h"
|
||||
#include "zvfs_hook_sync.h"
|
||||
#include "zvfs_hook_fcntl.h"
|
||||
#include "zvfs_hook_dir.h"
|
||||
#include "zvfs_hook_mmap.h"
|
||||
|
||||
#endif // __ZVFS_HOOK_H__
|
||||
276
src/hook/zvfs_hook_dir.c
Normal file
276
src/hook/zvfs_hook_dir.c
Normal file
@@ -0,0 +1,276 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "zvfs_hook_dir.h"
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_reentrant.h"
|
||||
#include "fs/zvfs.h"
|
||||
#include "fs/zvfs_inode.h"
|
||||
#include "fs/zvfs_path_entry.h"
|
||||
|
||||
/* SPDK io engine - blob_delete 声明 */
|
||||
#include "../spdk_engine/io_engine.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <linux/fs.h> /* RENAME_EXCHANGE, RENAME_NOREPLACE */
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:执行 unlink 的 zvfs 侧清理 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
|
||||
* zvfs_unlink_path - 对一个确认属于 zvfs 的绝对路径执行清理。
|
||||
*
|
||||
* 调用时机:real_unlink* 已成功返回之后。
|
||||
*
|
||||
* 逻辑:
|
||||
* 1. 持 path_mu 查 path_cache
|
||||
* 2. 找到 → 持 inode_mu 查 inode
|
||||
* 3. 持 inode->mu 检查 ref_count
|
||||
* - ref_count == 0:直接 blob_delete,inode_remove,inode_free,path_cache_remove
|
||||
* - ref_count > 0:标记 deleted = true,path_cache_remove
|
||||
* (inode 和 blob 的清理推迟到 close 路径中 ref_count 归零时)
|
||||
*/
|
||||
static void
|
||||
zvfs_unlink_path(const char *abspath)
|
||||
{
|
||||
/* --- 查 path_cache -------------------------------------------- */
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
struct zvfs_path_entry *pe = path_cache_lookup(abspath);
|
||||
if (!pe) {
|
||||
/*
|
||||
* 不在缓存里:该文件可能从未被 open 过(没有 inode 对象)。
|
||||
* 无内存状态需要清理,直接返回。
|
||||
* blob 也不存在(文件从未被 zvfs open 创建),所以安全。
|
||||
*/
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
return;
|
||||
}
|
||||
struct zvfs_inode *inode = pe->inode;
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
|
||||
/* --- 持 inode->mu 决策 ---------------------------------------- */
|
||||
pthread_mutex_lock(&inode->mu);
|
||||
int ref = atomic_load(&inode->ref_count);
|
||||
|
||||
if (ref == 0) {
|
||||
/*
|
||||
* 没有 fd 打开:立即清理。
|
||||
* 顺序:blob_delete → inode_remove(出全局表)→ path_cache_remove
|
||||
* → inode_free(释放内存)
|
||||
*/
|
||||
pthread_mutex_unlock(&inode->mu);
|
||||
|
||||
blob_delete(inode->blob_id);
|
||||
|
||||
pthread_mutex_lock(&g_fs.inode_mu);
|
||||
inode_remove(inode->blob_id);
|
||||
pthread_mutex_unlock(&g_fs.inode_mu);
|
||||
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
path_cache_remove(abspath);
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
|
||||
inode_free(inode);
|
||||
|
||||
} else {
|
||||
/*
|
||||
* 还有 fd 打开:Unix 延迟删除语义。
|
||||
* 标记 deleted,让 close 路径在 ref_count 归零时负责 blob_delete。
|
||||
* 同时把 path 从缓存里摘掉(路径已从目录树消失)。
|
||||
*/
|
||||
inode->deleted = true;
|
||||
pthread_mutex_unlock(&inode->mu);
|
||||
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
path_cache_remove(abspath);
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* unlink */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * unlink() hook.
 *
 * Re-entrant calls and paths outside zvfs go straight to real_unlink().
 * For a zvfs path the real file is removed first (its xattr goes with
 * it); only when that succeeds is the zvfs-side state cleaned up.
 */
int
unlink(const char *path)
{
    ZVFS_HOOK_ENTER();
    int rc;

    if (!ZVFS_IN_HOOK() && zvfs_is_zvfs_path(path)) {
        zvfs_ensure_init();

        rc = real_unlink(path);
        if (rc == 0)
            zvfs_unlink_path(path);
    } else {
        rc = real_unlink(path);
    }

    ZVFS_HOOK_LEAVE();
    return rc;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* unlinkat */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
unlinkat(int dirfd, const char *path, int flags)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* AT_REMOVEDIR:rmdir 语义,目录由真实 FS 管理,直接透传。
|
||||
*/
|
||||
if (flags & AT_REMOVEDIR) {
|
||||
ret = real_unlinkat(dirfd, path, flags);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* 解析绝对路径,判断是否属于 zvfs */
|
||||
char abspath[PATH_MAX];
|
||||
if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1; /* errno already set */
|
||||
}
|
||||
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(abspath)) {
|
||||
ret = real_unlinkat(dirfd, path, flags);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
ret = real_unlinkat(dirfd, path, flags);
|
||||
if (ret == 0)
|
||||
zvfs_unlink_path(abspath);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:执行 rename 的 zvfs 侧缓存更新 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
|
||||
* zvfs_rename_paths - 在 real_rename* 成功后更新 path_cache。
|
||||
*
|
||||
* 如果 newpath 原本也在缓存里(覆盖式 rename),其 inode 需要先做
|
||||
* unlink 清理(与 zvfs_unlink_path 逻辑相同)。
|
||||
*/
|
||||
static void
|
||||
zvfs_rename_paths(const char *oldabs, const char *newabs)
|
||||
{
|
||||
/* 处理 newpath 被覆盖的情况 */
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
struct zvfs_path_entry *victim = path_cache_lookup(newabs);
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
|
||||
if (victim) {
|
||||
/*
|
||||
* newpath 是 zvfs 文件且已经在缓存里:
|
||||
* real_rename 已经把它从磁盘上删掉了,
|
||||
* 走和 unlink 一样的延迟/立即 blob_delete 逻辑。
|
||||
*/
|
||||
zvfs_unlink_path(newabs);
|
||||
}
|
||||
|
||||
/* 把 oldpath 的缓存条目 rename 到 newpath */
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
path_cache_rename(oldabs, newabs);
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* rename */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * rename() hook.
 *
 * - Re-entrant call, or neither path in zvfs: pass through.
 * - Exactly one path in zvfs: unsupported; fails with EXDEV, mirroring a
 *   cross-filesystem rename.
 * - Both in zvfs: perform the real rename and, on success, update the
 *   path cache.
 */
int
rename(const char *oldpath, const char *newpath)
{
    ZVFS_HOOK_ENTER();
    int rc;

    int src_in_zvfs = zvfs_is_zvfs_path(oldpath);
    int dst_in_zvfs = zvfs_is_zvfs_path(newpath);

    if (ZVFS_IN_HOOK() || (!src_in_zvfs && !dst_in_zvfs)) {
        rc = real_rename(oldpath, newpath);
    } else if (src_in_zvfs != dst_in_zvfs) {
        errno = EXDEV;
        rc = -1;
    } else {
        zvfs_ensure_init();

        rc = real_rename(oldpath, newpath);
        if (rc == 0)
            zvfs_rename_paths(oldpath, newpath);
    }

    ZVFS_HOOK_LEAVE();
    return rc;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* renameat */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
renameat(int olddirfd, const char *oldpath,
|
||||
int newdirfd, const char *newpath)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
int ret;
|
||||
|
||||
char oldabs[PATH_MAX], newabs[PATH_MAX];
|
||||
|
||||
if (zvfs_resolve_atpath(olddirfd, oldpath, oldabs, sizeof(oldabs)) < 0 ||
|
||||
zvfs_resolve_atpath(newdirfd, newpath, newabs, sizeof(newabs)) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
int old_is_zvfs = zvfs_is_zvfs_path(oldabs);
|
||||
int new_is_zvfs = zvfs_is_zvfs_path(newabs);
|
||||
|
||||
if (ZVFS_IN_HOOK() || (!old_is_zvfs && !new_is_zvfs)) {
|
||||
ret = real_renameat(olddirfd, oldpath, newdirfd, newpath);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (old_is_zvfs != new_is_zvfs) {
|
||||
errno = EXDEV;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
ret = real_renameat(olddirfd, oldpath, newdirfd, newpath);
|
||||
if (ret == 0)
|
||||
zvfs_rename_paths(oldabs, newabs);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
32
src/hook/zvfs_hook_dir.h
Normal file
32
src/hook/zvfs_hook_dir.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef __ZVFS_HOOK_DIR_H__
|
||||
#define __ZVFS_HOOK_DIR_H__
|
||||
|
||||
#include <fcntl.h>
|
||||
|
||||
/*
|
||||
* 目录操作 hook。
|
||||
*
|
||||
* mkdir / rmdir / opendir / readdir / getdents64 全部透传,不 hook。
|
||||
* 只需要感知路径变化的操作才进这里:
|
||||
*
|
||||
* unlink / unlinkat
|
||||
* - 真实文件由 real_unlink 删除
|
||||
* - 若路径在 path_cache 中:
|
||||
* 若 ref_count == 0:blob_delete + inode_remove + path_cache_remove
|
||||
* 若 ref_count > 0:标记 inode->deleted = true,
|
||||
* ref_count 归零时(close 路径)再 blob_delete
|
||||
*
|
||||
* rename / renameat / renameat2
|
||||
* - 真实文件由 real_rename* 移动(xattr 跟随文件,不需要重写)
|
||||
* - path_cache_rename 更新内存缓存
|
||||
* - renameat2 RENAME_EXCHANGE 返回 ENOTSUP
|
||||
*/
|
||||
|
||||
int unlink(const char *path);
|
||||
int unlinkat(int dirfd, const char *path, int flags);
|
||||
|
||||
int rename(const char *oldpath, const char *newpath);
|
||||
int renameat(int olddirfd, const char *oldpath,
|
||||
int newdirfd, const char *newpath);
|
||||
|
||||
#endif // __ZVFS_HOOK_DIR_H__
|
||||
230
src/hook/zvfs_hook_fcntl.c
Normal file
230
src/hook/zvfs_hook_fcntl.c
Normal file
@@ -0,0 +1,230 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "zvfs_hook_fcntl.h"
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_fd.h" /* dup/dup2 路径 */
|
||||
#include "zvfs_hook_reentrant.h"
|
||||
#include "fs/zvfs.h"
|
||||
#include "fs/zvfs_open_file.h"
|
||||
#include "fs/zvfs_inode.h"
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <pthread.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:fcntl 核心逻辑(已确认是 zvfs fd) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
static int
|
||||
zvfs_fcntl_impl(int fd, int cmd, va_list ap)
|
||||
{
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
struct zvfs_open_file *of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
|
||||
if (!of) { errno = EBADF; return -1; }
|
||||
|
||||
switch (cmd) {
|
||||
|
||||
/* ---- 文件状态 flags ------------------------------------------ */
|
||||
case F_GETFL:
|
||||
return of->flags;
|
||||
|
||||
case F_SETFL: {
|
||||
int newfl = va_arg(ap, int);
|
||||
/*
|
||||
* 只允许修改可变位:O_APPEND、O_NONBLOCK、O_ASYNC。
|
||||
* O_RDONLY / O_WRONLY / O_RDWR 是 open 时决定的,不能改。
|
||||
* 同步给真实 fd,保持内核状态一致(影响 real_read/write)。
|
||||
*/
|
||||
int mutable_mask = O_APPEND | O_NONBLOCK | O_ASYNC;
|
||||
of->flags = (of->flags & ~mutable_mask) | (newfl & mutable_mask);
|
||||
/*
|
||||
* 也透传给真实 fd——虽然真实 fd 上的读写被我们拦截了,
|
||||
* 但 O_NONBLOCK 可能影响 pipe / socket 等透传路径。
|
||||
*/
|
||||
real_fcntl(fd, F_SETFL, of->flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ---- fd flags(FD_CLOEXEC)----------------------------------- */
|
||||
case F_GETFD:
|
||||
return of->fd_flags;
|
||||
|
||||
case F_SETFD: {
|
||||
int fdfl = va_arg(ap, int);
|
||||
of->fd_flags = fdfl;
|
||||
/* 同步给真实 fd */
|
||||
real_fcntl(fd, F_SETFD, fdfl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ---- dup 类 -------------------------------------------------- */
|
||||
case F_DUPFD:
|
||||
case F_DUPFD_CLOEXEC: {
|
||||
(void)va_arg(ap, int);
|
||||
errno = ENOTSUP;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* ---- 文件锁(不实现,假装无锁)-------------------------------- */
|
||||
case F_GETLK: {
|
||||
struct flock *fl = va_arg(ap, struct flock *);
|
||||
if (!fl) { errno = EFAULT; return -1; }
|
||||
fl->l_type = F_UNLCK; /* 假装没有任何锁 */
|
||||
return 0;
|
||||
}
|
||||
|
||||
case F_SETLK:
|
||||
case F_SETLKW:
|
||||
(void)va_arg(ap, struct flock *);
|
||||
return 0; /* 假装加锁成功 */
|
||||
|
||||
/* ---- 其他 cmd:透传给内核(同时维护真实 fd 状态)-------------- */
|
||||
default: {
|
||||
/*
|
||||
* 取出可变参数作为 void* 透传。
|
||||
* 大多数 fcntl cmd 的第三个参数是 long 或指针,
|
||||
* 用 void* 接收足够覆盖所有平台(64-bit)。
|
||||
*/
|
||||
void *arg = va_arg(ap, void *);
|
||||
return real_fcntl(fd, cmd, arg);
|
||||
}
|
||||
|
||||
} /* switch */
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* fcntl */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * fcntl() hook.
 *
 * Re-entrant calls and descriptors that are not zvfs files are passed to
 * real_fcntl(); since fcntl has no va_list variant, the third argument is
 * extracted as a void * and forwarded. zvfs descriptors are handled by
 * zvfs_fcntl_impl(), which consumes the argument list itself.
 */
int
fcntl(int fd, int cmd, ...)
{
    ZVFS_HOOK_ENTER();

    va_list ap;
    int rc;

    va_start(ap, cmd);

    if (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(fd)) {
        zvfs_ensure_init();
        rc = zvfs_fcntl_impl(fd, cmd, ap);
    } else {
        void *arg = va_arg(ap, void *);

        rc = real_fcntl(fd, cmd, arg);
    }

    va_end(ap);
    ZVFS_HOOK_LEAVE();
    return rc;
}
|
||||
|
||||
int
|
||||
fcntl64(int fd, int cmd, ...)
|
||||
{
|
||||
/*
|
||||
* fcntl64 是 glibc 在 32-bit 系统上的 large-file 变体,
|
||||
* 语义与 fcntl 相同,直接转发。
|
||||
*/
|
||||
va_list ap;
|
||||
va_start(ap, cmd);
|
||||
void *arg = va_arg(ap, void *);
|
||||
va_end(ap);
|
||||
|
||||
ZVFS_HOOK_ENTER();
|
||||
int ret;
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(fd)) {
|
||||
ret = real_fcntl64 ? real_fcntl64(fd, cmd, arg)
|
||||
: real_fcntl(fd, cmd, arg);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
va_list ap2;
|
||||
va_start(ap2, cmd);
|
||||
ret = zvfs_fcntl_impl(fd, cmd, ap2);
|
||||
va_end(ap2);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* ioctl */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
ioctl(int fd, unsigned long request, ...)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
va_list ap;
|
||||
va_start(ap, request);
|
||||
void *arg = va_arg(ap, void *);
|
||||
va_end(ap);
|
||||
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(fd)) {
|
||||
int ret = real_ioctl(fd, request, arg);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
int ret = -1;
|
||||
|
||||
switch (request) {
|
||||
|
||||
case FIONREAD: {
|
||||
/*
|
||||
* 返回当前可读字节数 = logical_size - cur_offset。
|
||||
* 结果写入 arg(int*)。
|
||||
*/
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
struct zvfs_open_file *of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
|
||||
if (!of) { errno = EBADF; ret = -1; break; }
|
||||
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
uint64_t size = of->inode->logical_size;
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
uint64_t off = of->offset;
|
||||
int avail = (off < size) ? (int)(size - off) : 0;
|
||||
if (arg) *(int *)arg = avail;
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
/*
|
||||
* 其他 ioctl:zvfs 文件不是块设备/字符设备,
|
||||
* 绝大多数 ioctl 语义不适用,返回 ENOTTY。
|
||||
* 若将来需要支持特定 ioctl 在此扩展。
|
||||
*/
|
||||
errno = ENOTTY;
|
||||
ret = -1;
|
||||
break;
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
27
src/hook/zvfs_hook_fcntl.h
Normal file
27
src/hook/zvfs_hook_fcntl.h
Normal file
@@ -0,0 +1,27 @@
|
||||
#ifndef __ZVFS_HOOK_FCNTL_H__
|
||||
#define __ZVFS_HOOK_FCNTL_H__
|
||||
|
||||
/*
|
||||
* fcntl cmd 处理策略:
|
||||
*
|
||||
* F_GETFL → 返回 of->flags
|
||||
* F_SETFL → 更新 of->flags(只允许改 O_APPEND / O_NONBLOCK)
|
||||
* F_GETFD → 返回 of->fd_flags
|
||||
* F_SETFD → 更新 of->fd_flags(FD_CLOEXEC)
|
||||
* F_DUPFD → not supported on zvfs fds yet: returns -1 with errno = ENOTSUP
|
||||
* F_DUPFD_CLOEXEC → same as F_DUPFD (ENOTSUP)
|
||||
* F_GETLK → 不实现文件锁,返回 l_type = F_UNLCK(假装没有锁)
|
||||
* F_SETLK → 直接返回 0(假装成功)
|
||||
* F_SETLKW → 直接返回 0(假装成功,不阻塞)
|
||||
* 其他 cmd → 透传给 real_fcntl(同时透传给内核,保持真实 fd 状态同步)
|
||||
*
|
||||
* ioctl cmd 处理策略:
|
||||
* FIONREAD → 返回 logical_size - cur_offset(可读字节数)
|
||||
* 其他 → 透传,或对 zvfs fd 返回 ENOTTY
|
||||
*/
|
||||
|
||||
int fcntl(int fd, int cmd, ...);
|
||||
int fcntl64(int fd, int cmd, ...);
|
||||
int ioctl(int fd, unsigned long request, ...);
|
||||
|
||||
#endif // __ZVFS_HOOK_FCNTL_H__
|
||||
549
src/hook/zvfs_hook_fd.c
Normal file
549
src/hook/zvfs_hook_fd.c
Normal file
@@ -0,0 +1,549 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "zvfs_hook_fd.h"
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_reentrant.h"
|
||||
#include "fs/zvfs.h"
|
||||
#include "fs/zvfs_inode.h"
|
||||
#include "fs/zvfs_path_entry.h"
|
||||
#include "fs/zvfs_open_file.h"
|
||||
#include "spdk_engine/io_engine.h"
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <pthread.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:open 的核心逻辑(路径已解析为绝对路径) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/**
|
||||
* zvfs_open_impl - 对一个确认属于 zvfs 的绝对路径执行 open。
|
||||
*
|
||||
* real_fd:已经由 real_open* 打开的真实 fd(用于 xattr 读写 + ftruncate)。
|
||||
* flags :open 时传入的 flags。
|
||||
* mode :O_CREAT 时的权限。
|
||||
*
|
||||
* 成功返回 real_fd(即用户拿到的 fd),失败返回 -1(errno 已设置),
|
||||
* 失败时调用方负责 real_close(real_fd)。
|
||||
*/
|
||||
static int
|
||||
zvfs_open_impl(int real_fd, const char *abspath, int flags, mode_t mode)
|
||||
{
|
||||
struct zvfs_inode *inode = NULL;
|
||||
struct zvfs_blob_handle *handle = NULL;
|
||||
uint64_t blob_id = 0;
|
||||
|
||||
if (flags & O_CREAT) {
|
||||
/* ---- 创建路径 -------------------------------------------- */
|
||||
|
||||
/* 1. 创建 blob */
|
||||
handle = blob_create(0);
|
||||
if (!handle) { errno = EIO; goto fail; }
|
||||
blob_id = handle->id;
|
||||
|
||||
/* 2. 把 blob_id 写入真实文件的 xattr */
|
||||
if (zvfs_xattr_write_blob_id(real_fd, blob_id) < 0) goto fail;
|
||||
|
||||
/* 3. logical_size = 0,让 st_size 也为 0 */
|
||||
if (real_ftruncate(real_fd, 0) < 0) goto fail;
|
||||
|
||||
/* 4. 分配 inode */
|
||||
inode = inode_alloc(blob_id, mode ? mode : 0666, ZVFS_ITYPE_FILE);
|
||||
if (!inode) { errno = ENOMEM; goto fail; }
|
||||
|
||||
/* 5. 插入全局表 */
|
||||
pthread_mutex_lock(&g_fs.inode_mu);
|
||||
inode_insert(inode);
|
||||
pthread_mutex_unlock(&g_fs.inode_mu);
|
||||
|
||||
/* 6. 插入 path_cache */
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
path_cache_insert(abspath, inode);
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
|
||||
} else {
|
||||
/* ---- 打开已有文件路径 ------------------------------------- */
|
||||
|
||||
/* 1. 先查 path_cache,命中说明另一个 fd 已经打开过 */
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
struct zvfs_path_entry *pe = path_cache_lookup(abspath);
|
||||
if (pe) inode = pe->inode;
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
|
||||
if (inode) {
|
||||
/* path_cache 命中:直接用缓存的 inode,重新 blob_open */
|
||||
blob_id = inode->blob_id;
|
||||
handle = blob_open(blob_id);
|
||||
if (!handle) { errno = EIO; goto fail; }
|
||||
/* 共享 inode,增加引用 */
|
||||
atomic_fetch_add(&inode->ref_count, 1);
|
||||
|
||||
} else {
|
||||
/* 未命中:从 xattr 读 blob_id,可能是进程首次 open */
|
||||
if (zvfs_xattr_read_blob_id(real_fd, &blob_id) < 0) {
|
||||
/* xattr 不存在:不是 zvfs 管理的文件,降级透传 */
|
||||
return real_fd; /* 直接返回,不做任何包装 */
|
||||
}
|
||||
|
||||
/* 再查 inode_table(另一个 fd 可能已经 open 但路径未缓存)*/
|
||||
pthread_mutex_lock(&g_fs.inode_mu);
|
||||
inode = inode_lookup(blob_id);
|
||||
pthread_mutex_unlock(&g_fs.inode_mu);
|
||||
|
||||
if (inode) {
|
||||
atomic_fetch_add(&inode->ref_count, 1);
|
||||
} else {
|
||||
/* 全新 inode:需从真实文件 stat 获取 mode/size */
|
||||
struct stat st;
|
||||
if (real_fstat(real_fd, &st) < 0) goto fail;
|
||||
|
||||
inode = inode_alloc(blob_id, st.st_mode, ZVFS_ITYPE_FILE);
|
||||
if (!inode) { errno = ENOMEM; goto fail; }
|
||||
inode->logical_size = (uint64_t)st.st_size;
|
||||
|
||||
pthread_mutex_lock(&g_fs.inode_mu);
|
||||
inode_insert(inode);
|
||||
pthread_mutex_unlock(&g_fs.inode_mu);
|
||||
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
path_cache_insert(abspath, inode);
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
}
|
||||
|
||||
handle = blob_open(blob_id);
|
||||
if (!handle) { errno = EIO; goto fail; }
|
||||
}
|
||||
}
|
||||
|
||||
/* ---- 分配 openfile,插入 fd_table ---------------------------- */
|
||||
struct zvfs_open_file *of = openfile_alloc(real_fd, inode, flags, handle);
|
||||
if (!of) { errno = ENOMEM; goto fail_handle; }
|
||||
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
openfile_insert(of);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
|
||||
return real_fd;
|
||||
|
||||
fail_handle:
|
||||
blob_close(handle);
|
||||
fail:
|
||||
/* inode 若刚分配(ref_count==1)需要回滚 */
|
||||
if (inode && atomic_load(&inode->ref_count) == 1) {
|
||||
pthread_mutex_lock(&g_fs.inode_mu);
|
||||
inode_remove(inode->blob_id);
|
||||
pthread_mutex_unlock(&g_fs.inode_mu);
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
path_cache_remove(abspath);
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
inode_free(inode);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* open */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
open(const char *path, int flags, ...)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
mode_t mode = 0;
|
||||
if (flags & O_CREAT) {
|
||||
va_list ap;
|
||||
va_start(ap, flags);
|
||||
mode = (mode_t)va_arg(ap, unsigned int);
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
int ret;
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) {
|
||||
ret = real_open(path, flags, mode);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/* 先让真实 FS 创建 / 打开文件(获得 real_fd) */
|
||||
int real_fd = real_open(path, flags, mode);
|
||||
if (real_fd < 0) { ZVFS_HOOK_LEAVE(); return -1; }
|
||||
|
||||
ret = zvfs_open_impl(real_fd, path, flags, mode);
|
||||
if (ret < 0) {
|
||||
int saved = errno;
|
||||
real_close(real_fd);
|
||||
errno = saved;
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
int open64(const char *path, int flags, ...)
|
||||
{
|
||||
mode_t mode = 0;
|
||||
if (flags & O_CREAT) {
|
||||
va_list ap; va_start(ap, flags);
|
||||
mode = (mode_t)va_arg(ap, unsigned int);
|
||||
va_end(ap);
|
||||
}
|
||||
return open(path, flags | O_LARGEFILE, mode);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* openat */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
openat(int dirfd, const char *path, int flags, ...)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
mode_t mode = 0;
|
||||
if (flags & O_CREAT) {
|
||||
va_list ap; va_start(ap, flags);
|
||||
mode = (mode_t)va_arg(ap, unsigned int);
|
||||
va_end(ap);
|
||||
}
|
||||
|
||||
/* 解析绝对路径判断是否属于 zvfs */
|
||||
char abspath[PATH_MAX];
|
||||
if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
int ret;
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(abspath)) {
|
||||
ret = real_openat(dirfd, path, flags, mode);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
int real_fd = real_openat(dirfd, path, flags, mode);
|
||||
if (real_fd < 0) { ZVFS_HOOK_LEAVE(); return -1; }
|
||||
|
||||
ret = zvfs_open_impl(real_fd, abspath, flags, mode);
|
||||
if (ret < 0) {
|
||||
int saved = errno;
|
||||
real_close(real_fd);
|
||||
errno = saved;
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
int openat64(int dirfd, const char *path, int flags, ...)
|
||||
{
|
||||
mode_t mode = 0;
|
||||
if (flags & O_CREAT) {
|
||||
va_list ap; va_start(ap, flags);
|
||||
mode = (mode_t)va_arg(ap, unsigned int);
|
||||
va_end(ap);
|
||||
}
|
||||
return openat(dirfd, path, flags | O_LARGEFILE, mode);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* creat */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/* creat hook: creat(path, mode) is open() with O_CREAT | O_WRONLY | O_TRUNC. */
int creat(const char *path, mode_t mode)
{
    const int creat_flags = O_CREAT | O_WRONLY | O_TRUNC;

    return open(path, creat_flags, mode);
}
|
||||
|
||||
int creat64(const char *path, mode_t mode)
|
||||
{
|
||||
return open(path, O_CREAT | O_WRONLY | O_TRUNC | O_LARGEFILE, mode);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* glibc 别名 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * glibc alias __open: forwards to the open() hook.
 * The mode argument is picked up for O_CREAT and for O_TMPFILE.
 */
int __open(const char *path, int flags, ...)
{
    /* open(2): a mode argument accompanies O_CREAT and O_TMPFILE. */
    int has_mode = (flags & O_CREAT) != 0;
#ifdef O_TMPFILE
    has_mode = has_mode || (flags & O_TMPFILE) == O_TMPFILE;
#endif

    mode_t mode = 0;
    if (has_mode) {
        va_list ap;
        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open(path, flags, mode);
}
|
||||
|
||||
/*
 * glibc alias __open64: forwards to the open64() hook.
 * The mode argument is picked up for O_CREAT and for O_TMPFILE.
 */
int __open64(const char *path, int flags, ...)
{
    /* open(2): a mode argument accompanies O_CREAT and O_TMPFILE. */
    int has_mode = (flags & O_CREAT) != 0;
#ifdef O_TMPFILE
    has_mode = has_mode || (flags & O_TMPFILE) == O_TMPFILE;
#endif

    mode_t mode = 0;
    if (has_mode) {
        va_list ap;
        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open64(path, flags, mode);
}
|
||||
|
||||
/*
 * glibc alias __libc_open: forwards to the open() hook.
 * The mode argument is picked up for O_CREAT and for O_TMPFILE.
 */
int __libc_open(const char *path, int flags, ...)
{
    /* open(2): a mode argument accompanies O_CREAT and O_TMPFILE. */
    int has_mode = (flags & O_CREAT) != 0;
#ifdef O_TMPFILE
    has_mode = has_mode || (flags & O_TMPFILE) == O_TMPFILE;
#endif

    mode_t mode = 0;
    if (has_mode) {
        va_list ap;
        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open(path, flags, mode);
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* close */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
|
||||
* zvfs_close_impl - zvfs fd 的关闭逻辑。
|
||||
*
|
||||
* 调用方已持有 fd_mu。函数内部会释放 fd_mu 后再处理 inode。
|
||||
*/
|
||||
static int
|
||||
zvfs_close_impl(int fd)
|
||||
{
|
||||
/* 持 fd_mu 取出 openfile,从表里摘除 */
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
struct zvfs_open_file *of = openfile_lookup(fd);
|
||||
if (!of) {
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
errno = EBADF;
|
||||
return -1;
|
||||
}
|
||||
int new_ref = atomic_fetch_sub(&of->ref_count, 1) - 1;
|
||||
if (new_ref == 0)
|
||||
openfile_remove(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
|
||||
if (new_ref > 0) {
|
||||
/*
|
||||
* 还有其他 dup 出来的 fd 引用同一个 openfile,
|
||||
* 只关闭真实 fd,不动 blob 和 inode。
|
||||
*/
|
||||
return real_close(fd);
|
||||
}
|
||||
|
||||
/* ---- openfile 引用归零:关闭 blob handle --------------------- */
|
||||
struct zvfs_inode *inode = of->inode;
|
||||
struct zvfs_blob_handle *handle = of->handle;
|
||||
openfile_free(of);
|
||||
|
||||
blob_close(handle);
|
||||
|
||||
/* ---- inode ref_count-- --------------------------------------- */
|
||||
int inode_ref = atomic_fetch_sub(&inode->ref_count, 1) - 1;
|
||||
|
||||
if (inode_ref == 0) {
|
||||
/*
|
||||
* 最后一个 fd 关闭了这个 inode。
|
||||
* 若 deleted:执行延迟 blob_delete。
|
||||
*/
|
||||
bool do_delete = false;
|
||||
pthread_mutex_lock(&inode->mu);
|
||||
do_delete = inode->deleted;
|
||||
pthread_mutex_unlock(&inode->mu);
|
||||
|
||||
if (do_delete)
|
||||
blob_delete(inode->blob_id);
|
||||
|
||||
pthread_mutex_lock(&g_fs.inode_mu);
|
||||
inode_remove(inode->blob_id);
|
||||
pthread_mutex_unlock(&g_fs.inode_mu);
|
||||
|
||||
/* path_cache 在 unlink 时已经摘除(deleted=true 路径)
|
||||
* 或在此处还需摘除(正常关闭最后一个 fd)*/
|
||||
if (!do_delete) {
|
||||
/* 正常关闭:path 留着,只有 inode 的引用归零时清缓存 */
|
||||
/* 注意:path_cache 里的指针指向这个即将释放的 inode,
|
||||
* 所以必须把 path_cache 条目也清掉,否则成为悬空指针 */
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
/* 遍历找到所有指向这个 inode 的 path entry 并移除
|
||||
* (一个 inode 对应一个 path,hardlink 暂不支持)*/
|
||||
struct zvfs_path_entry *pe, *tmp; (void)tmp;
|
||||
HASH_ITER(hh, g_fs.path_cache, pe, tmp) {
|
||||
if (pe->inode == inode) {
|
||||
HASH_DEL(g_fs.path_cache, pe);
|
||||
free(pe->path);
|
||||
free(pe);
|
||||
break; /* 一对一关系,找到即退 */
|
||||
}
|
||||
}
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
}
|
||||
|
||||
inode_free(inode);
|
||||
}
|
||||
|
||||
return real_close(fd);
|
||||
}
|
||||
|
||||
/*
 * close hook: zvfs descriptors go through zvfs_close_impl(); everything
 * else, and every re-entrant call, goes straight to the real close.
 */
int
close(int fd)
{
    ZVFS_HOOK_ENTER();

    int rc;
    if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(fd)) {
        rc = real_close(fd);
    } else {
        zvfs_ensure_init();
        rc = zvfs_close_impl(fd);
    }

    ZVFS_HOOK_LEAVE();
    return rc;
}
|
||||
|
||||
/* glibc alias __close: shares the close() hook. */
int __close(int fd)
{
    return close(fd);
}
|
||||
/* glibc alias __libc_close: shares the close() hook. */
int __libc_close(int fd)
{
    return close(fd);
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* close_range */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
close_range(unsigned int first, unsigned int last, int flags)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
if (ZVFS_IN_HOOK()) {
|
||||
int ret = real_close_range ? real_close_range(first, last, flags)
|
||||
: (errno = ENOSYS, -1);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* 遍历范围内所有 fd,zvfs fd 单独走 zvfs_close_impl,
|
||||
* 其余统一交给 real_close_range(如果内核支持)。
|
||||
* 若内核不支持 close_range(< 5.9),逐个 close。
|
||||
*/
|
||||
int any_err = 0;
|
||||
int inited = 0;
|
||||
for (unsigned int fd = first; fd <= last; fd++) {
|
||||
if (zvfs_is_zvfs_fd((int)fd)) {
|
||||
if (!inited) {
|
||||
zvfs_ensure_init();
|
||||
inited = 1;
|
||||
}
|
||||
if (zvfs_close_impl((int)fd) < 0) any_err = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* 让内核处理剩余非 zvfs fd(CLOEXEC 等 flags 也在这里生效) */
|
||||
if (real_close_range) {
|
||||
if (real_close_range(first, last, flags) < 0 && !any_err)
|
||||
any_err = 1;
|
||||
} else {
|
||||
/* 降级:逐个 close 非 zvfs fd */
|
||||
for (unsigned int fd = first; fd <= last; fd++) {
|
||||
if (!zvfs_is_zvfs_fd((int)fd))
|
||||
real_close((int)fd);
|
||||
}
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return any_err ? -1 : 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* dup */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * dup hook.
 *
 * Non-zvfs descriptors and re-entrant calls are forwarded to the real dup.
 * Duplicating a zvfs descriptor is not supported in this version; the call
 * fails with ENOTSUP rather than exposing wrong offset semantics.
 */
int
dup(int oldfd)
{
    ZVFS_HOOK_ENTER();

    if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(oldfd)) {
        int passthrough = real_dup(oldfd);
        ZVFS_HOOK_LEAVE();
        return passthrough;
    }

    zvfs_ensure_init();
    errno = ENOTSUP;
    ZVFS_HOOK_LEAVE();
    return -1;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* dup2 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * dup2 hook.
 *
 * oldfd not a zvfs descriptor (or re-entrant call): forwarded to the real
 * dup2.  If newfd currently is a zvfs descriptor, the kernel is about to
 * close it silently and re-point it at oldfd's file.  Its zvfs state is
 * torn down first, otherwise later calls on newfd would still be routed to
 * the old blob.  The teardown is skipped when oldfd is invalid, because
 * dup2 then fails with EBADF and must leave newfd open.
 *
 * oldfd is a zvfs descriptor: dup2(fd, fd) returns fd; any other target is
 * not supported in this version and fails with ENOTSUP.
 */
int
dup2(int oldfd, int newfd)
{
    ZVFS_HOOK_ENTER();

    if (ZVFS_IN_HOOK()) {
        int ret = real_dup2(oldfd, newfd);
        ZVFS_HOOK_LEAVE();
        return ret;
    }

    if (!zvfs_is_zvfs_fd(oldfd)) {
        if (oldfd != newfd && zvfs_is_zvfs_fd(newfd) &&
            real_fcntl(oldfd, F_GETFD) >= 0) {
            zvfs_ensure_init();
            (void)zvfs_close_impl(newfd);
        }
        int ret = real_dup2(oldfd, newfd);
        ZVFS_HOOK_LEAVE();
        return ret;
    }

    /* POSIX: dup2(oldfd, oldfd) on a valid descriptor returns oldfd. */
    if (oldfd == newfd) {
        ZVFS_HOOK_LEAVE();
        return oldfd;
    }

    zvfs_ensure_init();
    errno = ENOTSUP;
    ZVFS_HOOK_LEAVE();
    return -1;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* dup3 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
dup3(int oldfd, int newfd, int flags)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
int is_zvfs_fd = (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(oldfd));
|
||||
if (!is_zvfs_fd) {
|
||||
int ret = real_dup3(oldfd, newfd, flags);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (oldfd == newfd) {
|
||||
errno = EINVAL;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
errno = ENOTSUP;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
51
src/hook/zvfs_hook_fd.h
Normal file
51
src/hook/zvfs_hook_fd.h
Normal file
@@ -0,0 +1,51 @@
|
||||
#ifndef __ZVFS_HOOK_FD_H__
|
||||
#define __ZVFS_HOOK_FD_H__
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
/**
|
||||
* open / creat:
|
||||
* zvfs 路径 + O_CREAT → blob_create + xattr_write + inode_alloc + openfile_alloc
|
||||
* zvfs 路径,无 O_CREAT → xattr_read_blob_id + blob_open + inode_alloc(若未缓存) + openfile_alloc
|
||||
* 非 zvfs 路径 → 透传
|
||||
*
|
||||
* close:
|
||||
* zvfs fd → openfile ref_count--
|
||||
* 归零:blob_close;若 inode->deleted,blob_delete + inode_free
|
||||
* inode ref_count--(归零:path_cache_remove + inode_free)
|
||||
* real_close
|
||||
* 非 zvfs fd → 透传
|
||||
*
|
||||
* dup / dup2 / dup3:
|
||||
* zvfs fd → not supported in this version: returns -1 with errno = ENOTSUP
|
||||
* (dup2(fd, fd) returns fd; dup3(fd, fd, ...) fails with EINVAL)
|
||||
* 非 zvfs fd → 透传
|
||||
*/
|
||||
|
||||
/* open 族 */
|
||||
int open(const char *path, int flags, ...);
|
||||
int open64(const char *path, int flags, ...);
|
||||
int openat(int dirfd, const char *path, int flags, ...);
|
||||
int openat64(int dirfd, const char *path, int flags, ...);
|
||||
int creat(const char *path, mode_t mode);
|
||||
int creat64(const char *path, mode_t mode);
|
||||
|
||||
/* close 族 */
|
||||
int close(int fd);
|
||||
int close_range(unsigned int first, unsigned int last, int flags);
|
||||
|
||||
/* dup 族 */
|
||||
int dup(int oldfd);
|
||||
int dup2(int oldfd, int newfd);
|
||||
int dup3(int oldfd, int newfd, int flags);
|
||||
|
||||
/* glibc 内部别名(与 open/close 实现体共享逻辑,转发即可) */
|
||||
int __open(const char *path, int flags, ...);
|
||||
int __open64(const char *path, int flags, ...);
|
||||
int __libc_open(const char *path, int flags, ...);
|
||||
int __close(int fd);
|
||||
int __libc_close(int fd);
|
||||
|
||||
#endif // __ZVFS_HOOK_FD_H__
|
||||
298
src/hook/zvfs_hook_init.c
Normal file
298
src/hook/zvfs_hook_init.c
Normal file
@@ -0,0 +1,298 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_reentrant.h"
|
||||
#include "fs/zvfs.h"
|
||||
#include "fs/zvfs_open_file.h"
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <pthread.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 线程局部重入计数定义 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Thread-local hook nesting counter, starts at 0.
 * NOTE(review): presumably maintained by the ZVFS_HOOK_ENTER/LEAVE macros
 * from zvfs_hook_reentrant.h — confirm. */
__thread int _zvfs_hook_depth = 0;
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* zvfs 挂载点 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Paths at or below this prefix are treated as zvfs paths. */
#define ZVFS_MOUNT_PREFIX "/zvfs"
/* Prefix length without the terminating NUL; derived from the literal so
 * the two can never drift apart. */
#define ZVFS_MOUNT_PREFIX_LEN (sizeof(ZVFS_MOUNT_PREFIX) - 1)
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* real_* 函数指针定义 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/* Definitions of the real_* pointers to the original libc functions.
 * All start as NULL and are filled in by zvfs_hook_init() through
 * dlsym(RTLD_NEXT, ...).
 * NOTE(review): the *64 variants are typed with off_t; that only matches
 * the libc prototypes where off_t is 64 bits wide — confirm for 32-bit
 * builds. */
/* open / close / dup */
|
||||
int (*real_open)(const char *, int, ...) = NULL;
|
||||
int (*real_open64)(const char *, int, ...) = NULL;
|
||||
int (*real_openat)(int, const char *, int, ...) = NULL;
|
||||
int (*real_openat64)(int, const char *, int, ...) = NULL;
|
||||
int (*real_creat)(const char *, mode_t) = NULL;
|
||||
int (*real_creat64)(const char *, mode_t) = NULL;
|
||||
int (*real_close)(int) = NULL;
|
||||
int (*real_close_range)(unsigned, unsigned, unsigned) = NULL;
|
||||
int (*real_dup)(int) = NULL;
|
||||
int (*real_dup2)(int, int) = NULL;
|
||||
int (*real_dup3)(int, int, int) = NULL;
|
||||
|
||||
/* read family */
|
||||
ssize_t (*real_read)(int, void *, size_t) = NULL;
|
||||
ssize_t (*real_pread)(int, void *, size_t, off_t) = NULL;
|
||||
ssize_t (*real_pread64)(int, void *, size_t, off_t) = NULL;
|
||||
ssize_t (*real_readv)(int, const struct iovec *, int) = NULL;
|
||||
ssize_t (*real_preadv)(int, const struct iovec *, int, off_t) = NULL;
|
||||
ssize_t (*real_preadv64)(int, const struct iovec *, int, off_t) = NULL;
|
||||
ssize_t (*real_preadv2)(int, const struct iovec *, int, off_t, int) = NULL;
|
||||
|
||||
/* write family */
|
||||
ssize_t (*real_write)(int, const void *, size_t) = NULL;
|
||||
ssize_t (*real_pwrite)(int, const void *, size_t, off_t) = NULL;
|
||||
ssize_t (*real_pwrite64)(int, const void *, size_t, off_t) = NULL;
|
||||
ssize_t (*real_writev)(int, const struct iovec *, int) = NULL;
|
||||
ssize_t (*real_pwritev)(int, const struct iovec *, int, off_t) = NULL;
|
||||
ssize_t (*real_pwritev64)(int, const struct iovec *, int, off_t) = NULL;
|
||||
ssize_t (*real_pwritev2)(int, const struct iovec *, int, off_t, int) = NULL;
|
||||
|
||||
/* lseek / truncate / fallocate */
|
||||
off_t (*real_lseek)(int, off_t, int) = NULL;
|
||||
off_t (*real_lseek64)(int, off_t, int) = NULL;
|
||||
int (*real_truncate)(const char *, off_t) = NULL;
|
||||
int (*real_truncate64)(const char *, off_t) = NULL;
|
||||
int (*real_ftruncate)(int, off_t) = NULL;
|
||||
int (*real_ftruncate64)(int, off_t) = NULL;
|
||||
int (*real_fallocate)(int, int, off_t, off_t) = NULL;
|
||||
int (*real_posix_fallocate)(int, off_t, off_t) = NULL;
|
||||
|
||||
/* stat family */
|
||||
int (*real_stat)(const char *, struct stat *) = NULL;
|
||||
int (*real_stat64)(const char *, struct stat64 *) = NULL;
|
||||
int (*real_fstat)(int, struct stat *) = NULL;
|
||||
int (*real_fstat64)(int, struct stat64 *) = NULL;
|
||||
int (*real_lstat)(const char *, struct stat *) = NULL;
|
||||
int (*real_lstat64)(const char *, struct stat64 *) = NULL;
|
||||
int (*real_fstatat)(int, const char *, struct stat *, int) = NULL;
|
||||
int (*real_fstatat64)(int, const char *, struct stat64 *, int) = NULL;
|
||||
int (*real_statx)(int, const char *, int, unsigned int,
|
||||
struct statx *) = NULL;
|
||||
|
||||
/* sync family */
|
||||
int (*real_fsync)(int) = NULL;
|
||||
int (*real_fdatasync)(int) = NULL;
|
||||
int (*real_sync_file_range)(int, off_t, off_t, unsigned int) = NULL;
|
||||
|
||||
/* fcntl / ioctl */
|
||||
int (*real_fcntl)(int, int, ...) = NULL;
|
||||
int (*real_fcntl64)(int, int, ...) = NULL;
|
||||
int (*real_ioctl)(int, unsigned long, ...) = NULL;
|
||||
|
||||
/* directory-entry operations (unlink / rename) */
|
||||
int (*real_unlink)(const char *) = NULL;
|
||||
int (*real_unlinkat)(int, const char *, int) = NULL;
|
||||
int (*real_rename)(const char *, const char *) = NULL;
|
||||
int (*real_renameat)(int, const char *, int, const char *) = NULL;
|
||||
int (*real_renameat2)(int, const char *, int, const char *,
|
||||
unsigned int) = NULL;
|
||||
|
||||
/* mmap family */
|
||||
void *(*real_mmap)(void *, size_t, int, int, int, off_t) = NULL;
|
||||
void *(*real_mmap64)(void *, size_t, int, int, int, off_t) = NULL;
|
||||
int (*real_munmap)(void *, size_t) = NULL;
|
||||
int (*real_msync)(void *, size_t, int) = NULL;
|
||||
|
||||
/* fork family */
|
||||
pid_t (*real_fork)(void) = NULL;
|
||||
pid_t (*real_vfork)(void) = NULL;
|
||||
|
||||
/* glibc aliases */
|
||||
int (*real___open)(const char *, int, ...) = NULL;
|
||||
int (*real___open64)(const char *, int, ...) = NULL;
|
||||
int (*real___libc_open)(const char *, int, ...) = NULL;
|
||||
ssize_t (*real___read)(int, void *, size_t) = NULL;
|
||||
ssize_t (*real___libc_read)(int, void *, size_t) = NULL;
|
||||
ssize_t (*real___write)(int, const void *, size_t) = NULL;
|
||||
ssize_t (*real___libc_write)(int, const void *, size_t) = NULL;
|
||||
int (*real___close)(int) = NULL;
|
||||
int (*real___libc_close)(int) = NULL;
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* dlsym 辅助宏 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * Symbol lookup helpers.
 *
 * LOAD_SYM_OPTIONAL(var, name) stores dlsym(RTLD_NEXT, name) in var and
 * stays silent when the symbol is missing.
 *
 * LOAD_SYM(var, name) does the same lookup and prints a warning to stderr
 * when the result is NULL.  A missing symbol is never fatal: some glibc
 * internal aliases do not exist on every distribution, so the hooks check
 * for NULL and fall back.
 */
#define LOAD_SYM_OPTIONAL(var, name)                                        \
    do {                                                                    \
        (var) = dlsym(RTLD_NEXT, (name));                                   \
    } while (0)

#define LOAD_SYM(var, name)                                                 \
    do {                                                                    \
        LOAD_SYM_OPTIONAL(var, name);                                       \
        if ((var) == NULL)                                                  \
            fprintf(stderr, "[zvfs] WARNING: dlsym(%s) = NULL\n", (name));  \
    } while (0)
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 初始化 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * Library constructor: resolves every real_* pointer with
 * dlsym(RTLD_NEXT, ...) and then calls zvfs_fs_init().
 * Symbols loaded with LOAD_SYM print a warning to stderr when missing;
 * symbols loaded with LOAD_SYM_OPTIONAL are left NULL silently.
 */
__attribute__((constructor))
|
||||
void zvfs_hook_init(void)
|
||||
{
|
||||
/* Symbols expected to exist (a warning is printed when one is missing) */
|
||||
LOAD_SYM(real_open, "open");
|
||||
LOAD_SYM(real_open64, "open64");
|
||||
LOAD_SYM(real_openat, "openat");
|
||||
LOAD_SYM(real_openat64, "openat64");
|
||||
LOAD_SYM(real_creat, "creat");
|
||||
LOAD_SYM(real_creat64, "creat64");
|
||||
LOAD_SYM(real_close, "close");
|
||||
LOAD_SYM(real_dup, "dup");
|
||||
LOAD_SYM(real_dup2, "dup2");
|
||||
LOAD_SYM(real_dup3, "dup3");
|
||||
|
||||
LOAD_SYM(real_read, "read");
|
||||
LOAD_SYM(real_pread, "pread");
|
||||
LOAD_SYM(real_pread64, "pread64");
|
||||
LOAD_SYM(real_readv, "readv");
|
||||
LOAD_SYM(real_preadv, "preadv");
|
||||
LOAD_SYM(real_preadv64, "preadv64");
|
||||
LOAD_SYM(real_write, "write");
|
||||
LOAD_SYM(real_pwrite, "pwrite");
|
||||
LOAD_SYM(real_pwrite64, "pwrite64");
|
||||
LOAD_SYM(real_writev, "writev");
|
||||
LOAD_SYM(real_pwritev, "pwritev");
|
||||
LOAD_SYM(real_pwritev64, "pwritev64");
|
||||
|
||||
LOAD_SYM(real_lseek, "lseek");
|
||||
LOAD_SYM(real_lseek64, "lseek64");
|
||||
LOAD_SYM(real_truncate, "truncate");
|
||||
LOAD_SYM(real_truncate64, "truncate64");
|
||||
LOAD_SYM(real_ftruncate, "ftruncate");
|
||||
LOAD_SYM(real_ftruncate64, "ftruncate64");
|
||||
LOAD_SYM(real_fallocate, "fallocate");
|
||||
LOAD_SYM(real_posix_fallocate,"posix_fallocate");
|
||||
|
||||
LOAD_SYM(real_stat, "stat");
|
||||
LOAD_SYM(real_stat64, "stat64");
|
||||
LOAD_SYM(real_fstat, "fstat");
|
||||
LOAD_SYM(real_fstat64, "fstat64");
|
||||
LOAD_SYM(real_lstat, "lstat");
|
||||
LOAD_SYM(real_lstat64, "lstat64");
|
||||
LOAD_SYM(real_fstatat, "fstatat");
|
||||
LOAD_SYM(real_fstatat64, "fstatat64");
|
||||
LOAD_SYM(real_fsync, "fsync");
|
||||
LOAD_SYM(real_fdatasync, "fdatasync");
|
||||
LOAD_SYM(real_fcntl, "fcntl");
|
||||
LOAD_SYM(real_fcntl64, "fcntl64");
|
||||
LOAD_SYM(real_ioctl, "ioctl");
|
||||
|
||||
LOAD_SYM(real_unlink, "unlink");
|
||||
LOAD_SYM(real_unlinkat, "unlinkat");
|
||||
LOAD_SYM(real_rename, "rename");
|
||||
LOAD_SYM(real_renameat, "renameat");
|
||||
LOAD_SYM(real_mmap, "mmap");
|
||||
LOAD_SYM(real_mmap64, "mmap64");
|
||||
LOAD_SYM(real_munmap, "munmap");
|
||||
LOAD_SYM(real_msync, "msync");
|
||||
LOAD_SYM(real_fork, "fork");
|
||||
LOAD_SYM(real_vfork, "vfork");
|
||||
|
||||
/* Optional symbols (including glibc-internal aliases) that may be absent; no warning */
|
||||
LOAD_SYM_OPTIONAL(real_close_range, "close_range");
|
||||
LOAD_SYM_OPTIONAL(real_preadv2, "preadv2");
|
||||
LOAD_SYM_OPTIONAL(real_pwritev2, "pwritev2");
|
||||
LOAD_SYM_OPTIONAL(real_statx, "statx");
|
||||
LOAD_SYM_OPTIONAL(real_sync_file_range,"sync_file_range");
|
||||
LOAD_SYM_OPTIONAL(real_renameat2, "renameat2");
|
||||
LOAD_SYM_OPTIONAL(real___open, "__open");
|
||||
LOAD_SYM_OPTIONAL(real___open64, "__open64");
|
||||
LOAD_SYM_OPTIONAL(real___libc_open, "__libc_open");
|
||||
LOAD_SYM_OPTIONAL(real___read, "__read");
|
||||
LOAD_SYM_OPTIONAL(real___libc_read, "__libc_read");
|
||||
LOAD_SYM_OPTIONAL(real___write, "__write");
|
||||
LOAD_SYM_OPTIONAL(real___libc_write, "__libc_write");
|
||||
LOAD_SYM_OPTIONAL(real___close, "__close");
|
||||
LOAD_SYM_OPTIONAL(real___libc_close, "__libc_close");
|
||||
|
||||
/* Initialise the global fs structure */
|
||||
zvfs_fs_init();
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 路径 / fd 判断 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
zvfs_is_zvfs_path(const char *path)
|
||||
{
|
||||
if (!path)
|
||||
return 0;
|
||||
/* 路径必须以 /zvfs 开头,且后一个字符是 '/' 或 '\0' */
|
||||
if (strncmp(path, ZVFS_MOUNT_PREFIX, ZVFS_MOUNT_PREFIX_LEN) != 0)
|
||||
return 0;
|
||||
char next = path[ZVFS_MOUNT_PREFIX_LEN];
|
||||
return (next == '/' || next == '\0');
|
||||
}
|
||||
|
||||
int
|
||||
zvfs_is_zvfs_fd(int fd)
|
||||
{
|
||||
if (fd < 0)
|
||||
return 0;
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
struct zvfs_open_file *of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
return (of != NULL);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* dirfd + 相对路径 → 绝对路径 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * zvfs_resolve_atpath - turn (dirfd, path) into an absolute path string.
 *
 * @dirfd: AT_FDCWD or a directory descriptor; ignored for absolute paths.
 * @path:  absolute or relative path; NULL or "" yields the directory itself.
 * @buf:   output buffer.
 * @bufsz: size of buf in bytes.
 *
 * An absolute path is copied as is.  A relative path is appended to the
 * current working directory (AT_FDCWD) or to the target of
 * /proc/self/fd/<dirfd>.  Exactly one '/' joins the two parts, so a base
 * directory of "/" gives "/name" and not "//name".  No normalisation of
 * "." or ".." components is performed.
 *
 * Returns 0 on success.  Returns -1 with errno set when buf is unusable
 * (EINVAL), the result does not fit (ENAMETOOLONG), or getcwd/readlink
 * fail (their errno).
 */
int
zvfs_resolve_atpath(int dirfd, const char *path, char *buf, size_t bufsz)
{
    if (!buf || bufsz == 0) {
        errno = EINVAL;
        return -1;
    }

    /* Absolute path: plain copy. */
    if (path && path[0] == '/') {
        size_t len = strlen(path);
        if (len >= bufsz) {
            errno = ENAMETOOLONG;
            return -1;
        }
        memcpy(buf, path, len + 1);
        return 0;
    }

    if (dirfd == AT_FDCWD) {
        /* Base is the current working directory. */
        if (!getcwd(buf, bufsz)) return -1;
    } else {
        /* Base is whatever /proc/self/fd/<dirfd> points at. */
        char proc_path[64];
        snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", dirfd);
        ssize_t len = readlink(proc_path, buf, bufsz - 1);
        if (len < 0) return -1;
        /* A completely filled buffer means the target may be truncated. */
        if ((size_t)len >= bufsz - 1) {
            errno = ENAMETOOLONG;
            return -1;
        }
        buf[len] = '\0';
    }

    size_t path_len = path ? strlen(path) : 0;
    if (path_len == 0)
        return 0;

    /* Join with a single separator. */
    size_t dir_len  = strlen(buf);
    size_t need_sep = (dir_len == 0 || buf[dir_len - 1] != '/') ? 1 : 0;
    if (dir_len + need_sep + path_len >= bufsz) {
        errno = ENAMETOOLONG;
        return -1;
    }
    if (need_sep)
        buf[dir_len++] = '/';
    memcpy(buf + dir_len, path, path_len + 1);
    return 0;
}
|
||||
130
src/hook/zvfs_hook_init.h
Normal file
130
src/hook/zvfs_hook_init.h
Normal file
@@ -0,0 +1,130 @@
|
||||
#ifndef __ZVFS_HOOK_INIT_H__
|
||||
#define __ZVFS_HOOK_INIT_H__
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/uio.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <stdint.h>
|
||||
#include "fs/zvfs_sys_init.h"
|
||||
|
||||
/*
|
||||
* 所有原始函数指针集中在这里。
|
||||
* 命名规则:real_<funcname>
|
||||
* 通过 dlsym(RTLD_NEXT, "funcname") 在 __attribute__((constructor)) 中初始化。
|
||||
*/
|
||||
|
||||
/* open 族 */
|
||||
extern int (*real_open)(const char *path, int flags, ...);
|
||||
extern int (*real_open64)(const char *path, int flags, ...);
|
||||
extern int (*real_openat)(int dirfd, const char *path, int flags, ...);
|
||||
extern int (*real_openat64)(int dirfd, const char *path, int flags, ...);
|
||||
extern int (*real_creat)(const char *path, mode_t mode);
|
||||
extern int (*real_creat64)(const char *path, mode_t mode);
|
||||
|
||||
/* close 族 */
|
||||
extern int (*real_close)(int fd);
|
||||
extern int (*real_close_range)(unsigned int first, unsigned int last, unsigned int flags);
|
||||
|
||||
/* dup 族 */
|
||||
extern int (*real_dup)(int oldfd);
|
||||
extern int (*real_dup2)(int oldfd, int newfd);
|
||||
extern int (*real_dup3)(int oldfd, int newfd, int flags);
|
||||
|
||||
/* read 族 */
|
||||
extern ssize_t (*real_read)(int fd, void *buf, size_t count);
|
||||
extern ssize_t (*real_pread)(int fd, void *buf, size_t count, off_t offset);
|
||||
extern ssize_t (*real_pread64)(int fd, void *buf, size_t count, off64_t offset);
|
||||
extern ssize_t (*real_readv)(int fd, const struct iovec *iov, int iovcnt);
|
||||
extern ssize_t (*real_preadv)(int fd, const struct iovec *iov, int iovcnt, off_t offset);
|
||||
extern ssize_t (*real_preadv64)(int fd, const struct iovec *iov, int iovcnt, off64_t offset);
|
||||
extern ssize_t (*real_preadv2)(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
|
||||
|
||||
/* write 族 */
|
||||
extern ssize_t (*real_write)(int fd, const void *buf, size_t count);
|
||||
extern ssize_t (*real_pwrite)(int fd, const void *buf, size_t count, off_t offset);
|
||||
extern ssize_t (*real_pwrite64)(int fd, const void *buf, size_t count, off64_t offset);
|
||||
extern ssize_t (*real_writev)(int fd, const struct iovec *iov, int iovcnt);
|
||||
extern ssize_t (*real_pwritev)(int fd, const struct iovec *iov, int iovcnt, off_t offset);
|
||||
extern ssize_t (*real_pwritev64)(int fd, const struct iovec *iov, int iovcnt, off64_t offset);
|
||||
extern ssize_t (*real_pwritev2)(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags);
|
||||
|
||||
/* lseek */
|
||||
extern off_t (*real_lseek)(int fd, off_t offset, int whence);
|
||||
extern off64_t (*real_lseek64)(int fd, off64_t offset, int whence);
|
||||
|
||||
/* truncate / fallocate */
|
||||
extern int (*real_truncate)(const char *path, off_t length);
|
||||
extern int (*real_truncate64)(const char *path, off64_t length);
|
||||
extern int (*real_ftruncate)(int fd, off_t length);
|
||||
extern int (*real_ftruncate64)(int fd, off64_t length);
|
||||
extern int (*real_fallocate)(int fd, int mode, off_t offset, off_t len);
|
||||
extern int (*real_posix_fallocate)(int fd, off_t offset, off_t len);
|
||||
|
||||
/* stat 族 */
|
||||
extern int (*real_stat)(const char *path, struct stat *buf);
|
||||
extern int (*real_stat64)(const char *path, struct stat64 *buf);
|
||||
extern int (*real_fstat)(int fd, struct stat *buf);
|
||||
extern int (*real_fstat64)(int fd, struct stat64 *buf);
|
||||
extern int (*real_lstat)(const char *path, struct stat *buf);
|
||||
extern int (*real_lstat64)(const char *path, struct stat64 *buf);
|
||||
extern int (*real_fstatat)(int dirfd, const char *path, struct stat *buf, int flags);
|
||||
extern int (*real_fstatat64)(int dirfd, const char *path, struct stat64 *buf, int flags);
|
||||
extern int (*real_statx)(int dirfd, const char *path, int flags,
|
||||
unsigned int mask, struct statx *buf);
|
||||
|
||||
/* sync */
|
||||
extern int (*real_fsync)(int fd);
|
||||
extern int (*real_fdatasync)(int fd);
|
||||
extern int (*real_sync_file_range)(int fd, off64_t offset, off64_t nbytes, unsigned int flags);
|
||||
|
||||
/* fcntl / ioctl */
|
||||
extern int (*real_fcntl)(int fd, int cmd, ...);
|
||||
extern int (*real_fcntl64)(int fd, int cmd, ...);
|
||||
extern int (*real_ioctl)(int fd, unsigned long request, ...);
|
||||
|
||||
/* 目录感知 */
|
||||
extern int (*real_unlink)(const char *path);
|
||||
extern int (*real_unlinkat)(int dirfd, const char *path, int flags);
|
||||
extern int (*real_rename)(const char *oldpath, const char *newpath);
|
||||
extern int (*real_renameat)(int olddirfd, const char *oldpath,
|
||||
int newdirfd, const char *newpath);
|
||||
extern int (*real_renameat2)(int olddirfd, const char *oldpath,
|
||||
int newdirfd, const char *newpath,
|
||||
unsigned int flags);
|
||||
|
||||
/* mmap 族(预留) */
|
||||
extern void *(*real_mmap)(void *addr, size_t length, int prot, int flags,
|
||||
int fd, off_t offset);
|
||||
extern void *(*real_mmap64)(void *addr, size_t length, int prot, int flags,
|
||||
int fd, off64_t offset);
|
||||
extern int (*real_munmap)(void *addr, size_t length);
|
||||
extern int (*real_msync)(void *addr, size_t length, int flags);
|
||||
|
||||
|
||||
/* glibc 内部别名 */
|
||||
extern int (*real___open)(const char *path, int flags, ...);
|
||||
extern int (*real___open64)(const char *path, int flags, ...);
|
||||
extern int (*real___libc_open)(const char *path, int flags, ...);
|
||||
extern ssize_t (*real___read)(int fd, void *buf, size_t count);
|
||||
extern ssize_t (*real___libc_read)(int fd, void *buf, size_t count);
|
||||
extern ssize_t (*real___write)(int fd, const void *buf, size_t count);
|
||||
extern ssize_t (*real___libc_write)(int fd, const void *buf, size_t count);
|
||||
extern int (*real___close)(int fd);
|
||||
extern int (*real___libc_close)(int fd);
|
||||
|
||||
/* 初始化所有 real_* 指针,在 constructor 中调用 */
|
||||
void zvfs_hook_init(void);
|
||||
|
||||
/* 判断路径 / fd 是否属于 zvfs 接管范围 */
|
||||
int zvfs_is_zvfs_path(const char *path);
|
||||
int zvfs_is_zvfs_fd(int fd);
|
||||
|
||||
/*
|
||||
* 将 dirfd + 相对路径解析为绝对路径,写入 buf(长度 bufsz)。
|
||||
* dirfd == AT_FDCWD 时等价于以当前工作目录为基准。
|
||||
* 成功返回 0,失败返回 -1 并设置 errno。
|
||||
*/
|
||||
int zvfs_resolve_atpath(int dirfd, const char *path, char *buf, size_t bufsz);
|
||||
#endif // __ZVFS_HOOK_INIT_H__
|
||||
85
src/hook/zvfs_hook_mmap.c
Normal file
85
src/hook/zvfs_hook_mmap.c
Normal file
@@ -0,0 +1,85 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "zvfs_hook_mmap.h"
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_reentrant.h"
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* mmap / mmap64 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
void *
|
||||
mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
void *ret;
|
||||
|
||||
/*
|
||||
* MAP_ANONYMOUS:不关联任何 fd,直接透传。
|
||||
* 非 zvfs fd:直接透传。
|
||||
* zvfs fd:返回 ENOTSUP。
|
||||
*/
|
||||
if (ZVFS_IN_HOOK() || (flags & MAP_ANONYMOUS) || !zvfs_is_zvfs_fd(fd)) {
|
||||
ret = real_mmap(addr, length, prot, flags, fd, offset);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return ret;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/* zvfs fd:当前不支持 mmap */
|
||||
errno = ENOTSUP;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return MAP_FAILED;
|
||||
}
|
||||
|
||||
/*
 * mmap64 hook: delegates to the mmap hook above. The original author's note
 * states this is equivalent on 64-bit systems, where off_t is already 64-bit.
 */
void *
mmap64(void *addr, size_t length, int prot, int flags, int fd, off_t offset)
{
    void *mapping = mmap(addr, length, prot, flags, fd, offset);
    return mapping;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* munmap */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * munmap hook: always forwards to real_munmap.
 *
 * The mmap hook never creates a zvfs-backed mapping, so there is nothing
 * zvfs-specific to tear down here.
 * Future: look the range up in an mmap table and write it back via
 * blob_write before unmapping.
 */
int
munmap(void *addr, size_t length)
{
    int rc;

    ZVFS_HOOK_ENTER();
    rc = real_munmap(addr, length);
    ZVFS_HOOK_LEAVE();

    return rc;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* msync */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * msync hook: always forwards to real_msync, for the same reason as munmap
 * (no zvfs-backed mappings exist).
 * Future: look the range up in an mmap table and blob_write that range.
 */
int
msync(void *addr, size_t length, int flags)
{
    int rc;

    ZVFS_HOOK_ENTER();
    rc = real_msync(addr, length, flags);
    ZVFS_HOOK_LEAVE();

    return rc;
}
|
||||
34
src/hook/zvfs_hook_mmap.h
Normal file
34
src/hook/zvfs_hook_mmap.h
Normal file
@@ -0,0 +1,34 @@
|
||||
#ifndef __ZVFS_HOOK_MMAP_H__
|
||||
#define __ZVFS_HOOK_MMAP_H__
|
||||
|
||||
#include <sys/mman.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/*
|
||||
* mmap 族。
|
||||
*
|
||||
* 当前策略:
|
||||
* - fd 属于 zvfs → 返回 ENOTSUP,强制上层走非 mmap 路径
|
||||
* (RocksDB: options.use_mmap_reads/writes = false)
|
||||
* - fd 不属于 zvfs,或 MAP_ANONYMOUS → 透传 real_mmap
|
||||
*
|
||||
* munmap / msync:
|
||||
* zvfs fd 的 mmap 不会成功,所以 munmap/msync 里永远找不到
|
||||
* zvfs 的映射,直接透传即可。
|
||||
*
|
||||
* 预留扩展点(future):
|
||||
* 实现时在此处:
|
||||
* mmap → MAP_ANONYMOUS 分配匿名内存 + blob_read 填充
|
||||
* 将 (addr, length, inode, file_offset) 插入 mmap_table
|
||||
* munmap → 查 mmap_table,若命中则 blob_write 写回,再真正 munmap
|
||||
* msync → 查 mmap_table,blob_write 对应范围
|
||||
*/
|
||||
|
||||
void *mmap(void *addr, size_t length, int prot, int flags,
|
||||
int fd, off_t offset);
|
||||
void *mmap64(void *addr, size_t length, int prot, int flags,
|
||||
int fd, off_t offset);
|
||||
int munmap(void *addr, size_t length);
|
||||
int msync(void *addr, size_t length, int flags);
|
||||
|
||||
#endif // __ZVFS_HOOK_MMAP_H__
|
||||
32
src/hook/zvfs_hook_reentrant.h
Normal file
32
src/hook/zvfs_hook_reentrant.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#ifndef __ZVFS_HOOK_REENTRANT_H__
|
||||
#define __ZVFS_HOOK_REENTRANT_H__
|
||||
|
||||
/*
|
||||
* 线程局部重入深度计数。
|
||||
*
|
||||
* 进入任何 hook 函数时 ZVFS_HOOK_ENTER(),离开时 ZVFS_HOOK_LEAVE()。
|
||||
* 当深度 > 1 时,说明当前调用是 hook 内部发起的(例如 hook 内调用了
|
||||
* real_fstat,而 fstat 本身也被 hook),此时直接走 real_* 绕过 zvfs 逻辑。
|
||||
*
|
||||
* 典型骨架:
|
||||
*
|
||||
* int fstat(int fd, struct stat *buf)
|
||||
* {
|
||||
* ZVFS_HOOK_ENTER();
|
||||
* int ret;
|
||||
* if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(fd))
|
||||
* ret = real_fstat(fd, buf);
|
||||
* else
|
||||
* ret = zvfs_fstat_impl(fd, buf);
|
||||
* ZVFS_HOOK_LEAVE();
|
||||
* return ret;
|
||||
* }
|
||||
*/
|
||||
|
||||
extern __thread int _zvfs_hook_depth;
|
||||
|
||||
/* Bump the per-thread nesting depth on hook entry; yields the new depth. */
#define ZVFS_HOOK_ENTER()   (_zvfs_hook_depth += 1)
/* Drop the per-thread nesting depth on hook exit; yields the new depth. */
#define ZVFS_HOOK_LEAVE()   (_zvfs_hook_depth -= 1)
/* True when the current hook was entered from inside another hook. */
#define ZVFS_IN_HOOK()      (1 < _zvfs_hook_depth)
|
||||
|
||||
#endif // __ZVFS_HOOK_REENTRANT_H__
|
||||
549
src/hook/zvfs_hook_rw.c
Normal file
549
src/hook/zvfs_hook_rw.c
Normal file
@@ -0,0 +1,549 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "zvfs_hook_rw.h"
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_reentrant.h"
|
||||
#include "fs/zvfs.h"
|
||||
#include "fs/zvfs_open_file.h"
|
||||
#include "fs/zvfs_inode.h"
|
||||
#include "spdk_engine/io_engine.h"
|
||||
|
||||
#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:单段 pread / pwrite(不修改 of->offset) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
|
||||
* zvfs_pread_impl
|
||||
*
|
||||
* 从 blob 的 [offset, offset+count) 读取数据到 buf。
|
||||
* 若请求范围超出 logical_size,截断到 logical_size 边界。
|
||||
* 成功返回实际读取字节数,失败返回 -1。
|
||||
*/
|
||||
static ssize_t
|
||||
zvfs_pread_impl(struct zvfs_open_file *of,
|
||||
void *buf, size_t count, uint64_t offset)
|
||||
{
|
||||
/* 持 inode->mu 读 logical_size,防止并发 write 同时修改 */
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
uint64_t size = of->inode->logical_size;
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
/* offset 超出文件末尾:返回 0(EOF) */
|
||||
if (offset >= size)
|
||||
return 0;
|
||||
|
||||
/* 截断读取长度到文件末尾 */
|
||||
if (offset + count > size)
|
||||
count = (size_t)(size - offset);
|
||||
|
||||
if (count == 0)
|
||||
return 0;
|
||||
|
||||
if (blob_read(of->handle, offset, buf, count) < 0) {
|
||||
errno = EIO;
|
||||
return -1;
|
||||
}
|
||||
|
||||
return (ssize_t)count;
|
||||
}
|
||||
|
||||
/*
|
||||
* zvfs_pwrite_impl
|
||||
*
|
||||
* 将 buf 的 count 字节写入 blob 的 offset 处。
|
||||
* 若写入后末尾超过 logical_size,更新 logical_size 并同步 st_size。
|
||||
* 成功返回 count,失败返回 -1。
|
||||
*/
|
||||
static ssize_t
|
||||
zvfs_pwrite_impl(struct zvfs_open_file *of,
|
||||
const void *buf, size_t count, uint64_t offset)
|
||||
{
|
||||
if (count == 0)
|
||||
return 0;
|
||||
|
||||
uint64_t end = offset + count;
|
||||
|
||||
/*
|
||||
* 若写入范围超出 blob 当前物理大小,先 resize。
|
||||
* blob_resize 是 SPDK 侧的操作(可能分配新 cluster)。
|
||||
*/
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
uint64_t old_size = of->inode->logical_size;
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
if (end > old_size) {
|
||||
if (blob_resize(of->handle, end) < 0) {
|
||||
errno = EIO;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (blob_write(of->handle, offset, buf, count) < 0) {
|
||||
errno = EIO;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* 更新 logical_size(持锁,inode_update_size 负责 ftruncate) */
|
||||
if (end > old_size) {
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
if (end > of->inode->logical_size) /* double-check */
|
||||
inode_update_size(of->inode, of->fd, end);
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
}
|
||||
|
||||
return (ssize_t)count;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:iov 合并辅助 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * iov_total_len - sum of iov_len over the first iovcnt entries of iov.
 * Returns 0 when iovcnt is zero or negative.
 */
static size_t
iov_total_len(const struct iovec *iov, int iovcnt)
{
    size_t sum = 0;
    int idx = 0;

    while (idx < iovcnt) {
        sum += iov[idx].iov_len;
        idx++;
    }
    return sum;
}
|
||||
|
||||
/*
|
||||
* zvfs_iov_pread
|
||||
*
|
||||
* 将 iovec 合并为单次 blob_read:
|
||||
* 1. 一次 blob_read 读到临时 buf
|
||||
* 2. 按 iovec 顺序分发到各段
|
||||
*
|
||||
* 单次 SPDK I/O 比逐段提交效率高得多;
|
||||
* 堆分配代价(通常几个 page)远小于多次 SPDK 提交的开销。
|
||||
*/
|
||||
static ssize_t
|
||||
zvfs_iov_pread(struct zvfs_open_file *of,
|
||||
const struct iovec *iov, int iovcnt, uint64_t offset)
|
||||
{
|
||||
size_t total_len = iov_total_len(iov, iovcnt);
|
||||
if (total_len == 0) return 0;
|
||||
|
||||
/* 截断到文件末尾 */
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
uint64_t size = of->inode->logical_size;
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
if (offset >= size) return 0;
|
||||
if (offset + total_len > size)
|
||||
total_len = (size_t)(size - offset);
|
||||
|
||||
/* 分配临时 buf,单次读 */
|
||||
char *tmp = malloc(total_len);
|
||||
if (!tmp) { errno = ENOMEM; return -1; }
|
||||
|
||||
if (blob_read(of->handle, offset, tmp, total_len) < 0) {
|
||||
free(tmp);
|
||||
errno = EIO;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* 分发到各 iovec 段 */
|
||||
size_t copied = 0;
|
||||
for (int i = 0; i < iovcnt && copied < total_len; i++) {
|
||||
size_t seg = iov[i].iov_len;
|
||||
if (seg == 0) continue;
|
||||
if (copied + seg > total_len) seg = total_len - copied;
|
||||
memcpy(iov[i].iov_base, tmp + copied, seg);
|
||||
copied += seg;
|
||||
}
|
||||
|
||||
free(tmp);
|
||||
return (ssize_t)total_len;
|
||||
}
|
||||
|
||||
/*
 * zvfs_iov_pwrite - gather write implemented as one zvfs_pwrite_impl call.
 *
 * The iovec segments are concatenated into a temporary heap buffer, which is
 * then written at offset in a single call. Does not touch of->offset.
 *
 * Returns what zvfs_pwrite_impl returns, 0 for an empty iovec, or -1 with
 * errno = ENOMEM when the staging buffer cannot be allocated.
 */
static ssize_t
zvfs_iov_pwrite(struct zvfs_open_file *of,
                const struct iovec *iov, int iovcnt, uint64_t offset)
{
    size_t total = iov_total_len(iov, iovcnt);
    if (total == 0)
        return 0;

    char *staging = malloc(total);
    if (staging == NULL) {
        errno = ENOMEM;
        return -1;
    }

    /* Gather: append each non-empty segment at the write cursor. */
    char *cursor = staging;
    for (int i = 0; i < iovcnt; i++) {
        size_t n = iov[i].iov_len;
        if (n != 0) {
            memcpy(cursor, iov[i].iov_base, n);
            cursor += n;
        }
    }

    ssize_t written = zvfs_pwrite_impl(of, staging, total, offset);
    free(staging);
    return written;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:取出 of,处理重入/非 zvfs 判断 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
static inline struct zvfs_open_file *
|
||||
get_of(int fd)
|
||||
{
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
struct zvfs_open_file *of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
return of;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* read */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
ssize_t
|
||||
read(int fd, void *buf, size_t count)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of;
|
||||
if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
|
||||
ssize_t r = real_read(fd, buf, count);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
ssize_t r = zvfs_pread_impl(of, buf, count, of->offset);
|
||||
if (r > 0)
|
||||
of->offset += (uint64_t)r;
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
/* glibc-internal alias: routed through the read hook. */
ssize_t
__read(int fd, void *buf, size_t count)
{
    return read(fd, buf, count);
}

/* glibc-internal alias: routed through the read hook. */
ssize_t
__libc_read(int fd, void *buf, size_t count)
{
    return read(fd, buf, count);
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* pread / pread64 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * pread hook.
 *
 * Nested calls and non-zvfs descriptors go straight to real_pread.
 * For a zvfs descriptor the read happens at the given offset and the
 * descriptor's own offset is left untouched.
 *
 * A negative offset on a zvfs descriptor fails with errno = EINVAL
 * (previously it was cast to a huge unsigned value and reported as EOF).
 */
ssize_t
pread(int fd, void *buf, size_t count, off_t offset)
{
    ZVFS_HOOK_ENTER();

    struct zvfs_open_file *of = ZVFS_IN_HOOK() ? NULL : get_of(fd);
    if (of == NULL) {
        ssize_t passthru = real_pread(fd, buf, count, offset);
        ZVFS_HOOK_LEAVE();
        return passthru;
    }

    if (offset < 0) {
        errno = EINVAL;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    zvfs_ensure_init();

    ssize_t nread = zvfs_pread_impl(of, buf, count, (uint64_t)offset);
    ZVFS_HOOK_LEAVE();
    return nread;
}
|
||||
|
||||
/* pread64: routed through the pread hook. */
ssize_t
pread64(int fd, void *buf, size_t count, off_t offset)
{
    ssize_t nread = pread(fd, buf, count, offset);
    return nread;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* readv / preadv / preadv64 / preadv2 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
ssize_t
|
||||
readv(int fd, const struct iovec *iov, int iovcnt)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of;
|
||||
if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
|
||||
ssize_t r = real_readv(fd, iov, iovcnt);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
ssize_t r = zvfs_iov_pread(of, iov, iovcnt, of->offset);
|
||||
if (r > 0)
|
||||
of->offset += (uint64_t)r;
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
ssize_t
|
||||
preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of;
|
||||
if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
|
||||
ssize_t r = real_preadv(fd, iov, iovcnt, offset);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
ssize_t r = zvfs_iov_pread(of, iov, iovcnt, (uint64_t)offset);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
/* preadv64: routed through the preadv hook. */
ssize_t
preadv64(int fd, const struct iovec *iov, int iovcnt, off_t offset)
{
    ssize_t nread = preadv(fd, iov, iovcnt, offset);
    return nread;
}
|
||||
|
||||
ssize_t
|
||||
preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of;
|
||||
if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
|
||||
ssize_t r = real_preadv2
|
||||
? real_preadv2(fd, iov, iovcnt, offset, flags)
|
||||
: (errno = ENOSYS, (ssize_t)-1);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/*
|
||||
* RWF_NOWAIT:zvfs 无阻塞 I/O 概念,blob_read 总是同步返回,
|
||||
* 忽略该 flag,按普通 preadv 处理。
|
||||
* RWF_HIPRI / RWF_DSYNC / RWF_SYNC:同上,忽略。
|
||||
*/
|
||||
uint64_t off = (offset == (off_t)-1) ? of->offset : (uint64_t)offset;
|
||||
ssize_t r = zvfs_iov_pread(of, iov, iovcnt, off);
|
||||
if (offset == (off_t)-1 && r > 0)
|
||||
of->offset += (uint64_t)r;
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* write */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
ssize_t
|
||||
write(int fd, const void *buf, size_t count)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of;
|
||||
if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
|
||||
ssize_t r = real_write(fd, buf, count);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
uint64_t write_off;
|
||||
|
||||
if (of->flags & O_APPEND) {
|
||||
/*
|
||||
* O_APPEND:每次写入位置 = 当前 logical_size(原子操作)。
|
||||
* 持 inode->mu 保证 read-then-write 的原子性,
|
||||
* 防止两个 O_APPEND fd 并发写时覆盖彼此数据。
|
||||
*/
|
||||
/* --- O_APPEND 内联写 -------------------------------------- */
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
write_off = of->inode->logical_size; /* 重新取,防止 TOCTOU */
|
||||
uint64_t end = write_off + count;
|
||||
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
if (blob_resize(of->handle, end) < 0) {
|
||||
errno = EIO;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
if (blob_write(of->handle, write_off, buf, count) < 0) {
|
||||
errno = EIO;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
if (end > of->inode->logical_size)
|
||||
inode_update_size(of->inode, of->fd, end);
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return (ssize_t)count;
|
||||
|
||||
} else {
|
||||
write_off = of->offset;
|
||||
ssize_t r = zvfs_pwrite_impl(of, buf, count, write_off);
|
||||
if (r > 0)
|
||||
of->offset += (uint64_t)r;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
/* glibc-internal alias: routed through the write hook. */
ssize_t
__write(int fd, const void *buf, size_t count)
{
    return write(fd, buf, count);
}

/* glibc-internal alias: routed through the write hook. */
ssize_t
__libc_write(int fd, const void *buf, size_t count)
{
    return write(fd, buf, count);
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* pwrite / pwrite64 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
 * pwrite hook.
 *
 * Nested calls and non-zvfs descriptors go straight to real_pwrite.
 * For a zvfs descriptor the data is written at the given offset; O_APPEND is
 * deliberately not consulted (POSIX: pwrite writes at the specified offset)
 * and the descriptor's own offset is left untouched.
 *
 * A negative offset on a zvfs descriptor fails with errno = EINVAL.
 */
ssize_t
pwrite(int fd, const void *buf, size_t count, off_t offset)
{
    ZVFS_HOOK_ENTER();

    struct zvfs_open_file *of = ZVFS_IN_HOOK() ? NULL : get_of(fd);
    if (of == NULL) {
        ssize_t passthru = real_pwrite(fd, buf, count, offset);
        ZVFS_HOOK_LEAVE();
        return passthru;
    }

    if (offset < 0) {
        errno = EINVAL;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    zvfs_ensure_init();

    ssize_t written = zvfs_pwrite_impl(of, buf, count, (uint64_t)offset);
    ZVFS_HOOK_LEAVE();
    return written;
}
|
||||
|
||||
/* pwrite64: routed through the pwrite hook. */
ssize_t
pwrite64(int fd, const void *buf, size_t count, off_t offset)
{
    ssize_t written = pwrite(fd, buf, count, offset);
    return written;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* writev / pwritev / pwritev64 / pwritev2 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
ssize_t
|
||||
writev(int fd, const struct iovec *iov, int iovcnt)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of;
|
||||
if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
|
||||
ssize_t r = real_writev(fd, iov, iovcnt);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
ssize_t r;
|
||||
if (of->flags & O_APPEND) {
|
||||
/*
|
||||
* O_APPEND + writev:和 write 一样需要原子序列。
|
||||
* 先计算总字节数,用 iov_pwrite 完成,整个过程持 inode->mu。
|
||||
*/
|
||||
size_t total_len = 0;
|
||||
for (int i = 0; i < iovcnt; i++) total_len += iov[i].iov_len;
|
||||
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
uint64_t write_off = of->inode->logical_size;
|
||||
uint64_t end = write_off + total_len;
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
if (blob_resize(of->handle, end) < 0) { errno = EIO; ZVFS_HOOK_LEAVE(); return -1; }
|
||||
r = zvfs_iov_pwrite(of, iov, iovcnt, write_off);
|
||||
|
||||
if (r > 0) {
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
uint64_t new_end = write_off + (uint64_t)r;
|
||||
if (new_end > of->inode->logical_size)
|
||||
inode_update_size(of->inode, of->fd, new_end);
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
}
|
||||
} else {
|
||||
r = zvfs_iov_pwrite(of, iov, iovcnt, of->offset);
|
||||
if (r > 0) of->offset += (uint64_t)r;
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
ssize_t
|
||||
pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of;
|
||||
if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
|
||||
ssize_t r = real_pwritev(fd, iov, iovcnt, offset);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
ssize_t r = zvfs_iov_pwrite(of, iov, iovcnt, (uint64_t)offset);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
/* pwritev64: routed through the pwritev hook. */
ssize_t
pwritev64(int fd, const struct iovec *iov, int iovcnt, off_t offset)
{
    ssize_t written = pwritev(fd, iov, iovcnt, offset);
    return written;
}
|
||||
|
||||
ssize_t
|
||||
pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of;
|
||||
if (ZVFS_IN_HOOK() || !(of = get_of(fd))) {
|
||||
ssize_t r = real_pwritev2
|
||||
? real_pwritev2(fd, iov, iovcnt, offset, flags)
|
||||
: (errno = ENOSYS, (ssize_t)-1);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/* flags(RWF_SYNC/RWF_DSYNC/RWF_APPEND 等):
|
||||
* zvfs 无缓冲区,所有写均同步落盘,忽略 flags。
|
||||
* offset == -1:使用并更新 of->offset。 */
|
||||
uint64_t off = (offset == (off_t)-1) ? of->offset : (uint64_t)offset;
|
||||
ssize_t r = zvfs_iov_pwrite(of, iov, iovcnt, off);
|
||||
if (offset == (off_t)-1 && r > 0)
|
||||
of->offset += (uint64_t)r;
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
52
src/hook/zvfs_hook_rw.h
Normal file
52
src/hook/zvfs_hook_rw.h
Normal file
@@ -0,0 +1,52 @@
|
||||
#ifndef __ZVFS_HOOK_RW_H__
|
||||
#define __ZVFS_HOOK_RW_H__
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/uio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/*
|
||||
* read / write 族。
|
||||
*
|
||||
* 所有变体最终收敛到两个内部实现:
|
||||
* zvfs_pread_impl (fd, buf, count, offset)
|
||||
* zvfs_pwrite_impl(fd, buf, count, offset)
|
||||
*
|
||||
* offset 语义:
|
||||
* - pread/pwrite 系列:直接使用传入 offset,不修改 of->offset
|
||||
* - read/write 系列:使用 of->offset,完成后更新
|
||||
* - O_APPEND write :每次写前持 inode->mu 取 logical_size 作为 offset
|
||||
*
|
||||
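 * iov variants (readv/writev/preadv/pwritev):
 *   The iovec array is coalesced through one temporary heap buffer so that
 *   each call issues a single blob read or write; for reads the data is then
 *   scattered back into the caller's segments in order.
 *   This needs no scatter/gather support in the SPDK layer. If the SPDK
 *   layer gains SGL support later, this staging step can be replaced.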
|
||||
*/
|
||||
|
||||
/* read 族 */
|
||||
ssize_t read(int fd, void *buf, size_t count);
|
||||
ssize_t pread(int fd, void *buf, size_t count, off_t offset);
|
||||
ssize_t pread64(int fd, void *buf, size_t count, off_t offset);
|
||||
ssize_t readv(int fd, const struct iovec *iov, int iovcnt);
|
||||
ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset);
|
||||
ssize_t preadv64(int fd, const struct iovec *iov, int iovcnt, off_t offset);
|
||||
ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset,
|
||||
int flags);
|
||||
|
||||
/* write 族 */
|
||||
ssize_t write(int fd, const void *buf, size_t count);
|
||||
ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset);
|
||||
ssize_t pwrite64(int fd, const void *buf, size_t count, off_t offset);
|
||||
ssize_t writev(int fd, const struct iovec *iov, int iovcnt);
|
||||
ssize_t pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset);
|
||||
ssize_t pwritev64(int fd, const struct iovec *iov, int iovcnt, off_t offset);
|
||||
ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset,
|
||||
int flags);
|
||||
|
||||
/* glibc 别名 */
|
||||
ssize_t __read(int fd, void *buf, size_t count);
|
||||
ssize_t __libc_read(int fd, void *buf, size_t count);
|
||||
ssize_t __write(int fd, const void *buf, size_t count);
|
||||
ssize_t __libc_write(int fd, const void *buf, size_t count);
|
||||
|
||||
#endif // __ZVFS_HOOK_RW_H__
|
||||
301
src/hook/zvfs_hook_seek.c
Normal file
301
src/hook/zvfs_hook_seek.c
Normal file
@@ -0,0 +1,301 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "zvfs_hook_seek.h"
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_reentrant.h"
|
||||
#include "fs/zvfs.h"
|
||||
#include "fs/zvfs_open_file.h"
|
||||
#include "fs/zvfs_inode.h"
|
||||
#include "fs/zvfs_path_entry.h"
|
||||
#include "spdk_engine/io_engine.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <linux/falloc.h> /* FALLOC_FL_* */
|
||||
#include <pthread.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* lseek / lseek64 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
off_t
|
||||
lseek(int fd, off_t offset, int whence)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of = NULL;
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
}
|
||||
|
||||
if (!of) {
|
||||
off_t r = real_lseek(fd, offset, whence);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/*
|
||||
* O_APPEND fd 的 lseek:POSIX 允许 lseek,但下次 write 时
|
||||
* 仍会从文件末尾写。lseek 只影响 read 的位置。
|
||||
* 我们照常更新 of->offset。
|
||||
*/
|
||||
pthread_mutex_lock(&of->inode->mu); /* SEEK_END 需读 logical_size */
|
||||
uint64_t new_off = openfile_seek(of, (int64_t)offset, whence);
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
if (new_off == (uint64_t)-1) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return (off_t)-1;
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return (off_t)new_off;
|
||||
}
|
||||
|
||||
/* lseek64: routed through the lseek hook. */
off_t
lseek64(int fd, off_t offset, int whence)
{
    off_t pos = lseek(fd, offset, whence);
    return pos;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:按 inode 指针做 truncate(path / fd 路径共用) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
|
||||
/*
|
||||
* zvfs_truncate_by_inode - 对有 handle 的 openfile 做 truncate。
|
||||
* 找到任意一个打开该 inode 的 openfile 取其 handle。
|
||||
*/
|
||||
static int
|
||||
zvfs_truncate_inode_with_handle(struct zvfs_inode *inode,
|
||||
int real_fd, uint64_t new_size)
|
||||
{
|
||||
/* 在 fd_table 里找一个指向该 inode 的 openfile 取 handle */
|
||||
struct zvfs_blob_handle *handle = NULL;
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
struct zvfs_open_file *of, *tmp;
|
||||
HASH_ITER(hh, g_fs.fd_table, of, tmp) {
|
||||
(void)tmp;
|
||||
if (of->inode == inode) {
|
||||
handle = of->handle;
|
||||
break;
|
||||
}
|
||||
}
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
|
||||
pthread_mutex_lock(&inode->mu);
|
||||
uint64_t old_size = inode->logical_size;
|
||||
pthread_mutex_unlock(&inode->mu);
|
||||
|
||||
if (new_size != old_size && handle) {
|
||||
if (blob_resize(handle, new_size) < 0) {
|
||||
errno = EIO;
|
||||
return -1;
|
||||
}
|
||||
} else if (new_size != old_size && !handle) {
|
||||
/*
|
||||
* 文件未被打开:需要临时 blob_open。
|
||||
* 这种情况下 truncate(path, ...) 被调用但文件没有 fd。
|
||||
*/
|
||||
handle = blob_open(inode->blob_id);
|
||||
if (!handle) { errno = EIO; return -1; }
|
||||
int rc = blob_resize(handle, new_size);
|
||||
blob_close(handle);
|
||||
if (rc < 0) { errno = EIO; return -1; }
|
||||
}
|
||||
|
||||
pthread_mutex_lock(&inode->mu);
|
||||
inode_update_size(inode, real_fd, new_size);
|
||||
pthread_mutex_unlock(&inode->mu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* ftruncate / ftruncate64 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
ftruncate(int fd, off_t length)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of = NULL;
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
}
|
||||
|
||||
if (!of) {
|
||||
int r = real_ftruncate(fd, length);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
if (length < 0) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; }
|
||||
|
||||
int r = zvfs_truncate_inode_with_handle(of->inode, fd, (uint64_t)length);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
/* ftruncate64: large-file alias; forwards to the ftruncate hook. */
int
ftruncate64(int fd, off_t length)
{
    return ftruncate(fd, length);
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* truncate / truncate64(按路径) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
truncate(const char *path, off_t length)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) {
|
||||
int r = real_truncate(path, length);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
if (length < 0) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; }
|
||||
|
||||
/* 查 path_cache 拿 inode */
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
struct zvfs_path_entry *pe = path_cache_lookup(path);
|
||||
struct zvfs_inode *inode = pe ? pe->inode : NULL;
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
|
||||
if (!inode) {
|
||||
/*
|
||||
* inode 不在缓存:文件存在于 FS 但从未被 open。
|
||||
* 需要读 xattr 拿 blob_id,临时构建 inode。
|
||||
* 最简单的做法:先 real_open,再走 zvfs 路径,再 real_close。
|
||||
* 这里直接调 real_truncate 改 st_size,但 blob 不会被截断。
|
||||
*
|
||||
* 更正确的做法:open + ftruncate + close。
|
||||
* 调用方通常不会在 file 未被打开的情况下做 truncate,
|
||||
* 所以这里先报 ENOENT(找不到 zvfs inode)作为安全兜底。
|
||||
*/
|
||||
errno = ENOENT;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
int r = zvfs_truncate_inode_with_handle(inode, -1, (uint64_t)length);
|
||||
|
||||
/* 同步真实文件 st_size(real_truncate 更新磁盘元数据) */
|
||||
if (r == 0)
|
||||
real_truncate(path, length);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
/* truncate64: large-file alias; forwards to the truncate hook. */
int
truncate64(const char *path, off_t length)
{
    return truncate(path, length);
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* fallocate */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
fallocate(int fd, int mode, off_t offset, off_t len)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of = NULL;
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
}
|
||||
|
||||
if (!of) {
|
||||
int r = real_fallocate(fd, mode, offset, len);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
if (offset < 0 || len <= 0) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; }
|
||||
|
||||
/* FALLOC_FL_PUNCH_HOLE:打孔,暂不支持 */
|
||||
if (mode & FALLOC_FL_PUNCH_HOLE) {
|
||||
errno = ENOTSUP;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* FALLOC_FL_KEEP_SIZE:预分配但不改变文件逻辑大小,直接返回 0 */
|
||||
if (mode & FALLOC_FL_KEEP_SIZE) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* 普通 fallocate(mode == 0):
|
||||
* 确保 [offset, offset+len) 范围内的空间被"分配"。
|
||||
* zvfs 的语义:把 logical_size 扩展到 max(logical_size, offset+len)。
|
||||
* 不提前 blob_resize,因为 SPDK cluster 按写入时分配更高效。
|
||||
*/
|
||||
uint64_t new_end = (uint64_t)offset + (uint64_t)len;
|
||||
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
if (new_end > of->inode->logical_size)
|
||||
inode_update_size(of->inode, fd, new_end);
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* posix_fallocate */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
posix_fallocate(int fd, off_t offset, off_t len)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of = NULL;
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
}
|
||||
|
||||
if (!of) {
|
||||
int r = real_posix_fallocate(fd, offset, len);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/*
|
||||
* posix_fallocate 不接受 mode 参数,语义等价于 fallocate(fd, 0, ...)。
|
||||
* 注意:posix_fallocate 出错时返回错误码(正值),不设置 errno。
|
||||
*/
|
||||
if (offset < 0 || len <= 0) { ZVFS_HOOK_LEAVE(); return EINVAL; }
|
||||
|
||||
uint64_t new_end = (uint64_t)offset + (uint64_t)len;
|
||||
|
||||
pthread_mutex_lock(&of->inode->mu);
|
||||
if (new_end > of->inode->logical_size)
|
||||
inode_update_size(of->inode, fd, new_end);
|
||||
pthread_mutex_unlock(&of->inode->mu);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
33
src/hook/zvfs_hook_seek.h
Normal file
33
src/hook/zvfs_hook_seek.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#ifndef __ZVFS_HOOK_SEEK_H__
|
||||
#define __ZVFS_HOOK_SEEK_H__
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/*
 * lseek: updates of->offset. O_APPEND descriptors are not special-cased
 *   by the hook; their stored offset is updated as well.
 *
 * truncate / ftruncate:
 *   Update inode->logical_size and sync st_size (ftruncate on the real fd).
 *   Whenever new_size differs from the old size (grow or shrink), the blob
 *   is resized with blob_resize.
 *
 * fallocate / posix_fallocate:
 *   zvfs has no notion of "holes"; blobs grow on demand.
 *   For zvfs descriptors fallocate only updates logical_size (reserving
 *   logical space) and does not call blob_resize (to avoid allocating
 *   SPDK clusters ahead of time).
 *   FALLOC_FL_KEEP_SIZE: logical_size is left unchanged, returns 0.
 *   FALLOC_FL_PUNCH_HOLE: not supported yet, fails with ENOTSUP.
 */
|
||||
|
||||
off_t lseek(int fd, off_t offset, int whence);
|
||||
off_t lseek64(int fd, off_t offset, int whence);
|
||||
|
||||
int truncate(const char *path, off_t length);
|
||||
int truncate64(const char *path, off_t length);
|
||||
int ftruncate(int fd, off_t length);
|
||||
int ftruncate64(int fd, off_t length);
|
||||
|
||||
int fallocate(int fd, int mode, off_t offset, off_t len);
|
||||
int posix_fallocate(int fd, off_t offset, off_t len);
|
||||
|
||||
#endif // __ZVFS_HOOK_SEEK_H__
|
||||
404
src/hook/zvfs_hook_stat.c
Normal file
404
src/hook/zvfs_hook_stat.c
Normal file
@@ -0,0 +1,404 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "zvfs_hook_stat.h"
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_reentrant.h"
|
||||
#include "fs/zvfs.h"
|
||||
#include "fs/zvfs_inode.h"
|
||||
#include "fs/zvfs_open_file.h"
|
||||
#include "fs/zvfs_path_entry.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <pthread.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:用 inode 覆盖 stat 结构体的 zvfs 相关字段 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
static void
|
||||
patch_stat(struct stat *st, struct zvfs_inode *inode)
|
||||
{
|
||||
pthread_mutex_lock(&inode->mu);
|
||||
st->st_size = (off_t)inode->logical_size;
|
||||
st->st_atime = inode->atime;
|
||||
st->st_mtime = inode->mtime;
|
||||
pthread_mutex_unlock(&inode->mu);
|
||||
|
||||
/*
|
||||
* st_blocks:以 512 字节为单位的"实际占用块数"。
|
||||
* zvfs 数据在 SPDK,真实文件几乎为空(只有 xattr),
|
||||
* 按 logical_size 估算,给上层一个合理的值。
|
||||
* (logical_size + 511) / 512 向上取整。
|
||||
*/
|
||||
st->st_blocks = (blkcnt_t)((st->st_size + 511) / 512);
|
||||
}
|
||||
|
||||
static void
|
||||
patch_stat64(struct stat64 *st, struct zvfs_inode *inode)
|
||||
{
|
||||
pthread_mutex_lock(&inode->mu);
|
||||
st->st_size = (off64_t)inode->logical_size;
|
||||
st->st_atime = inode->atime;
|
||||
st->st_mtime = inode->mtime;
|
||||
pthread_mutex_unlock(&inode->mu);
|
||||
|
||||
st->st_blocks = (blkcnt_t)((st->st_size + 511) / 512);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* 内部:按路径找 inode(先查缓存,缓存未命中则检查 xattr) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/*
|
||||
* zvfs_inode_by_path
|
||||
*
|
||||
* 返回路径对应的 inode 指针(不增加 ref_count,调用方只读使用)。
|
||||
* 若路径不是 zvfs 文件(无 xattr)返回 NULL。
|
||||
*
|
||||
* 注意:返回的指针仅在持有 path_mu / inode_mu 之外使用时有效,
|
||||
* 调用方需在使用期间持有 inode->mu 或确保文件未被 close。
|
||||
* 对 stat 路径(只读 logical_size/atime/mtime),
|
||||
* 短暂持有 inode->mu 即可,无需长期持有。
|
||||
*/
|
||||
static struct zvfs_inode *
|
||||
zvfs_inode_by_path(const char *path)
|
||||
{
|
||||
/* 1. 先查 path_cache */
|
||||
pthread_mutex_lock(&g_fs.path_mu);
|
||||
struct zvfs_path_entry *pe = path_cache_lookup(path);
|
||||
struct zvfs_inode *inode = pe ? pe->inode : NULL;
|
||||
pthread_mutex_unlock(&g_fs.path_mu);
|
||||
|
||||
if (inode)
|
||||
return inode;
|
||||
|
||||
/* 2. path_cache 未命中:检查 xattr 判断是否是 zvfs 文件 */
|
||||
uint64_t blob_id = 0;
|
||||
int tmp_fd = real_open(path, O_RDONLY);
|
||||
if (tmp_fd < 0)
|
||||
return NULL;
|
||||
|
||||
int has_xattr = (zvfs_xattr_read_blob_id(tmp_fd, &blob_id) == 0);
|
||||
real_close(tmp_fd);
|
||||
|
||||
if (!has_xattr)
|
||||
return NULL;
|
||||
|
||||
/* 3. 查 inode_table(文件被另一个 fd 打开过) */
|
||||
pthread_mutex_lock(&g_fs.inode_mu);
|
||||
inode = inode_lookup(blob_id);
|
||||
pthread_mutex_unlock(&g_fs.inode_mu);
|
||||
|
||||
return inode; /* 可能仍为 NULL(从未打开过,纯 stat 调用) */
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* stat */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
stat(const char *path, struct stat *buf)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) {
|
||||
int r = real_stat(path, buf);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/* 先透传,拿到完整 stat(mode、ino、dev、nlink 等) */
|
||||
if (real_stat(path, buf) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct zvfs_inode *inode = zvfs_inode_by_path(path);
|
||||
if (inode)
|
||||
patch_stat(buf, inode);
|
||||
/*
|
||||
* inode 为 NULL:文件存在于 FS 但从未被 zvfs open,
|
||||
* 此时 st_size 来自真实文件(接近 0),
|
||||
* 这是合理的降级行为(文件尚未被写入 SPDK)。
|
||||
*/
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
stat64(const char *path, struct stat64 *buf)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) {
|
||||
int r = real_stat64(path, buf);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
if (real_stat64(path, buf) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct zvfs_inode *inode = zvfs_inode_by_path(path);
|
||||
if (inode)
|
||||
patch_stat64(buf, inode);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* fstat(最高频,pg 每次 read 前都调) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
fstat(int fd, struct stat *buf)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
/* 先透传:拿到 mode/ino/dev/nlink/blksize 等 */
|
||||
if (real_fstat(fd, buf) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (ZVFS_IN_HOOK()) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
struct zvfs_open_file *of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
|
||||
if (of) {
|
||||
zvfs_ensure_init();
|
||||
patch_stat(buf, of->inode);
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
fstat64(int fd, struct stat64 *buf)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
if (real_fstat64(fd, buf) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (ZVFS_IN_HOOK()) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
struct zvfs_open_file *of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
|
||||
if (of) {
|
||||
zvfs_ensure_init();
|
||||
patch_stat64(buf, of->inode);
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* lstat(symlink 不穿透;zvfs 不用 symlink,逻辑与 stat 相同) */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
lstat(const char *path, struct stat *buf)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) {
|
||||
int r = real_lstat(path, buf);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
if (real_lstat(path, buf) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct zvfs_inode *inode = zvfs_inode_by_path(path);
|
||||
if (inode)
|
||||
patch_stat(buf, inode);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
lstat64(const char *path, struct stat64 *buf)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) {
|
||||
int r = real_lstat64(path, buf);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
if (real_lstat64(path, buf) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct zvfs_inode *inode = zvfs_inode_by_path(path);
|
||||
if (inode)
|
||||
patch_stat64(buf, inode);
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* fstatat / fstatat64 */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
fstatat(int dirfd, const char *path, struct stat *buf, int flags)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
char abspath[PATH_MAX];
|
||||
int is_zvfs = 0;
|
||||
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) == 0)
|
||||
is_zvfs = zvfs_is_zvfs_path(abspath);
|
||||
}
|
||||
|
||||
if (real_fstatat(dirfd, path, buf, flags) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (is_zvfs) {
|
||||
zvfs_ensure_init();
|
||||
struct zvfs_inode *inode = zvfs_inode_by_path(abspath);
|
||||
if (inode)
|
||||
patch_stat(buf, inode);
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
fstatat64(int dirfd, const char *path, struct stat64 *buf, int flags)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
char abspath[PATH_MAX];
|
||||
int is_zvfs = 0;
|
||||
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) == 0)
|
||||
is_zvfs = zvfs_is_zvfs_path(abspath);
|
||||
}
|
||||
|
||||
if (real_fstatat64(dirfd, path, buf, flags) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (is_zvfs) {
|
||||
zvfs_ensure_init();
|
||||
struct zvfs_inode *inode = zvfs_inode_by_path(abspath);
|
||||
if (inode)
|
||||
patch_stat64(buf, inode);
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* statx */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
statx(int dirfd, const char *path, int flags,
|
||||
unsigned int mask, struct statx *buf)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
if (!real_statx) {
|
||||
errno = ENOSYS;
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
char abspath[PATH_MAX];
|
||||
int is_zvfs = 0;
|
||||
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) == 0)
|
||||
is_zvfs = zvfs_is_zvfs_path(abspath);
|
||||
}
|
||||
|
||||
if (real_statx(dirfd, path, flags, mask, buf) < 0) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!is_zvfs) {
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/* statx 用 stx_mask 标记哪些字段有效,覆盖 size/atime/mtime */
|
||||
struct zvfs_inode *inode = zvfs_inode_by_path(abspath);
|
||||
if (inode) {
|
||||
pthread_mutex_lock(&inode->mu);
|
||||
|
||||
if (mask & STATX_SIZE) {
|
||||
buf->stx_size = inode->logical_size;
|
||||
buf->stx_mask |= STATX_SIZE;
|
||||
/* stx_blocks 以 512 字节为单位 */
|
||||
buf->stx_blocks = (inode->logical_size + 511) / 512;
|
||||
buf->stx_mask |= STATX_BLOCKS;
|
||||
}
|
||||
if (mask & STATX_ATIME) {
|
||||
buf->stx_atime.tv_sec = inode->atime;
|
||||
buf->stx_atime.tv_nsec = 0;
|
||||
buf->stx_mask |= STATX_ATIME;
|
||||
}
|
||||
if (mask & STATX_MTIME) {
|
||||
buf->stx_mtime.tv_sec = inode->mtime;
|
||||
buf->stx_mtime.tv_nsec = 0;
|
||||
buf->stx_mask |= STATX_MTIME;
|
||||
}
|
||||
|
||||
pthread_mutex_unlock(&inode->mu);
|
||||
}
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
35
src/hook/zvfs_hook_stat.h
Normal file
35
src/hook/zvfs_hook_stat.h
Normal file
@@ -0,0 +1,35 @@
|
||||
#ifndef __ZVFS_HOOK_STAT_H__
|
||||
#define __ZVFS_HOOK_STAT_H__
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
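/*
 * Hooks for the stat family.
 *
 * Core strategy:
 *   For zvfs files, the real stat* call is made first to obtain most
 *   fields (ino, dev, nlink, mode, uid, gid, blksize, ...), and only the
 *   following fields are overridden:
 *     st_size   <- inode->logical_size
 *     st_atime  <- inode->atime
 *     st_mtime  <- inode->mtime
 *     st_blocks <- (logical_size + 511) / 512
 *
 *   st_blocks is recomputed from logical_size because the backing file is
 *   almost empty (it only carries an xattr) and would otherwise report a
 *   value close to 0.
 *   Upper layers (postgres/rocksdb) use st_size to determine the file
 *   size; it is the key field.
 *
 *   Non-zvfs files are passed through unchanged.
 */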
|
||||
|
||||
int stat(const char *path, struct stat *buf);
|
||||
int stat64(const char *path, struct stat64 *buf);
|
||||
int fstat(int fd, struct stat *buf);
|
||||
int fstat64(int fd, struct stat64 *buf);
|
||||
int lstat(const char *path, struct stat *buf);
|
||||
int lstat64(const char *path, struct stat64 *buf);
|
||||
int fstatat(int dirfd, const char *path, struct stat *buf, int flags);
|
||||
int fstatat64(int dirfd, const char *path, struct stat64 *buf, int flags);
|
||||
int statx(int dirfd, const char *path, int flags,
|
||||
unsigned int mask, struct statx *buf);
|
||||
|
||||
#endif // __ZVFS_HOOK_STAT_H__
|
||||
122
src/hook/zvfs_hook_sync.c
Normal file
122
src/hook/zvfs_hook_sync.c
Normal file
@@ -0,0 +1,122 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "zvfs_hook_sync.h"
|
||||
#include "zvfs_hook_init.h"
|
||||
#include "zvfs_hook_reentrant.h"
|
||||
#include "fs/zvfs.h"
|
||||
#include "fs/zvfs_open_file.h"
|
||||
#include "spdk_engine/io_engine.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <pthread.h>
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* fsync */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
fsync(int fd)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of = NULL;
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
}
|
||||
|
||||
if (!of) {
|
||||
int r = real_fsync(fd);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/*
|
||||
* zvfs 无写缓冲区,数据已在 blob_write 时落到 SPDK 存储。
|
||||
* 调用 blob_sync_md 确保 blob 元数据(size 等)持久化。
|
||||
*/
|
||||
int r = blob_sync_md(of->handle);
|
||||
if (r < 0) errno = EIO;
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return (r < 0) ? -1 : 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* fdatasync */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
fdatasync(int fd)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of = NULL;
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
}
|
||||
|
||||
if (!of) {
|
||||
int r = real_fdatasync(fd);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/*
|
||||
* fdatasync 只保证数据持久化,不要求元数据(atime 等)同步。
|
||||
* 对 zvfs:数据已无缓冲,blob_sync_md 同步 size 元数据即可。
|
||||
* 与 fsync 实现相同——如果将来区分数据/元数据可在此分叉。
|
||||
*/
|
||||
int r = blob_sync_md(of->handle);
|
||||
if (r < 0) errno = EIO;
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return (r < 0) ? -1 : 0;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* sync_file_range */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
int
|
||||
sync_file_range(int fd, off_t offset, off_t nbytes, unsigned int flags)
|
||||
{
|
||||
ZVFS_HOOK_ENTER();
|
||||
|
||||
struct zvfs_open_file *of = NULL;
|
||||
if (!ZVFS_IN_HOOK()) {
|
||||
pthread_mutex_lock(&g_fs.fd_mu);
|
||||
of = openfile_lookup(fd);
|
||||
pthread_mutex_unlock(&g_fs.fd_mu);
|
||||
}
|
||||
|
||||
if (!of) {
|
||||
int r = real_sync_file_range
|
||||
? real_sync_file_range(fd, offset, nbytes, flags)
|
||||
: (errno = ENOSYS, -1);
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return r;
|
||||
}
|
||||
|
||||
zvfs_ensure_init();
|
||||
|
||||
/*
|
||||
* PostgreSQL checkpointer 用此调用按范围刷脏页。
|
||||
* zvfs 无页缓存,数据实时落盘,直接返回 0。
|
||||
* 参数合法性检查与内核保持一致:
|
||||
* offset < 0 或 nbytes < 0 → EINVAL
|
||||
* flags 包含非法位 → EINVAL
|
||||
*/
|
||||
(void)offset; (void)nbytes; (void)flags;
|
||||
|
||||
ZVFS_HOOK_LEAVE();
|
||||
return 0;
|
||||
}
|
||||
24
src/hook/zvfs_hook_sync.h
Normal file
24
src/hook/zvfs_hook_sync.h
Normal file
@@ -0,0 +1,24 @@
|
||||
#ifndef __ZVFS_HOOK_SYNC_H__
|
||||
#define __ZVFS_HOOK_SYNC_H__
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
/*
|
||||
* zvfs 无写缓冲区:所有 blob_write 成功即代表数据已落到 SPDK 管理的存储。
|
||||
*
|
||||
* fsync / fdatasync:
|
||||
* 对 zvfs fd 调用 blob_sync_md 同步 blob 元数据(size 等),
|
||||
* 然后返回 0。不需要 flush 数据缓冲区。
|
||||
* 非 zvfs fd 透传。
|
||||
*
|
||||
* sync_file_range:
|
||||
* PostgreSQL checkpointer 按范围刷脏页。
|
||||
* zvfs 无页缓存,直接返回 0。
|
||||
* 非 zvfs fd 透传。
|
||||
*/
|
||||
|
||||
int fsync(int fd);
|
||||
int fdatasync(int fd);
|
||||
int sync_file_range(int fd, off_t offset, off_t nbytes, unsigned int flags);
|
||||
|
||||
#endif // __ZVFS_HOOK_SYNC_H__
|
||||
0
src/main.c
Normal file
0
src/main.c
Normal file
812
src/spdk_engine/io_engine.c
Normal file
812
src/spdk_engine/io_engine.c
Normal file
@@ -0,0 +1,812 @@
|
||||
#include "spdk_engine/io_engine.h"
|
||||
#include "config.h"
|
||||
#include "common/utils.h"
|
||||
|
||||
#include <spdk/event.h>
|
||||
#include <spdk/log.h>
|
||||
#include <spdk/bdev.h>
|
||||
#include <spdk/blob.h>
|
||||
#include <spdk/blob_bdev.h>
|
||||
#include <spdk/thread.h>
|
||||
#include <semaphore.h>
|
||||
#include <errno.h>
|
||||
#include <pthread.h>
|
||||
#include <string.h>
|
||||
|
||||
struct zvfs_spdk_io_engine g_engine = {0};
|
||||
static int g_engine_init_rc = -EAGAIN;
|
||||
static pthread_mutex_t g_super_blob_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||
static spdk_blob_id g_super_blob_id_cache = SPDK_BLOBID_INVALID;
|
||||
|
||||
static __thread struct zvfs_tls_ctx tls = {0};
|
||||
|
||||
/* Completion context for the initialisation steps. */

/* Passed to the JSON config loader; polled by wait_done(). */
struct json_load_ctx {
    bool done;  /* set once the load callback has run */
    int  rc;    /* result reported by the callback */
};
|
||||
|
||||
/* Completion context for blobstore initialisation. */
struct bs_init_ctx {
    bool done;                   /* completion flag */
    int  rc;                     /* result code */
    struct spdk_blob_store *bs;  /* blobstore delivered with the completion */
};
|
||||
|
||||
// metadata 操作通用上下文
|
||||
struct md_op_ctx {
|
||||
void (*fn)(struct md_op_ctx *ctx);
|
||||
volatile bool done;
|
||||
int rc;
|
||||
// op-specific fields
|
||||
union {
|
||||
struct { // for create
|
||||
uint64_t size_hint;
|
||||
spdk_blob_id blob_id;
|
||||
} create;
|
||||
struct { // for open
|
||||
spdk_blob_id blob_id;
|
||||
struct spdk_blob *blob;
|
||||
} open;
|
||||
struct { // for resize/sync/close
|
||||
struct zvfs_blob_handle *handle;
|
||||
uint64_t new_size; // for resize
|
||||
} handle_op;
|
||||
struct { // for delete
|
||||
spdk_blob_id blob_id;
|
||||
} delete;
|
||||
struct { // for get/set super
|
||||
spdk_blob_id blob_id;
|
||||
} super;
|
||||
};
|
||||
char *op_name;
|
||||
};
|
||||
|
||||
/* Completion context for a data I/O request. */
struct io_completion_ctx {
    bool done;  /* completion flag */
    int  rc;    /* result code */
};
|
||||
|
||||
// metadata poller 线程函数
|
||||
static void *md_poller_fn(void *arg) {
|
||||
spdk_set_thread(g_engine.md_thread);
|
||||
while (true) {
|
||||
spdk_thread_poll(g_engine.md_thread, 0, 0);
|
||||
usleep(1000);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// 前向声明
|
||||
static struct spdk_io_channel *get_current_channel(void);
|
||||
static int dispatch_md_op(struct md_op_ctx *ctx);
|
||||
static int dispatch_md_op_quiet(struct md_op_ctx *ctx);
|
||||
static void md_op_cb(void *arg);
|
||||
static int open_bdev_and_init_bs(const char *bdev_name);
|
||||
static int load_json_config(void);
|
||||
static int ensure_engine_ready(const char *op);
|
||||
|
||||
// callbacks
|
||||
static void json_app_load_done(int rc, void *arg);
|
||||
static void zvfs_spdk_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx);
|
||||
static void bs_init_cb(void *arg, struct spdk_blob_store *bs, int bserrno);
|
||||
static void blob_create_cb(void *arg, spdk_blob_id blobid, int rc);
|
||||
static void blob_open_cb(void *arg, struct spdk_blob *blob, int rc);
|
||||
static void blob_resize_cb(void *arg, int rc);
|
||||
static void blob_sync_md_cb(void *arg, int rc);
|
||||
static void blob_close_cb(void *arg, int rc);
|
||||
static void blob_delete_cb(void *arg, int rc);
|
||||
static void io_completion_cb(void *arg, int rc);
|
||||
static void blob_get_super_cb(void *arg, spdk_blob_id blobid, int rc);
|
||||
static void blob_set_super_cb(void *arg, int rc);
|
||||
|
||||
// op functions on metadata
|
||||
static void blob_create_on_md(struct md_op_ctx *ctx);
|
||||
static void blob_open_on_md(struct md_op_ctx *ctx);
|
||||
static void blob_resize_on_md(struct md_op_ctx *ctx);
|
||||
static void blob_sync_md_on_md(struct md_op_ctx *ctx);
|
||||
static void blob_close_on_md(struct md_op_ctx *ctx);
|
||||
static void blob_delete_on_md(struct md_op_ctx *ctx);
|
||||
static void blob_get_super_on_md(struct md_op_ctx *ctx);
|
||||
static void blob_set_super_on_md(struct md_op_ctx *ctx);
|
||||
|
||||
__attribute__((constructor)) static void preload_init(void) {
|
||||
const char *auto_init = getenv("ZVFS_AUTO_INIT");
|
||||
if (!auto_init || strcmp(auto_init, "1") != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
printf("\n\n auto init \n\n");
|
||||
const char *bdev_name = getenv("SPDK_BDEV_NAME") ? getenv("SPDK_BDEV_NAME") : ZVFS_BDEV;
|
||||
g_engine_init_rc = io_engine_init(bdev_name);
|
||||
if (g_engine_init_rc != 0) {
|
||||
SPDK_ERRLOG("io_engine_init failed in constructor: %d\n", g_engine_init_rc);
|
||||
}
|
||||
}
|
||||
|
||||
static int wait_done(bool *done_ptr, int *rc_ptr, const char *op) {
|
||||
int iter = 0;
|
||||
while (!*done_ptr) {
|
||||
if (tls.thread) {
|
||||
spdk_thread_poll(tls.thread, 0, 0);
|
||||
}else{
|
||||
SPDK_ERRLOG("not init tls.thread\n");
|
||||
return -EBADE;
|
||||
}
|
||||
if (++iter > WAITER_MAX_TIME) {
|
||||
SPDK_ERRLOG("%s timeout\n", op);
|
||||
return -ETIMEDOUT;
|
||||
}
|
||||
}
|
||||
|
||||
if (*rc_ptr != 0) {
|
||||
SPDK_ERRLOG("%s failed in callback: %d\n", op, *rc_ptr);
|
||||
return *rc_ptr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int wait_done_volatile(volatile bool *done_ptr, int *rc_ptr, const char *op) {
|
||||
int iter = 0;
|
||||
while (!*done_ptr) {
|
||||
if (tls.thread) {
|
||||
spdk_thread_poll(tls.thread, 0, 0);
|
||||
}else{
|
||||
SPDK_ERRLOG("not init tls.thread\n");
|
||||
return -EBADE;
|
||||
}
|
||||
if (++iter > WAITER_MAX_TIME) {
|
||||
SPDK_ERRLOG("%s timeout\n", op);
|
||||
return -ETIMEDOUT;
|
||||
}
|
||||
}
|
||||
|
||||
if (*rc_ptr != 0) {
|
||||
SPDK_ERRLOG("%s failed in callback: %d\n", op, *rc_ptr);
|
||||
return *rc_ptr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// no rc error
|
||||
static int wait_done_volatile_quiet(volatile bool *done_ptr, int *rc_ptr, const char *op) {
|
||||
int iter = 0;
|
||||
while (!*done_ptr) {
|
||||
if (tls.thread) {
|
||||
spdk_thread_poll(tls.thread, 0, 0);
|
||||
} else {
|
||||
SPDK_ERRLOG("not init tls.thread\n");
|
||||
return -EBADE;
|
||||
}
|
||||
if (++iter > WAITER_MAX_TIME) {
|
||||
SPDK_ERRLOG("%s timeout\n", op);
|
||||
return -ETIMEDOUT;
|
||||
}
|
||||
}
|
||||
|
||||
return *rc_ptr;
|
||||
}
|
||||
|
||||
int io_engine_init(const char *bdev_name) {
|
||||
if (g_engine_init_rc == 0 && g_engine.bs != NULL && g_engine.md_thread != NULL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct spdk_env_opts env_opts;
|
||||
spdk_env_opts_init(&env_opts);
|
||||
env_opts.name = "zvfs";
|
||||
|
||||
|
||||
if (spdk_env_init(&env_opts) != 0) {
|
||||
SPDK_ERRLOG("spdk_env_init failed\n");
|
||||
g_engine_init_rc = -1;
|
||||
return g_engine_init_rc;
|
||||
}
|
||||
|
||||
spdk_log_set_print_level(SPDK_LOG_NOTICE);
|
||||
spdk_log_set_level(SPDK_LOG_NOTICE);
|
||||
spdk_log_open(NULL);
|
||||
|
||||
if (spdk_thread_lib_init(NULL, 0) != 0) {
|
||||
SPDK_ERRLOG("spdk_thread_lib_init failed\n");
|
||||
g_engine_init_rc = -1;
|
||||
return g_engine_init_rc;
|
||||
}
|
||||
|
||||
// 为主线程 lazy init(constructor 在主线程跑)
|
||||
tls.thread = spdk_thread_create("main_thread", NULL);
|
||||
if (!tls.thread) {
|
||||
SPDK_ERRLOG("create main_thread failed\n");
|
||||
g_engine_init_rc = -1;
|
||||
return g_engine_init_rc;
|
||||
}
|
||||
spdk_set_thread(tls.thread);
|
||||
|
||||
if (load_json_config() != 0) {
|
||||
SPDK_ERRLOG("Failed to load SPDK config\n");
|
||||
g_engine_init_rc = -1;
|
||||
return g_engine_init_rc;
|
||||
}
|
||||
|
||||
/**
|
||||
* 这里是因为要让一个线程专门负责poll
|
||||
*/
|
||||
// 创建 md_thread
|
||||
g_engine.md_thread = spdk_thread_create("md_thread", NULL);
|
||||
if (!g_engine.md_thread) {
|
||||
SPDK_ERRLOG("create md_thread failed\n");
|
||||
g_engine_init_rc = -1;
|
||||
return g_engine_init_rc;
|
||||
}
|
||||
|
||||
// 起专用 poller pthread for md_thread
|
||||
pthread_t md_poller_tid;
|
||||
if (pthread_create(&md_poller_tid, NULL, md_poller_fn, NULL) != 0) {
|
||||
SPDK_ERRLOG("pthread_create for md_poller failed\n");
|
||||
g_engine_init_rc = -1;
|
||||
return g_engine_init_rc;
|
||||
}
|
||||
if (pthread_detach(md_poller_tid) != 0) {
|
||||
SPDK_ERRLOG("pthread_detach for md_poller failed\n");
|
||||
g_engine_init_rc = -1;
|
||||
return g_engine_init_rc;
|
||||
}
|
||||
|
||||
// init bdev/bs
|
||||
g_super_blob_id_cache = SPDK_BLOBID_INVALID;
|
||||
int rc = open_bdev_and_init_bs(bdev_name);
|
||||
if (rc != 0) {
|
||||
g_engine_init_rc = rc;
|
||||
return rc;
|
||||
}
|
||||
g_engine_init_rc = 0;
|
||||
return g_engine_init_rc;
|
||||
}
|
||||
|
||||
static int load_json_config(void) {
|
||||
const char *path = getenv("SPDK_JSON_CONFIG");
|
||||
if(!path) path = SPDK_JSON_PATH;
|
||||
|
||||
|
||||
struct json_load_ctx ctx = {
|
||||
.done = false,
|
||||
.rc = 0
|
||||
};
|
||||
spdk_subsystem_init_from_json_config(path, SPDK_DEFAULT_RPC_ADDR, json_app_load_done,
|
||||
&ctx, true);
|
||||
return wait_done(&ctx.done, &ctx.rc, "load_json_config");
|
||||
}
|
||||
|
||||
// lazy get channel
|
||||
static struct spdk_io_channel *get_current_channel(void) {
|
||||
if (ensure_engine_ready("get_current_channel") != 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (tls.thread) {
|
||||
spdk_thread_poll(tls.thread, 0, 0);
|
||||
}
|
||||
|
||||
if (!tls.thread) {
|
||||
char name[32];
|
||||
snprintf(name, sizeof(name), "worker_%lu", pthread_self());
|
||||
tls.thread = spdk_thread_create(name, NULL);
|
||||
if (!tls.thread) {
|
||||
SPDK_ERRLOG("spdk_thread_create failed\n");
|
||||
return NULL;
|
||||
}
|
||||
spdk_set_thread(tls.thread);
|
||||
}
|
||||
|
||||
if (!tls.channel) {
|
||||
tls.channel = spdk_bs_alloc_io_channel(g_engine.bs);
|
||||
if (!tls.channel) {
|
||||
SPDK_ERRLOG("alloc io_channel failed\n");
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
return tls.channel;
|
||||
}
|
||||
|
||||
// 通用 dispatch md op
|
||||
static int dispatch_md_op(struct md_op_ctx *ctx) {
|
||||
int rc = ensure_engine_ready(ctx->op_name ? ctx->op_name : "dispatch_md_op");
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
ctx->done = false;
|
||||
ctx->rc = 0;
|
||||
|
||||
spdk_thread_send_msg(g_engine.md_thread, md_op_cb, ctx);
|
||||
|
||||
return wait_done_volatile(&ctx->done, &ctx->rc, ctx->op_name);
|
||||
}
|
||||
|
||||
static int dispatch_md_op_quiet(struct md_op_ctx *ctx) {
|
||||
int rc = ensure_engine_ready(ctx->op_name ? ctx->op_name : "dispatch_md_op_quiet");
|
||||
if (rc != 0) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
ctx->done = false;
|
||||
ctx->rc = 0;
|
||||
|
||||
spdk_thread_send_msg(g_engine.md_thread, md_op_cb, ctx);
|
||||
|
||||
return wait_done_volatile_quiet(&ctx->done, &ctx->rc, ctx->op_name);
|
||||
}
|
||||
|
||||
static int ensure_engine_ready(const char *op) {
|
||||
if (g_engine_init_rc != 0) {
|
||||
SPDK_ERRLOG("%s: io engine init failed, rc=%d\n", op, g_engine_init_rc);
|
||||
return g_engine_init_rc;
|
||||
}
|
||||
|
||||
if (!g_engine.bs || !g_engine.md_thread) {
|
||||
SPDK_ERRLOG("%s: io engine not ready (bs=%p, md_thread=%p)\n",
|
||||
op, (void *)g_engine.bs, (void *)g_engine.md_thread);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void md_op_cb(void *arg) {
|
||||
struct md_op_ctx *ctx = arg;
|
||||
ctx->fn(ctx);
|
||||
}
|
||||
|
||||
void json_app_load_done(int rc, void *arg) {
|
||||
struct json_load_ctx* ctx = (struct json_load_ctx*)arg;
|
||||
ctx->done = true;
|
||||
ctx->rc = rc;
|
||||
}
|
||||
|
||||
// bdev open + bs init
|
||||
static void zvfs_spdk_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
|
||||
void *event_ctx) {
|
||||
// 后续加日志或处理
|
||||
switch (type) {
|
||||
case SPDK_BDEV_EVENT_REMOVE:
|
||||
SPDK_NOTICELOG("bdev removed: %s\n", spdk_bdev_get_name(bdev));
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void bs_init_cb(void *arg, struct spdk_blob_store *bs, int bserrno) {
|
||||
struct bs_init_ctx *ctx = (struct bs_init_ctx *)arg;
|
||||
ctx->rc = bserrno;
|
||||
ctx->bs = bs;
|
||||
ctx->done = true;
|
||||
}
|
||||
|
||||
static int open_bdev_and_init_bs(const char *bdev_name) {
|
||||
SPDK_NOTICELOG("open_bdev_and_init_bs\n");
|
||||
struct spdk_bs_dev *bs_dev = NULL;
|
||||
int rc = spdk_bdev_create_bs_dev_ext(bdev_name, zvfs_spdk_bdev_event_cb, NULL, &bs_dev);
|
||||
if (rc != 0) {
|
||||
SPDK_ERRLOG("spdk_bdev_create_bs_dev_ext failed: %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
g_engine.bs_dev = bs_dev;
|
||||
|
||||
struct bs_init_ctx ctx = {
|
||||
.done = false,
|
||||
.rc = 0,
|
||||
.bs = NULL
|
||||
};
|
||||
|
||||
/* 优先加载已有 blobstore;失败时回退到 init。 */
|
||||
spdk_bs_load(bs_dev, NULL, bs_init_cb, &ctx);
|
||||
rc = wait_done(&ctx.done, &ctx.rc, "bs_load");
|
||||
if (rc != 0) {
|
||||
SPDK_NOTICELOG("spdk_bs_load failed (%d), fallback to spdk_bs_init\n", rc);
|
||||
|
||||
/*
|
||||
* 注意:spdk_bs_load 失败路径会销毁传入的 dev。
|
||||
* 这里必须重新 create 一个新的 bs_dev,不能复用旧指针。
|
||||
*/
|
||||
bs_dev = NULL;
|
||||
rc = spdk_bdev_create_bs_dev_ext(bdev_name, zvfs_spdk_bdev_event_cb, NULL, &bs_dev);
|
||||
if (rc != 0) {
|
||||
SPDK_ERRLOG("spdk_bdev_create_bs_dev_ext(for init fallback) failed: %d\n", rc);
|
||||
g_engine.bs_dev = NULL;
|
||||
return rc;
|
||||
}
|
||||
g_engine.bs_dev = bs_dev;
|
||||
|
||||
ctx.done = false;
|
||||
ctx.rc = 0;
|
||||
ctx.bs = NULL;
|
||||
|
||||
spdk_bs_init(bs_dev, NULL, bs_init_cb, &ctx);
|
||||
rc = wait_done(&ctx.done, &ctx.rc, "bs_init");
|
||||
if (rc != 0) {
|
||||
g_engine.bs_dev = NULL;
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
g_engine.bs = ctx.bs;
|
||||
g_engine.io_unit_size = spdk_bs_get_io_unit_size(ctx.bs);
|
||||
g_engine.cluster_size = spdk_bs_get_cluster_size(ctx.bs);
|
||||
|
||||
SPDK_NOTICELOG("Blobstore initialized successfully on bdev: %s\n", bdev_name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Completion callback for spdk_bs_get_super(): stores status and super blob id, then signals done. */
static void blob_get_super_cb(void *arg, spdk_blob_id blobid, int rc) {
    struct md_op_ctx *op = (struct md_op_ctx *)arg;

    op->rc = rc;
    op->super.blob_id = blobid;
    op->done = true;
}
|
||||
|
||||
static void blob_set_super_cb(void *arg, int rc) {
|
||||
struct md_op_ctx *ctx = arg;
|
||||
ctx->rc = rc;
|
||||
ctx->done = true;
|
||||
}
|
||||
|
||||
static void blob_get_super_on_md(struct md_op_ctx *ctx) {
|
||||
spdk_bs_get_super(g_engine.bs, blob_get_super_cb, ctx);
|
||||
}
|
||||
|
||||
static void blob_set_super_on_md(struct md_op_ctx *ctx) {
|
||||
spdk_bs_set_super(g_engine.bs, ctx->super.blob_id, blob_set_super_cb, ctx);
|
||||
}
|
||||
|
||||
/*
 * Fetch the blobstore's super blob id through md_thread (quiet dispatch, so
 * a non-zero callback result is not logged by the wait helper).
 * On success writes the id to *blob_id and returns 0; on failure returns
 * the dispatch error and leaves *blob_id untouched.
 */
static int bs_get_super_id(spdk_blob_id *blob_id) {
    struct md_op_ctx op = {
        .fn = blob_get_super_on_md,
        .op_name = "blob get super",
    };
    op.super.blob_id = SPDK_BLOBID_INVALID;

    int status = dispatch_md_op_quiet(&op);
    if (status == 0) {
        *blob_id = op.super.blob_id;
        return 0;
    }
    return status;
}
|
||||
|
||||
/* Register `blob_id` as the blobstore's super blob via md_thread; returns the dispatch result. */
static int bs_set_super_id(spdk_blob_id blob_id) {
    struct md_op_ctx op = {
        .fn = blob_set_super_on_md,
        .op_name = "blob set super",
    };

    op.super.blob_id = blob_id;
    return dispatch_md_op(&op);
}
|
||||
|
||||
struct zvfs_blob_handle *blob_get_super(void) {
|
||||
pthread_mutex_lock(&g_super_blob_mutex);
|
||||
|
||||
if (g_super_blob_id_cache != SPDK_BLOBID_INVALID) {
|
||||
struct zvfs_blob_handle *cached = blob_open(g_super_blob_id_cache);
|
||||
if (cached) {
|
||||
pthread_mutex_unlock(&g_super_blob_mutex);
|
||||
return cached;
|
||||
}
|
||||
g_super_blob_id_cache = SPDK_BLOBID_INVALID;
|
||||
}
|
||||
|
||||
spdk_blob_id super_id = SPDK_BLOBID_INVALID;
|
||||
int rc = bs_get_super_id(&super_id);
|
||||
if (rc == 0 && super_id != SPDK_BLOBID_INVALID) {
|
||||
g_super_blob_id_cache = super_id;
|
||||
struct zvfs_blob_handle *existing = blob_open(super_id);
|
||||
if (!existing) {
|
||||
g_super_blob_id_cache = SPDK_BLOBID_INVALID;
|
||||
}
|
||||
pthread_mutex_unlock(&g_super_blob_mutex);
|
||||
return existing;
|
||||
}
|
||||
if (rc == 0 && super_id == SPDK_BLOBID_INVALID) {
|
||||
rc = -ENOENT;
|
||||
}
|
||||
|
||||
if (rc != -ENOENT) {
|
||||
SPDK_ERRLOG("spdk_bs_get_super failed: %d\n", rc);
|
||||
pthread_mutex_unlock(&g_super_blob_mutex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct zvfs_blob_handle *created = blob_create(0);
|
||||
if (!created) {
|
||||
pthread_mutex_unlock(&g_super_blob_mutex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
rc = bs_set_super_id(created->id);
|
||||
if (rc != 0) {
|
||||
spdk_blob_id created_id = created->id;
|
||||
SPDK_ERRLOG("spdk_bs_set_super failed: %d\n", rc);
|
||||
blob_close(created);
|
||||
blob_delete(created_id);
|
||||
pthread_mutex_unlock(&g_super_blob_mutex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
g_super_blob_id_cache = created->id;
|
||||
pthread_mutex_unlock(&g_super_blob_mutex);
|
||||
return created;
|
||||
}
|
||||
|
||||
// blob_create
|
||||
/* Completion callback for spdk_bs_create_blob_ext(): stores status and new blob id, then signals done. */
static void blob_create_cb(void *arg, spdk_blob_id blobid, int rc) {
    struct md_op_ctx *op = (struct md_op_ctx *)arg;

    op->rc = rc;
    op->create.blob_id = blobid;
    op->done = true;
}
|
||||
|
||||
static void blob_create_on_md(struct md_op_ctx *ctx) {
|
||||
struct spdk_blob_opts opts;
|
||||
spdk_blob_opts_init(&opts, sizeof(opts));
|
||||
// size_hint 如果需,但 create 不直接 set size,用 resize 后
|
||||
spdk_bs_create_blob_ext(g_engine.bs, &opts, blob_create_cb, ctx);
|
||||
}
|
||||
|
||||
struct zvfs_blob_handle *blob_create(uint64_t size_hint) {
|
||||
if(size_hint == 0) size_hint = g_engine.cluster_size;
|
||||
struct md_op_ctx ctx = {.fn = blob_create_on_md, .create.size_hint = size_hint, .op_name = "blob create"};
|
||||
int rc = dispatch_md_op(&ctx);
|
||||
if (rc) return NULL;
|
||||
|
||||
struct zvfs_blob_handle *handle = blob_open(ctx.create.blob_id);
|
||||
if (handle && size_hint > 0) {
|
||||
rc = blob_resize(handle, size_hint); // 初始 resize
|
||||
if (rc != 0) {
|
||||
SPDK_ERRLOG("blob_resize failed after create: %d\n", rc);
|
||||
blob_close(handle);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
rc = blob_sync_md(handle);
|
||||
if (rc != 0) {
|
||||
SPDK_ERRLOG("blob_sync_md failed after resize: %d\n", rc);
|
||||
blob_close(handle);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
return handle;
|
||||
}
|
||||
|
||||
// blob_open
|
||||
static void blob_open_cb(void *arg, struct spdk_blob *blob, int rc) {
|
||||
struct md_op_ctx *ctx = arg;
|
||||
ctx->rc = rc;
|
||||
ctx->open.blob = blob;
|
||||
ctx->done = true;
|
||||
}
|
||||
|
||||
static void blob_open_on_md(struct md_op_ctx *ctx) {
|
||||
struct spdk_blob_open_opts opts;
|
||||
spdk_blob_open_opts_init(&opts, sizeof(opts));
|
||||
spdk_bs_open_blob_ext(g_engine.bs, ctx->open.blob_id, &opts, blob_open_cb, ctx);
|
||||
}
|
||||
|
||||
struct zvfs_blob_handle *blob_open(uint64_t blob_id) {
|
||||
struct md_op_ctx ctx = {.fn = blob_open_on_md, .open.blob_id = blob_id, .op_name = "blob open"};
|
||||
int rc = dispatch_md_op(&ctx);
|
||||
if (rc) return NULL;
|
||||
|
||||
struct zvfs_blob_handle *handle = malloc(sizeof(*handle));
|
||||
if (!handle) return NULL;
|
||||
|
||||
handle->id = blob_id;
|
||||
handle->blob = ctx.open.blob;
|
||||
handle->size = spdk_blob_get_num_clusters(handle->blob) * g_engine.cluster_size;
|
||||
|
||||
// 预分配固定大小的 DMA buf,后续所有 IO 都经过这块缓存,避免每次 IO 动态申请
|
||||
// 必须用 spdk_dma_malloc 保证地址对齐到 io_unit_size
|
||||
handle->dma_buf_size = ZVFS_DMA_BUF_SIZE;
|
||||
handle->dma_buf = spdk_dma_malloc(ZVFS_DMA_BUF_SIZE, g_engine.io_unit_size, NULL);
|
||||
if (!handle->dma_buf) {
|
||||
SPDK_ERRLOG("spdk_dma_malloc failed for blob %lu\n", blob_id);
|
||||
free(handle);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
// blob_write
|
||||
static void io_completion_cb(void *arg, int rc) {
|
||||
struct io_completion_ctx *ctx = arg;
|
||||
ctx->rc = rc;
|
||||
ctx->done = true;
|
||||
}
|
||||
|
||||
int blob_write(struct zvfs_blob_handle *handle, uint64_t offset, const void *buf, size_t len) {
|
||||
if (tls.thread) {
|
||||
spdk_thread_poll(tls.thread, 0, 0);
|
||||
}
|
||||
|
||||
struct spdk_io_channel *ch = get_current_channel();
|
||||
if (!ch) return -1;
|
||||
if (len == 0) return 0;
|
||||
|
||||
// 越界检查
|
||||
if (offset + len > handle->size) {
|
||||
SPDK_ERRLOG("blob_write out of range: offset=%lu len=%zu blob_size=%lu\n",
|
||||
offset, len, handle->size);
|
||||
return -ERANGE;
|
||||
}
|
||||
|
||||
// 计算对齐后的 IO 范围和 dma_buf 内偏移
|
||||
uint64_t lba_off = 0;
|
||||
uint64_t lba_len = 0;
|
||||
uint32_t buf_off = 0;
|
||||
int rc = zvfs_calc_io_units(offset, len, g_engine.io_unit_size, &lba_off, &lba_len, &buf_off);
|
||||
if (rc != 0) {
|
||||
SPDK_ERRLOG("blob_write calc_io_units failed: %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
size_t aligned_bytes = lba_len * g_engine.io_unit_size;
|
||||
if (aligned_bytes > ZVFS_DMA_BUF_SIZE) {
|
||||
SPDK_ERRLOG("blob_write aligned_bytes=%zu exceeds ZVFS_DMA_BUF_SIZE\n", aligned_bytes);
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
struct io_completion_ctx io_ctx = {.done = false, .rc = 0};
|
||||
|
||||
spdk_blob_io_read(handle->blob, ch, handle->dma_buf, lba_off, lba_len,
|
||||
io_completion_cb, &io_ctx);
|
||||
|
||||
|
||||
rc = wait_done(&io_ctx.done, &io_ctx.rc, "io_write(read phase)");
|
||||
if (rc != 0) return rc;
|
||||
|
||||
memcpy((uint8_t *)handle->dma_buf + buf_off, buf, len);
|
||||
io_ctx.done = false;
|
||||
io_ctx.rc = 0;
|
||||
|
||||
spdk_blob_io_write(handle->blob, ch, handle->dma_buf, lba_off, lba_len,
|
||||
io_completion_cb, &io_ctx);
|
||||
rc = wait_done(&io_ctx.done, &io_ctx.rc, "io_write(write phase)");
|
||||
if (rc != 0) return rc;
|
||||
|
||||
return io_ctx.rc;
|
||||
}
|
||||
|
||||
// blob_read 类似
|
||||
int blob_read(struct zvfs_blob_handle *handle, uint64_t offset, void *buf, size_t len) {
|
||||
if (tls.thread) {
|
||||
spdk_thread_poll(tls.thread, 0, 0);
|
||||
}
|
||||
|
||||
struct spdk_io_channel *ch = get_current_channel();
|
||||
if (!ch) return -1;
|
||||
if (len == 0) return 0;
|
||||
|
||||
// 越界检查
|
||||
if (offset + len > handle->size) {
|
||||
SPDK_ERRLOG("blob_read out of range: offset=%lu len=%zu blob_size=%lu\n",
|
||||
offset, len, handle->size);
|
||||
return -ERANGE;
|
||||
}
|
||||
|
||||
|
||||
// 计算对齐后的 IO 范围和 dma_buf 内偏移
|
||||
uint64_t lba_off = 0;
|
||||
uint64_t lba_len = 0;
|
||||
uint32_t buf_off = 0;
|
||||
int rc = zvfs_calc_io_units(offset, len, g_engine.io_unit_size, &lba_off, &lba_len, &buf_off);
|
||||
if (rc != 0) {
|
||||
SPDK_ERRLOG("io_read offset/len not aligned to io_unit_size=%lu\n", g_engine.io_unit_size);
|
||||
return rc;
|
||||
}
|
||||
|
||||
// 读入对齐范围到 dma_buf,再从正确偏移处截取到用户 buf
|
||||
size_t aligned_bytes = lba_len * g_engine.io_unit_size;
|
||||
if (aligned_bytes > ZVFS_DMA_BUF_SIZE) {
|
||||
SPDK_ERRLOG("blob_read aligned_bytes=%zu exceeds ZVFS_DMA_BUF_SIZE\n", aligned_bytes);
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
struct io_completion_ctx io_ctx = {.done = false, .rc = 0};
|
||||
|
||||
spdk_blob_io_read(handle->blob, ch, handle->dma_buf, lba_off, lba_len,
|
||||
io_completion_cb, &io_ctx);
|
||||
|
||||
rc = wait_done(&io_ctx.done, &io_ctx.rc, "io_read");
|
||||
if (rc != 0) return rc;
|
||||
|
||||
memcpy(buf, (uint8_t *)handle->dma_buf + buf_off, len);
|
||||
return io_ctx.rc;
|
||||
}
|
||||
|
||||
// blob_resize
|
||||
static void blob_resize_cb(void *arg, int rc) {
|
||||
struct md_op_ctx *ctx = arg;
|
||||
ctx->rc = rc;
|
||||
ctx->done = true;
|
||||
}
|
||||
|
||||
static void blob_resize_on_md(struct md_op_ctx *ctx) {
|
||||
uint64_t new_clusters = 0;
|
||||
uint64_t cluster_size = g_engine.cluster_size;
|
||||
int rc = zvfs_calc_ceil_units(ctx->handle_op.new_size, cluster_size, &new_clusters);
|
||||
if (rc != 0) {
|
||||
ctx->rc = rc;
|
||||
ctx->done = true;
|
||||
return;
|
||||
}
|
||||
spdk_blob_resize(ctx->handle_op.handle->blob, new_clusters, blob_resize_cb, ctx);
|
||||
}
|
||||
|
||||
int blob_resize(struct zvfs_blob_handle *handle, uint64_t new_size) {
|
||||
struct md_op_ctx ctx = {.fn = blob_resize_on_md, .op_name = "blob resize"};
|
||||
ctx.handle_op.handle = handle;
|
||||
ctx.handle_op.new_size = new_size;
|
||||
int rc = dispatch_md_op(&ctx);
|
||||
if (rc == 0) {
|
||||
uint64_t new_clusters = 0;
|
||||
zvfs_calc_ceil_units(new_size, g_engine.cluster_size, &new_clusters);
|
||||
handle->size = new_clusters * g_engine.cluster_size;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
// blob_sync_md
|
||||
static void blob_sync_md_cb(void *arg, int rc) {
|
||||
struct md_op_ctx *ctx = arg;
|
||||
ctx->rc = rc;
|
||||
ctx->done = true;
|
||||
}
|
||||
|
||||
static void blob_sync_md_on_md(struct md_op_ctx *ctx) {
|
||||
spdk_blob_sync_md(ctx->handle_op.handle->blob, blob_sync_md_cb, ctx);
|
||||
}
|
||||
|
||||
int blob_sync_md(struct zvfs_blob_handle *handle) {
|
||||
struct md_op_ctx ctx = {.fn = blob_sync_md_on_md, .op_name = "blob sync"};
|
||||
ctx.handle_op.handle = handle;
|
||||
return dispatch_md_op(&ctx);
|
||||
}
|
||||
|
||||
// blob_close
|
||||
static void blob_close_cb(void *arg, int rc) {
|
||||
struct md_op_ctx *ctx = arg;
|
||||
ctx->rc = rc;
|
||||
ctx->done = true;
|
||||
}
|
||||
|
||||
static void blob_close_on_md(struct md_op_ctx *ctx) {
|
||||
spdk_blob_close(ctx->handle_op.handle->blob, blob_close_cb, ctx);
|
||||
}
|
||||
|
||||
int blob_close(struct zvfs_blob_handle *handle) {
|
||||
struct md_op_ctx ctx = {.fn = blob_close_on_md, .op_name = "blob close"};
|
||||
ctx.handle_op.handle = handle;
|
||||
int rc = dispatch_md_op(&ctx);
|
||||
if (rc == 0) {
|
||||
spdk_dma_free(handle->dma_buf);
|
||||
free(handle);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
// blob_delete
|
||||
static void blob_delete_cb(void *arg, int rc) {
|
||||
struct md_op_ctx *ctx = arg;
|
||||
ctx->rc = rc;
|
||||
ctx->done = true;
|
||||
}
|
||||
|
||||
static void blob_delete_on_md(struct md_op_ctx *ctx) {
|
||||
spdk_bs_delete_blob(g_engine.bs, ctx->delete.blob_id, blob_delete_cb, ctx);
|
||||
}
|
||||
|
||||
int blob_delete(uint64_t blob_id) {
|
||||
struct md_op_ctx ctx = {.fn = blob_delete_on_md, .op_name = "blob delete"};
|
||||
ctx.delete.blob_id = blob_id;
|
||||
return dispatch_md_op(&ctx);
|
||||
}
|
||||
44
src/spdk_engine/io_engine.h
Normal file
44
src/spdk_engine/io_engine.h
Normal file
@@ -0,0 +1,44 @@
|
||||
#ifndef __ZVFS_IO_ENGINE_H__
|
||||
#define __ZVFS_IO_ENGINE_H__
|
||||
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
#include <spdk/blob.h>
|
||||
|
||||
// blob_handle 结构体:底层 blob 信息,不含文件级 size(上层维护)
|
||||
typedef struct zvfs_blob_handle {
|
||||
spdk_blob_id id;
|
||||
struct spdk_blob *blob;
|
||||
uint64_t size;
|
||||
void *dma_buf;
|
||||
uint64_t dma_buf_size;
|
||||
} zvfs_blob_handle_t ;
|
||||
|
||||
/* Process-wide engine state filled in by io_engine_init(). */
typedef struct zvfs_spdk_io_engine {
    struct spdk_bs_dev *bs_dev;       /* blobstore device created on the bdev */
    struct spdk_blob_store *bs;       /* loaded or newly initialized blobstore */
    struct spdk_thread *md_thread;    /* SPDK thread that runs all metadata operations */
    uint64_t io_unit_size;            /* from spdk_bs_get_io_unit_size() */
    uint64_t cluster_size;            /* from spdk_bs_get_cluster_size() */
    int reactor_count;                /* NOTE(review): purpose not visible in this file — confirm */
} zvfs_spdk_io_engine_t;
|
||||
|
||||
/* Per-thread SPDK context, created lazily on first I/O from a thread. */
typedef struct zvfs_tls_ctx {
    struct spdk_thread *thread;       /* SPDK thread bound to the calling pthread */
    struct spdk_io_channel *channel;  /* blobstore I/O channel allocated for that thread */
} zvfs_tls_ctx_t;
|
||||
|
||||
/* Initialize SPDK and the blobstore on `bdev_name`. Returns 0 on success, non-zero on failure. */
int io_engine_init(const char *bdev_name);

/* Open the super blob, creating and registering it if none exists. NULL on failure. */
struct zvfs_blob_handle *blob_get_super(void);

/* Create a blob and open it; returns the handle, or NULL on failure. */
struct zvfs_blob_handle *blob_create(uint64_t size_hint);

/* Open an existing blob; returns the handle, or NULL on failure. */
struct zvfs_blob_handle *blob_open(uint64_t blob_id);

/* Write `len` bytes from `buf` at byte `offset`. Returns 0 on success, non-zero on failure. */
int blob_write(struct zvfs_blob_handle *handle, uint64_t offset, const void *buf, size_t len);

/* Read `len` bytes at byte `offset` into `buf`. Returns 0 on success, non-zero on failure. */
int blob_read(struct zvfs_blob_handle *handle, uint64_t offset, void *buf, size_t len);

/* Resize the blob to hold at least `new_size` bytes. Returns 0 on success. */
int blob_resize(struct zvfs_blob_handle *handle, uint64_t new_size);

/* Sync the blob's metadata. Returns 0 on success. */
int blob_sync_md(struct zvfs_blob_handle *handle);

/* Close the blob behind this handle; on success the handle is freed. */
int blob_close(struct zvfs_blob_handle *handle);

/* Delete the whole blob by id (no handle required). Returns 0 on success. */
int blob_delete(uint64_t blob_id);
|
||||
|
||||
#endif // __ZVFS_IO_ENGINE_H__
|
||||
17
src/zvfsmalloc.json
Executable file
17
src/zvfsmalloc.json
Executable file
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"subsystems": [
|
||||
{
|
||||
"subsystem": "bdev",
|
||||
"config": [
|
||||
{
|
||||
"method": "bdev_malloc_create",
|
||||
"params": {
|
||||
"name": "Malloc0",
|
||||
"num_blocks": 32768,
|
||||
"block_size": 512
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -1,43 +0,0 @@
|
||||
CC ?= gcc
|
||||
CFLAGS ?= -O2 -Wall -Wextra -std=gnu11
|
||||
|
||||
SRCS := $(sort $(wildcard test_*.c))
|
||||
BIN_DIR ?= bin
|
||||
BIN_NAMES := $(SRCS:.c=)
|
||||
BINS := $(addprefix $(BIN_DIR)/,$(BIN_NAMES))
|
||||
RUN_DIR ?= /tmp/zvfs-test
|
||||
RUN_BINS ?= test_basic test_lseek test_dual_open_same_file test_two_files \
|
||||
test_single_file_perf test_single_file_random_perf \
|
||||
test_single_file_random_noaligned_perf test_write_file test_read_delete_file \
|
||||
test_phase2_posix
|
||||
|
||||
.PHONY: all clean list run-test
|
||||
|
||||
all: $(BINS)
|
||||
|
||||
$(BIN_DIR):
|
||||
mkdir -p $@
|
||||
|
||||
$(BIN_DIR)/%: %.c test_utils.h | $(BIN_DIR)
|
||||
$(CC) $(CFLAGS) -o $@ $<
|
||||
|
||||
list:
|
||||
@printf "%s\n" $(BINS)
|
||||
|
||||
run-test: all
|
||||
@mkdir -p $(RUN_DIR)
|
||||
@pass=0; fail=0; \
|
||||
for t in $(RUN_BINS); do \
|
||||
printf "\n[RUN] %s\n" "$$t"; \
|
||||
if ./$(BIN_DIR)/$$t $(RUN_DIR); then \
|
||||
pass=$$((pass + 1)); \
|
||||
else \
|
||||
fail=$$((fail + 1)); \
|
||||
fi; \
|
||||
done; \
|
||||
printf "\n=== run-test summary: PASS=%d FAIL=%d ===\n" $$pass $$fail; \
|
||||
test $$fail -eq 0
|
||||
|
||||
clean:
|
||||
$(RM) $(BINS)
|
||||
-rmdir $(BIN_DIR)
|
||||
@@ -1,51 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
|
||||
/*
 * Basic smoke test: create `path`, append two strings, reopen read-only,
 * read the data back in two chunks, then unlink the file.
 *
 * Returns 0 on success or a step-specific code: 1 open, 2 write,
 * 3 reopen, 4 read, 5 unlink. The descriptor is closed on every exit path.
 */
static int test_basic(const char *path)
{
    printf("\n=== test_basic ===\n");

    printf("open: %s\n", path);
    int fd = open(path, O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd < 0) { perror("open"); return 1; }

    const char *msg = "ABCDEFGHIJKL";
    ssize_t w = write(fd, msg, strlen(msg));
    if (w < 0) { perror("write"); close(fd); return 2; }
    printf("write: %zd\n", w);

    const char *msg2 = "MNOPQRSTUVWXYZ";
    ssize_t w2 = write(fd, msg2, strlen(msg2));
    if (w2 < 0) { perror("write"); close(fd); return 2; }
    printf("write: %zd\n", w2);

    close(fd);

    fd = open(path, O_RDONLY);
    if (fd < 0) { perror("open R"); return 3; }

    /* First read: a short buffer, so only the first 10 bytes come back. */
    char buf[10];
    memset(buf, 0, sizeof(buf));
    ssize_t r = read(fd, buf, sizeof(buf));
    if (r < 0) { perror("read"); close(fd); return 4; }
    printf("read: %zd bytes: %.*s\n", r, (int)r, buf);

    /* Second read: picks up the remainder of the file. */
    char buf2[512];
    memset(buf2, 0, sizeof(buf2));
    ssize_t r2 = read(fd, buf2, sizeof(buf2));
    if (r2 < 0) { perror("read"); close(fd); return 4; }
    printf("read: %zd bytes: %.*s\n", r2, (int)r2, buf2);

    close(fd);

    if (unlink(path) != 0) { perror("unlink"); return 5; }
    printf("unlink: ok\n");
    return 0;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat");
|
||||
int rc = test_basic(path);
|
||||
return report_result("test_basic", rc);
|
||||
}
|
||||
@@ -1,50 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
|
||||
/*
 * Open the same file through two descriptors (one write-only, one
 * read-only) and print what the reader sees after the writer overwrites
 * the first bytes and then appends. Expected output is printed next to the
 * actual data; it is not asserted.
 *
 * Returns 0 on success or a step-specific code: 1 create, 2 initial write,
 * 3 open for write, 4 open for read, 5 overwrite, 6 append. All descriptors
 * are closed on every exit path.
 */
static int test_dual_open_same_file(const char *path)
{
    printf("\n=== test_dual_open_same_file ===\n");

    int fd_init = open(path, O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd_init < 0) { perror("open init"); return 1; }
    const char *init = "0123456789";
    if (write(fd_init, init, 10) != 10) { perror("write init"); close(fd_init); return 2; }
    close(fd_init);

    int fd_w = open(path, O_WRONLY);
    if (fd_w < 0) { perror("open W"); return 3; }

    int fd_r = open(path, O_RDONLY);
    if (fd_r < 0) { perror("open R"); close(fd_w); return 4; }

    int rc = 0;
    printf("fd_w=%d fd_r=%d\n", fd_w, fd_r);

    if (write(fd_w, "HELLO", 5) != 5) { perror("write"); rc = 5; goto out; }
    printf("write via fd_w: HELLO (overwrite first 5 bytes)\n");

    char buf[32] = {0};
    lseek(fd_r, 0, SEEK_SET);
    ssize_t r = read(fd_r, buf, sizeof(buf));
    printf("read via fd_r: %zd bytes: %.*s (expect: HELLO56789)\n", r, (int)r, buf);

    lseek(fd_w, 0, SEEK_END);
    if (write(fd_w, "!!!", 3) != 3) { perror("write append"); rc = 6; goto out; }
    printf("write append via fd_w: !!!\n");

    lseek(fd_r, 10, SEEK_SET);
    memset(buf, 0, sizeof(buf));
    r = read(fd_r, buf, sizeof(buf));
    printf("read appended via fd_r: %zd bytes: %.*s (expect: !!!)\n", r, (int)r, buf);

out:
    close(fd_w);
    close(fd_r);
    if (rc == 0) {
        unlink(path);
    }
    return rc;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat");
|
||||
int rc = test_dual_open_same_file(path);
|
||||
return report_result("test_dual_open_same_file", rc);
|
||||
}
|
||||
@@ -1,55 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
|
||||
/*
 * Exercise lseek() with SEEK_SET, SEEK_CUR and SEEK_END, then write past
 * end-of-file and read the resulting hole back. Expected values are printed
 * next to the actual data; they are not asserted.
 *
 * Returns 0 on success or a step-specific code: 1 open, 2 initial write,
 * 3 write past EOF. The descriptor is closed on every exit path.
 */
static int test_lseek(const char *path)
{
    printf("\n=== test_lseek ===\n");

    int fd = open(path, O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd < 0) { perror("open"); return 1; }

    const char *alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    if (write(fd, alpha, 26) != 26) { perror("write"); close(fd); return 2; }
    printf("write 26 bytes: %s\n", alpha);

    off_t pos = lseek(fd, 0, SEEK_SET);
    printf("lseek SEEK_SET 0 -> %ld\n", (long)pos);
    char buf[32] = {0};
    ssize_t r = read(fd, buf, 5);
    printf("read 5 bytes: %.*s (expect: ABCDE)\n", (int)r, buf);

    pos = lseek(fd, 3, SEEK_CUR);
    printf("lseek SEEK_CUR +3 -> %ld\n", (long)pos);
    memset(buf, 0, sizeof(buf));
    r = read(fd, buf, 5);
    printf("read 5 bytes: %.*s (expect: IJKLM)\n", (int)r, buf);

    pos = lseek(fd, -5, SEEK_END);
    printf("lseek SEEK_END -5 -> %ld\n", (long)pos);
    memset(buf, 0, sizeof(buf));
    r = read(fd, buf, 10);
    printf("read %zd bytes: %.*s (expect: VWXYZ)\n", r, (int)r, buf);

    /* Write one byte at offset 30, leaving a 4-byte hole after the alphabet. */
    pos = lseek(fd, 30, SEEK_SET);
    printf("lseek SEEK_SET 30 -> %ld\n", (long)pos);
    if (write(fd, "!", 1) != 1) { perror("write hole"); close(fd); return 3; }

    /* Pre-fill with 0xAA so bytes the read does not overwrite are obvious. */
    lseek(fd, 26, SEEK_SET);
    memset(buf, 0xAA, sizeof(buf));
    r = read(fd, buf, 5);
    printf("read hole+1: %zd bytes, hole[0]=%02X hole[1]=%02X hole[2]=%02X "
           "hole[3]=%02X last='%c' (expect: 00 00 00 00 '!')\n",
           r, (unsigned char)buf[0], (unsigned char)buf[1],
           (unsigned char)buf[2], (unsigned char)buf[3], buf[4]);

    close(fd);
    unlink(path);
    return 0;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat");
|
||||
int rc = test_lseek(path);
|
||||
return report_result("test_lseek", rc);
|
||||
}
|
||||
@@ -1,98 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
#include <sys/stat.h>
|
||||
|
||||
/*
 * Compare the current errno with `exp`.
 * Returns 0 when they match; otherwise prints both values to stderr,
 * prefixed with `what`, and returns -1.
 */
static int expect_errno(const char *what, int exp)
{
    int seen = errno;

    if (seen == exp) {
        return 0;
    }

    fprintf(stderr, "%s: errno=%d expected=%d\n", what, seen, exp);
    return -1;
}
|
||||
|
||||
static int test_phase2(const char *root)
|
||||
{
|
||||
char dir[PATH_MAX];
|
||||
char file[PATH_MAX];
|
||||
char file2[PATH_MAX];
|
||||
struct stat st;
|
||||
int dfd = -1;
|
||||
int fd = -1;
|
||||
|
||||
snprintf(dir, sizeof(dir), "%s/p2db", root);
|
||||
snprintf(file, sizeof(file), "%s/p2db/data.log", root);
|
||||
snprintf(file2, sizeof(file2), "%s/p2db/data2.log", root);
|
||||
|
||||
(void)unlink(file2);
|
||||
(void)unlink(file);
|
||||
(void)rmdir(dir);
|
||||
|
||||
if (mkdir(dir, 0755) != 0) { perror("mkdir"); return 1; }
|
||||
dfd = open(dir, O_RDONLY | O_DIRECTORY);
|
||||
if (dfd < 0) { perror("open dir"); return 2; }
|
||||
|
||||
fd = openat(dfd, "data.log", O_CREAT | O_RDWR | O_TRUNC, 0644);
|
||||
if (fd < 0) { perror("openat create"); return 3; }
|
||||
|
||||
if (write(fd, "ABCD", 4) != 4) { perror("write"); return 4; }
|
||||
if (pwrite(fd, "XYZ", 3, 8) != 3) { perror("pwrite"); return 5; }
|
||||
|
||||
char buf[16] = {0};
|
||||
ssize_t n = pread(fd, buf, 11, 0);
|
||||
if (n != 11) { perror("pread"); return 6; }
|
||||
if (memcmp(buf, "ABCD", 4) != 0 || buf[4] || buf[5] || buf[6] || buf[7] ||
|
||||
memcmp(buf + 8, "XYZ", 3) != 0) {
|
||||
fprintf(stderr, "pread sparse verify failed\n");
|
||||
return 7;
|
||||
}
|
||||
|
||||
if (fsync(fd) != 0) { perror("fsync"); return 8; }
|
||||
if (fdatasync(fd) != 0) { perror("fdatasync"); return 9; }
|
||||
|
||||
if (fstat(fd, &st) != 0) { perror("fstat"); return 10; }
|
||||
if (st.st_size != 11) {
|
||||
fprintf(stderr, "fstat size=%ld expected=11\n", (long)st.st_size);
|
||||
return 11;
|
||||
}
|
||||
|
||||
if (ftruncate(fd, 4) != 0) { perror("ftruncate"); return 12; }
|
||||
memset(buf, 0, sizeof(buf));
|
||||
n = pread(fd, buf, 16, 0);
|
||||
if (n != 4 || memcmp(buf, "ABCD", 4) != 0) {
|
||||
fprintf(stderr, "truncate readback failed n=%zd\n", n);
|
||||
return 13;
|
||||
}
|
||||
|
||||
if (rename(file, file2) != 0) { perror("rename"); return 14; }
|
||||
if (access(file, F_OK) == 0 || expect_errno("access old", ENOENT) != 0) {
|
||||
return 15;
|
||||
}
|
||||
if (access(file2, F_OK) != 0) { perror("access new"); return 16; }
|
||||
|
||||
int fd2 = open(file2, O_CREAT | O_EXCL | O_RDWR, 0644);
|
||||
if (fd2 >= 0 || expect_errno("open excl", EEXIST) != 0) {
|
||||
if (fd2 >= 0) close(fd2);
|
||||
return 17;
|
||||
}
|
||||
|
||||
int rd = open(file2, O_RDONLY);
|
||||
if (rd < 0) { perror("open rdonly"); return 18; }
|
||||
if (write(rd, "Q", 1) >= 0 || expect_errno("write rdonly", EBADF) != 0) {
|
||||
close(rd);
|
||||
return 19;
|
||||
}
|
||||
close(rd);
|
||||
|
||||
close(fd);
|
||||
close(dfd);
|
||||
if (unlink(file2) != 0) { perror("unlink"); return 20; }
|
||||
if (rmdir(dir) != 0) { perror("rmdir"); return 21; }
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Entry point for test_phase2_posix. The test root is the first argument,
 * or "/zvfs" when none is given; the outcome goes through report_result(),
 * whose return value is the exit status.
 */
int main(int argc, char **argv)
{
    const char *root = "/zvfs";

    if (argc >= 2) {
        root = argv[1];
    }
    return report_result("test_phase2_posix", test_phase2(root));
}
|
||||
@@ -1,31 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
|
||||
/*
 * Open an existing file read-only, print up to 256 bytes of it, close it
 * and unlink it.
 *
 * Returns 0 on success or a step-specific code: 1 open, 2 read, 3 unlink.
 */
static int test_read_delete_file(const char *path)
{
    char data[256] = {0};
    ssize_t got;
    int fd;

    printf("\n=== test_read_delete_file ===\n");

    fd = open(path, O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    printf("open: %s fd=%d\n", path, fd);

    got = read(fd, data, sizeof(data));
    if (got < 0) {
        perror("read");
        close(fd);
        return 2;
    }
    printf("read: %zd bytes: %.*s\n", got, (int)got, data);

    close(fd);
    printf("close: ok\n");

    if (unlink(path) != 0) {
        perror("unlink");
        return 3;
    }
    printf("unlink: ok\n");

    return 0;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat");
|
||||
int rc = test_read_delete_file(path);
|
||||
return report_result("test_read_delete_file", rc);
|
||||
}
|
||||
@@ -1,93 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
|
||||
static int test_single_file_perf(const char *path)
|
||||
{
|
||||
size_t io_size = 128 * 1024;
|
||||
// size_t io_size = 4096;
|
||||
size_t max_size = 2ULL * 1024 * 1024 * 1024;
|
||||
size_t max_count = max_size / io_size;
|
||||
int test_sec = 10;
|
||||
int direct = 0;
|
||||
|
||||
printf("\n=== test_single_file_perf ===\n");
|
||||
printf("Path : %s\n", path);
|
||||
printf("IO size : %zu KB\n", io_size / 1024);
|
||||
printf("Max file: %zu MB\n", max_size / 1024 / 1024);
|
||||
printf("Duration: %d sec\n", test_sec);
|
||||
|
||||
unlink(path);
|
||||
char *buf = aligned_alloc(4096, io_size);
|
||||
if (!buf) { perror("aligned_alloc"); return 1; }
|
||||
memset(buf, 'A', io_size);
|
||||
|
||||
struct timespec t1, t2, now;
|
||||
|
||||
int fd = open(path, O_CREAT | O_RDWR | direct, 0644);
|
||||
if (fd < 0) { perror("open write"); free(buf); return 1; }
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t1);
|
||||
size_t wcount = 0;
|
||||
size_t wpos = 0;
|
||||
do {
|
||||
if (wpos >= max_count) {
|
||||
lseek(fd, 0, SEEK_SET);
|
||||
wpos = 0;
|
||||
}
|
||||
if (write(fd, buf, io_size) != (ssize_t)io_size) {
|
||||
perror("write");
|
||||
close(fd);
|
||||
free(buf);
|
||||
return 2;
|
||||
}
|
||||
wcount++;
|
||||
wpos++;
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
} while (time_diff_sec(t1, now) < test_sec);
|
||||
clock_gettime(CLOCK_MONOTONIC, &t2);
|
||||
close(fd);
|
||||
|
||||
double wsec = time_diff_sec(t1, t2);
|
||||
double wmb = (double)(wcount * io_size) / (1024.0 * 1024.0);
|
||||
printf("\nWRITE:\n");
|
||||
printf(" total : %.1f MB\n", wmb);
|
||||
printf(" time : %.3f sec\n", wsec);
|
||||
printf(" IOPS : %.0f ops/sec\n", wcount / wsec);
|
||||
printf(" BW : %.2f MB/s\n", wmb / wsec);
|
||||
|
||||
fd = open(path, O_RDONLY | direct);
|
||||
if (fd < 0) { perror("open read"); free(buf); return 3; }
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t1);
|
||||
size_t rcount = 0;
|
||||
do {
|
||||
ssize_t r = read(fd, buf, io_size);
|
||||
if (r <= 0) {
|
||||
lseek(fd, 0, SEEK_SET);
|
||||
continue;
|
||||
}
|
||||
rcount++;
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
} while (time_diff_sec(t1, now) < test_sec);
|
||||
clock_gettime(CLOCK_MONOTONIC, &t2);
|
||||
close(fd);
|
||||
|
||||
double rsec = time_diff_sec(t1, t2);
|
||||
double rmb = (double)(rcount * io_size) / (1024.0 * 1024.0);
|
||||
printf("\nREAD:\n");
|
||||
printf(" total : %.1f MB\n", rmb);
|
||||
printf(" time : %.3f sec\n", rsec);
|
||||
printf(" IOPS : %.0f ops/sec\n", rcount / rsec);
|
||||
printf(" BW : %.2f MB/s\n", rmb / rsec);
|
||||
|
||||
unlink(path);
|
||||
free(buf);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat");
|
||||
int rc = test_single_file_perf(path);
|
||||
return report_result("test_single_file_perf", rc);
|
||||
}
|
||||
@@ -1,116 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
|
||||
static int test_single_file_random_noaligned_perf(const char *path)
|
||||
{
|
||||
size_t io_size = 128 * 1024;
|
||||
size_t max_size = 2ULL * 1024 * 1024 * 1024;
|
||||
int test_sec = 10;
|
||||
int direct = 0;
|
||||
|
||||
printf("\n=== test_single_file_random_noaligned_perf ===\n");
|
||||
printf("Path : %s\n", path);
|
||||
printf("IO size : %zu KB\n", io_size / 1024);
|
||||
printf("Range : %zu MB\n", max_size / 1024 / 1024);
|
||||
printf("Duration: %d sec\n", test_sec);
|
||||
|
||||
srand(0x1234);
|
||||
|
||||
char *buf = aligned_alloc(4096, io_size);
|
||||
if (!buf) { perror("aligned_alloc"); return 1; }
|
||||
memset(buf, 'A', io_size);
|
||||
|
||||
struct timespec t1, t2, now;
|
||||
|
||||
unlink(path);
|
||||
|
||||
int fd = open(path, O_CREAT | O_RDWR | direct, 0644);
|
||||
if (fd < 0) { perror("open rand write"); free(buf); return 2; }
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t1);
|
||||
|
||||
size_t wcount = 0;
|
||||
do {
|
||||
off_t offset = (off_t)(rand() % (max_size - io_size));
|
||||
|
||||
if (lseek(fd, offset, SEEK_SET) < 0) {
|
||||
perror("lseek rand write");
|
||||
close(fd);
|
||||
free(buf);
|
||||
return 3;
|
||||
}
|
||||
|
||||
if (write(fd, buf, io_size) != (ssize_t)io_size) {
|
||||
perror("rand write");
|
||||
close(fd);
|
||||
free(buf);
|
||||
return 4;
|
||||
}
|
||||
|
||||
wcount++;
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
} while (time_diff_sec(t1, now) < test_sec);
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t2);
|
||||
close(fd);
|
||||
|
||||
double wsec = time_diff_sec(t1, t2);
|
||||
double wmb = (double)wcount * io_size / 1024 / 1024;
|
||||
|
||||
printf("\nRANDOM WRITE:\n");
|
||||
printf(" total : %.1f MB\n", wmb);
|
||||
printf(" time : %.3f sec\n", wsec);
|
||||
printf(" IOPS : %.0f ops/sec\n", wcount / wsec);
|
||||
printf(" BW : %.2f MB/s\n", wmb / wsec);
|
||||
|
||||
fd = open(path, O_RDONLY | direct);
|
||||
if (fd < 0) { perror("open rand read"); free(buf); return 5; }
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t1);
|
||||
|
||||
size_t rcount = 0;
|
||||
do {
|
||||
off_t offset = (off_t)(rand() % (max_size - io_size));
|
||||
|
||||
if (lseek(fd, offset, SEEK_SET) < 0) {
|
||||
perror("lseek rand read");
|
||||
close(fd);
|
||||
free(buf);
|
||||
return 6;
|
||||
}
|
||||
|
||||
ssize_t r = read(fd, buf, io_size);
|
||||
if (r < 0) {
|
||||
perror("rand read");
|
||||
close(fd);
|
||||
free(buf);
|
||||
return 7;
|
||||
}
|
||||
|
||||
rcount++;
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
} while (time_diff_sec(t1, now) < test_sec);
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t2);
|
||||
close(fd);
|
||||
|
||||
double rsec = time_diff_sec(t1, t2);
|
||||
double rmb = (double)rcount * io_size / 1024 / 1024;
|
||||
|
||||
printf("\nRANDOM READ:\n");
|
||||
printf(" total : %.1f MB\n", rmb);
|
||||
printf(" time : %.3f sec\n", rsec);
|
||||
printf(" IOPS : %.0f ops/sec\n", rcount / rsec);
|
||||
printf(" BW : %.2f MB/s\n", rmb / rsec);
|
||||
|
||||
unlink(path);
|
||||
free(buf);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat");
|
||||
int rc = test_single_file_random_noaligned_perf(path);
|
||||
return report_result("test_single_file_random_noaligned_perf", rc);
|
||||
}
|
||||
@@ -1,119 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
|
||||
static int test_single_file_random_perf(const char *path)
|
||||
{
|
||||
size_t io_size = 128 * 1024;
|
||||
size_t max_size = 2ULL * 1024 * 1024 * 1024;
|
||||
size_t max_count = max_size / io_size;
|
||||
int test_sec = 10;
|
||||
int direct = 0;
|
||||
|
||||
printf("\n=== test_single_file_random_perf ===\n");
|
||||
printf("Path : %s\n", path);
|
||||
printf("IO size : %zu KB\n", io_size / 1024);
|
||||
printf("Range : %zu MB\n", max_size / 1024 / 1024);
|
||||
printf("Duration: %d sec\n", test_sec);
|
||||
|
||||
srand(0x1234);
|
||||
|
||||
char *buf = aligned_alloc(4096, io_size);
|
||||
if (!buf) { perror("aligned_alloc"); return 1; }
|
||||
memset(buf, 'A', io_size);
|
||||
|
||||
struct timespec t1, t2, now;
|
||||
|
||||
unlink(path);
|
||||
|
||||
int fd = open(path, O_CREAT | O_RDWR | direct, 0644);
|
||||
if (fd < 0) { perror("open rand write"); free(buf); return 2; }
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t1);
|
||||
|
||||
size_t wcount = 0;
|
||||
do {
|
||||
size_t blk = rand() % max_count;
|
||||
off_t offset = (off_t)blk * io_size;
|
||||
|
||||
if (lseek(fd, offset, SEEK_SET) < 0) {
|
||||
perror("lseek rand write");
|
||||
close(fd);
|
||||
free(buf);
|
||||
return 3;
|
||||
}
|
||||
|
||||
if (write(fd, buf, io_size) != (ssize_t)io_size) {
|
||||
perror("rand write");
|
||||
close(fd);
|
||||
free(buf);
|
||||
return 4;
|
||||
}
|
||||
|
||||
wcount++;
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
} while (time_diff_sec(t1, now) < test_sec);
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t2);
|
||||
close(fd);
|
||||
|
||||
double wsec = time_diff_sec(t1, t2);
|
||||
double wmb = (double)wcount * io_size / 1024 / 1024;
|
||||
|
||||
printf("\nRANDOM WRITE:\n");
|
||||
printf(" total : %.1f MB\n", wmb);
|
||||
printf(" time : %.3f sec\n", wsec);
|
||||
printf(" IOPS : %.0f ops/sec\n", wcount / wsec);
|
||||
printf(" BW : %.2f MB/s\n", wmb / wsec);
|
||||
|
||||
fd = open(path, O_RDONLY | direct);
|
||||
if (fd < 0) { perror("open rand read"); free(buf); return 5; }
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t1);
|
||||
|
||||
size_t rcount = 0;
|
||||
do {
|
||||
size_t blk = rand() % max_count;
|
||||
off_t offset = (off_t)blk * io_size;
|
||||
|
||||
if (lseek(fd, offset, SEEK_SET) < 0) {
|
||||
perror("lseek rand read");
|
||||
close(fd);
|
||||
free(buf);
|
||||
return 6;
|
||||
}
|
||||
|
||||
ssize_t r = read(fd, buf, io_size);
|
||||
if (r < 0) {
|
||||
perror("rand read");
|
||||
close(fd);
|
||||
free(buf);
|
||||
return 7;
|
||||
}
|
||||
|
||||
rcount++;
|
||||
clock_gettime(CLOCK_MONOTONIC, &now);
|
||||
} while (time_diff_sec(t1, now) < test_sec);
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC, &t2);
|
||||
close(fd);
|
||||
|
||||
double rsec = time_diff_sec(t1, t2);
|
||||
double rmb = (double)rcount * io_size / 1024 / 1024;
|
||||
|
||||
printf("\nRANDOM READ:\n");
|
||||
printf(" total : %.1f MB\n", rmb);
|
||||
printf(" time : %.3f sec\n", rsec);
|
||||
printf(" IOPS : %.0f ops/sec\n", rcount / rsec);
|
||||
printf(" BW : %.2f MB/s\n", rmb / rsec);
|
||||
|
||||
unlink(path);
|
||||
free(buf);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat");
|
||||
int rc = test_single_file_random_perf(path);
|
||||
return report_result("test_single_file_random_perf", rc);
|
||||
}
|
||||
@@ -1,78 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
|
||||
/*
 * Interleave IO on two files that are open at the same time and check that
 * their contents do not bleed into each other.
 *
 * Writes a distinct string to each file, reads both back and compares,
 * then appends to A, overwrites the 7 bytes at offset 8 of B, and prints
 * both files again.
 *
 * Both descriptors are closed and both paths unlinked on every exit path
 * once file A has been opened.
 *
 * Returns 0 on success, 1/2 if opening A/B fails, 3/4 if the initial write
 * to A/B fails, 5 if the append to A fails, 6 if the overwrite of B fails,
 * 7 if a read fails, 8 if the first read-back does not match what was
 * written.
 */
static int test_two_files(const char *path_a, const char *path_b)
{
    static const char data_a[] = "File-A: Hello World!";
    static const char data_b[] = "File-B: Goodbye World!";
    static const char tail_a[] = "[A-TAIL]";
    /* Exactly as long as "Goodbye" so the overwrite stays inside the literal. */
    static const char patch_b[] = "Hi!    ";
    char buf_a[64] = {0};
    char buf_b[64] = {0};
    ssize_t r_a;
    ssize_t r_b;
    int mismatch = 0;
    int rc = 0;
    int fd_a = -1;
    int fd_b = -1;

    printf("\n=== test_two_files ===\n");

    fd_a = open(path_a, O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd_a < 0) { perror("open A"); return 1; }

    fd_b = open(path_b, O_CREAT | O_RDWR | O_TRUNC, 0644);
    if (fd_b < 0) { perror("open B"); rc = 2; goto out; }

    printf("fd_a=%d fd_b=%d\n", fd_a, fd_b);

    if (write(fd_a, data_a, strlen(data_a)) < 0) { perror("write A"); rc = 3; goto out; }
    if (write(fd_b, data_b, strlen(data_b)) < 0) { perror("write B"); rc = 4; goto out; }
    printf("write A: %s\n", data_a);
    printf("write B: %s\n", data_b);

    lseek(fd_a, 0, SEEK_SET);
    lseek(fd_b, 0, SEEK_SET);

    /* Leave one byte spare so the buffers stay NUL-terminated. */
    r_a = read(fd_a, buf_a, sizeof(buf_a) - 1);
    r_b = read(fd_b, buf_b, sizeof(buf_b) - 1);
    if (r_a < 0 || r_b < 0) { perror("read"); rc = 7; goto out; }

    printf("read A: %zd bytes: %.*s\n", r_a, (int)r_a, buf_a);
    printf("read B: %zd bytes: %.*s\n", r_b, (int)r_b, buf_b);

    if (strncmp(buf_a, data_a, strlen(data_a)) != 0) {
        printf("FAIL: A content mismatch!\n");
        mismatch = 1;
    }
    if (strncmp(buf_b, data_b, strlen(data_b)) != 0) {
        printf("FAIL: B content mismatch!\n");
        mismatch = 1;
    }
    if (!mismatch) {
        printf("PASS: both files read back correctly\n");
    }

    lseek(fd_a, 0, SEEK_END);
    if (write(fd_a, tail_a, sizeof(tail_a) - 1) != (ssize_t)(sizeof(tail_a) - 1)) {
        perror("append A");
        rc = 5;
        goto out;
    }

    lseek(fd_b, 8, SEEK_SET);
    if (write(fd_b, patch_b, sizeof(patch_b) - 1) != (ssize_t)(sizeof(patch_b) - 1)) {
        perror("overwrite B");
        rc = 6;
        goto out;
    }

    lseek(fd_a, 0, SEEK_SET);
    lseek(fd_b, 0, SEEK_SET);
    memset(buf_a, 0, sizeof(buf_a));
    memset(buf_b, 0, sizeof(buf_b));
    r_a = read(fd_a, buf_a, sizeof(buf_a) - 1);
    r_b = read(fd_b, buf_b, sizeof(buf_b) - 1);
    if (r_a < 0 || r_b < 0) { perror("read"); rc = 7; goto out; }
    printf("after cross-write:\n");
    printf(" A: %.*s\n", (int)r_a, buf_a);
    printf(" B: %.*s\n", (int)r_b, buf_b);

    /* A mismatch in the first read-back must fail the test, not just log. */
    if (mismatch) {
        rc = 8;
    }

out:
    close(fd_a);
    if (fd_b >= 0) {
        close(fd_b);
    }
    unlink(path_a);
    unlink(path_b);
    return rc;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char path_a[PATH_MAX];
|
||||
char path_b[PATH_MAX];
|
||||
const char *dir = argc >= 2 ? argv[1] : NULL;
|
||||
make_path(path_a, sizeof(path_a), dir, "file_a.dat");
|
||||
make_path(path_b, sizeof(path_b), dir, "file_b.dat");
|
||||
int rc = test_two_files(path_a, path_b);
|
||||
return report_result("test_two_files", rc);
|
||||
}
|
||||
@@ -1,40 +0,0 @@
|
||||
#ifndef TEST_UTILS_H
|
||||
#define TEST_UTILS_H
|
||||
|
||||
#define _GNU_SOURCE
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <limits.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#ifndef PATH_MAX
|
||||
#define PATH_MAX 4096
|
||||
#endif
|
||||
|
||||
/* Elapsed time from a to b (b - a), in seconds. */
static inline double time_diff_sec(struct timespec a, struct timespec b)
{
    double whole = (double)(b.tv_sec - a.tv_sec);
    double frac = (double)(b.tv_nsec - a.tv_nsec) / 1000000000.0;

    return whole + frac;
}
|
||||
|
||||
/*
 * Format "<dir>/<name>" into out (at most out_sz bytes, via snprintf).
 * A NULL or empty dir selects "/tmp" as the directory.
 */
static inline void make_path(char *out, size_t out_sz, const char *dir, const char *name)
{
    const char *base = (dir != NULL && dir[0] != '\0') ? dir : "/tmp";

    snprintf(out, out_sz, "%s/%s", base, name);
}
|
||||
|
||||
/* Print a PASSED (rc == 0) or FAILED banner for the named test and return rc unchanged. */
static inline int report_result(const char *name, int rc)
{
    const char *verdict = "FAILED";

    if (rc == 0) {
        verdict = "PASSED";
    }
    printf("\n=== %s %s ===\n", name, verdict);
    return rc;
}
|
||||
|
||||
#endif
|
||||
@@ -1,27 +0,0 @@
|
||||
#include "test_utils.h"
|
||||
|
||||
/*
 * Create (or open) the file and write a short greeting at its start.
 *
 * Returns 0 on success, 1 if open fails, 2 if write fails.
 */
static int test_write_file(const char *path)
{
    const char *greeting = "Hello, zvfs!";
    ssize_t written;
    int handle;

    printf("\n=== test_write_file ===\n");

    handle = open(path, O_CREAT | O_RDWR, 0644);
    if (handle < 0) {
        perror("open");
        return 1;
    }
    printf("open: %s fd=%d\n", path, handle);

    written = write(handle, greeting, strlen(greeting));
    if (written < 0) {
        perror("write");
        close(handle);
        return 2;
    }
    printf("write: %zd bytes: %s\n", written, greeting);

    close(handle);
    printf("close: ok\n");
    return 0;
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
make_path(path, sizeof(path), argc >= 2 ? argv[1] : NULL, "file.dat");
|
||||
int rc = test_write_file(path);
|
||||
return report_result("test_write_file", rc);
|
||||
}
|
||||
13
tests/Makefile
Normal file
13
tests/Makefile
Normal file
@@ -0,0 +1,13 @@
|
||||
# Build and clean every test suite under tests/.
SUBDIRS := ioengine_test hook

.PHONY: all clean $(SUBDIRS)

all: $(SUBDIRS)

# Each suite is built by its own Makefile.
$(SUBDIRS):
	$(MAKE) -C $@

# Stop at the first suite whose clean fails; without the explicit exit the
# shell loop would report only the status of the last iteration.
clean:
	for dir in $(SUBDIRS); do \
		$(MAKE) -C $$dir clean || exit $$?; \
	done
|
||||
8
tests/hook/Makefile
Normal file
8
tests/hook/Makefile
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
# Output directory shared with the other test suites.
BIN_DIR := $(abspath $(CURDIR)/../bin)

# Neither target names a file, so never let a stray file called "all" or
# "clean" suppress the recipe.
.PHONY: all clean

# Create the output directory here as well, so this suite can be built on
# its own without relying on another suite having created it.
all:
	mkdir -p $(BIN_DIR)
	gcc -g -o $(BIN_DIR)/hook_api_test hook_api_test.c

clean:
	rm -rf $(BIN_DIR)/hook_api_test
|
||||
322
tests/hook/hook_api_test.c
Normal file
322
tests/hook/hook_api_test.c
Normal file
@@ -0,0 +1,322 @@
|
||||
#define _GNU_SOURCE
|
||||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <limits.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/uio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/*
 * If cond is false, print "[FAIL] <function>:<line> <formatted message>" to
 * stderr and make the enclosing function return -1.
 */
#define ASSERT_TRUE(cond, fmt, ...)                                          \
    do {                                                                     \
        if (cond) {                                                          \
            break;                                                           \
        }                                                                    \
        fprintf(stderr, "[FAIL] %s:%d " fmt "\n", __func__, __LINE__,        \
                ##__VA_ARGS__);                                              \
        return -1;                                                           \
    } while (0)
|
||||
|
||||
/*
 * Evaluate expr once; if the result is negative, print
 * "[FAIL] <function>:<line> <expr text>: <strerror(errno)>" to stderr and
 * make the enclosing function return -1.
 */
#define ASSERT_SYS_OK(expr)                                                  \
    do {                                                                     \
        if ((expr) >= 0) {                                                   \
            break;                                                           \
        }                                                                    \
        fprintf(stderr, "[FAIL] %s:%d %s: %s\n", __func__, __LINE__,         \
                #expr, strerror(errno));                                     \
        return -1;                                                           \
    } while (0)
|
||||
|
||||
/*
 * Format "<dir>/<name>" into out.
 *
 * Returns 0 when the whole path fits in out_sz bytes; otherwise sets errno
 * to ENAMETOOLONG and returns -1.
 */
static int
join_path(char *out, size_t out_sz, const char *dir, const char *name)
{
    int written = snprintf(out, out_sz, "%s/%s", dir, name);

    if (written >= 0 && (size_t)written < out_sz) {
        return 0;
    }
    errno = ENAMETOOLONG;
    return -1;
}
|
||||
|
||||
static int
|
||||
test_basic_rw_seek_stat(const char *workdir)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
ASSERT_SYS_OK(join_path(path, sizeof(path), workdir, "basic_rw.txt"));
|
||||
|
||||
int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0644);
|
||||
ASSERT_SYS_OK(fd);
|
||||
|
||||
const char *init = "abcdef";
|
||||
ssize_t nr = write(fd, init, 6);
|
||||
ASSERT_TRUE(nr == 6, "write expected 6, got %zd", nr);
|
||||
|
||||
off_t off = lseek(fd, 0, SEEK_SET);
|
||||
ASSERT_TRUE(off == 0, "lseek expected 0, got %lld", (long long)off);
|
||||
|
||||
char buf[16] = {0};
|
||||
nr = read(fd, buf, 6);
|
||||
ASSERT_TRUE(nr == 6, "read expected 6, got %zd", nr);
|
||||
ASSERT_TRUE(memcmp(buf, "abcdef", 6) == 0, "read content mismatch");
|
||||
|
||||
nr = pwrite(fd, "XYZ", 3, 3);
|
||||
ASSERT_TRUE(nr == 3, "pwrite expected 3, got %zd", nr);
|
||||
|
||||
memset(buf, 0, sizeof(buf));
|
||||
nr = pread(fd, buf, 6, 0);
|
||||
ASSERT_TRUE(nr == 6, "pread expected 6, got %zd", nr);
|
||||
ASSERT_TRUE(memcmp(buf, "abcXYZ", 6) == 0, "pread content mismatch");
|
||||
|
||||
struct stat st;
|
||||
ASSERT_SYS_OK(fstat(fd, &st));
|
||||
ASSERT_TRUE(st.st_size == 6, "fstat size expected 6, got %lld",
|
||||
(long long)st.st_size);
|
||||
|
||||
ASSERT_SYS_OK(ftruncate(fd, 4));
|
||||
ASSERT_SYS_OK(fstat(fd, &st));
|
||||
ASSERT_TRUE(st.st_size == 4, "ftruncate size expected 4, got %lld",
|
||||
(long long)st.st_size);
|
||||
|
||||
ASSERT_SYS_OK(fdatasync(fd));
|
||||
ASSERT_SYS_OK(fsync(fd));
|
||||
ASSERT_SYS_OK(close(fd));
|
||||
ASSERT_SYS_OK(unlink(path));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
test_openat_rename_unlink(const char *workdir)
|
||||
{
|
||||
char subdir[PATH_MAX];
|
||||
ASSERT_SYS_OK(join_path(subdir, sizeof(subdir), workdir, "openat_dir"));
|
||||
|
||||
ASSERT_SYS_OK(mkdir(subdir, 0755));
|
||||
|
||||
int dfd = open(subdir, O_RDONLY | O_DIRECTORY);
|
||||
ASSERT_SYS_OK(dfd);
|
||||
|
||||
int fd = openat(dfd, "a.txt", O_CREAT | O_TRUNC | O_RDWR, 0644);
|
||||
ASSERT_SYS_OK(fd);
|
||||
|
||||
ssize_t nr = write(fd, "hello", 5);
|
||||
ASSERT_TRUE(nr == 5, "write expected 5, got %zd", nr);
|
||||
ASSERT_SYS_OK(close(fd));
|
||||
|
||||
struct stat st;
|
||||
ASSERT_SYS_OK(fstatat(dfd, "a.txt", &st, 0));
|
||||
ASSERT_TRUE(st.st_size == 5, "fstatat size expected 5, got %lld",
|
||||
(long long)st.st_size);
|
||||
|
||||
ASSERT_SYS_OK(renameat(dfd, "a.txt", dfd, "b.txt"));
|
||||
ASSERT_SYS_OK(fstatat(dfd, "b.txt", &st, 0));
|
||||
ASSERT_TRUE(st.st_size == 5, "renamed file size expected 5, got %lld",
|
||||
(long long)st.st_size);
|
||||
|
||||
ASSERT_SYS_OK(unlinkat(dfd, "b.txt", 0));
|
||||
|
||||
errno = 0;
|
||||
ASSERT_TRUE(fstatat(dfd, "b.txt", &st, 0) == -1 && errno == ENOENT,
|
||||
"fstatat after unlink should be ENOENT");
|
||||
|
||||
ASSERT_SYS_OK(close(dfd));
|
||||
ASSERT_SYS_OK(rmdir(subdir));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
test_dup_fcntl_ioctl(const char *workdir)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
ASSERT_SYS_OK(join_path(path, sizeof(path), workdir, "dup_fcntl.txt"));
|
||||
|
||||
int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0644);
|
||||
ASSERT_SYS_OK(fd);
|
||||
|
||||
ASSERT_TRUE(write(fd, "0123456789", 10) == 10, "write expected 10 bytes");
|
||||
ASSERT_SYS_OK(lseek(fd, 0, SEEK_SET));
|
||||
|
||||
int fd2 = dup(fd);
|
||||
bool dup_supported = true;
|
||||
if (fd2 < 0 && (errno == ENOTSUP || errno == EOPNOTSUPP || errno == ENOSYS)) {
|
||||
dup_supported = false;
|
||||
fprintf(stderr, "[INFO] dup on this backend is unsupported, skip shared-offset check\n");
|
||||
} else {
|
||||
ASSERT_SYS_OK(fd2);
|
||||
}
|
||||
|
||||
char buf[4] = {0};
|
||||
if (dup_supported) {
|
||||
ASSERT_TRUE(read(fd, buf, 2) == 2, "read(fd) expected 2 bytes");
|
||||
ASSERT_TRUE(memcmp(buf, "01", 2) == 0, "first read mismatch");
|
||||
|
||||
memset(buf, 0, sizeof(buf));
|
||||
ASSERT_TRUE(read(fd2, buf, 2) == 2, "read(fd2) expected 2 bytes");
|
||||
ASSERT_TRUE(memcmp(buf, "23", 2) == 0,
|
||||
"dup offset should be shared, expected \"23\"");
|
||||
}
|
||||
|
||||
int fd_flags = fcntl(fd, F_GETFD);
|
||||
ASSERT_SYS_OK(fd_flags);
|
||||
ASSERT_SYS_OK(fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC));
|
||||
|
||||
int fd_flags_after = fcntl(fd, F_GETFD);
|
||||
ASSERT_SYS_OK(fd_flags_after);
|
||||
ASSERT_TRUE((fd_flags_after & FD_CLOEXEC) != 0,
|
||||
"FD_CLOEXEC should be set");
|
||||
|
||||
int file_flags = fcntl(fd, F_GETFL);
|
||||
ASSERT_SYS_OK(file_flags);
|
||||
ASSERT_SYS_OK(fcntl(fd, F_SETFL, file_flags | O_APPEND));
|
||||
|
||||
int file_flags_after = fcntl(fd, F_GETFL);
|
||||
ASSERT_SYS_OK(file_flags_after);
|
||||
ASSERT_TRUE((file_flags_after & O_APPEND) != 0, "O_APPEND should be set");
|
||||
|
||||
int avail = -1;
|
||||
ASSERT_SYS_OK(ioctl(dup_supported ? fd2 : fd, FIONREAD, &avail));
|
||||
if (dup_supported) {
|
||||
ASSERT_TRUE(avail == 6, "FIONREAD expected 6, got %d", avail);
|
||||
ASSERT_SYS_OK(close(fd2));
|
||||
} else {
|
||||
ASSERT_TRUE(avail == 10, "FIONREAD expected 10, got %d", avail);
|
||||
}
|
||||
|
||||
ASSERT_SYS_OK(close(fd));
|
||||
ASSERT_SYS_OK(unlink(path));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
test_readv_writev_pwritev(const char *workdir)
|
||||
{
|
||||
char path[PATH_MAX];
|
||||
ASSERT_SYS_OK(join_path(path, sizeof(path), workdir, "iov.txt"));
|
||||
|
||||
int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0644);
|
||||
ASSERT_SYS_OK(fd);
|
||||
|
||||
struct iovec wiov[3];
|
||||
wiov[0].iov_base = "ab";
|
||||
wiov[0].iov_len = 2;
|
||||
wiov[1].iov_base = "cd";
|
||||
wiov[1].iov_len = 2;
|
||||
wiov[2].iov_base = "ef";
|
||||
wiov[2].iov_len = 2;
|
||||
|
||||
ssize_t nr = writev(fd, wiov, 3);
|
||||
ASSERT_TRUE(nr == 6, "writev expected 6, got %zd", nr);
|
||||
|
||||
ASSERT_SYS_OK(lseek(fd, 0, SEEK_SET));
|
||||
|
||||
char a[2] = {0}, b[2] = {0}, c[2] = {0};
|
||||
struct iovec riov[3];
|
||||
riov[0].iov_base = a;
|
||||
riov[0].iov_len = 2;
|
||||
riov[1].iov_base = b;
|
||||
riov[1].iov_len = 2;
|
||||
riov[2].iov_base = c;
|
||||
riov[2].iov_len = 2;
|
||||
|
||||
nr = readv(fd, riov, 3);
|
||||
ASSERT_TRUE(nr == 6, "readv expected 6, got %zd", nr);
|
||||
ASSERT_TRUE(memcmp(a, "ab", 2) == 0 &&
|
||||
memcmp(b, "cd", 2) == 0 &&
|
||||
memcmp(c, "ef", 2) == 0, "readv content mismatch");
|
||||
|
||||
struct iovec pwiov[2];
|
||||
pwiov[0].iov_base = "12";
|
||||
pwiov[0].iov_len = 2;
|
||||
pwiov[1].iov_base = "34";
|
||||
pwiov[1].iov_len = 2;
|
||||
nr = pwritev(fd, pwiov, 2, 1);
|
||||
ASSERT_TRUE(nr == 4, "pwritev expected 4, got %zd", nr);
|
||||
|
||||
char out[8] = {0};
|
||||
nr = pread(fd, out, 6, 0);
|
||||
ASSERT_TRUE(nr == 6, "pread expected 6, got %zd", nr);
|
||||
ASSERT_TRUE(memcmp(out, "a1234f", 6) == 0, "pwritev content mismatch");
|
||||
|
||||
ASSERT_SYS_OK(close(fd));
|
||||
ASSERT_SYS_OK(unlink(path));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* A test body: receives the scratch directory, returns 0 on success. */
typedef int (*test_fn)(const char *workdir);

/* One named entry of the test table. */
struct test_case {
    const char *name; /* label printed in the PASS/FAIL line */
    test_fn fn;       /* function to invoke */
};

/*
 * Run a single test case against workdir and print "[PASS] <name>" or
 * "[FAIL] <name>" to stdout.
 *
 * Returns 0 when the test function returned 0, -1 otherwise.
 */
static int
run_test(const struct test_case *tc, const char *workdir)
{
    const int passed = (tc->fn(workdir) == 0);

    printf(passed ? "[PASS] %s\n" : "[FAIL] %s\n", tc->name);
    return passed ? 0 : -1;
}
|
||||
|
||||
int
|
||||
main(void)
|
||||
{
|
||||
const char *base = getenv("ZVFS_TEST_ROOT");
|
||||
if (!base || base[0] == '\0')
|
||||
base = "/tmp";
|
||||
|
||||
char workdir[PATH_MAX];
|
||||
int n = snprintf(workdir, sizeof(workdir), "%s/zvfs-hook-api-XXXXXX", base);
|
||||
if (n < 0 || (size_t)n >= sizeof(workdir)) {
|
||||
fprintf(stderr, "workdir template too long\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (!mkdtemp(workdir)) {
|
||||
fprintf(stderr, "mkdtemp(%s) failed: %s\n", workdir, strerror(errno));
|
||||
return 2;
|
||||
}
|
||||
|
||||
printf("workdir=%s\n", workdir);
|
||||
printf("hint: set ZVFS_TEST_ROOT=/zvfs when validating LD_PRELOAD hook path.\n");
|
||||
|
||||
struct test_case tests[] = {
|
||||
{"basic_rw_seek_stat", test_basic_rw_seek_stat},
|
||||
{"openat_rename_unlink", test_openat_rename_unlink},
|
||||
{"dup_fcntl_ioctl", test_dup_fcntl_ioctl},
|
||||
{"readv_writev_pwritev", test_readv_writev_pwritev},
|
||||
};
|
||||
|
||||
int failed = 0;
|
||||
for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i) {
|
||||
if (run_test(&tests[i], workdir) != 0)
|
||||
failed++;
|
||||
}
|
||||
|
||||
const char *keep = getenv("ZVFS_TEST_KEEP");
|
||||
if (!keep || strcmp(keep, "1") != 0) {
|
||||
if (rmdir(workdir) < 0) {
|
||||
fprintf(stderr,
|
||||
"warning: failed to remove workdir %s: %s\n",
|
||||
workdir, strerror(errno));
|
||||
}
|
||||
} else {
|
||||
printf("kept workdir=%s\n", workdir);
|
||||
}
|
||||
|
||||
if (failed == 0) {
|
||||
printf("ALL TESTS PASSED\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("FAILED=%d\n", failed);
|
||||
return 1;
|
||||
}
|
||||
43
tests/ioengine_test/Makefile
Normal file
43
tests/ioengine_test/Makefile
Normal file
@@ -0,0 +1,43 @@
|
||||
# SPDX-License-Identifier: BSD-3-Clause

SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../spdk)
include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk
include $(SPDK_ROOT_DIR)/mk/spdk.app_vars.mk

# Output directory for the test binaries.
BIN_DIR := $(abspath $(CURDIR)/../bin)

TEST_BINS := \
	ioengine_single_blob_test \
	ioengine_multi_blob_test \
	ioengine_same_blob_mt_test

TEST_TARGETS := $(addprefix $(BIN_DIR)/,$(TEST_BINS))

# Sources linked into every test binary.
COMMON_SRCS := \
	test_common.c \
	../../src/spdk_engine/io_engine.c \
	../../src/common/utils.c

SPDK_LIB_LIST = $(ALL_MODULES_LIST) event event_bdev
LIBS += $(SPDK_LIB_LINKER_ARGS)

CFLAGS += -I$(abspath $(CURDIR)/../../src) -I$(CURDIR)

.PHONY: all clean
all: $(TEST_TARGETS)

# Create the bin directory.
$(BIN_DIR):
	mkdir -p $(BIN_DIR)

# One rule for all test binaries: <bin>/<name> is built from <name>.c plus
# the common sources. $(BIN_DIR) is an order-only prerequisite so it exists
# before linking (also under parallel make) without forcing relinks when
# the directory timestamp changes.
$(TEST_TARGETS): $(BIN_DIR)/%: %.c $(COMMON_SRCS) $(SPDK_LIB_FILES) $(ENV_LIBS) | $(BIN_DIR)
	$(CC) $(CFLAGS) -o $@ $< $(COMMON_SRCS) $(LDFLAGS) $(LIBS) $(ENV_LDFLAGS) $(SYS_LIBS)

clean:
	rm -f $(TEST_TARGETS)
|
||||
106
tests/ioengine_test/ioengine_multi_blob_test.c
Normal file
106
tests/ioengine_test/ioengine_multi_blob_test.c
Normal file
@@ -0,0 +1,106 @@
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "spdk_engine/io_engine.h"
|
||||
#include "test_common.h"
|
||||
|
||||
#define MULTI_BLOB_COUNT 3
|
||||
|
||||
int main(void) {
|
||||
int rc = 0;
|
||||
const char *bdev_name = getenv("SPDK_BDEV_NAME");
|
||||
struct zvfs_blob_handle *handles[MULTI_BLOB_COUNT] = {0};
|
||||
uint64_t ids[MULTI_BLOB_COUNT] = {0};
|
||||
uint64_t cluster = 0;
|
||||
void *wbuf = NULL;
|
||||
void *rbuf = NULL;
|
||||
int i = 0;
|
||||
|
||||
if (!bdev_name) {
|
||||
bdev_name = "Malloc0";
|
||||
}
|
||||
if (io_engine_init(bdev_name) != 0) {
|
||||
fprintf(stderr, "TEST2: io_engine_init failed (bdev=%s)\n", bdev_name);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("[TEST2] single thread / multi blob\n");
|
||||
|
||||
handles[0] = blob_create(0);
|
||||
if (!handles[0]) {
|
||||
fprintf(stderr, "TEST2: create first blob failed\n");
|
||||
return 1;
|
||||
}
|
||||
ids[0] = handles[0]->id;
|
||||
cluster = handles[0]->size;
|
||||
if (cluster == 0) {
|
||||
fprintf(stderr, "TEST2: invalid cluster size\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
if (blob_resize(handles[0], cluster * 2) != 0) {
|
||||
fprintf(stderr, "TEST2: resize first blob failed\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (i = 1; i < MULTI_BLOB_COUNT; i++) {
|
||||
handles[i] = blob_create(cluster * 2);
|
||||
if (!handles[i]) {
|
||||
fprintf(stderr, "TEST2: create blob %d failed\n", i);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
ids[i] = handles[i]->id;
|
||||
}
|
||||
|
||||
if (alloc_aligned_buf(&wbuf, cluster) != 0 || alloc_aligned_buf(&rbuf, cluster) != 0) {
|
||||
fprintf(stderr, "TEST2: alloc aligned buffer failed\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (i = 0; i < MULTI_BLOB_COUNT; i++) {
|
||||
fill_pattern((uint8_t *)wbuf, cluster, (uint8_t)(0x20 + i));
|
||||
memset(rbuf, 0, cluster);
|
||||
|
||||
if (blob_write(handles[i], 0, wbuf, cluster) != 0) {
|
||||
fprintf(stderr, "TEST2: blob_write[%d] failed\n", i);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
if (blob_read(handles[i], 0, rbuf, cluster) != 0) {
|
||||
fprintf(stderr, "TEST2: blob_read[%d] failed\n", i);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
if (memcmp(wbuf, rbuf, cluster) != 0) {
|
||||
fprintf(stderr, "TEST2: blob[%d] readback mismatch\n", i);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
for (i = 0; i < MULTI_BLOB_COUNT; i++) {
|
||||
if (handles[i]) {
|
||||
(void)blob_close(handles[i]);
|
||||
}
|
||||
}
|
||||
for (i = 0; i < MULTI_BLOB_COUNT; i++) {
|
||||
if (ids[i] != 0) {
|
||||
(void)blob_delete(ids[i]);
|
||||
}
|
||||
}
|
||||
free(wbuf);
|
||||
free(rbuf);
|
||||
|
||||
if (rc == 0) {
|
||||
printf("[TEST2] PASS\n");
|
||||
return 0;
|
||||
}
|
||||
printf("[TEST2] FAIL\n");
|
||||
return 1;
|
||||
}
|
||||
147
tests/ioengine_test/ioengine_same_blob_mt_test.c
Normal file
147
tests/ioengine_test/ioengine_same_blob_mt_test.c
Normal file
@@ -0,0 +1,147 @@
|
||||
#include <pthread.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "spdk_engine/io_engine.h"
|
||||
#include "test_common.h"
|
||||
|
||||
#define THREAD_COUNT 4
|
||||
|
||||
struct mt_case_arg {
|
||||
struct zvfs_blob_handle *handle;
|
||||
uint64_t cluster_size;
|
||||
uint64_t offset;
|
||||
uint8_t seed;
|
||||
pthread_barrier_t *barrier;
|
||||
int rc;
|
||||
};
|
||||
|
||||
static void *mt_case_worker(void *arg) {
|
||||
struct mt_case_arg *ctx = (struct mt_case_arg *)arg;
|
||||
void *wbuf = NULL;
|
||||
void *rbuf = NULL;
|
||||
|
||||
if (alloc_aligned_buf(&wbuf, ctx->cluster_size) != 0 ||
|
||||
alloc_aligned_buf(&rbuf, ctx->cluster_size) != 0) {
|
||||
free(wbuf);
|
||||
free(rbuf);
|
||||
ctx->rc = 1;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
fill_pattern((uint8_t *)wbuf, ctx->cluster_size, ctx->seed);
|
||||
(void)pthread_barrier_wait(ctx->barrier);
|
||||
|
||||
if (blob_write(ctx->handle, ctx->offset, wbuf, ctx->cluster_size) != 0) {
|
||||
ctx->rc = 1;
|
||||
goto out;
|
||||
}
|
||||
if (blob_read(ctx->handle, ctx->offset, rbuf, ctx->cluster_size) != 0) {
|
||||
ctx->rc = 1;
|
||||
goto out;
|
||||
}
|
||||
if (memcmp(wbuf, rbuf, ctx->cluster_size) != 0) {
|
||||
ctx->rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ctx->rc = 0;
|
||||
|
||||
out:
|
||||
free(wbuf);
|
||||
free(rbuf);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
int rc = 0;
|
||||
const char *bdev_name = getenv("SPDK_BDEV_NAME");
|
||||
int i = 0;
|
||||
struct zvfs_blob_handle *h = NULL;
|
||||
uint64_t blob_id = 0;
|
||||
uint64_t cluster = 0;
|
||||
pthread_t tids[THREAD_COUNT];
|
||||
struct mt_case_arg args[THREAD_COUNT];
|
||||
pthread_barrier_t barrier;
|
||||
int barrier_inited = 0;
|
||||
|
||||
if (!bdev_name) {
|
||||
bdev_name = "Malloc0";
|
||||
}
|
||||
if (io_engine_init(bdev_name) != 0) {
|
||||
fprintf(stderr, "TEST3: io_engine_init failed (bdev=%s)\n", bdev_name);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("[TEST3] multi thread / same blob\n");
|
||||
|
||||
h = blob_create(0);
|
||||
if (!h) {
|
||||
fprintf(stderr, "TEST3: blob_create failed\n");
|
||||
return 1;
|
||||
}
|
||||
blob_id = h->id;
|
||||
cluster = h->size;
|
||||
if (cluster == 0) {
|
||||
fprintf(stderr, "TEST3: invalid cluster size\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
if (blob_resize(h, cluster * THREAD_COUNT) != 0) {
|
||||
fprintf(stderr, "TEST3: blob_resize failed\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (pthread_barrier_init(&barrier, NULL, THREAD_COUNT) != 0) {
|
||||
fprintf(stderr, "TEST3: barrier init failed\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
barrier_inited = 1;
|
||||
|
||||
for (i = 0; i < THREAD_COUNT; i++) {
|
||||
args[i].handle = h;
|
||||
args[i].cluster_size = cluster;
|
||||
args[i].offset = cluster * (uint64_t)i;
|
||||
args[i].seed = (uint8_t)(0x40 + i);
|
||||
args[i].barrier = &barrier;
|
||||
args[i].rc = 1;
|
||||
if (pthread_create(&tids[i], NULL, mt_case_worker, &args[i]) != 0) {
|
||||
fprintf(stderr, "TEST3: pthread_create[%d] failed\n", i);
|
||||
rc = 1;
|
||||
while (--i >= 0) {
|
||||
pthread_join(tids[i], NULL);
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < THREAD_COUNT; i++) {
|
||||
pthread_join(tids[i], NULL);
|
||||
if (args[i].rc != 0) {
|
||||
fprintf(stderr, "TEST3: worker[%d] failed\n", i);
|
||||
rc = 1;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (barrier_inited) {
|
||||
(void)pthread_barrier_destroy(&barrier);
|
||||
}
|
||||
if (h) {
|
||||
(void)blob_close(h);
|
||||
}
|
||||
if (blob_id != 0) {
|
||||
(void)blob_delete(blob_id);
|
||||
}
|
||||
|
||||
if (rc == 0) {
|
||||
printf("[TEST3] PASS\n");
|
||||
return 0;
|
||||
}
|
||||
printf("[TEST3] FAIL\n");
|
||||
return 1;
|
||||
}
|
||||
136
tests/ioengine_test/ioengine_single_blob_test.c
Normal file
136
tests/ioengine_test/ioengine_single_blob_test.c
Normal file
@@ -0,0 +1,136 @@
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "spdk_engine/io_engine.h"
|
||||
#include "test_common.h"
|
||||
|
||||
int main(void) {
|
||||
int rc = 0;
|
||||
const char *bdev_name = getenv("SPDK_BDEV_NAME");
|
||||
struct zvfs_blob_handle *h = NULL;
|
||||
struct zvfs_blob_handle *reopen = NULL;
|
||||
uint64_t blob_id = 0;
|
||||
uint64_t cluster = 0;
|
||||
void *wbuf = NULL;
|
||||
void *rbuf = NULL;
|
||||
|
||||
if (!bdev_name) {
|
||||
bdev_name = "Malloc0";
|
||||
}
|
||||
if (io_engine_init(bdev_name) != 0) {
|
||||
fprintf(stderr, "TEST1: io_engine_init failed (bdev=%s)\n", bdev_name);
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("[TEST1] single thread / single blob\n");
|
||||
|
||||
h = blob_create(0);
|
||||
if (!h) {
|
||||
fprintf(stderr, "TEST1: blob_create failed\n");
|
||||
return 1;
|
||||
}
|
||||
blob_id = h->id;
|
||||
cluster = h->size;
|
||||
if (cluster == 0) {
|
||||
fprintf(stderr, "TEST1: invalid cluster size\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
rc = blob_resize(h, cluster * 2);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "TEST1: blob_resize failed: %d\n", rc);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
rc = alloc_aligned_buf(&wbuf, cluster);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "TEST1: alloc write buf failed: %d\n", rc);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
rc = alloc_aligned_buf(&rbuf, cluster);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "TEST1: alloc read buf failed: %d\n", rc);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
fill_pattern((uint8_t *)wbuf, cluster, 0x11);
|
||||
|
||||
rc = blob_write(h, 0, wbuf, cluster);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "TEST1: blob_write failed: %d\n", rc);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
rc = blob_read(h, 0, rbuf, cluster);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "TEST1: blob_read failed: %d\n", rc);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
if (memcmp(wbuf, rbuf, cluster) != 0) {
|
||||
fprintf(stderr, "TEST1: readback mismatch\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
rc = blob_sync_md(h);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "TEST1: blob_sync_md failed: %d\n", rc);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
rc = blob_close(h);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "TEST1: blob_close failed: %d\n", rc);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
h = NULL;
|
||||
|
||||
reopen = blob_open(blob_id);
|
||||
if (!reopen) {
|
||||
fprintf(stderr, "TEST1: blob_open(reopen) failed\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
memset(rbuf, 0, cluster);
|
||||
rc = blob_read(reopen, 0, rbuf, cluster);
|
||||
if (rc != 0) {
|
||||
fprintf(stderr, "TEST1: reopen blob_read failed: %d\n", rc);
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
if (memcmp(wbuf, rbuf, cluster) != 0) {
|
||||
fprintf(stderr, "TEST1: reopen readback mismatch\n");
|
||||
rc = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
if (reopen) {
|
||||
(void)blob_close(reopen);
|
||||
}
|
||||
if (h) {
|
||||
(void)blob_close(h);
|
||||
}
|
||||
if (blob_id != 0) {
|
||||
(void)blob_delete(blob_id);
|
||||
}
|
||||
free(wbuf);
|
||||
free(rbuf);
|
||||
|
||||
if (rc == 0) {
|
||||
printf("[TEST1] PASS\n");
|
||||
return 0;
|
||||
}
|
||||
printf("[TEST1] FAIL\n");
|
||||
return 1;
|
||||
}
|
||||
20
tests/ioengine_test/test_common.c
Normal file
20
tests/ioengine_test/test_common.c
Normal file
@@ -0,0 +1,20 @@
|
||||
#include "test_common.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Allocate a zero-filled buffer of `len` bytes aligned to 4096 bytes.
 *
 * buf: out-parameter, must not be NULL. On success it receives the new
 *      buffer, which the caller releases with free(). On failure it is set
 *      to NULL so callers may free() it unconditionally.
 * len: requested size in bytes.
 *
 * Returns 0 on success, or the negated posix_memalign() error code.
 */
int alloc_aligned_buf(void **buf, size_t len) {
    void *mem = NULL;
    int rc = posix_memalign(&mem, 4096, len);

    if (rc != 0) {
        /* Do not rely on what posix_memalign() left in the out-pointer. */
        *buf = NULL;
        return -rc;
    }
    /* A zero-length request may legitimately yield NULL; skip memset then. */
    if (mem != NULL && len > 0) {
        memset(mem, 0, len);
    }
    *buf = mem;
    return 0;
}
|
||||
|
||||
/*
 * Fill `buf` with a byte ramp: byte i receives (seed + i) modulo 256.
 *
 * buf:  destination, at least `len` bytes.
 * len:  number of bytes to write; 0 writes nothing.
 * seed: value stored in the first byte.
 */
void fill_pattern(uint8_t *buf, size_t len, uint8_t seed) {
    uint8_t *cursor = buf;
    uint8_t value = seed;
    size_t remaining = len;

    while (remaining > 0) {
        /* uint8_t arithmetic wraps at 256, matching (uint8_t)(seed + i). */
        *cursor++ = value++;
        remaining--;
    }
}
|
||||
10
tests/ioengine_test/test_common.h
Normal file
10
tests/ioengine_test/test_common.h
Normal file
@@ -0,0 +1,10 @@
|
||||
/* Shared helpers for the ioengine tests. */
#ifndef IOENGINE_TEST_COMMON_H
#define IOENGINE_TEST_COMMON_H

#include <stddef.h>
#include <stdint.h>

/*
 * Allocate a zero-filled, 4096-byte-aligned buffer of `len` bytes and store
 * it in *buf. Release the buffer with free().
 * Returns 0 on success, a negative error code on failure.
 */
int alloc_aligned_buf(void **buf, size_t len);

/*
 * Fill `len` bytes of `buf` with a byte ramp starting at `seed`
 * (byte i is seed + i, modulo 256).
 */
void fill_pattern(uint8_t *buf, size_t len, uint8_t seed);

#endif /* IOENGINE_TEST_COMMON_H */
|
||||
1135
zvfs/zvfs.c
1135
zvfs/zvfs.c
File diff suppressed because it is too large
Load Diff
154
zvfs/zvfs.h
154
zvfs/zvfs.h
@@ -1,154 +0,0 @@
|
||||
#ifndef __ZVFS_HOOK_H__
|
||||
#define __ZVFS_HOOK_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <spdk/event.h>
|
||||
#include <spdk/blob.h>
|
||||
#include <spdk/bdev.h>
|
||||
#include <spdk/blob_bdev.h>
|
||||
#include <spdk/env.h>
|
||||
|
||||
/* Capacity of the zvfs_t dirents[] array. */
#define ZVFS_MAX_FILES 1024
/* Capacity of the zvfs_t fd_table[] array. */
#define ZVFS_MAX_FD 64
#define BUFFER_SIZE (1024*8)

/* Globals defined outside this header. */
extern const char *json_file;
extern struct spdk_thread *global_thread;
/* NOTE(review): presumably the upper bound used by waiter(); unit not visible here - confirm. */
static const int WAITER_MAX_TIME = 10000000;
|
||||
|
||||
/* 目录项(内存中的目录) */
|
||||
typedef struct {
|
||||
char filename[256];
|
||||
spdk_blob_id blob_id;
|
||||
uint64_t file_size; // 文件逻辑大小(字节)
|
||||
uint64_t allocated_clusters; // 已分配的cluster数量
|
||||
bool is_valid; // false 表示已删除
|
||||
int32_t open_count; // 打开的文件句柄数量
|
||||
} zvfs_dirent_t;
|
||||
|
||||
/* 文件系统全局结构 */
|
||||
typedef struct zvfs_s {
|
||||
struct spdk_bs_dev *bs_dev;
|
||||
struct spdk_blob_store *bs;
|
||||
struct spdk_io_channel *channel;
|
||||
struct spdk_blob *super_blob; // 承载目录日志的blob
|
||||
uint64_t io_unit_size; // page大小,单位字节
|
||||
|
||||
/* 目录 */
|
||||
zvfs_dirent_t *dirents[ZVFS_MAX_FILES]; // 目录项数组 #define ZVFS_MAX_FILES 1024
|
||||
uint32_t dirent_count; // 当前有效项数
|
||||
|
||||
/* 伪FD表 */
|
||||
struct zvfs_file_s *fd_table[ZVFS_MAX_FD]; // // e.g., #define ZVFS_MAX_FD 64
|
||||
int fd_base; // 伪FD起始值,如1000
|
||||
int openfd_count;
|
||||
|
||||
/* 元数据 */
|
||||
uint32_t magic; // 0x5A563146 (ZV1F)
|
||||
uint32_t version; // 1
|
||||
|
||||
bool bs_dev_owned;
|
||||
int op_errno;
|
||||
|
||||
bool finished;
|
||||
} zvfs_t;
|
||||
|
||||
/* 打开的文件句柄 */
|
||||
typedef struct zvfs_file_s {
|
||||
zvfs_t *fs;
|
||||
spdk_blob_id blob_id;
|
||||
struct spdk_blob *blob;
|
||||
zvfs_dirent_t *dirent; // 指回目录项 file_size/allocated_clusters
|
||||
|
||||
uint64_t current_offset; // 当前读写位置
|
||||
int flags; // O_RDONLY / O_RDWR / O_CREAT 等
|
||||
int pseudo_fd;
|
||||
|
||||
/* 临时DMA缓冲区(可选:每个file一个,避免每次malloc) */
|
||||
void *dma_buf;
|
||||
uint64_t dma_buf_size;
|
||||
|
||||
/* Small-write coalescing buffer in hook layer. */
|
||||
uint8_t *wb_buf;
|
||||
uint64_t wb_base;
|
||||
size_t wb_len;
|
||||
size_t wb_cap;
|
||||
bool wb_valid;
|
||||
|
||||
int op_errno;
|
||||
bool finished;
|
||||
} zvfs_file_t;
|
||||
|
||||
/* Operation selector for an I/O request (type of the `op` field in zvfs_io_req_t). */
typedef enum {
    ZVFS_IO_READ = 0,
    ZVFS_IO_WRITE = 1,
} zvfs_io_op_t;
|
||||
|
||||
typedef struct zvfs_io_req_s {
|
||||
zvfs_file_t *file;
|
||||
zvfs_io_op_t op;
|
||||
uint8_t *buf;
|
||||
size_t len;
|
||||
uint64_t offset;
|
||||
int flags;
|
||||
|
||||
size_t result;
|
||||
int op_errno;
|
||||
bool finished;
|
||||
|
||||
uint64_t lba;
|
||||
uint64_t page_off;
|
||||
uint64_t lba_count;
|
||||
bool aligned;
|
||||
} zvfs_io_req_t;
|
||||
|
||||
bool waiter(struct spdk_thread *thread, spdk_msg_fn start_fn, void *ctx, bool *finished);
|
||||
|
||||
int zvfs_env_setup(void);
|
||||
int zvfs_mount(struct zvfs_s *fs);
|
||||
int zvfs_umount(struct zvfs_s *fs);
|
||||
int zvfs_create(struct zvfs_file_s *file);
|
||||
int zvfs_open(struct zvfs_file_s *file);
|
||||
int zvfs_read(struct zvfs_file_s *file, uint8_t *buffer, size_t count);
|
||||
int zvfs_write(struct zvfs_file_s *file, const uint8_t *buffer, size_t count);
|
||||
int zvfs_pread(struct zvfs_file_s *file, uint8_t *buffer, size_t count, uint64_t offset);
|
||||
int zvfs_pwrite(struct zvfs_file_s *file, const uint8_t *buffer, size_t count, uint64_t offset);
|
||||
int zvfs_close(struct zvfs_file_s *file);
|
||||
int zvfs_delete(struct zvfs_file_s *file);
|
||||
|
||||
/* POSIX hook API(zvfs_hook.c 实现) */
|
||||
int open(const char *path, int flags, ...);
|
||||
int open64(const char *path, int flags, ...);
|
||||
int openat(int dirfd, const char *path, int flags, ...);
|
||||
int openat64(int dirfd, const char *path, int flags, ...);
|
||||
ssize_t read(int fd, void *buf, size_t count);
|
||||
ssize_t write(int fd, const void *buf, size_t count);
|
||||
ssize_t pread(int fd, void *buf, size_t count, off_t offset);
|
||||
ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset);
|
||||
ssize_t pread64(int fd, void *buf, size_t count, off_t offset);
|
||||
ssize_t pwrite64(int fd, const void *buf, size_t count, off_t offset);
|
||||
int close(int fd);
|
||||
int unlink(const char *name);
|
||||
int unlinkat(int dirfd, const char *name, int flags);
|
||||
off_t lseek(int fd, off_t offset, int whence);
|
||||
int fsync(int fd);
|
||||
int fdatasync(int fd);
|
||||
int ftruncate(int fd, off_t length);
|
||||
int fallocate(int fd, int mode, off_t offset, off_t len);
|
||||
int posix_fadvise(int fd, off_t offset, off_t len, int advice);
|
||||
int sync_file_range(int fd, off_t offset, off_t nbytes, unsigned int flags);
|
||||
int mkdir(const char *path, mode_t mode);
|
||||
int rmdir(const char *path);
|
||||
int rename(const char *oldpath, const char *newpath);
|
||||
int access(const char *path, int mode);
|
||||
int fcntl(int fd, int cmd, ...);
|
||||
int stat(const char *path, struct stat *st);
|
||||
int lstat(const char *path, struct stat *st);
|
||||
int fstat(int fd, struct stat *st);
|
||||
int fstatat(int dirfd, const char *path, struct stat *st, int flags);
|
||||
|
||||
#endif
|
||||
2576
zvfs/zvfs_hook.c
2576
zvfs/zvfs_hook.c
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user