From ea64511f958a701fa14b975d65462004c505f56f Mon Sep 17 00:00:00 2001 From: 1iaan <139833683+1iaan@users.noreply.github.com> Date: Mon, 30 Mar 2026 21:17:25 +0800 Subject: [PATCH] =?UTF-8?q?=E6=80=A7=E8=83=BD=E6=B5=8B=E8=AF=95=E7=94=A8?= =?UTF-8?q?=E4=BE=8B=E5=92=8C=E7=BB=93=E6=9E=9C=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + README.md | 203 +++++++++++++++++---- scripts/run_db_bench_zvfs.sh | 8 +- scripts/run_fio_matrix.sh | 262 ++++++++++++++++++++++++++++ scripts/run_pgbench_zvfs.sh | 4 +- scripts/run_test_hook_api.sh | 8 +- src/hook/zvfs_hook_dir.c | 94 ++++++++-- src/hook/zvfs_hook_fd.c | 25 +++ src/hook/zvfs_hook_seek.c | 48 +++-- zvfs_fio_test/sys/prepare_fill.fio | 13 ++ zvfs_fio_test/sys/randread_4k.fio | 15 ++ zvfs_fio_test/sys/randrw_4k.fio | 16 ++ zvfs_fio_test/sys/randwrite_4k.fio | 15 ++ zvfs_fio_test/zvfs/prepare_fill.fio | 13 ++ zvfs_fio_test/zvfs/randread_4k.fio | 15 ++ zvfs_fio_test/zvfs/randrw_4k.fio | 16 ++ zvfs_fio_test/zvfs/randwrite_4k.fio | 15 ++ 17 files changed, 706 insertions(+), 65 deletions(-) create mode 100755 scripts/run_fio_matrix.sh create mode 100644 zvfs_fio_test/sys/prepare_fill.fio create mode 100644 zvfs_fio_test/sys/randread_4k.fio create mode 100644 zvfs_fio_test/sys/randrw_4k.fio create mode 100644 zvfs_fio_test/sys/randwrite_4k.fio create mode 100644 zvfs_fio_test/zvfs/prepare_fill.fio create mode 100644 zvfs_fio_test/zvfs/randread_4k.fio create mode 100644 zvfs_fio_test/zvfs/randrw_4k.fio create mode 100644 zvfs_fio_test/zvfs/randwrite_4k.fio diff --git a/.gitignore b/.gitignore index d9f7176..d540104 100755 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,6 @@ *.so codex/ +results/* tests/bin zvfs_daemon \ No newline at end of file diff --git a/README.md b/README.md index e75d413..998b5ed 100755 --- a/README.md +++ b/README.md @@ -11,6 +11,35 @@ ZVFS 是一个 **透明用户态文件系统原型**,通过 `LD_PRELOAD` 劫 --- +# 测试方案 + +```shell +git clone 
http://gitlab.0voice.com/lianyiheng/zvfs.git +cd zvfs +git submodule update --init --recursive + +cd spdk +sudo ./scripts/pkgdep.sh +sudo ./configure --with-shared +sudo make -j +sudo ./scripts/setup.sh + +cd .. +make && make test + +su root + +# 测试 +mkdir -p /zvfs/zvfs_sys_fio +mkdir -p /tmp/zvfs_sys_fio + +SPDK_JSON_CONFIG="$PWD/src/zvfsnvme.json" ./src/daemon/zvfs_daemon +ZVFS_LD_PRELOAD_VALUE="$PWD/src/libzvfs.so" ./scripts/run_fio_matrix.sh + +# 结果: +results/fio/..../summary.md +``` + # 设计思路 大多数用户态文件系统(如 FUSE)需要修改应用或挂载文件系统。用户态文件系统如果要通过VFS,需要多一到两次额外的用户态/内核态切换。ZVFS 的目标是对应用完全透明:应用按正常方式调用 POSIX API,底层存储路径被悄悄替换掉。 @@ -135,54 +164,89 @@ fcntl fcntl64 ioctl > 注:VMware 模拟 NVMe 无法体现 SPDK 轮询模式对中断驱动 I/O 的延迟优势, > 以下数据用于评估 hook 层与 IPC 的额外开销,不代表真实硬件上的性能对比。 -### 顺序写吞吐 +### fio 4K(psync,30s) -| Block Size | spdk_nvme_perf | ZVFS | -|---|---|---| -| 4K | 100 MiB/s | 94 MiB/s | -| 128K | 1843 MiB/s | 1662 MiB/s | +测试口径: -ZVFS 达到 **SPDK 原生性能约 90%**。 +- `ioengine=psync` +- `direct=1` +- `iodepth=1` +- `bs=4K` +- `time_based=1` +- `runtime=30` +- `size=512M` +- `sys`: 普通文件路径 +- `zvfs`: `LD_PRELOAD=./src/libzvfs.so` ---- +#### prepare_fill 顺序写带宽 -### fio 随机写(16K,psync) +| | sys | ZVFS | +|---|---:|---:| +| 带宽 | 10.92 MiB/s | 14.41 MiB/s | +| disk util | 99.68% | 5.49% | -| | kernel (psync) | ZVFS | -|---|---|---| -| IOPS | 1855 | 1353 | -| 吞吐 | 28.0 MiB/s | 21.2 MiB/s | -| avg clat | 492 µs | 692 µs | -| sys% | 28.6% | 8.4% | +#### randread_4k -> 当前 ZVFS 在该单线程 `psync` 随机写场景下达到 kernel `psync` 的约 73% IOPS。daemon 内部 `SPDK + reply_q` 已收敛到较稳定范围,剩余主要开销集中在 `client -> daemon` 请求进入阶段。 +| | sys | ZVFS | +|---|---:|---:| +| IOPS | 3118.31 | 3685.21 | +| 吞吐 | 12.18 MiB/s | 14.40 MiB/s | +| avg clat | 318.31 µs | 268.91 µs | +| disk util | 99.77% | 0.52% | + +#### randwrite_4k + +| | sys | ZVFS | +|---|---:|---:| +| IOPS | 2883.24 | 3816.78 | +| 吞吐 | 11.26 MiB/s | 14.91 MiB/s | +| avg clat | 344.20 µs | 259.53 µs | +| disk util | 99.80% | 3.97% | + +#### randrw_4k(50/50) + +| | sys | ZVFS | 
+|---|---:|---:| +| 读 IOPS | 1614.29 | 2652.07 | +| 写 IOPS | 1605.60 | 2637.78 | +| 读 avg clat | 306.56 µs | 184.11 µs | +| 写 avg clat | 309.72 µs | 189.44 µs | +| disk util | 99.87% | 2.98% | --- ### WRITE 请求端到端延迟分解(单位 µs) -基于 12 条 `WRITE` trace 样本统计,下面按调用栈层级展开平均耗时。由于四舍五入,父子项相加会有 `±1 µs` 误差。 +```bash +sudo env \ + ZVFS_TRACE_LATENCY=1 \ + LD_PRELOAD="$PWD/src/libzvfs.so" \ + fio ./zvfs_fio_test/zvfs/randwrite_4k.fio 2> /tmp/zvfs.write.trace.log +``` +基于 `/tmp/zvfs.write.trace.log` 中 `107946` 条 `WRITE` trace 样本统计,下面按调用栈层级展开平均耗时。由于四舍五入,父子项相加会有 `±1 µs` 误差。 ```text -total 748 -├─ c2s 317 -│ ├─ send 39 -│ └─ server_rx_wait 278 -├─ server 336 -│ ├─ rx_dispatch 12 -│ ├─ dispatch_spdk 25 -│ ├─ spdk 194 -│ └─ reply_q 103 -│ ├─ spdk_post 11 -│ └─ cq_wait 91 -│ ├─ kick 13 -│ ├─ wake_sched 65 -│ └─ wake_to_tx 12 -└─ s2c 95 - ├─ resp_wait 83 - └─ parse 12 +total 256 +├─ c2s 41 +│ ├─ send 7 +│ └─ server_rx_wait 34 +├─ server 154 +│ ├─ rx_dispatch 0 +│ ├─ dispatch_spdk 5 +│ ├─ spdk 138 +│ │ ├─ phase1 0 +│ │ └─ phase2 138 +│ └─ reply_q 10 +│ ├─ spdk_post 0 +│ └─ cq_wait 10 +│ ├─ kick 1 +│ ├─ wake_sched 8 +│ └─ wake_to_tx 0 +└─ s2c 60 + ├─ resp_wait 60 + └─ parse 0 ``` -当前 `WRITE` 的主要额外开销已经比较清晰:一是 `c2s / server_rx_wait`,二是 `server` 内部的 `spdk` 与 `reply_q`。在 `reply_q` 中,`wake_sched` 已明显大于 `kick` 和 `wake_to_tx`,说明回包路径的主要损耗不在 `eventfd` 写入本身,而在 reactor 被唤醒后的调度等待。 +现在一次 `WRITE` 平均大约 `256 µs`。其中最耗时的是实际存储写入(`spdk`,约 `138 µs`),其次是请求发给 daemon 和结果返回应用这两段通信等待(`c2s` + `s2c`,约 `101 µs`)。回包队列相关开销(`reply_q`)已经压到约 `10 µs`,不再是主要瓶颈。 --- @@ -243,3 +307,74 @@ PostgreSQL tablespace 通过 symbolic link 访问路径: pg_tblspc/xxx ## write 延迟显著高于预期 这次 fio 延迟排查里,最初 `WRITE` 延迟明显高于预期。沿端到端路径加轻量打点后发现问题并不在 SPDK 本体,而是同时叠加了无条件 RMW、VM 中 poller 调度抖动、线程未绑核,以及后期 trace 暴露出来的 reactor 唤醒后核心切换抖动。对应处理是:整块对齐写跳过 read phase、将 reactor/md/io 线程固定到指定 CPU,并把 io 线程数和绑核目标收敛到配置项中。修复后 `dispatch_spdk` 从毫秒级降到几十微秒,`WRITE` 平均延迟也回落到约 700 µs,但剩余尾延迟仍主要表现为请求进入与回包阶段的调度等待。 + +--- + +# 脚本参数 + +以下脚本都支持通过环境变量 `ZVFS_LD_PRELOAD_VALUE` 指定加载的 so 库: + +- 
`scripts/run_fio_matrix.sh` +- `scripts/run_pgbench_zvfs.sh` +- `scripts/run_db_bench_zvfs.sh` +- `scripts/run_test_hook_api.sh` + +示例: + +```bash +sudo env ZVFS_LD_PRELOAD_VALUE="$PWD/src/libzvfs.so" ./scripts/run_fio_matrix.sh +``` + +其他脚本同理: + +```bash +sudo env ZVFS_LD_PRELOAD_VALUE="$PWD/src/libzvfs.so" ./scripts/run_pgbench_zvfs.sh +sudo env ZVFS_LD_PRELOAD_VALUE="$PWD/src/libzvfs.so" ./scripts/run_db_bench_zvfs.sh +env ZVFS_LD_PRELOAD_VALUE="$PWD/src/libzvfs.so" bash ./scripts/run_test_hook_api.sh +``` + +## 延迟 Trace + +`WRITE` / `SYNC_MD` 的端到端阶段打印通过环境变量 `ZVFS_TRACE_LATENCY=1` 打开。 +打印代码在客户端侧 [`src/spdk_engine/io_engine.c`](src/spdk_engine/io_engine.c),输出会写到执行 workload 的进程标准错误。 + +示例: + +```bash +sudo env \ + ZVFS_TRACE_LATENCY=1 \ + LD_PRELOAD="$PWD/src/libzvfs.so" \ + fio ./zvfs_fio_test/zvfs/randwrite_4k.fio 2> /tmp/zvfs.write.trace.log +``` + +筛出 `WRITE` trace: + +```bash +grep '\[zvfs\]\[trace\]\[WRITE\]' /tmp/zvfs.write.trace.log +``` + +筛出 `SYNC_MD` trace: + +```bash +grep '\[zvfs\]\[trace\]\[SYNC_MD\]' /tmp/zvfs.write.trace.log +``` + +单行输出字段包括: + +- `total` +- `c2s` +- `send` +- `server_rx_wait` +- `rx_dispatch` +- `dispatch_spdk` +- `spdk` +- `phase1` +- `phase2` +- `spdk_post` +- `kick` +- `wake_sched` +- `wake_to_tx` +- `reply_q` +- `cq_wait` + +更新 README 中的 `WRITE 请求端到端延迟分解` 时,可对多条 `[zvfs][trace][WRITE]` 日志按字段取平均后再汇总。 diff --git a/scripts/run_db_bench_zvfs.sh b/scripts/run_db_bench_zvfs.sh index 52ee640..96738a9 100755 --- a/scripts/run_db_bench_zvfs.sh +++ b/scripts/run_db_bench_zvfs.sh @@ -1,6 +1,10 @@ #!/usr/bin/env bash set -euo pipefail -env -u LD_PRELOAD=/home/lian/share/zvfs/src/libzvfs.so -rf /zvfs/rocksdb_manual || true + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)"
+LD_PRELOAD_PATH="${ZVFS_LD_PRELOAD_VALUE:-${ROOT_DIR}/src/libzvfs.so}"
+
+env LD_PRELOAD="${LD_PRELOAD_PATH}" rm -rf /zvfs/rocksdb_manual || true
 
 # =========================
 # Manual Config (edit here)
@@ -91,7 +95,7 @@ echo "USE_MMAP_READS=$USE_MMAP_READS USE_MMAP_WRITES=$USE_MMAP_WRITES"
 echo "STATISTICS=$STATISTICS STATS_INTERVAL_SECONDS=$STATS_INTERVAL_SECONDS HISTOGRAM=$HISTOGRAM"
 echo
 
-exec env LD_PRELOAD=/home/lian/share/zvfs/src/libzvfs.so "$DB_BENCH_BIN" \
+exec env LD_PRELOAD="${LD_PRELOAD_PATH}" "$DB_BENCH_BIN" \
   --db="$DB_PATH" \
   --benchmarks="$BENCHMARKS" \
   --num="$NUM" \
diff --git a/scripts/run_fio_matrix.sh b/scripts/run_fio_matrix.sh
new file mode 100755
index 0000000..e40ec5f
--- /dev/null
+++ b/scripts/run_fio_matrix.sh
@@ -0,0 +1,294 @@
+#!/usr/bin/env bash
+#
+# Run every fio workload in both modes -- "sys" (plain file) and "zvfs"
+# (libzvfs.so preloaded) -- and write the fio JSON outputs plus a Markdown
+# summary into results/fio/<timestamp>/.
+#
+# Environment overrides:
+#   FIO_BIN                fio executable (path or command name)
+#   PYTHON_BIN             python3 executable (path or command name)
+#   LD_PRELOAD_PATH        fallback used when ZVFS_LD_PRELOAD_VALUE is unset
+#   ZVFS_LD_PRELOAD_VALUE  LD_PRELOAD value for every zvfs-mode command
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+FIO_CASE_DIR="${ROOT_DIR}/zvfs_fio_test"
+RESULTS_ROOT="${ROOT_DIR}/results/fio"
+TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
+RUN_DIR="${RESULTS_ROOT}/${TIMESTAMP}"
+
+FIO_BIN="${FIO_BIN:-$(command -v fio 2>/dev/null || true)}"
+PYTHON_BIN="${PYTHON_BIN:-$(command -v python3 2>/dev/null || true)}"
+LD_PRELOAD_PATH="${LD_PRELOAD_PATH:-${ROOT_DIR}/src/libzvfs.so}"
+ZVFS_LD_PRELOAD_VALUE="${ZVFS_LD_PRELOAD_VALUE:-${LD_PRELOAD_PATH}}"
+
+# Keep in sync with the filename= entries in zvfs_fio_test/{sys,zvfs}/*.fio.
+SYS_PARENT_DIR="/tmp/zvfs_sys_fio"
+ZVFS_PARENT_DIR="/zvfs/zvfs_sys_fio"
+SYS_TEST_FILE="${SYS_PARENT_DIR}/testfile"
+ZVFS_TEST_FILE="${ZVFS_PARENT_DIR}/testfile"
+
+# prepare_fill comes first: run_case recreates the test file for it, and the
+# random workloads that follow reuse that file.
+WORKLOADS=(
+  "prepare_fill"
+  "randread_4k"
+  "randwrite_4k"
+  "randrw_4k"
+)
+
+MODES=(
+  "sys"
+  "zvfs"
+)
+
+# Exit with an error unless $1 names a runnable command.
+# Arguments: $1 - path or command name (may be empty); $2 - name for the message.
+require_bin() {
+  local bin_path="$1"
+  local name="$2"
+
+  # command -v accepts explicit paths as well as bare names resolved via PATH,
+  # so FIO_BIN=fio is accepted just like FIO_BIN=/usr/bin/fio.
+  if [[ -z "${bin_path}" ]] || ! command -v "${bin_path}" >/dev/null 2>&1; then
+    echo "未找到 ${name},请先安装或通过环境变量指定路径。" >&2
+    exit 1
+  fi
+}
+
+# Create the sys test directory and verify that the ZVFS directory and the
+# preload library exist; exits with an error message otherwise.
+ensure_paths() {
+  mkdir -p "${SYS_PARENT_DIR}"
+
+  if [[ ! -d "${ZVFS_PARENT_DIR}" ]]; then
+    echo "未找到 ZVFS 测试目录:${ZVFS_PARENT_DIR}" >&2
+    echo "请先准备好对应目录,再执行脚本。" >&2
+    exit 1
+  fi
+
+  if [[ -z "${ZVFS_LD_PRELOAD_VALUE}" ]]; then
+    echo "ZVFS_LD_PRELOAD_VALUE 不能为空。" >&2
+    exit 1
+  fi
+
+  # The value may be a colon-separated list; only its first entry is checked.
+  local zvfs_preload_file="${ZVFS_LD_PRELOAD_VALUE%%:*}"
+  if [[ ! -f "${zvfs_preload_file}" ]]; then
+    echo "未找到 libzvfs.so:${zvfs_preload_file}" >&2
+    exit 1
+  fi
+}
+
+# Delete the test file of one mode ($1: "sys" or "zvfs"). The zvfs file is
+# removed with the library preloaded, the same way fio accesses it.
+cleanup_test_file() {
+  local mode="$1"
+
+  if [[ "${mode}" == "zvfs" ]]; then
+    env LD_PRELOAD="${ZVFS_LD_PRELOAD_VALUE}" rm -f -- "${ZVFS_TEST_FILE}"
+  else
+    rm -f -- "${SYS_TEST_FILE}"
+  fi
+}
+
+# Delete the test files of both modes.
+cleanup_all_test_files() {
+  cleanup_test_file "sys"
+  cleanup_test_file "zvfs"
+}
+
+# Run one fio job file and store its JSON output in RUN_DIR.
+# Arguments: $1 - mode ("sys" or "zvfs"); $2 - workload name.
+run_case() {
+  local mode="$1"
+  local workload="$2"
+  local fio_file="${FIO_CASE_DIR}/${mode}/${workload}.fio"
+  local output_file="${RUN_DIR}/${mode}_${workload}.json"
+
+  if [[ ! -f "${fio_file}" ]]; then
+    echo "未找到 fio 配置:${fio_file}" >&2
+    exit 1
+  fi
+
+  echo "[run] mode=${mode} workload=${workload}"
+
+  # Start the fill from scratch so it is not affected by a leftover file.
+  if [[ "${workload}" == "prepare_fill" ]]; then
+    cleanup_test_file "${mode}"
+  fi
+
+  if [[ "${mode}" == "zvfs" ]]; then
+    env LD_PRELOAD="${ZVFS_LD_PRELOAD_VALUE}" \
+      "${FIO_BIN}" --output-format=json --output="${output_file}" "${fio_file}"
+  else
+    "${FIO_BIN}" --output-format=json --output="${output_file}" "${fio_file}"
+  fi
+}
+
+# Parse every <mode>_<workload>.json in RUN_DIR with an embedded Python
+# script, write the Markdown table to summary.md and echo it to stdout.
+write_summary() {
+  local summary_file="${RUN_DIR}/summary.md"
+  local parser_script
+  parser_script="$(cat <<'PY'
+import json
+import os
+import sys
+
+run_dir = sys.argv[1]
+modes = ["sys", "zvfs"]
+workloads = ["prepare_fill", "randread_4k", "randwrite_4k", "randrw_4k"]
+
+def load_job(path):
+    with open(path, "r", encoding="utf-8") as f:
+        content = f.read()
+    start = content.find("{")
+    if start == -1:
+        raise RuntimeError(f"fio output is not JSON: {path}")
+    data = json.loads(content[start:])
+    jobs = data.get("jobs", [])
+    if not jobs:
+        raise RuntimeError(f"fio output has no jobs: {path}")
+    return data, jobs[0]
+
+def mib_per_sec(io_part):
+    value = io_part.get("bw_bytes", 0)
+    if not value:
+        return "-"
+    return f"{value / 1024 / 1024:.2f}"
+
+def iops(io_part):
+    value = io_part.get("iops", 0)
+    if not value:
+        return "-"
+    return f"{value:.2f}"
+
+def lat_us(io_part):
+    clat_ns = io_part.get("clat_ns", {})
+    value = clat_ns.get("mean", 0)
+    if not value:
+        return "-"
+    return f"{value / 1000:.2f}"
+
+def disk_util(data):
+    parts = []
+    for item in data.get("disk_util", []):
+        name = item.get("name")
+        util = item.get("util")
+        if not name or util is None:
+            continue
+        parts.append(f"{name}={util:.2f}%")
+    return ", ".join(parts) if parts else "-"
+
+lines = [
+    "# FIO Summary",
+    "",
+    f"run_dir: `{run_dir}`",
+    "",
+    "| mode | workload | bw(MiB/s) | iops | avg_lat(us) | read_iops | write_iops | read_avg_lat(us) | write_avg_lat(us) | disk_util |",
+    "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |",
+]
+
+for mode in modes:
+    for workload in workloads:
+        path = os.path.join(run_dir, f"{mode}_{workload}.json")
+        data, job = load_job(path)
+        read_part = job.get("read", {})
+        write_part = job.get("write", {})
+        util_col = disk_util(data)
+
+        if workload == "prepare_fill":
+            row = [
+                mode,
+                workload,
+                mib_per_sec(write_part),
+                "-",
+                "-",
+                "-",
+                "-",
+                "-",
+                "-",
+                util_col,
+            ]
+        elif workload == "randread_4k":
+            row = [
+                mode,
+                workload,
+                mib_per_sec(read_part),
+                iops(read_part),
+                lat_us(read_part),
+                "-",
+                "-",
+                "-",
+                "-",
+                util_col,
+            ]
+        elif workload == "randwrite_4k":
+            row = [
+                mode,
+                workload,
+                mib_per_sec(write_part),
+                iops(write_part),
+                lat_us(write_part),
+                "-",
+                "-",
+                "-",
+                "-",
+                util_col,
+            ]
+        else:
+            row = [
+                mode,
+                workload,
+                "-",
+                "-",
+                "-",
+                iops(read_part),
+                iops(write_part),
+                lat_us(read_part),
+                lat_us(write_part),
+                util_col,
+            ]
+
+        lines.append("| " + " | ".join(row) + " |")
+
+print("\n".join(lines))
+PY
+)"
+
+  "${PYTHON_BIN}" -c "${parser_script}" "${RUN_DIR}" > "${summary_file}"
+  echo
+  echo "[summary] ${summary_file}"
+  sed -n '1,200p' "${summary_file}"
+}
+
+# Entry point: validate prerequisites, run the full matrix, then summarize.
+main() {
+  require_bin "${FIO_BIN}" "fio"
+  require_bin "${PYTHON_BIN}" "python3"
+  ensure_paths
+
+  mkdir -p "${RUN_DIR}"
+
+  echo "结果目录:${RUN_DIR}"
+  echo "fio 配置目录:${FIO_CASE_DIR}"
+  echo "LD_PRELOAD:${ZVFS_LD_PRELOAD_VALUE}"
+  echo
+
+  cleanup_all_test_files
+  # From here on remove the test files on every exit path, including an
+  # early abort triggered by set -e when a fio case fails.
+  trap 'cleanup_all_test_files || true' EXIT
+
+  local mode workload
+  for mode in "${MODES[@]}"; do
+    for workload in "${WORKLOADS[@]}"; do
+      run_case "${mode}" "${workload}"
+    done
+  done
+
+  write_summary
+}
+
+main "$@"
diff --git a/scripts/run_pgbench_zvfs.sh b/scripts/run_pgbench_zvfs.sh
index 9421125..b217c26 100755
--- a/scripts/run_pgbench_zvfs.sh
+++ b/scripts/run_pgbench_zvfs.sh
@@ -1,6 +1,8 @@
 #!/usr/bin/env bash
 set -euo pipefail
 
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
 # 简化版 pgbench 测试脚本:
 # 1) 参数都在本文件顶部配置;
 # 2) 直接连接 benchdb;
@@ -21,7 +23,7 @@ PG_INIT_STEPS="dtg"
 PG_SKIP_INIT="0"
 PG_SUPERUSER="postgres"
 USE_LD_PRELOAD="1"
-LD_PRELOAD_PATH="/home/lian/share/zvfs/src/libzvfs.so"
+LD_PRELOAD_PATH="${ZVFS_LD_PRELOAD_VALUE:-${ROOT_DIR}/src/libzvfs.so}"
 
 # 可选:优先使用这个目录;为空时自动从 PATH 里找
 PG_BIN_DIR="/usr/lib/postgresql/12/bin"
diff --git a/scripts/run_test_hook_api.sh b/scripts/run_test_hook_api.sh
index b15d7cf..4562ec0 100644
--- a/scripts/run_test_hook_api.sh
+++ b/scripts/run_test_hook_api.sh
@@ -1 +1,7 @@
-LD_PRELOAD=/home/lian/share/zvfs/src/libzvfs.so ZVFS_TEST_ROOT=/zvfs /home/lian/share/zvfs/tests/bin/hook_api_test
\ No newline at end of file
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.."
&& pwd)" +LD_PRELOAD_PATH="${ZVFS_LD_PRELOAD_VALUE:-${ROOT_DIR}/src/libzvfs.so}" + +LD_PRELOAD="${LD_PRELOAD_PATH}" ZVFS_TEST_ROOT=/zvfs "${ROOT_DIR}/tests/bin/hook_api_test" diff --git a/src/hook/zvfs_hook_dir.c b/src/hook/zvfs_hook_dir.c index 72f209e..c86f0c6 100644 --- a/src/hook/zvfs_hook_dir.c +++ b/src/hook/zvfs_hook_dir.c @@ -14,6 +14,7 @@ #include #include #include +#include #include /* RENAME_EXCHANGE, RENAME_NOREPLACE */ /* ------------------------------------------------------------------ */ @@ -34,22 +35,36 @@ * (inode 和 blob 的清理推迟到 close 路径中 ref_count 归零时) */ static void -zvfs_unlink_path(const char *abspath) +zvfs_unlink_blob_by_id(uint64_t blob_id) +{ + if (blob_id != 0) { + (void)blob_delete(blob_id); + } +} + +static void +zvfs_unlink_path(const char *abspath, uint64_t fallback_blob_id) { /* --- 查 path_cache -------------------------------------------- */ pthread_mutex_lock(&g_fs.path_mu); struct zvfs_path_entry *pe = path_cache_lookup(abspath); - if (!pe) { + struct zvfs_inode *inode = pe ? 
pe->inode : NULL;
+    pthread_mutex_unlock(&g_fs.path_mu);
+
+    if (!inode && fallback_blob_id != 0) {
+        pthread_mutex_lock(&g_fs.inode_mu);
+        inode = inode_lookup(fallback_blob_id);
+        pthread_mutex_unlock(&g_fs.inode_mu);
+    }
+
+    if (!inode) {
         /*
-         * 不在缓存里:该文件可能从未被 open 过(没有 inode 对象)。
-         * 无内存状态需要清理,直接返回。
-         * blob 也不存在(文件从未被 zvfs open 创建),所以安全。
+         * The file may never have been opened by this process, so path_cache and
+         * inode_table are both empty; if the xattr was read before the unlink, its blob must still be deleted.
          */
-        pthread_mutex_unlock(&g_fs.path_mu);
+        zvfs_unlink_blob_by_id(fallback_blob_id);
         return;
     }
-    struct zvfs_inode *inode = pe->inode;
-    pthread_mutex_unlock(&g_fs.path_mu);
 
     /* --- 持 inode->mu 决策 ---------------------------------------- */
     pthread_mutex_lock(&inode->mu);
@@ -63,7 +78,7 @@ zvfs_unlink_path(const char *abspath)
          */
         pthread_mutex_unlock(&inode->mu);
 
-        blob_delete(inode->blob_id);
+        zvfs_unlink_blob_by_id(inode->blob_id);
 
         pthread_mutex_lock(&g_fs.inode_mu);
         inode_remove(inode->blob_id);
@@ -90,6 +105,29 @@ zvfs_unlink_path(const char *abspath)
     }
 }
 
+static int
+zvfs_prefetch_unlink_target(const char *abspath, uint64_t *blob_id_out)
+{
+    int fd;
+    /* Returns 0 with *blob_id_out read from abspath's xattr, else -1 (errno kept); call before the real unlink drops the xattr. */
+    *blob_id_out = 0;
+    /* O_NOFOLLOW: unlink() removes a symlink itself, so never pick up its target's blob_id; O_NONBLOCK: do not hang on a FIFO. */
+    fd = real_open(abspath, O_RDONLY | O_NOFOLLOW | O_NONBLOCK);
+    if (fd < 0) {
+        return -1;
+    }
+
+    if (zvfs_xattr_read_blob_id(fd, blob_id_out) < 0) {
+        int saved = errno;
+        real_close(fd);
+        errno = saved;
+        return -1;
+    }
+
+    real_close(fd);
+    return 0;
+}
+
 /* ------------------------------------------------------------------ */
 /* unlink                                                               */
 /* ------------------------------------------------------------------ */
@@ -99,8 +137,16 @@ unlink(const char *path)
 {
     ZVFS_HOOK_ENTER();
     int ret;
+    char abspath[PATH_MAX];
+    uint64_t blob_id = 0;
+    int is_zvfs_path = 0;
 
-    if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_path(path)) {
+    if (!ZVFS_IN_HOOK() &&
+        zvfs_resolve_atpath(AT_FDCWD, path, abspath, sizeof(abspath)) == 0) {
+        is_zvfs_path = zvfs_is_zvfs_path(abspath);
+    }
+
+    if (ZVFS_IN_HOOK() || !is_zvfs_path) {
         ret = real_unlink(path);
         ZVFS_HOOK_LEAVE();
         return ret;
@@ -108,10 +154,16 @@ unlink(const 
char *path) zvfs_ensure_init(); + if (zvfs_prefetch_unlink_target(abspath, &blob_id) != 0) { + ret = real_unlink(path); + ZVFS_HOOK_LEAVE(); + return ret; + } + /* 先让真实 FS 删除文件(xattr 随之消失) */ ret = real_unlink(path); if (ret == 0) - zvfs_unlink_path(path); + zvfs_unlink_path(abspath, blob_id); ZVFS_HOOK_LEAVE(); return ret; @@ -126,6 +178,7 @@ unlinkat(int dirfd, const char *path, int flags) { ZVFS_HOOK_ENTER(); int ret; + uint64_t blob_id = 0; /* * AT_REMOVEDIR:rmdir 语义,目录由真实 FS 管理,直接透传。 @@ -151,9 +204,15 @@ unlinkat(int dirfd, const char *path, int flags) zvfs_ensure_init(); + if (zvfs_prefetch_unlink_target(abspath, &blob_id) != 0) { + ret = real_unlinkat(dirfd, path, flags); + ZVFS_HOOK_LEAVE(); + return ret; + } + ret = real_unlinkat(dirfd, path, flags); if (ret == 0) - zvfs_unlink_path(abspath); + zvfs_unlink_path(abspath, blob_id); ZVFS_HOOK_LEAVE(); return ret; @@ -170,6 +229,7 @@ remove(const char *path) int ret; char abspath[PATH_MAX]; int is_zvfs_path = 0; + uint64_t blob_id = 0; if (!ZVFS_IN_HOOK() && zvfs_resolve_atpath(AT_FDCWD, path, abspath, sizeof(abspath)) == 0) { @@ -184,13 +244,19 @@ remove(const char *path) zvfs_ensure_init(); + if (zvfs_prefetch_unlink_target(abspath, &blob_id) != 0) { + ret = real_remove(path); + ZVFS_HOOK_LEAVE(); + return ret; + } + ret = real_remove(path); if (ret == 0) { /* * remove 既可删文件也可删空目录。 * 若是目录,path_cache 不会命中,zvfs_unlink_path 会安全 no-op。 */ - zvfs_unlink_path(abspath); + zvfs_unlink_path(abspath, blob_id); } ZVFS_HOOK_LEAVE(); @@ -221,7 +287,7 @@ zvfs_rename_paths(const char *oldabs, const char *newabs) * real_rename 已经把它从磁盘上删掉了, * 走和 unlink 一样的延迟/立即 blob_delete 逻辑。 */ - zvfs_unlink_path(newabs); + zvfs_unlink_path(newabs, 0); } /* 把 oldpath 的缓存条目 rename 到 newpath */ diff --git a/src/hook/zvfs_hook_fd.c b/src/hook/zvfs_hook_fd.c index 17f8efe..706bd30 100644 --- a/src/hook/zvfs_hook_fd.c +++ b/src/hook/zvfs_hook_fd.c @@ -207,12 +207,37 @@ zvfs_open_impl(int real_fd, const char *abspath, int flags, mode_t mode) 
struct zvfs_inode *inode = NULL; uint64_t blob_id = 0; uint64_t handle_id = 0; + int create_new = 0; zvfs_debug_open_log(abspath, NULL, "open_impl enter real_fd=%d path=%s flags=0x%x mode=%#o", real_fd, zvfs_dbg_str(abspath), flags, (unsigned)mode); if (flags & O_CREAT) { + /* + * O_CREAT does not imply the file is newly created. + * fio, for example, may open an existing file with O_CREAT again + * during the worker phase. Only create a new blob when the backing + * file does not already carry a ZVFS blob_id xattr. + */ + if (zvfs_xattr_read_blob_id(real_fd, &blob_id) == 0) { + create_new = 0; + } else if (errno == ENODATA +#ifdef ENOATTR + || errno == ENOATTR +#endif + ) { + create_new = 1; + blob_id = 0; + } else { + zvfs_debug_open_log(abspath, NULL, + "open_impl xattr probe fail errno=%d(%s)", + errno, strerror(errno)); + goto fail; + } + } + + if (create_new) { /* ---- 创建路径 -------------------------------------------- */ /* 1. 创建 blob */ diff --git a/src/hook/zvfs_hook_seek.c b/src/hook/zvfs_hook_seek.c index ac7ba0d..d600b65 100644 --- a/src/hook/zvfs_hook_seek.c +++ b/src/hook/zvfs_hook_seek.c @@ -239,25 +239,36 @@ fallocate(int fd, int mode, off_t offset, off_t len) return -1; } - /* FALLOC_FL_KEEP_SIZE:预分配但不改变文件逻辑大小,直接返回 0 */ - if (mode & FALLOC_FL_KEEP_SIZE) { - ZVFS_HOOK_LEAVE(); - return 0; - } + uint64_t new_end = (uint64_t)offset + (uint64_t)len; + uint64_t alloc_end = new_end; /* - * 普通 fallocate(mode == 0): - * 确保 [offset, offset+len) 范围内的空间被"分配"。 - * zvfs 的语义:把 logical_size 扩展到 max(logical_size, offset+len)。 - * 不提前 blob_resize,因为 SPDK cluster 按写入时分配更高效。 + * ZVFS 读路径会按 blob 当前 cluster 大小做越界检查,因此这里不能只 + * 更新 logical_size/st_size,必须同步把 blob capacity 扩到目标范围。 + * + * 对 KEEP_SIZE: + * 预分配空间,但不改变文件逻辑大小。 + * 对普通 fallocate: + * 预分配空间,并把逻辑大小扩展到 offset + len。 */ - uint64_t new_end = (uint64_t)offset + (uint64_t)len; - pthread_mutex_lock(&of->inode->mu); - if (new_end > of->inode->logical_size) - inode_update_size(of->inode, fd, new_end); + if (alloc_end < 
of->inode->logical_size) + alloc_end = of->inode->logical_size; pthread_mutex_unlock(&of->inode->mu); + if (blob_resize(of->handle_id, alloc_end) < 0) { + errno = EIO; + ZVFS_HOOK_LEAVE(); + return -1; + } + + if ((mode & FALLOC_FL_KEEP_SIZE) == 0) { + pthread_mutex_lock(&of->inode->mu); + if (new_end > of->inode->logical_size) + inode_update_size(of->inode, fd, new_end); + pthread_mutex_unlock(&of->inode->mu); + } + ZVFS_HOOK_LEAVE(); return 0; } @@ -293,6 +304,17 @@ posix_fallocate(int fd, off_t offset, off_t len) if (offset < 0 || len <= 0) { ZVFS_HOOK_LEAVE(); return EINVAL; } uint64_t new_end = (uint64_t)offset + (uint64_t)len; + uint64_t alloc_end = new_end; + + pthread_mutex_lock(&of->inode->mu); + if (alloc_end < of->inode->logical_size) + alloc_end = of->inode->logical_size; + pthread_mutex_unlock(&of->inode->mu); + + if (blob_resize(of->handle_id, alloc_end) < 0) { + ZVFS_HOOK_LEAVE(); + return errno ? errno : EIO; + } pthread_mutex_lock(&of->inode->mu); if (new_end > of->inode->logical_size) diff --git a/zvfs_fio_test/sys/prepare_fill.fio b/zvfs_fio_test/sys/prepare_fill.fio new file mode 100644 index 0000000..e5f24b8 --- /dev/null +++ b/zvfs_fio_test/sys/prepare_fill.fio @@ -0,0 +1,13 @@ +[global] +ioengine=psync +direct=1 +thread=1 +group_reporting=1 +iodepth=1 +bs=4k +size=512M +filename=/tmp/zvfs_sys_fio/testfile + +[prepare_fill] +description=Sequential prefill for bandwidth +rw=write diff --git a/zvfs_fio_test/sys/randread_4k.fio b/zvfs_fio_test/sys/randread_4k.fio new file mode 100644 index 0000000..2359bb0 --- /dev/null +++ b/zvfs_fio_test/sys/randread_4k.fio @@ -0,0 +1,15 @@ +[global] +ioengine=psync +direct=1 +thread=1 +group_reporting=1 +iodepth=1 +bs=4k +size=512M +time_based=1 +runtime=30 +filename=/tmp/zvfs_sys_fio/testfile + +[randread_4k] +description=4K random read for IOPS and average latency +rw=randread diff --git a/zvfs_fio_test/sys/randrw_4k.fio b/zvfs_fio_test/sys/randrw_4k.fio new file mode 100644 index 0000000..55499d5 --- 
/dev/null +++ b/zvfs_fio_test/sys/randrw_4k.fio @@ -0,0 +1,16 @@ +[global] +ioengine=psync +direct=1 +thread=1 +group_reporting=1 +iodepth=1 +bs=4k +size=512M +time_based=1 +runtime=30 +filename=/tmp/zvfs_sys_fio/testfile + +[randrw_4k] +description=4K random read write for per direction IOPS and average latency +rw=randrw +rwmixread=50 diff --git a/zvfs_fio_test/sys/randwrite_4k.fio b/zvfs_fio_test/sys/randwrite_4k.fio new file mode 100644 index 0000000..4676a95 --- /dev/null +++ b/zvfs_fio_test/sys/randwrite_4k.fio @@ -0,0 +1,15 @@ +[global] +ioengine=psync +direct=1 +thread=1 +group_reporting=1 +iodepth=1 +bs=4k +size=512M +time_based=1 +runtime=30 +filename=/tmp/zvfs_sys_fio/testfile + +[randwrite_4k] +description=4K random write for IOPS and average latency +rw=randwrite diff --git a/zvfs_fio_test/zvfs/prepare_fill.fio b/zvfs_fio_test/zvfs/prepare_fill.fio new file mode 100644 index 0000000..c3dbf96 --- /dev/null +++ b/zvfs_fio_test/zvfs/prepare_fill.fio @@ -0,0 +1,13 @@ +[global] +ioengine=psync +direct=1 +thread=1 +group_reporting=1 +iodepth=1 +bs=4k +size=512M +filename=/zvfs/zvfs_sys_fio/testfile + +[prepare_fill] +description=Sequential prefill for bandwidth +rw=write diff --git a/zvfs_fio_test/zvfs/randread_4k.fio b/zvfs_fio_test/zvfs/randread_4k.fio new file mode 100644 index 0000000..270ac81 --- /dev/null +++ b/zvfs_fio_test/zvfs/randread_4k.fio @@ -0,0 +1,15 @@ +[global] +ioengine=psync +direct=1 +thread=1 +group_reporting=1 +iodepth=1 +bs=4k +size=512M +time_based=1 +runtime=30 +filename=/zvfs/zvfs_sys_fio/testfile + +[randread_4k] +description=4K random read for IOPS and average latency +rw=randread diff --git a/zvfs_fio_test/zvfs/randrw_4k.fio b/zvfs_fio_test/zvfs/randrw_4k.fio new file mode 100644 index 0000000..3416914 --- /dev/null +++ b/zvfs_fio_test/zvfs/randrw_4k.fio @@ -0,0 +1,16 @@ +[global] +ioengine=psync +direct=1 +thread=1 +group_reporting=1 +iodepth=1 +bs=4k +size=512M +time_based=1 +runtime=30 +filename=/zvfs/zvfs_sys_fio/testfile + 
+[randrw_4k] +description=4K random read write for per direction IOPS and average latency +rw=randrw +rwmixread=50 diff --git a/zvfs_fio_test/zvfs/randwrite_4k.fio b/zvfs_fio_test/zvfs/randwrite_4k.fio new file mode 100644 index 0000000..3151670 --- /dev/null +++ b/zvfs_fio_test/zvfs/randwrite_4k.fio @@ -0,0 +1,15 @@ +[global] +ioengine=psync +direct=1 +thread=1 +group_reporting=1 +iodepth=1 +bs=4k +size=512M +time_based=1 +runtime=30 +filename=/zvfs/zvfs_sys_fio/testfile + +[randwrite_4k] +description=4K random write for IOPS and average latency +rw=randwrite