Files
zvfs/src/hook/zvfs_hook_fd.c
2026-04-14 07:40:56 +00:00

1558 lines
45 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "zvfs_hook_fd.h"
#include "zvfs_hook_init.h"
#include "zvfs_hook_reentrant.h"
#include "fs/zvfs.h"
#include "fs/zvfs_inode.h"
#include "fs/zvfs_path_entry.h"
#include "fs/zvfs_open_file.h"
#include "spdk_engine/io_engine.h"
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <limits.h>
#include <pthread.h>
#include <stdio.h>
#define zvfs_debug_open_log(...) ((void)0)
/* close 路径辅助:在文件后半段实现。 */
static int zvfs_detach_fd_mapping(int fd, int do_sync_md);
/* ------------------------------------------------------------------ */
/* 内部:路径判定辅助 */
/* ------------------------------------------------------------------ */
/**
 * zvfs_classify_path - decide whether an absolute path belongs to zvfs.
 *
 * A plain test on the path text is not enough: an open may pass through a
 * symlink that lands under /zvfs and would then go unnoticed. The checks
 * are applied in this order:
 *   1. abspath itself is accepted by zvfs_is_zvfs_path();
 *   2. realpath(abspath) is accepted by zvfs_is_zvfs_path();
 *   3. if the caller may create the file and realpath() failed (the target
 *      does not exist yet), resolve the parent directory and test
 *      "<resolved parent>/<last component>".
 *
 * @abspath:        absolute path to classify.
 * @may_create:     non-zero when the open carries O_CREAT.
 * @normalized_out: receives a copy of abspath, or the resolved path when
 *                  step 2 or step 3 matched.
 * @out_size:       capacity of normalized_out in bytes.
 *
 * Returns 1 when the path is a zvfs path, 0 otherwise (also on bad
 * arguments).
 */
static int
zvfs_classify_path(const char *abspath, int may_create,
                   char *normalized_out, size_t out_size)
{
    char resolved[PATH_MAX];
    char tmp[PATH_MAX];
    char parent[PATH_MAX];
    char candidate[PATH_MAX];
    const char *name;
    char *slash;
    int n;

    if (!abspath || !normalized_out || out_size == 0) {
        return 0;
    }

    /* Default: hand back the path exactly as given (truncated if needed). */
    strncpy(normalized_out, abspath, out_size);
    normalized_out[out_size - 1] = '\0';

    if (zvfs_is_zvfs_path(abspath)) {
        return 1;
    }

    if (realpath(abspath, resolved) != NULL) {
        if (zvfs_is_zvfs_path(resolved)) {
            strncpy(normalized_out, resolved, out_size);
            normalized_out[out_size - 1] = '\0';
            return 1;
        }
        return 0;
    }

    /* The target does not resolve; only a creating open can still match. */
    if (!may_create) {
        return 0;
    }

    strncpy(tmp, abspath, sizeof(tmp));
    tmp[sizeof(tmp) - 1] = '\0';

    slash = strrchr(tmp, '/');
    if (!slash) {
        return 0;
    }
    name = slash + 1;
    if (*name == '\0') {
        /* Trailing slash: there is no final component to create. */
        return 0;
    }

    if (slash == tmp) {
        strcpy(parent, "/");
    } else {
        *slash = '\0';
        strncpy(parent, tmp, sizeof(parent));
        parent[sizeof(parent) - 1] = '\0';
    }

    if (realpath(parent, resolved) == NULL) {
        return 0;
    }

    /*
     * When the parent resolves to the root directory, resolved is "/".
     * Joining with "%s/%s" would yield "//name", which names the same file
     * but is a different string for prefix tests and cache keys.
     */
    if (strcmp(resolved, "/") == 0) {
        n = snprintf(candidate, sizeof(candidate), "/%s", name);
    } else {
        n = snprintf(candidate, sizeof(candidate), "%s/%s", resolved, name);
    }
    if (n <= 0 || (size_t)n >= sizeof(candidate)) {
        return 0;
    }

    if (!zvfs_is_zvfs_path(candidate)) {
        return 0;
    }

    strncpy(normalized_out, candidate, out_size);
    normalized_out[out_size - 1] = '\0';
    return 1;
}
/* ------------------------------------------------------------------ */
/* Internal: fopen mode parsing */
/* ------------------------------------------------------------------ */
/*
 * zvfs_parse_fopen_mode - translate an fopen() mode string into open() flags.
 *
 * The first character selects the base behaviour ('r', 'w' or 'a'). The
 * characters after it, up to the end of the string or the first ',', are
 * scanned for '+', 'x' and 'e'; everything else is ignored.
 *
 * @mode:            fopen-style mode string.
 * @extra_flags:     flags OR-ed into the result unconditionally.
 * @flags_out:       receives the resulting open() flags.
 * @create_mode_out: receives the creation mode, always 0666.
 *
 * Returns 0 on success. Returns -1 with errno = EINVAL when an argument is
 * NULL, the mode is empty, or its first character is not r/w/a; the output
 * parameters are left untouched in that case.
 */
static int
zvfs_parse_fopen_mode(const char *mode, int extra_flags, int *flags_out, mode_t *create_mode_out)
{
    const char *c;
    int update = 0;     /* '+' seen: open for reading and writing */
    int modifiers = 0;  /* O_EXCL / O_CLOEXEC collected from 'x' / 'e' */
    int result;

    if (mode == NULL || mode[0] == '\0' || flags_out == NULL || create_mode_out == NULL) {
        errno = EINVAL;
        return -1;
    }

    /* Creation/positioning bits implied by the leading character. */
    if (mode[0] == 'r') {
        result = 0;
    } else if (mode[0] == 'w') {
        result = O_CREAT | O_TRUNC;
    } else if (mode[0] == 'a') {
        result = O_CREAT | O_APPEND;
    } else {
        errno = EINVAL;
        return -1;
    }

    for (c = &mode[1]; *c != '\0' && *c != ','; c++) {
        switch (*c) {
        case '+':
            update = 1;
            break;
        case 'x':
            modifiers |= O_EXCL;
            break;
        case 'e':
            modifiers |= O_CLOEXEC;
            break;
        default:
            break;
        }
    }

    /* Access mode: '+' upgrades to read/write, otherwise one direction. */
    if (update) {
        result |= O_RDWR;
    } else if (mode[0] == 'r') {
        result |= O_RDONLY;
    } else {
        result |= O_WRONLY;
    }

    *flags_out = result | modifiers | extra_flags;
    *create_mode_out = 0666;
    return 0;
}
/*
 * zvfs_sanitize_fdopen_mode - reduce an fopen() mode string to the subset
 * handed to fdopen(): the leading character, then 'b' and '+' if present.
 *
 * Only the characters after the first one and before the first ',' are
 * inspected; every modifier other than 'b' and '+' is dropped.
 *
 * @mode: original mode string; NULL or "" yields "r".
 * @out:  4-byte buffer that receives the NUL-terminated result.
 */
static void
zvfs_sanitize_fdopen_mode(const char *mode, char out[4])
{
    const char *c;
    char *dst = out;
    int want_update = 0;
    int want_binary = 0;

    /* Fallback result, also used for a missing or empty mode. */
    out[0] = 'r';
    out[1] = '\0';
    if (mode == NULL || mode[0] == '\0') {
        return;
    }

    for (c = &mode[1]; *c != '\0' && *c != ','; c++) {
        switch (*c) {
        case '+':
            want_update = 1;
            break;
        case 'b':
            want_binary = 1;
            break;
        default:
            break;
        }
    }

    /* At most three characters are written, so the NUL fits in out[3]. */
    *dst++ = mode[0];
    if (want_binary) {
        *dst++ = 'b';
    }
    if (want_update) {
        *dst++ = '+';
    }
    *dst = '\0';
}
/* ------------------------------------------------------------------ */
/* Internal: core open logic (path already resolved to an absolute path) */
/* ------------------------------------------------------------------ */
/**
 * zvfs_open_impl - attach zvfs state to a file that real_open*() has just
 *                  opened, for a path already classified as zvfs.
 *
 * @real_fd: fd returned by real_open*(); used for xattr access, ftruncate
 *           and fstat, and returned to the caller as the user-visible fd.
 * @abspath: absolute path used as the path_cache key.
 * @flags:   flags passed to open().
 * @mode:    permission bits for a newly created file (0 falls back to 0666).
 *
 * Returns real_fd on success. When the file is not being created, the path
 * is not cached and its blob_id xattr cannot be read, real_fd is returned
 * without any zvfs wrapping (plain pass-through).
 * Returns -1 with errno set on failure; the caller must real_close(real_fd).
 *
 * Failure cleanup undoes only what this call did: it closes a blob handle
 * it opened, drops a reference it took on a shared inode, and removes and
 * frees an inode it allocated. A shared inode is never freed here.
 */
static int
zvfs_open_impl(int real_fd, const char *abspath, int flags, mode_t mode)
{
    struct zvfs_inode *inode = NULL;
    struct zvfs_open_file *of = NULL;
    uint64_t blob_id = 0;
    uint64_t handle_id = 0;
    int create_new = 0;
    int handle_open = 0;     /* handle_id came from a successful blob_create/blob_open */
    int inode_is_new = 0;    /* inode was allocated and published by this call */
    int inode_ref_taken = 0; /* this call incremented ref_count of a shared inode */
    int saved_errno;

    zvfs_debug_open_log(abspath, NULL,
                        "open_impl enter real_fd=%d path=%s flags=0x%x mode=%#o",
                        real_fd, zvfs_dbg_str(abspath), flags, (unsigned)mode);

    if (flags & O_CREAT) {
        /*
         * O_CREAT does not imply the file is newly created.
         * fio, for example, may open an existing file with O_CREAT again
         * during the worker phase. Only create a new blob when the backing
         * file does not already carry a ZVFS blob_id xattr.
         */
        if (zvfs_xattr_read_blob_id(real_fd, &blob_id) == 0) {
            create_new = 0;
        } else if (errno == ENODATA
#ifdef ENOATTR
                   || errno == ENOATTR
#endif
        ) {
            create_new = 1;
            blob_id = 0;
        } else {
            zvfs_debug_open_log(abspath, NULL,
                                "open_impl xattr probe fail errno=%d(%s)",
                                errno, strerror(errno));
            goto fail;
        }
    }

    if (create_new) {
        /* ---- create path ------------------------------------------- */
        /* 1. Create the blob. */
        if (blob_create(0, flags, &blob_id, &handle_id) != 0) {
            int saved = errno;
            if (saved == 0) saved = EIO;
            fprintf(stderr,
                    "[zvfs] create blob failed path=%s flags=0x%x errno=%d(%s)\n",
                    abspath, flags, saved, strerror(saved));
            zvfs_debug_open_log(abspath, NULL,
                                "create branch blob_create fail errno=%d(%s)",
                                saved, strerror(saved));
            errno = saved;
            goto fail;
        }
        handle_open = 1;
        zvfs_debug_open_log(abspath, NULL,
                            "create branch blob_create ok blob_id=%lu handle_id=%lu",
                            (unsigned long)blob_id, (unsigned long)handle_id);

        /* 2. Record blob_id in the real file's xattr. */
        if (zvfs_xattr_write_blob_id(real_fd, blob_id) < 0) {
            zvfs_debug_open_log(abspath, NULL,
                                "create branch xattr_write fail errno=%d(%s)",
                                errno, strerror(errno));
            goto fail;
        }
        zvfs_debug_open_log(abspath, NULL, "create branch xattr_write ok");

        /* 3. logical_size = 0, so make st_size 0 as well. */
        if (real_ftruncate(real_fd, 0) < 0) {
            zvfs_debug_open_log(abspath, NULL,
                                "create branch real_ftruncate(0) fail errno=%d(%s)",
                                errno, strerror(errno));
            goto fail;
        }
        zvfs_debug_open_log(abspath, NULL, "create branch real_ftruncate(0) ok");

        /* 4. Allocate the inode. */
        inode = inode_alloc(blob_id, mode ? mode : 0666, ZVFS_ITYPE_FILE);
        if (!inode) {
            errno = ENOMEM;
            zvfs_debug_open_log(abspath, NULL, "create branch inode_alloc fail ENOMEM");
            goto fail;
        }
        inode_is_new = 1;

        /* 5. Publish it in the global inode table. */
        pthread_mutex_lock(&g_fs.inode_mu);
        inode_insert(inode);
        pthread_mutex_unlock(&g_fs.inode_mu);

        /* 6. Publish it in the path cache. */
        pthread_mutex_lock(&g_fs.path_mu);
        path_cache_insert(abspath, inode);
        pthread_mutex_unlock(&g_fs.path_mu);
        zvfs_debug_open_log(abspath, NULL,
                            "create branch inode/path_cache inserted logical_size=%lu",
                            (unsigned long)inode->logical_size);
    } else {
        /* ---- open-existing path ------------------------------------ */
        struct zvfs_path_entry *pe;

        /* 1. A path_cache hit means another fd already opened this path. */
        pthread_mutex_lock(&g_fs.path_mu);
        pe = path_cache_lookup(abspath);
        if (pe) inode = pe->inode;
        pthread_mutex_unlock(&g_fs.path_mu);

        if (inode) {
            zvfs_debug_open_log(abspath, NULL,
                                "open existing path_cache hit inode_blob_id=%lu",
                                (unsigned long)inode->blob_id);
            /* Reuse the cached inode and open a fresh blob handle. */
            blob_id = inode->blob_id;
            if (blob_open(blob_id, flags, &handle_id) != 0) {
                if (errno == 0) errno = EIO;
                zvfs_debug_open_log(abspath, NULL,
                                    "open existing path_cache-hit blob_open fail errno=%d(%s)",
                                    errno, strerror(errno));
                goto fail;
            }
            handle_open = 1;
            /* Shared inode: take a reference. */
            atomic_fetch_add(&inode->ref_count, 1);
            inode_ref_taken = 1;
            zvfs_debug_open_log(abspath, NULL,
                                "open existing path_cache-hit blob_open ok handle_id=%lu",
                                (unsigned long)handle_id);
        } else {
            zvfs_debug_open_log(abspath, NULL, "open existing path_cache miss");
            /* Cache miss: read blob_id from the xattr. */
            if (zvfs_xattr_read_blob_id(real_fd, &blob_id) < 0) {
                /* No readable xattr: not managed by zvfs, pass through. */
                return real_fd;
            }
            zvfs_debug_open_log(abspath, NULL,
                                "open existing xattr_read ok blob_id=%lu",
                                (unsigned long)blob_id);

            /* The inode may be known under a path that is not cached. */
            pthread_mutex_lock(&g_fs.inode_mu);
            inode = inode_lookup(blob_id);
            pthread_mutex_unlock(&g_fs.inode_mu);

            if (inode) {
                zvfs_debug_open_log(abspath, NULL,
                                    "open existing inode_table hit blob_id=%lu",
                                    (unsigned long)blob_id);
                if (blob_open(blob_id, flags, &handle_id) != 0) {
                    if (errno == 0) errno = EIO;
                    zvfs_debug_open_log(abspath, NULL,
                                        "open existing inode_table-hit blob_open fail errno=%d(%s)",
                                        errno, strerror(errno));
                    goto fail;
                }
                handle_open = 1;
                atomic_fetch_add(&inode->ref_count, 1);
                inode_ref_taken = 1;
                zvfs_debug_open_log(abspath, NULL,
                                    "open existing inode_table-hit blob_open ok handle_id=%lu",
                                    (unsigned long)handle_id);
            } else {
                /* Brand-new inode: take mode and size from the real file. */
                struct stat st;
                if (zvfs_real_fstat(real_fd, &st) < 0) {
                    zvfs_debug_open_log(abspath, NULL,
                                        "open existing fstat fail errno=%d(%s)",
                                        errno, strerror(errno));
                    goto fail;
                }
                inode = inode_alloc(blob_id, st.st_mode, ZVFS_ITYPE_FILE);
                if (!inode) {
                    errno = ENOMEM;
                    zvfs_debug_open_log(abspath, NULL,
                                        "open existing inode_alloc fail ENOMEM");
                    goto fail;
                }
                inode_is_new = 1;
                inode->logical_size = (uint64_t)st.st_size;

                pthread_mutex_lock(&g_fs.inode_mu);
                inode_insert(inode);
                pthread_mutex_unlock(&g_fs.inode_mu);

                pthread_mutex_lock(&g_fs.path_mu);
                path_cache_insert(abspath, inode);
                pthread_mutex_unlock(&g_fs.path_mu);

                if (blob_open(blob_id, flags, &handle_id) != 0) {
                    if (errno == 0) errno = EIO;
                    zvfs_debug_open_log(abspath, NULL,
                                        "open existing new-inode blob_open fail errno=%d(%s)",
                                        errno, strerror(errno));
                    goto fail;
                }
                handle_open = 1;
                zvfs_debug_open_log(abspath, NULL,
                                    "open existing new-inode ready handle_id=%lu logical_size=%lu",
                                    (unsigned long)handle_id,
                                    (unsigned long)inode->logical_size);
            }
        }
    }

    /* ---- allocate the open-file record and publish it in fd_table ---- */
    of = openfile_alloc(real_fd, inode, handle_id);
    if (!of) {
        errno = ENOMEM;
        goto fail;
    }
    pthread_mutex_lock(&g_fs.fd_mu);
    openfile_insert(of);
    pthread_mutex_unlock(&g_fs.fd_mu);

    zvfs_debug_open_log(abspath, NULL,
                        "open_impl success real_fd=%d handle_id=%lu inode_blob_id=%lu",
                        real_fd,
                        (unsigned long)handle_id,
                        (unsigned long)(inode ? inode->blob_id : 0));
    return real_fd;

fail:
    /* Cleanup calls below may overwrite errno; report the original cause. */
    saved_errno = errno;
    zvfs_debug_open_log(abspath, NULL,
                        "open_impl fail errno=%d(%s) real_fd=%d",
                        errno, strerror(errno), real_fd);

    /* Close the blob handle on every failure path that obtained one. */
    if (handle_open && handle_id != 0) {
        blob_close(handle_id);
    }

    /* Give back the reference taken on an inode shared with other fds. */
    if (inode_ref_taken) {
        atomic_fetch_sub(&inode->ref_count, 1);
    }

    /*
     * Roll back only an inode allocated by this call. Testing
     * ref_count == 1 instead would also match a shared inode held by
     * exactly one other fd and free it underneath that fd.
     */
    if (inode_is_new) {
        pthread_mutex_lock(&g_fs.inode_mu);
        inode_remove(inode->blob_id);
        pthread_mutex_unlock(&g_fs.inode_mu);

        pthread_mutex_lock(&g_fs.path_mu);
        path_cache_remove(abspath);
        pthread_mutex_unlock(&g_fs.path_mu);

        inode_free(inode);
    }

    errno = saved_errno;
    return -1;
}
/* ------------------------------------------------------------------ */
/* open */
/* ------------------------------------------------------------------ */
/*
 * open - hooked open(2).
 *
 * Paths that do not classify as zvfs, paths that fail to resolve, and
 * re-entrant calls made from inside another hook go straight to
 * real_open(). For a zvfs path the file is first opened on the real
 * filesystem and then handed to zvfs_open_impl(); if that fails the real fd
 * is closed again and -1 is returned with errno preserved.
 *
 * The optional mode argument is read when O_CREAT or O_TMPFILE is set, the
 * two cases in which open(2) requires the caller to supply it.
 */
int
open(const char *path, int flags, ...)
{
    ZVFS_HOOK_ENTER();
    char abspath[PATH_MAX];
    char normpath[PATH_MAX];
    abspath[0] = '\0';
    normpath[0] = '\0';
    int is_zvfs_path = 0;
    mode_t mode = 0;

    /* O_TMPFILE is a multi-bit value, so compare against the full mask. */
    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;
        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }

    if (zvfs_resolve_atpath(AT_FDCWD, path, abspath, sizeof(abspath)) == 0) {
        is_zvfs_path = zvfs_classify_path(abspath, (flags & O_CREAT) != 0,
                                          normpath, sizeof(normpath));
        zvfs_debug_open_log(path, abspath,
                            "open resolve ok path=%s abspath=%s norm=%s flags=0x%x is_zvfs=%d",
                            zvfs_dbg_str(path), zvfs_dbg_str(abspath),
                            zvfs_dbg_str(normpath), flags, is_zvfs_path);
    } else {
        zvfs_debug_open_log(path, NULL,
                            "open resolve fail path=%s flags=0x%x errno=%d(%s)",
                            zvfs_dbg_str(path), flags, errno, strerror(errno));
    }

    int ret;
    if (ZVFS_IN_HOOK() || !is_zvfs_path) {
        zvfs_debug_open_log(path, abspath,
                            "open passthrough reason=%s path=%s flags=0x%x",
                            ZVFS_IN_HOOK() ? "reentrant" : "non-zvfs",
                            zvfs_dbg_str(path), flags);
        ret = real_open(path, flags, mode);
        zvfs_debug_open_log(path, abspath,
                            "open passthrough ret=%d errno=%d(%s)",
                            ret, (ret < 0) ? errno : 0, (ret < 0) ? strerror(errno) : "OK");
        ZVFS_HOOK_LEAVE();
        return ret;
    }

    zvfs_ensure_init();

    /* Let the real filesystem create/open the file first to get real_fd. */
    int real_fd = real_open(path, flags, mode);
    if (real_fd < 0) {
        zvfs_debug_open_log(path, abspath,
                            "open real_open fail path=%s flags=0x%x errno=%d(%s)",
                            zvfs_dbg_str(path), flags, errno, strerror(errno));
        ZVFS_HOOK_LEAVE();
        return -1;
    }
    zvfs_debug_open_log(path, abspath,
                        "open real_open ok real_fd=%d path=%s norm=%s",
                        real_fd, zvfs_dbg_str(path), zvfs_dbg_str(normpath));

    ret = zvfs_open_impl(real_fd, normpath, flags, mode);
    if (ret < 0) {
        int saved = errno;
        real_close(real_fd);
        errno = saved;
        zvfs_debug_open_log(path, abspath,
                            "open zvfs_open_impl fail real_fd=%d errno=%d(%s)",
                            real_fd, saved, strerror(saved));
    } else {
        zvfs_debug_open_log(path, abspath,
                            "open zvfs_open_impl success fd=%d", ret);
    }

    ZVFS_HOOK_LEAVE();
    return ret;
}
/*
 * open64 - large-file variant of open(): forwards to the hooked open() with
 * O_LARGEFILE added. The optional mode argument is read when O_CREAT or
 * O_TMPFILE is set, the two cases in which open(2) requires it.
 */
int open64(const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open(path, flags | O_LARGEFILE, mode);
}
/* ------------------------------------------------------------------ */
/* openat */
/* ------------------------------------------------------------------ */
/*
 * openat - hooked openat(2).
 *
 * Paths that do not classify as zvfs, paths that fail to resolve, and
 * re-entrant calls made from inside another hook go straight to
 * real_openat(), so the kernel decides the result and errno. This matches
 * what open() and fopen() in this file do on a resolve failure. For a zvfs
 * path the file is opened on the real filesystem first and then handed to
 * zvfs_open_impl(); if that fails the real fd is closed again and -1 is
 * returned with errno preserved.
 *
 * The optional mode argument is read when O_CREAT or O_TMPFILE is set, the
 * two cases in which openat(2) requires the caller to supply it.
 */
int
openat(int dirfd, const char *path, int flags, ...)
{
    ZVFS_HOOK_ENTER();
    char normpath[PATH_MAX];
    char abspath[PATH_MAX];
    normpath[0] = '\0';
    abspath[0] = '\0';
    int is_zvfs_path = 0;
    mode_t mode = 0;

    /* O_TMPFILE is a multi-bit value, so compare against the full mask. */
    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;
        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }

    /* Resolve to an absolute path to decide whether it belongs to zvfs. */
    if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) == 0) {
        is_zvfs_path = zvfs_classify_path(abspath, (flags & O_CREAT) != 0,
                                          normpath, sizeof(normpath));
        zvfs_debug_open_log(path, abspath,
                            "openat resolve ok dirfd=%d path=%s abspath=%s norm=%s flags=0x%x is_zvfs=%d",
                            dirfd, zvfs_dbg_str(path), zvfs_dbg_str(abspath),
                            zvfs_dbg_str(normpath), flags, is_zvfs_path);
    } else {
        /* Unresolvable: fall through to the pass-through branch below. */
        zvfs_debug_open_log(path, NULL,
                            "openat resolve fail dirfd=%d path=%s flags=0x%x errno=%d(%s)",
                            dirfd, zvfs_dbg_str(path), flags, errno, strerror(errno));
    }

    int ret;
    if (ZVFS_IN_HOOK() || !is_zvfs_path) {
        zvfs_debug_open_log(path, abspath,
                            "openat passthrough reason=%s dirfd=%d path=%s flags=0x%x",
                            ZVFS_IN_HOOK() ? "reentrant" : "non-zvfs",
                            dirfd, zvfs_dbg_str(path), flags);
        ret = real_openat(dirfd, path, flags, mode);
        zvfs_debug_open_log(path, abspath,
                            "openat passthrough ret=%d errno=%d(%s)",
                            ret, (ret < 0) ? errno : 0, (ret < 0) ? strerror(errno) : "OK");
        ZVFS_HOOK_LEAVE();
        return ret;
    }

    zvfs_ensure_init();

    int real_fd = real_openat(dirfd, path, flags, mode);
    if (real_fd < 0) {
        zvfs_debug_open_log(path, abspath,
                            "openat real_openat fail dirfd=%d path=%s flags=0x%x errno=%d(%s)",
                            dirfd, zvfs_dbg_str(path), flags, errno, strerror(errno));
        ZVFS_HOOK_LEAVE();
        return -1;
    }
    zvfs_debug_open_log(path, abspath,
                        "openat real_openat ok real_fd=%d dirfd=%d path=%s norm=%s",
                        real_fd, dirfd, zvfs_dbg_str(path), zvfs_dbg_str(normpath));

    ret = zvfs_open_impl(real_fd, normpath, flags, mode);
    if (ret < 0) {
        int saved = errno;
        real_close(real_fd);
        errno = saved;
        zvfs_debug_open_log(path, abspath,
                            "openat zvfs_open_impl fail real_fd=%d errno=%d(%s)",
                            real_fd, saved, strerror(saved));
    } else {
        zvfs_debug_open_log(path, abspath,
                            "openat zvfs_open_impl success fd=%d", ret);
    }

    ZVFS_HOOK_LEAVE();
    return ret;
}
/*
 * openat64 - large-file variant of openat(): forwards to the hooked
 * openat() with O_LARGEFILE added. The optional mode argument is read when
 * O_CREAT or O_TMPFILE is set, the two cases in which openat(2) requires it.
 */
int openat64(int dirfd, const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return openat(dirfd, path, flags | O_LARGEFILE, mode);
}
/* ------------------------------------------------------------------ */
/* fopen / fopen64 */
/* ------------------------------------------------------------------ */
/*
 * zvfs_fopen_common - shared body of the fopen()/fopen64() hooks.
 *
 * @path:             file to open.
 * @mode:             fopen-style mode string.
 * @extra_open_flags: extra open() flags (fopen64 passes O_LARGEFILE).
 * @use_fopen64:      non-zero to prefer real_fopen64() when passing through.
 *
 * An unparseable mode, a non-zvfs or unresolvable path, or a re-entrant
 * call is forwarded to the real fopen/fopen64 (errno = ENOSYS if neither
 * pointer is set). For a zvfs path the file is opened with real_open(),
 * registered through zvfs_open_impl(), and wrapped into a FILE with fdopen.
 *
 * Returns the stream, or NULL with errno set.
 */
static FILE *
zvfs_fopen_common(const char *path, const char *mode, int extra_open_flags, int use_fopen64)
{
    char abspath[PATH_MAX];
    char normpath[PATH_MAX];
    char fdopen_mode[4];
    int is_zvfs_path = 0;
    int flags = 0;
    mode_t create_mode = 0666;
    int real_fd = -1;
    FILE *fp = NULL;

    abspath[0] = '\0';
    normpath[0] = '\0';

    if (zvfs_parse_fopen_mode(mode, extra_open_flags, &flags, &create_mode) != 0) {
        /* Let the real implementation judge modes this parser rejects. */
        if (use_fopen64 && real_fopen64) return real_fopen64(path, mode);
        if (real_fopen) return real_fopen(path, mode);
        errno = ENOSYS;
        return NULL;
    }

    if (zvfs_resolve_atpath(AT_FDCWD, path, abspath, sizeof(abspath)) == 0) {
        is_zvfs_path = zvfs_classify_path(abspath, (flags & O_CREAT) != 0,
                                          normpath, sizeof(normpath));
        zvfs_debug_open_log(path, abspath,
                            "fopen resolve ok path=%s mode=%s norm=%s flags=0x%x is_zvfs=%d",
                            zvfs_dbg_str(path), zvfs_dbg_str(mode),
                            zvfs_dbg_str(normpath), flags, is_zvfs_path);
    } else {
        zvfs_debug_open_log(path, NULL,
                            "fopen resolve fail path=%s mode=%s errno=%d(%s)",
                            zvfs_dbg_str(path), zvfs_dbg_str(mode), errno, strerror(errno));
    }

    if (ZVFS_IN_HOOK() || !is_zvfs_path) {
        if (use_fopen64 && real_fopen64) return real_fopen64(path, mode);
        if (real_fopen) return real_fopen(path, mode);
        errno = ENOSYS;
        return NULL;
    }

    zvfs_ensure_init();

    real_fd = real_open(path, flags, create_mode);
    if (real_fd < 0) {
        return NULL;
    }

    if (zvfs_open_impl(real_fd, normpath, flags, create_mode) < 0) {
        int saved = errno;
        real_close(real_fd);
        errno = saved;
        return NULL;
    }
    zvfs_debug_open_log(path, normpath,
                        "fopen mapped-after-open_impl fd=%d mapped=%d",
                        real_fd, zvfs_debug_has_fd_mapping(real_fd));

    zvfs_sanitize_fdopen_mode(mode, fdopen_mode);
    if (real_fdopen) {
        fp = real_fdopen(real_fd, fdopen_mode);
    } else {
        fp = fdopen(real_fd, fdopen_mode);
    }
    if (!fp) {
        int saved = errno;
        /*
         * Undo the zvfs registration explicitly instead of calling the
         * hooked close(): this code runs inside the fopen hook, and the
         * close() hook skips its bookkeeping when ZVFS_IN_HOOK() is true,
         * which would leave the fd mapping, blob handle and inode reference
         * behind. Detaching is harmless when zvfs_open_impl() passed the fd
         * through unregistered (it then fails with EBADF, ignored here).
         */
        (void)zvfs_detach_fd_mapping(real_fd, 1);
        (void)real_close(real_fd);
        errno = saved;
        return NULL;
    }
    zvfs_debug_open_log(path, normpath,
                        "fopen mapped-after-fdopen fd=%d mapped=%d",
                        real_fd, zvfs_debug_has_fd_mapping(real_fd));
    return fp;
}
/*
 * fopen - hooked fopen(3); delegates to zvfs_fopen_common() with no extra
 * open flags and the non-64 pass-through variant.
 */
FILE *
fopen(const char *path, const char *mode)
{
    ZVFS_HOOK_ENTER();
    FILE *stream;

    stream = zvfs_fopen_common(path, mode, 0, 0);
    ZVFS_HOOK_LEAVE();
    return stream;
}
/*
 * fopen64 - hooked fopen64(); delegates to zvfs_fopen_common() with
 * O_LARGEFILE and the 64-bit pass-through variant.
 */
FILE *
fopen64(const char *path, const char *mode)
{
    ZVFS_HOOK_ENTER();
    FILE *stream;

    stream = zvfs_fopen_common(path, mode, O_LARGEFILE, 1);
    ZVFS_HOOK_LEAVE();
    return stream;
}
/*
 * fclose - hooked fclose(3).
 *
 * Streams whose fd is not tracked by zvfs, and re-entrant calls, go
 * straight to real_fclose(). For a tracked fd the stream is closed with
 * real_fclose() and the zvfs mapping for that fd is then released.
 *
 * Returns 0 on success. Returns -1 with errno = EINVAL for a NULL stream,
 * ENOSYS when real_fclose is unavailable, the real_fclose() error if it
 * failed, or otherwise the bookkeeping error.
 */
int
fclose(FILE *stream)
{
    ZVFS_HOOK_ENTER();
    int fd = -1;
    int tracked = 0;
    int close_rc;
    int close_errno = 0;
    int detach_failed = 0;
    int detach_errno = 0;

    if (stream == NULL) {
        errno = EINVAL;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    /* The fd must be captured before the stream is destroyed. */
    if (!ZVFS_IN_HOOK()) {
        fd = fileno(stream);
        tracked = (fd >= 0 && zvfs_is_zvfs_fd(fd)) ? 1 : 0;
    }

    if (real_fclose == NULL) {
        errno = ENOSYS;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    if (ZVFS_IN_HOOK() || !tracked) {
        close_rc = real_fclose(stream);
        ZVFS_HOOK_LEAVE();
        return close_rc;
    }

    zvfs_ensure_init();

    close_rc = real_fclose(stream);
    if (close_rc < 0) {
        close_errno = errno;
    }

    /*
     * Release the zvfs bookkeeping whether or not real_fclose() reported an
     * error: some libc implementations may already have closed the
     * underlying fd even when they return EOF.
     */
    if (zvfs_detach_fd_mapping(fd, 1) < 0) {
        detach_failed = 1;
        detach_errno = errno;
    }

    /* The stream error takes precedence over the bookkeeping error. */
    if (close_rc < 0 || detach_failed) {
        errno = (close_rc < 0) ? close_errno : detach_errno;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    ZVFS_HOOK_LEAVE();
    return 0;
}
/* ------------------------------------------------------------------ */
/* creat */
/* ------------------------------------------------------------------ */
/* creat - same as the hooked open() with O_CREAT | O_WRONLY | O_TRUNC. */
int creat(const char *path, mode_t mode)
{
    const int creat_flags = O_CREAT | O_WRONLY | O_TRUNC;

    return open(path, creat_flags, mode);
}
/* creat64 - creat() with O_LARGEFILE added, routed through the hooked open(). */
int creat64(const char *path, mode_t mode)
{
    const int creat_flags = O_CREAT | O_WRONLY | O_TRUNC | O_LARGEFILE;

    return open(path, creat_flags, mode);
}
/* ------------------------------------------------------------------ */
/* glibc 别名 */
/* ------------------------------------------------------------------ */
/*
 * __open - glibc alias; forwards to the hooked open(). The optional mode
 * argument is read when O_CREAT or O_TMPFILE is set, the two cases in which
 * open(2) requires it.
 */
int __open(const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open(path, flags, mode);
}
/*
 * __open64 - glibc alias; forwards to the hooked open64(). The optional
 * mode argument is read when O_CREAT or O_TMPFILE is set, the two cases in
 * which open(2) requires it.
 */
int __open64(const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open64(path, flags, mode);
}
/*
 * __openat - glibc alias; forwards to the hooked openat(). The optional
 * mode argument is read when O_CREAT or O_TMPFILE is set, the two cases in
 * which openat(2) requires it.
 */
int __openat(int dirfd, const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return openat(dirfd, path, flags, mode);
}
/*
 * __openat64 - glibc alias; forwards to the hooked openat64(). The optional
 * mode argument is read when O_CREAT or O_TMPFILE is set, the two cases in
 * which openat(2) requires it.
 */
int __openat64(int dirfd, const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return openat64(dirfd, path, flags, mode);
}
/*
 * __libc_open - glibc alias; forwards to the hooked open(). The optional
 * mode argument is read when O_CREAT or O_TMPFILE is set, the two cases in
 * which open(2) requires it.
 */
int __libc_open(const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open(path, flags, mode);
}
/*
 * __libc_open64 - glibc alias; forwards to the hooked open64(). The
 * optional mode argument is read when O_CREAT or O_TMPFILE is set, the two
 * cases in which open(2) requires it.
 */
int __libc_open64(const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open64(path, flags, mode);
}
/*
 * __libc_openat - glibc alias; forwards to the hooked openat(). The
 * optional mode argument is read when O_CREAT or O_TMPFILE is set, the two
 * cases in which openat(2) requires it.
 */
int __libc_openat(int dirfd, const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return openat(dirfd, path, flags, mode);
}
/*
 * __libc_openat64 - glibc alias; forwards to the hooked openat64(). The
 * optional mode argument is read when O_CREAT or O_TMPFILE is set, the two
 * cases in which openat(2) requires it.
 */
int __libc_openat64(int dirfd, const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return openat64(dirfd, path, flags, mode);
}
/*
 * __open_2 - two-argument open entry point (no mode argument).
 * Flags that would need a mode (O_CREAT or O_TMPFILE) are rejected with
 * EINVAL; everything else is forwarded to the hooked open().
 */
int __open_2(const char *path, int flags)
{
    const int wants_create = (flags & O_CREAT) != 0;
    const int wants_tmpfile = (flags & O_TMPFILE) == O_TMPFILE;

    if (!wants_create && !wants_tmpfile) {
        zvfs_debug_open_log(path, NULL,
                            "__open_2 called path=%s flags=0x%x",
                            zvfs_dbg_str(path), flags);
        return open(path, flags);
    }

    errno = EINVAL;
    return -1;
}
/*
 * __open64_2 - two-argument open64 entry point (no mode argument).
 * Flags that would need a mode (O_CREAT or O_TMPFILE) are rejected with
 * EINVAL; everything else is forwarded to the hooked open64().
 */
int __open64_2(const char *path, int flags)
{
    const int wants_create = (flags & O_CREAT) != 0;
    const int wants_tmpfile = (flags & O_TMPFILE) == O_TMPFILE;

    if (!wants_create && !wants_tmpfile) {
        return open64(path, flags);
    }

    errno = EINVAL;
    return -1;
}
/*
 * __openat_2 - three-argument openat entry point (no mode argument).
 * Flags that would need a mode (O_CREAT or O_TMPFILE) are rejected with
 * EINVAL; everything else is forwarded to the hooked openat().
 */
int __openat_2(int dirfd, const char *path, int flags)
{
    const int wants_create = (flags & O_CREAT) != 0;
    const int wants_tmpfile = (flags & O_TMPFILE) == O_TMPFILE;

    if (!wants_create && !wants_tmpfile) {
        zvfs_debug_open_log(path, NULL,
                            "__openat_2 called dirfd=%d path=%s flags=0x%x",
                            dirfd, zvfs_dbg_str(path), flags);
        return openat(dirfd, path, flags);
    }

    errno = EINVAL;
    return -1;
}
/*
 * __openat64_2 - three-argument openat64 entry point (no mode argument).
 * Flags that would need a mode (O_CREAT or O_TMPFILE) are rejected with
 * EINVAL; everything else is forwarded to the hooked openat64().
 */
int __openat64_2(int dirfd, const char *path, int flags)
{
    const int wants_create = (flags & O_CREAT) != 0;
    const int wants_tmpfile = (flags & O_TMPFILE) == O_TMPFILE;

    if (!wants_create && !wants_tmpfile) {
        return openat64(dirfd, path, flags);
    }

    errno = EINVAL;
    return -1;
}
/*
 * __open_nocancel - glibc alias; forwards to the hooked open(). The
 * optional mode argument is read when O_CREAT or O_TMPFILE is set, the two
 * cases in which open(2) requires it.
 */
int __open_nocancel(const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open(path, flags, mode);
}
/*
 * __open64_nocancel - glibc alias; forwards to the hooked open64(). The
 * optional mode argument is read when O_CREAT or O_TMPFILE is set, the two
 * cases in which open(2) requires it.
 */
int __open64_nocancel(const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return open64(path, flags, mode);
}
/*
 * __openat_nocancel - glibc alias; forwards to the hooked openat(). The
 * optional mode argument is read when O_CREAT or O_TMPFILE is set, the two
 * cases in which openat(2) requires it.
 */
int __openat_nocancel(int dirfd, const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return openat(dirfd, path, flags, mode);
}
/*
 * __openat64_nocancel - glibc alias; forwards to the hooked openat64(). The
 * optional mode argument is read when O_CREAT or O_TMPFILE is set, the two
 * cases in which openat(2) requires it.
 */
int __openat64_nocancel(int dirfd, const char *path, int flags, ...)
{
    mode_t mode = 0;

    if ((flags & O_CREAT) || (flags & O_TMPFILE) == O_TMPFILE) {
        va_list ap;

        va_start(ap, flags);
        mode = (mode_t)va_arg(ap, unsigned int);
        va_end(ap);
    }
    return openat64(dirfd, path, flags, mode);
}
/* ------------------------------------------------------------------ */
/* close */
/* ------------------------------------------------------------------ */
/*
 * zvfs_release_openfile - release the zvfs resources behind one open-file
 * record. Only zvfs bookkeeping is done here; real_close(fd) is not called.
 *
 * @of:         record already removed from fd_table; freed by this call.
 * @do_sync_md: non-zero to call blob_sync_md() before closing the handle.
 *
 * The blob handle is closed, the inode reference is dropped and, when that
 * was the last reference, the inode is torn down (including the deferred
 * blob_delete for an inode marked deleted).
 *
 * Returns 0 on success, or -1 with errno set to the first error met.
 */
static int
zvfs_release_openfile(struct zvfs_open_file *of, int do_sync_md)
{
    struct zvfs_inode *ino = of->inode;
    const uint64_t hid = of->handle_id;
    int first_err = 0;

    /* Everything needed from the record was copied out above. */
    openfile_free(of);

    if (hid != 0) {
        if (do_sync_md && blob_sync_md(hid) < 0) {
            first_err = (errno != 0) ? errno : EIO;
        }
        if (blob_close(hid) < 0 && first_err == 0) {
            first_err = (errno != 0) ? errno : EIO;
        }
    }

    /* ---- drop the inode reference ---------------------------------- */
    if (atomic_fetch_sub(&ino->ref_count, 1) - 1 == 0) {
        /*
         * The last fd on this inode is gone. If the inode was marked
         * deleted, perform the deferred blob_delete now.
         */
        bool unlinked = false;

        pthread_mutex_lock(&ino->mu);
        unlinked = ino->deleted;
        pthread_mutex_unlock(&ino->mu);

        if (unlinked) {
            if (blob_delete(ino->blob_id) < 0 && first_err == 0) {
                first_err = (errno != 0) ? errno : EIO;
            }
        }

        pthread_mutex_lock(&g_fs.inode_mu);
        inode_remove(ino->blob_id);
        pthread_mutex_unlock(&g_fs.inode_mu);

        if (!unlinked) {
            /*
             * Per the original note, unlink already removed the path_cache
             * entry of a deleted inode. For a normal close the entry still
             * points at the inode about to be freed, so it has to go too or
             * it would dangle.
             */
            struct zvfs_path_entry *entry, *next;
            (void)next;

            pthread_mutex_lock(&g_fs.path_mu);
            /* One inode maps to one path (hardlinks are not supported). */
            HASH_ITER(hh, g_fs.path_cache, entry, next) {
                if (entry->inode == ino) {
                    HASH_DEL(g_fs.path_cache, entry);
                    free(entry->path);
                    free(entry);
                    break; /* one-to-one: stop at the first match */
                }
            }
            pthread_mutex_unlock(&g_fs.path_mu);
        }

        inode_free(ino);
    }

    if (first_err == 0) {
        return 0;
    }
    errno = first_err;
    return -1;
}
/*
 * zvfs_detach_fd_mapping - remove the fd -> open-file mapping and release
 * the zvfs resources behind it. real_close(fd) is NOT called; dup2/dup3 use
 * this to clean up the previous occupant of newfd.
 *
 * Returns the result of zvfs_release_openfile(), or -1 with errno = EBADF
 * when fd has no mapping.
 */
static int
zvfs_detach_fd_mapping(int fd, int do_sync_md)
{
    struct zvfs_open_file *entry;

    pthread_mutex_lock(&g_fs.fd_mu);
    entry = openfile_lookup(fd);
    if (entry != NULL) {
        openfile_remove(fd);
    }
    pthread_mutex_unlock(&g_fs.fd_mu);

    if (entry == NULL) {
        errno = EBADF;
        return -1;
    }

    /* The record is out of the table; release it outside the lock. */
    return zvfs_release_openfile(entry, do_sync_md);
}
/*
 * zvfs_close_impl - zvfs path of close(fd): release the bookkeeping first,
 * then real_close(fd).
 *
 * Returns 0 on success. If real_close() fails, -1 is returned with its
 * errno; otherwise a bookkeeping failure is reported as -1 with the errno
 * captured from zvfs_detach_fd_mapping().
 */
static int
zvfs_close_impl(int fd)
{
    int detach_failed = 0;
    int detach_errno = 0;

    if (zvfs_detach_fd_mapping(fd, 1) < 0) {
        detach_failed = 1;
        detach_errno = errno;
    }

    /* The real fd is closed even when the bookkeeping failed. */
    if (real_close(fd) < 0) {
        return -1;
    }

    if (detach_failed) {
        errno = detach_errno;
        return -1;
    }
    return 0;
}
/*
 * close - hooked close(2). Re-entrant calls and fds not tracked by zvfs go
 * to real_close(); tracked fds go through zvfs_close_impl().
 */
int
close(int fd)
{
    ZVFS_HOOK_ENTER();
    int rc;

    if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(fd)) {
        rc = real_close(fd);
    } else {
        zvfs_ensure_init();
        rc = zvfs_close_impl(fd);
    }

    ZVFS_HOOK_LEAVE();
    return rc;
}
/* __close - alias that routes to the hooked close(). */
int __close(int fd)
{
    const int rc = close(fd);

    return rc;
}
/* __libc_close - alias that routes to the hooked close(). */
int __libc_close(int fd)
{
    const int rc = close(fd);

    return rc;
}
/* __close_nocancel - alias that routes to the hooked close(). */
int __close_nocancel(int fd)
{
    const int rc = close(fd);

    return rc;
}
/* ------------------------------------------------------------------ */
/* dup helper */
/* ------------------------------------------------------------------ */
/*
 * zvfs_dup_attach_newfd - register newfd as a duplicate of the zvfs fd
 * oldfd: same inode, same blob handle, one extra reference on each.
 *
 * @oldfd:        existing zvfs fd.
 * @newfd:        fd number already produced by the real dup call.
 * @new_fd_flags: fd flags for the new record; a negative value copies the
 *                flags of oldfd.
 *
 * Returns 0 on success. Returns -1 with errno = EBADF (negative fd or
 * oldfd not tracked), EEXIST (newfd already tracked), the allocation error
 * (ENOMEM if none was set), or whatever blob_add_ref() left in errno.
 */
int
zvfs_dup_attach_newfd(int oldfd, int newfd, int new_fd_flags)
{
    struct zvfs_open_file *src;
    struct zvfs_open_file *dst;
    int err;

    if (oldfd < 0 || newfd < 0) {
        errno = EBADF;
        return -1;
    }

    pthread_mutex_lock(&g_fs.fd_mu);

    src = openfile_lookup(oldfd);
    if (src == NULL) {
        err = EBADF;
        goto unlock_fail;
    }
    if (openfile_lookup(newfd) != NULL) {
        err = EEXIST;
        goto unlock_fail;
    }

    if (blob_add_ref(src->handle_id, 1) != 0) {
        /* errno is left exactly as blob_add_ref() set it. */
        pthread_mutex_unlock(&g_fs.fd_mu);
        return -1;
    }

    dst = openfile_alloc(newfd, src->inode, src->handle_id);
    if (dst == NULL) {
        err = (errno != 0) ? errno : ENOMEM;
        /* Undo the handle reference taken just above. */
        (void)blob_close(src->handle_id);
        goto unlock_fail;
    }

    if (new_fd_flags >= 0) {
        dst->fd_flags = new_fd_flags;
    } else {
        dst->fd_flags = src->fd_flags;
    }
    atomic_fetch_add(&src->inode->ref_count, 1);
    openfile_insert(dst);

    pthread_mutex_unlock(&g_fs.fd_mu);
    return 0;

unlock_fail:
    pthread_mutex_unlock(&g_fs.fd_mu);
    errno = err;
    return -1;
}
/*
 * zvfs_add_ref_batch_or_fallback - add references to a set of blob handles.
 *
 * @handle_ids: handles to reference.
 * @ref_deltas: number of references to add to each handle.
 * @count:      number of entries in both arrays.
 *
 * The batch call is tried first; if it fails the handles are processed one
 * by one. When the one-by-one pass fails part-way, the references it has
 * already added are released again so that a failure leaves nothing behind.
 * NOTE(review): assumes a failed blob_add_ref_batch() applied no references
 * itself; confirm against the engine.
 *
 * Returns 0 on success, -1 on failure with errno as left by blob_add_ref().
 */
static int
zvfs_add_ref_batch_or_fallback(const uint64_t *handle_ids,
                               const uint32_t *ref_deltas,
                               uint32_t count)
{
    uint32_t i;

    if (count == 0)
        return 0;

    if (blob_add_ref_batch(handle_ids, ref_deltas, count) == 0)
        return 0;

    for (i = 0; i < count; i++) {
        if (blob_add_ref(handle_ids[i], ref_deltas[i]) != 0) {
            int saved = errno;

            /*
             * Roll back entries [0, i). One blob_close() undoes one
             * reference, the same pairing the other rollback paths in this
             * file use; handle 0 is skipped as it is there.
             */
            while (i-- > 0) {
                uint32_t k;

                if (handle_ids[i] == 0)
                    continue;
                for (k = 0; k < ref_deltas[i]; k++)
                    (void)blob_close(handle_ids[i]);
            }
            errno = saved;
            return -1;
        }
    }
    return 0;
}
/*
 * zvfs_rollback_added_refs - release one reference on every non-zero handle
 * in handle_ids[0..count). Errors from blob_close() are ignored.
 */
static void
zvfs_rollback_added_refs(const uint64_t *handle_ids, uint32_t count)
{
    const uint64_t *cur = handle_ids;
    uint32_t remaining = count;

    while (remaining > 0) {
        if (*cur != 0) {
            (void)blob_close(*cur);
        }
        cur++;
        remaining--;
    }
}
/*
 * zvfs_snapshot_fd_handles - copy the blob handle of every fd_table entry
 * into freshly allocated arrays, taken under fd_mu.
 *
 * @handle_ids_out: receives the handle array (NULL when the table is empty).
 * @ref_deltas_out: receives a parallel array filled with 1.
 * @count_out:      receives the number of entries written.
 *
 * The caller frees both arrays. Returns 0 on success, or -1 with
 * errno = ENOMEM when an allocation fails.
 */
static int
zvfs_snapshot_fd_handles(uint64_t **handle_ids_out,
                         uint32_t **ref_deltas_out,
                         uint32_t *count_out)
{
    struct zvfs_open_file *entry, *next;
    uint64_t *ids = NULL;
    uint32_t *deltas = NULL;
    uint32_t total;
    uint32_t filled = 0;

    *handle_ids_out = NULL;
    *ref_deltas_out = NULL;
    *count_out = 0;

    pthread_mutex_lock(&g_fs.fd_mu);

    total = (uint32_t)HASH_COUNT(g_fs.fd_table);
    if (total != 0) {
        ids = calloc(total, sizeof(*ids));
        deltas = calloc(total, sizeof(*deltas));
        if (ids == NULL || deltas == NULL) {
            pthread_mutex_unlock(&g_fs.fd_mu);
            free(ids);
            free(deltas);
            errno = ENOMEM;
            return -1;
        }

        HASH_ITER(hh, g_fs.fd_table, entry, next) {
            /* Never write past the arrays sized from HASH_COUNT. */
            if (filled >= total)
                break;
            ids[filled] = entry->handle_id;
            deltas[filled] = 1;
            filled++;
        }
    }

    pthread_mutex_unlock(&g_fs.fd_mu);

    *handle_ids_out = ids;
    *ref_deltas_out = deltas;
    *count_out = filled;
    return 0;
}
/*
 * zvfs_snapshot_fds_in_range - collect the fd numbers of all fd_table
 * entries that fall inside [first, last], taken under fd_mu.
 *
 * @fds_out:   receives a newly allocated array (NULL when the table is
 *             empty); the caller frees it.
 * @count_out: receives the number of fds stored in the array.
 *
 * Returns 0 on success, or -1 with errno = ENOMEM when allocation fails.
 */
static int
zvfs_snapshot_fds_in_range(unsigned int first, unsigned int last,
                           int **fds_out, uint32_t *count_out)
{
    struct zvfs_open_file *entry, *next;
    int *matches = NULL;
    uint32_t capacity;
    uint32_t found = 0;

    *fds_out = NULL;
    *count_out = 0;

    pthread_mutex_lock(&g_fs.fd_mu);

    capacity = (uint32_t)HASH_COUNT(g_fs.fd_table);
    if (capacity == 0) {
        pthread_mutex_unlock(&g_fs.fd_mu);
        return 0;
    }

    /* Sized for the whole table, an upper bound on the matches. */
    matches = calloc(capacity, sizeof(*matches));
    if (matches == NULL) {
        pthread_mutex_unlock(&g_fs.fd_mu);
        errno = ENOMEM;
        return -1;
    }

    HASH_ITER(hh, g_fs.fd_table, entry, next) {
        if (entry->fd >= 0 &&
            (unsigned int)entry->fd >= first &&
            (unsigned int)entry->fd <= last) {
            matches[found++] = entry->fd;
        }
    }

    pthread_mutex_unlock(&g_fs.fd_mu);

    *fds_out = matches;
    *count_out = found;
    return 0;
}
/* ------------------------------------------------------------------ */
/* close_range */
/* ------------------------------------------------------------------ */
/* Fallback for libc headers that do not define the flag. */
#ifndef CLOSE_RANGE_CLOEXEC
#define CLOSE_RANGE_CLOEXEC (1U << 2)
#endif

/*
 * close_range - hooked close_range(2).
 *
 * Re-entrant calls go straight to real_close_range(). Otherwise:
 *  - with CLOSE_RANGE_CLOEXEC nothing is closed (the flag only marks fds
 *    close-on-exec), so the zvfs state is left alone and the request is
 *    forwarded to real_close_range();
 *  - without it, every zvfs fd in [first, last] is closed through
 *    zvfs_close_impl() and the remaining fds are left to
 *    real_close_range(), or to a per-fd loop when that is unavailable.
 *
 * Returns 0 on success, or -1 when first > last (EINVAL), when no real
 * implementation can honour the request (ENOSYS), or when any close failed.
 */
int
close_range(unsigned int first, unsigned int last, int flags)
{
    ZVFS_HOOK_ENTER();
    if (ZVFS_IN_HOOK()) {
        int ret = real_close_range ? real_close_range(first, last, flags)
                                   : (errno = ENOSYS, -1);
        ZVFS_HOOK_LEAVE();
        return ret;
    }

    if (first > last) {
        errno = EINVAL;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    /*
     * CLOSE_RANGE_CLOEXEC asks the kernel to set close-on-exec instead of
     * closing. Closing zvfs fds here would destroy descriptors the caller
     * still owns, so only forward the request. The per-fd fallback below
     * cannot express this flag, hence ENOSYS without real_close_range.
     * NOTE(review): the fd_flags kept in zvfs open-file records are not
     * updated on this path; confirm whether any reader depends on them.
     */
    if ((unsigned int)flags & CLOSE_RANGE_CLOEXEC) {
        int ret = real_close_range ? real_close_range(first, last, flags)
                                   : (errno = ENOSYS, -1);
        ZVFS_HOOK_LEAVE();
        return ret;
    }

    /*
     * Snapshot only the zvfs fds that fall in range instead of scanning
     * the whole [first, last] interval, which would be very slow for
     * last == UINT_MAX and risks wrap-around.
     */
    int any_err = 0;
    int inited = 0;
    int *zvfs_fds = NULL;
    uint32_t zvfs_fd_count = 0;

    if (zvfs_snapshot_fds_in_range(first, last, &zvfs_fds, &zvfs_fd_count) < 0) {
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    for (uint32_t i = 0; i < zvfs_fd_count; i++) {
        if (!inited) {
            zvfs_ensure_init();
            inited = 1;
        }
        if (zvfs_close_impl(zvfs_fds[i]) < 0) {
            any_err = 1;
        }
    }
    free(zvfs_fds);

    /* Let the kernel deal with the remaining non-zvfs fds. */
    if (real_close_range) {
        if (real_close_range(first, last, flags) < 0 && !any_err)
            any_err = 1;
    } else {
        /* Fallback: close non-zvfs fds one by one, capped at open-max. */
        unsigned int upper = last;
        long open_max = sysconf(_SC_OPEN_MAX);
        if (open_max > 0 && upper >= (unsigned int)open_max) {
            upper = (unsigned int)open_max - 1;
        }
        for (unsigned int fd = first; fd <= upper; fd++) {
            if (!zvfs_is_zvfs_fd((int)fd))
                real_close((int)fd);
            /* Stop before fd++ can wrap when upper == UINT_MAX. */
            if (fd == upper)
                break;
        }
    }

    ZVFS_HOOK_LEAVE();
    return any_err ? -1 : 0;
}
/* ------------------------------------------------------------------ */
/* dup */
/* ------------------------------------------------------------------ */
/*
 * dup - hooked dup(2). For a zvfs fd the kernel duplicate is created first
 * and then registered with zvfs_dup_attach_newfd(); if registration fails
 * the duplicate is closed again and -1 is returned with errno preserved.
 * Re-entrant calls and non-zvfs fds go straight to real_dup().
 */
int
dup(int oldfd)
{
    ZVFS_HOOK_ENTER();
    int result;

    if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(oldfd)) {
        result = real_dup(oldfd);
        ZVFS_HOOK_LEAVE();
        return result;
    }

    zvfs_ensure_init();

    int dup_fd = real_dup(oldfd);
    if (dup_fd < 0) {
        result = -1;
    } else if (zvfs_dup_attach_newfd(oldfd, dup_fd, 0) < 0) {
        int err = errno;
        (void)real_close(dup_fd);
        errno = err;
        result = -1;
    } else {
        result = dup_fd;
    }

    ZVFS_HOOK_LEAVE();
    return result;
}
/* ------------------------------------------------------------------ */
/* dup2 */
/* ------------------------------------------------------------------ */
/*
 * dup2 - hooked dup2(2). Re-entrant calls and a non-zvfs oldfd go straight
 * to real_dup2(). For a zvfs oldfd:
 *   - dup2(fd, fd) returns fd without doing anything;
 *   - otherwise real_dup2() runs first, the zvfs state of a previously
 *     tracked newfd is released, and newfd is registered as a duplicate.
 * If either bookkeeping step fails, newfd is closed and -1 is returned with
 * errno preserved.
 */
int
dup2(int oldfd, int newfd)
{
    ZVFS_HOOK_ENTER();
    int rc;

    if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(oldfd)) {
        rc = real_dup2(oldfd, newfd);
        ZVFS_HOOK_LEAVE();
        return rc;
    }

    /* POSIX: dup2(oldfd, oldfd) on a valid fd just returns oldfd. */
    if (oldfd == newfd) {
        ZVFS_HOOK_LEAVE();
        return oldfd;
    }

    zvfs_ensure_init();

    /* Sample this before real_dup2() replaces whatever newfd referred to. */
    const int target_tracked = zvfs_is_zvfs_fd(newfd);

    rc = real_dup2(oldfd, newfd);
    if (rc < 0) {
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    /* Short-circuit: a failed detach skips the attach step. */
    if ((target_tracked && zvfs_detach_fd_mapping(newfd, 1) < 0) ||
        zvfs_dup_attach_newfd(oldfd, newfd, 0) < 0) {
        int err = errno;
        (void)real_close(newfd);
        errno = err;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    ZVFS_HOOK_LEAVE();
    return rc;
}
/* ------------------------------------------------------------------ */
/* dup3 */
/* ------------------------------------------------------------------ */
/*
 * dup3 - hooked dup3(2). Re-entrant calls and a non-zvfs oldfd go straight
 * to real_dup3(). For a zvfs oldfd, oldfd == newfd or any flag other than
 * O_CLOEXEC fails with EINVAL. Otherwise real_dup3() runs first, the zvfs
 * state of a previously tracked newfd is released, and newfd is registered
 * as a duplicate with FD_CLOEXEC recorded when O_CLOEXEC was requested.
 * If either bookkeeping step fails, newfd is closed and -1 is returned with
 * errno preserved.
 */
int
dup3(int oldfd, int newfd, int flags)
{
    ZVFS_HOOK_ENTER();
    int rc;

    if (ZVFS_IN_HOOK() || !zvfs_is_zvfs_fd(oldfd)) {
        rc = real_dup3(oldfd, newfd, flags);
        ZVFS_HOOK_LEAVE();
        return rc;
    }

    if (oldfd == newfd || (flags & ~O_CLOEXEC) != 0) {
        errno = EINVAL;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    zvfs_ensure_init();

    /* Sample this before real_dup3() replaces whatever newfd referred to. */
    const int target_tracked = zvfs_is_zvfs_fd(newfd);

    rc = real_dup3(oldfd, newfd, flags);
    if (rc < 0) {
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    if (target_tracked && zvfs_detach_fd_mapping(newfd, 1) < 0) {
        int err = errno;
        (void)real_close(newfd);
        errno = err;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    const int recorded_fd_flags = (flags & O_CLOEXEC) ? FD_CLOEXEC : 0;

    if (zvfs_dup_attach_newfd(oldfd, newfd, recorded_fd_flags) < 0) {
        int err = errno;
        (void)real_close(newfd);
        errno = err;
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    ZVFS_HOOK_LEAVE();
    return rc;
}
/* ------------------------------------------------------------------ */
/* fork */
/* ------------------------------------------------------------------ */
/*
 * fork - hooked fork(2).
 *
 * Before forking, one extra reference is added to the blob handle of every
 * zvfs fd currently in fd_table. If real_fork() then fails, those
 * references are rolled back. Re-entrant calls go straight to real_fork().
 *
 * Returns what real_fork() returns, or -1 with errno set when the snapshot,
 * the reference step, or the fork itself fails.
 */
pid_t
fork(void)
{
    ZVFS_HOOK_ENTER();
    if (ZVFS_IN_HOOK()) {
        pid_t nested = real_fork();
        ZVFS_HOOK_LEAVE();
        return nested;
    }

    uint64_t *handles = NULL;
    uint32_t *deltas = NULL;
    uint32_t nhandles = 0;
    pid_t pid = -1;
    int err = 0;

    if (zvfs_snapshot_fd_handles(&handles, &deltas, &nhandles) < 0) {
        ZVFS_HOOK_LEAVE();
        return -1;
    }

    if (nhandles > 0) {
        zvfs_ensure_init();
        if (zvfs_add_ref_batch_or_fallback(handles, deltas, nhandles) < 0) {
            err = errno;
            goto out;
        }
    }

    pid = real_fork();
    if (pid < 0) {
        err = errno;
        /* No child was created: give back the references added above. */
        if (nhandles > 0)
            zvfs_rollback_added_refs(handles, nhandles);
        pid = -1;
    }

out:
    free(handles);
    free(deltas);
    /* free() may touch errno; restore the cause on the failure paths. */
    if (pid < 0)
        errno = err;
    ZVFS_HOOK_LEAVE();
    return pid;
}