#ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include "zvfs_hook_fd.h" #include "zvfs_hook_init.h" #include "zvfs_hook_reentrant.h" #include "fs/zvfs.h" #include "fs/zvfs_inode.h" #include "fs/zvfs_path_entry.h" #include "fs/zvfs_open_file.h" #include "spdk_engine/io_engine.h" #include #include #include #include #include #include #include #include #define zvfs_debug_open_log(...) ((void)0) /* close 路径辅助:在文件后半段实现。 */ static int zvfs_detach_fd_mapping(int fd, int do_sync_md); /* ------------------------------------------------------------------ */ /* 内部:路径判定辅助 */ /* ------------------------------------------------------------------ */ /** * openat 到达符号链接之后跳转到 /zvfs 下,导致捕获不了。 * * 1. 判断路径是不是 /zvfs * 2. 判断readpath是不是 /zvfs * 3. 如果O_CREATE并且目标不存在,realpath什么也拿不到。先解析父路径,再拼接看是不是落在 /zvfs */ static int zvfs_classify_path(const char *abspath, int may_create, char *normalized_out, size_t out_size) { char resolved[PATH_MAX]; char tmp[PATH_MAX]; char parent[PATH_MAX]; char candidate[PATH_MAX]; const char *name; char *slash; int n; if (!abspath || !normalized_out || out_size == 0) { return 0; } strncpy(normalized_out, abspath, out_size); normalized_out[out_size - 1] = '\0'; if (zvfs_is_zvfs_path(abspath)) { return 1; } if (realpath(abspath, resolved) != NULL) { if (zvfs_is_zvfs_path(resolved)) { strncpy(normalized_out, resolved, out_size); normalized_out[out_size - 1] = '\0'; return 1; } return 0; } if (!may_create) { return 0; } strncpy(tmp, abspath, sizeof(tmp)); tmp[sizeof(tmp) - 1] = '\0'; slash = strrchr(tmp, '/'); if (!slash) { return 0; } name = slash + 1; if (*name == '\0') { return 0; } if (slash == tmp) { strcpy(parent, "/"); } else { *slash = '\0'; strncpy(parent, tmp, sizeof(parent)); parent[sizeof(parent) - 1] = '\0'; } if (realpath(parent, resolved) == NULL) { return 0; } n = snprintf(candidate, sizeof(candidate), "%s/%s", resolved, name); if (n <= 0 || (size_t)n >= sizeof(candidate)) { return 0; } if (!zvfs_is_zvfs_path(candidate)) { return 0; } strncpy(normalized_out, candidate, out_size); normalized_out[out_size - 1] = '\0'; return 1; } /* ------------------------------------------------------------------ */ /* 内部:fopen 模式解析 */ /* ------------------------------------------------------------------ */ static int zvfs_parse_fopen_mode(const char *mode, int extra_flags, int *flags_out, mode_t *create_mode_out) { int flags = 0; int plus = 0; int excl = 0; int cloexec = 0; const char *p; if (!mode || !*mode || !flags_out || !create_mode_out) { errno = EINVAL; return -1; } for (p = mode + 1; *p && *p != ','; ++p) { if (*p == '+') plus = 1; else if (*p == 'x') excl = 1; else if (*p == 'e') cloexec = 1; } switch (mode[0]) { case 'r': flags = plus ? O_RDWR : O_RDONLY; break; case 'w': flags = (plus ? O_RDWR : O_WRONLY) | O_CREAT | O_TRUNC; break; case 'a': flags = (plus ? O_RDWR : O_WRONLY) | O_CREAT | O_APPEND; break; default: errno = EINVAL; return -1; } if (excl) { flags |= O_EXCL; } if (cloexec) { flags |= O_CLOEXEC; } flags |= extra_flags; *flags_out = flags; *create_mode_out = 0666; return 0; } static void zvfs_sanitize_fdopen_mode(const char *mode, char out[4]) { int i = 0; int plus = 0; int binary = 0; const char *p; out[0] = 'r'; out[1] = '\0'; if (!mode || !*mode) { return; } for (p = mode + 1; *p && *p != ','; ++p) { if (*p == '+') plus = 1; else if (*p == 'b') binary = 1; } out[i++] = mode[0]; if (binary && i < 3) out[i++] = 'b'; if (plus && i < 3) out[i++] = '+'; out[i] = '\0'; } /* ------------------------------------------------------------------ */ /* 内部:open 的核心逻辑(路径已解析为绝对路径) */ /* ------------------------------------------------------------------ */ /** * zvfs_open_impl - 对一个确认属于 zvfs 的绝对路径执行 open。 * * real_fd:已经由 real_open* 打开的真实 fd(用于 xattr 读写 + ftruncate)。 * flags :open 时传入的 flags。 * mode :O_CREAT 时的权限。 * * 成功返回 real_fd(即用户拿到的 fd),失败返回 -1(errno 已设置), * 失败时调用方负责 real_close(real_fd)。 */ static int zvfs_open_impl(int real_fd, const char *abspath, int flags, mode_t mode) { struct zvfs_inode *inode = NULL; uint64_t blob_id = 0; uint64_t handle_id = 0; int create_new = 0; zvfs_debug_open_log(abspath, NULL, "open_impl enter real_fd=%d path=%s flags=0x%x mode=%#o", real_fd, zvfs_dbg_str(abspath), flags, (unsigned)mode); if (flags & O_CREAT) { /* * O_CREAT does not imply the file is newly created. * fio, for example, may open an existing file with O_CREAT again * during the worker phase. Only create a new blob when the backing * file does not already carry a ZVFS blob_id xattr. */ if (zvfs_xattr_read_blob_id(real_fd, &blob_id) == 0) { create_new = 0; } else if (errno == ENODATA #ifdef ENOATTR || errno == ENOATTR #endif ) { create_new = 1; blob_id = 0; } else { zvfs_debug_open_log(abspath, NULL, "open_impl xattr probe fail errno=%d(%s)", errno, strerror(errno)); goto fail; } } if (create_new) { /* ---- 创建路径 -------------------------------------------- */ /* 1. 创建 blob */ if (blob_create(0, flags, &blob_id, &handle_id) != 0) { int saved = errno; if (saved == 0) saved = EIO; fprintf(stderr, "[zvfs] create blob failed path=%s flags=0x%x errno=%d(%s)\n", abspath, flags, saved, strerror(saved)); zvfs_debug_open_log(abspath, NULL, "create branch blob_create fail errno=%d(%s)", saved, strerror(saved)); errno = saved; goto fail; } zvfs_debug_open_log(abspath, NULL, "create branch blob_create ok blob_id=%lu handle_id=%lu", (unsigned long)blob_id, (unsigned long)handle_id); /* 2. 把 blob_id 写入真实文件的 xattr */ if (zvfs_xattr_write_blob_id(real_fd, blob_id) < 0) { zvfs_debug_open_log(abspath, NULL, "create branch xattr_write fail errno=%d(%s)", errno, strerror(errno)); goto fail; } zvfs_debug_open_log(abspath, NULL, "create branch xattr_write ok"); /* 3. logical_size = 0,让 st_size 也为 0 */ if (real_ftruncate(real_fd, 0) < 0) { zvfs_debug_open_log(abspath, NULL, "create branch real_ftruncate(0) fail errno=%d(%s)", errno, strerror(errno)); goto fail; } zvfs_debug_open_log(abspath, NULL, "create branch real_ftruncate(0) ok"); /* 4. 分配 inode */ inode = inode_alloc(blob_id, mode ? mode : 0666, ZVFS_ITYPE_FILE); if (!inode) { errno = ENOMEM; zvfs_debug_open_log(abspath, NULL, "create branch inode_alloc fail ENOMEM"); goto fail; } /* 5. 插入全局表 */ pthread_mutex_lock(&g_fs.inode_mu); inode_insert(inode); pthread_mutex_unlock(&g_fs.inode_mu); /* 6. 插入 path_cache */ pthread_mutex_lock(&g_fs.path_mu); path_cache_insert(abspath, inode); pthread_mutex_unlock(&g_fs.path_mu); zvfs_debug_open_log(abspath, NULL, "create branch inode/path_cache inserted logical_size=%lu", (unsigned long)inode->logical_size); } else { /* ---- 打开已有文件路径 ------------------------------------- */ /* 1. 先查 path_cache,命中说明另一个 fd 已经打开过 */ pthread_mutex_lock(&g_fs.path_mu); struct zvfs_path_entry *pe = path_cache_lookup(abspath); if (pe) inode = pe->inode; pthread_mutex_unlock(&g_fs.path_mu); if (inode) { zvfs_debug_open_log(abspath, NULL, "open existing path_cache hit inode_blob_id=%lu", (unsigned long)inode->blob_id); /* path_cache 命中:直接用缓存的 inode,重新 blob_open */ blob_id = inode->blob_id; if (blob_open(blob_id, flags, &handle_id) != 0) { if (errno == 0) errno = EIO; zvfs_debug_open_log(abspath, NULL, "open existing path_cache-hit blob_open fail errno=%d(%s)", errno, strerror(errno)); goto fail; } /* 共享 inode,增加引用 */ atomic_fetch_add(&inode->ref_count, 1); zvfs_debug_open_log(abspath, NULL, "open existing path_cache-hit blob_open ok handle_id=%lu", (unsigned long)handle_id); } else { zvfs_debug_open_log(abspath, NULL, "open existing path_cache miss"); /* 未命中:从 xattr 读 blob_id,可能是进程首次 open */ if (zvfs_xattr_read_blob_id(real_fd, &blob_id) < 0) { /* xattr 不存在:不是 zvfs 管理的文件,降级透传 */ return real_fd; /* 直接返回,不做任何包装 */ } zvfs_debug_open_log(abspath, NULL, "open existing xattr_read ok blob_id=%lu", (unsigned long)blob_id); /* 再查 inode_table(另一个 fd 可能已经 open 但路径未缓存)*/ pthread_mutex_lock(&g_fs.inode_mu); inode = inode_lookup(blob_id); pthread_mutex_unlock(&g_fs.inode_mu); if (inode) { zvfs_debug_open_log(abspath, NULL, "open existing inode_table hit blob_id=%lu", (unsigned long)blob_id); if (blob_open(blob_id, flags, &handle_id) != 0) { if (errno == 0) errno = EIO; zvfs_debug_open_log(abspath, NULL, "open existing inode_table-hit blob_open fail errno=%d(%s)", errno, strerror(errno)); goto fail; } atomic_fetch_add(&inode->ref_count, 1); zvfs_debug_open_log(abspath, NULL, "open existing inode_table-hit blob_open ok handle_id=%lu", (unsigned long)handle_id); } else { /* 全新 inode:需从真实文件 stat 获取 mode/size */ struct stat st; if (zvfs_real_fstat(real_fd, &st) < 0) { zvfs_debug_open_log(abspath, NULL, "open existing fstat fail errno=%d(%s)", errno, strerror(errno)); goto fail; } inode = inode_alloc(blob_id, st.st_mode, ZVFS_ITYPE_FILE); if (!inode) { errno = ENOMEM; zvfs_debug_open_log(abspath, NULL, "open existing inode_alloc fail ENOMEM"); goto fail; } inode->logical_size = (uint64_t)st.st_size; pthread_mutex_lock(&g_fs.inode_mu); inode_insert(inode); pthread_mutex_unlock(&g_fs.inode_mu); pthread_mutex_lock(&g_fs.path_mu); path_cache_insert(abspath, inode); pthread_mutex_unlock(&g_fs.path_mu); if (blob_open(blob_id, flags, &handle_id) != 0) { if (errno == 0) errno = EIO; zvfs_debug_open_log(abspath, NULL, "open existing new-inode blob_open fail errno=%d(%s)", errno, strerror(errno)); goto fail; } zvfs_debug_open_log(abspath, NULL, "open existing new-inode ready handle_id=%lu logical_size=%lu", (unsigned long)handle_id, (unsigned long)inode->logical_size); } } } /* ---- 分配 openfile,插入 fd_table ---------------------------- */ struct zvfs_open_file *of = openfile_alloc(real_fd, inode, handle_id); if (!of) { errno = ENOMEM; goto fail_handle; } pthread_mutex_lock(&g_fs.fd_mu); openfile_insert(of); pthread_mutex_unlock(&g_fs.fd_mu); zvfs_debug_open_log(abspath, NULL, "open_impl success real_fd=%d handle_id=%lu inode_blob_id=%lu", real_fd, (unsigned long)handle_id, (unsigned long)(inode ? inode->blob_id : 0)); return real_fd; fail_handle: if (handle_id != 0) { blob_close(handle_id); } fail: zvfs_debug_open_log(abspath, NULL, "open_impl fail errno=%d(%s) real_fd=%d", errno, strerror(errno), real_fd); /* inode 若刚分配(ref_count==1)需要回滚 */ if (inode && atomic_load(&inode->ref_count) == 1) { pthread_mutex_lock(&g_fs.inode_mu); inode_remove(inode->blob_id); pthread_mutex_unlock(&g_fs.inode_mu); pthread_mutex_lock(&g_fs.path_mu); path_cache_remove(abspath); pthread_mutex_unlock(&g_fs.path_mu); inode_free(inode); } return -1; } /* ------------------------------------------------------------------ */ /* open */ /* ------------------------------------------------------------------ */ int open(const char *path, int flags, ...) { ZVFS_HOOK_ENTER(); char abspath[PATH_MAX]; char normpath[PATH_MAX]; abspath[0] = '\0'; normpath[0] = '\0'; int is_zvfs_path = 0; mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } if (zvfs_resolve_atpath(AT_FDCWD, path, abspath, sizeof(abspath)) == 0) { is_zvfs_path = zvfs_classify_path(abspath, (flags & O_CREAT) != 0, normpath, sizeof(normpath)); zvfs_debug_open_log(path, abspath, "open resolve ok path=%s abspath=%s norm=%s flags=0x%x is_zvfs=%d", zvfs_dbg_str(path), zvfs_dbg_str(abspath), zvfs_dbg_str(normpath), flags, is_zvfs_path); } else { zvfs_debug_open_log(path, NULL, "open resolve fail path=%s flags=0x%x errno=%d(%s)", zvfs_dbg_str(path), flags, errno, strerror(errno)); } int ret; if (ZVFS_IN_HOOK() || !is_zvfs_path) { zvfs_debug_open_log(path, abspath, "open passthrough reason=%s path=%s flags=0x%x", ZVFS_IN_HOOK() ? "reentrant" : "non-zvfs", zvfs_dbg_str(path), flags); ret = real_open(path, flags, mode); zvfs_debug_open_log(path, abspath, "open passthrough ret=%d errno=%d(%s)", ret, (ret < 0) ? errno : 0, (ret < 0) ? strerror(errno) : "OK"); ZVFS_HOOK_LEAVE(); return ret; } zvfs_ensure_init(); /* 先让真实 FS 创建 / 打开文件(获得 real_fd) */ int real_fd = real_open(path, flags, mode); if (real_fd < 0) { zvfs_debug_open_log(path, abspath, "open real_open fail path=%s flags=0x%x errno=%d(%s)", zvfs_dbg_str(path), flags, errno, strerror(errno)); ZVFS_HOOK_LEAVE(); return -1; } zvfs_debug_open_log(path, abspath, "open real_open ok real_fd=%d path=%s norm=%s", real_fd, zvfs_dbg_str(path), zvfs_dbg_str(normpath)); ret = zvfs_open_impl(real_fd, normpath, flags, mode); if (ret < 0) { int saved = errno; real_close(real_fd); errno = saved; zvfs_debug_open_log(path, abspath, "open zvfs_open_impl fail real_fd=%d errno=%d(%s)", real_fd, saved, strerror(saved)); } else { zvfs_debug_open_log(path, abspath, "open zvfs_open_impl success fd=%d", ret); } ZVFS_HOOK_LEAVE(); return ret; } int open64(const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return open(path, flags | O_LARGEFILE, mode); } /* ------------------------------------------------------------------ */ /* openat */ /* ------------------------------------------------------------------ */ int openat(int dirfd, const char *path, int flags, ...) { ZVFS_HOOK_ENTER(); char normpath[PATH_MAX]; char abspath[PATH_MAX]; normpath[0] = '\0'; abspath[0] = '\0'; int is_zvfs_path = 0; mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } /* 解析绝对路径判断是否属于 zvfs */ if (zvfs_resolve_atpath(dirfd, path, abspath, sizeof(abspath)) < 0) { zvfs_debug_open_log(path, NULL, "openat resolve fail dirfd=%d path=%s flags=0x%x errno=%d(%s)", dirfd, zvfs_dbg_str(path), flags, errno, strerror(errno)); ZVFS_HOOK_LEAVE(); return -1; } is_zvfs_path = zvfs_classify_path(abspath, (flags & O_CREAT) != 0, normpath, sizeof(normpath)); zvfs_debug_open_log(path, abspath, "openat resolve ok dirfd=%d path=%s abspath=%s norm=%s flags=0x%x is_zvfs=%d", dirfd, zvfs_dbg_str(path), zvfs_dbg_str(abspath), zvfs_dbg_str(normpath), flags, is_zvfs_path); int ret; if (ZVFS_IN_HOOK() || !is_zvfs_path) { zvfs_debug_open_log(path, abspath, "openat passthrough reason=%s dirfd=%d path=%s flags=0x%x", ZVFS_IN_HOOK() ? "reentrant" : "non-zvfs", dirfd, zvfs_dbg_str(path), flags); ret = real_openat(dirfd, path, flags, mode); zvfs_debug_open_log(path, abspath, "openat passthrough ret=%d errno=%d(%s)", ret, (ret < 0) ? errno : 0, (ret < 0) ? strerror(errno) : "OK"); ZVFS_HOOK_LEAVE(); return ret; } zvfs_ensure_init(); int real_fd = real_openat(dirfd, path, flags, mode); if (real_fd < 0) { zvfs_debug_open_log(path, abspath, "openat real_openat fail dirfd=%d path=%s flags=0x%x errno=%d(%s)", dirfd, zvfs_dbg_str(path), flags, errno, strerror(errno)); ZVFS_HOOK_LEAVE(); return -1; } zvfs_debug_open_log(path, abspath, "openat real_openat ok real_fd=%d dirfd=%d path=%s norm=%s", real_fd, dirfd, zvfs_dbg_str(path), zvfs_dbg_str(normpath)); ret = zvfs_open_impl(real_fd, normpath, flags, mode); if (ret < 0) { int saved = errno; real_close(real_fd); errno = saved; zvfs_debug_open_log(path, abspath, "openat zvfs_open_impl fail real_fd=%d errno=%d(%s)", real_fd, saved, strerror(saved)); } else { zvfs_debug_open_log(path, abspath, "openat zvfs_open_impl success fd=%d", ret); } ZVFS_HOOK_LEAVE(); return ret; } int openat64(int dirfd, const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return openat(dirfd, path, flags | O_LARGEFILE, mode); } /* ------------------------------------------------------------------ */ /* fopen / fopen64 */ /* ------------------------------------------------------------------ */ static FILE * zvfs_fopen_common(const char *path, const char *mode, int extra_open_flags, int use_fopen64) { char abspath[PATH_MAX]; char normpath[PATH_MAX]; char fdopen_mode[4]; int is_zvfs_path = 0; int flags = 0; mode_t create_mode = 0666; int real_fd = -1; FILE *fp = NULL; if (zvfs_parse_fopen_mode(mode, extra_open_flags, &flags, &create_mode) != 0) { if (use_fopen64 && real_fopen64) return real_fopen64(path, mode); if (real_fopen) return real_fopen(path, mode); errno = ENOSYS; return NULL; } if (zvfs_resolve_atpath(AT_FDCWD, path, abspath, sizeof(abspath)) == 0) { is_zvfs_path = zvfs_classify_path(abspath, (flags & O_CREAT) != 0, normpath, sizeof(normpath)); zvfs_debug_open_log(path, abspath, "fopen resolve ok path=%s mode=%s norm=%s flags=0x%x is_zvfs=%d", zvfs_dbg_str(path), zvfs_dbg_str(mode), zvfs_dbg_str(normpath), flags, is_zvfs_path); } else { zvfs_debug_open_log(path, NULL, "fopen resolve fail path=%s mode=%s errno=%d(%s)", zvfs_dbg_str(path), zvfs_dbg_str(mode), errno, strerror(errno)); } if (ZVFS_IN_HOOK() || !is_zvfs_path) { if (use_fopen64 && real_fopen64) return real_fopen64(path, mode); if (real_fopen) return real_fopen(path, mode); errno = ENOSYS; return NULL; } zvfs_ensure_init(); real_fd = real_open(path, flags, create_mode); if (real_fd < 0) { return NULL; } if (zvfs_open_impl(real_fd, normpath, flags, create_mode) < 0) { int saved = errno; real_close(real_fd); errno = saved; return NULL; } zvfs_debug_open_log(path, normpath, "fopen mapped-after-open_impl fd=%d mapped=%d", real_fd, zvfs_debug_has_fd_mapping(real_fd)); zvfs_sanitize_fdopen_mode(mode, fdopen_mode); if (real_fdopen) { fp = real_fdopen(real_fd, fdopen_mode); } else { fp = fdopen(real_fd, fdopen_mode); } if (!fp) { int saved = errno; close(real_fd); errno = saved; return NULL; } zvfs_debug_open_log(path, normpath, "fopen mapped-after-fdopen fd=%d mapped=%d", real_fd, zvfs_debug_has_fd_mapping(real_fd)); return fp; } FILE * fopen(const char *path, const char *mode) { ZVFS_HOOK_ENTER(); FILE *fp = zvfs_fopen_common(path, mode, 0, 0); ZVFS_HOOK_LEAVE(); return fp; } FILE * fopen64(const char *path, const char *mode) { ZVFS_HOOK_ENTER(); FILE *fp = zvfs_fopen_common(path, mode, O_LARGEFILE, 1); ZVFS_HOOK_LEAVE(); return fp; } int fclose(FILE *stream) { ZVFS_HOOK_ENTER(); int ret; int ret_errno = 0; int bk_rc = 0; int bk_errno = 0; int fd = -1; int need_bookkeeping = 0; if (!stream) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; } if (!ZVFS_IN_HOOK()) { fd = fileno(stream); if (fd >= 0 && zvfs_is_zvfs_fd(fd)) { need_bookkeeping = 1; } } if (!real_fclose) { errno = ENOSYS; ZVFS_HOOK_LEAVE(); return -1; } if (ZVFS_IN_HOOK() || !need_bookkeeping) { ret = real_fclose(stream); ZVFS_HOOK_LEAVE(); return ret; } zvfs_ensure_init(); ret = real_fclose(stream); if (ret < 0) { ret_errno = errno; } /* * 无论 real_fclose 是否报错,都尝试回收 zvfs bookkeeping。 * 某些 libc 实现即使返回 EOF,也可能已经关闭了底层 fd。 */ if (zvfs_detach_fd_mapping(fd, 1) < 0) { bk_rc = -1; bk_errno = errno; } if (ret < 0) { errno = ret_errno; ZVFS_HOOK_LEAVE(); return -1; } if (bk_rc < 0) { errno = bk_errno; ZVFS_HOOK_LEAVE(); return -1; } ZVFS_HOOK_LEAVE(); return 0; } /* ------------------------------------------------------------------ */ /* creat */ /* ------------------------------------------------------------------ */ int creat(const char *path, mode_t mode) { return open(path, O_CREAT | O_WRONLY | O_TRUNC, mode); } int creat64(const char *path, mode_t mode) { return open(path, O_CREAT | O_WRONLY | O_TRUNC | O_LARGEFILE, mode); } /* ------------------------------------------------------------------ */ /* glibc 别名 */ /* ------------------------------------------------------------------ */ int __open(const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return open(path, flags, mode); } int __open64(const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return open64(path, flags, mode); } int __openat(int dirfd, const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return openat(dirfd, path, flags, mode); } int __openat64(int dirfd, const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return openat64(dirfd, path, flags, mode); } int __libc_open(const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return open(path, flags, mode); } int __libc_open64(const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return open64(path, flags, mode); } int __libc_openat(int dirfd, const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return openat(dirfd, path, flags, mode); } int __libc_openat64(int dirfd, const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return openat64(dirfd, path, flags, mode); } int __open_2(const char *path, int flags) { if ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) { errno = EINVAL; return -1; } zvfs_debug_open_log(path, NULL, "__open_2 called path=%s flags=0x%x", zvfs_dbg_str(path), flags); return open(path, flags); } int __open64_2(const char *path, int flags) { if ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) { errno = EINVAL; return -1; } return open64(path, flags); } int __openat_2(int dirfd, const char *path, int flags) { if ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) { errno = EINVAL; return -1; } zvfs_debug_open_log(path, NULL, "__openat_2 called dirfd=%d path=%s flags=0x%x", dirfd, zvfs_dbg_str(path), flags); return openat(dirfd, path, flags); } int __openat64_2(int dirfd, const char *path, int flags) { if ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) { errno = EINVAL; return -1; } return openat64(dirfd, path, flags); } int __open_nocancel(const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return open(path, flags, mode); } int __open64_nocancel(const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return open64(path, flags, mode); } int __openat_nocancel(int dirfd, const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return openat(dirfd, path, flags, mode); } int __openat64_nocancel(int dirfd, const char *path, int flags, ...) { mode_t mode = 0; if (flags & O_CREAT) { va_list ap; va_start(ap, flags); mode = (mode_t)va_arg(ap, unsigned int); va_end(ap); } return openat64(dirfd, path, flags, mode); } /* ------------------------------------------------------------------ */ /* close */ /* ------------------------------------------------------------------ */ /* * zvfs_release_openfile - 释放一个 openfile 对应的 zvfs 资源。 * 这里只处理 zvfs bookkeeping,不做 real_close(fd)。 */ static int zvfs_release_openfile(struct zvfs_open_file *of, int do_sync_md) { int saved_errno = 0; struct zvfs_inode *inode = of->inode; uint64_t handle_id = of->handle_id; openfile_free(of); if (do_sync_md && handle_id != 0 && blob_sync_md(handle_id) < 0) { saved_errno = (errno != 0) ? errno : EIO; } if (handle_id != 0 && blob_close(handle_id) < 0 && saved_errno == 0) { saved_errno = (errno != 0) ? errno : EIO; } /* ---- inode ref_count-- --------------------------------------- */ int inode_ref = atomic_fetch_sub(&inode->ref_count, 1) - 1; if (inode_ref == 0) { /* * 最后一个 fd 关闭了这个 inode。 * 若 deleted:执行延迟 blob_delete。 */ bool do_delete = false; pthread_mutex_lock(&inode->mu); do_delete = inode->deleted; pthread_mutex_unlock(&inode->mu); if (do_delete && blob_delete(inode->blob_id) < 0 && saved_errno == 0) saved_errno = (errno != 0) ? errno : EIO; pthread_mutex_lock(&g_fs.inode_mu); inode_remove(inode->blob_id); pthread_mutex_unlock(&g_fs.inode_mu); /* path_cache 在 unlink 时已经摘除(deleted=true 路径) * 或在此处还需摘除(正常关闭最后一个 fd)*/ if (!do_delete) { /* 正常关闭:path 留着,只有 inode 的引用归零时清缓存 */ /* 注意:path_cache 里的指针指向这个即将释放的 inode, * 所以必须把 path_cache 条目也清掉,否则成为悬空指针 */ pthread_mutex_lock(&g_fs.path_mu); /* 遍历找到所有指向这个 inode 的 path entry 并移除 * (一个 inode 对应一个 path,hardlink 暂不支持)*/ struct zvfs_path_entry *pe, *tmp; (void)tmp; HASH_ITER(hh, g_fs.path_cache, pe, tmp) { if (pe->inode == inode) { HASH_DEL(g_fs.path_cache, pe); free(pe->path); free(pe); break; /* 一对一关系,找到即退 */ } } pthread_mutex_unlock(&g_fs.path_mu); } inode_free(inode); } if (saved_errno != 0) { errno = saved_errno; return -1; } return 0; } /* * zvfs_detach_fd_mapping - 仅摘除 fd -> openfile 映射并释放 zvfs 资源。 * 不调用 real_close(fd),用于 dup2/dup3 中 newfd 旧值清理。 */ static int zvfs_detach_fd_mapping(int fd, int do_sync_md) { pthread_mutex_lock(&g_fs.fd_mu); struct zvfs_open_file *of = openfile_lookup(fd); if (!of) { pthread_mutex_unlock(&g_fs.fd_mu); errno = EBADF; return -1; } openfile_remove(fd); pthread_mutex_unlock(&g_fs.fd_mu); return zvfs_release_openfile(of, do_sync_md); } /* * zvfs_close_impl - close(fd) 的 zvfs 路径: * 先做 bookkeeping,再做 real_close(fd)。 */ static int zvfs_close_impl(int fd) { int bk_rc = zvfs_detach_fd_mapping(fd, 1); int bk_errno = (bk_rc < 0) ? errno : 0; int rc = real_close(fd); if (rc < 0) return -1; if (bk_rc < 0) { errno = bk_errno; return -1; } return 0; } int close(int fd) { ZVFS_HOOK_ENTER(); int ret; int is_zvfs_fd = (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(fd)); if (!is_zvfs_fd) { ret = real_close(fd); ZVFS_HOOK_LEAVE(); return ret; } zvfs_ensure_init(); ret = zvfs_close_impl(fd); ZVFS_HOOK_LEAVE(); return ret; } int __close(int fd) { return close(fd); } int __libc_close(int fd) { return close(fd); } int __close_nocancel(int fd) { return close(fd); } /* ------------------------------------------------------------------ */ /* dup helper */ /* ------------------------------------------------------------------ */ int zvfs_dup_attach_newfd(int oldfd, int newfd, int new_fd_flags) { struct zvfs_open_file *old_of, *new_of; int fd_flags; int rc; int saved; if (oldfd < 0 || newfd < 0) { errno = EBADF; return -1; } pthread_mutex_lock(&g_fs.fd_mu); old_of = openfile_lookup(oldfd); if (!old_of) { pthread_mutex_unlock(&g_fs.fd_mu); errno = EBADF; return -1; } if (openfile_lookup(newfd) != NULL) { pthread_mutex_unlock(&g_fs.fd_mu); errno = EEXIST; return -1; } rc = blob_add_ref(old_of->handle_id, 1); if (rc != 0) { pthread_mutex_unlock(&g_fs.fd_mu); return -1; } new_of = openfile_alloc(newfd, old_of->inode, old_of->handle_id); if (!new_of) { saved = (errno != 0) ? errno : ENOMEM; (void)blob_close(old_of->handle_id); pthread_mutex_unlock(&g_fs.fd_mu); errno = saved; return -1; } fd_flags = (new_fd_flags >= 0) ? new_fd_flags : old_of->fd_flags; new_of->fd_flags = fd_flags; atomic_fetch_add(&old_of->inode->ref_count, 1); openfile_insert(new_of); pthread_mutex_unlock(&g_fs.fd_mu); return 0; } static int zvfs_add_ref_batch_or_fallback(const uint64_t *handle_ids, const uint32_t *ref_deltas, uint32_t count) { uint32_t i; if (count == 0) return 0; if (blob_add_ref_batch(handle_ids, ref_deltas, count) == 0) return 0; for (i = 0; i < count; i++) { if (blob_add_ref(handle_ids[i], ref_deltas[i]) != 0) return -1; } return 0; } static void zvfs_rollback_added_refs(const uint64_t *handle_ids, uint32_t count) { uint32_t i; for (i = 0; i < count; i++) { if (handle_ids[i] != 0) (void)blob_close(handle_ids[i]); } } static int zvfs_snapshot_fd_handles(uint64_t **handle_ids_out, uint32_t **ref_deltas_out, uint32_t *count_out) { struct zvfs_open_file *of, *tmp; uint32_t i = 0; uint32_t count; uint64_t *handle_ids = NULL; uint32_t *ref_deltas = NULL; *handle_ids_out = NULL; *ref_deltas_out = NULL; *count_out = 0; pthread_mutex_lock(&g_fs.fd_mu); count = (uint32_t)HASH_COUNT(g_fs.fd_table); if (count == 0) { pthread_mutex_unlock(&g_fs.fd_mu); return 0; } handle_ids = calloc(count, sizeof(*handle_ids)); ref_deltas = calloc(count, sizeof(*ref_deltas)); if (!handle_ids || !ref_deltas) { pthread_mutex_unlock(&g_fs.fd_mu); free(handle_ids); free(ref_deltas); errno = ENOMEM; return -1; } HASH_ITER(hh, g_fs.fd_table, of, tmp) { if (i >= count) break; handle_ids[i] = of->handle_id; ref_deltas[i] = 1; i++; } pthread_mutex_unlock(&g_fs.fd_mu); *handle_ids_out = handle_ids; *ref_deltas_out = ref_deltas; *count_out = i; return 0; } static int zvfs_snapshot_fds_in_range(unsigned int first, unsigned int last, int **fds_out, uint32_t *count_out) { struct zvfs_open_file *of, *tmp; uint32_t cap; uint32_t n = 0; int *fds = NULL; *fds_out = NULL; *count_out = 0; pthread_mutex_lock(&g_fs.fd_mu); cap = (uint32_t)HASH_COUNT(g_fs.fd_table); if (cap == 0) { pthread_mutex_unlock(&g_fs.fd_mu); return 0; } fds = calloc(cap, sizeof(*fds)); if (!fds) { pthread_mutex_unlock(&g_fs.fd_mu); errno = ENOMEM; return -1; } HASH_ITER(hh, g_fs.fd_table, of, tmp) { if (of->fd < 0) { continue; } if ((unsigned int)of->fd < first || (unsigned int)of->fd > last) { continue; } fds[n++] = of->fd; } pthread_mutex_unlock(&g_fs.fd_mu); *fds_out = fds; *count_out = n; return 0; } /* ------------------------------------------------------------------ */ /* close_range */ /* ------------------------------------------------------------------ */ int close_range(unsigned int first, unsigned int last, int flags) { ZVFS_HOOK_ENTER(); if (ZVFS_IN_HOOK()) { int ret = real_close_range ? real_close_range(first, last, flags) : (errno = ENOSYS, -1); ZVFS_HOOK_LEAVE(); return ret; } if (first > last) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; } /* * 只快照当前 zvfs fd_table 中命中的 fd,避免对 [first,last] 做 * 全范围扫描(last=UINT_MAX 时会非常慢,且旧逻辑存在回绕风险)。 */ int any_err = 0; int inited = 0; int *zvfs_fds = NULL; uint32_t zvfs_fd_count = 0; if (zvfs_snapshot_fds_in_range(first, last, &zvfs_fds, &zvfs_fd_count) < 0) { ZVFS_HOOK_LEAVE(); return -1; } for (uint32_t i = 0; i < zvfs_fd_count; i++) { if (!inited) { zvfs_ensure_init(); inited = 1; } if (zvfs_close_impl(zvfs_fds[i]) < 0) { any_err = 1; } } free(zvfs_fds); /* 让内核处理剩余非 zvfs fd(CLOEXEC 等 flags 也在这里生效) */ if (real_close_range) { if (real_close_range(first, last, flags) < 0 && !any_err) any_err = 1; } else { /* 降级:逐个 close 非 zvfs fd(按 open-max 做上界截断) */ unsigned int upper = last; long open_max = sysconf(_SC_OPEN_MAX); if (open_max > 0 && upper >= (unsigned int)open_max) { upper = (unsigned int)open_max - 1; } for (unsigned int fd = first; fd <= upper; fd++) { if (!zvfs_is_zvfs_fd((int)fd)) real_close((int)fd); if (fd == upper) break; } } ZVFS_HOOK_LEAVE(); return any_err ? -1 : 0; } /* ------------------------------------------------------------------ */ /* dup */ /* ------------------------------------------------------------------ */ int dup(int oldfd) { ZVFS_HOOK_ENTER(); int is_zvfs_fd = (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(oldfd)); if (!is_zvfs_fd) { int ret = real_dup(oldfd); ZVFS_HOOK_LEAVE(); return ret; } zvfs_ensure_init(); int newfd = real_dup(oldfd); if (newfd < 0) { ZVFS_HOOK_LEAVE(); return -1; } if (zvfs_dup_attach_newfd(oldfd, newfd, 0) < 0) { int saved = errno; (void)real_close(newfd); errno = saved; ZVFS_HOOK_LEAVE(); return -1; } ZVFS_HOOK_LEAVE(); return newfd; } /* ------------------------------------------------------------------ */ /* dup2 */ /* ------------------------------------------------------------------ */ int dup2(int oldfd, int newfd) { ZVFS_HOOK_ENTER(); int is_zvfs_fd = (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(oldfd)); if (!is_zvfs_fd) { int ret = real_dup2(oldfd, newfd); ZVFS_HOOK_LEAVE(); return ret; } /* POSIX 兼容:dup2(oldfd, oldfd) 对合法 fd 直接返回 oldfd。 */ if (oldfd == newfd) { ZVFS_HOOK_LEAVE(); return oldfd; } zvfs_ensure_init(); int newfd_was_zvfs = zvfs_is_zvfs_fd(newfd); int ret = real_dup2(oldfd, newfd); if (ret < 0) { ZVFS_HOOK_LEAVE(); return -1; } if (newfd_was_zvfs && zvfs_detach_fd_mapping(newfd, 1) < 0) { int saved = errno; (void)real_close(newfd); errno = saved; ZVFS_HOOK_LEAVE(); return -1; } if (zvfs_dup_attach_newfd(oldfd, newfd, 0) < 0) { int saved = errno; (void)real_close(newfd); errno = saved; ZVFS_HOOK_LEAVE(); return -1; } ZVFS_HOOK_LEAVE(); return ret; } /* ------------------------------------------------------------------ */ /* dup3 */ /* ------------------------------------------------------------------ */ int dup3(int oldfd, int newfd, int flags) { ZVFS_HOOK_ENTER(); int is_zvfs_fd = (!ZVFS_IN_HOOK() && zvfs_is_zvfs_fd(oldfd)); if (!is_zvfs_fd) { int ret = real_dup3(oldfd, newfd, flags); ZVFS_HOOK_LEAVE(); return ret; } if (oldfd == newfd) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; } if ((flags & ~O_CLOEXEC) != 0) { errno = EINVAL; ZVFS_HOOK_LEAVE(); return -1; } zvfs_ensure_init(); int newfd_was_zvfs = zvfs_is_zvfs_fd(newfd); int ret = real_dup3(oldfd, newfd, flags); if (ret < 0) { ZVFS_HOOK_LEAVE(); return -1; } if (newfd_was_zvfs && zvfs_detach_fd_mapping(newfd, 1) < 0) { int saved = errno; (void)real_close(newfd); errno = saved; ZVFS_HOOK_LEAVE(); return -1; } int fd_flags = (flags & O_CLOEXEC) ? FD_CLOEXEC : 0; if (zvfs_dup_attach_newfd(oldfd, newfd, fd_flags) < 0) { int saved = errno; (void)real_close(newfd); errno = saved; ZVFS_HOOK_LEAVE(); return -1; } ZVFS_HOOK_LEAVE(); return ret; } /* ------------------------------------------------------------------ */ /* fork */ /* ------------------------------------------------------------------ */ pid_t fork(void) { ZVFS_HOOK_ENTER(); if (ZVFS_IN_HOOK()) { pid_t ret = real_fork(); ZVFS_HOOK_LEAVE(); return ret; } uint64_t *handle_ids = NULL; uint32_t *ref_deltas = NULL; uint32_t count = 0; if (zvfs_snapshot_fd_handles(&handle_ids, &ref_deltas, &count) < 0) { ZVFS_HOOK_LEAVE(); return -1; } if (count > 0) { zvfs_ensure_init(); if (zvfs_add_ref_batch_or_fallback(handle_ids, ref_deltas, count) < 0) { int saved = errno; free(handle_ids); free(ref_deltas); errno = saved; ZVFS_HOOK_LEAVE(); return -1; } } pid_t ret = real_fork(); if (ret < 0) { int saved = errno; if (count > 0) zvfs_rollback_added_refs(handle_ids, count); free(handle_ids); free(ref_deltas); errno = saved; ZVFS_HOOK_LEAVE(); return -1; } free(handle_ids); free(ref_deltas); ZVFS_HOOK_LEAVE(); return ret; }