主从同步性能优化,主从同步性能测试。

This commit is contained in:
1iaan
2026-02-01 16:49:50 +00:00
parent 003566b69a
commit 6d1a50cf88
31 changed files with 2119 additions and 400 deletions

View File

@@ -3,7 +3,7 @@ CC = gcc
CFLAGS = -g -DJEMALLOC_NO_DEMANGLE CFLAGS = -g -DJEMALLOC_NO_DEMANGLE
NET_SRCS = ntyco.c proactor.c reactor.c kvstore.c NET_SRCS = ntyco.c proactor.c reactor.c kvstore.c
KV_SRCS = kvs_array_bin.c kvs_rbtree_bin.c kvs_hash_bin.c kvs_rw_tools.c kvs_protocol_resp.c kvs_slave.c KV_SRCS = kvs_array_bin.c kvs_rbtree_bin.c kvs_hash_bin.c kvs_rw_tools.c kvs_protocol_resp.c kvs_slave.c replica_shm.c
MEM_SRCS = ./memory/mempool.c ./memory/alloc_dispatch.c MEM_SRCS = ./memory/mempool.c ./memory/alloc_dispatch.c
COMMON_SRCS = ./common/config.c ./diskuring/diskuring.c COMMON_SRCS = ./common/config.c ./diskuring/diskuring.c
DUMP_SRCS = ./dump/kvs_snapshot.c ./dump/kvs_oplog.c DUMP_SRCS = ./dump/kvs_snapshot.c ./dump/kvs_oplog.c
@@ -12,8 +12,8 @@ SRCS = $(NET_SRCS) $(KV_SRCS) $(MEM_SRCS) $(COMMON_SRCS) $(DUMP_SRCS)
INC = -I./NtyCo/core/ -I/usr/include/libxml2 -I./ INC = -I./NtyCo/core/ -I/usr/include/libxml2 -I./
LIBDIRS = -L./NtyCo/ LIBDIRS = -L./NtyCo/
# LIBS = -lntyco -lpthread -luring -ldl -lxml2 -ljemalloc # LIBS = -lntyco -lpthread -luring -ldl -lxml2 -lrt -ljemalloc
LIBS = -lntyco -lpthread -luring -ldl -lxml2 LIBS = -lntyco -lpthread -luring -ldl -lxml2 -lrt
TARGET = kvstore TARGET = kvstore
SUBDIR = ./NtyCo/ SUBDIR = ./NtyCo/

View File

@@ -253,13 +253,13 @@ VIRT 58504
RES 4604 RES 4604
插入 20w 删除 10w重复 10 次,共计插入 200w 删除 100w。 插入 20w 删除 10w重复 10 次,共计插入 200w 删除 100w。
BATCH (N=3000000) --> time_used=3320 ms, qps=1807228 BATCH (N=9000000) --> time_used=12897 ms, qps=1395673
VIRT 208M VIRT 489M
RES 155M RES 430M
插入 10w 删除 20w重复 10 次,共计插入 100w 删除 200w。 插入 10w 删除 20w重复 10 次,共计插入 100w 删除 200w。
BATCH (N=3000000) --> time_used=3097 ms, qps=1937358 BATCH (N=9000000) --> time_used=10033 ms, qps=1794079
VIRT 208M VIRT 208M
RES 155M RES 155M
@@ -268,6 +268,27 @@ RES 155M
![alt text](image12.png) ![alt text](image12.png)
![alt text](image13.png) ![alt text](image13.png)
#### jemalloc
```shell
VIRT 69376
RES 5408
插入 20w 删除 10w重复 30 次,共计插入 600w 删除 300w。
BATCH (N=9000000) --> time_used=9436 ms, qps=1907587
VIRT 356M
RES 294M
插入 10w 删除 20w重复 30 次,共计插入 300w 删除 600w。
BATCH (N=9000000) --> time_used=9353 ms, qps=1924516
VIRT 356M
RES 119M
```
![alt text](image11.png)
![alt text](image22.png)
![alt text](image23.png)
#### mypool #### mypool
```shell ```shell
VIRT 58504 VIRT 58504
@@ -289,22 +310,34 @@ RES 71492
![alt text](image32.png) ![alt text](image32.png)
![alt text](image33.png) ![alt text](image33.png)
#### jemalloc ### 测试4主从同步
测试条件:
1. 不启用持久化。
2. 启用主从同步。
3. pipeline
   1. RSET 100w 条, p:i v:i -> +OK
   2. RGET 100w 条, p:i -> +v:i
   3. RDEL 100w 条。 p:i -> +OK
4. 本机发送请求。
```shell ```shell
VIRT 69376 lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 4
RES 5408 Connected to 192.168.10.129:8888
BATCH (N=3000000) --> time_used=3702 ms, qps=810372
插入 20w 删除 10w重复 30 次,共计插入 600w 删除 300w。 BATCH (N=3000000) --> time_used=3804 ms, qps=788643
BATCH (N=9000000) --> time_used=9436 ms, qps=1907587 BATCH (N=3000000) --> time_used=4076 ms, qps=736015
BATCH (N=3000000) --> time_used=3840 ms, qps=781250
VIRT 356M BATCH (N=3000000) --> time_used=3824 ms, qps=784518
RES 294M average qps:780159
ALL TESTS PASSED.
插入 10w 删除 20w重复 30 次,共计插入 300w 删除 600w。 lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 4
BATCH (N=9000000) --> time_used=9353 ms, qps=1924516 Connected to 192.168.10.129:8888
BATCH (N=3000000) --> time_used=3958 ms, qps=757958
VIRT 356M BATCH (N=3000000) --> time_used=4043 ms, qps=742023
RES 119M BATCH (N=3000000) --> time_used=3729 ms, qps=804505
BATCH (N=3000000) --> time_used=3989 ms, qps=752068
BATCH (N=3000000) --> time_used=3603 ms, qps=832639
average qps:777838
ALL TESTS PASSED.
``` ```
### 面试题 ### 面试题

View File

@@ -58,6 +58,7 @@ static void set_default_config(AppConfig *cfg)
cfg->persistence = PERSIST_NONE; cfg->persistence = PERSIST_NONE;
cfg->allocator = ALLOC_JEMALLOC; cfg->allocator = ALLOC_JEMALLOC;
cfg->leak_mode = MEMLEAK_DETECT_OFF; cfg->leak_mode = MEMLEAK_DETECT_OFF;
cfg->replica_mode = REPLICA_DISABLE;
} }
/* ---- 字符串转枚举 ---- */ /* ---- 字符串转枚举 ---- */
@@ -103,6 +104,14 @@ static void parse_leakage(const char *s, MemLeakDetectMode *out)
else *out = MEMLEAK_DETECT_OFF; else *out = MEMLEAK_DETECT_OFF;
} }
/*
 * Parse a replication-mode string from the XML config into *out.
 * "enable" (case-insensitive) turns replication on; any other value —
 * including "disable" — maps to REPLICA_DISABLE, so the explicit
 * "disable" branch of the original was redundant and has been folded
 * into the default.  NULL inputs leave *out untouched.
 */
static void parse_replica(const char *s, ReplicaMode *out)
{
    if (!s || !out) return;
    if (!strcasecmp(s, "enable")) *out = REPLICA_ENABLE;
    else *out = REPLICA_DISABLE;
}
static int read_file_mmap(const char *filename, void **out_addr, size_t *out_len, int *out_fd) { static int read_file_mmap(const char *filename, void **out_addr, size_t *out_len, int *out_fd) {
if (!filename || !out_addr || !out_len || !out_fd) return -1; if (!filename || !out_addr || !out_len || !out_fd) return -1;
@@ -199,6 +208,16 @@ const char *leakage_to_string(MemLeakDetectMode a)
} }
} }
/*
 * Map a ReplicaMode value to the string used in the config file.
 * BUG FIX: the switch previously tested MEMLEAK_DETECT_ON / MEMLEAK_DETECT_OFF
 * (constants from the memory-leak enum) instead of the ReplicaMode constants,
 * so the mapping only worked by numeric coincidence, if at all.  It now
 * switches on the correct enum domain.
 */
const char *replica_to_string(ReplicaMode a)
{
    switch (a) {
    case REPLICA_ENABLE:  return "enable";
    case REPLICA_DISABLE: return "disable";
    default:              return "unknown";
    }
}
/* ---- 主函数:从 XML 加载配置 ---- */ /* ---- 主函数:从 XML 加载配置 ---- */
/* server 部分 */ /* server 部分 */
@@ -236,6 +255,15 @@ void server_load(xmlNodePtr *root, AppConfig *out_cfg){
} }
} }
xmlNodePtr replica_node = find_child(server, "replica");
if (replica_node) {
xmlChar *txt = xmlNodeGetContent(replica_node);
if (txt) {
parse_replica((char *)txt, &out_cfg->replica_mode);
xmlFree(txt);
}
}
/* master (always read if present) */ /* master (always read if present) */
xmlNodePtr master = find_child(server, "master"); xmlNodePtr master = find_child(server, "master");
if (master) { if (master) {

View File

@@ -20,6 +20,11 @@ typedef enum {
PERSIST_NONE PERSIST_NONE
} PersistenceType; } PersistenceType;
/* Master/slave replication switch, parsed from the <replica> element of
 * the XML configuration.  Values are pinned so on-disk / cross-module
 * encodings stay stable. */
typedef enum {
    REPLICA_DISABLE = 0,  /* replication off (default) */
    REPLICA_ENABLE  = 1   /* replication on */
} ReplicaMode;
// typedef enum { // typedef enum {
// ALLOC_JEMALLOC, // ALLOC_JEMALLOC,
// ALLOC_MALLOC, // ALLOC_MALLOC,
@@ -51,6 +56,7 @@ typedef struct {
AllocatorType allocator; AllocatorType allocator;
MemLeakDetectMode leak_mode; MemLeakDetectMode leak_mode;
ReplicaMode replica_mode;
} AppConfig; } AppConfig;
/** /**
@@ -64,5 +70,7 @@ const char *server_mode_to_string(ServerMode mode);
const char *persistence_to_string(PersistenceType p); const char *persistence_to_string(PersistenceType p);
const char *allocator_to_string(AllocatorType a); const char *allocator_to_string(AllocatorType a);
const char *leakage_to_string(MemLeakDetectMode a); const char *leakage_to_string(MemLeakDetectMode a);
const char *replica_to_string(ReplicaMode a);
#endif /* CONFIG_H */ #endif /* CONFIG_H */

View File

@@ -6,6 +6,7 @@
<mode>master</mode> <!-- master / slave --> <mode>master</mode> <!-- master / slave -->
<!-- 仅当 mode=slave 时使用 --> <!-- 仅当 mode=slave 时使用 -->
<replica>enable</replica>
<master> <master>
<ip>192.168.10.129</ip> <ip>192.168.10.129</ip>
<port>8888</port> <port>8888</port>

View File

@@ -67,53 +67,30 @@ static void queue_push(iouring_ctx_t *ctx, task_t *t)
pthread_mutex_unlock(&ctx->q_m); pthread_mutex_unlock(&ctx->q_m);
} }
static void queue_push_front(iouring_ctx_t *ctx, task_t *list_head, task_t *list_tail) { static task_t *queue_pop(iouring_ctx_t *ctx)
pthread_mutex_lock(&ctx->q_m);
list_tail->next = ctx->q_head;
ctx->q_head = list_head;
if (!ctx->q_tail) {
ctx->q_tail = list_tail;
}
pthread_cond_signal(&ctx->q_cv);
pthread_mutex_unlock(&ctx->q_m); }
static task_t *queue_pop_all(iouring_ctx_t *ctx)
{ {
pthread_mutex_lock(&ctx->q_m); pthread_mutex_lock(&ctx->q_m);
task_t *list = ctx->q_head; task_t *t = ctx->q_head;
ctx->q_head = ctx->q_tail = NULL; if (t) {
pthread_mutex_unlock(&ctx->q_m); ctx->q_head = t->next;
return list; if (!ctx->q_head) {
}
static task_t *queue_pop_n(iouring_ctx_t *ctx, int n)
{
if (n <= 0)
return NULL;
pthread_mutex_lock(&ctx->q_m);
task_t *head = ctx->q_head;
if (!head) {
pthread_mutex_unlock(&ctx->q_m);
return NULL;
}
task_t *curr = head;
task_t *prev = NULL;
int count = 0;
while (curr && count < n) {
prev = curr;
curr = curr->next;
count++;
}
ctx->q_head = curr;
if (!curr) {
// 队列被取空
ctx->q_tail = NULL; ctx->q_tail = NULL;
} }
prev->next = NULL; t->next = NULL;
}
pthread_mutex_unlock(&ctx->q_m);
return t;
}
static void queue_push_front(iouring_ctx_t *ctx, task_t *t)
{
pthread_mutex_lock(&ctx->q_m);
t->next = ctx->q_head;
ctx->q_head = t;
if (!ctx->q_tail) {
ctx->q_tail = t;
}
pthread_mutex_unlock(&ctx->q_m); pthread_mutex_unlock(&ctx->q_m);
return head;
} }
extern void sync_wakeup(); extern void sync_wakeup();
@@ -126,25 +103,26 @@ static void *worker_main(void *arg)
{ {
int cq_count = 0; int cq_count = 0;
// ========== 1. 疯狂收割 CQE(必须优先做,释放 in_flight 额度)========== // ========== 1. 收割 CQE ==========
// 使用 while 而不是 if确保把 CQ 薅干净 // 检查溢出
if (*ctx->ring.sq.kflags & IORING_SQ_CQ_OVERFLOW) {
fprintf(stderr, "FATAL: CQ overflow detected! Backpressure broken!\n");
abort();
}
while (true) { while (true) {
struct io_uring_cqe *cqe; struct io_uring_cqe *cqe;
unsigned head; unsigned head;
io_uring_for_each_cqe(&ctx->ring, head, cqe) { io_uring_for_each_cqe(&ctx->ring, head, cqe) {
task_t *done = (task_t *)(uintptr_t)cqe->user_data; task_t *done = (task_t *)(uintptr_t)cqe->user_data;
// 先减计数(必须在处理前减,否则可能瞬间突破上限)
atomic_fetch_sub(&ctx->in_flight, 1); atomic_fetch_sub(&ctx->in_flight, 1);
task_finish(done, cqe->res); task_finish(done, cqe->res);
if (cqe->res < 0) { if (cqe->res < 0) {
fprintf(stderr, "write fail: fd=%d res=%d\n", done->fd, cqe->res); fprintf(stderr, "write fail: fd=%d res=%d, offset=%ld\n", done->fd, cqe->res, done->off);
} }
// 加入销毁队列
pthread_mutex_lock(&g_destroy_queue.lock); pthread_mutex_lock(&g_destroy_queue.lock);
done->next = g_destroy_queue.head; done->next = g_destroy_queue.head;
g_destroy_queue.head = done; g_destroy_queue.head = done;
@@ -159,99 +137,87 @@ static void *worker_main(void *arg)
sync_wakeup(); sync_wakeup();
} }
// 如果这次没收满,说明 CQ 空了,退出收割循环
if (cq_count == 0) break; if (cq_count == 0) break;
cq_count = 0; // 重置继续薅(可能有新的完成了) cq_count = 0;
} }
// 检查溢出(保险起见,虽然有了背压不该再溢出)
if (*ctx->ring.sq.kflags & IORING_SQ_CQ_OVERFLOW) {
fprintf(stderr, "FATAL: CQ overflow detected! Backpressure broken!\n");
abort(); // 直接崩溃,说明逻辑有 bug
}
// ========== 2. 计算还能提交多少 ==========
// ========== 2. 批量准备 SQE ==========
int batch_count = 0;
while (true) {
int current_in_flight = atomic_load(&ctx->in_flight); int current_in_flight = atomic_load(&ctx->in_flight);
int available_slots = ctx->max_in_flight - current_in_flight; if (current_in_flight >= ctx->max_in_flight) {
break; // 满了,停止取任务
if (available_slots <= 0) {
// 满了!不能取新任务,必须等待 CQE忙等或阻塞等
// 方案 B阻塞等 CQE推荐
struct io_uring_cqe *cqe;
int ret = io_uring_wait_cqe(&ctx->ring, &cqe);
if (ret == 0 && !ctx->stop) {
// 收到一个 CQE回循环开头处理
continue;
} }
task_t *t = queue_pop(ctx);
if (!t) break;
struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring);
if (!sqe) {
queue_push_front(ctx, t);
break;
}
io_uring_prep_writev(sqe, t->fd, t->iovs, t->iovcnt, t->off);
sqe->user_data = (uint64_t)(uintptr_t)t;
batch_count++;
}
// ========== 3. 提交 ==========
if (batch_count > 0) {
int submitted = io_uring_submit(&ctx->ring);
push_to_sqe += submitted;
atomic_fetch_add(&ctx->in_flight, submitted);
continue; continue;
} }
// ========== 3. 从任务队列取任务(只取 available_slots 个)==========
task_t *task_list = queue_pop_n(ctx, available_slots); // ========== 4. 没事做就等待 ==========
if (!task_list) { if (batch_count == 0) {
if (!ctx->stop && atomic_load(&ctx->in_flight) > 0) { int inflight = atomic_load(&ctx->in_flight);
int ret = io_uring_submit_and_wait(&ctx->ring, 1); if (inflight > 0) {
// 有任务在飞等一个CQE
continue; continue;
} } else {
// 没任务,等待条件变量 // 真没事了,等新任务
pthread_mutex_lock(&ctx->q_m); pthread_mutex_lock(&ctx->q_m);
while (ctx->q_head == NULL && !ctx->stop) { while (ctx->q_head == NULL && !ctx->stop) {
pthread_cond_wait(&ctx->q_cv, &ctx->q_m); pthread_cond_wait(&ctx->q_cv, &ctx->q_m);
} }
pthread_mutex_unlock(&ctx->q_m); pthread_mutex_unlock(&ctx->q_m);
continue; }
}
} }
// ========== 4. 准备 SQE受限于 available_slots========== printf("Shutdown: draining remaining CQEs...\n");
int batch_count = 0; int final_cq = 0;
task_t *curr = task_list; struct io_uring_cqe *cqe;
task_t *prev = NULL; unsigned head;
task_t *submitted_head = task_list; // 记录这次实际要提交的部分
task_t *remaining_head = NULL; // 装不下的部分
while (curr && batch_count < available_slots) { while (atomic_load(&ctx->in_flight) > 0) {
struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring); io_uring_for_each_cqe(&ctx->ring, head, cqe) {
task_t *done = (task_t *)(uintptr_t)cqe->user_data;
atomic_fetch_sub(&ctx->in_flight, 1);
task_finish(done, cqe->res);
if (!sqe) { pthread_mutex_lock(&g_destroy_queue.lock);
// SQ 满了(这种情况在控制 inflight 后很少见,但保险起见) done->next = g_destroy_queue.head;
break; g_destroy_queue.head = done;
pthread_mutex_unlock(&g_destroy_queue.lock);
get_from_cqe++;
final_cq++;
} }
io_uring_prep_writev(sqe, curr->fd, curr->iovs, curr->iovcnt, curr->off); if (final_cq > 0) {
sqe->user_data = (uint64_t)(uintptr_t)curr; io_uring_cq_advance(&ctx->ring, final_cq);
final_cq = 0;
batch_count++;
prev = curr;
curr = curr->next;
} }
// 断开链表:已准备的 和 未准备的 // 如果还有 inflight等一下
if (prev) { if (atomic_load(&ctx->in_flight) > 0) {
prev->next = NULL; // 已提交的部分结尾 io_uring_submit_and_wait(&ctx->ring, 1);
}
remaining_head = curr; // 剩下的部分(如果有)
// ========== 5. 提交并增加计数 ==========
if (batch_count > 0) {
int submitted = io_uring_submit(&ctx->ring);
if (submitted != batch_count) {
fprintf(stderr, "CRITICAL: prep %d but submit %d\n", batch_count, submitted);
// 这种情况很严重,说明 ring 损坏了,建议 abort
abort();
}
atomic_fetch_add(&ctx->in_flight, submitted);
push_to_sqe += submitted;
}
// ========== 6. 把没提交的任务塞回队列头部(保持顺序)==========
if (remaining_head) {
task_t *tail = remaining_head;
while (tail->next) tail = tail->next;
queue_push_front(ctx, remaining_head, tail);
} }
} }
@@ -260,12 +226,6 @@ static void *worker_main(void *arg)
return NULL; return NULL;
} }
int iouring_register_fd(iouring_ctx_t *ctx, int fd) {
int fds[1] = {fd};
int ret = io_uring_register_files(&ctx->ring, fds, 1);
return ret;
}
int iouring_init(iouring_ctx_t *ctx, unsigned entries) int iouring_init(iouring_ctx_t *ctx, unsigned entries)
{ {
memset(ctx, 0, sizeof(*ctx)); memset(ctx, 0, sizeof(*ctx));
@@ -352,8 +312,17 @@ task_t* submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int
return t; return t;
} }
int uring_task_complete(iouring_ctx_t *ctx){
pthread_mutex_lock(&ctx->q_m);
int notask = ctx->q_head == NULL;
pthread_mutex_unlock(&ctx->q_m);
int noflight = atomic_load(&ctx->in_flight);
// printf("%d\n", noflight);
return (noflight == 0) && notask;
}
// 主线程定期调用此函数清理 // 主线程定期调用此函数清理
void cleanup_finished_iouring_tasks() { void cleanup_finished_iouring_tasks(iouring_ctx_t *ctx) {
pthread_mutex_lock(&g_destroy_queue.lock); pthread_mutex_lock(&g_destroy_queue.lock);
task_t *list = g_destroy_queue.head; task_t *list = g_destroy_queue.head;
g_destroy_queue.head = NULL; g_destroy_queue.head = NULL;
@@ -366,8 +335,6 @@ void cleanup_finished_iouring_tasks() {
task_destroy(list); // 在主线程执行销毁 task_destroy(list); // 在主线程执行销毁
list = next; list = next;
} }
// printf("clean: %d\n\n", cnt);
// mp_print();
release_cnt += cnt; release_cnt += cnt;
// printf("push:%lld, sqe:%lld, cqe:%lld, rls:%lld\n", push_to_queue, push_to_sqe, get_from_cqe, release_cnt); // printf("push:%lld, sqe:%lld, cqe:%lld, rls:%lld\n", push_to_queue, push_to_sqe, get_from_cqe, release_cnt);
} }

View File

@@ -0,0 +1,199 @@
#include "diskuring.h"
#include "memory/alloc_dispatch.h"
#include <poll.h>
#include <sys/eventfd.h>
/*
 * Reset a freshly allocated task to a safe baseline.
 * Previously only done/res/next were cleared; iovs and iovcnt were left
 * uninitialized, so task_destroy() on a partially initialized task read
 * garbage pointers.  All fields now get defined values; callers still
 * overwrite op/fd/off/iovs/iovcnt afterwards as before.
 * Safe on NULL.
 */
void task_init(task_t *t)
{
    if (!t) return;
    t->op     = TASK_WRITE;  /* callers set the real op right after */
    t->fd     = -1;
    t->off    = 0;
    t->res    = 0;
    t->done   = 0;
    t->iovs   = NULL;
    t->iovcnt = 0;
    t->next   = NULL;
}
/*
 * Record the CQE result and mark the task complete.
 * res is stored before the done flag so a poller that observes done == 1
 * also sees the final result.
 * NOTE(review): there is no memory barrier here — assumes a single
 * publisher and that any cross-thread reader tolerates plain stores;
 * confirm against the waiters.
 */
void task_finish(task_t *t, int res)
{
    t->res  = res;   /* result first...            */
    t->done = 1;     /* ...then publish completion */
}
/*
 * Free a task and every iovec buffer it owns.
 * Ownership: submit_write() deep-copied the caller's buffers into the
 * task, so the task is the sole owner of everything freed here.
 * Now safe on NULL (previously kvs_free(t->...) would have dereferenced
 * a NULL task) and on tasks whose iovs array was never allocated.
 */
void task_destroy(task_t *t)
{
    if (!t) return;
    if (t->iovs) {
        for (int i = 0; i < t->iovcnt; i++) {
            if (t->iovs[i].iov_base) {
                kvs_free(t->iovs[i].iov_base);
            }
        }
        kvs_free(t->iovs);
    }
    kvs_free(t);
}
/*
 * Create the io_uring instance for the single-threaded disk writer.
 * Returns 0 on success or the negative errno reported by liburing.
 * Cleanup: io_uring_queue_init_params() returns 0 or a negative errno,
 * so the original trailing `if (ret != 0)` branch after the early return
 * was dead code (and would have returned the wrong sign); it has been
 * removed along with commented-out parameter experiments.
 */
int iouring_init(iouring_ctx_t *ctx, unsigned entries)
{
    memset(ctx, 0, sizeof(*ctx));

    struct io_uring_params params;
    memset(&params, 0, sizeof(params));

    int ret = io_uring_queue_init_params(entries, &ctx->ring, &params);
    if (ret < 0) {
        fprintf(stderr, "io_uring_queue_init_params failed: %d (%s)\n",
                ret, strerror(-ret));
        return ret;
    }

    /* Informational only: the kernel may round the CQ size up. */
    unsigned cq_size = *ctx->ring.cq.kring_entries;
    printf("Kernel CQ size: %u\n", cq_size);

    return 0;
}
/*
 * Tear down the io_uring instance.
 * NOTE(review): this does not drain the pending list (ctx->head) or wait
 * for in-flight SQEs — callers are expected to quiesce first; confirm at
 * the call sites.
 */
void iouring_shutdown(iouring_ctx_t *ctx)
{
    io_uring_queue_exit(&ctx->ring);
}
/*
 * Drain every completed CQE currently visible on the ring: mark each
 * task finished, log failures, destroy the task, then advance the CQ
 * head once for the whole batch.
 * Single-threaded design: tasks are destroyed inline here, there is no
 * cross-thread destroy queue.
 * Aborts on CQ overflow, since lost completions would leak tasks.
 */
void harvest_cqes(iouring_ctx_t *ctx)
{
    struct io_uring_cqe *cqe;
    unsigned head;
    int cq_count = 0;

    /* Walk the CQ without consuming; advance once at the end. */
    io_uring_for_each_cqe(&ctx->ring, head, cqe) {
        task_t *done = (task_t *)(uintptr_t)cqe->user_data;
        task_finish(done, cqe->res);
        if (cqe->res < 0) {
            fprintf(stderr, "write fail: fd=%d res=%d\n", done->fd, cqe->res);
        }
        /* Destroy directly — single-threaded, no global queue needed. */
        task_destroy(done);
        cq_count++;
    }
    if (cq_count > 0) {
        io_uring_cq_advance(&ctx->ring, cq_count);
    }

    /* Safety check: overflow means completions were dropped. */
    if (*ctx->ring.sq.kflags & IORING_SQ_CQ_OVERFLOW) {
        fprintf(stderr, "FATAL: CQ overflow detected!\n");
        abort();
    }
}
/*
 * Queue one writev task (deep-copying the caller's buffers so they can be
 * reused immediately) and push as many pending tasks into the SQ as it
 * will accept.  Returns the new task, or NULL on allocation failure (the
 * caller's buffers are untouched in that case).
 *
 * Fixes vs. original:
 *  - kvs_malloc(t) result is checked before task_init() (NULL deref).
 *  - the drain loop acquires the SQE *before* dequeuing a task; the
 *    original dequeued first and on SQ-full broke out with the task
 *    removed from the list but never prepped — a silent task leak.
 *  - io_uring_submit()'s result is no longer ignored.
 *
 * NOTE(review): the returned pointer may be destroyed by a later
 * harvest_cqes() as soon as the write completes — callers must not hold
 * it across further submit/tick calls; confirm at the call sites.
 */
task_t* submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off)
{
    if (!ctx || !bufs || !lens || count <= 0) return NULL;

    task_t *t = (task_t *)kvs_malloc(sizeof(task_t));
    if (!t) return NULL;               /* fix: don't init a failed alloc */
    task_init(t);
    t->op  = TASK_WRITE;
    t->fd  = fd;
    t->off = off;

    t->iovs = (struct iovec *)kvs_malloc(sizeof(struct iovec) * count);
    if (!t->iovs) {
        kvs_free(t);
        return NULL;
    }

    /* Deep-copy each buffer; on failure unwind everything allocated so far. */
    for (int i = 0; i < count; ++i) {
        size_t len = lens[i];
        void *buf = kvs_malloc(len);
        if (!buf) {
            for (int j = 0; j < i; ++j) {
                if (t->iovs[j].iov_base) kvs_free(t->iovs[j].iov_base);
            }
            kvs_free(t->iovs);
            kvs_free(t);
            return NULL;
        }
        memcpy(buf, bufs[i], len);
        t->iovs[i].iov_base = buf;
        t->iovs[i].iov_len  = len;
    }
    t->iovcnt = count;

    /* Reap finished writes first to free SQ/CQ room. */
    harvest_cqes(ctx);

    /* Append to the pending FIFO. */
    if (!ctx->head) {
        ctx->head = ctx->tail = t;
    } else {
        ctx->tail->next = t;
        ctx->tail = t;
    }

    /* Drain the FIFO into the SQ until it fills up. */
    int prepared = 0;
    while (ctx->head) {
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring);
        if (!sqe) break;               /* SQ full — rest stays queued */

        task_t *cur = ctx->head;
        ctx->head = cur->next;
        if (!ctx->head) ctx->tail = NULL;
        cur->next = NULL;

        io_uring_prep_writev(sqe, cur->fd, cur->iovs, cur->iovcnt, cur->off);
        sqe->user_data = (uint64_t)(uintptr_t)cur;
        prepared++;
    }
    if (prepared > 0) {
        int ret = io_uring_submit(&ctx->ring);
        if (ret < 0)
            fprintf(stderr, "io_uring_submit failed: %d\n", ret);
    }
    return t;
}
/*
 * Periodic pump for the single-threaded writer: reap finished CQEs, then
 * move as many queued tasks as possible from the pending FIFO into the
 * SQ.  Call from the owner thread's event loop; safe to call when idle.
 * Fix: io_uring_submit()'s result was assigned to an unused variable and
 * silently discarded — failures are now logged.
 */
void iouring_tick(iouring_ctx_t *ctx) {
    harvest_cqes(ctx);

    int prepared = 0;
    while (ctx->head) {
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring);
        if (!sqe) break;               /* SQ full — try again next tick */

        task_t *cur = ctx->head;
        ctx->head = cur->next;
        if (!ctx->head) ctx->tail = NULL;
        cur->next = NULL;

        io_uring_prep_writev(sqe, cur->fd, cur->iovs, cur->iovcnt, cur->off);
        sqe->user_data = (uint64_t)(uintptr_t)cur;
        prepared++;
    }
    if (prepared > 0) {
        int ret = io_uring_submit(&ctx->ring);
        if (ret < 0)
            fprintf(stderr, "io_uring_submit failed: %d\n", ret);
    }
}

View File

@@ -18,7 +18,6 @@ typedef struct task {
int fd; int fd;
off_t off; off_t off;
int refcount;
int res; // cqe->res int res; // cqe->res
int done; // 0/1 int done; // 0/1
@@ -49,7 +48,6 @@ typedef struct {
pthread_mutex_t lock; pthread_mutex_t lock;
} destroy_queue_t; } destroy_queue_t;
int iouring_register_fd(iouring_ctx_t *ctx, int fd);
void task_init(task_t *t); void task_init(task_t *t);
void task_finish(task_t *t, int res); void task_finish(task_t *t, int res);
@@ -61,6 +59,7 @@ int iouring_init(iouring_ctx_t *ctx, unsigned entries);
void iouring_shutdown(iouring_ctx_t *ctx); void iouring_shutdown(iouring_ctx_t *ctx);
task_t* submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off); task_t* submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off);
int uring_task_complete(iouring_ctx_t *ctx);
void cleanup_finished_iouring_tasks(); void cleanup_finished_iouring_tasks();

View File

@@ -0,0 +1,50 @@
/* Single-threaded io_uring disk-write helper: a pending task FIFO that is
 * drained into the SQ by submit_write()/iouring_tick(), with completions
 * reaped inline (no worker thread, no locks).
 * NOTE(review): the guard macro uses a reserved `__`-prefixed identifier,
 * and a sibling diskuring.h may use the same guard — including both in
 * one TU could silently drop this header; confirm.
 */
#ifndef __DISK_IOURING_H__
#define __DISK_IOURING_H__
#include <liburing.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>

#define BATCH_SIZE 256

typedef enum { TASK_READ, TASK_WRITE } task_op_t;

/* One buffered write request; owns deep copies of the caller's buffers. */
typedef struct task {
    task_op_t op;
    int fd;
    off_t off;

    int res;             // cqe->res
    int done;            // 0/1

    struct iovec *iovs;  // iovec array (deep-copied data, owned by the task)
    int iovcnt;          // number of iovecs

    struct task *next;   // pending-FIFO link
} task_t;

/* Per-ring context; single owner thread, no locking. */
typedef struct {
    struct io_uring ring;
    int pending_count;
    task_t *head;        // pending FIFO head (not yet in the SQ)
    task_t *tail;        // pending FIFO tail
} iouring_ctx_t;

void task_init(task_t *t);
void task_finish(task_t *t, int res);
void task_destroy(task_t *t);

int iouring_init(iouring_ctx_t *ctx, unsigned entries);
void iouring_shutdown(iouring_ctx_t *ctx);
task_t* submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off);
void iouring_tick(iouring_ctx_t *ctx);

extern iouring_ctx_t global_uring_ctx;
#endif

View File

@@ -130,7 +130,7 @@ $(BZS_APPS): $(LIBBLAZESYM_OBJ)
# Build application binary # Build application binary
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT) $(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
$(call msg,BINARY,$@) $(call msg,BINARY,$@)
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@ $(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -lrt -lpthread -o $@
# delete failed targets # delete failed targets
.DELETE_ON_ERROR: .DELETE_ON_ERROR:

View File

@@ -15,66 +15,46 @@ struct {
__uint(value_size, sizeof(int)); __uint(value_size, sizeof(int));
} events SEC(".maps"); } events SEC(".maps");
/* __completed_cmd(const uint8_t *cmd, size_t len, unsigned long long seq); */ // 1) notify: __replica_notify(seq, off, len)
SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__completed_cmd") // SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__replica_notify")
int BPF_KPROBE(handle_completed_cmd, // int BPF_KPROBE(handle_replica_notify, __u64 seq, __u32 off, __u32 len)
const __u8 *cmd, size_t len, __u64 seq) // {
{ // struct replica_event evt = {};
struct replica_event evt = {}; // evt.type = EVENT_CMD_META;
__u32 copy_len; // evt.meta.seq = seq;
// evt.meta.off = off;
// evt.meta.len = len;
evt.type = EVENT_COMPLETED_CMD; // bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &evt, sizeof(evt));
evt.complete.seq = seq; // return 0;
// }
copy_len = len; // 2) ssync: __ssync(ip, ip_len, port, seq)
if (copy_len > MAX_CMD_LEN)
copy_len = MAX_CMD_LEN;
evt.complete.len = copy_len;
if (cmd)
bpf_probe_read_user(evt.complete.cmd, copy_len, cmd);
bpf_perf_event_output(ctx, &events,
BPF_F_CURRENT_CPU,
&evt, sizeof(evt));
return 0;
}
/* __ssync(const uint8_t *ip, uint32_t ip_len, int port, unsigned long long seq); */
SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__ssync") SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__ssync")
int BPF_KPROBE(handle_ssync, int BPF_KPROBE(handle_ssync, const __u8 *ip, __u32 ip_len, int port, __u64 seq)
const __u8 *ip, __u32 ip_len, int port, __u64 seq)
{ {
struct replica_event evt = {}; struct replica_event evt = {};
evt.type = EVENT_SSYNC; evt.type = EVENT_SSYNC;
evt.sync.seq = seq; evt.sync.seq = seq;
evt.sync.port = port; evt.sync.port = port;
__u32 copy_len = ip_len; __u32 copy_len = ip_len;
if (copy_len > sizeof(evt.sync.ip)) if (copy_len > MAX_IP_LEN) copy_len = MAX_IP_LEN;
copy_len = sizeof(evt.sync.ip); evt.sync.ip_len = copy_len;
if (ip) if (ip)
bpf_probe_read_user(evt.sync.ip, copy_len, ip); bpf_probe_read_user(evt.sync.ip, copy_len, ip);
bpf_perf_event_output(ctx, &events, bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &evt, sizeof(evt));
BPF_F_CURRENT_CPU,
&evt, sizeof(evt));
return 0; return 0;
} }
/* __sready(void); */ // 3) sready: __sready()
SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__sready") SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__sready")
int BPF_KPROBE(handle_sready) int BPF_KPROBE(handle_sready)
{ {
struct replica_event evt = {}; struct replica_event evt = {};
evt.type = EVENT_SREADY; evt.type = EVENT_SREADY;
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &evt, sizeof(evt));
bpf_perf_event_output(ctx, &events,
BPF_F_CURRENT_CPU,
&evt, sizeof(evt));
return 0; return 0;
} }

View File

@@ -10,202 +10,474 @@
#include <sys/socket.h> #include <sys/socket.h>
#include <netinet/in.h> #include <netinet/in.h>
#include <arpa/inet.h> #include <arpa/inet.h>
#include <sys/epoll.h>
#include <fcntl.h> #include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <pthread.h>
#include "replica_shm.h"
#include "replica.h" #include "replica.h"
typedef enum { #define DEBUGLOG(...) fprintf(stderr, __VA_ARGS__)
OFFLINE = 0,
ONLINE = 1,
}replica_state_e ;
struct cmd_node { /* ============================================================ */
__u64 seq; #define REPLICA_SHM_MAGIC 0x52504C43u /* 'RPLC' */
__u32 len; #define REPLICA_SHM_VER 1
uint8_t *cmd;
struct cmd_node *next; static inline uint64_t align8_u64(uint64_t x) { return (x + 7u) & ~7ull; }
};
/*
 * Open (and optionally create) the POSIX shared-memory region used for
 * master->slave replication hand-off, mapping it read/write.
 * total_size must cover the header plus at least 4 KiB of data.
 * Returns 0 on success or -errno on failure; on failure *s is untouched
 * beyond the initial memset.
 * NOTE(review): mode 0666 makes the segment world-writable — confirm
 * this is intended.
 * NOTE(review): when attaching (create == 0) to a region whose magic
 * does not match, the header is re-initialized, which would discard a
 * live producer's write_off/last_seq — confirm both sides agree on who
 * initializes.
 */
int replica_shm_open(replica_shm_t *s, const char *name, size_t total_size, int create)
{
    if (!s || !name || total_size < (sizeof(replica_shm_hdr_t) + 4096)) return -EINVAL;

    memset(s, 0, sizeof(*s));

    int flags = O_RDWR;
    if (create) flags |= O_CREAT;

    int fd = shm_open(name, flags, 0666);
    if (fd < 0) return -errno;

    /* Only the creator sizes the segment. */
    if (create) {
        if (ftruncate(fd, (off_t)total_size) != 0) {
            int e = -errno; close(fd); return e;
        }
    }

    void *p = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) {
        int e = -errno; close(fd); return e;
    }

    s->fd = fd;
    s->map_size = total_size;
    s->hdr = (replica_shm_hdr_t *)p;
    s->data = (uint8_t *)p + sizeof(replica_shm_hdr_t);

    /* Initialize the header when creating or when it looks uninitialized. */
    if (create || s->hdr->magic != REPLICA_SHM_MAGIC) {
        memset(s->hdr, 0, sizeof(*s->hdr));
        s->hdr->magic = REPLICA_SHM_MAGIC;
        s->hdr->version = REPLICA_SHM_VER;
        s->hdr->capacity = total_size - sizeof(replica_shm_hdr_t);
        s->hdr->write_off = 0;
        s->hdr->last_seq = 0;
    }

    return 0;
}
/*
 * Copy the record header located at byte offset `off` inside the shared
 * data area into *out_hdr without consuming anything.
 * Returns 0 on success, -EINVAL on bad arguments or when the header
 * would extend past the region's capacity.
 */
int replica_shm_peek(replica_shm_t *s, uint32_t off, replica_rec_hdr_t *out_hdr)
{
    if (s == NULL || out_hdr == NULL) return -EINVAL;
    if (s->hdr == NULL || s->data == NULL) return -EINVAL;

    /* Bounds check in 64-bit to avoid uint32 wrap-around. */
    uint64_t end = (uint64_t)off + sizeof(replica_rec_hdr_t);
    if (end > s->hdr->capacity) return -EINVAL;

    memcpy(out_hdr, s->data + off, sizeof(replica_rec_hdr_t));
    return 0;
}
/*
 * Unmap and close a shared-memory handle, then zero *s.
 * Does not shm_unlink() the segment — the name stays in the namespace.
 * Safe on NULL and on an already-closed (zeroed) handle.
 * NOTE(review): the `s->fd > 0` guard treats fd 0 as "unset" (the struct
 * is memset to zero in open/close); a genuine fd 0 would leak — confirm
 * this sentinel convention is acceptable.
 */
void replica_shm_close(replica_shm_t *s)
{
    if (!s) return;
    if (s->hdr && s->map_size) munmap(s->hdr, s->map_size);
    if (s->fd > 0) close(s->fd);
    memset(s, 0, sizeof(*s));
}
/* ================================================================================*/
struct pending_queue {
struct cmd_node *head;
struct cmd_node *tail;
int count;
};
/* ================= 全局状态 ================= */ /* ================= 全局状态 ================= */
static replica_state_e state = OFFLINE; #define DEBUGLOG(...) fprintf(stderr, __VA_ARGS__)
static int sockfd = -1;
static replica_shm_t g_shm;
static int g_sockfd = -1;
static char peer_ip[MAX_IP_LEN]; static char peer_ip[MAX_IP_LEN];
static int peer_port; static int peer_port = 0;
static __u64 peer_seq; static uint64_t SYNC_SEQ = 0;
static struct pending_queue pending = { static uint64_t local_seq = 0;
.head = NULL, static uint32_t read_off = 0;
.tail = NULL,
.count = 0, static pthread_t reader_thread;
static pthread_t sender_thread;
static volatile int should_stop = 0;
/* ================= ================= */
struct send_node {
uint8_t *data;
uint32_t len;
uint32_t sent;
struct send_node *next;
}; };
/* ================= pending 队列操作 ================= */ static struct {
static void pending_free() struct send_node *head;
struct send_node *tail;
int count;
pthread_mutex_t lock;
pthread_cond_t not_empty;
} sendq = {
.lock = PTHREAD_MUTEX_INITIALIZER,
.not_empty = PTHREAD_COND_INITIALIZER
};
static void sendq_free_all(void)
{ {
struct pending_queue *q = &pending; pthread_mutex_lock(&sendq.lock);
struct cmd_node *cur = q->head; struct send_node *c = sendq.head;
while (cur) { while (c) {
struct cmd_node *tmp = cur; struct send_node *n = c->next;
cur = cur->next; free(c->data);
free(tmp->cmd); free(c);
free(tmp); c = n;
} }
q->head = q->tail = NULL; sendq.head = sendq.tail = NULL;
q->count = 0; sendq.count = 0;
pthread_mutex_unlock(&sendq.lock);
} }
static void pending_push(__u64 seq, __u32 len, const uint8_t *cmd)
{
struct cmd_node *node = malloc(sizeof(*node));
if (!node)
return;
node->cmd = malloc(len); static void sendq_push(uint8_t *data, uint32_t len)
if (!node->cmd) { {
free(node); struct send_node *n = (struct send_node *)malloc(sizeof(*n));
if (!n) {
free(data);
return; return;
} }
n->data = data;
n->len = len;
n->sent = 0;
n->next = NULL;
memcpy(node->cmd, cmd, len); pthread_mutex_lock(&sendq.lock);
node->seq = seq;
node->len = len;
node->next = NULL;
if (!pending.tail) { if (!sendq.tail) {
pending.head = pending.tail = node; sendq.head = sendq.tail = n;
} else { } else {
pending.tail->next = node; sendq.tail->next = n;
pending.tail = node; sendq.tail = n;
} }
sendq.count++;
pending.count++; pthread_cond_signal(&sendq.not_empty);
pthread_mutex_unlock(&sendq.lock);
} }
static void pending_gc(__u64 min_seq) static void sendq_pop(void)
{ {
struct cmd_node *cur = pending.head; if (!sendq.head) return;
struct send_node *n = sendq.head;
int n = pending.count; sendq.head = n->next;
while (cur && cur->seq < min_seq) { if (!sendq.head) sendq.tail = NULL;
struct cmd_node *tmp = cur; free(n->data);
cur = cur->next; free(n);
sendq.count--;
free(tmp->cmd);
free(tmp);
pending.count--;
}
printf("gc:%d\n", n-pending.count);
pending.head = cur;
if (!cur)
pending.tail = NULL;
} }
static void pending_send_one(struct cmd_node *node){ /* ================= Reader 线程:读共享内存 ================= */
int rt = send(sockfd, node->cmd, node->len, 0); static void* reader_thread_func(void *arg)
printf("send seq:%lld, rt=%d\n", node->seq, rt);
}
static void pending_send_all(void)
{ {
struct cmd_node *cur = pending.head; (void)arg;
while (cur) { DEBUGLOG("Reader thread started\n");
pending_send_one(cur);
cur = cur->next; while (!should_stop) {
replica_rec_hdr_t h;
uint64_t last = __atomic_load_n(&g_shm.hdr->last_seq, __ATOMIC_ACQUIRE);
if (local_seq > last) {
// 没有新数据,短暂休眠避免空转
continue;
} }
if (read_off+ sizeof(replica_rec_hdr_t) >= g_shm.hdr->capacity) {
DEBUGLOG("Reader: read_off overflow, reset\n");
// read_off = 0;
break;
// continue;
}
if (replica_shm_peek(&g_shm, read_off, &h) != 0) {
DEBUGLOG("Reader: peek failed at %u\n", read_off);
break;
// continue;
}
// 检测 wrap
if (h.len == 0) {
DEBUGLOG("Reader: wrap at offset %u\n", read_off);
read_off = 0;
continue;
}
// 跳过 SYNC_SEQ 之前的
if (h.seq < SYNC_SEQ) {
uint64_t step = align8_u64((uint64_t)sizeof(replica_rec_hdr_t) + (uint64_t)h.len);
if (read_off + step > g_shm.hdr->capacity) {
read_off = 0;
} else {
read_off += (uint32_t)step;
}
continue;
}
// 序列号检查
if (h.seq != local_seq) {
DEBUGLOG("Reader: seq mismatch! h.seq=%lu, local_seq=%lu, off=%u\n",
h.seq, local_seq, read_off);
continue;
}
// 读取数据
uint8_t *buf = (uint8_t *)malloc(h.len);
if (!buf) {
DEBUGLOG("Reader: malloc failed\n");
usleep(1000);
continue;
}
memcpy(buf, g_shm.data + read_off + sizeof(replica_rec_hdr_t), h.len);
sendq_push(buf, h.len);
uint64_t step = align8_u64((uint64_t)sizeof(replica_rec_hdr_t) + (uint64_t)h.len);
if (read_off + step > g_shm.hdr->capacity) {
read_off = 0;
} else {
read_off += (uint32_t)step;
}
local_seq++;
}
DEBUGLOG("Reader thread stopped\n");
return NULL;
}
/* ================= Sender 线程:发送数据 ================= */
/* ================= Sender thread: ship queued records over TCP ================= */
/*
 * Drains sendq over g_sockfd using epoll.  Handles partial sends by
 * tracking n->sent; on hard send errors or EOF it closes the socket,
 * sets g_sockfd = -1 and exits.
 * Locking: sendq.lock is dropped around the blocking-ish send() and
 * re-acquired afterwards.  This is only safe because this thread is the
 * sole consumer — the head node cannot be popped by anyone else while
 * the lock is released.  NOTE(review): confirm no second consumer ever
 * runs.
 * NOTE(review): EPOLLOUT is level-triggered here, so with an empty sendq
 * and a writable socket epoll_wait() returns immediately — this loop can
 * spin when idle; consider arming EPOLLOUT only when sendq is non-empty.
 */
static void* sender_thread_func(void *arg)
{
    (void)arg;
    DEBUGLOG("Sender thread started\n");

    int epfd = epoll_create1(0);
    if (epfd < 0) {
        perror("epoll_create1");
        return NULL;
    }

    struct epoll_event ev;
    memset(&ev, 0, sizeof(ev));
    ev.events = EPOLLIN | EPOLLOUT;
    ev.data.fd = g_sockfd;
    if (epoll_ctl(epfd, EPOLL_CTL_ADD, g_sockfd, &ev) != 0) {
        perror("epoll_ctl ADD");
        close(epfd);
        return NULL;
    }

    while (!should_stop && g_sockfd >= 0) {
        struct epoll_event events[4];
        int nfds = epoll_wait(epfd, events, 4, 100); // 100ms timeout
        if (nfds < 0) {
            if (errno == EINTR) continue;
            perror("epoll_wait");
            break;
        }

        for (int i = 0; i < nfds; i++) {
            if (events[i].data.fd != g_sockfd)
                continue;

            if (events[i].events & (EPOLLERR | EPOLLHUP)) {
                DEBUGLOG("Sender: EPOLLERR/EPOLLHUP\n");
                close(g_sockfd);
                g_sockfd = -1;
                break;
            }

            if (events[i].events & EPOLLIN) {
                /* Drain (and discard) anything the peer sends back.
                 * NOTE(review): return value ignored; a 0 (EOF) here is
                 * only detected later via send() — confirm acceptable. */
                char buf[4096];
                recv(g_sockfd, buf, sizeof(buf), 0);
            }

            if (events[i].events & EPOLLOUT) {
                pthread_mutex_lock(&sendq.lock);
                while (sendq.head) {
                    struct send_node *n = sendq.head;
                    /* Drop the lock for the syscall; safe — single consumer. */
                    pthread_mutex_unlock(&sendq.lock);
                    int nbytes = send(g_sockfd, n->data + n->sent,
                                      (int)(n->len - n->sent), MSG_NOSIGNAL);
                    pthread_mutex_lock(&sendq.lock);

                    if (nbytes > 0) {
                        n->sent += (uint32_t)nbytes;
                        if (n->sent == n->len) {
                            sendq_pop();  /* node fully sent; lock is held */
                            continue;
                        }
                        // partial send
                        break;
                    }
                    if (nbytes < 0) {
                        if (errno == EAGAIN || errno == EWOULDBLOCK) {
                            break;  /* kernel buffer full; wait for EPOLLOUT */
                        }
                        DEBUGLOG("Sender: send error errno=%d\n", errno);
                        pthread_mutex_unlock(&sendq.lock);
                        close(g_sockfd);
                        g_sockfd = -1;
                        goto out;
                    }
                    // nbytes == 0
                    DEBUGLOG("Sender: send returned 0\n");
                    pthread_mutex_unlock(&sendq.lock);
                    close(g_sockfd);
                    g_sockfd = -1;
                    goto out;
                }
                pthread_mutex_unlock(&sendq.lock);
            }
        }
    }

out:
    close(epfd);
    DEBUGLOG("Sender thread stopped\n");
    return NULL;
}
/* ================= 网络逻辑 ================= */ /* ================= 网络逻辑 ================= */
static void try_connect(void) static int connect_peer(void)
{ {
if(sockfd > 0){ if (peer_port <= 0 || peer_ip[0] == '\0')
close(sockfd); return -1;
sockfd = -1;
if (g_sockfd >= 0) {
close(g_sockfd);
g_sockfd = -1;
} }
struct sockaddr_in addr = {}; int fd = socket(AF_INET, SOCK_STREAM, 0);
int i = 0; if (fd < 0) {
addr.sin_family = AF_INET;
addr.sin_port = htons(peer_port);
inet_pton(AF_INET, peer_ip, &addr.sin_addr);
for(i = 0;i < 10; ++ i){
sockfd = socket(AF_INET, SOCK_STREAM, 0);
if (sockfd < 0) {
perror("socket"); perror("socket");
return; return -1;
} }
printf("connect try %d...\n", i + 1);
if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) == 0) { struct sockaddr_in a;
printf("connect success: %s:%d\n", peer_ip, peer_port); memset(&a, 0, sizeof(a));
state = ONLINE; a.sin_family = AF_INET;
pending_send_all(); a.sin_port = htons(peer_port);
return; if (inet_pton(AF_INET, peer_ip, &a.sin_addr) != 1) {
DEBUGLOG("inet_pton failed for ip=%s\n", peer_ip);
close(fd);
return -1;
} }
perror("connect"); if (connect(fd, (struct sockaddr *)&a, sizeof(a)) != 0) {
close(sockfd); // 这里可以重试;按你的要求先简单返回失败
sockfd = -1; // perror("connect");
close(fd);
sleep(1); return -1;
} }
printf("connect failed after 10 retries\n"); // non-blocking配合 epoll
int flags = fcntl(fd, F_GETFL, 0);
if (flags >= 0) fcntl(fd, F_SETFL, flags | O_NONBLOCK);
g_sockfd = fd;
DEBUGLOG("connect ok %s:%d\n", peer_ip, peer_port);
return 0;
} }
/* ================= perf buffer 回调 ================= */ /* ================= perf buffer 回调 ================= */
static void handle_event(void *ctx, int cpu, void *data, __u32 size) static void handle_event(void *ctx, int cpu, void *data, __u32 size)
{ {
struct replica_event *evt = data; (void)ctx; (void)cpu;
if (size < sizeof(struct replica_event)) return;
switch (evt->type) { struct replica_event *e = (struct replica_event*)data;
case EVENT_SSYNC: if (e->type == EVENT_SSYNC) {
strncpy(peer_ip, evt->sync.ip, sizeof(peer_ip)); memset(peer_ip, 0, sizeof(peer_ip));
peer_port = evt->sync.port; memcpy(peer_ip, e->sync.ip, e->sync.ip_len);
peer_seq = evt->sync.seq; peer_port = e->sync.port;
printf("SSYNC [seq:%lld], [%s:%d]\n", peer_seq, peer_ip, peer_port); SYNC_SEQ = e->sync.seq;
state = OFFLINE; local_seq = SYNC_SEQ;
pending_gc(peer_seq); read_off = 0;
break;
case EVENT_COMPLETED_CMD: DEBUGLOG("SSYNC: peer=%s:%d SYNC_SEQ=%llu\n",
// printf("CMD [seq:%lld], cmd:\n[\n%s]\n", evt->complete.seq, evt->complete.cmd); peer_ip, peer_port, (unsigned long long)SYNC_SEQ);
pending_push(evt->complete.seq,
evt->complete.len,
evt->complete.cmd);
if (state == ONLINE && pending.tail) { // 停止旧线程
struct cmd_node *n = pending.tail; should_stop = 1;
pending_send_one(n); if (reader_thread) {
pthread_join(reader_thread, NULL);
reader_thread = 0;
}
if (sender_thread) {
pthread_join(sender_thread, NULL);
sender_thread = 0;
} }
break;
case EVENT_SREADY: if (g_sockfd >= 0) {
printf("SREADY \n"); close(g_sockfd);
if (state == OFFLINE) g_sockfd = -1;
try_connect(); }
break; sendq_free_all();
return;
}
if (e->type == EVENT_SREADY) {
DEBUGLOG("SREADY\n");
if (connect_peer() != 0) {
DEBUGLOG("connect_peer failed\n");
return;
}
// 启动双线程
should_stop = 0;
if (pthread_create(&reader_thread, NULL, reader_thread_func, NULL) != 0) {
perror("pthread_create reader");
return;
}
if (pthread_create(&sender_thread, NULL, sender_thread_func, NULL) != 0) {
perror("pthread_create sender");
pthread_cancel(reader_thread);
return;
}
DEBUGLOG("Reader and Sender threads started\n");
return;
} }
} }
/* ================= main ================= */ /* ================= main ================= */
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int rc = replica_shm_open(&g_shm, REPLICA_SHM_NAME, REPLICA_SHM_SIZE, 0);
if (rc != 0) {
fprintf(stderr, "replica_shm_open failed rc=%d (did you create it in kvstore?)\n", rc);
return 1;
}
struct replica_bpf *skel; struct replica_bpf *skel;
struct perf_buffer *pb = NULL; struct perf_buffer *pb = NULL;
int err; int err;
@@ -231,8 +503,7 @@ int main(int argc, char **argv)
goto cleanup; goto cleanup;
} }
printf("Successfully started! Please run `sudo cat /sys/kernel/debug/tracing/trace_pipe` " printf("Successfully started! \n");
"to see output of the BPF programs.\n");
pb = perf_buffer__new(bpf_map__fd(skel->maps.events), 8, pb = perf_buffer__new(bpf_map__fd(skel->maps.events), 8,
@@ -244,13 +515,19 @@ int main(int argc, char **argv)
while (1) { while (1) {
perf_buffer__poll(pb, 1000); // 处理事件 perf_buffer__poll(pb, 1000); // 处理事件
} }
perf_buffer__free(pb); perf_buffer__free(pb);
cleanup: cleanup:
pending_free(); should_stop = 1;
if (sockfd >= 0) close(sockfd); if (reader_thread) pthread_join(reader_thread, NULL);
if (sender_thread) pthread_join(sender_thread, NULL);
if (g_sockfd >= 0) close(g_sockfd);
replica_shm_close(&g_shm);
sendq_free_all();
replica_bpf__destroy(skel); replica_bpf__destroy(skel);
return -err; return -err;
} }

View File

@@ -2,34 +2,25 @@
#define __REPLICA_H__ #define __REPLICA_H__
#define MAX_CMD_LEN 256
#define MAX_IP_LEN 64 #define MAX_IP_LEN 64
enum event_type { enum {
EVENT_COMPLETED_CMD, EVENT_SSYNC = 1,
EVENT_SSYNC, EVENT_SREADY = 2,
EVENT_SREADY,
};
struct complete_cmd_evt {
__u64 seq;
__u32 len;
__u8 cmd[MAX_CMD_LEN];
};
struct sync_evt {
__u64 seq;
char ip[MAX_IP_LEN];
__s32 port;
}; };
struct replica_event { struct replica_event {
__u32 type; uint32_t type;
__u32 _pad; uint32_t _pad;
union { union {
struct complete_cmd_evt complete; struct {
struct sync_evt sync; uint64_t seq; // SYNC_SEQ从这个 seq 开始增量
int32_t port;
uint32_t ip_len;
char ip[MAX_IP_LEN];
} sync;
}; };
}; };

59
ebpf/c/replica_shm.h Normal file
View File

@@ -0,0 +1,59 @@
#ifndef __REPLICA_SHM_H__
#define __REPLICA_SHM_H__
#include <stdint.h>
#include <stddef.h>
/* Shared-memory ring written by kvstore (single producer) and read by the
 * replicator (single consumer). Records are appended back-to-back in the
 * data area, each prefixed by a replica_rec_hdr_t. */
#ifndef REPLICA_SHM_NAME
#define REPLICA_SHM_NAME "/kvs_replica_shm"
#endif
#ifndef REPLICA_SHM_SIZE
// Total segment size; tune as needed.
// NOTE(review): original comment said 64MB but the value is 256MB — confirm intent.
#define REPLICA_SHM_SIZE (256u * 1024u * 1024u)
#endif
// Per-record header (lives in the shm data area, immediately before its payload)
typedef struct __attribute__((packed)) {
    uint64_t seq;      // monotonically increasing sequence number
    uint32_t len;      // payload length in bytes
    uint32_t flags;    // reserved: compression, record type, etc.
    uint32_t crc32;    // optional checksum; 0 means "not checked"
    uint32_t reserved; // padding/alignment
    // uint8_t payload[len] follows immediately after this header
} replica_rec_hdr_t;
// Metadata at the top of the shm segment
typedef struct __attribute__((packed)) {
    uint32_t magic;
    uint32_t version;
    uint64_t capacity;  // size of the data area in bytes
    uint64_t write_off; // producer write offset, 0..capacity-1
    uint64_t last_seq;  // most recent seq published by the producer
    uint8_t _pad[64];   // cacheline padding
    // data[capacity] follows immediately after this header
} replica_shm_hdr_t;
// Handle for one mapping of the shm segment
typedef struct {
    int fd;
    size_t map_size;
    replica_shm_hdr_t *hdr;
    uint8_t *data;
} replica_shm_t;
// kvstore side: create/open + mmap the segment
int replica_shm_open(replica_shm_t *s, const char *name, size_t total_size, int create);
// kvstore side: append one record; the record's offset relative to the data
// area is returned via out_off (for notifications).
// Single-writer design: no locking required. Returns 0 on success, <0 on
// failure (out of space or bad arguments).
int replica_shm_append(replica_shm_t *s, uint64_t seq, const void *buf, uint32_t len, uint32_t *out_off);
// replicator side: read a record header without moving any cursor; the payload
// can then be copied directly. off is an offset within the data area.
int replica_shm_peek(replica_shm_t *s, uint32_t off, replica_rec_hdr_t *out_hdr);
// Unmap and close the segment handle
void replica_shm_close(replica_shm_t *s);
extern replica_shm_t g_rep_shm;
#endif

18
ebpf/old.c/.gitignore vendored Normal file
View File

@@ -0,0 +1,18 @@
/.output
/bootstrap
/bootstrap_legacy
/minimal
/minimal_legacy
/minimal_ns
/uprobe
/kprobe
/fentry
/profile
/usdt
/sockfilter
/tc
/ksyscall
/task_iter
/lsm
/cmake-build-debug/
/cmake-build-release/

133
ebpf/old.c/CMakeLists.txt Normal file
View File

@@ -0,0 +1,133 @@
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
cmake_minimum_required(VERSION 3.16)
project(examples C)
# Tell cmake where to find BpfObject module
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../tools/cmake)
# Build vendored libbpf
include(ExternalProject)
ExternalProject_Add(libbpf
PREFIX libbpf
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../libbpf/src
CONFIGURE_COMMAND ""
BUILD_COMMAND make
CC=${CMAKE_C_COMPILER}
BUILD_STATIC_ONLY=1
OBJDIR=${CMAKE_CURRENT_BINARY_DIR}/libbpf/libbpf
DESTDIR=${CMAKE_CURRENT_BINARY_DIR}/libbpf
INCLUDEDIR=
LIBDIR=
UAPIDIR=
install install_uapi_headers
BUILD_IN_SOURCE TRUE
INSTALL_COMMAND ""
STEP_TARGETS build
BUILD_BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/libbpf/libbpf.a
)
ExternalProject_Add(bpftool
PREFIX bpftool
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../bpftool/src
CONFIGURE_COMMAND ""
BUILD_COMMAND make bootstrap
OUTPUT=${CMAKE_CURRENT_BINARY_DIR}/bpftool/
BUILD_IN_SOURCE TRUE
INSTALL_COMMAND ""
STEP_TARGETS build
)
find_program(CARGO_EXISTS cargo)
if(CARGO_EXISTS)
if(CMAKE_CROSSCOMPILING)
# Determine target triple
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
set(CARGO_TARGET "x86_64-unknown-linux-gnu")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
set(CARGO_TARGET "aarch64-unknown-linux-gnu")
else()
message(FATAL_ERROR "Unsupported processor for Linux: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
if(CMAKE_CXX_COMPILER)
set(RUST_LINKER ${CMAKE_CXX_COMPILER})
else()
set(RUST_LINKER ${CMAKE_C_COMPILER})
endif()
else()
message((FATAL_ERROR "Unsupported platform: ${CMAKE_SYSTEM_NAME}"))
endif()
ExternalProject_Add(blazesym
PREFIX blazesym
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../blazesym
CONFIGURE_COMMAND ""
BUILD_COMMAND ${CMAKE_COMMAND} -E env
RUSTFLAGS=-C\ linker=${RUST_LINKER}
cargo build --package=blazesym-c --release --target=${CARGO_TARGET}
BUILD_IN_SOURCE TRUE
INSTALL_COMMAND ""
STEP_TARGETS build
)
else() # Host
ExternalProject_Add(blazesym
PREFIX blazesym
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../blazesym
CONFIGURE_COMMAND ""
BUILD_COMMAND
cargo build --package=blazesym-c --release
BUILD_IN_SOURCE TRUE
INSTALL_COMMAND ""
STEP_TARGETS build
)
endif()
endif()
# Set BpfObject input parameters -- note this is usually not necessary unless
# you're in a highly vendored environment (like libbpf-bootstrap)
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
set(ARCH "x86")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
set(ARCH "arm")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
set(ARCH "arm64")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
set(ARCH "powerpc")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "mips")
set(ARCH "mips")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
set(ARCH "riscv")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
set(ARCH "loongarch")
endif()
set(BPFOBJECT_BPFTOOL_EXE ${CMAKE_CURRENT_BINARY_DIR}/bpftool/bootstrap/bpftool)
set(BPFOBJECT_VMLINUX_H ${CMAKE_CURRENT_SOURCE_DIR}/../../vmlinux.h/include/${ARCH}/vmlinux.h)
set(LIBBPF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/libbpf)
set(LIBBPF_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/libbpf/libbpf.a)
find_package(BpfObject REQUIRED)
# Create an executable for each application
file(GLOB apps *.bpf.c)
if(NOT CARGO_EXISTS)
list(REMOVE_ITEM apps ${CMAKE_CURRENT_SOURCE_DIR}/profile.bpf.c)
endif()
foreach(app ${apps})
get_filename_component(app_stem ${app} NAME_WE)
# Build object skeleton and depend skeleton on libbpf build
bpf_object(${app_stem} ${app_stem}.bpf.c)
add_dependencies(${app_stem}_skel libbpf bpftool)
add_executable(${app_stem} ${app_stem}.c)
target_link_libraries(${app_stem} ${app_stem}_skel)
if(${app_stem} STREQUAL profile)
target_include_directories(${app_stem} PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../../blazesym/capi/include)
target_link_libraries(${app_stem}
${CMAKE_CURRENT_SOURCE_DIR}/../../blazesym/target/${CARGO_TARGET}/release/libblazesym_c.a -lpthread -lrt -ldl)
add_dependencies(${app_stem} blazesym)
endif()
endforeach()

139
ebpf/old.c/Makefile Normal file
View File

@@ -0,0 +1,139 @@
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
OUTPUT := .output
CLANG ?= clang
LIBBPF_SRC := $(abspath ../../libbpf/src)
BPFTOOL_SRC := $(abspath ../../bpftool/src)
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
LIBBLAZESYM_SRC := $(abspath ../../blazesym/)
LIBBLAZESYM_INC := $(abspath $(LIBBLAZESYM_SRC)/capi/include)
LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym_c.a)
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
| sed 's/arm.*/arm/' \
| sed 's/aarch64/arm64/' \
| sed 's/ppc64le/powerpc/' \
| sed 's/mips.*/mips/' \
| sed 's/riscv64/riscv/' \
| sed 's/loongarch64/loongarch/')
VMLINUX := ../../vmlinux.h/include/$(ARCH)/vmlinux.h
# Use our own libbpf API headers and Linux UAPI headers distributed with
# libbpf to avoid dependency on system-wide headers, which could be missing or
# outdated
INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX)) -I$(LIBBLAZESYM_INC)
CFLAGS := -g -Wall
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
# APPS = minimal minimal_legacy minimal_ns bootstrap bootstrap_legacy uprobe kprobe fentry \
usdt sockfilter tc ksyscall task_iter lsm
APPS = replica
CARGO ?= $(shell which cargo)
ifeq ($(strip $(CARGO)),)
BZS_APPS :=
else
BZS_APPS := profile
APPS += $(BZS_APPS)
# Required by libblazesym
ALL_LDFLAGS += -lrt -ldl -lpthread -lm
endif
# Get Clang's default includes on this system. We'll explicitly add these dirs
# to the includes list when compiling with `-target bpf` because otherwise some
# architecture-specific dirs will be "missing" on some architectures/distros -
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
# sys/cdefs.h etc. might be missing.
#
# Use '-idirafter': Don't interfere with include mechanics except where the
# build would have failed anyways.
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
ifeq ($(V),1)
Q =
msg =
else
Q = @
msg = @printf ' %-8s %s%s\n' \
"$(1)" \
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
"$(if $(3), $(3))";
MAKEFLAGS += --no-print-directory
endif
define allow-override
$(if $(or $(findstring environment,$(origin $(1))),\
$(findstring command line,$(origin $(1)))),,\
$(eval $(1) = $(2)))
endef
$(call allow-override,CC,$(CROSS_COMPILE)cc)
$(call allow-override,LD,$(CROSS_COMPILE)ld)
.PHONY: all
all: $(APPS)
.PHONY: clean
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) $(APPS)
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
$(call msg,MKDIR,$@)
$(Q)mkdir -p $@
# Build libbpf
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
$(call msg,LIB,$@)
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
INCLUDEDIR= LIBDIR= UAPIDIR= \
install
# Build bpftool
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
$(call msg,BPFTOOL,$@)
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
$(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a::
$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --package=blazesym-c --release
$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a | $(OUTPUT)
$(call msg,LIB, $@)
$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym_c.a $@
# Build BPF code
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
$(call msg,BPF,$@)
$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
# Generate BPF skeletons
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
$(Q)$(BPFTOOL) gen skeleton $< > $@
# Build user-space code
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
$(call msg,CC,$@)
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_OBJ)
$(BZS_APPS): $(LIBBLAZESYM_OBJ)
# Build application binary
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
$(call msg,BINARY,$@)
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
# delete failed targets
.DELETE_ON_ERROR:
# keep intermediate (.skel.h, .bpf.o, etc) targets
.SECONDARY:

80
ebpf/old.c/replica.bpf.c Normal file
View File

@@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/* Copyright (c) 2020 Facebook */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include "replica.h"
char LICENSE[] SEC("license") = "Dual BSD/GPL";
// Perf event array used to ship replica_event records to user space.
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
    __uint(key_size, sizeof(int));
    __uint(value_size, sizeof(int));
} events SEC(".maps");
/* __completed_cmd(const uint8_t *cmd, size_t len, unsigned long long seq); */
// Fires when kvstore completes a command: forwards (seq, cmd bytes) through
// the perf buffer. Commands longer than MAX_CMD_LEN are truncated.
SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__completed_cmd")
int BPF_KPROBE(handle_completed_cmd,
    const __u8 *cmd, size_t len, __u64 seq)
{
    struct replica_event evt = {};
    __u32 copy_len;
    evt.type = EVENT_COMPLETED_CMD;
    evt.complete.seq = seq;
    copy_len = len;
    // Clamp to the fixed-size event buffer.
    if (copy_len > MAX_CMD_LEN)
        copy_len = MAX_CMD_LEN;
    evt.complete.len = copy_len;
    if (cmd)
        bpf_probe_read_user(evt.complete.cmd, copy_len, cmd);
    bpf_perf_event_output(ctx, &events,
        BPF_F_CURRENT_CPU,
        &evt, sizeof(evt));
    return 0;
}
/* __ssync(const uint8_t *ip, uint32_t ip_len, int port, unsigned long long seq); */
// Fires when the master announces a sync point: forwards the peer address
// and the sequence number replication should start from.
SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__ssync")
int BPF_KPROBE(handle_ssync,
    const __u8 *ip, __u32 ip_len, int port, __u64 seq)
{
    struct replica_event evt = {};
    evt.type = EVENT_SSYNC;
    evt.sync.seq = seq;
    evt.sync.port = port;
    __u32 copy_len = ip_len;
    // Clamp the IP string to the event buffer; the receiver must not assume
    // NUL termination when exactly sizeof(evt.sync.ip) bytes were copied.
    if (copy_len > sizeof(evt.sync.ip))
        copy_len = sizeof(evt.sync.ip);
    if (ip)
        bpf_probe_read_user(evt.sync.ip, copy_len, ip);
    bpf_perf_event_output(ctx, &events,
        BPF_F_CURRENT_CPU,
        &evt, sizeof(evt));
    return 0;
}
/* __sready(void); */
// Fires when the slave signals readiness: emits a bare EVENT_SREADY.
SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__sready")
int BPF_KPROBE(handle_sready)
{
    struct replica_event evt = {};
    evt.type = EVENT_SREADY;
    bpf_perf_event_output(ctx, &events,
        BPF_F_CURRENT_CPU,
        &evt, sizeof(evt));
    return 0;
}

355
ebpf/old.c/replica.c Normal file
View File

@@ -0,0 +1,355 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/* Copyright (c) 2020 Facebook */
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include "replica.skel.h"
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/epoll.h>
#include <fcntl.h>
#include "replica.h"
#define DEBUGLOG printf
// Connection state of the link to the peer node.
typedef enum {
    OFFLINE = 0, // not connected; commands accumulate in `pending`
    ONLINE = 1,  // connected; queued commands are flushed as we go
}replica_state_e ;
// One buffered command awaiting transmission to the peer.
struct cmd_node {
    __u64 seq;             // sequence number of the command
    __u32 len;             // bytes in cmd
    uint8_t *cmd;          // heap-owned copy of the command payload
    struct cmd_node *next;
};
// FIFO of commands not yet acknowledged as sent.
struct pending_queue {
    struct cmd_node *head; // oldest queued command
    struct cmd_node *tail; // newest queued command
    int count;             // number of queued nodes
};
/* ================= global state ================= */
static replica_state_e state = OFFLINE; // link state machine
static int sockfd = -1;                 // TCP socket to peer, -1 when closed
static int epollfd = -1;                // epoll instance created in main()
static char peer_ip[MAX_IP_LEN];        // peer address from EVENT_SSYNC
static int peer_port;                   // peer port from EVENT_SSYNC
static __u64 peer_seq;                  // sync start seq from EVENT_SSYNC
static struct pending_queue pending = {
    .head = NULL,
    .tail = NULL,
    .count = 0,
};
/* ================= pending 队列操作 ================= */
/* Release every queued command and reset the queue to empty. */
static void pending_free()
{
    struct cmd_node *node = pending.head;
    struct cmd_node *next;

    for (; node; node = next) {
        next = node->next;
        free(node->cmd);
        free(node);
    }

    pending.head = NULL;
    pending.tail = NULL;
    pending.count = 0;
}
/* Append a copy of `cmd` (len bytes, tagged with seq) to the tail of the
 * pending queue. The command is silently dropped on allocation failure. */
static void pending_push(__u64 seq, __u32 len, const uint8_t *cmd)
{
    struct cmd_node *node = malloc(sizeof(*node));
    if (node == NULL)
        return;

    node->cmd = malloc(len);
    if (node->cmd == NULL) {
        free(node);
        return;
    }

    node->seq = seq;
    node->len = len;
    node->next = NULL;
    memcpy(node->cmd, cmd, len);

    if (pending.tail == NULL) {
        pending.head = node;
        pending.tail = node;
    } else {
        pending.tail->next = node;
        pending.tail = node;
    }
    pending.count++;
}
/* Drop all queued commands whose seq is below min_seq (already covered by
 * the peer's sync point) and log how many nodes were reclaimed. */
static void pending_gc(__u64 min_seq)
{
    int before = pending.count;
    struct cmd_node *node = pending.head;

    while (node != NULL && node->seq < min_seq) {
        struct cmd_node *victim = node;
        node = node->next;
        free(victim->cmd);
        free(victim);
        pending.count--;
    }

    DEBUGLOG("gc:%d\n", before - pending.count);
    pending.head = node;
    if (node == NULL)
        pending.tail = NULL;
}
/*
 * Flush queued commands to the peer in FIFO order over sockfd.
 *
 * Stops at the first send() that does not transmit a complete command;
 * unsent nodes stay queued for the next attempt. On ECONNRESET/EPIPE the
 * socket is closed and state flips to OFFLINE.
 *
 * BUGFIX: in the original, the `rt == 0` and partial-send branches were
 * unreachable (nested inside `if (rt < 0)`), and only the rt < 0 path broke
 * out of the loop — a zero or partial send would retry the same node in a
 * tight loop forever. Every failure mode now breaks out.
 */
static void pending_send_all(void)
{
    struct cmd_node *cur = pending.head;

    while (cur) {
        int rt = send(sockfd, cur->cmd, cur->len, 0);
        if (rt == (int)cur->len) {
            /* fully sent: pop and free this node */
            struct cmd_node *tmp = cur;
            cur = cur->next;
            free(tmp->cmd);
            free(tmp);
            pending.count--;
            continue;
        }

        if (rt < 0) {
            perror("send failed");
            if (errno == ECONNRESET || errno == EPIPE) {
                state = OFFLINE;
                if (sockfd >= 0) {
                    close(sockfd);
                    sockfd = -1;
                    DEBUGLOG("connect closed\n");
                }
            }
        } else if (rt == 0) {
            fprintf(stderr, "send returned 0 (peer closed?)\n");
        } else {
            fprintf(stderr, "partial send: %d/%u\n", rt, cur->len);
        }
        break; /* leave remaining nodes queued; retry later */
    }

    pending.head = cur;
    if (!cur)
        pending.tail = NULL;
}
/* ================= 网络逻辑 ================= */
/*
 * (Re)establish the TCP connection to peer_ip:peer_port, retrying up to 10
 * times with a 1s pause between attempts. On success the socket is made
 * non-blocking, registered with the epoll instance, state flips to ONLINE,
 * and any pending commands are flushed.
 */
static void try_connect(void)
{
    /* BUGFIX: fd 0 is a valid descriptor; the original `sockfd > 0` test
     * would leak (and keep using) a socket on fd 0. */
    if (sockfd >= 0) {
        close(sockfd);
        sockfd = -1;
    }

    struct sockaddr_in addr = {};
    addr.sin_family = AF_INET;
    addr.sin_port = htons(peer_port);
    /* BUGFIX: inet_pton failure was ignored, connecting to a zero address. */
    if (inet_pton(AF_INET, peer_ip, &addr.sin_addr) != 1) {
        DEBUGLOG("invalid peer ip: %s\n", peer_ip);
        return;
    }

    for (int i = 0; i < 10; ++i) {
        sockfd = socket(AF_INET, SOCK_STREAM, 0);
        if (sockfd < 0) {
            perror("socket");
            return;
        }
        DEBUGLOG("connect try %d...\n", i + 1);
        if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
            DEBUGLOG("connect success: %s:%d\n", peer_ip, peer_port);
            int flags = fcntl(sockfd, F_GETFL, 0);
            if (flags >= 0)
                fcntl(sockfd, F_SETFL, flags | O_NONBLOCK);

            struct epoll_event ev;
            memset(&ev, 0, sizeof(ev));
            ev.events = EPOLLIN;
            ev.data.fd = sockfd;
            /* BUGFIX: epoll_ctl was unchecked; a failure here would leave
             * the socket invisible to the event loop. */
            if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sockfd, &ev) != 0) {
                perror("epoll_ctl ADD");
                close(sockfd);
                sockfd = -1;
                return;
            }
            state = ONLINE;
            pending_send_all();
            return;
        }
        perror("connect");
        close(sockfd);
        sockfd = -1;
        sleep(1);
    }
    DEBUGLOG("connect failed after 10 retries\n");
}
/* Drain and discard everything the peer sent. A read of 0 (peer closed)
 * or a hard error tears the connection down and flips state to OFFLINE;
 * EAGAIN/EWOULDBLOCK just ends the drain. */
static void handle_socket_readable(void)
{
    char scratch[65536];

    for (;;) {
        int n = recv(sockfd, scratch, sizeof(scratch), MSG_DONTWAIT);
        if (n > 0)
            continue; /* payload is ignored; keep draining */

        if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
            break; /* no more data for now */

        /* peer closed (n == 0) or unrecoverable error (n < 0) */
        if (n < 0)
            perror("recv");
        state = OFFLINE;
        epoll_ctl(epollfd, EPOLL_CTL_DEL, sockfd, NULL);
        close(sockfd);
        sockfd = -1;
        if (n == 0)
            DEBUGLOG("connection closed\n");
        break;
    }
}
static void handle_socket_writable(void)
{
pending_send_all();
if (!pending.head) {
struct epoll_event ev;
ev.events = EPOLLIN; // 只监听读
ev.data.fd = sockfd;
epoll_ctl(epollfd, EPOLL_CTL_MOD, sockfd, &ev);
}
}
/* ================= perf buffer 回调 ================= */
/*
 * Perf-buffer callback: dispatch one replica_event from the BPF probes.
 *
 * EVENT_SSYNC      — record peer address + sync seq, go OFFLINE, GC acked
 *                    commands below the sync seq.
 * EVENT_COMPLETED_CMD — queue the command; if ONLINE, arm EPOLLOUT so the
 *                    event loop flushes it.
 * EVENT_SREADY     — peer is ready; connect if currently OFFLINE.
 */
static void handle_event(void *ctx, int cpu, void *data, __u32 size)
{
    (void)ctx; (void)cpu;
    /* BUGFIX: validate the event size before dereferencing the payload. */
    if (size < sizeof(struct replica_event))
        return;
    struct replica_event *evt = data;

    switch (evt->type) {
    case EVENT_SSYNC:
        /* BUGFIX: the BPF side copies up to MAX_IP_LEN bytes with no
         * terminator guarantee; force NUL termination here. */
        strncpy(peer_ip, evt->sync.ip, sizeof(peer_ip) - 1);
        peer_ip[sizeof(peer_ip) - 1] = '\0';
        peer_port = evt->sync.port;
        peer_seq = evt->sync.seq;
        DEBUGLOG("SSYNC [seq:%lld], [%s:%d]\n", peer_seq, peer_ip, peer_port);
        state = OFFLINE;
        pending_gc(peer_seq);
        break;
    case EVENT_COMPLETED_CMD:
        pending_push(evt->complete.seq,
            evt->complete.len,
            evt->complete.cmd);
        if (state == ONLINE && sockfd >= 0) {
            /* arm EPOLLOUT so the event loop flushes the queue */
            struct epoll_event ev;
            ev.events = EPOLLIN | EPOLLOUT;
            ev.data.fd = sockfd;
            epoll_ctl(epollfd, EPOLL_CTL_MOD, sockfd, &ev);
        }
        break;
    case EVENT_SREADY:
        DEBUGLOG("SREADY \n");
        if (state == OFFLINE)
            try_connect();
        break;
    }
}
/* ================= main ================= */
int main(int argc, char **argv)
{
struct replica_bpf *skel;
struct perf_buffer *pb = NULL;
int err;
/* Open BPF application */
skel = replica_bpf__open();
if (!skel) {
fprintf(stderr, "Failed to open BPF skeleton\n");
return 1;
}
/* Load & verify BPF programs */
err = replica_bpf__load(skel);
if (err) {
fprintf(stderr, "Failed to load and verify BPF skeleton\n");
goto cleanup;
}
/* Attach tracepoint handler */
err = replica_bpf__attach(skel);
if (err) {
fprintf(stderr, "Failed to attach BPF skeleton\n");
goto cleanup;
}
printf("Successfully started! \n");
pb = perf_buffer__new(bpf_map__fd(skel->maps.events), 8,
handle_event, NULL, NULL, NULL);
if(!pb){
goto cleanup;
}
epollfd = epoll_create1(0);
if (epollfd < 0) {
fprintf(stderr, "epoll_create1 failed\n");
goto cleanup;
}
while (1) {
struct epoll_event events[10];
perf_buffer__poll(pb, 1000); // 处理事件
if(OFFLINE) continue;
int nfds = epoll_wait(epollfd, events, 10, 0);
for (int i = 0; i < nfds; i++) {
if (events[i].data.fd == sockfd) {
if (events[i].events & EPOLLIN) {
handle_socket_readable(); // 快速消费接收数据
}
if (events[i].events & EPOLLOUT) {
handle_socket_writable(); // 发送数据
}
}
}
}
perf_buffer__free(pb);
cleanup:
pending_free();
if (sockfd >= 0) close(sockfd);
replica_bpf__destroy(skel);
return -err;
}

37
ebpf/old.c/replica.h Normal file
View File

@@ -0,0 +1,37 @@
#ifndef __REPLICA_H__
#define __REPLICA_H__
/* Event definitions shared between replica.bpf.c (producer) and
 * replica.c (consumer via the perf buffer). */
// Maximum bytes of one replicated command carried in a single event
#define MAX_CMD_LEN 256
// Size of the peer IP string buffer
#define MAX_IP_LEN 64
// Event types emitted by the BPF uprobes
enum event_type {
    EVENT_COMPLETED_CMD, // a command completed on the master; payload in `complete`
    EVENT_SSYNC,         // master announced a sync point; payload in `sync`
    EVENT_SREADY,        // slave is ready; replicator should connect
};
// Payload for EVENT_COMPLETED_CMD
struct complete_cmd_evt {
    __u64 seq;             // sequence number of the completed command
    __u32 len;             // valid bytes in cmd (truncated to MAX_CMD_LEN by the probe)
    __u8 cmd[MAX_CMD_LEN]; // raw command bytes
};
// Payload for EVENT_SSYNC
struct sync_evt {
    __u64 seq;           // replication resumes from this sequence number
    char ip[MAX_IP_LEN]; // peer IP string; NOTE(review): not guaranteed NUL-terminated when full
    __s32 port;          // peer TCP port
};
// One record delivered through the perf buffer
struct replica_event {
    __u32 type; // one of enum event_type
    __u32 _pad;
    union {
        struct complete_cmd_evt complete; // valid when type == EVENT_COMPLETED_CMD
        struct sync_evt sync;             // valid when type == EVENT_SSYNC
    };
};
#endif

126
ebpf/old.c/xmake.lua Normal file
View File

@@ -0,0 +1,126 @@
add_rules("mode.release", "mode.debug")
add_rules("platform.linux.bpf")
set_license("GPL-2.0")
if xmake.version():satisfies(">=2.5.7 <=2.5.9") then
on_load(function (target)
raise("xmake(%s) has a bug preventing BPF source code compilation. Please run `xmake update -f 2.5.6` to revert to v2.5.6 version or upgrade to xmake v2.6.1 that fixed the issue.", xmake.version())
end)
end
option("system-libbpf", {showmenu = true, default = false, description = "Use system-installed libbpf"})
option("require-bpftool", {showmenu = true, default = false, description = "Require bpftool package"})
add_requires("elfutils", "zlib")
if is_plat("android") then
add_requires("ndk >=22.x <26", "argp-standalone")
set_toolchains("@ndk", {sdkver = "23"})
else
add_requires("llvm >=10.x")
set_toolchains("@llvm")
add_requires("linux-headers")
end
-- fix error: libbpf: map 'my_pid_map': unsupported map linkage static. for bpftool >= 7.2.0
-- we cannot add `"-fvisibility=hidden"` when compiling *.bpf.c
set_symbols("none")
if is_arch("arm64", "arm64-v8a") then
add_includedirs("../../vmlinux.h/include/arm64")
elseif is_arch("arm.*") then
add_includedirs("../../vmlinux.h/include/arm")
elseif is_arch("riscv32", "riscv64") then
add_includedirs("../../vmlinux.h/include/riscv")
elseif is_arch("loongarch") then
add_includedirs("../../vmlinux.h/include/loongarch")
elseif is_arch("ppc", "powerpc") then
add_includedirs("../../vmlinux.h/include/powerpc")
elseif is_arch("x86_64", "i386") then
add_includedirs("../../vmlinux.h/include/x86")
else
add_includedirs("../../vmlinux.h/include")
end
-- we can run `xmake f --require-bpftool=y` to pull bpftool from xmake-repo repository
if has_config("require-bpftool") then
add_requires("linux-tools", {configs = {bpftool = true}})
add_packages("linux-tools")
else
before_build(function (target)
os.addenv("PATH", path.join(os.scriptdir(), "..", "..", "tools"))
end)
end
-- we use the vendored libbpf sources for libbpf-bootstrap.
-- for some projects you may want to use the system-installed libbpf, so you can run `xmake f --system-libbpf=y`
if has_config("system-libbpf") then
add_requires("libbpf", {system = true})
else
target("libbpf")
set_kind("static")
set_basename("bpf")
add_files("../../libbpf/src/*.c")
add_includedirs("../../libbpf/include")
add_includedirs("../../libbpf/include/uapi", {public = true})
add_includedirs("$(buildir)", {interface = true})
add_configfiles("../../libbpf/src/(*.h)", {prefixdir = "bpf"})
add_packages("elfutils", "zlib")
if is_plat("android") then
add_defines("__user=", "__force=", "__poll_t=uint32_t")
end
end
target("minimal")
set_kind("binary")
add_files("minimal.c", "minimal.bpf.c")
add_packages("linux-headers")
if not has_config("system-libbpf") then
add_deps("libbpf")
end
target("minimal_legacy")
set_kind("binary")
add_files("minimal_legacy.c", "minimal_legacy.bpf.c")
add_packages("linux-headers")
if not has_config("system-libbpf") then
add_deps("libbpf")
end
target("bootstrap")
set_kind("binary")
add_files("bootstrap.c", "bootstrap.bpf.c")
add_packages("linux-headers")
if not has_config("system-libbpf") then
add_deps("libbpf")
end
if is_plat("android") then
add_packages("argp-standalone")
end
target("fentry")
set_kind("binary")
add_files("fentry.c", "fentry.bpf.c")
add_packages("linux-headers")
if not has_config("system-libbpf") then
add_deps("libbpf")
end
target("uprobe")
set_kind("binary")
add_files("uprobe.c", "uprobe.bpf.c")
add_packages("linux-headers")
if not has_config("system-libbpf") then
add_deps("libbpf")
end
target("kprobe")
set_kind("binary")
add_files("kprobe.c", "kprobe.bpf.c")
add_packages("linux-headers")
if not has_config("system-libbpf") then
add_deps("libbpf")
end
if is_plat("android") then
-- TODO we need fix vmlinux.h to support android
set_default(false)
end

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.8 KiB

After

Width:  |  Height:  |  Size: 7.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.8 KiB

After

Width:  |  Height:  |  Size: 7.8 KiB

View File

@@ -235,13 +235,22 @@ int kvs_array_save(iouring_ctx_t *uring, kvs_array_t *inst, const char* filename
for (int i = 0; i < count; i++) total += lens[i]; for (int i = 0; i < count; i++) total += lens[i];
task_t *t = submit_write(uring, fd, bufs, lens, count, current_off); task_t *t = submit_write(uring, fd, bufs, lens, count, current_off);
cleanup_finished_iouring_tasks();
if (!t) { close(fd); return -4; } if (!t) {
perror("task init failed");
goto clean;
}
current_off += (off_t) total; current_off += (off_t) total;
} }
clean:
while (!uring_task_complete(uring)) {
usleep(1000);
cleanup_finished_iouring_tasks();
}
close(fd); close(fd);
return 0; return 0;
} }

View File

@@ -273,7 +273,9 @@ int kvs_hash_save(iouring_ctx_t *uring, kvs_hash_t *inst, const char* filename){
for(int i = 0;i < inst->max_slots; ++ i){ for(int i = 0;i < inst->max_slots; ++ i){
for (hashnode_t *n = inst->nodes[i]; n != NULL; n = n->next) { for (hashnode_t *n = inst->nodes[i]; n != NULL; n = n->next) {
if (!n->key || n->key_len == 0) continue; if (!n->key || n->key_len == 0) continue;
if (n->value_len > 0 && !n->value) { close(fd); return -3; } if (n->value_len > 0 && !n->value) {
goto clean;
}
uint32_t klen = htonl((uint32_t)n->key_len); uint32_t klen = htonl((uint32_t)n->key_len);
@@ -308,14 +310,21 @@ int kvs_hash_save(iouring_ctx_t *uring, kvs_hash_t *inst, const char* filename){
task_t *t = submit_write(uring, fd, bufs, lens, count, current_off); task_t *t = submit_write(uring, fd, bufs, lens, count, current_off);
if(!t) {
if (!t) { close(fd); return -4; } perror("task init failed");
goto clean;
}
cleanup_finished_iouring_tasks();
current_off += (off_t) total; current_off += (off_t) total;
} }
} }
clean:
while (!uring_task_complete(uring)) {
usleep(1000);
cleanup_finished_iouring_tasks();
}
close(fd); close(fd);
return 0; return 0;
} }

View File

@@ -504,9 +504,12 @@ static int kvs_rbtree_save_node(iouring_ctx_t *uring, int fd, off_t *current_off
for (int i = 0; i < count; i++) total += lens[i]; for (int i = 0; i < count; i++) total += lens[i];
task_t *t = submit_write(uring, fd, bufs, lens, count, *current_off); task_t *t = submit_write(uring, fd, bufs, lens, count, *current_off);
cleanup_finished_iouring_tasks();
if(!t) {
if (!t) { return -4; } perror("task init failed");
return -4;
}
*current_off += (off_t) total; *current_off += (off_t) total;
@@ -527,6 +530,10 @@ int kvs_rbtree_save(iouring_ctx_t *uring, kvs_rbtree_t *inst, const char* filena
int rc = kvs_rbtree_save_node(uring, fd, &current_off, inst, inst->root); int rc = kvs_rbtree_save_node(uring, fd, &current_off, inst, inst->root);
while (!uring_task_complete(uring)) {
usleep(1000);
cleanup_finished_iouring_tasks();
}
close(fd); close(fd);
return rc; return rc;
} }

View File

@@ -8,6 +8,7 @@
#include "memory/alloc_dispatch.h" #include "memory/alloc_dispatch.h"
#include "common/config.h" #include "common/config.h"
#include "diskuring/diskuring.h" #include "diskuring/diskuring.h"
#include "replica_shm.h"
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
@@ -29,10 +30,21 @@ unsigned long long global_seq;
extern int global_oplog_fd; extern int global_oplog_fd;
replica_shm_t g_rep_shm;
__attribute__((noinline))
void __completed_cmd(const uint8_t *cmd, size_t len, unsigned long long seq){ void __completed_cmd(const uint8_t *cmd, size_t len, unsigned long long seq){
asm volatile("" ::: "memory");
} }
// __attribute__((noinline))
// void __replica_notify(uint64_t seq, uint32_t off, uint32_t len)
// {
// // 空函数即可,目的是让 uprobe 拿到参数
// asm volatile("" ::: "memory");
// }
int kvs_protocol(struct conn* conn){ int kvs_protocol(struct conn* conn){
if (!conn) return -1; if (!conn) return -1;
char *request = conn->rbuffer; char *request = conn->rbuffer;
@@ -68,10 +80,6 @@ int kvs_protocol(struct conn* conn){
int dr = resp_dispatch(&cmd, &val); int dr = resp_dispatch(&cmd, &val);
__completed_cmd(p, len, global_seq);
global_seq ++;
/* /*
* 语义建议: * 语义建议:
* - resp_dispatch() 即使返回 -1比如 unknown command / wrong argc * - resp_dispatch() 即使返回 -1比如 unknown command / wrong argc
@@ -88,8 +96,6 @@ int kvs_protocol(struct conn* conn){
} }
} else { } else {
// persist into oplog // persist into oplog
if(global_cfg.persistence == PERSIST_INCREMENTAL){
/* 执行成功:在这里保存到日志中(只记录更新类命令) */ /* 执行成功:在这里保存到日志中(只记录更新类命令) */
if (cmd.argc > 0 && cmd.argv[0].ptr) { if (cmd.argc > 0 && cmd.argv[0].ptr) {
/* 更新类命令SET/DEL/MOD/RSET/RDEL/RMOD/HSET/HDEL/HMOD/SAVE */ /* 更新类命令SET/DEL/MOD/RSET/RDEL/RMOD/HSET/HDEL/HMOD/SAVE */
@@ -110,8 +116,26 @@ int kvs_protocol(struct conn* conn){
} }
if (is_update) { if (is_update) {
if(global_cfg.persistence == PERSIST_INCREMENTAL){
kvs_oplog_append(p, len, global_oplog_fd); kvs_oplog_append(p, len, global_oplog_fd);
} }
// __completed_cmd(p, len, global_seq);
// global_seq ++;
if (global_cfg.replica_mode == REPLICA_ENABLE) {
uint32_t off = 0;
int ar = replica_shm_append(&g_rep_shm, global_seq, p, (uint32_t)len, &off);
if (ar == 0) {
// __replica_notify(global_seq, off, (uint32_t)len);
global_seq++;
} else {
// shm 满或异常:你可以选择降级(比如直接跳过复制,或阻塞/丢弃)
// 为了不影响主路径,这里先打印并跳过
fprintf(stderr, "replica_shm_append failed %d\n", ar);
}
}
} }
} }
} }
@@ -246,6 +270,7 @@ int init_config(AppConfig *cfg){
printf("IP : %s\n", cfg->ip); printf("IP : %s\n", cfg->ip);
printf("Port : %d\n", cfg->port); printf("Port : %d\n", cfg->port);
printf("Replica-Mode : %s\n", replica_to_string(cfg->replica_mode));
printf("Mode : %s\n", server_mode_to_string(cfg->mode)); printf("Mode : %s\n", server_mode_to_string(cfg->mode));
printf("|—— Master IP : %s\n", cfg->master_ip); printf("|—— Master IP : %s\n", cfg->master_ip);
printf("|—— Master Port : %d\n", cfg->master_port); printf("|—— Master Port : %d\n", cfg->master_port);
@@ -268,7 +293,24 @@ int init_config(AppConfig *cfg){
} }
void init_disk_uring(iouring_ctx_t *uring_ctx){ void init_disk_uring(iouring_ctx_t *uring_ctx){
iouring_init(uring_ctx, 2048); // iouring_init(uring_ctx, 4096);
iouring_init(uring_ctx, (1024*8));
}
void dest_disk_uring(iouring_ctx_t *uring_ctx){
iouring_shutdown(uring_ctx);
}
int kvs_replica_init(void)
{
if (global_cfg.replica_mode == REPLICA_ENABLE) {
int rc = replica_shm_open(&g_rep_shm, REPLICA_SHM_NAME, REPLICA_SHM_SIZE, /*create=*/ 1);
if (rc != 0) {
fprintf(stderr, "replica_shm_open failed rc=%d\n", rc);
return rc;
}
}
return 0;
} }
@@ -279,6 +321,7 @@ int main(int argc, char *argv[]) {
} }
global_seq = 0; global_seq = 0;
kvs_replica_init();
init_memory_pool(&global_cfg); init_memory_pool(&global_cfg);
init_data_file(&global_cfg); init_data_file(&global_cfg);
init_disk_uring(&global_uring_ctx); init_disk_uring(&global_uring_ctx);

View File

@@ -279,7 +279,6 @@ extern kvs_rbtree_t global_rbtree;
extern kvs_hash_t global_hash; extern kvs_hash_t global_hash;
#endif #endif
void __completed_cmd(const uint8_t *cmd, size_t len, unsigned long long seq);
#endif #endif

View File

@@ -336,7 +336,9 @@ void sync_wakeup() {
ssize_t n = write(wakeup_fd, &one, sizeof(one)); ssize_t n = write(wakeup_fd, &one, sizeof(one));
} }
// #include "diskuring/diskuring.h"
// extern iouring_ctx_t global_uring_ctx;
// extern void iouring_tick(iouring_ctx_t *ctx);
// 定时器 // 定时器
int handle_timer_fd_cb(int fd){ int handle_timer_fd_cb(int fd){
@@ -349,14 +351,15 @@ int handle_timer_fd_cb(int fd){
if (n < 0 && errno == EAGAIN) break; if (n < 0 && errno == EAGAIN) break;
break; break;
} }
// iouring_tick(&global_uring_ctx);
} }
int init_timer_fd(void){ int init_timer_fd(void){
int tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); int tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC);
struct itimerspec its = { struct itimerspec its = {
.it_interval = {1, 0}, // 每 1 秒 .it_interval = {0, 100 * 1000 * 1000}, // 100ms = 100,000,000 纳秒
.it_value = {1, 0}, // 1 秒后首次触发 .it_value = {0, 100 * 1000 * 1000}, // 首次 100ms 后触发
}; };
timerfd_settime(tfd, 0, &its, NULL); timerfd_settime(tfd, 0, &its, NULL);
@@ -406,12 +409,12 @@ int reactor_start(unsigned short port, msg_handler handler) {
return -1; return -1;
} }
// timer_fd = init_timer_fd(); timer_fd = init_timer_fd();
// if(timer_fd < 0){ if(timer_fd < 0){
// close(epfd); close(epfd);
// close(wakeup_fd); close(wakeup_fd);
// return -1; return -1;
// } }
int i = 0; int i = 0;

110
replica_shm.c Normal file
View File

@@ -0,0 +1,110 @@
#include "replica_shm.h"
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
#define REPLICA_SHM_MAGIC 0x52504C43u /* 'RPLC' */
#define REPLICA_SHM_VER 1
/* Round x up to the next multiple of 8 (identity when already aligned). */
static inline uint64_t align8_u64(uint64_t x)
{
    uint64_t rem = x % 8u;
    return rem ? x + (8u - rem) : x;
}
/*
 * Open (and optionally create) the replication ring shared-memory segment.
 *
 * s          : out — filled with fd, mapping size, header/data pointers.
 * name       : POSIX shm name (e.g. "/kvs_replica_shm").
 * total_size : size of the whole mapping (header + data region).
 * create     : non-zero — create the segment and size it with ftruncate.
 *
 * Returns 0 on success, a negative errno-style code on failure.
 */
int replica_shm_open(replica_shm_t *s, const char *name, size_t total_size, int create)
{
    if (!s || !name || total_size < (sizeof(replica_shm_hdr_t) + 4096)) return -EINVAL;
    memset(s, 0, sizeof(*s));

    int flags = O_RDWR;
    if (create) flags |= O_CREAT;
    int fd = shm_open(name, flags, 0666);
    if (fd < 0) return -errno;

    if (create) {
        if (ftruncate(fd, (off_t)total_size) != 0) {
            int e = -errno; close(fd); return e;
        }
    } else {
        /* Attaching to an existing segment: verify it is large enough,
         * otherwise touching the mapping would raise SIGBUS. */
        struct stat st;
        if (fstat(fd, &st) != 0) {
            int e = -errno; close(fd); return e;
        }
        if ((size_t)st.st_size < total_size) {
            close(fd);
            return -EINVAL;
        }
    }

    void *p = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) {
        int e = -errno; close(fd); return e;
    }

    s->fd = fd;
    s->map_size = total_size;
    s->hdr = (replica_shm_hdr_t *)p;
    s->data = (uint8_t *)p + sizeof(replica_shm_hdr_t);

    /* Initialize the header when creating, or when the existing segment
     * does not carry our magic (stale/foreign content). */
    if (create || s->hdr->magic != REPLICA_SHM_MAGIC) {
        memset(s->hdr, 0, sizeof(*s->hdr));
        s->hdr->magic = REPLICA_SHM_MAGIC;
        s->hdr->version = REPLICA_SHM_VER;
        /* `capacity` reserves one record header past the data end so a
         * wrap marker written right at the ring tail always fits. */
        s->hdr->capacity = total_size - sizeof(replica_shm_hdr_t) - sizeof(replica_rec_hdr_t);
        s->hdr->write_off = 0;
        s->hdr->last_seq = 0;
    }
    /* Fixed: was "capcity:%ld" — typo plus wrong specifier for uint64_t. */
    printf("capacity:%llu\n", (unsigned long long)s->hdr->capacity);
    return 0;
}
/*
 * Append one record (header + payload) to the ring. Single-producer design:
 * no lock is taken; only kvstore's main path calls this.
 *
 * Layout: replica_rec_hdr_t followed by `len` payload bytes, padded to an
 * 8-byte boundary. When the record does not fit before `capacity`, a wrap
 * marker (hdr.len == 0) is written at the current offset and writing
 * restarts at offset 0; a consumer that sees len == 0 must jump to 0.
 *
 * Returns 0 and stores the record's data-relative offset in *out_off;
 * -EINVAL on bad arguments, -ENOSPC when a single record exceeds capacity.
 *
 * NOTE(review): there is no consumer cursor, so a slow reader can be
 * overwritten after a wrap — presumably acceptable for this best-effort
 * replication path; confirm against the replicator's tolerance.
 */
int replica_shm_append(replica_shm_t *s, uint64_t seq, const void *buf, uint32_t len, uint32_t *out_off)
{
    if (!s || !s->hdr || !s->data || !buf || len == 0 || !out_off) return -EINVAL;

    uint64_t cap  = s->hdr->capacity;
    uint64_t off  = __atomic_load_n(&s->hdr->write_off, __ATOMIC_RELAXED);
    uint64_t need = align8_u64((uint64_t)sizeof(replica_rec_hdr_t) + (uint64_t)len);

    /* Reject oversized records up front, BEFORE touching the ring.
     * (Previously this was checked only after the wrap marker had been
     * written and write_off reset to 0, perturbing the ring on error.) */
    if (need > cap) return -ENOSPC;

    if (off + need > cap) {
        /* Tail too small: emit a wrap marker (len == 0) and restart at 0.
         * `capacity` excludes one record header, so the marker always fits. */
        replica_rec_hdr_t wrap = { .seq = seq, .len = 0, .flags = 0, .crc32 = 0, .reserved = 0 };
        memcpy(s->data + off, &wrap, sizeof(wrap));
        __atomic_store_n(&s->hdr->write_off, 0, __ATOMIC_RELEASE);
        off = 0;
    }

    replica_rec_hdr_t h = {0};
    h.seq = seq;
    h.len = len;
    h.flags = 0;
    h.crc32 = 0; /* 0 == no checksum */

    /* Write header + payload first, publish offsets after. */
    memcpy(s->data + off, &h, sizeof(h));
    memcpy(s->data + off + sizeof(h), buf, len);

    /* Release stores order the payload writes before the published
     * offsets, so a consumer reading write_off/last_seq sees full data. */
    uint64_t new_off = off + need;
    __atomic_store_n(&s->hdr->last_seq, seq, __ATOMIC_RELEASE);
    __atomic_store_n(&s->hdr->write_off, new_off, __ATOMIC_RELEASE);

    *out_off = (uint32_t)off;
    return 0;
}
/*
 * Copy the record header located at data-relative offset `off` into
 * *out_hdr without advancing any cursor; the caller may then read the
 * payload directly from the data region.
 * Returns 0 on success, -EINVAL on bad arguments or out-of-range offset.
 */
int replica_shm_peek(replica_shm_t *s, uint32_t off, replica_rec_hdr_t *out_hdr)
{
    if (s == NULL || out_hdr == NULL) return -EINVAL;
    if (s->hdr == NULL || s->data == NULL) return -EINVAL;

    uint64_t end = (uint64_t)off + sizeof(replica_rec_hdr_t);
    if (end > s->hdr->capacity) return -EINVAL;

    memcpy(out_hdr, s->data + off, sizeof(*out_hdr));
    return 0;
}
/*
 * Unmap and close the shared-memory segment, then zero *s so the struct
 * can be reused. Safe to call on a zeroed struct.
 * NOTE(review): the `fd > 0` (not `>= 0`) test appears deliberate — a
 * memset-zeroed struct has fd == 0 and closing it would hit stdin;
 * confirm fd 0 can never legitimately back this mapping.
 */
void replica_shm_close(replica_shm_t *s)
{
    if (s == NULL) return;

    if (s->hdr != NULL && s->map_size != 0) {
        munmap(s->hdr, s->map_size);
    }
    if (s->fd > 0) {
        close(s->fd);
    }
    memset(s, 0, sizeof(*s));
}

59
replica_shm.h Normal file
View File

@@ -0,0 +1,59 @@
#ifndef __REPLICA_SHM_H__
#define __REPLICA_SHM_H__
#include <stdint.h>
#include <stddef.h>
#ifndef REPLICA_SHM_NAME
#define REPLICA_SHM_NAME "/kvs_replica_shm"
#endif
#ifndef REPLICA_SHM_SIZE
// 256 MB total mapping by default; tune as needed.
#define REPLICA_SHM_SIZE (256u * 1024u * 1024u)
#endif
// Per-record header, placed inline in the shm data region just before
// its payload. Packed so the on-shm layout is identical across builds.
typedef struct __attribute__((packed)) {
uint64_t seq; // monotonically increasing sequence number
uint32_t len; // payload bytes; 0 marks a wrap-to-start record
uint32_t flags; // reserved: compression, record type, etc.
uint32_t crc32; // optional payload checksum; 0 means "not checked"
uint32_t reserved; // padding to keep the header size a multiple of 8
// uint8_t payload[len] follows immediately after this header
} replica_rec_hdr_t;
// Metadata block at the top of the shm segment; the data region starts
// right after it.
typedef struct __attribute__((packed)) {
uint32_t magic;
uint32_t version;
uint64_t capacity; // size of the data region in bytes
uint64_t write_off; // producer write cursor, 0..capacity-1
uint64_t last_seq; // most recent seq published by the producer
uint8_t _pad[64]; // cacheline padding
// data[capacity] follows immediately after this header
} replica_shm_hdr_t;
typedef struct {
int fd;
size_t map_size;
replica_shm_hdr_t *hdr;
uint8_t *data;
} replica_shm_t;
// kvstore (producer): create/open + mmap the segment.
int replica_shm_open(replica_shm_t *s, const char *name, size_t total_size, int create);
// kvstore (producer): append one record; returns the record's offset
// (relative to the data region) via out_off for use in notifications.
// Single-writer design: no lock needed. Returns 0 on success, <0 on
// error (bad arguments or record larger than the ring).
int replica_shm_append(replica_shm_t *s, uint64_t seq, const void *buf, uint32_t len, uint32_t *out_off);
// replicator (consumer): read a record header without moving any cursor;
// the payload can then be memcpy'd directly. off is a data-region offset.
int replica_shm_peek(replica_shm_t *s, uint32_t off, replica_rec_hdr_t *out_hdr);
// Unmap, close, and zero the handle.
void replica_shm_close(replica_shm_t *s);
extern replica_shm_t g_rep_shm;
#endif