Compare commits

..

10 Commits

Author SHA1 Message Date
6ede44bd80 性能测试 2026-03-07 07:29:32 +00:00
2e6baf0efe 落盘机制修改 2026-03-06 11:54:30 +00:00
c4e9bedd0a 性能测试工具bench 2026-03-05 08:45:23 +00:00
a190bdeea5 chainbuffer fixed 2026-03-04 07:20:09 +00:00
57720a3135 diskuring fix 2026-03-03 14:24:44 +00:00
2ec61bdf85 n*spsc uring_pool 2026-03-03 12:56:07 +00:00
ff924b033c 用户态网络缓冲区 chain-buffer 2026-03-03 08:05:43 +00:00
1iaan
c72314291a 需改ebpf程序探测内核,测试性能,验证想法,更新笔记。 2026-02-13 10:14:41 +00:00
1iaan
68bb4b3f9c uring落盘的无锁队列修改 2026-02-11 11:59:40 +00:00
1iaan
c1458a6693 bugfix: 远端测试bug 2026-02-02 07:40:01 +00:00
56 changed files with 6281 additions and 4327 deletions

2
.gitignore vendored
View File

@@ -5,6 +5,8 @@
*.a *.a
/ebpf/libbpf-bootstrap /ebpf/libbpf-bootstrap
/doc
/test-redis/results
kvstore kvstore
testcase testcase

View File

@@ -2,7 +2,7 @@
CC = gcc CC = gcc
CFLAGS = -g -DJEMALLOC_NO_DEMANGLE CFLAGS = -g -DJEMALLOC_NO_DEMANGLE
NET_SRCS = ntyco.c proactor.c reactor.c kvstore.c NET_SRCS = ntyco.c proactor.c reactor.c kvstore.c network/chainbuffer.c
KV_SRCS = kvs_array_bin.c kvs_rbtree_bin.c kvs_hash_bin.c kvs_rw_tools.c kvs_protocol_resp.c kvs_slave.c replica_shm.c KV_SRCS = kvs_array_bin.c kvs_rbtree_bin.c kvs_hash_bin.c kvs_rw_tools.c kvs_protocol_resp.c kvs_slave.c replica_shm.c
MEM_SRCS = ./memory/mempool.c ./memory/alloc_dispatch.c MEM_SRCS = ./memory/mempool.c ./memory/alloc_dispatch.c
COMMON_SRCS = ./common/config.c ./diskuring/diskuring.c COMMON_SRCS = ./common/config.c ./diskuring/diskuring.c
@@ -21,12 +21,14 @@ SUBDIR = ./NtyCo/
TEST_REDIS = ./test-redis/testcase TEST_REDIS = ./test-redis/testcase
TEST_REDIS_SRCS = ./test-redis/test.c TEST_REDIS_SRCS = ./test-redis/test.c
TEST_REDIS_BENCH = ./test-redis/bench
TEST_REDIS_BENCH_SRCS = ./test-redis/bench.c
TEST_REDIS_LDFLAGS = -lhiredis TEST_REDIS_LDFLAGS = -lhiredis
OBJS = $(SRCS:.c=.o) OBJS = $(SRCS:.c=.o)
all: $(SUBDIR) $(TARGET) $(TEST_REDIS) all: $(SUBDIR) $(TARGET) $(TEST_REDIS) $(TEST_REDIS_BENCH)
$(SUBDIR): ECHO $(SUBDIR): ECHO
make -C $@ make -C $@
@@ -40,11 +42,14 @@ $(TARGET): $(OBJS)
$(TEST_REDIS): $(TEST_REDIS_SRCS) $(TEST_REDIS): $(TEST_REDIS_SRCS)
$(CC) -g -o $@ $^ $(TEST_REDIS_LDFLAGS) $(CC) -g -o $@ $^ $(TEST_REDIS_LDFLAGS)
$(TEST_REDIS_BENCH): $(TEST_REDIS_BENCH_SRCS)
$(CC) -g -o $@ $^ $(TEST_REDIS_LDFLAGS)
%.o: %.c %.o: %.c
$(CC) $(CFLAGS) $(INC) -c $^ -g -o $@ $(CC) $(CFLAGS) $(INC) -c $^ -g -o $@
clean: clmem cldata clean: clmem cldata
rm -rf $(OBJS) $(TARGET) $(TEST_REDIS) rm -rf $(OBJS) $(TARGET) $(TEST_REDIS) $(TEST_REDIS_BENCH)
clmem: clmem:
rm -rf mem_leak/* rm -rf mem_leak/*

View File

@@ -1,29 +0,0 @@
CC = gcc
FLAGS = -I ./NtyCo/core/ -L ./NtyCo/ -lntyco -luring
TARGET = kvstore
SRCS = kvstore.c ntyco.c proactor.c kvs_array.c kvs_rbtree.c
# INC = -I ./NtyCo/core/
# LIBS = -L ./NtyCo/ -lntyco -luring
# FLAGS = -I ./NtyCo/core/ -L ./NtyCo/ -lntyco -luring
OBJS = $(SRCS:.c=.o)
TESTCASE = testcase
SUBDIR = ./NtyCo/
all: $(SUBDIR) $(TARGET) # $(TESTCASE)
$(SUBDIR): ECHO
make -C $@
ECHO:
@echo $(SUBDIR)
$(TARGET): $(OBJS)
$(CC) -o $@ $^ $(FLAGS)
clean:
rm -rf kvstore *.o

433
README.md
View File

@@ -1,20 +1,5 @@
# 9.1 Kvstore # 9.1 Kvstore
## 需求
1. ntyco需要作为kvstore的submodule,通过git clone一次下载。 **完成**
2. README需要包含编译步骤测试方案与可行性性能数据。 **完成**
3. 全量持久化保存数据集。 **BUG FIX**
4. 持久化的性能数据。 **完成**
5. 特殊字符可以解决redis的resp协议。 **完成**
6. 实现配置文件把日志级别端口ip主从模式持久化方案。 **完成**
7. 持久化落盘用io_uring加载配置文件用mmap。 **完成**
8. 主从同步的性能开启与关闭性能做到5%?。
9. 主从同步600w条,出现的coredump。 **完成**
10. 主从同步用ebpf实现。 **BUG FIX**
11. 内存池测试qps与虚拟内存物理内存。 **完成**
12. 实现一个内存泄露检测组件。 **完成**
## 环境安装与编译 ## 环境安装与编译
```shell ```shell
# xml # xml
@@ -22,18 +7,73 @@ sudo apt install libxml2 libxml2-dev
# hiredis client # hiredis client
sudo apt install -y libhiredis-dev sudo apt install -y libhiredis-dev
# bpftrace # bpftrace
sudo apt install -y bpftrace sudo apt install -y bpftrace libelf libelf-dev clang
# jemalloc
sudo apt install libjemalloc-dev
git clone git@gitlab.0voice.com:lianyiheng/9.1-kvstore.git git clone git@gitlab.0voice.com:lianyiheng/9.1-kvstore.git
cd 9.1-kvstore/ cd 9.1-kvstore/
git submodule update --init --recursive git submodule update --init --recursive
./init.sh
make make
``` ```
## 测试 ## REDIS 对比测试
### 数据口径2026-03-07 大 key 5轮复测
- 参数:`requests=1000000 pipeline=128 keyspace=1000000 value-size=256`
- 轮次:每个策略 `5` 轮,按场景剔除 `1` 个异常轮次后取 `4` 轮均值
- kvstore 源数据:`test-redis/results/hash_bench_fair_summary_20260307_062549.csv`
- redis 源数据:`test-redis/results/redis_bigkey_summary_20260307_063158.csv`
- 去异常结果:
- `test-redis/results/hash_bench_fair_trimmed_20260307_062549.csv`
- `test-redis/results/redis_bigkey_trimmed_20260307_063158.csv`
### kvstore 协议开销基线nopersist去异常后
| 策略 | set 均值QPS | set 均值us/op | get 均值QPS | get 均值us/op |
|---|---:|---:|---:|---:|
| nopersist | 150084.25 | 6.67 | 163760.25 | 6.11 |
| persist_no | 140206.50 | 7.13 | 164554.25 | 6.08 |
| persist_everysec | 133105.75 | 7.52 | 163358.25 | 6.12 |
| 相对 nopersist 的开销 | set QPS 变化 | set us/op 变化 | get QPS 变化 | get us/op 变化 |
|---|---:|---:|---:|---:|
| persist_no | -6.58% | +6.90% | +0.48% | -0.49% |
| persist_everysec | -11.31% | +12.74% | -0.25% | +0.16% |
### Redis 协议开销基线none去异常后
| 策略 | set 均值QPS | set 均值us/op | get 均值QPS | get 均值us/op |
|---|---:|---:|---:|---:|
| none | 207470.00 | 4.82 | 217642.50 | 4.59 |
| aof_no | 143399.50 | 6.98 | 214023.50 | 4.68 |
| aof_everysec | 141893.00 | 7.05 | 208713.75 | 4.79 |
| 相对 none 的开销 | set QPS 变化 | set us/op 变化 | get QPS 变化 | get us/op 变化 |
|---|---:|---:|---:|---:|
| aof_no | -30.88% | +44.81% | -1.66% | +1.96% |
| aof_everysec | -31.61% | +46.27% | -4.10% | +4.36% |
结论:本轮数据下,持久化开销仍主要体现在 `set` 上,`get` 相对更稳Redis 的 AOF 写路径开销显著高于无持久化。
## 调用开销
### gprof Flat ProfileTop 12按 self time
| 排名 | 函数 | self time % | self seconds | calls |
|---:|---|---:|---:|---:|
| 1 | `rbtree_node_get_key` | 58.34 | 0.56 | 103209091 |
| 2 | `rbtree_search` | 7.29 | 0.07 | 1874362 |
| 3 | `kvs_keycmp` | 5.73 | 0.06 | 74287566 |
| 4 | `ascii_casecmp` | 3.13 | 0.03 | 21490996 |
| 5 | `task_init` | 3.13 | 0.03 | 1397974 |
| 6 | `mp_page_create` | 3.13 | 0.03 | 23556 |
| 7 | `need` | 2.08 | 0.02 | 14042759 |
| 8 | `parse_i64` | 2.08 | 0.02 | 7029783 |
| 9 | `mp_page_alloc` | 2.08 | 0.02 | 5122487 |
| 10 | `rbtree_node_size` | 2.08 | 0.02 | 1860739 |
| 11 | `submit_write` | 2.08 | 0.02 | 1394599 |
| 12 | `rbtree_insert` | 2.08 | 0.02 | 926531 |
## 其他测试
### 测试1性能测试 ### 测试1性能测试
测试条件: 测试条件:
1. 不启用持久化。 1. 不启用持久化。
@@ -47,57 +87,6 @@ make
#### 内存分配: malloc #### 内存分配: malloc
```bash ```bash
lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 3 lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 3
Connected to 192.168.10.129:8888
BATCH (N=3000000) --> time_used=3294 ms, qps=910746
BATCH (N=3000000) --> time_used=3501 ms, qps=856898
BATCH (N=3000000) --> time_used=3457 ms, qps=867804
BATCH (N=3000000) --> time_used=3351 ms, qps=895255
BATCH (N=3000000) --> time_used=3320 ms, qps=903614
BATCH (N=3000000) --> time_used=3551 ms, qps=844832
BATCH (N=3000000) --> time_used=3354 ms, qps=894454
BATCH (N=3000000) --> time_used=3475 ms, qps=863309
BATCH (N=3000000) --> time_used=3404 ms, qps=881316
BATCH (N=3000000) --> time_used=3460 ms, qps=867052
BATCH (N=3000000) --> time_used=3392 ms, qps=884433
BATCH (N=3000000) --> time_used=3427 ms, qps=875401
BATCH (N=3000000) --> time_used=3441 ms, qps=871839
BATCH (N=3000000) --> time_used=3471 ms, qps=864304
BATCH (N=3000000) --> time_used=3354 ms, qps=894454
BATCH (N=3000000) --> time_used=3447 ms, qps=870322
BATCH (N=3000000) --> time_used=3364 ms, qps=891795
BATCH (N=3000000) --> time_used=3200 ms, qps=937500
BATCH (N=3000000) --> time_used=3159 ms, qps=949667
BATCH (N=3000000) --> time_used=3482 ms, qps=861573
BATCH (N=3000000) --> time_used=3474 ms, qps=863557
BATCH (N=3000000) --> time_used=3591 ms, qps=835421
BATCH (N=3000000) --> time_used=3466 ms, qps=865551
BATCH (N=3000000) --> time_used=3425 ms, qps=875912
BATCH (N=3000000) --> time_used=3346 ms, qps=896592
BATCH (N=3000000) --> time_used=3532 ms, qps=849377
BATCH (N=3000000) --> time_used=3471 ms, qps=864304
BATCH (N=3000000) --> time_used=3616 ms, qps=829646
BATCH (N=3000000) --> time_used=3403 ms, qps=881575
BATCH (N=3000000) --> time_used=3426 ms, qps=875656
BATCH (N=3000000) --> time_used=3493 ms, qps=858860
BATCH (N=3000000) --> time_used=3411 ms, qps=879507
BATCH (N=3000000) --> time_used=3422 ms, qps=876680
BATCH (N=3000000) --> time_used=3556 ms, qps=843644
BATCH (N=3000000) --> time_used=3285 ms, qps=913242
BATCH (N=3000000) --> time_used=3486 ms, qps=860585
BATCH (N=3000000) --> time_used=3427 ms, qps=875401
BATCH (N=3000000) --> time_used=3563 ms, qps=841987
BATCH (N=3000000) --> time_used=3304 ms, qps=907990
BATCH (N=3000000) --> time_used=3582 ms, qps=837520
BATCH (N=3000000) --> time_used=3468 ms, qps=865051
BATCH (N=3000000) --> time_used=3360 ms, qps=892857
BATCH (N=3000000) --> time_used=3426 ms, qps=875656
BATCH (N=3000000) --> time_used=3186 ms, qps=941619
BATCH (N=3000000) --> time_used=3251 ms, qps=922792
BATCH (N=3000000) --> time_used=3400 ms, qps=882352
BATCH (N=3000000) --> time_used=3446 ms, qps=870574
BATCH (N=3000000) --> time_used=3302 ms, qps=908540
BATCH (N=3000000) --> time_used=3072 ms, qps=976562
BATCH (N=3000000) --> time_used=3458 ms, qps=867553
average qps:880462 average qps:880462
ALL TESTS PASSED. ALL TESTS PASSED.
``` ```
@@ -105,57 +94,6 @@ ALL TESTS PASSED.
#### 内存分配: 自实现内存池 #### 内存分配: 自实现内存池
```bash ```bash
lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 3 lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 3
Connected to 192.168.10.129:8888
BATCH (N=3000000) --> time_used=3241 ms, qps=925640
BATCH (N=3000000) --> time_used=3047 ms, qps=984574
BATCH (N=3000000) --> time_used=3085 ms, qps=972447
BATCH (N=3000000) --> time_used=3119 ms, qps=961846
BATCH (N=3000000) --> time_used=3104 ms, qps=966494
BATCH (N=3000000) --> time_used=3163 ms, qps=948466
BATCH (N=3000000) --> time_used=3033 ms, qps=989119
BATCH (N=3000000) --> time_used=3170 ms, qps=946372
BATCH (N=3000000) --> time_used=3299 ms, qps=909366
BATCH (N=3000000) --> time_used=3272 ms, qps=916870
BATCH (N=3000000) --> time_used=3294 ms, qps=910746
BATCH (N=3000000) --> time_used=3182 ms, qps=942803
BATCH (N=3000000) --> time_used=3190 ms, qps=940438
BATCH (N=3000000) --> time_used=3493 ms, qps=858860
BATCH (N=3000000) --> time_used=3111 ms, qps=964320
BATCH (N=3000000) --> time_used=3220 ms, qps=931677
BATCH (N=3000000) --> time_used=3067 ms, qps=978154
BATCH (N=3000000) --> time_used=3345 ms, qps=896860
BATCH (N=3000000) --> time_used=3381 ms, qps=887311
BATCH (N=3000000) --> time_used=3416 ms, qps=878220
BATCH (N=3000000) --> time_used=3192 ms, qps=939849
BATCH (N=3000000) --> time_used=3085 ms, qps=972447
BATCH (N=3000000) --> time_used=3150 ms, qps=952380
BATCH (N=3000000) --> time_used=3296 ms, qps=910194
BATCH (N=3000000) --> time_used=3001 ms, qps=999666
BATCH (N=3000000) --> time_used=3143 ms, qps=954502
BATCH (N=3000000) --> time_used=3111 ms, qps=964320
BATCH (N=3000000) --> time_used=3123 ms, qps=960614
BATCH (N=3000000) --> time_used=3257 ms, qps=921093
BATCH (N=3000000) --> time_used=3037 ms, qps=987816
BATCH (N=3000000) --> time_used=3135 ms, qps=956937
BATCH (N=3000000) --> time_used=3124 ms, qps=960307
BATCH (N=3000000) --> time_used=3276 ms, qps=915750
BATCH (N=3000000) --> time_used=3058 ms, qps=981033
BATCH (N=3000000) --> time_used=3024 ms, qps=992063
BATCH (N=3000000) --> time_used=3224 ms, qps=930521
BATCH (N=3000000) --> time_used=3235 ms, qps=927357
BATCH (N=3000000) --> time_used=3334 ms, qps=899820
BATCH (N=3000000) --> time_used=3427 ms, qps=875401
BATCH (N=3000000) --> time_used=3218 ms, qps=932256
BATCH (N=3000000) --> time_used=3191 ms, qps=940144
BATCH (N=3000000) --> time_used=3179 ms, qps=943692
BATCH (N=3000000) --> time_used=3104 ms, qps=966494
BATCH (N=3000000) --> time_used=3202 ms, qps=936914
BATCH (N=3000000) --> time_used=3184 ms, qps=942211
BATCH (N=3000000) --> time_used=3000 ms, qps=1000000
BATCH (N=3000000) --> time_used=3280 ms, qps=914634
BATCH (N=3000000) --> time_used=3141 ms, qps=955109
BATCH (N=3000000) --> time_used=3198 ms, qps=938086
BATCH (N=3000000) --> time_used=3126 ms, qps=959692
average qps:942837 average qps:942837
ALL TESTS PASSED. ALL TESTS PASSED.
``` ```
@@ -163,57 +101,6 @@ ALL TESTS PASSED.
#### 内存分配jemalloc #### 内存分配jemalloc
```shell ```shell
lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 3 lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 3
Connected to 192.168.10.129:8888
BATCH (N=3000000) --> time_used=3511 ms, qps=854457
BATCH (N=3000000) --> time_used=3280 ms, qps=914634
BATCH (N=3000000) --> time_used=3603 ms, qps=832639
BATCH (N=3000000) --> time_used=3418 ms, qps=877706
BATCH (N=3000000) --> time_used=3353 ms, qps=894721
BATCH (N=3000000) --> time_used=3435 ms, qps=873362
BATCH (N=3000000) --> time_used=3250 ms, qps=923076
BATCH (N=3000000) --> time_used=3550 ms, qps=845070
BATCH (N=3000000) --> time_used=3536 ms, qps=848416
BATCH (N=3000000) --> time_used=3273 ms, qps=916590
BATCH (N=3000000) --> time_used=3224 ms, qps=930521
BATCH (N=3000000) --> time_used=3161 ms, qps=949066
BATCH (N=3000000) --> time_used=3143 ms, qps=954502
BATCH (N=3000000) --> time_used=3342 ms, qps=897666
BATCH (N=3000000) --> time_used=3410 ms, qps=879765
BATCH (N=3000000) --> time_used=3522 ms, qps=851788
BATCH (N=3000000) --> time_used=3035 ms, qps=988467
BATCH (N=3000000) --> time_used=3352 ms, qps=894988
BATCH (N=3000000) --> time_used=3226 ms, qps=929944
BATCH (N=3000000) --> time_used=3406 ms, qps=880798
BATCH (N=3000000) --> time_used=3336 ms, qps=899280
BATCH (N=3000000) --> time_used=3307 ms, qps=907166
BATCH (N=3000000) --> time_used=3171 ms, qps=946073
BATCH (N=3000000) --> time_used=3252 ms, qps=922509
BATCH (N=3000000) --> time_used=3296 ms, qps=910194
BATCH (N=3000000) --> time_used=3301 ms, qps=908815
BATCH (N=3000000) --> time_used=3403 ms, qps=881575
BATCH (N=3000000) --> time_used=3234 ms, qps=927643
BATCH (N=3000000) --> time_used=3348 ms, qps=896057
BATCH (N=3000000) --> time_used=3517 ms, qps=852999
BATCH (N=3000000) --> time_used=3354 ms, qps=894454
BATCH (N=3000000) --> time_used=3529 ms, qps=850099
BATCH (N=3000000) --> time_used=3473 ms, qps=863806
BATCH (N=3000000) --> time_used=3521 ms, qps=852030
BATCH (N=3000000) --> time_used=3370 ms, qps=890207
BATCH (N=3000000) --> time_used=3267 ms, qps=918273
BATCH (N=3000000) --> time_used=3352 ms, qps=894988
BATCH (N=3000000) --> time_used=3433 ms, qps=873871
BATCH (N=3000000) --> time_used=3374 ms, qps=889152
BATCH (N=3000000) --> time_used=3360 ms, qps=892857
BATCH (N=3000000) --> time_used=3463 ms, qps=866300
BATCH (N=3000000) --> time_used=3499 ms, qps=857387
BATCH (N=3000000) --> time_used=3294 ms, qps=910746
BATCH (N=3000000) --> time_used=3311 ms, qps=906070
BATCH (N=3000000) --> time_used=3443 ms, qps=871333
BATCH (N=3000000) --> time_used=3381 ms, qps=887311
BATCH (N=3000000) --> time_used=3422 ms, qps=876680
BATCH (N=3000000) --> time_used=3421 ms, qps=876936
BATCH (N=3000000) --> time_used=3322 ms, qps=903070
BATCH (N=3000000) --> time_used=3494 ms, qps=858614
average qps:892493 average qps:892493
ALL TESTS PASSED. ALL TESTS PASSED.
``` ```
@@ -230,17 +117,6 @@ ALL TESTS PASSED.
```shell ```shell
lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 4 lian@ubuntu:~/share/9.1-kvstore$ ./test-redis/testcase 192.168.10.129 8888 4
Connected to 192.168.10.129:8888
BATCH (N=3000000) --> time_used=3500 ms, qps=857142
BATCH (N=3000000) --> time_used=3322 ms, qps=903070
BATCH (N=3000000) --> time_used=3424 ms, qps=876168
BATCH (N=3000000) --> time_used=3483 ms, qps=861326
BATCH (N=3000000) --> time_used=3421 ms, qps=876936
BATCH (N=3000000) --> time_used=3519 ms, qps=852514
BATCH (N=3000000) --> time_used=3597 ms, qps=834028
BATCH (N=3000000) --> time_used=3504 ms, qps=856164
BATCH (N=3000000) --> time_used=3281 ms, qps=914355
BATCH (N=3000000) --> time_used=3446 ms, qps=870574
average qps:870227 average qps:870227
ALL TESTS PASSED. ALL TESTS PASSED.
``` ```
@@ -264,9 +140,9 @@ BATCH (N=9000000) --> time_used=10033 ms, qps=1794079
VIRT 208M VIRT 208M
RES 155M RES 155M
``` ```
![alt text](image11.png) ![alt text](https://disk.0voice.com/p/wl)
![alt text](image12.png) ![alt text](https://disk.0voice.com/p/wm)
![alt text](image13.png) ![alt text](https://disk.0voice.com/p/wO)
#### jemalloc #### jemalloc
```shell ```shell
@@ -285,9 +161,9 @@ BATCH (N=9000000) --> time_used=9353 ms, qps=1924516
VIRT 356M VIRT 356M
RES 119M RES 119M
``` ```
![alt text](image11.png) ![alt text](https://disk.0voice.com/p/wl)
![alt text](image22.png) ![alt text](https://disk.0voice.com/p/wP)
![alt text](image23.png) ![alt text](https://disk.0voice.com/p/wQ)
#### mypool #### mypool
```shell ```shell
@@ -306,9 +182,9 @@ BATCH (N=3000000) --> time_used=3022 ms, qps=1985440
VIRT 122M VIRT 122M
RES 71492 RES 71492
``` ```
![alt text](image31.png) ![alt text](https://disk.0voice.com/p/wR)
![alt text](image32.png) ![alt text](https://disk.0voice.com/p/wn)
![alt text](image33.png) ![alt text](https://disk.0voice.com/p/wo)
### 测试4主从同步 ### 测试4主从同步
测试条件: 测试条件:
@@ -340,21 +216,170 @@ average qps:777838
ALL TESTS PASSED. ALL TESTS PASSED.
``` ```
### 面试题 ## 项目收获
1. 为什么会实现kvstore使用场景在哪里 #### reactor网络模型用户态网络缓冲区的写法。
2. reactor, ntyco, io_uring的三种网络模型的性能差异
3. 多线程的kvstore该如何改进
4. 私有协议如何设计会更加安全可靠?
5. 协议改进以后,对已有的代码有哪些改变?
6. kv引擎实现了哪些
7. 每个kv引擎的使用场景以及性能差异
8. 测试用例如何实现并且保证代码覆盖率超过90%
9. 网络并发量如何qps如何
10. 能够跟哪些系统交互使用?
#### 特殊字符串支持的引擎层数据结构设计,支持\0作为键值存储。
### 架构设计 1. 长度前缀 + 内容的 binary-safe 字符串表示,支持 \0 作为普通字符。
![image](https://disk.0voice.com/p/py)
#### 实现RESP协议的服务端协议解析。
1. 解析流程:
1. 内核 拷贝到 用户态缓冲区
2. 用户态缓冲区 就地解析 bulk-string 为执行引擎可以看得懂的结构体
3. 执行引擎 拷贝 结构体的内容,插入到底层存储结构中
4. 循环解析直到 用户态缓冲区为空
2. 实现了 pipeline 支持:每次从 buffer 读到一个完整命令就交给应用层处理,应用层返回已消费字节数。如果 buffer 里有半包,连接层保留下次继续解析。
#### 快照异步创建。
1. 使用fork的Copy On Write机制实现的异步快照创建不会受到后续更新请求的影响
#### 实现SPSC/SPMC专门uring线程实现异步的增量、全量落盘操作。
1. 熟悉了SPSC无锁队列的实现方案。
1. 无锁、cache friendly
2. 流程:
1. 生产者组装task丢给SPSC。
2. 消费者从中取出然后执行置入destroy_queue触发eventfd。
3. 生产者释放destroy_queue。
3. 解决的问题:
1. iouring 由于 cqe 接收不及时导致的 task 丢失无法释放。
1. 通过背压解决设置inflight上限。
2. 背压后,生产者速度 > 消费者速度ringbuffer 满导致只能阻塞主线程或降低速度。
1. 通过修改SPSC为SPMC构建落盘线程组实现当task_queue满触发扩容线程组。
2. 每个落盘线程私有一个SPSC减少竞争。
3. 简易工作负载生产者线程随机选取两个uring线程选取任务少的push。
3. 读写端采用轮询方案,导致乒乓现象,性能下降。
1. MESI协议定义了缓存行的四种状态Modified表示独占且已修改Exclusive表示独占且未修改Shared表示多核共享只读Invalid表示缓存行无效。
- **关键点:** 但从S状态读没有什么开销从I状态读则需要向其他CPU申请。
- **关键点:** 从E/M状态写也没有什么开销从S状态写则需要广播invalidate并且等待ACK(50-100时钟周期)。
- 原子操作由于内存序的顺序性和可见性语义也有额外开销刷新invalidate queue、阻止指令重排
2. 短临界区:分层退避。自旋-让出CPU时间片-较长时间休眠。允许生产者在期间无争用的写入一批数据,然后统一读。
3. 更通用的情况事件驱动。用futex替代轮询。Fast Userspace Mutex。
1. **消费者:** 调用`syscall(SYS_futex, &futex_word, FUTEX_WAIT, old_val, &timeout, NULL, 0)` , 如果futex_word仍等于old_val线程进入内核等待队列真正睡眠不占用CPU。
2. **生产者:** 调用`syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0)` 唤醒一个等待线程。
4. 优化:生产者速度 > 消费者速度则写端不需要关注读指针或者极少关注读指针缓冲区大小的1/2次写入才考虑一次。
#### 基于BinLog上OffSet的主从同步(已有数据+实时数据)设计。
1. 初始化阶段:
1. master 持续将收到的更新请求+seq_id 落盘到本地 binlog 中。
2. slave 向 master 发起连接并且发送本地binlog中最大的seq_id 为 slave_seq_id。
2. 执行阶段:
1. master启动一个独立同步线程与 slave 构建连接。同步线程有两个阶段:
1. slave_seq_id < local_min_seq_idmaster通过fork创建内存快照发送且发送同时刻的local_max_seq_id。
2. slave_seq_id < local_max_seq_id通过自实现同步协议批量发送binlog的seq并且回包new_slave_seq_id。
3. slave_seq_id == local_max_seq_id线程休眠。
2. master收到新的请求的时候会通过条件变量唤醒同步线程。
#### 基于ebpf的实时数据同步设计。
**基准性能**Kvstore QPS ~90w无持久化/同步)。
ebpf对程序的影响要考虑如下方面
1. eBPF 探针的触发频率(上下文切换)
2. 数据拷贝方式
**用户态探针 (Uprobe) 的开销**
1. **逐条命令探测**。QPS大概在 25w左右。**原因**Uprobe 基于断点指令int3实现用户态 → 异常 → 内核 → eBPF → 返回用户态,高频触发会导致 CPU 流水线停顿严重。
2. **批处理探测**。QPS大概在85w左右。大幅减少了上下文切换次数平摊了中断开销。
**内核态探针 (Tracepoint/Kprobe) 的开销**
1. **Tracepoint (sys_exit_recvfrom)等**
1. 纯探测QPS ~85w。
2. 带数据拷贝 (read_user)QPS 降至 ~70w。**原因****bpf_probe_read_user** 是一个非常重的 helper开销远大于一次 memcpy。
2. **Kprobe (tcp_rcv_established)等**
1. 纯探测QPS ~86w。
2. 带数据拷贝 (read_kernel)QPS ~83w。**原因**:比**bpf_probe_read_user**轻得多。
3. 问题:此时数据位于 TCP 协议栈底层可能存在分片Fragment、乱序或非线性存储Paged SKB需要处理复杂的数据重组逻辑。
工作流程:
1. **内核态捕获**
1. eBPF 程序挂载于内核网络路径( TC 或 TCP 层),拦截流量。
2. 使用 bpf_probe_read_kernel 或 bpf_skb_load_bytes 高效提取数据载荷。
3. 通过 bpf_ringbuf_submit 将数据写入环形缓冲区。
2. **用户态转发**
1. 独立进程消费 RingBuffer。
2. 将数据暂存入本地队列,平滑突发流量,防止 RingBuffer 溢出导致的数据丢失。
3. **状态机控制**
1. **SYNC 阶段**:探测到 __ssync识别 Slave 连接信息IP/Port标记状态为“同步中”。
2. **READY 阶段**:探测到 __sready标志全量同步完成。
3. **实时转发**Agent 启动独立线程,消耗 Pending Queue将增量数据通过标准 TCP 发送给 Slave。
##### TC 与 XDP
网络栈
```
-> [ 网卡 (NIC) ]
-> [ XDP (eXpress Data Path) ] <--- xdp 处理原始帧 no skb
-> [ sk_buff 分配 ]
-> [ TC (Traffic Control) Ingress ] | tc 可操作 on skb
-> [ Netfilter (PREROUTING) ]
-> [ IP 协议栈 ] | ip_rcv -> ip_local_deliver
-> [ TCP 协议栈 ] | tcp_v4_rcv -> tcp_rcv_established -> tcp_data_queue
producer skb -> sk->sk_receive_queue
____________________________________________________________________
consumer sk->sk_receive_queue
-> [ Socket Layer ] | tcp_recvmsg 拷贝到内存
-> [ Syscall Interface ] | sys_exit_recvfrom
-> [ 用户态应用 (Kvstore) ]
```
XDP 是网卡驱动层的探测点操作系统刚刚收到数据包DMA 读入 ring bufferCRC 校验),还没有分配 sk_buff。数据形态是**裸的以太网帧**。
TC 是在 sk_buff 分配之后、IP 协议栈处理之前的探测点。数据形态是 sk_buff。
TCP协议栈是分界点
#### 内存泄露探测功能,实现热插拔。
方案1 基于bpf
1. 通过预定义宏__FILE__等封装一个内存分配宏定义向真正的分配函数传递代码位置等信息。
2. 通过bpf探测内存分配的地址、大小、文件、代码位置、函数等信息并且记录。
3. 通过bpf探测内存释放的指针信息然后释放。
4. 打印最终剩余的内存地址。
5. 缺点bpf 探测 malloc 等对性能的影响非常的大。
方案2 基于全局变量启用的代码内置探测工具,在网络层接收启用/关闭探测工具的请求。
1. 分配时打开一个文件,关闭时删除。
2. 分配时使用kv保存删除时删除k。
都对性能和内存有很大的影响,不建议长时间启用。
#### 实现支持分配可变长度内存块的内存池。
1. 熟悉了glibc 的 ptmalloc的底层操作。默认阈值约 128KB且会根据分配行为动态调整。
1. <= 默认阈值:通过 brk/sbrk 扩展连续堆,堆里维护了 chunk 结构。
2. > 默认阈值的块用 mmap 独立申请,释放时 munmap。
3. 每个 chunk 头部存大小信息(通常 8~16 字节),用户拿到的是 chunk + 头部后的地址。
4. 空闲 chunk 按大小分到多个 bintcache、fastbin、small bin、large bin 等fastbin 和 tcache 为了速度不合并相邻空闲块。
5. 缺点:
1. 分配路径有较多分支判断和链表遍历,不是严格 O(1)。
2. 小块故意不合并fastbin/tcache会导致外部碎片。
3. 长期运行内存利用率下降。
2. 内存池实现:
1. 内存池有多个桶,桶中存储固定大小的块。分配/释放均为 O(1)
2. 以 huge page 为单位向系统申请内存并切分为固定块。
3. 当页内块全部释放时整页归还系统,显著降低长期碎片。
4. 通过地址对齐 O(1) 反推出页头元信息(所属桶、空闲计数)。
5. malloc通常向上对齐桶对应的存储大小可以根据不同系统设定减少内部碎片。
6. 使用 bitmap + freelist 防 double free 且无额外查找开销。
7. 每线程独立内存池相对malloc更少的锁竞争。
8. 大块分配自动退化为 malloc 处理。
相比 ptmalloc该设计消除了外部碎片降低了系统调用次数并在多线程场景下显著提升分配性能与内存利用率。

View File

@@ -56,6 +56,7 @@ static void set_default_config(AppConfig *cfg)
cfg->mode = MODE_MASTER; cfg->mode = MODE_MASTER;
cfg->master_port = 8888; cfg->master_port = 8888;
cfg->persistence = PERSIST_NONE; cfg->persistence = PERSIST_NONE;
cfg->oplog_sync_mode = OPLOG_SYNC_NONE;
cfg->allocator = ALLOC_JEMALLOC; cfg->allocator = ALLOC_JEMALLOC;
cfg->leak_mode = MEMLEAK_DETECT_OFF; cfg->leak_mode = MEMLEAK_DETECT_OFF;
cfg->replica_mode = REPLICA_DISABLE; cfg->replica_mode = REPLICA_DISABLE;
@@ -87,6 +88,16 @@ static void parse_persistence(const char *s, PersistenceType *out)
else if (!strcasecmp(s, "none")) *out = PERSIST_NONE; else if (!strcasecmp(s, "none")) *out = PERSIST_NONE;
} }
static void parse_oplog_sync_mode(const char *s, OplogSyncMode *out)
{
if (!s || !out) return;
if (!strcasecmp(s, "every_sec") || !strcasecmp(s, "everysec")) {
*out = OPLOG_SYNC_EVERY_SEC;
} else if (!strcasecmp(s, "none")) {
*out = OPLOG_SYNC_NONE;
}
}
static void parse_allocator(const char *s, AllocatorType *out) static void parse_allocator(const char *s, AllocatorType *out)
{ {
if (!s || !out) return; if (!s || !out) return;
@@ -188,6 +199,15 @@ const char *persistence_to_string(PersistenceType p)
} }
} }
const char *oplog_sync_mode_to_string(OplogSyncMode m)
{
switch (m) {
case OPLOG_SYNC_NONE: return "none";
case OPLOG_SYNC_EVERY_SEC: return "every_sec";
default: return "unknown";
}
}
const char *allocator_to_string(AllocatorType a) const char *allocator_to_string(AllocatorType a)
{ {
switch (a) { switch (a) {
@@ -337,6 +357,15 @@ void persist_load(xmlNodePtr *root, AppConfig *out_cfg){
} }
} }
xmlNodePtr oplog_sync_node = find_child(pers, "oplog_sync");
if (oplog_sync_node) {
xmlChar *txt = xmlNodeGetContent(oplog_sync_node);
if (txt) {
parse_oplog_sync_mode((char *)txt, &out_cfg->oplog_sync_mode);
xmlFree(txt);
}
}
xmlNodePtr array_node = find_child(pers, "array"); xmlNodePtr array_node = find_child(pers, "array");
if (array_node) { if (array_node) {
xmlChar *txt = xmlNodeGetContent(array_node); xmlChar *txt = xmlNodeGetContent(array_node);
@@ -413,10 +442,9 @@ int config_load(const char *filename, AppConfig *out_cfg)
/* /*
* 用 xmlReadMemory 从内存解析。 * 用 xmlReadMemory 从内存解析。
* - "UTF-8"你原来指定了 UTF-8如果希望自动探测可以传 NULL。 * - "UTF-8":指定了 UTF-8如果希望自动探测可以传 NULL。
* - XML_PARSE_NONET禁用网络访问防 XXE/外部实体拉取) * - XML_PARSE_NONET禁用网络访问防 XXE/外部实体拉取)
* - XML_PARSE_NOBLANKS保持你原来的行为 * - XML_PARSE_NOBLANKS保持你原来的行为
* 你也可以加 XML_PARSE_NOERROR | XML_PARSE_NOWARNING 减少噪音,但调试阶段不建议。
*/ */
int parse_opts = XML_PARSE_NOBLANKS | XML_PARSE_NONET; int parse_opts = XML_PARSE_NOBLANKS | XML_PARSE_NONET;

View File

@@ -20,6 +20,11 @@ typedef enum {
PERSIST_NONE PERSIST_NONE
} PersistenceType; } PersistenceType;
typedef enum {
OPLOG_SYNC_NONE,
OPLOG_SYNC_EVERY_SEC
} OplogSyncMode;
typedef enum { typedef enum {
REPLICA_DISABLE, REPLICA_DISABLE,
REPLICA_ENABLE REPLICA_ENABLE
@@ -53,6 +58,7 @@ typedef struct {
char array_file[256]; char array_file[256];
char rbtree_file[256]; char rbtree_file[256];
char hash_file[256]; char hash_file[256];
OplogSyncMode oplog_sync_mode;
AllocatorType allocator; AllocatorType allocator;
MemLeakDetectMode leak_mode; MemLeakDetectMode leak_mode;
@@ -68,6 +74,7 @@ int config_load(const char *filename, AppConfig *out_cfg);
const char *log_level_to_string(LogLevel lvl); const char *log_level_to_string(LogLevel lvl);
const char *server_mode_to_string(ServerMode mode); const char *server_mode_to_string(ServerMode mode);
const char *persistence_to_string(PersistenceType p); const char *persistence_to_string(PersistenceType p);
const char *oplog_sync_mode_to_string(OplogSyncMode m);
const char *allocator_to_string(AllocatorType a); const char *allocator_to_string(AllocatorType a);
const char *leakage_to_string(MemLeakDetectMode a); const char *leakage_to_string(MemLeakDetectMode a);
const char *replica_to_string(ReplicaMode a); const char *replica_to_string(ReplicaMode a);

View File

@@ -1,34 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version='1.0' encoding='UTF-8'?>
<config> <config>
<server> <server>
<ip>192.168.10.129</ip> <ip>127.0.0.1</ip>
<port>8888</port> <port>8888</port>
<mode>master</mode> <!-- master / slave --> <mode>master</mode>
<!-- 仅当 mode=slave 时使用 -->
<replica>enable</replica> <replica>disable</replica>
<master> <master>
<ip>192.168.10.129</ip> <ip>192.168.220.134</ip>
<port>8888</port> <port>8888</port>
</master> </master>
</server> </server>
<log> <log>
<level>INFO</level> <!-- DEBUG / INFO / ERROR --> <level>INFO</level>
</log> </log>
<persistence> <persistence>
<type>none</type> <!-- incremental / none --> <type>incremental</type>
<dir>data</dir> <!-- 所有持久化文件所在目录 --> <dir>data/persist_no_20260307_053744_r1_a1</dir>
<wal>kvs_oplog.db</wal> <wal>kvs_oplog.db</wal>
<oplog_sync>none</oplog_sync>
<array>kvs_array.db</array> <array>kvs_array.db</array>
<rbtree>kvs_rbtree.db</rbtree> <rbtree>kvs_rbtree.db</rbtree>
<hash>kvs_hash.db</hash> <hash>kvs_hash.db</hash>
</persistence> </persistence>
<memory> <memory>
<allocator>mypool</allocator> <!-- jemalloc / malloc / mypool --> <allocator>mypool</allocator>
<leakage>disable</leakage> <!-- enable/disable--> <leakage>disable</leakage>
</memory> </memory>
</config> </config>

File diff suppressed because it is too large Load Diff

View File

@@ -1,199 +0,0 @@
#include "diskuring.h"
#include "memory/alloc_dispatch.h"
#include <poll.h>
#include <sys/eventfd.h>
/*
 * Initialize a freshly allocated task to a safe, pending state.
 *
 * Zeroes the entire struct -- not just done/res/next -- because
 * task_destroy() unconditionally inspects t->iovs/t->iovcnt, and a
 * task whose iovecs were never assigned must not carry heap garbage
 * there.  op/fd/off are expected to be filled in by the caller.
 */
void task_init(task_t *t)
{
    if (!t) return;
    memset(t, 0, sizeof(*t));
    /* Redundant after memset, kept explicit for readability:
     * the task starts not-done with no completion result. */
    t->done = 0;
    t->res = 0;
    t->next = NULL;
}
/*
 * Record a task's completion code and mark it finished.
 * res is stored before done is raised so that an observer polling
 * the done flag sees a valid result.
 */
void task_finish(task_t *t, int res)
{
    t->res = res;
    t->done = 1;
}
/*
 * Free a task and everything it owns: each copied iovec payload,
 * the iovec array itself, and finally the task object.
 */
void task_destroy(task_t *t)
{
    struct iovec *vecs = t->iovs;
    if (vecs) {
        for (int idx = 0; idx < t->iovcnt; ++idx) {
            void *payload = vecs[idx].iov_base;
            if (payload) {
                kvs_free(payload);
            }
        }
        kvs_free(vecs);
    }
    kvs_free(t);
}
/*
 * Set up the io_uring instance for this context.
 *
 * Returns 0 on success, or the negative error code from
 * io_uring_queue_init_params() on failure.
 *
 * Fixes vs. previous version:
 *  - removed the dead `if (ret != 0)` block: ret is guaranteed 0 once
 *    the `ret < 0` check has passed, and its `return -ret` would have
 *    flipped the sign of an error code anyway;
 *  - report the kernel-chosen CQ size via params.cq_entries (filled in
 *    by the kernel on return) instead of dereferencing liburing's
 *    internal ring.cq.kring_entries pointer.
 */
int iouring_init(iouring_ctx_t *ctx, unsigned entries)
{
    memset(ctx, 0, sizeof(*ctx));

    struct io_uring_params params;
    memset(&params, 0, sizeof(params));
    /* Tuning knobs intentionally left at kernel defaults; set
     * IORING_SETUP_CQSIZE / cq_entries here if a larger CQ is needed. */

    int ret = io_uring_queue_init_params(entries, &ctx->ring, &params);
    if (ret < 0) {
        fprintf(stderr, "io_uring_queue_init_params failed: %d (%s)\n",
                ret, strerror(-ret));
        return ret;
    }

    printf("Kernel CQ size: %u\n", params.cq_entries);
    return 0;
}
/*
 * Release the io_uring instance owned by this context.
 * NOTE(review): does not free tasks still chained on ctx->head --
 * callers are expected to drain/harvest first; confirm at call sites.
 */
void iouring_shutdown(iouring_ctx_t *ctx)
{
    io_uring_queue_exit(&ctx->ring);
}
/*
 * Drain every completed CQE currently visible in the ring.
 *
 * For each completion: mark its task finished with the CQE result,
 * log failed writes, and destroy the task immediately (this path is
 * single-threaded, so no deferred destroy queue is needed).  The CQ
 * head is then advanced once by the number of entries consumed.
 * Aborts on CQ overflow, since dropped completions would leak tasks.
 */
void harvest_cqes(iouring_ctx_t *ctx)
{
    struct io_uring_cqe *cqe;
    unsigned head;
    int cq_count = 0;
    // Walk all ready CQEs with for_each_cqe, draining the CQ completely.
    io_uring_for_each_cqe(&ctx->ring, head, cqe) {
        task_t *done = (task_t *)(uintptr_t)cqe->user_data;
        task_finish(done, cqe->res);
        if (cqe->res < 0) {
            fprintf(stderr, "write fail: fd=%d res=%d\n", done->fd, cqe->res);
        }
        // Destroy directly: single-threaded, no global destroy queue required.
        task_destroy(done);
        cq_count++;
    }
    if (cq_count > 0) {
        // printf("harvest cq:%d\n", cq_count);
        io_uring_cq_advance(&ctx->ring, cq_count);
    }
    // Safety net: a CQ overflow means completions were lost -- fail loudly.
    if (*ctx->ring.sq.kflags & IORING_SQ_CQ_OVERFLOW) {
        fprintf(stderr, "FATAL: CQ overflow detected!\n");
        abort();
    }
}
/*
 * Queue an asynchronous writev of `count` buffers to `fd` at offset `off`.
 *
 * Each payload is deep-copied into task-owned memory, so the caller's
 * buffers may be reused immediately.  The task is appended to the
 * ctx->head/tail pending FIFO; as many pending tasks as possible are
 * then moved into SQEs and submitted, with leftovers drained by a later
 * iouring_tick() / submit_write() call.
 *
 * Returns the new task, or NULL on invalid arguments or allocation
 * failure.  NOTE(review): the task is freed by harvest_cqes() once its
 * write completes, so the returned pointer must not be dereferenced
 * after a subsequent harvest -- confirm against call sites.
 *
 * Fixes vs. previous version:
 *  - kvs_malloc(sizeof(task_t)) result was used unchecked (NULL deref);
 *  - the drain loop dequeued a task BEFORE checking io_uring_get_sqe(),
 *    so on a full SQ the dequeued task was silently dropped and leaked;
 *    the loop now acquires the SQE first, matching iouring_tick();
 *  - io_uring_submit() failure is no longer silently ignored.
 */
task_t* submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off){
    if (!bufs || !lens || count <= 0) return NULL;

    task_t *t = (task_t *)kvs_malloc(sizeof(task_t));
    if (!t) return NULL;                     /* was missing: allocation can fail */
    task_init(t);
    t->op = TASK_WRITE;
    t->fd = fd;
    t->off = off;

    t->iovs = (struct iovec *)kvs_malloc(sizeof(struct iovec) * count);
    if (!t->iovs) {
        kvs_free(t);
        return NULL;
    }

    /* Deep-copy every payload so the task owns its data. */
    for (int i = 0; i < count; ++i) {
        size_t len = lens[i];
        void *buf = kvs_malloc(len);
        if (!buf) {
            for (int j = 0; j < i; ++j) {
                if (t->iovs[j].iov_base) kvs_free(t->iovs[j].iov_base);
            }
            kvs_free(t->iovs);
            kvs_free(t);
            return NULL;
        }
        memcpy(buf, bufs[i], len);
        t->iovs[i].iov_base = buf;
        t->iovs[i].iov_len = len;
    }
    t->iovcnt = count;

    /* Reap finished CQEs first so completed tasks free up ring space. */
    harvest_cqes(ctx);

    /* Append to the pending FIFO. */
    if (!ctx->head) {
        ctx->head = t;
        ctx->tail = t;
    } else {
        ctx->tail->next = t;
        ctx->tail = t;
    }

    /* Move as many pending tasks as possible into SQEs.  Acquire the
     * SQE *before* dequeuing so an SQ-full condition leaves the task
     * safely queued instead of dropping it. */
    int submitted = 0;
    while (ctx->head) {
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring);
        if (!sqe) {
            break;                           /* SQ full: retry on next tick */
        }

        task_t *cur = ctx->head;
        ctx->head = cur->next;
        if (!ctx->head) {
            ctx->tail = NULL;
        }
        cur->next = NULL;

        io_uring_prep_writev(sqe, cur->fd, cur->iovs, cur->iovcnt, cur->off);
        sqe->user_data = (uint64_t)(uintptr_t)cur;
        submitted++;
    }

    if (submitted > 0) {
        int ret = io_uring_submit(&ctx->ring);
        if (ret < 0) {                       /* was silently discarded */
            fprintf(stderr, "io_uring_submit failed: %d (%s)\n",
                    ret, strerror(-ret));
        }
    }
    return t;
}
/*
 * Drive the ring forward: reap any completed CQEs, then move queued
 * tasks from the pending FIFO into free SQEs and submit them.
 * Tasks that do not fit (SQ exhausted) remain queued for a later tick.
 */
void iouring_tick(iouring_ctx_t *ctx) {
    harvest_cqes(ctx);

    int pushed = 0;
    for (task_t *cur = ctx->head; cur != NULL; cur = ctx->head) {
        struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring);
        if (sqe == NULL) {
            break;                  /* SQ exhausted; retry next tick */
        }

        ctx->head = cur->next;
        if (ctx->head == NULL) {
            ctx->tail = NULL;
        }
        cur->next = NULL;

        io_uring_prep_writev(sqe, cur->fd, cur->iovs, cur->iovcnt, cur->off);
        sqe->user_data = (uint64_t)(uintptr_t)cur;
        ++pushed;
    }

    if (pushed > 0) {
        int rc = io_uring_submit(&ctx->ring);
        (void)rc;                   /* best-effort flush, as before */
    }
}

View File

@@ -3,65 +3,91 @@
#include <liburing.h> #include <liburing.h>
#include <pthread.h> #include <pthread.h>
#include <stdlib.h>
#include <stdint.h> #include <stdint.h>
#include <unistd.h> #include <stdatomic.h>
#include <errno.h>
#include <string.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>
typedef enum { TASK_READ, TASK_WRITE, TASK_FSYNC } task_op_t;
typedef enum { TASK_READ, TASK_WRITE } task_op_t; struct task;
typedef void (*task_destroy_cb_t)(struct task *t, void *arg);
typedef struct task { typedef struct task {
task_op_t op; task_op_t op;
int fd; int fd;
off_t off; off_t off;
unsigned fsync_flags;
unsigned sqe_flags;
int res; // cqe->res int res;
int done; // 0/1 _Atomic int done;
struct iovec *iovs; // iovec 数组 struct iovec *iovs;
int iovcnt; // iovec 数量 int iovcnt;
int free_iov_bases;
pthread_mutex_t m; task_destroy_cb_t on_destroy;
pthread_cond_t cv; void *on_destroy_arg;
struct task *next; struct task *next;
} task_t; } task_t;
typedef struct { typedef struct {
struct io_uring ring; _Atomic(task_t *) head;
pthread_t th;
pthread_mutex_t q_m;
pthread_cond_t q_cv;
task_t *q_head, *q_tail;
int stop;
atomic_int in_flight;
int max_in_flight;
} iouring_ctx_t;
typedef struct {
task_t *head;
pthread_mutex_t lock;
} destroy_queue_t; } destroy_queue_t;
typedef struct {
task_t **slots;
uint32_t cap;
_Atomic uint32_t head;
_Atomic uint32_t tail;
_Atomic uint32_t size;
} spsc_queue_t;
struct iouring_ctx_s;
typedef struct iouring_ctx_s iouring_ctx_t;
typedef struct {
struct io_uring ring;
pthread_t th;
int event_fd;
_Atomic int in_flight;
int max_in_flight;
int worker_id;
spsc_queue_t submit_q;
iouring_ctx_t *parent;
} iouring_worker_t;
struct iouring_ctx_s {
iouring_worker_t *workers;
int worker_nr;
unsigned entries_per_worker;
_Atomic int stop;
_Atomic uint32_t rr_next;
destroy_queue_t destroy_queue;
};
void task_init(task_t *t); void task_init(task_t *t);
void task_finish(task_t *t, int res); void task_finish(task_t *t, int res);
int task_wait(task_t *t); int task_wait(task_t *t);
void task_destroy(task_t *t); void task_destroy(task_t *t);
int iouring_init(iouring_ctx_t *ctx, unsigned entries); int iouring_init(iouring_ctx_t *ctx, unsigned entries);
void iouring_shutdown(iouring_ctx_t *ctx); void iouring_shutdown(iouring_ctx_t *ctx);
task_t* submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off); task_t *submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off);
task_t *submit_write_ref(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off,
int free_iov_bases, task_destroy_cb_t on_destroy, void *on_destroy_arg);
task_t *submit_fsync_ref(iouring_ctx_t *ctx, int fd, int worker_id, int drain,
task_destroy_cb_t on_destroy, void *on_destroy_arg);
int uring_task_complete(iouring_ctx_t *ctx); int uring_task_complete(iouring_ctx_t *ctx);
void cleanup_finished_iouring_tasks(iouring_ctx_t *ctx);
void cleanup_finished_iouring_tasks(); void iouring_profile_dump(iouring_ctx_t *ctx);
extern iouring_ctx_t global_uring_ctx; extern iouring_ctx_t global_uring_ctx;

View File

@@ -1,50 +0,0 @@
#ifndef __DISK_IOURING_H__
#define __DISK_IOURING_H__
#include <liburing.h>
#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#define BATCH_SIZE 256
typedef enum { TASK_READ, TASK_WRITE } task_op_t;
typedef struct task {
task_op_t op;
int fd;
off_t off;
int res; // cqe->res
int done; // 0/1
struct iovec *iovs; // iovec 数组
int iovcnt; // iovec 数量
struct task *next;
} task_t;
typedef struct {
struct io_uring ring;
int pending_count;
task_t *head;
task_t *tail;
} iouring_ctx_t;
void task_init(task_t *t);
void task_finish(task_t *t, int res);
void task_destroy(task_t *t);
int iouring_init(iouring_ctx_t *ctx, unsigned entries);
void iouring_shutdown(iouring_ctx_t *ctx);
task_t* submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off);
void iouring_tick(iouring_ctx_t *ctx);
extern iouring_ctx_t global_uring_ctx;
#endif

50
doc/conclusion.md Normal file
View File

@@ -0,0 +1,50 @@
# 性能定位结论(主线程)
## 1) 主要瓶颈结论
基于当前主线程采样统计,结论是:
- **主线程最大开销确实是内存申请/管理相关开销**(而不是纯 memcpy
- 开销占比大致为:
- `submit_write` 打包阶段:约 **61.3%**
- 其中 alloc 约 **66.5% of pack**(折算到主线程总开销约 **40.8%**)
- copy 约 **14.2% of pack**(折算到主线程总开销约 **8.7%**)
- 入队/通知(SPSC push + notify):约 **29.2%**
- 回收释放:约 **9.5%**
- backpressure 影响很小(loops≈0)
所以“减少申请内存次数”这个判断是对的,而且是当前最有价值的优化方向。
---
## 2) 对方案 2 / 3 的评价(结合本系统)
### 方案 2A/B 双 flag 缓冲覆写保护
- **优点**:实现直观,容易快速落地验证正确性。
- **缺点**
- 本质仍是“执行路径与拷贝路径耦合”,请求推进会受慢侧牵制;
- 不能从根本上消除 alloc 热点,只是控制覆写时序;
- 在高并发下调试成本会上升(状态机与边界条件多)。
- **结论**:适合快速试验,不是长期高性能形态。
### 方案 3ChainBuffer 所有权移交(摘链/归还)
- **优点**
- 以“指针/节点移动”替代大量 alloc+copy方向与当前瓶颈完全一致
- 对大 value、高吞吐场景收益潜力更大。
- **缺点**
- 生命周期、并发归还、异常回收都要设计清楚;
- 实现复杂度明显高于方案 2。
- **结论**:更符合长期性能目标,但需要更严格的工程设计。
---
## 3) 哪个在你当前系统里更容易实现?
如果只比较你给的这两个方案,在你现在的系统里:
- **更简单的是方案 2A/B 双 flag**
- **更值得长期投入的是方案 3所有权移交**。
建议:短期先用方案 2 验证行为边界,最终收敛到方案 3或其等价的零拷贝所有权模型

View File

@@ -0,0 +1,300 @@
# KVStore 激进版50 题参考回答(含风险等级与避坑话术)
> 对应文件:`doc/interview_questions_aggressive_50.md`
> 建议答题模板:**目标 -> 方案 -> 收益 -> 代价 -> 兜底**
---
## 1) 架构与边界1-5
### 1. 分层设计收益与复杂度
- **风险等级**:低
- **参考回答**:我把系统拆成网络层(收发)、协议层(RESP 解析)、执行层(命令分发)、持久化层(快照/oplog)和复制层(SSYNC+shm)。收益是可替换和可定位问题,比如网络和持久化互不耦合。代价是模块间接口变多,调试时需要跨层追踪。兜底是我保留了主调用链文档,排障按固定链路走。
- **避坑话术**:别说“完全解耦”,改成“低耦合、边界清晰”。
### 2. Reactor vs Proactor/协程
- **风险等级**:中
- **参考回答**:当前选 Reactor 是因为代码路径短、可控性高适合先把协议和持久化跑稳。Proactor/协程在高并发下有潜力,但会引入更复杂的调度和状态管理。我的 tradeoff 是先优化主路径和持久化,再评估网络模型切换。
- **避坑话术**不要说“Reactor 一定最快”,要说“在当前代码复杂度和目标下更合适”。
### 3. 三引擎选型边界
- **风险等级**:低
- **参考回答**Array 实现简单但查找线性适合小规模或验证场景RBTree 有序且 O(logN)适合有序访问Hash 平均 O(1),适合热点随机读写。收益是能覆盖不同 workload。代价是维护三套实现和一致性语义。
- **避坑话术**:不要泛泛说“都很快”,要给出复杂度和场景。
### 4. 主路径轻量化
- **风险等级**:中
- **参考回答**:主线程主要做解析、执行、回包,持久化提交走 io_uring worker。收益是减少 I/O 阻塞对主循环影响。代价是异步链路更复杂,需要回收队列和背压。我通过 destroy queue + wakeup 回收来控风险。
- **避坑话术**:别说“主线程无开销”,保留“主要开销下降”。
### 5. 多核扩展优先级
- **风险等级**:中
- **参考回答**我会先做连接分片再做数据分片。连接分片改动小、收益快数据分片需要引擎和一致性改造复杂度高。tradeoff 是短期吞吐提升 vs 长期架构演进,我会分两阶段做。
- **避坑话术**:不要一次性承诺“全做完”,强调迭代路线。
---
## 2) 协议与解析6-10
### 6. 半包/多包/粘包与错误恢复
- **风险等级**:低
- **参考回答**我按“可消费字节”循环解析len=0 视为半包等待下次len<0 视为协议错误。收益是能稳定处理 pipeline。代价是错误策略偏严格遇坏包直接断链。这是在安全性和可恢复性之间偏安全的选择。
- **避坑话术**:别说“能恢复所有坏包”。
### 7. binary-safe 为何用 slice
- **风险等级**:低
- **参考回答**:因为 key/value 可能包含 `\0` 和二进制字节C 字符串 API 会截断。sliceptr+len能保留完整字节语义。收益是协议正确性和通用性更强代价是代码里长度参数管理更繁琐。
- **避坑话术**:强调“正确性优先”。
### 8. 恶意请求限制
- **风险等级**:中
- **参考回答**:我在协议层设置了 bulk 上限和参数上限,同时连接层有读写缓冲上限。收益是避免单连接吃光内存。代价是极端合法请求也可能被拒绝。我会通过可配置阈值平衡安全和业务需求。
- **避坑话术**:不要说“完全防攻击”,说“降低风险面”。
### 9. inline 与 multibulk 共存
- **风险等级**:低
- **参考回答**:如果首字节不是 `*`,走 inline`*` 就走 multibulk。收益是兼容 Redis 常见输入。代价是代码路径多一支,但复杂度可控。
- **避坑话术**:不要说“全面兼容 Redis”说“覆盖当前命令集下的 RESP2 常用场景”。
### 10. 新命令最小改动
- **风险等级**:中
- **参考回答**:当前是命令枚举 + dispatch switch新增命令主要改解析映射和执行分支。收益是性能直观、可调试。代价是规模大时 switch 变长。我会在命令数继续增长时再抽象为表驱动。
- **避坑话术**:别提前宣称“完全插件化”。
---
## 3) ChainBuffer 与“零拷贝”11-15
### 11. 零拷贝边界
- **风险等级**:高
- **参考回答**:接收阶段是 readv 直写到链式缓冲,减少中转拷贝;发送阶段用 sendmsg 聚合。当前仍存在线性化/打包场景下的 memcpy所以更准确是“低拷贝主路径 + 局部拷贝回退”。tradeoff 是实现复杂度可控,同时先拿到主要收益。
- **避坑话术**:主动说“不是全链路绝对 0 拷贝”。
### 12. linearize 触发条件
- **风险等级**:中
- **参考回答**:当上层需要连续字节视图而数据跨 chunk 时触发 linearize。收益是简化了解析和执行接口。代价是触发时会有额外 memcpy影响尾延迟。我会通过 chunk 策略和请求分布降低触发频率。
- **避坑话术**:不要说“从不 linearize”。
### 13. free_list 选型
- **风险等级**:低
- **参考回答**free_list 复用 chunk减少频繁 malloc/free 抖动。收益是吞吐更稳、分配开销更低。代价是会保留一定空闲内存。通过 free_limit 做上限控制,平衡性能和内存占用。
- **避坑话术**:别说“零内存浪费”。
### 14. 大小 Key 混跑碎片问题
- **风险等级**:中
- **参考回答**:我用固定 chunk + 链式扩展,先保证正确和稳定。大 Key 会占多 chunk可能带来局部碎片收益是实现简单、行为可预测。后续可按 key size 分层 chunk 策略优化。
- **避坑话术**:承认“有优化空间”更可信。
### 15. 部分发送一致性
- **风险等级**:低
- **参考回答**sendmsg 返回 n 后按字节 drain 缓冲,未发完保留在 wbufEPOLLOUT 下次继续。收益是协议字节流不会错位。代价是状态维护复杂一点,但这是非阻塞发送必须的成本。
- **避坑话术**:别说“一次 send 一定发完”。
---
## 4) 所有权移交与并发安全16-20
### 16. 所有权定义
- **风险等级**:高
- **参考回答**:我的定义是“谁持有谁负责释放”,通过 detach/release 把片段生命周期显式化。收益是未来可做执行-落盘低拷贝协作。代价是引用计数和回收时序更复杂。当前实现已提供接口,进一步全链路化是下一阶段。
- **避坑话术**:明确“已实现接口与局部机制,非全链路完成”。
### 17. UAF / double free 防护
- **风险等级**:中
- **参考回答**:通过 refcnt + release 路径统一回收,避免多方直接 free。收益是降低并发回收风险。代价是需要严格约束调用顺序和异常路径。排查时我会优先核对 detach/release 对称性。
- **避坑话术**:别说“绝对不会 UAF”说“通过机制显著降低风险”。
### 18. 连接关闭后的回收
- **风险等级**:中
- **参考回答**:连接关闭只释放连接侧对象,已移交片段由持有方继续走 release。收益是不会因连接断开丢失回收路径。代价是关闭逻辑需要区分“本地拥有”和“已移交”。
- **避坑话术**:避免一句话“直接全清掉”。
### 19. 谁申请谁释放
- **风险等级**:低
- **参考回答**:这是为了避免跨线程释放带来的生命周期混乱。收益是定位泄漏和崩溃更直观。代价是需要一个回收协作队列,代码量会增加。总体是工程上更稳的 tradeoff。
- **避坑话术**:强调“可维护性收益”。
### 20. 真正 0 拷贝落盘约束
- **风险等级**:高
- **参考回答**:要做到真正 0 拷贝,需要落盘线程直接消费网络片段并延迟释放,约束包括引用计数、回收屏障和慢盘背压。收益是减少打包复制。代价是并发复杂度显著上升,错误成本更高。
- **避坑话术**:别承诺“短期必做完”,给阶段目标。
---
## 5) io_uring 流水线21-25
### 21. n*SPSC vs MPSC
- **风险等级**:中
- **参考回答**n*SPSC 的好处是队列局部性好、竞争少延迟更稳代价是负载均衡和线程参数调优复杂。MPSC 实现集中但热点明显。当前我更看重稳定性,所以先用 n*SPSC。
- **避坑话术**:不要否定 MPSC强调场景选择。
### 22. 80% 水位
- **风险等级**:中
- **参考回答**80% 是保护阈值,目的是给 CQE 回收和突发流量留缓冲。收益是降低 overflow 风险。代价是峰值吞吐可能不是最大。这个值不是常量真理,我会按压测再调。
- **避坑话术**:不要说“最优解”,说“保守工程值”。
### 23. 背压策略
- **风险等级**:中
- **参考回答**:队列满时会回收已完成任务并让出 CPU必要时限流连接。收益是系统不崩。代价是尾延迟上升。tradeoff 是优先保活再保时延。
- **避坑话术**:别说“无损无延迟”。
### 24. 批量回收
- **风险等级**:低
- **参考回答**:批量偷取 destroy queue 能减少锁/原子开销和碎片回收抖动。收益是主循环更平滑。代价是回收时点有批处理延迟。这个延迟通常可接受。
- **避坑话术**:承认“不是实时逐条释放”。
### 25. 优雅停机
- **风险等级**:中
- **参考回答**:停机流程是 stop 标志 -> 唤醒 worker -> drain 队列 -> join 线程 -> 清理剩余任务。收益是尽量保证数据路径收敛完成。代价是停机时间变长。这里优先数据完整性。
- **避坑话术**:别说“秒停”,说“可控停机”。
---
## 6) 快照 + 增量一致性26-30
### 26. 一致性窗口
- **风险等级**:高
- **参考回答**:快照代表某个时间点状态,后续变更由 oplog 补齐。收益是写入不中断。代价是天然存在窗口,需要恢复阶段串联重放。这个设计是吞吐优先下的最终一致方案。
- **避坑话术**:不要声称“强一致快照”。
### 27. 先快照后回放
- **风险等级**:低
- **参考回答**快照是基线oplog 是增量。先加载基线再套增量语义清晰。收益是恢复逻辑简单。代价是快照越旧回放越长。可通过更频繁快照平衡。
- **避坑话术**:说明“恢复时间与快照频率 tradeoff”。
### 28. 命令字节流日志
- **风险等级**:中
- **参考回答**:命令字节流实现快、复用现有解析器,开发成本低。代价是回放时要重新解析执行,效率不如结构化 WAL。当前选择是快速迭代优先后续可演进为结构化记录。
- **避坑话术**:承认“不是最终形态”。
### 29. 坏日志策略
- **风险等级**:高
- **参考回答**:当前更偏“发现坏日志就失败退出”,先保证正确性边界。收益是避免静默数据错误。代价是可用性下降。生产化可加校验和分段容错,再做可恢复降级。
- **避坑话术**:不要说“自动修复所有坏日志”。
### 30. 状态边界证明
- **风险等级**:高
- **参考回答**:我会把边界定义成“快照点 + 之后成功持久化的增量”。收益是定义清楚,便于测试。代价是需要故障注入验证不同崩溃点。我的做法是先给出可验证边界,而不是口头保证。
- **避坑话术**:避免绝对词“完全一致”。
---
## 7) 主从与状态机31-35
### 31. 状态机描述
- **风险等级**:中
- **参考回答**slave 发 SSYNC 申请全量master 异步产出并发送 snapshotslave 落盘后回 SREADY再进入增量流。收益是流程清晰且可扩展。代价是阶段切换复杂需要严格顺序控制。
- **避坑话术**:别忽略“阶段切换失败处理”。
### 32. 快照期间新写入
- **风险等级**:高
- **参考回答**:思路是用 seq 把快照基线和增量窗口连接起来。收益是减少丢数据窗口。代价是实现复杂,需要状态机和顺序保证。当前代码有通道和钩子,协同还在持续完善。
- **避坑话术**:一定说“机制已搭,协同在完善中”。
### 33. wrap marker 可行性
- **风险等级**:中
- **参考回答**:尾部放不下时写 wrap marker 并回到 0消费者识别后跳转。收益是实现简单。代价是需要严格保证读写顺序和越界检查。适合单写者模型。
- **避坑话术**:别说“多写者也没问题”。
### 34. seq 断档处理
- **风险等级**:高
- **参考回答**:断档处理要看目标:一致性优先就阻塞/重建,可用性优先可短暂跳过并告警。我的偏好是关键链路一致性优先。代价是短期可用性受影响。
- **避坑话术**:避免“所有场景一个策略”。
### 35. eBPF 控制面定位
- **风险等级**:中
- **参考回答**:我把 eBPF 钩子放控制面,用于状态切换和转发触发,不直接承载主数据处理。收益是降低主路径侵入。代价是又多一层调试复杂度。
- **避坑话术**别说“eBPF 加速了所有链路”。
---
## 8) 内存分配器与内存池36-40
### 36. 分级依据
- **风险等级**:低
- **参考回答**8-512B 覆盖高频小对象,按 8B 对齐分桶,页内 free-list 管理。收益是分配释放路径短。代价是可能有内部碎片。属于延迟优先的选择。
- **避坑话术**:不要回避碎片问题。
### 37. 大对象回退 malloc
- **风险等级**:低
- **参考回答**:大对象生命周期和尺寸离散,放进小块池会放大管理成本。回退系统分配更简单。收益是降低池复杂度。代价是大块分配性能受系统 allocator 影响。
- **避坑话术**别说“mempool 覆盖全部场景”。
### 38. 空闲页回收
- **风险等级**:中
- **参考回答**:我保留一定空闲页做缓存,超过阈值再释放,平衡复用和内存占用。收益是减少抖动。代价是峰值内存更高。阈值可按业务压测调。
- **避坑话术**:强调“可调参数”。
### 39. 三 allocator 差异来源
- **风险等级**:中
- **参考回答**差异主要来自小对象分配路径、锁竞争、缓存复用和写盘模式耦合。mypool 在当前 workload 下命中率高,所以吞吐更好。代价是通用性不一定优于成熟 allocator。
- **避坑话术**别说“mypool 永远最优”。
### 40. 内存异常排查优先级
- **风险等级**:低
- **参考回答**:先看 RSS、分配失败、队列积压、in-flight、回收延迟再看对象分布。收益是先定位“增长来源”再定位“泄漏点”。代价是需要加一些可观测指标。
- **避坑话术**:别一上来就说“就是泄漏”。
---
## 9) 压测方法学41-45
### 41. 复现性保证
- **风险等级**:低
- **参考回答**:固定参数、固定轮次、分组合独立目录、每轮重启,保存原始日志与 CSV。收益是可回放可核对。代价是执行时间更长。
- **避坑话术**:不要只报“最好一轮数据”。
### 42. bench vs testcase 取舍
- **风险等级**:中
- **参考回答**bench 更灵活,但随机 key 与 RSET 插入语义会碰撞testcase mode=4 更稳定适合长期对比。tradeoff 是灵活性 vs 稳定性。
- **避坑话术**:主动解释语义差异,避免被质疑口径不一致。
### 43. QPS 口径
- **风险等级**:中
- **参考回答**:我用“完成请求数/耗时”,并明确是否包含错误。收益是可比较。代价是不同工具默认口径不同,需要文档化。
- **避坑话术**:别把不同口径结果直接横比。
### 44. 避免虚高
- **风险等级**:低
- **参考回答**:做预热、控制 keyspace、分离预填充和正式压测、多轮取均值并看波动。收益是结果更稳。代价是实验时间增加。
- **避坑话术**:不要只强调峰值,不报波动。
### 45. 波动解释
- **风险等级**:中
- **参考回答**波动来自调度、缓存、I/O 抖动和后台回收。我的做法是看均值+CV不用单轮结论。tradeoff 是展示更真实但数字不一定最亮眼。
- **避坑话术**:别把异常轮次“隐去不提”。
---
## 10) 真实性与演进46-50
### 46. 零拷贝表述边界
- **风险等级**:高
- **参考回答**:我会说“接收/发送主路径低拷贝,局部场景有拷贝回退”,而不是“全链路绝对零拷贝”。收益是表达亮点同时不失真。代价是表述没那么激进。
- **避坑话术**:先给边界,再讲优化方向。
### 47. 状态机协同实现到哪
- **风险等级**:高
- **参考回答**SSYNC/SREADY、快照传输、增量通道与 seq 机制已打通;全量-增量无缝协同仍在完善。这样说能体现工程进度和规划。
- **避坑话术**:别说“完全解决协同一致性”。
### 48. 两周落地计划
- **风险等级**:中
- **参考回答**:第 1 周先补一致性边界和故障注入测试;第 2 周做所有权移交闭环和指标监控。收益是先补正确性再补性能。代价是短期新功能延后。
- **避坑话术**:给里程碑,不要空谈“能做完”。
### 49. 最大技术债
- **风险等级**:中
- **参考回答**最大的债是“协同一致性语义和容错测试还不够系统化”。我优先做了主链路可运行和性能基线。tradeoff 是先可用再完善严谨性。
- **避坑话术**:承认技术债并给修复计划,面试官反而更认可。
### 50. 三人小组下一版目标
- **风险等级**:低
- **参考回答**:目标按优先级:一致性文档+故障矩阵、可观测性、性能回归门禁。验收指标包括恢复正确率、复制延迟、QPS/CV 基线。tradeoff 是减少“新功能数量”,换“上线可信度”。
- **避坑话术**:不要只提“再提 2 倍 QPS”。
---
## 快速复习(高风险题清单)
- **高风险题**11, 16, 20, 26, 29, 30, 32, 34, 46, 47
- **统一策略**:先讲“已实现边界”,再讲“预案与下一步”,避免绝对化承诺。

View File

@@ -0,0 +1,76 @@
# KVStore 激进版本:面试高频追问 50 题(含 Tradeoff 视角)
## 回答建议(统一框架)
- 建议每题按「目标 -> 方案 -> 收益 -> 代价 -> 兜底」回答。
- 收益常见维度:吞吐、时延、稳定性、可维护性、可扩展性。
- 代价常见维度:实现复杂度、内存开销、排障成本、一致性窗口、开发周期。
## 1) 架构与边界1-5
1. 你把系统拆成了哪些层(网络/协议/执行/持久化/复制)?这种分层带来的收益与额外复杂度分别是什么?
2. 为什么默认选择 Reactor 而不是 Proactor/协程网络模型?三者在吞吐、延迟、开发复杂度上的 tradeoff 是什么?
3. Array/RBTree/Hash 三种引擎的选型边界是什么?在不同数据分布下的性能收益与维护成本如何权衡?
4. 你说“主路径轻量化”,具体删掉了哪些主线程工作?对 CPU 利用率与代码复杂度的 tradeoff 是什么?
5. 如果要做多核扩展,你会优先做连接分片还是数据分片?两者在扩展性与一致性上的代价分别是什么?
## 2) 协议与解析6-10
6. RESP 解析如何处理半包/多包/粘包?选择“严格报错”还是“尽量恢复”的 tradeoff 是什么?
7. binary-safe 为什么必须用 sliceptr+len而不是 C 字符串?这对性能和代码可读性有什么影响?
8. 你如何限制恶意请求(超大 bulk、超多参数)?安全性提升与正常流量误伤之间如何平衡?
9. inline 与 multibulk 共存时如何处理优先级?兼容性收益与实现复杂度代价分别是什么?
10. 新增命令时如何保证最小改动?抽象过度与快速迭代之间的 tradeoff 怎么拿捏?
## 3) ChainBuffer 与“零拷贝”追问11-15
11. readv 直写后用户态还会发生哪些拷贝?你如何定义“零拷贝”和“低拷贝”的边界?
12. chain_buffer_linearize 何时触发?减少复杂解析逻辑与引入额外 memcpy 的 tradeoff 是什么?
13. 为什么要做 chunk free_list复用内存带来的性能收益与内存峰值风险如何平衡
14. 大 Key64KB与小 Key 混跑时chunk 策略如何避免碎片化?吞吐和内存效率谁优先?
15. sendmsg 聚合发送遇到部分发送时如何保证一致性?更高吞吐与更复杂状态管理如何权衡?
## 4) 所有权移交与并发安全16-20
16. “执行-落盘共享缓冲片段”如何定义所有权?减少拷贝收益与生命周期复杂度代价怎么评估?
17. detach/release 如何避免 UAF/double free你牺牲了哪些性能换取内存安全
18. 连接提前关闭时已移交数据如何回收?优先保证数据安全还是优先快速释放资源?
19. 为什么采用“谁申请谁释放”?跨线程释放带来的便利与隐患如何取舍?
20. 若做真正 0 拷贝落盘,你会接受哪些新增约束(引用计数/锁/回收时序)?
## 5) io_uring 持久化流水线21-25
21. n*SPSC 相比单队列 MPSC 的收益和代价分别是什么?你为什么在当前阶段选 n*SPSC
22. in-flight 上限为何设为 CQ 的 80%?保守水位与峰值吞吐之间如何权衡?
23. 队列满时背压策略是什么?“保护系统稳定”与“牺牲尾延迟”之间如何取舍?
24. destroy_queue 用批量偷取+集中释放的动机是什么?释放抖动与主循环平滑性如何平衡?
25. shutdown 阶段如何保证无悬挂任务?优雅退出时间与数据完整性哪个优先?
## 6) 快照 + 增量日志一致性26-30
26. SAVE 与 oplog append 并发时一致性窗口怎么定义?强一致与吞吐的 tradeoff 是什么?
27. 为什么恢复顺序是“先快照后 oplog”恢复速度与恢复正确性之间如何权衡
28. oplog 记录命令字节流而不是逻辑变更,优势和成本分别是什么?
29. replay 遇到坏日志时你会“失败退出”还是“跳过继续”?可用性与正确性怎么选?
30. 你如何说明恢复后状态边界?文档化成本与实现灵活性的 tradeoff 是什么?
## 7) 主从同步与状态机31-35
31. 请描述 SSYNC -> Snapshot -> SREADY -> Incremental 状态机,并说明每步的收益与风险。
32. 快照传输期间新写入如何不丢?延迟增大与一致性增强之间怎么平衡?
33. 共享内存 ring wrap marker 方案为什么可行?简单实现与健壮性之间的代价是什么?
34. seq 不连续时你为何选择阻塞/跳过/重建?各策略的可用性与数据风险如何比较?
35. eBPF uprobe 放在控制面而非数据面的考量是什么?观测能力与运行时开销如何权衡?
## 8) 内存分配器与内存池36-40
36. mempool 分级8-512B的依据是什么固定桶命中率与碎片风险如何平衡
37. 大对象回退 malloc 的原因是什么?统一路径与分层路径在复杂度上怎么取舍?
38. mempool 如何处理空闲页回收?低延迟复用与低内存占用的 tradeoff 怎么设定?
39. malloc/jemalloc/mypool 在你的 workload 下差异来自哪里?泛化能力与场景优化如何平衡?
40. 线上内存峰值异常时先看哪些指标?指标全面性与观测成本如何取舍?
## 9) 压测方法学与结果可信度41-45
41. 你如何保证 benchmark 可复现?“实验真实度”与“执行成本”之间怎么平衡?
42. 为什么有些场景用 bench.c有些改用 testcase mode=4语义准确与工具统一如何取舍
43. QPS 统计口径怎么定义?是否包含失败请求?可比性与直观性之间如何平衡?
44. 如何避免预热不足/缓存命中导致虚高?测试严谨性与测试周期如何平衡?
45. 如何解释 round 间波动CV追求峰值还是追求稳定性的 tradeoff 是什么?
## 10) 真实性、边界与演进46-50
46. 简历里“零拷贝”哪些已落地,哪些是低拷贝/预案?为什么这样表述?
47. 简历里“协同状态机”当前实现到哪一步?工程现实与对外表达如何平衡?
48. 若给你 2 周把预案落地,你的里程碑怎么排?短期收益与长期架构如何取舍?
49. 当前最大技术债是什么?为什么没有先修?业务推进与技术治理如何平衡?
50. 若带 3 人继续做,下版目标与验收指标是什么?功能扩展与稳定性建设如何排序?

25
doc/resume.tex Normal file
View File

@@ -0,0 +1,25 @@
\section{项目经历}
\ResumeItem[KVStore 高性能 KV 存储系统RESP 兼容 + 异步持久化 + 主从同步)]
{\textbf{KVStore} 高性能 KV 存储系统RESP 兼容 + 异步持久化 + 主从同步)}
[个人项目|独立开发]
[2025.03 --- 2026.03]
\begin{itemize}
\item 基于 \textbf{C + Linux} 从零实现单机 KV 服务:支持 \textbf{RESP2 协议解析}、pipeline 与 binary-safe支持 \texttt{\textbackslash 0}),并统一接入 \textbf{Array / RBTree / Hash} 三类存储引擎。
\item 实现链式网络缓冲 \textbf{ChainBuffer}:通过 \textbf{readv/sendmsg} 构建分段零拷贝收发路径,支持大 Key 与半包/多包场景;提供 \textbf{detach/release} 所有权接口,支持后续执行-落盘共享同一数据片段。
\item 实现 \textbf{io\_uring + n*SPSC} 异步落盘模型主线程执行后写入增量日志worker 线程批量提交 \textbf{writev};通过 in-flight 控制、队列背压、destroy-queue 回收,避免 CQ 溢出与内存失控。
\item 设计并落地“\textbf{快照 + 增量日志}”恢复链路:支持启动加载快照并回放 oplog支持 \textbf{SSYNC $\rightarrow$ 快照传输 $\rightarrow$ SREADY $\rightarrow$ 增量同步} 的主从接管流程,并预留 eBPF uprobe 协同点。
\item 实现可插拔内存分配策略(\textbf{malloc/jemalloc/mypool}):自研 mempool 采用 8--512B 分级桶 + 页级 free-list在 allocator$\times$persistence 复测中,\textbf{mypool} 吞吐最佳none: 924878 QPSincremental: 747101 QPS
\item 构建 hiredis 功能/压测工具链(\texttt{testcase + bench + 自动化脚本}),沉淀多轮可复现实验口径;将关键优化量化为可交付结果(如 ChainBuffer 改造后写路径 QPS 提升约 \textbf{27\%})。
\end{itemize}
\ResumeItem[EncryptSql 基于 PostgreSQL 的透明加密查询与运算框架]
{\textbf{EncryptSQL} 基于 PostgreSQL 的透明加密查询与运算框架}
[学校横向|部分代码开发]
[2024.09 --- 2025.09]
\begin{itemize}
\item 在客户端侧对 \textbf{libpq} 进行改造,实现 \textbf{SQL 解析后重写}:将原生表达式/运算符节点替换为密态版本函数/算子调用,尽量保证业务侧无侵入接入。
\item 基于 \textbf{PostgreSQL 扩展机制}(自定义函数/算子等)接入密态运算:支持常见算术计算(加/减/乘/除)与部分聚合能力,并与查询执行流程集成。
\item 设计并实现基于工厂模式的 \textbf{KMS 接口层}:在 \texttt{encryptsql} 组件中统一密钥获取与管理流程,完成 \textbf{LocalKMS}\textbf{Huawei KMS API} 适配,支持外部 KMS 平滑替换。
\item 面向高安全计算场景,引入 \textbf{TEE} 承载关键运算链路,在安全性与性能开销之间做工程化平衡。
\end{itemize}

25
doc/resume_aggressive.tex Normal file
View File

@@ -0,0 +1,25 @@
\section{项目经历}
\ResumeItem[KVStore 高性能 KV 存储系统(激进表达版)]
{\textbf{KVStore} 高性能 KV 存储系统(零拷贝接收 + 异步持久化 + 主从协同状态机)}
[个人项目|独立开发]
[2025.03 --- 2026.03]
\begin{itemize}
\item 基于 \textbf{C + Linux} 实现 RESP 兼容 KV 存储内核,支持 \textbf{binary-safe}、pipeline 与多引擎统一分发Array/RBTree/Hash形成协议层到执行层的一体化数据通路。
\item 围绕大 Key 场景实现 \textbf{ChainBuffer 分段零拷贝接收}:采用 \textbf{readv/sendmsg} 与链式 chunk 组织,支持超大请求分段处理;按线上保护阈值将单请求上限收敛至 \textbf{65535} 字节级别。
\item 实现并演进 \textbf{所有权移交} 机制:主线程仅负责命令边界识别与执行,落盘线程复用网络缓冲片段进行持久化,减少主路径内存申请/对象拼装开销。
\item 搭建 \textbf{io\_uring + n*SPSC} 持久化流水线worker 批量提交 writev主线程异步回收完成任务结合 in-flight 背压与 destroy-queue稳定处理慢盘与高并发写入抖动。
\item 设计“\textbf{快照 + 增量日志 + 实时复制}”协同方案:通过 \textbf{SSYNC $\rightarrow$ Snapshot $\rightarrow$ SREADY $\rightarrow$ Incremental} 状态机衔接全量与增量,保障复制窗口内的可恢复性与顺序一致性。
\item 构建多维压测体系(功能正确性/吞吐/波动):在 allocator$\times$persistence 复测中,\textbf{mypool} 取得最佳吞吐none: 924878 QPSincremental: 747101 QPS并将优化效果沉淀为工程基线。
\end{itemize}
\ResumeItem[EncryptSql 基于 PostgreSQL 的透明加密查询与运算框架]
{\textbf{EncryptSQL} 基于 PostgreSQL 的透明加密查询与运算框架}
[学校横向|部分代码开发]
[2024.09 --- 2025.09]
\begin{itemize}
\item 在客户端侧改造 \textbf{libpq} 并实现 \textbf{SQL AST 重写}:将明文表达式自动替换为密态函数/算子调用,降低业务系统改造成本。
\item 基于 \textbf{PostgreSQL 扩展机制}接入密态算子,支持加/减/乘/除与部分聚合能力,形成可落地的“密文存储 + 密态计算”执行路径。
\item 设计并实现 \textbf{KMS 工厂接口框架},完成 \textbf{LocalKMS/Huawei KMS} 适配,支持多云与私有化 KMS 的低成本切换。
\item 在高敏感计算场景引入 \textbf{TEE},对关键流程进行可信执行与边界隔离,平衡安全目标与查询性能。
\end{itemize}

View File

@@ -0,0 +1,25 @@
\section{项目经历}
\ResumeItem[KVStore 高性能 KV 存储系统(保守版)]
{\textbf{KVStore} 高性能 KV 存储系统RESP 兼容 + 异步持久化 + 主从同步)}
[个人项目|独立开发]
[2025.03 --- 2026.03]
\begin{itemize}
\item 基于 \textbf{C + Linux} 实现单机 KV 服务,支持 \textbf{RESP2} 解析、pipeline 与 binary-safe支持 \texttt{\textbackslash 0})键值处理。
\item 统一命令分发层,接入 \textbf{Array / RBTree / Hash} 三种引擎,实现 SET/GET/DEL 及对应 R*/H* 命令族。
\item 实现 \textbf{ChainBuffer} 分段网络缓冲:接收侧使用 \textbf{readv} 直写,发送侧使用 \textbf{sendmsg} 聚合发送,并通过 linearize 处理跨分段解析场景。
\item 实现 \textbf{io\_uring + n*SPSC} 异步增量日志写入,包含 in-flight 限流、背压与完成队列回收,降低主线程阻塞。
\item 实现“\textbf{快照 + 增量日志}”恢复路径:支持 SAVE 快照、oplog 回放;支持 \textbf{SSYNC/SREADY} 启动同步与共享内存增量通道。
\item 构建 hiredis 功能/性能测试工具链;在 2026-03-04 的 allocator$\times$persistence 复测中,\textbf{mypool} 组合吞吐最佳none: 924878 QPSincremental: 747101 QPS
\end{itemize}
\ResumeItem[EncryptSql 基于 PostgreSQL 的透明加密查询与运算框架]
{\textbf{EncryptSQL} 基于 PostgreSQL 的透明加密查询与运算框架}
[学校横向|部分代码开发]
[2024.09 --- 2025.09]
\begin{itemize}
\item 在客户端侧对 \textbf{libpq} 进行改造,实现 \textbf{SQL 解析后重写}:将原生表达式/运算符节点替换为密态函数/算子调用,尽量保证业务侧无侵入接入。
\item 基于 \textbf{PostgreSQL 扩展机制}(自定义函数/算子等)接入密态运算:支持常见算术计算(加/减/乘/除)与部分聚合能力,并与执行流程集成。
\item 设计并实现基于工厂模式的 \textbf{KMS 接口层}:在 \texttt{encryptsql} 中统一密钥获取与管理流程,完成 \textbf{LocalKMS}\textbf{Huawei KMS API} 适配。
\item 面向高安全计算场景,引入 \textbf{TEE} 承载关键运算链路,在安全性与性能开销之间做工程化平衡。
\end{itemize}

View File

@@ -10,14 +10,21 @@ extern char global_rbtree_file[256];
extern char global_hash_file[256]; extern char global_hash_file[256];
int kvs_create_snapshot(iouring_ctx_t *uring, const char* array_file, const char* rbtree_file, const char* hash_file); int kvs_create_snapshot(iouring_ctx_t *uring, const char* array_file, const char* rbtree_file, const char* hash_file);
int kvs_create_snapshot_async(const char *ip, int port); int kvs_create_snapshot_async_1(iouring_ctx_t *uring, const char* array_file, const char* rbtree_file, const char* hash_file);
int kvs_create_snapshot_async_2(const char *ip, int port);
extern int global_oplog_fd; extern int global_oplog_fd;
int init_cmd_log(const char *file, int *logfd); int init_cmd_log(const char *file, int *logfd);
int destroy_cmd_log(int logfd); int destroy_cmd_log(int logfd);
enum {
KVS_OPLOG_BUF_NOT_FULL = 0,
KVS_OPLOG_BUF_FULL = 1
};
int kvs_oplog_buffer_append(const uint8_t *cmd, size_t len, int logfd);
int kvs_oplog_flush(int logfd, int force);
int kvs_oplog_append(const uint8_t *cmd, size_t len, int logfd); int kvs_oplog_append(const uint8_t *cmd, size_t len, int logfd);
int kvs_replay_log(int logfd); int kvs_replay_log(int logfd);
int ksv_clear_log(int logfd); int ksv_clear_log(int logfd);

View File

@@ -3,58 +3,557 @@
#include "memory/alloc_dispatch.h" #include "memory/alloc_dispatch.h"
#include "kvs_protocol_resp.h" #include "kvs_protocol_resp.h"
#include "diskuring/diskuring.h" #include "diskuring/diskuring.h"
#include "common/config.h"
#include <arpa/inet.h> #include <arpa/inet.h>
#include <fcntl.h> #include <fcntl.h>
#include <pthread.h>
#include <string.h>
#include <time.h>
#include <unistd.h> #include <unistd.h>
int global_oplog_fd = -1; int global_oplog_fd = -1;
static off_t g_log_off = -1; static off_t g_log_off = -1;
extern AppConfig global_cfg;
/* Size of one buffered oplog page (64 KiB). */
#define KVS_OPLOG_PAGE_SIZE (64u * 1024u)
/* One in-memory oplog page: records are appended into data[], and the
 * whole page is later submitted as a single write at file offset `off`. */
typedef struct oplog_buf {
    struct oplog_buf *next;            /* intrusive singly-linked list link */
    off_t off;                         /* file offset this page is written at */
    size_t used;                       /* bytes of data[] currently filled */
    uint8_t data[KVS_OPLOG_PAGE_SIZE]; /* page payload */
} oplog_buf_t;
/* Idle pool of reusable pages (all lists below protected by g_oplog_mu). */
static oplog_buf_t *g_oplog_idle_head = NULL;
static oplog_buf_t *g_oplog_idle_tail = NULL;
/* Pages sealed and waiting to be submitted for writing. */
static oplog_buf_t *g_oplog_ready_head = NULL;
static oplog_buf_t *g_oplog_ready_tail = NULL;
/* Page currently being appended to; NULL when no page is open. */
static oplog_buf_t *g_oplog_cur = NULL;
/* Guards all oplog buffering state and the sync-thread fields below. */
static pthread_mutex_t g_oplog_mu = PTHREAD_MUTEX_INITIALIZER;
/* Signalled when new data is buffered or the sync thread must stop. */
static pthread_cond_t g_sync_cv = PTHREAD_COND_INITIALIZER;
static pthread_t g_sync_th;            /* background sync thread handle */
static int g_sync_started = 0;         /* 1 while g_sync_th is running */
static int g_sync_stop = 0;            /* ask the sync thread to exit */
static int g_sync_logfd = -1;          /* fd the sync thread flushes/fsyncs */
static uint64_t g_sync_gen = 0;        /* bumped on every buffered append */
static uint64_t g_sync_synced_gen = 0; /* last generation made durable */
/* Caller must hold g_oplog_mu: advance the dirty generation counter and
 * wake the background sync thread so it can flush the new data. */
static inline void oplog_mark_dirty_locked(void) {
    g_sync_gen += 1;
    pthread_cond_signal(&g_sync_cv);
}
/* Shared rendezvous for a batch of per-worker fsync tasks: completion
 * callbacks decrement `pending` and signal `cv` when it reaches zero. */
typedef struct {
    pthread_mutex_t mu;
    pthread_cond_t cv;  /* signalled when pending drops to 0 */
    int pending;        /* fsync tasks still in flight */
    int submit_failed;  /* set to 1 if any fsync submission failed */
    int fsync_err;      /* first negative completion result seen, else 0 */
} oplog_fsync_wait_t;
/* Per-task callback argument: points back at the shared waiter. */
typedef struct {
    oplog_fsync_wait_t *waiter;
} oplog_fsync_arg_t;
/* Fill *ts with an absolute CLOCK_REALTIME deadline `ms` milliseconds from
 * now, keeping tv_nsec normalized to [0, 1e9). NULL ts is a no-op. */
static void make_timeout_ms(struct timespec *ts, long ms) {
    if (ts == NULL) {
        return;
    }
    clock_gettime(CLOCK_REALTIME, ts);
    long add_sec = ms / 1000;
    long add_nsec = (ms % 1000) * 1000000L;
    ts->tv_sec += add_sec;
    ts->tv_nsec += add_nsec;
    if (ts->tv_nsec >= 1000000000L) {
        ts->tv_nsec -= 1000000000L;
        ts->tv_sec += 1;
    }
}
/* Completion callback for one per-worker fsync task: records the first
 * error seen, decrements the pending count, and wakes the waiter when the
 * whole batch has completed. */
static void oplog_fsync_done(task_t *t, void *arg) {
    oplog_fsync_arg_t *ctx = (oplog_fsync_arg_t *)arg;
    if (ctx == NULL || ctx->waiter == NULL) {
        return;
    }
    oplog_fsync_wait_t *wait = ctx->waiter;
    pthread_mutex_lock(&wait->mu);
    /* keep only the first fsync error */
    if (t != NULL && t->res < 0 && wait->fsync_err == 0) {
        wait->fsync_err = t->res;
    }
    if (wait->pending > 0) {
        wait->pending--;
    }
    if (wait->pending == 0) {
        pthread_cond_signal(&wait->cv);
    }
    pthread_mutex_unlock(&wait->mu);
}
/*
 * Issue one fsync (with drain) for `fd` on every io_uring worker and block
 * until all of them complete.
 *
 * A stack-allocated waiter `w` counts in-flight fsync tasks; each task's
 * completion callback (oplog_fsync_done) decrements w.pending and records
 * the first error.  While waiting, finished uring tasks are reaped
 * periodically so completions can actually be delivered.
 *
 * Returns 0 when every worker fsynced successfully, -1 on invalid
 * arguments, allocation failure, submit failure, or any fsync error.
 */
static int kvs_oplog_fsync_all_workers(int fd) {
    int i;
    int n;
    int rc = 0;
    oplog_fsync_wait_t w;
    oplog_fsync_arg_t *args = NULL;
    if (fd < 0 || !global_uring_ctx.workers || global_uring_ctx.worker_nr <= 0) {
        return -1;
    }
    memset(&w, 0, sizeof(w));
    pthread_mutex_init(&w.mu, NULL);
    pthread_cond_init(&w.cv, NULL);
    n = global_uring_ctx.worker_nr;
    /* one callback argument per worker; all point back at the shared waiter */
    args = (oplog_fsync_arg_t *)kvs_malloc(sizeof(oplog_fsync_arg_t) * (size_t)n);
    if (!args) {
        pthread_cond_destroy(&w.cv);
        pthread_mutex_destroy(&w.mu);
        return -1;
    }
    for (i = 0; i < n; i++) {
        task_t *t;
        args[i].waiter = &w;
        /* count the task as pending BEFORE submitting so a fast completion
         * cannot signal an empty waiter */
        pthread_mutex_lock(&w.mu);
        w.pending++;
        pthread_mutex_unlock(&w.mu);
        t = submit_fsync_ref(&global_uring_ctx, fd, i, 1, oplog_fsync_done, &args[i]);
        if (!t) {
            /* undo the optimistic pending++ for this worker */
            pthread_mutex_lock(&w.mu);
            w.pending--;
            w.submit_failed = 1;
            if (w.pending == 0) {
                pthread_cond_signal(&w.cv);
            }
            pthread_mutex_unlock(&w.mu);
            rc = -1;
            break;
        }
    }
    pthread_mutex_lock(&w.mu);
    while (w.pending > 0) {
        struct timespec ts;
        make_timeout_ms(&ts, 10);
        (void)pthread_cond_timedwait(&w.cv, &w.mu, &ts);
        if (w.pending > 0) {
            /* drop the lock and reap completed uring tasks so the fsync
             * completion callbacks get a chance to run */
            pthread_mutex_unlock(&w.mu);
            cleanup_finished_iouring_tasks(&global_uring_ctx);
            pthread_mutex_lock(&w.mu);
        }
    }
    if (w.fsync_err < 0 || w.submit_failed) {
        rc = -1;
    }
    pthread_mutex_unlock(&w.mu);
    /* final reap: all callbacks have fired, so &w and args[] are no longer
     * referenced by any in-flight task past this point */
    cleanup_finished_iouring_tasks(&global_uring_ctx);
    kvs_free(args);
    pthread_cond_destroy(&w.cv);
    pthread_mutex_destroy(&w.mu);
    return rc;
}
/* Append `buf` at the tail of the intrusive (head, tail) list.
 * Silently ignores NULL arguments. */
static void oplog_push_tail(oplog_buf_t **head, oplog_buf_t **tail, oplog_buf_t *buf) {
    if (head == NULL || tail == NULL || buf == NULL) {
        return;
    }
    buf->next = NULL;
    if (*tail != NULL) {
        (*tail)->next = buf;
        *tail = buf;
    } else {
        /* empty list: the node becomes both head and tail */
        *head = buf;
        *tail = buf;
    }
}
/* Prepend `buf` at the head of the intrusive (head, tail) list.
 * Silently ignores NULL arguments. */
static void oplog_push_front(oplog_buf_t **head, oplog_buf_t **tail, oplog_buf_t *buf) {
    if (head == NULL || tail == NULL || buf == NULL) {
        return;
    }
    if (*head != NULL) {
        /* non-empty list: link in front, tail stays as-is */
        buf->next = *head;
        *head = buf;
    } else {
        buf->next = NULL;
        *head = buf;
        *tail = buf;
    }
}
/* Detach and return the first node of the intrusive (head, tail) list,
 * or NULL when the list is empty or the arguments are NULL. */
static oplog_buf_t *oplog_pop_head(oplog_buf_t **head, oplog_buf_t **tail) {
    if (head == NULL || tail == NULL) {
        return NULL;
    }
    oplog_buf_t *node = *head;
    if (node == NULL) {
        return NULL;
    }
    *head = node->next;
    if (*head == NULL) {
        *tail = NULL; /* list just became empty */
    }
    node->next = NULL;
    return node;
}
/* Allocate a fresh, empty oplog page; returns NULL on allocation failure. */
static oplog_buf_t *oplog_alloc_buf(void) {
    oplog_buf_t *fresh = (oplog_buf_t *)kvs_malloc(sizeof(oplog_buf_t));
    if (fresh == NULL) {
        return NULL;
    }
    fresh->next = NULL;
    fresh->off = 0;
    fresh->used = 0;
    return fresh;
}
static oplog_buf_t *oplog_borrow_buf(void) {
oplog_buf_t *buf = oplog_pop_head(&g_oplog_idle_head, &g_oplog_idle_tail);
if (buf) {
buf->off = 0;
buf->used = 0;
return buf;
}
return oplog_alloc_buf();
}
/* Free every node of the intrusive (head, tail) list, leaving it empty. */
static void oplog_free_list(oplog_buf_t **head, oplog_buf_t **tail) {
    if (head == NULL || tail == NULL) {
        return;
    }
    for (oplog_buf_t *node = oplog_pop_head(head, tail); node != NULL;
         node = oplog_pop_head(head, tail)) {
        kvs_free(node);
    }
}
/* Release every oplog page: the currently open page plus both pools.
 * Caller is expected to hold g_oplog_mu (this touches shared state). */
static void oplog_pool_release_all(void) {
    if (g_oplog_cur != NULL) {
        kvs_free(g_oplog_cur);
        g_oplog_cur = NULL;
    }
    oplog_free_list(&g_oplog_idle_head, &g_oplog_idle_tail);
    oplog_free_list(&g_oplog_ready_head, &g_oplog_ready_tail);
}
/* Write-completion callback: the page's data has been handed to the kernel,
 * so reset it and return it to the idle pool for reuse. */
static void oplog_recycle_done(task_t *t, void *arg) {
    (void)t;
    oplog_buf_t *done = (oplog_buf_t *)arg;
    if (done == NULL) {
        return;
    }
    pthread_mutex_lock(&g_oplog_mu);
    done->used = 0;
    done->off = 0;
    oplog_push_tail(&g_oplog_idle_head, &g_oplog_idle_tail, done);
    pthread_mutex_unlock(&g_oplog_mu);
}
/* Submit one sealed page as an async write at its recorded offset; the
 * page is recycled by oplog_recycle_done when the write completes.
 * Empty pages go straight back to the idle pool. Returns 0 on success. */
static int kvs_oplog_submit_ready_buf(oplog_buf_t *buf, int logfd) {
    if (buf == NULL || logfd < 0) {
        return -1;
    }
    if (buf->used == 0) {
        /* nothing to write: hand the page straight back to the idle pool */
        oplog_push_tail(&g_oplog_idle_head, &g_oplog_idle_tail, buf);
        return 0;
    }
    void *iobase[1] = { (void *)buf->data };
    size_t iolen[1] = { buf->used };
    task_t *task = submit_write_ref(&global_uring_ctx, logfd, iobase, iolen, 1,
                                    buf->off, 0, oplog_recycle_done, buf);
    return (task != NULL) ? 0 : -1;
}
/* Submit all ready pages for writing; with `force`, also seal the
 * partially filled current page first. Caller must hold g_oplog_mu.
 * On a submit failure the page is pushed back to the front of the ready
 * list so it will be retried, and -1 is returned. */
static int kvs_oplog_flush_internal(int logfd, int force) {
    if (logfd < 0) {
        return -1;
    }
    if (force && g_oplog_cur != NULL && g_oplog_cur->used > 0) {
        oplog_push_tail(&g_oplog_ready_head, &g_oplog_ready_tail, g_oplog_cur);
        g_oplog_cur = NULL;
    }
    for (;;) {
        oplog_buf_t *page = oplog_pop_head(&g_oplog_ready_head, &g_oplog_ready_tail);
        if (page == NULL) {
            break;
        }
        if (kvs_oplog_submit_ready_buf(page, logfd) < 0) {
            /* put the page back so it is retried on the next flush */
            oplog_push_front(&g_oplog_ready_head, &g_oplog_ready_tail, page);
            return -1;
        }
    }
    return 0;
}
/* Fill *ts with an absolute CLOCK_REALTIME deadline one second from now.
 * NULL ts is a no-op. */
static void make_timeout_1s(struct timespec *ts) {
    if (ts == NULL) {
        return;
    }
    clock_gettime(CLOCK_REALTIME, ts);
    ts->tv_sec = ts->tv_sec + 1;
}
/*
 * Background thread body for OPLOG_SYNC_EVERY_SEC mode: roughly once per
 * second (or earlier, when woken by oplog_mark_dirty_locked) it flushes
 * buffered oplog pages, fsyncs the log on every uring worker, and then
 * advances g_sync_synced_gen to the generation it just made durable.
 * Exits when g_sync_stop is set.
 */
static void *oplog_sync_main(void *arg) {
    (void)arg;
    while (1) {
        uint64_t target_gen = 0;
        int fd = -1;
        int flush_ok = 0;
        struct timespec ts;
        make_timeout_1s(&ts);
        pthread_mutex_lock(&g_oplog_mu);
        /* wait up to ~1s for dirty data or a stop request */
        (void)pthread_cond_timedwait(&g_sync_cv, &g_oplog_mu, &ts);
        if (g_sync_stop) {
            pthread_mutex_unlock(&g_oplog_mu);
            break;
        }
        /* skip: not in every-second mode, log closed, or already synced */
        if (global_cfg.oplog_sync_mode != OPLOG_SYNC_EVERY_SEC ||
        g_sync_logfd < 0 || g_sync_synced_gen >= g_sync_gen) {
            pthread_mutex_unlock(&g_oplog_mu);
            continue;
        }
        /* snapshot the target generation before dropping the mutex */
        target_gen = g_sync_gen;
        fd = g_sync_logfd;
        /* flush while holding the mutex; fsync after releasing it */
        flush_ok = (kvs_oplog_flush_internal(fd, 1) == 0);
        pthread_mutex_unlock(&g_oplog_mu);
        if (!flush_ok) {
            continue;
        }
        if (kvs_oplog_fsync_all_workers(fd) == 0) {
            pthread_mutex_lock(&g_oplog_mu);
            /* only move forward; a concurrent re-init may have reset it */
            if (g_sync_synced_gen < target_gen) {
                g_sync_synced_gen = target_gen;
            }
            pthread_mutex_unlock(&g_oplog_mu);
        }
    }
    return NULL;
}
/*
 * Stop and join the background sync thread.  Caller MUST hold g_oplog_mu;
 * the mutex is temporarily released around pthread_join so the thread can
 * acquire it and observe g_sync_stop, then re-acquired before returning.
 * No-op if the thread was never started.
 */
static void oplog_sync_thread_stop_locked(void) {
    int need_join = g_sync_started;
    if (!need_join) {
        return;
    }
    g_sync_stop = 1;
    pthread_cond_broadcast(&g_sync_cv);
    /* drop the lock so oplog_sync_main can make progress and exit */
    pthread_mutex_unlock(&g_oplog_mu);
    pthread_join(g_sync_th, NULL);
    pthread_mutex_lock(&g_oplog_mu);
    g_sync_started = 0;
    g_sync_logfd = -1;
}
/* Start the background sync thread for `logfd`. Caller must hold
 * g_oplog_mu. Returns 0 on success (or if already running), -1 when
 * pthread_create fails. */
static int oplog_sync_thread_start_locked(int logfd) {
    if (g_sync_started) {
        return 0; /* already running */
    }
    g_sync_stop = 0;
    g_sync_logfd = logfd;
    int rc = pthread_create(&g_sync_th, NULL, oplog_sync_main, NULL);
    if (rc != 0) {
        g_sync_logfd = -1;
        return -1;
    }
    g_sync_started = 1;
    return 0;
}
/*
 * Write one oplog record (4-byte big-endian length prefix + payload)
 * straight to the async uring write path, bypassing the page buffer.
 * Caller must hold g_oplog_mu: this reads and advances g_log_off and
 * signals the sync thread via oplog_mark_dirty_locked().
 *
 * NOTE(review): bufs[0] points at the stack-local `nlen` and bufs[1] at
 * caller memory; this is only safe if submit_write() copies the iovec
 * payloads before returning — TODO confirm submit_write's contract.
 * NOTE(review): on submit failure g_log_off has already been advanced,
 * which leaves a hole at `myoff` in the log; verify this is intended.
 */
static int kvs_oplog_append_direct(const uint8_t *cmd, size_t len, int logfd) {
    uint32_t nlen;
    void *bufs[2];
    size_t lens[2];
    size_t total;
    off_t myoff;
    task_t *t;
    /* length prefix is stored big-endian on disk */
    nlen = htonl((uint32_t)len);
    bufs[0] = (void *)&nlen;
    lens[0] = sizeof(nlen);
    bufs[1] = (void *)cmd;
    lens[1] = len;
    total = sizeof(nlen) + len;
    /* reserve the record's offset range in the log */
    myoff = g_log_off;
    g_log_off += (off_t)total;
    t = submit_write(&global_uring_ctx, logfd, bufs, lens, 2, myoff);
    if (!t) {
        return -1;
    }
    oplog_mark_dirty_locked();
    return 0;
}
int init_cmd_log(const char *file, int *logfd){ int init_cmd_log(const char *file, int *logfd){
int rc = 0;
if(!file) return -1; if(!file) return -1;
int fd = open(file, O_RDWR | O_CREAT , 0644); int fd = open(file, O_RDWR | O_CREAT , 0644);
if(fd < 0) return -2; if(fd < 0) return -2;
off_t off = lseek(fd, 0, SEEK_END);
if (off < 0) {
close(fd);
return -2;
}
g_log_off = lseek(fd, 0, SEEK_END); pthread_mutex_lock(&g_oplog_mu);
oplog_sync_thread_stop_locked();
g_log_off = off;
g_sync_gen = 0;
g_sync_synced_gen = 0;
g_sync_logfd = fd;
oplog_pool_release_all();
if (global_cfg.oplog_sync_mode == OPLOG_SYNC_EVERY_SEC) {
rc = oplog_sync_thread_start_locked(fd);
}
pthread_mutex_unlock(&g_oplog_mu);
if (rc != 0) {
close(fd);
return -3;
}
*logfd = fd; *logfd = fd;
return 0; return 0;
} }
int destroy_cmd_log(int logfd){ int destroy_cmd_log(int logfd){
fsync(logfd); if (logfd < 0) {
return -1;
}
pthread_mutex_lock(&g_oplog_mu);
oplog_sync_thread_stop_locked();
if (kvs_oplog_flush_internal(logfd, 1) < 0) {
pthread_mutex_unlock(&g_oplog_mu);
return -2;
}
pthread_mutex_unlock(&g_oplog_mu);
if (kvs_oplog_fsync_all_workers(logfd) < 0) {
return -3;
}
cleanup_finished_iouring_tasks(&global_uring_ctx);
close(logfd); close(logfd);
pthread_mutex_lock(&g_oplog_mu);
oplog_pool_release_all();
g_log_off = -1;
g_sync_gen = 0;
g_sync_synced_gen = 0;
g_sync_logfd = -1;
pthread_mutex_unlock(&g_oplog_mu);
global_oplog_fd = -1; global_oplog_fd = -1;
return 0; return 0;
} }
/*
 * Buffer one oplog record (4-byte big-endian length prefix + payload) into
 * the current 64 KiB page.  Records larger than a page bypass buffering
 * and go straight to the direct uring write path.
 *
 * Returns KVS_OPLOG_BUF_FULL when at least one page was sealed and moved
 * to the ready list (caller should trigger a flush), KVS_OPLOG_BUF_NOT_FULL
 * otherwise, and a negative value on error.
 */
int kvs_oplog_buffer_append(const uint8_t *cmd, size_t len, int logfd){
    if (logfd < 0 || !cmd || len == 0) return -1;
    if (len > UINT32_MAX) return -2;
    pthread_mutex_lock(&g_oplog_mu);
    if (g_log_off < 0) {
        /* log has not been initialized via init_cmd_log */
        pthread_mutex_unlock(&g_oplog_mu);
        return -3;
    }
    {
        size_t need = sizeof(uint32_t) + len;
        int became_full = 0;
        uint32_t nlen = htonl((uint32_t)len);
        /* oversized record: cannot fit in any page, write it directly */
        if (need > KVS_OPLOG_PAGE_SIZE) {
            int rc = kvs_oplog_append_direct(cmd, len, logfd);
            pthread_mutex_unlock(&g_oplog_mu);
            return (rc == 0) ? KVS_OPLOG_BUF_NOT_FULL : -4;
        }
        /* lazily open a page anchored at the current log offset */
        if (!g_oplog_cur) {
            g_oplog_cur = oplog_borrow_buf();
            if (!g_oplog_cur) {
                pthread_mutex_unlock(&g_oplog_mu);
                return -4;
            }
            g_oplog_cur->off = g_log_off;
        }
        /* record does not fit: seal the current page, open a fresh one */
        if (g_oplog_cur->used + need > KVS_OPLOG_PAGE_SIZE) {
            if (g_oplog_cur->used > 0) {
                oplog_push_tail(&g_oplog_ready_head, &g_oplog_ready_tail, g_oplog_cur);
                became_full = 1;
                g_oplog_cur = NULL;
            }
            g_oplog_cur = oplog_borrow_buf();
            if (!g_oplog_cur) {
                pthread_mutex_unlock(&g_oplog_mu);
                return -4;
            }
            g_oplog_cur->off = g_log_off;
        }
        /* copy length prefix then payload, advance the logical offset */
        memcpy(g_oplog_cur->data + g_oplog_cur->used, &nlen, sizeof(nlen));
        g_oplog_cur->used += sizeof(nlen);
        memcpy(g_oplog_cur->data + g_oplog_cur->used, cmd, len);
        g_oplog_cur->used += len;
        g_log_off += (off_t)need;
        oplog_mark_dirty_locked();
        /* page exactly full: seal it immediately */
        if (g_oplog_cur->used == KVS_OPLOG_PAGE_SIZE) {
            oplog_push_tail(&g_oplog_ready_head, &g_oplog_ready_tail, g_oplog_cur);
            g_oplog_cur = NULL;
            became_full = 1;
        }
        pthread_mutex_unlock(&g_oplog_mu);
        return became_full ? KVS_OPLOG_BUF_FULL : KVS_OPLOG_BUF_NOT_FULL;
    }
}
/*
 * Public flush entry point: serialize access with the oplog mutex and
 * delegate to kvs_oplog_flush_internal().
 * Returns 0 on success, -1 if the internal flush reported an error.
 */
int kvs_oplog_flush(int logfd, int force) {
    pthread_mutex_lock(&g_oplog_mu);
    const int result = kvs_oplog_flush_internal(logfd, force);
    pthread_mutex_unlock(&g_oplog_mu);
    return (result < 0) ? -1 : 0;
}
int kvs_oplog_append(const uint8_t *cmd, size_t len, int logfd){ int kvs_oplog_append(const uint8_t *cmd, size_t len, int logfd){
if (logfd < 0 || !cmd || len == 0) return -1; if (logfd < 0 || !cmd || len == 0) return -1;
if (len > UINT32_MAX) return -2; if (len > UINT32_MAX) return -2;
pthread_mutex_lock(&g_oplog_mu);
if (g_log_off < 0) {
uint32_t nlen = htonl((uint32_t)len); pthread_mutex_unlock(&g_oplog_mu);
return -3;
void *bufs[2];
size_t lens[2];
bufs[0] = (void *)&nlen;
lens[0] = sizeof(nlen);
bufs[1] = (void *)cmd;
lens[1] = len;
size_t total = sizeof(nlen) + len;
off_t myoff = g_log_off;
g_log_off += (off_t)total;
task_t *t = submit_write(&global_uring_ctx, logfd, bufs, lens, 2, myoff);
if (!t) {
return -4;
} }
if (kvs_oplog_flush_internal(logfd, 1) < 0) {
pthread_mutex_unlock(&g_oplog_mu);
return -4;
}
if (kvs_oplog_append_direct(cmd, len, logfd) < 0) {
pthread_mutex_unlock(&g_oplog_mu);
return -4;
}
pthread_mutex_unlock(&g_oplog_mu);
return 0; return 0;
} }
@@ -69,8 +568,8 @@ int kvs_replay_log(int logfd){
uint32_t nlen = 0; uint32_t nlen = 0;
int hr = read_full(logfd, &nlen, sizeof(nlen)); int hr = read_full(logfd, &nlen, sizeof(nlen));
if (hr == 0) break; /* EOF正常结束 */ if (hr == 0) break;
if (hr < 0) { return -2; } /* 半截头 */ if (hr < 0) { return -2; }
uint32_t len = ntohl(nlen); uint32_t len = ntohl(nlen);
if (len == 0) { return -3; } if (len == 0) { return -3; }
@@ -79,24 +578,21 @@ int kvs_replay_log(int logfd){
if (!cmd_bytes ) { return -5; } if (!cmd_bytes ) { return -5; }
int pr = read_full(logfd, cmd_bytes, len); int pr = read_full(logfd, cmd_bytes, len);
if (pr <= 0) { /* 半截 payload */ if (pr <= 0) {
kvs_free(cmd_bytes ); kvs_free(cmd_bytes );
return -6; return -6;
} }
/* -------- RESP parse -------- */
resp_cmd_t cmd; resp_cmd_t cmd;
memset(&cmd, 0, sizeof(cmd)); memset(&cmd, 0, sizeof(cmd));
int clen = resp_parse_one_cmd(cmd_bytes, (int)len, &cmd); int clen = resp_parse_one_cmd(cmd_bytes, (int)len, &cmd);
if (clen <= 0 || clen != (int)len) { if (clen <= 0 || clen != (int)len) {
/* clen==0: need more data但日志记录必须是一条完整命令所以视为坏日志 */
kvs_free(cmd_bytes); kvs_free(cmd_bytes);
return -7; return -7;
} }
/* -------- execute -------- */
resp_value_t outvalue; resp_value_t outvalue;
memset(&outvalue, 0, sizeof(outvalue)); memset(&outvalue, 0, sizeof(outvalue));
@@ -106,14 +602,15 @@ int kvs_replay_log(int logfd){
return -8; return -8;
} }
/* 注意:
* outv 可能引用存储内存,但我们不 build response因此无需处理。
* cmd_bytes 可以释放,因为 cmd slice 指向 cmd_bytes仅在 dispatch 期间使用。
* */
kvs_free(cmd_bytes); kvs_free(cmd_bytes);
} }
pthread_mutex_lock(&g_oplog_mu);
g_log_off = lseek(logfd, 0, SEEK_CUR); g_log_off = lseek(logfd, 0, SEEK_CUR);
if (g_sync_synced_gen < g_sync_gen) {
g_sync_synced_gen = g_sync_gen;
}
pthread_mutex_unlock(&g_oplog_mu);
return 0; return 0;
} }
@@ -123,8 +620,24 @@ int kvs_replay_log(int logfd){
*/ */
int ksv_clear_log(int logfd){ int ksv_clear_log(int logfd){
if(logfd < 0) return -1; if(logfd < 0) return -1;
pthread_mutex_lock(&g_oplog_mu);
if (kvs_oplog_flush_internal(logfd, 1) < 0) {
pthread_mutex_unlock(&g_oplog_mu);
return -2;
}
pthread_mutex_unlock(&g_oplog_mu);
if (kvs_oplog_fsync_all_workers(logfd) < 0) {
return -3;
}
cleanup_finished_iouring_tasks(&global_uring_ctx);
ftruncate(logfd, 0); ftruncate(logfd, 0);
lseek(logfd, 0, SEEK_SET); lseek(logfd, 0, SEEK_SET);
pthread_mutex_lock(&g_oplog_mu);
g_log_off = 0; g_log_off = 0;
g_sync_gen = 0;
g_sync_synced_gen = 0;
pthread_mutex_unlock(&g_oplog_mu);
return 0; return 0;
} }

View File

@@ -40,6 +40,24 @@ int kvs_create_snapshot(iouring_ctx_t *uring, const char* array_file, const char
return ret; return ret;
} }
/*
 * Take a snapshot asynchronously by forking: the child performs the
 * (blocking) kvs_create_snapshot() and exits; the parent returns at once.
 * Returns 0 in the parent on successful fork, -1 if fork() fails.
 *
 * NOTE(review): the parent never calls wait()/waitpid() here, so the child
 * remains a zombie until reaped elsewhere — confirm a SIGCHLD handler (or
 * SIG_IGN) exists in the process.
 */
int kvs_create_snapshot_async_1(iouring_ctx_t *uring, const char* array_file, const char* rbtree_file, const char* hash_file){
pid_t pid = fork();
if (pid == -1) { perror("fork"); return -1; }
if (pid == 0) {
/* child: write the snapshot, report failure via exit status */
int ret = kvs_create_snapshot(uring, array_file, rbtree_file, hash_file);
if (ret != 0) {
fprintf(stderr, "snapshot creation failed\n");
_exit(1);
}
/* _exit avoids flushing/duplicating the parent's stdio buffers */
_exit(0);
} else {
/* parent: snapshot proceeds in the background */
return 0;
}
}
static int send_file_to_ipport(const char *ip, int port, const char *filename) { static int send_file_to_ipport(const char *ip, int port, const char *filename) {
int sockfd = socket(AF_INET, SOCK_STREAM, 0); int sockfd = socket(AF_INET, SOCK_STREAM, 0);
if (sockfd < 0) { perror("socket"); return -1; } if (sockfd < 0) { perror("socket"); return -1; }
@@ -86,7 +104,7 @@ static int send_file_to_ipport(const char *ip, int port, const char *filename) {
return 0; return 0;
} }
int kvs_create_snapshot_async(const char *ip, int port){ int kvs_create_snapshot_async_2(const char *ip, int port){
pid_t pid = fork(); pid_t pid = fork();
if (pid == -1) { perror("fork"); return -1; } if (pid == -1) { perror("fork"); return -1; }

View File

@@ -1,12 +1,12 @@
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
OUTPUT := .output OUTPUT := .output
CLANG ?= clang CLANG ?= clang
LIBBPF_SRC := $(abspath ../../libbpf/src) LIBBPF_SRC := $(abspath ../../libbpf-bootstrap/libbpf/src)
BPFTOOL_SRC := $(abspath ../../bpftool/src) BPFTOOL_SRC := $(abspath ../../libbpf-bootstrap/bpftool/src)
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
LIBBLAZESYM_SRC := $(abspath ../../blazesym/) LIBBLAZESYM_SRC := $(abspath ../../libbpf-bootstrap/blazesym/)
LIBBLAZESYM_INC := $(abspath $(LIBBLAZESYM_SRC)/capi/include) LIBBLAZESYM_INC := $(abspath $(LIBBLAZESYM_SRC)/capi/include)
LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym_c.a) LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym_c.a)
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
@@ -16,11 +16,11 @@ ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
| sed 's/mips.*/mips/' \ | sed 's/mips.*/mips/' \
| sed 's/riscv64/riscv/' \ | sed 's/riscv64/riscv/' \
| sed 's/loongarch64/loongarch/') | sed 's/loongarch64/loongarch/')
VMLINUX := ../../vmlinux.h/include/$(ARCH)/vmlinux.h VMLINUX := ../../libbpf-bootstrap/vmlinux.h/include/$(ARCH)/vmlinux.h
# Use our own libbpf API headers and Linux UAPI headers distributed with # Use our own libbpf API headers and Linux UAPI headers distributed with
# libbpf to avoid dependency on system-wide headers, which could be missing or # libbpf to avoid dependency on system-wide headers, which could be missing or
# outdated # outdated
INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX)) -I$(LIBBLAZESYM_INC) INCLUDES := -I$(OUTPUT) -I../../libbpf-bootstrap/libbpf/include/uapi -I$(dir $(VMLINUX)) -I$(LIBBLAZESYM_INC)
CFLAGS := -g -Wall CFLAGS := -g -Wall
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)

BIN
ebpf/c/replica Executable file

Binary file not shown.

View File

@@ -189,6 +189,7 @@ static void* reader_thread_func(void *arg)
uint64_t last = __atomic_load_n(&g_shm.hdr->last_seq, __ATOMIC_ACQUIRE); uint64_t last = __atomic_load_n(&g_shm.hdr->last_seq, __ATOMIC_ACQUIRE);
if (local_seq > last) { if (local_seq > last) {
// 没有新数据,短暂休眠避免空转 // 没有新数据,短暂休眠避免空转
usleep(500);
continue; continue;
} }
if (read_off+ sizeof(replica_rec_hdr_t) >= g_shm.hdr->capacity) { if (read_off+ sizeof(replica_rec_hdr_t) >= g_shm.hdr->capacity) {
@@ -224,8 +225,7 @@ static void* reader_thread_func(void *arg)
// 序列号检查 // 序列号检查
if (h.seq != local_seq) { if (h.seq != local_seq) {
DEBUGLOG("Reader: seq mismatch! h.seq=%lu, local_seq=%lu, off=%u\n", // DEBUGLOG("Reader: seq mismatch! h.seq=%lu, local_seq=%lu, off=%u\n", h.seq, local_seq, read_off);
h.seq, local_seq, read_off);
continue; continue;
} }

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env bpftrace
// Count entries into and returns from __completed_cmd in the kvstore binary,
// printing both counters every 5 seconds.
BEGIN
{
	printf("开始统计 kvstore 进程的 __completed_cmd 调用次数...\n");
	printf("每 5 秒打印一次统计Ctrl-C 退出\n\n");
	// counters: function entries and returns
	@enter = 0;
	@exit = 0;
}
interval:s:5
{
	time("%H:%M:%S");
	printf("  __completed_enter_cmd 调用次数: %10d\n", @enter);
	printf("  __completed_exit_cmd  调用次数: %10d\n", @exit);
	// Optional: uncomment to reset the counters every interval
	// clear(@enter);
	// clear(@exit);
}
// BUG FIX: a uprobe fires on function ENTRY and a uretprobe on RETURN;
// the original script had the two counters swapped.
uprobe:/home/lian/share/9.1-kvstore/kvstore:__completed_cmd
{
	@enter++;
}
uretprobe:/home/lian/share/9.1-kvstore/kvstore:__completed_cmd
{
	@exit++;
}
END
{
	printf("\n最终统计\n");
}

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env bpftrace
// Count entries into and returns from the kernel's tcp_rcv_established(),
// printing both counters every 5 seconds.
BEGIN
{
printf("开始统计 kvstore 进程的 tcp_rcv_established 调用次数...\n");
printf("每 5 秒打印一次统计Ctrl-C 退出\n\n");
// counters: kprobe (entry) and kretprobe (return) hits
@enter = 0;
@exit = 0;
}
interval:s:5
{
time("%H:%M:%S");
printf("  tcp_rcv_established     调用次数: %10d\n", @enter);
printf("  tcp_rcv_established ret 调用次数: %10d\n", @exit);
// Optional: uncomment to reset the counters every interval
// clear(@enter);
// clear(@exit);
}
kprobe:tcp_rcv_established
{
@enter++;
}
kretprobe:tcp_rcv_established
{
@exit++;
}
END
{
printf("\n最终统计\n");
printf("tcp_rcv_established    : %d 次\n", @enter);
printf("tcp_rcv_established ret: %d 次\n", @exit);
}

View File

@@ -0,0 +1,41 @@
#!/usr/bin/env bpftrace
// Count recvfrom syscall enter/exit events for processes named "kvstore",
// printing both counters every 5 seconds.
BEGIN
{
printf("开始统计 kvstore 进程的 recvfrom 调用次数...\n");
printf("每 5 秒打印一次统计Ctrl-C 退出\n\n");
// counters: tracepoint enter and exit hits
@enter = 0;
@exit = 0;
}
interval:s:5
{
time("%H:%M:%S");
printf("  sys_enter_recvfrom 调用次数: %10d\n", @enter);
printf("  sys_exit_recvfrom  调用次数: %10d\n", @exit);
// Optional: uncomment to reset the counters every interval
// clear(@enter);
// clear(@exit);
}
tracepoint:syscalls:sys_enter_recvfrom
/comm == "kvstore"/
{
@enter++;
}
tracepoint:syscalls:sys_exit_recvfrom
/comm == "kvstore"/
{
@exit++;
}
END
{
printf("\n最终统计\n");
printf("sys_enter_recvfrom: %d 次\n", @enter);
printf("sys_exit_recvfrom : %d 次\n", @exit);
}

View File

@@ -1,12 +1,12 @@
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
OUTPUT := .output OUTPUT := .output
CLANG ?= clang CLANG ?= clang
LIBBPF_SRC := $(abspath ../../libbpf/src) LIBBPF_SRC := $(abspath ../../libbpf-bootstrap/libbpf/src)
BPFTOOL_SRC := $(abspath ../../bpftool/src) BPFTOOL_SRC := $(abspath ../../libbpf-bootstrap/bpftool/src)
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
LIBBLAZESYM_SRC := $(abspath ../../blazesym/) LIBBLAZESYM_SRC := $(abspath ../../libbpf-bootstrap/blazesym/)
LIBBLAZESYM_INC := $(abspath $(LIBBLAZESYM_SRC)/capi/include) LIBBLAZESYM_INC := $(abspath $(LIBBLAZESYM_SRC)/capi/include)
LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym_c.a) LIBBLAZESYM_OBJ := $(abspath $(OUTPUT)/libblazesym_c.a)
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
@@ -16,11 +16,11 @@ ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
| sed 's/mips.*/mips/' \ | sed 's/mips.*/mips/' \
| sed 's/riscv64/riscv/' \ | sed 's/riscv64/riscv/' \
| sed 's/loongarch64/loongarch/') | sed 's/loongarch64/loongarch/')
VMLINUX := ../../vmlinux.h/include/$(ARCH)/vmlinux.h VMLINUX := ../../libbpf-bootstrap/vmlinux.h/include/$(ARCH)/vmlinux.h
# Use our own libbpf API headers and Linux UAPI headers distributed with # Use our own libbpf API headers and Linux UAPI headers distributed with
# libbpf to avoid dependency on system-wide headers, which could be missing or # libbpf to avoid dependency on system-wide headers, which could be missing or
# outdated # outdated
INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX)) -I$(LIBBLAZESYM_INC) INCLUDES := -I$(OUTPUT) -I../../libbpf-bootstrap/libbpf/include/uapi -I$(dir $(VMLINUX)) -I$(LIBBLAZESYM_INC)
CFLAGS := -g -Wall CFLAGS := -g -Wall
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)

BIN
ebpf/old.c/replica Executable file

Binary file not shown.

View File

@@ -1,80 +1,133 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/* Copyright (c) 2020 Facebook */
#include "vmlinux.h" #include "vmlinux.h"
#include <bpf/bpf_helpers.h> #include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h> #include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h> #include <bpf/bpf_core_read.h>
#include <bpf/bpf_endian.h>
#include "replica.h" #include "replica.h"
char LICENSE[] SEC("license") = "Dual BSD/GPL"; char LICENSE[] SEC("license") = "Dual BSD/GPL";
#define FLAG_SSYNC_HAPPENED 0
#define TARGET_PORT 8888
/* ================= BPF Maps ================= */
struct { struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); __uint(type, BPF_MAP_TYPE_ARRAY);
__uint(key_size, sizeof(int)); __type(key, __u32);
__uint(value_size, sizeof(int)); __type(value, __u32);
} events SEC(".maps"); __uint(max_entries, 1);
} flags SEC(".maps");
/* __completed_cmd(const uint8_t *cmd, size_t len, unsigned long long seq); */ struct {
SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__completed_cmd") __uint(type, BPF_MAP_TYPE_RINGBUF);
int BPF_KPROBE(handle_completed_cmd, __uint(max_entries, 1 << 26); // 64MB
const __u8 *cmd, size_t len, __u64 seq) } rb SEC(".maps");
/* ================= Helper Functions ================= */
// 无需 process filter改用 socket port filter
/* ================= Kernel Hooks (TCP Layer) ================= */
/*
* 使用 kprobe 挂载 tcp_rcv_established
* 此时 skb 包含完整的 TCP 包Header + Payload数据在内核态。
*/
SEC("kprobe/tcp_rcv_established")
int BPF_KPROBE(trace_tcp_rcv, struct sock *sk, struct sk_buff *skb)
{ {
struct replica_event evt = {}; // 1. 检查 SSYNC 标志是否已开启 (只在全量同步后开始抓包)
__u32 copy_len; __u32 flag_key = FLAG_SSYNC_HAPPENED;
__u32 *flag_val = bpf_map_lookup_elem(&flags, &flag_key);
if (!flag_val || *flag_val == 0)
return 0;
evt.type = EVENT_COMPLETED_CMD; // 2. 过滤端口 8888
evt.complete.seq = seq; // sk->sk_num 存储的是 Host Byte Order 的本地端口
__u16 lport = BPF_CORE_READ(sk, __sk_common.skc_num);
if (lport != TARGET_PORT)
return 0;
copy_len = len; // 3. 计算数据长度
if (copy_len > MAX_CMD_LEN) // 在 tcp_rcv_established 中skb->len 是 (TCP Header + Data) 的长度
copy_len = MAX_CMD_LEN; // skb->data 指向 TCP Header 的起始位置
unsigned int skb_len = BPF_CORE_READ(skb, len);
evt.complete.len = copy_len; // 读取 TCP Header 长度 (doff 字段,单位是 4字节)
// 需要读取 skb->data 指向的内存的前几个字节来获取 doff
unsigned char *skb_data = BPF_CORE_READ(skb, data);
if (cmd) // 读取 TCP Header 的第 12 个字节 (包含 Data Offset)
bpf_probe_read_user(evt.complete.cmd, copy_len, cmd); // Offset 12: Data Offset (4 bits) | Reserved (3 bits) | NS (1 bit)
unsigned char doff_byte;
if (bpf_probe_read_kernel(&doff_byte, 1, skb_data + 12) < 0)
return 0;
bpf_perf_event_output(ctx, &events, unsigned int tcp_hdr_len = (doff_byte >> 4) * 4;
BPF_F_CURRENT_CPU,
&evt, sizeof(evt)); // 计算 Payload 长度
if (skb_len <= tcp_hdr_len)
return 0; // 只有 ACK 没有数据,或者是控制包
unsigned int payload_len = skb_len - tcp_hdr_len;
// 4. 准备 RingBuffer 数据
struct replica_event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
if (!e)
return 0;
e->type = EVENT_COMPLETED_CMD;
// 截断超长数据
if (payload_len > MAX_CMD_LEN)
e->complete.len = MAX_CMD_LEN;
else
e->complete.len = payload_len;
// 5. 核心修改:使用 bpf_probe_read_kernel 读取数据
// 数据起始位置 = skb->data + tcp_hdr_len
if (bpf_probe_read_kernel(&e->complete.cmd[0], e->complete.len, skb_data + tcp_hdr_len) < 0) {
bpf_ringbuf_discard(e, 0);
return 0;
}
bpf_ringbuf_submit(e, 0);
return 0; return 0;
} }
/* __ssync(const uint8_t *ip, uint32_t ip_len, int port, unsigned long long seq); */ /* ================= Uprobe Hooks================= */
SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__ssync") SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__ssync")
int BPF_KPROBE(handle_ssync, int BPF_KPROBE(handle_ssync,
const __u8 *ip, __u32 ip_len, int port, __u64 seq) const __u8 *ip, __u32 ip_len, int port, __u64 seq_unused)
{ {
struct replica_event evt = {}; __u32 key = FLAG_SSYNC_HAPPENED;
__u32 val = 1;
bpf_map_update_elem(&flags, &key, &val, BPF_ANY);
evt.type = EVENT_SSYNC; struct replica_event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
evt.sync.seq = seq; if (!e) return 0;
evt.sync.port = port;
e->type = EVENT_SSYNC;
e->sync.port = port;
__u32 copy_len = ip_len; __u32 copy_len = ip_len;
if (copy_len > sizeof(evt.sync.ip)) if (copy_len > sizeof(e->sync.ip)) copy_len = sizeof(e->sync.ip);
copy_len = sizeof(evt.sync.ip); if (ip) bpf_probe_read_user(e->sync.ip, copy_len, ip);
if (ip) bpf_ringbuf_submit(e, 0);
bpf_probe_read_user(evt.sync.ip, copy_len, ip);
bpf_perf_event_output(ctx, &events,
BPF_F_CURRENT_CPU,
&evt, sizeof(evt));
return 0; return 0;
} }
/* __sready(void); */
SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__sready") SEC("uprobe//home/lian/share/9.1-kvstore/kvstore:__sready")
int BPF_KPROBE(handle_sready) int BPF_KPROBE(handle_sready)
{ {
struct replica_event evt = {}; struct replica_event *e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0);
if (!e) return 0;
evt.type = EVENT_SREADY; e->type = EVENT_SREADY;
bpf_ringbuf_submit(e, 0);
bpf_perf_event_output(ctx, &events,
BPF_F_CURRENT_CPU,
&evt, sizeof(evt));
return 0; return 0;
} }

View File

@@ -1,5 +1,3 @@
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/* Copyright (c) 2020 Facebook */
#include <stdio.h> #include <stdio.h>
#include <unistd.h> #include <unistd.h>
#include <stdlib.h> #include <stdlib.h>
@@ -12,6 +10,7 @@
#include <arpa/inet.h> #include <arpa/inet.h>
#include <sys/epoll.h> #include <sys/epoll.h>
#include <fcntl.h> #include <fcntl.h>
#include <errno.h>
#include "replica.h" #include "replica.h"
@@ -20,10 +19,9 @@
typedef enum { typedef enum {
OFFLINE = 0, OFFLINE = 0,
ONLINE = 1, ONLINE = 1,
}replica_state_e ; } replica_state_e;
struct cmd_node { struct cmd_node {
__u64 seq;
__u32 len; __u32 len;
uint8_t *cmd; uint8_t *cmd;
struct cmd_node *next; struct cmd_node *next;
@@ -32,7 +30,7 @@ struct cmd_node {
struct pending_queue { struct pending_queue {
struct cmd_node *head; struct cmd_node *head;
struct cmd_node *tail; struct cmd_node *tail;
int count; int count;
}; };
/* ================= 全局状态 ================= */ /* ================= 全局状态 ================= */
@@ -43,7 +41,6 @@ static int epollfd = -1;
static char peer_ip[MAX_IP_LEN]; static char peer_ip[MAX_IP_LEN];
static int peer_port; static int peer_port;
static __u64 peer_seq;
static struct pending_queue pending = { static struct pending_queue pending = {
.head = NULL, .head = NULL,
@@ -66,7 +63,7 @@ static void pending_free()
q->count = 0; q->count = 0;
} }
static void pending_push(__u64 seq, __u32 len, const uint8_t *cmd) static void pending_push(__u32 len, const uint8_t *cmd)
{ {
struct cmd_node *node = malloc(sizeof(*node)); struct cmd_node *node = malloc(sizeof(*node));
if (!node) if (!node)
@@ -79,7 +76,6 @@ static void pending_push(__u64 seq, __u32 len, const uint8_t *cmd)
} }
memcpy(node->cmd, cmd, len); memcpy(node->cmd, cmd, len);
node->seq = seq;
node->len = len; node->len = len;
node->next = NULL; node->next = NULL;
@@ -93,72 +89,66 @@ static void pending_push(__u64 seq, __u32 len, const uint8_t *cmd)
pending.count++; pending.count++;
} }
static void pending_gc(__u64 min_seq) static long long int sendn = 0;
{
struct cmd_node *cur = pending.head;
int n = pending.count;
while (cur && cur->seq < min_seq) {
struct cmd_node *tmp = cur;
cur = cur->next;
free(tmp->cmd);
free(tmp);
pending.count--;
}
DEBUGLOG("gc:%d\n", n-pending.count);
pending.head = cur;
if (!cur)
pending.tail = NULL;
}
static void pending_send_all(void) static void pending_send_all(void)
{ {
struct cmd_node *cur = pending.head; struct cmd_node *cur = pending.head;
while (cur) { int need_out = 0;
int rt = send(sockfd, cur->cmd, cur->len, 0); int sent_count = 0;
const int MAX_BATCH = 100; // 批量发送上限,避免阻塞过久
if(rt == (int)cur->len){ while (cur && sent_count < MAX_BATCH) {
// 使用 MSG_MORE 合并多个小包
int flags = (cur->next && sent_count < MAX_BATCH - 1) ? MSG_MORE : 0;
int rt = send(sockfd, cur->cmd, cur->len, flags);
if (rt == (int)cur->len) {
sendn += rt;
printf("%s\n", cur->cmd);
struct cmd_node *tmp = cur; struct cmd_node *tmp = cur;
cur = cur->next; cur = cur->next;
free(tmp->cmd); free(tmp->cmd);
free(tmp); free(tmp);
pending.count--; pending.count--;
}else{ pending.head = cur;
DEBUGLOG("error\n"); sent_count++;
// 失败:不移动 cur直接 break } else if (rt > 0) {
if (rt < 0) { sendn += rt;
memmove(cur->cmd, cur->cmd + rt, cur->len - rt);
cur->len -= rt;
need_out = 1;
break;
} else {
if (errno == EAGAIN || errno == EWOULDBLOCK) {
need_out = 1;
break;
} else {
perror("send failed"); perror("send failed");
if (errno == ECONNRESET || errno == EPIPE) { state = OFFLINE;
state = OFFLINE;
if (sockfd >= 0) {
close(sockfd);
sockfd = -1;
DEBUGLOG("connect closed\n");
}
} else if (rt == 0) {
fprintf(stderr, "send returned 0 (peer closed?)\n");
} else {
fprintf(stderr, "partial send: %d/%u\n", rt, cur->len);
}
break; break;
} }
} }
} }
DEBUGLOG("sendn :%lld\n", sendn);
pending.head = cur; pending.head = cur;
if(!cur) if (!cur) pending.tail = NULL;
pending.tail = NULL;
if (sockfd >= 0 && state == ONLINE) {
struct epoll_event ev = {0};
ev.data.fd = sockfd;
ev.events = EPOLLIN;
if (need_out || pending.head) {
ev.events |= EPOLLOUT;
}
epoll_ctl(epollfd, EPOLL_CTL_MOD, sockfd, &ev);
}
} }
/* ================= 网络逻辑 ================= */ /* ================= 网络逻辑 ================= */
static void try_connect(void) static void try_connect(void)
{ {
if(sockfd > 0){ if (sockfd > 0) {
close(sockfd); close(sockfd);
sockfd = -1; sockfd = -1;
} }
@@ -170,14 +160,14 @@ static void try_connect(void)
addr.sin_port = htons(peer_port); addr.sin_port = htons(peer_port);
inet_pton(AF_INET, peer_ip, &addr.sin_addr); inet_pton(AF_INET, peer_ip, &addr.sin_addr);
for(i = 0;i < 10; ++ i){ for (i = 0; i < 10; ++i) {
sockfd = socket(AF_INET, SOCK_STREAM, 0); sockfd = socket(AF_INET, SOCK_STREAM, 0);
if (sockfd < 0) { if (sockfd < 0) {
perror("socket"); perror("socket");
return; return;
} }
DEBUGLOG("connect try %d...\n", i + 1); DEBUGLOG("connect try %d... %s:%d\n", i + 1, peer_ip, peer_port);
if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) == 0) { if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
DEBUGLOG("connect success: %s:%d\n", peer_ip, peer_port); DEBUGLOG("connect success: %s:%d\n", peer_ip, peer_port);
@@ -190,7 +180,10 @@ static void try_connect(void)
epoll_ctl(epollfd, EPOLL_CTL_ADD, sockfd, &ev); epoll_ctl(epollfd, EPOLL_CTL_ADD, sockfd, &ev);
state = ONLINE; state = ONLINE;
pending_send_all(); if (pending.head) {
ev.events = EPOLLIN | EPOLLOUT;
epoll_ctl(epollfd, EPOLL_CTL_MOD, sockfd, &ev);
}
return; return;
} }
@@ -234,17 +227,10 @@ static void handle_socket_readable(void)
static void handle_socket_writable(void) static void handle_socket_writable(void)
{ {
pending_send_all(); pending_send_all();
if (!pending.head) {
struct epoll_event ev;
ev.events = EPOLLIN; // 只监听读
ev.data.fd = sockfd;
epoll_ctl(epollfd, EPOLL_CTL_MOD, sockfd, &ev);
}
} }
/* ================= ring buffer 回调 ================= */
/* ================= perf buffer 回调 ================= */ static int handle_event(void *ctx, void *data, size_t size)
static void handle_event(void *ctx, int cpu, void *data, __u32 size)
{ {
struct replica_event *evt = data; struct replica_event *evt = data;
switch (evt->type) { switch (evt->type) {
@@ -252,20 +238,18 @@ static void handle_event(void *ctx, int cpu, void *data, __u32 size)
case EVENT_SSYNC: case EVENT_SSYNC:
strncpy(peer_ip, evt->sync.ip, sizeof(peer_ip)); strncpy(peer_ip, evt->sync.ip, sizeof(peer_ip));
peer_port = evt->sync.port; peer_port = evt->sync.port;
peer_seq = evt->sync.seq; DEBUGLOG("SSYNC [%s:%d]\n", peer_ip, peer_port);
DEBUGLOG("SSYNC [seq:%lld], [%s:%d]\n", peer_seq, peer_ip, peer_port);
state = OFFLINE; state = OFFLINE;
pending_gc(peer_seq);
break; break;
case EVENT_COMPLETED_CMD: case EVENT_COMPLETED_CMD:
// DEBUGLOG("CMD [seq:%lld], cmd:\n[\n%s]\n", evt->complete.seq, evt->complete.cmd); // 这里收到的可能是半个命令,或者是多个命令的粘包
pending_push(evt->complete.seq, // 但对于转发器来说,只是字节流,直接 push 即可
evt->complete.len, if (evt->complete.len > 0) {
evt->complete.cmd); pending_push(evt->complete.len, evt->complete.cmd);
}
if (state == ONLINE && sockfd >= 0) { if (state == ONLINE && sockfd >= 0 && pending.head) {
struct epoll_event ev; struct epoll_event ev;
ev.events = EPOLLIN | EPOLLOUT; ev.events = EPOLLIN | EPOLLOUT;
ev.data.fd = sockfd; ev.data.fd = sockfd;
@@ -274,82 +258,80 @@ static void handle_event(void *ctx, int cpu, void *data, __u32 size)
break; break;
case EVENT_SREADY: case EVENT_SREADY:
DEBUGLOG("SREADY \n"); DEBUGLOG("SREADY\n");
if (state == OFFLINE) if (state == OFFLINE)
try_connect(); try_connect();
break; break;
} }
return 0;
} }
/* ================= main ================= */
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
struct replica_bpf *skel; struct replica_bpf *skel;
struct perf_buffer *pb = NULL; struct ring_buffer *rb = NULL;
int err; int err;
/* Open BPF application */ // 提高 rlimit 以允许加载 BPF
skel = replica_bpf__open(); struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
if (!skel) { setrlimit(RLIMIT_MEMLOCK, &r);
fprintf(stderr, "Failed to open BPF skeleton\n");
return 1;
}
/* Load & verify BPF programs */ skel = replica_bpf__open();
err = replica_bpf__load(skel); if (!skel) {
if (err) { fprintf(stderr, "Failed to open BPF skeleton\n");
fprintf(stderr, "Failed to load and verify BPF skeleton\n"); return 1;
goto cleanup; }
}
/* Attach tracepoint handler */ err = replica_bpf__load(skel);
err = replica_bpf__attach(skel); if (err) {
if (err) { fprintf(stderr, "Failed to load BPF skeleton\n");
fprintf(stderr, "Failed to attach BPF skeleton\n");
goto cleanup;
}
printf("Successfully started! \n");
pb = perf_buffer__new(bpf_map__fd(skel->maps.events), 8,
handle_event, NULL, NULL, NULL);
if(!pb){
goto cleanup;
}
epollfd = epoll_create1(0);
if (epollfd < 0) {
fprintf(stderr, "epoll_create1 failed\n");
goto cleanup; goto cleanup;
} }
while (1) { err = replica_bpf__attach(skel);
if (err) {
fprintf(stderr, "Failed to attach BPF skeleton\n");
goto cleanup;
}
printf("Successfully started! Monitoring TCP port 8888 (Kernel Side)...\n");
rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL);
if (!rb) {
fprintf(stderr, "Failed to create ring buffer\n");
goto cleanup;
}
epollfd = epoll_create1(0);
// ... (主循环保持不变) ...
// 主循环建议:
while (1) {
struct epoll_event events[10]; struct epoll_event events[10];
perf_buffer__poll(pb, 1000); // 处理事件 // 既然追求性能Polling 依然是必要的
// 10ms 的延迟对于 RingBuffer 消费是可以接受的
int poll_timeout = (state == ONLINE) ? 10 : 100;
if(OFFLINE) continue; ring_buffer__poll(rb, poll_timeout);
if (state == OFFLINE) continue;
int nfds = epoll_wait(epollfd, events, 10, 0); int nfds = epoll_wait(epollfd, events, 10, 0);
for (int i = 0; i < nfds; i++) { for (int i = 0; i < nfds; i++) {
if (events[i].data.fd == sockfd) { if (events[i].data.fd == sockfd) {
if (events[i].events & EPOLLIN) { if (events[i].events & EPOLLIN) handle_socket_readable();
handle_socket_readable(); // 快速消费接收数据 if (events[i].events & EPOLLOUT) handle_socket_writable();
}
if (events[i].events & EPOLLOUT) {
handle_socket_writable(); // 发送数据
}
} }
} }
} }
perf_buffer__free(pb);
cleanup: cleanup:
pending_free(); // ... (清理代码保持不变) ...
if (sockfd >= 0) close(sockfd); if (rb) ring_buffer__free(rb);
replica_bpf__destroy(skel); pending_free();
return -err; if (sockfd >= 0) close(sockfd);
if (epollfd >= 0) close(epollfd);
replica_bpf__destroy(skel);
return -err;
} }

View File

@@ -1,24 +1,21 @@
#ifndef __REPLICA_H__ #ifndef __REPLICA_H__
#define __REPLICA_H__ #define __REPLICA_H__
#define MAX_CMD_LEN 4096
#define MAX_CMD_LEN 256
#define MAX_IP_LEN 64 #define MAX_IP_LEN 64
enum event_type { enum event_type {
EVENT_COMPLETED_CMD, EVENT_COMPLETED_CMD = 1,
EVENT_SSYNC, EVENT_SSYNC = 2,
EVENT_SREADY, EVENT_SREADY = 3,
}; };
struct complete_cmd_evt { struct complete_cmd_evt {
__u64 seq;
__u32 len; __u32 len;
__u8 cmd[MAX_CMD_LEN]; __u8 cmd[MAX_CMD_LEN];
}; };
struct sync_evt { struct sync_evt {
__u64 seq;
char ip[MAX_IP_LEN]; char ip[MAX_IP_LEN];
__s32 port; __s32 port;
}; };
@@ -33,5 +30,4 @@ struct replica_event {
}; };
}; };
#endif #endif

228
hash.c
View File

@@ -1,228 +0,0 @@
// #include <stdio.h>
// #include <string.h>
// #include <stdlib.h>
// #include <pthread.h>
// #define MAX_KEY_LEN 128
// #define MAX_VALUE_LEN 512
// #define MAX_TABLE_SIZE 1024
// typedef struct hashnode_s {
// char key[MAX_KEY_LEN];
// char value[MAX_VALUE_LEN];
// struct hashnode_s *next;
// } hashnode_t;
// typedef struct hashtable_s {
// hashnode_t **nodes; //* change **,
// int max_slots;
// int count;
// pthread_mutex_t lock;
// } hashtable_t;
// hashtable_t hash;
// //Connection
// // 'C' + 'o' + 'n'
// static int _hash(char *key, int size) {
// if (!key) return -1;
// int sum = 0;
// int i = 0;
// while (key[i] != 0) {
// sum += key[i];
// i ++;
// }
// return sum % size;
// }
// hashnode_t *_create_node(char *key, char *value) {
// hashnode_t *node = (hashnode_t*)malloc(sizeof(hashnode_t));
// if (!node) return NULL;
// strncpy(node->key, key, MAX_KEY_LEN);
// strncpy(node->value, value, MAX_VALUE_LEN);
// node->next = NULL;
// return node;
// }
// //
// int init_hashtable(hashtable_t *hash) {
// if (!hash) return -1;
// hash->nodes = (hashnode_t**)malloc(sizeof(hashnode_t*) * MAX_TABLE_SIZE);
// if (!hash->nodes) return -1;
// hash->max_slots = MAX_TABLE_SIZE;
// hash->count = 0;
// pthread_mutex_init(&hash->lock, NULL);
// return 0;
// }
// //
// void dest_hashtable(hashtable_t *hash) {
// if (!hash) return;
// int i = 0;
// for (i = 0;i < hash->max_slots;i ++) {
// hashnode_t *node = hash->nodes[i];
// while (node != NULL) { // error
// hashnode_t *tmp = node;
// node = node->next;
// hash->nodes[i] = node;
// free(tmp);
// }
// }
// free(hash->nodes);
// }
// // mp
// int put_kv_hashtable(hashtable_t *hash, char *key, char *value) {
// if (!hash || !key || !value) return -1;
// int idx = _hash(key, MAX_TABLE_SIZE);
// pthread_mutex_lock(&hash->lock);
// hashnode_t *node = hash->nodes[idx];
// #if 1
// while (node != NULL) {
// if (strcmp(node->key, key) == 0) { // exist
// pthread_mutex_unlock(&hash->lock);
// return 1;
// }
// node = node->next;
// }
// #endif
// hashnode_t *new_node = _create_node(key, value);
// new_node->next = hash->nodes[idx];
// hash->nodes[idx] = new_node;
// hash->count ++;
// pthread_mutex_unlock(&hash->lock);
// return 0;
// }
// char * get_kv_hashtable(hashtable_t *hash, char *key) {
// if (!hash || !key) return NULL;
// int idx = _hash(key, MAX_TABLE_SIZE);
// pthread_mutex_lock(&hash->lock);
// hashnode_t *node = hash->nodes[idx];
// while (node != NULL) {
// if (strcmp(node->key, key) == 0) {
// pthread_mutex_unlock(&hash->lock);
// return node->value;
// }
// node = node->next;
// }
// pthread_mutex_unlock(&hash->lock);
// return NULL;
// }
// int count_kv_hashtable(hashtable_t *hash) {
// return hash->count;
// }
// int delete_kv_hashtable(hashtable_t *hash, char *key) {
// if (!hash || !key) return -2;
// int idx = _hash(key, MAX_TABLE_SIZE);
// pthread_mutex_lock(&hash->lock);
// hashnode_t *head = hash->nodes[idx];
// if (head == NULL) return -1; // noexist
// // head node
// if (strcmp(head->key, key) == 0) {
// hashnode_t *tmp = head->next;
// hash->nodes[idx] = tmp;
// free(head);
// hash->count --;
// pthread_mutex_unlock(&hash->lock);
// return 0;
// }
// hashnode_t *cur = head;
// while (cur->next != NULL) {
// if (strcmp(cur->next->key, key) == 0) break; // search node
// cur = cur->next;
// }
// if (cur->next == NULL) {
// pthread_mutex_unlock(&hash->lock);
// return -1;
// }
// hashnode_t *tmp = cur->next;
// cur->next = tmp->next;
// free(tmp);
// hash->count --;
// pthread_mutex_unlock(&hash->lock);
// return 0;
// }
// int exist_kv_hashtable(hashtable_t *hash, char *key) {
// char *value = get_kv_hashtable(hash, key);
// if (value) return 1;
// else return 0;
// }

BIN
img/主从同步.png Executable file

Binary file not shown.

After

Width:  |  Height:  |  Size: 143 KiB

View File

@@ -1,7 +0,0 @@
rm -rf libbpf-bootstrap/examples/c
cp -R ebpf/c libbpf-bootstrap/examples
cd libbpf-bootstrap/examples/c
make

View File

@@ -1,192 +0,0 @@
#include "kvstore.h"
#include "kvs_rw_tools.h"
#include "memory/alloc_dispatch.h"
#include <arpa/inet.h>
// singleton
kvs_array_t global_array = {0};
/* Initialize the singleton array store: allocate and zero its slot table.
 * Refuses to run twice on the same instance.
 * Returns 0 on success, -1 on bad argument, double-init, or allocation failure. */
int kvs_array_create(kvs_array_t *inst) {
    if (inst == NULL) return -1;
    if (inst->table != NULL) {
        printf("table has alloc\n");
        return -1;
    }
    size_t bytes = (size_t)KVS_ARRAY_SIZE * sizeof(kvs_array_item_t);
    inst->table = kvs_malloc(bytes);
    if (inst->table == NULL) return -1;
    /* Zeroed slots: key == NULL marks a free/empty entry everywhere else. */
    memset(inst->table, 0, bytes);
    inst->total = 0;
    return 0;
}
/* Release all memory owned by the array store.
 * BUGFIX: previously only the slot table itself was freed, leaking every
 * key/value string that kvs_array_set/kvs_array_mod had allocated.
 * Also resets the instance so a later kvs_array_create can run safely. */
void kvs_array_destroy(kvs_array_t *inst) {
    if (!inst) return;
    if (inst->table) {
        /* The table was zeroed on create and del NULLs freed entries,
         * so a non-NULL key/value pointer is always heap-owned here. */
        for (int i = 0; i < KVS_ARRAY_SIZE; i++) {
            if (inst->table[i].key) {
                kvs_free(inst->table[i].key);
                inst->table[i].key = NULL;
            }
            if (inst->table[i].value) {
                kvs_free(inst->table[i].value);
                inst->table[i].value = NULL;
            }
        }
        kvs_free(inst->table);
        inst->table = NULL; /* guard against double destroy / stale use */
    }
    inst->total = 0;
}
/*
 * Insert a new key/value pair; both strings are deep-copied into the table.
 * @return: <0, error; =0, success; >0, exist
 */
int kvs_array_set(kvs_array_t *inst, char *key, char *value) {
    if (inst == NULL || key == NULL || value == NULL) return -1;
    if (inst->total == KVS_ARRAY_SIZE) return -1;
    if (kvs_array_get(inst, key) != NULL) {
        return 1; /* key already present */
    }
    size_t klen = strlen(key);
    char *kcopy = kvs_malloc(klen + 1);
    if (kcopy == NULL) return -2;
    memcpy(kcopy, key, klen + 1); /* includes the NUL terminator */
    size_t vlen = strlen(value);
    char *vcopy = kvs_malloc(vlen + 1);
    if (vcopy == NULL) {
        kvs_free(kcopy); /* BUGFIX: the key copy used to leak here */
        return -2;
    }
    memcpy(vcopy, value, vlen + 1);
    /* Prefer reusing a hole left behind by a previous mid-array delete. */
    for (int i = 0; i < inst->total; i++) {
        if (inst->table[i].key == NULL) {
            inst->table[i].key = kcopy;
            inst->table[i].value = vcopy;
            /* BUGFIX: do not bump total when refilling a hole inside
             * [0, total) — total tracks the high-water mark, and deletes
             * in the middle leave it unchanged; incrementing here slowly
             * inflated total until the store falsely reported full. */
            return 0;
        }
    }
    /* No hole: append at the high-water mark (capacity checked above). */
    inst->table[inst->total].key = kcopy;
    inst->table[inst->total].value = vcopy;
    inst->total++;
    return 0;
}
/* Look up a key; returns the stored value pointer (not a copy) or NULL
 * when absent. Holes left by deletes (key == NULL) are skipped. */
char* kvs_array_get(kvs_array_t *inst, char *key) {
    if (inst == NULL || key == NULL) return NULL;
    for (int i = 0; i < inst->total; i++) {
        kvs_array_item_t *item = &inst->table[i];
        if (item->key != NULL && strcmp(item->key, key) == 0) {
            return item->value;
        }
    }
    return NULL;
}
/*
 * Remove a key/value pair, freeing the heap copies owned by the table.
 * @return < 0, error; =0, success; >0, no exist
 */
int kvs_array_del(kvs_array_t *inst, char *key) {
    if (inst == NULL || key == NULL) return -1;
    int i = 0;
    for (i = 0; i < inst->total; i++) {
        /* BUGFIX: skip holes — strcmp(NULL, key) is undefined behavior,
         * and get/mod already guard for this case. */
        if (inst->table[i].key == NULL) {
            continue;
        }
        if (strcmp(inst->table[i].key, key) == 0) {
            kvs_free(inst->table[i].key);
            inst->table[i].key = NULL;
            kvs_free(inst->table[i].value);
            inst->table[i].value = NULL;
            /* Shrink the high-water mark only when the last slot freed up;
             * mid-array deletes leave a hole for kvs_array_set to reuse. */
            if (inst->total - 1 == i) {
                inst->total--;
            }
            return 0;
        }
    }
    /* BUGFIX: with total == 0 the old code returned i == 0, which the
     * contract reads as success; always return a positive "not found". */
    return (i > 0) ? i : 1;
}
/*
 * Replace the value stored under an existing key (deep copy).
 * @return : < 0, error; =0, success; >0, no exist
 */
int kvs_array_mod(kvs_array_t *inst, char *key, char *value) {
    if (inst == NULL || key == NULL || value == NULL) return -1;
    if (inst->total == 0) {
        return KVS_ARRAY_SIZE; /* empty store: key cannot exist (>0) */
    }
    int i = 0;
    for (i = 0; i < inst->total; i++) {
        if (inst->table[i].key == NULL) {
            continue; /* hole from a previous delete */
        }
        if (strcmp(inst->table[i].key, key) == 0) {
            /* BUGFIX: allocate the replacement BEFORE freeing the old
             * value — the old code freed first, so a failed allocation
             * left a dangling pointer in the table. */
            size_t vlen = strlen(value);
            char *vcopy = kvs_malloc(vlen + 1);
            if (vcopy == NULL) return -2;
            memcpy(vcopy, value, vlen + 1); /* includes the NUL terminator */
            kvs_free(inst->table[i].value);
            inst->table[i].value = vcopy;
            return 0;
        }
    }
    return i; /* >0: key not found (total > 0 here, so i > 0) */
}
/*
 * Membership test built on kvs_array_get.
 * @return 0: exist, 1: no exist, -1: bad argument
 */
int kvs_array_exist(kvs_array_t *inst, char *key) {
    if (inst == NULL || key == NULL) return -1;
    return (kvs_array_get(inst, key) != NULL) ? 0 : 1;
}

View File

@@ -235,7 +235,7 @@ int kvs_array_save(iouring_ctx_t *uring, kvs_array_t *inst, const char* filename
for (int i = 0; i < count; i++) total += lens[i]; for (int i = 0; i < count; i++) total += lens[i];
task_t *t = submit_write(uring, fd, bufs, lens, count, current_off); task_t *t = submit_write(uring, fd, bufs, lens, count, current_off);
cleanup_finished_iouring_tasks(); cleanup_finished_iouring_tasks(uring);
if (!t) { if (!t) {
perror("task init failed"); perror("task init failed");
@@ -249,7 +249,7 @@ int kvs_array_save(iouring_ctx_t *uring, kvs_array_t *inst, const char* filename
clean: clean:
while (!uring_task_complete(uring)) { while (!uring_task_complete(uring)) {
usleep(1000); usleep(1000);
cleanup_finished_iouring_tasks(); cleanup_finished_iouring_tasks(uring);
} }
close(fd); close(fd);
return 0; return 0;

View File

@@ -1,286 +0,0 @@
// #include <stdio.h>
// #include <string.h>
// #include <stdlib.h>
// #include <pthread.h>
// #include "memory/alloc_dispatch.h"
// #include "kvstore.h"
// // Key, Value -->
// // Modify
// kvs_hash_t global_hash;
// //Connection
// // 'C' + 'o' + 'n'
// static int _hash(char *key, int size) {
// if (!key) return -1;
// int sum = 0;
// int i = 0;
// while (key[i] != 0) {
// sum += key[i];
// i ++;
// }
// return sum % size;
// }
// hashnode_t *_create_node(char *key, char *value) {
// hashnode_t *node = (hashnode_t*)kvs_malloc(sizeof(hashnode_t));
// if (!node) return NULL;
// #if ENABLE_KEY_POINTER
// char *kcopy = kvs_malloc(strlen(key) + 1);
// if (kcopy == NULL) return NULL;
// memset(kcopy, 0, strlen(key) + 1);
// strncpy(kcopy, key, strlen(key));
// node->key = kcopy;
// char *kvalue = kvs_malloc(strlen(value) + 1);
// if (kvalue == NULL) {
// kvs_free(kvalue);
// return NULL;
// }
// memset(kvalue, 0, strlen(value) + 1);
// strncpy(kvalue, value, strlen(value));
// node->value = kvalue;
// #else
// strncpy(node->key, key, MAX_KEY_LEN);
// strncpy(node->value, value, MAX_VALUE_LEN);
// #endif
// node->next = NULL;
// return node;
// }
// //
// int kvs_hash_create(kvs_hash_t *hash) {
// if (!hash) return -1;
// hash->nodes = (hashnode_t**)kvs_malloc(sizeof(hashnode_t*) * MAX_TABLE_SIZE);
// if (!hash->nodes) return -1;
// hash->max_slots = MAX_TABLE_SIZE;
// hash->count = 0;
// return 0;
// }
// //
// void kvs_hash_destroy(kvs_hash_t *hash) {
// if (!hash) return;
// int i = 0;
// for (i = 0;i < hash->max_slots;i ++) {
// hashnode_t *node = hash->nodes[i];
// while (node != NULL) { // error
// hashnode_t *tmp = node;
// node = node->next;
// hash->nodes[i] = node;
// kvs_free(tmp);
// }
// }
// kvs_free(hash->nodes);
// }
// // 5 + 2
// // mp
// int kvs_hash_set(kvs_hash_t *hash, char *key, char *value) {
// if (!hash || !key || !value) return -1;
// int idx = _hash(key, MAX_TABLE_SIZE);
// hashnode_t *node = hash->nodes[idx];
// #if 1
// while (node != NULL) {
// if (strcmp(node->key, key) == 0) { // exist
// return 1;
// }
// node = node->next;
// }
// #endif
// hashnode_t *new_node = _create_node(key, value);
// new_node->next = hash->nodes[idx];
// hash->nodes[idx] = new_node;
// hash->count ++;
// return 0;
// }
// char * kvs_hash_get(kvs_hash_t *hash, char *key) {
// if (!hash || !key) return NULL;
// int idx = _hash(key, MAX_TABLE_SIZE);
// hashnode_t *node = hash->nodes[idx];
// while (node != NULL) {
// if (strcmp(node->key, key) == 0) {
// return node->value;
// }
// node = node->next;
// }
// return NULL;
// }
// int kvs_hash_mod(kvs_hash_t *hash, char *key, char *value) {
// if (!hash || !key) return -1;
// int idx = _hash(key, MAX_TABLE_SIZE);
// hashnode_t *node = hash->nodes[idx];
// while (node != NULL) {
// if (strcmp(node->key, key) == 0) {
// break;
// }
// node = node->next;
// }
// if (node == NULL) {
// return 1;
// }
// // node -->
// kvs_free(node->value);
// char *kvalue = kvs_malloc(strlen(value) + 1);
// if (kvalue == NULL) return -2;
// memset(kvalue, 0, strlen(value) + 1);
// strncpy(kvalue, value, strlen(value));
// node->value = kvalue;
// return 0;
// }
// int kvs_hash_count(kvs_hash_t *hash) {
// return hash->count;
// }
// int kvs_hash_del(kvs_hash_t *hash, char *key) {
// if (!hash || !key) return -2;
// int idx = _hash(key, MAX_TABLE_SIZE);
// hashnode_t *head = hash->nodes[idx];
// if (head == NULL) return -1; // noexist
// // head node
// if (strcmp(head->key, key) == 0) {
// hashnode_t *tmp = head->next;
// hash->nodes[idx] = tmp;
// kvs_free(head);
// hash->count --;
// return 0;
// }
// hashnode_t *cur = head;
// while (cur->next != NULL) {
// if (strcmp(cur->next->key, key) == 0) break; // search node
// cur = cur->next;
// }
// if (cur->next == NULL) {
// return -1;
// }
// hashnode_t *tmp = cur->next;
// cur->next = tmp->next;
// #if ENABLE_KEY_POINTER
// kvs_free(tmp->key);
// kvs_free(tmp->value);
// #endif
// kvs_free(tmp);
// hash->count --;
// return 0;
// }
// int kvs_hash_exist(kvs_hash_t *hash, char *key) {
// char *value = kvs_hash_get(hash, key);
// if (!value) return 1;
// return 0;
// }
// #if 0
// int main() {
// kvs_hash_create(&hash);
// kvs_hash_set(&hash, "Teacher1", "King");
// kvs_hash_set(&hash, "Teacher2", "Darren");
// kvs_hash_set(&hash, "Teacher3", "Mark");
// kvs_hash_set(&hash, "Teacher4", "Vico");
// kvs_hash_set(&hash, "Teacher5", "Nick");
// char *value1 = kvs_hash_get(&hash, "Teacher1");
// printf("Teacher1 : %s\n", value1);
// int ret = kvs_hash_mod(&hash, "Teacher1", "King1");
// printf("mode Teacher1 ret : %d\n", ret);
// char *value2 = kvs_hash_get(&hash, "Teacher1");
// printf("Teacher2 : %s\n", value1);
// ret = kvs_hash_del(&hash, "Teacher1");
// printf("delete Teacher1 ret : %d\n", ret);
// ret = kvs_hash_exist(&hash, "Teacher1");
// printf("Exist Teacher1 ret : %d\n", ret);
// kvs_hash_destroy(&hash);
// return 0;
// }
// #endif

View File

@@ -1,384 +1,680 @@
#include "kvstore.h" #include "kvstore.h"
#include "kvs_rw_tools.h" #include "kvs_rw_tools.h"
#include "memory/alloc_dispatch.h" #include "memory/alloc_dispatch.h"
#include "diskuring/diskuring.h" #include "diskuring/diskuring.h"
// Key, Value -->
// Modify
#include <arpa/inet.h>
#include <fcntl.h>
#include <unistd.h>
#define KVS_HASH_INITIAL_GLOBAL_DEPTH 1u
#define KVS_HASH_BUCKET_SPLIT_THRESHOLD 64u
#define KVS_HASH_MAX_GLOBAL_DEPTH 31u
kvs_hash_t global_hash; kvs_hash_t global_hash;
static uint32_t _hash_u32(const void *key, uint32_t key_len) {
//Connection
// 'C' + 'o' + 'n'
static int _hash(const void *key, uint32_t key_len, int size) {
if (!key || size <= 0) return -1;
const uint8_t *p = (const uint8_t *)key; const uint8_t *p = (const uint8_t *)key;
uint32_t sum = 0; uint32_t hash = 2166136261u;
for (uint32_t i = 0; i < key_len; i++) { for (uint32_t i = 0; i < key_len; i++) {
sum += p[i]; hash ^= p[i];
hash *= 16777619u;
} }
return sum % size;
return hash;
}
static inline uint32_t _dir_index(uint32_t hashv, uint32_t global_depth) {
if (global_depth == 0) {
return 0;
}
return hashv & ((1u << global_depth) - 1u);
}
static hashbucket_t *_bucket_create(uint32_t local_depth) {
hashbucket_t *bucket = (hashbucket_t *)kvs_malloc(sizeof(hashbucket_t));
if (!bucket) {
return NULL;
}
bucket->head = NULL;
bucket->local_depth = local_depth;
bucket->item_count = 0;
bucket->next_all = NULL;
return bucket;
}
static void _bucket_list_push(kvs_hash_t *hash, hashbucket_t *bucket) {
bucket->next_all = hash->bucket_list;
hash->bucket_list = bucket;
}
static inline uint8_t *_get_node_key(const hashnode_t *node) {
return (uint8_t *)node + sizeof(hashnode_t);
}
static inline uint8_t *_get_node_value(const hashnode_t *node) {
return (uint8_t *)node + sizeof(hashnode_t) + node->key_len;
} }
static int _key_equal(const hashnode_t *node, const void *key, uint32_t key_len) { static int _key_equal(const hashnode_t *node, const void *key, uint32_t key_len) {
if (!node || !key) return 0; if (!node || !key) {
if (!node->key) return 0; return 0;
if (node->key_len != key_len) return 0; }
return memcmp(node->key, key, key_len) == 0; if (node->key_len != key_len) {
return 0;
}
return memcmp(_get_node_key(node), key, key_len) == 0;
} }
static hashnode_t *_create_node(const void *key, uint32_t key_len, static hashnode_t *_create_node(const void *key, uint32_t key_len,
const void *value, uint32_t value_len) { const void *value, uint32_t value_len) {
hashnode_t *node = (hashnode_t*)kvs_malloc(sizeof(hashnode_t)); size_t total_size = sizeof(hashnode_t) + key_len + value_len;
if (!node) return NULL; hashnode_t *node;
memset(node, 0, sizeof(*node)); uint8_t *data_ptr;
if (key_len > 0) { if (!key || key_len == 0) {
node->key = (uint8_t*)kvs_malloc(key_len); return NULL;
if (!node->key) { kvs_free(node); return NULL; }
memcpy(node->key, key, key_len);
node->key_len = key_len;
} }
if (value_len > 0) { node = (hashnode_t *)kvs_malloc(total_size);
node->value = (uint8_t*)kvs_malloc(value_len); if (!node) {
if (!node->value) { return NULL;
kvs_free(node->key);
kvs_free(node);
return NULL;
}
memcpy(node->value, value, value_len);
node->value_len = value_len;
} else {
node->value = NULL;
node->value_len = 0;
} }
memset(node, 0, sizeof(hashnode_t));
node->key_len = key_len;
node->value_len = value_len;
node->next = NULL; node->next = NULL;
data_ptr = (uint8_t *)node + sizeof(hashnode_t);
memcpy(data_ptr, key, key_len);
if (value_len > 0 && value) {
memcpy(data_ptr + key_len, value, value_len);
}
return node; return node;
} }
static hashnode_t *_find_node(hashbucket_t *bucket,
const void *key, uint32_t key_len,
hashnode_t ***out_indirect) {
hashnode_t **indirect;
if (out_indirect) {
*out_indirect = NULL;
}
if (!bucket || !key || key_len == 0) {
return NULL;
}
indirect = &bucket->head;
while (*indirect) {
if (_key_equal(*indirect, key, key_len)) {
if (out_indirect) {
*out_indirect = indirect;
}
return *indirect;
}
indirect = &(*indirect)->next;
}
return NULL;
}
static int _update_node_value(hashnode_t **node_ptr, const void *value, uint32_t value_len) {
hashnode_t *old_node;
hashnode_t *new_node;
if (!node_ptr || !*node_ptr) {
return -1;
}
if (value_len > 0 && !value) {
return -1;
}
old_node = *node_ptr;
if (old_node->value_len == value_len) {
if (value_len > 0) {
memcpy(_get_node_value(old_node), value, value_len);
}
return 0;
}
new_node = _create_node(_get_node_key(old_node), old_node->key_len, value, value_len);
if (!new_node) {
return -1;
}
new_node->next = old_node->next;
*node_ptr = new_node;
kvs_free(old_node);
return 0;
}
static int _double_directory(kvs_hash_t *hash) {
uint32_t old_size;
uint32_t new_size;
hashbucket_t **new_dir;
uint32_t i;
if (!hash || !hash->directory) {
return -1;
}
if (hash->global_depth >= KVS_HASH_MAX_GLOBAL_DEPTH) {
return 1;
}
old_size = hash->dir_size;
if (old_size == 0 || old_size > UINT32_MAX / 2u) {
return 1;
}
new_size = old_size << 1;
new_dir = (hashbucket_t **)kvs_malloc(sizeof(hashbucket_t *) * new_size);
if (!new_dir) {
return 1;
}
for (i = 0; i < old_size; i++) {
new_dir[i] = hash->directory[i];
new_dir[i + old_size] = hash->directory[i];
}
kvs_free(hash->directory);
hash->directory = new_dir;
hash->dir_size = new_size;
hash->global_depth++;
return 0;
}
static int _split_bucket(kvs_hash_t *hash, uint32_t dir_idx) {
hashbucket_t *old_bucket;
hashbucket_t *new_bucket;
hashnode_t *node;
uint32_t split_bit;
uint32_t i;
if (!hash || !hash->directory || dir_idx >= hash->dir_size) {
return -1;
}
old_bucket = hash->directory[dir_idx];
if (!old_bucket) {
return -1;
}
if (old_bucket->local_depth >= KVS_HASH_MAX_GLOBAL_DEPTH) {
return 1;
}
if (old_bucket->local_depth == hash->global_depth) {
int rc = _double_directory(hash);
if (rc != 0) {
return rc;
}
}
new_bucket = _bucket_create(old_bucket->local_depth + 1);
if (!new_bucket) {
return -1;
}
_bucket_list_push(hash, new_bucket);
old_bucket->local_depth++;
split_bit = 1u << (old_bucket->local_depth - 1u);
for (i = 0; i < hash->dir_size; i++) {
if (hash->directory[i] == old_bucket && (i & split_bit)) {
hash->directory[i] = new_bucket;
}
}
node = old_bucket->head;
old_bucket->head = NULL;
old_bucket->item_count = 0;
while (node) {
hashnode_t *next = node->next;
uint32_t idx = _dir_index(_hash_u32(_get_node_key(node), node->key_len), hash->global_depth);
hashbucket_t *target = hash->directory[idx];
node->next = target->head;
target->head = node;
target->item_count++;
node = next;
}
return 0;
}
//
int kvs_hash_create(kvs_hash_t *hash) { int kvs_hash_create(kvs_hash_t *hash) {
if (!hash) return -1; uint32_t init_depth;
uint32_t i;
hashbucket_t *initial_bucket;
hash->nodes = (hashnode_t**)kvs_malloc(sizeof(hashnode_t*) * MAX_TABLE_SIZE); if (!hash) {
if (!hash->nodes) return -1; return -1;
}
memset(hash, 0, sizeof(*hash));
init_depth = KVS_HASH_INITIAL_GLOBAL_DEPTH;
if (init_depth > KVS_HASH_MAX_GLOBAL_DEPTH) {
init_depth = KVS_HASH_MAX_GLOBAL_DEPTH;
}
hash->global_depth = init_depth;
hash->dir_size = 1u << init_depth;
if (hash->dir_size == 0) {
return -1;
}
hash->directory = (hashbucket_t **)kvs_malloc(sizeof(hashbucket_t *) * hash->dir_size);
if (!hash->directory) {
return -1;
}
memset(hash->directory, 0, sizeof(hashbucket_t *) * hash->dir_size);
initial_bucket = _bucket_create(0);
if (!initial_bucket) {
kvs_free(hash->directory);
hash->directory = NULL;
hash->dir_size = 0;
return -1;
}
_bucket_list_push(hash, initial_bucket);
for (i = 0; i < hash->dir_size; i++) {
hash->directory[i] = initial_bucket;
}
memset(hash->nodes, 0, sizeof(hashnode_t*) * MAX_TABLE_SIZE);
hash->max_slots = MAX_TABLE_SIZE;
hash->count = 0; hash->count = 0;
return 0; return 0;
} }
//
void kvs_hash_destroy(kvs_hash_t *hash) { void kvs_hash_destroy(kvs_hash_t *hash) {
if (!hash || !hash->nodes) return; hashbucket_t *bucket;
for (int i = 0; i < hash->max_slots; i++) { if (!hash) {
hashnode_t *node = hash->nodes[i]; return;
while (node != NULL) { }
hashnode_t *tmp = node;
node = node->next;
if (tmp->key) kvs_free(tmp->key); bucket = hash->bucket_list;
if (tmp->value) kvs_free(tmp->value); while (bucket) {
kvs_free(tmp); hashbucket_t *next_bucket = bucket->next_all;
hashnode_t *node = bucket->head;
while (node) {
hashnode_t *next = node->next;
kvs_free(node);
node = next;
} }
hash->nodes[i] = NULL; kvs_free(bucket);
bucket = next_bucket;
} }
kvs_free(hash->nodes); kvs_free(hash->directory);
hash->nodes = NULL; memset(hash, 0, sizeof(*hash));
hash->max_slots = 0;
hash->count = 0;
} }
int kvs_hash_set_bin(kvs_hash_t *hash, const void *key, uint32_t key_len,
const void *value, uint32_t value_len) {
uint32_t hashv;
uint32_t idx;
hashbucket_t *bucket;
hashnode_t **indirect = NULL;
hashnode_t *found;
hashnode_t *new_node;
// 5 + 2 if (!hash || !hash->directory || !key || key_len == 0) {
return -1;
// mp
/*
* @return: <0 error; 0 success; 1 exist
*/
int kvs_hash_set_bin(kvs_hash_t *hash, const void *key, uint32_t key_len, const void *value, uint32_t value_len) {
if (!hash || !hash->nodes || !key || key_len == 0) return -1;
int idx = _hash(key, key_len, MAX_TABLE_SIZE);
if (idx < 0) return -1;
hashnode_t *node = hash->nodes[idx];
while (node != NULL) {
if (_key_equal(node, key, key_len)) { // exist
return 1;
}
node = node->next;
}
hashnode_t *new_node = _create_node(key, key_len, value, value_len);
if (!new_node) return -2;
new_node->next = hash->nodes[idx];
hash->nodes[idx] = new_node;
hash->count ++;
return 0;
}
/*
* @return: NULL notexist, NOTNULL exist。out_value_len 是长度。
*/
void *kvs_hash_get_bin(kvs_hash_t *hash, const void *key, uint32_t key_len, uint32_t *out_value_len) {
if (!hash || !hash->nodes || !key || key_len == 0 || !out_value_len) return NULL;
*out_value_len = 0;
int idx = _hash(key, key_len, MAX_TABLE_SIZE);
if (idx < 0) return NULL;
hashnode_t *node = hash->nodes[idx];
while (node != NULL) {
if (_key_equal(node, key, key_len)) {
*out_value_len = node->value_len;
return node->value;
}
node = node->next;
}
return NULL;
}
/*
* @return <0 error; =0 success; >0 no exist
*/
int kvs_hash_mod_bin(kvs_hash_t *hash, const void *key, uint32_t key_len, const void *value, uint32_t value_len) {
if (!hash || !hash->nodes || !key || key_len == 0 || !value) return -1;
int idx = _hash(key, key_len, MAX_TABLE_SIZE);
if (idx < 0) return -1;
hashnode_t *node = hash->nodes[idx];
while (node != NULL) {
if (_key_equal(node, key, key_len)) {
break;
}
node = node->next;
}
if (node == NULL) {
return 1;
}
// node -->
if (node->value) kvs_free(node->value);
node->value = NULL;
node->value_len = 0;
if (value_len > 0) {
uint8_t *vcopy = (uint8_t*)kvs_malloc(value_len);
if (!vcopy) return -2;
memcpy(vcopy, value, value_len);
node->value = vcopy;
node->value_len = value_len;
} }
return 0; if (value_len > 0 && !value) {
return -1;
}
hashv = _hash_u32(key, key_len);
idx = _dir_index(hashv, hash->global_depth);
bucket = hash->directory[idx];
if (!bucket) {
return -1;
}
found = _find_node(bucket, key, key_len, &indirect);
if (found) {
return (_update_node_value(indirect, value, value_len) == 0) ? 0 : -2;
}
while (bucket->item_count >= KVS_HASH_BUCKET_SPLIT_THRESHOLD) {
int split_rc = _split_bucket(hash, idx);
if (split_rc < 0) {
return -2;
}
if (split_rc > 0) {
break;
}
idx = _dir_index(hashv, hash->global_depth);
bucket = hash->directory[idx];
}
new_node = _create_node(key, key_len, value, value_len);
if (!new_node) {
return -2;
}
new_node->next = bucket->head;
bucket->head = new_node;
bucket->item_count++;
hash->count++;
return 0;
}
void *kvs_hash_get_bin(kvs_hash_t *hash, const void *key, uint32_t key_len,
uint32_t *out_value_len) {
uint32_t idx;
hashbucket_t *bucket;
hashnode_t *node;
if (!out_value_len) {
return NULL;
}
*out_value_len = 0;
if (!hash || !hash->directory || !key || key_len == 0) {
return NULL;
}
idx = _dir_index(_hash_u32(key, key_len), hash->global_depth);
bucket = hash->directory[idx];
if (!bucket) {
return NULL;
}
node = _find_node(bucket, key, key_len, NULL);
if (!node) {
return NULL;
}
*out_value_len = node->value_len;
return (node->value_len > 0) ? _get_node_value(node) : NULL;
}
int kvs_hash_get_copy_bin(kvs_hash_t *hash, const void *key, uint32_t key_len,
void **out_buf, uint32_t *out_len) {
uint32_t idx;
hashbucket_t *bucket;
hashnode_t *node;
void *copy;
if (!out_buf || !out_len) {
return -1;
}
*out_buf = NULL;
*out_len = 0;
if (!hash || !hash->directory || !key || key_len == 0) {
return -1;
}
idx = _dir_index(_hash_u32(key, key_len), hash->global_depth);
bucket = hash->directory[idx];
if (!bucket) {
return -1;
}
node = _find_node(bucket, key, key_len, NULL);
if (!node) {
return 1;
}
*out_len = node->value_len;
if (node->value_len == 0) {
return 0;
}
copy = kvs_malloc(node->value_len);
if (!copy) {
*out_len = 0;
return -2;
}
memcpy(copy, _get_node_value(node), node->value_len);
*out_buf = copy;
return 0;
}
int kvs_hash_mod_bin(kvs_hash_t *hash, const void *key, uint32_t key_len,
const void *value, uint32_t value_len) {
uint32_t idx;
hashbucket_t *bucket;
hashnode_t **indirect = NULL;
hashnode_t *node;
if (!hash || !hash->directory || !key || key_len == 0) {
return -1;
}
if (value_len > 0 && !value) {
return -1;
}
idx = _dir_index(_hash_u32(key, key_len), hash->global_depth);
bucket = hash->directory[idx];
if (!bucket) {
return -1;
}
node = _find_node(bucket, key, key_len, &indirect);
if (!node) {
return 1;
}
return (_update_node_value(indirect, value, value_len) == 0) ? 0 : -2;
}
int kvs_hash_del_bin(kvs_hash_t *hash, const void *key, uint32_t key_len) {
uint32_t idx;
hashbucket_t *bucket;
hashnode_t **indirect = NULL;
hashnode_t *node;
if (!hash || !hash->directory || !key || key_len == 0) {
return -1;
}
idx = _dir_index(_hash_u32(key, key_len), hash->global_depth);
bucket = hash->directory[idx];
if (!bucket) {
return -1;
}
node = _find_node(bucket, key, key_len, &indirect);
if (!node) {
return 1;
}
*indirect = node->next;
kvs_free(node);
bucket->item_count--;
hash->count--;
return 0;
}
int kvs_hash_exist_bin(kvs_hash_t *hash, const void *key, uint32_t key_len) {
uint32_t idx;
hashbucket_t *bucket;
if (!hash || !hash->directory || !key || key_len == 0) {
return -1;
}
idx = _dir_index(_hash_u32(key, key_len), hash->global_depth);
bucket = hash->directory[idx];
if (!bucket) {
return 1;
}
return _find_node(bucket, key, key_len, NULL) ? 0 : 1;
} }
int kvs_hash_count(kvs_hash_t *hash) { int kvs_hash_count(kvs_hash_t *hash) {
return hash->count; return hash ? hash->count : 0;
} }
/* int kvs_hash_save(iouring_ctx_t *uring, kvs_hash_t *inst, const char *filename) {
* @return <0 error; =0 success; >0 no exist int fd;
*/
int kvs_hash_del_bin(kvs_hash_t *hash, const void *key, uint32_t key_len) {
if (!hash || !key || key_len == 0) return -1;
int idx = _hash(key, key_len, MAX_TABLE_SIZE);
if (idx < 0) return -1;
hashnode_t *head = hash->nodes[idx];
if (head == NULL) return 1; // noexist
// head node
if (_key_equal(head, key, key_len)) {
hashnode_t *tmp = head->next;
hash->nodes[idx] = tmp;
if (head->key) kvs_free(head->key);
if (head->value) kvs_free(head->value);
kvs_free(head);
hash->count --;
return 0;
}
hashnode_t *cur = head;
while (cur->next != NULL) {
if (_key_equal(cur->next, key, key_len)) break; // search node
cur = cur->next;
}
if (cur->next == NULL) {
return 1;
}
hashnode_t *tmp = cur->next;
cur->next = tmp->next;
if (tmp->key) kvs_free(tmp->key);
if (tmp->value) kvs_free(tmp->value);
kvs_free(tmp);
hash->count --;
return 0;
}
/*
* @return =0 exist, =1 no exist
*/
int kvs_hash_exist_bin(kvs_hash_t *hash, const void *key, uint32_t key_len) {
uint32_t vlen = 0;
void *value = kvs_hash_get_bin(hash, key, key_len, &vlen);
return value ? 0 : 1;
}
// 0 suc, <0 error
int kvs_hash_save(iouring_ctx_t *uring, kvs_hash_t *inst, const char* filename){
if(!inst || !filename) return -1;
int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
if(fd < 0) return -2;
off_t current_off = 0; off_t current_off = 0;
int rc = 0;
hashbucket_t *bucket;
for(int i = 0;i < inst->max_slots; ++ i){ if (!uring || !inst || !filename) {
for (hashnode_t *n = inst->nodes[i]; n != NULL; n = n->next) { return -1;
if (!n->key || n->key_len == 0) continue; }
if (n->value_len > 0 && !n->value) {
goto clean;
}
fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (fd < 0) {
return -2;
}
uint32_t klen = htonl((uint32_t)n->key_len); for (bucket = inst->bucket_list; bucket != NULL; bucket = bucket->next_all) {
uint32_t vlen = htonl((uint32_t)n->value_len); hashnode_t *n = bucket->head;
while (n != NULL) {
uint32_t klen = htonl(n->key_len);
uint32_t vlen = htonl(n->value_len);
uint8_t *key_ptr = _get_node_key(n);
uint8_t *val_ptr = (n->value_len > 0) ? _get_node_value(n) : NULL;
void *bufs[4];
size_t lens[4];
int count = 0;
size_t total = 0;
task_t *t;
void *bufs[4]; bufs[count] = &klen;
size_t lens[4]; lens[count++] = sizeof(klen);
int count = 0; bufs[count] = &vlen;
lens[count++] = sizeof(vlen);
bufs[count] = &klen; if (n->key_len > 0) {
lens[count] = sizeof(klen); bufs[count] = key_ptr;
count++; lens[count++] = n->key_len;
}
if (n->value_len > 0) {
bufs[count] = val_ptr;
lens[count++] = n->value_len;
}
bufs[count] = &vlen; for (int j = 0; j < count; j++) {
lens[count] = sizeof(vlen); total += lens[j];
count++; }
if (n->key_len > 0){ t = submit_write(uring, fd, bufs, lens, count, current_off);
bufs[count] = n->key; if (!t) {
lens[count] = n->key_len; rc = -3;
count++; goto done;
} }
cleanup_finished_iouring_tasks(uring);
current_off += (off_t)total;
n = n->next;
}
}
if (n->value_len > 0) { done:
bufs[count] = n->value; while (!uring_task_complete(uring)) {
lens[count] = n->value_len; usleep(1000);
count++; cleanup_finished_iouring_tasks(uring);
} }
size_t total = 0; close(fd);
for (int i = 0; i < count; i++) total += lens[i]; return rc;
task_t *t = submit_write(uring, fd, bufs, lens, count, current_off);
if(!t) {
perror("task init failed");
goto clean;
}
cleanup_finished_iouring_tasks();
current_off += (off_t) total;
}
}
clean:
while (!uring_task_complete(uring)) {
usleep(1000);
cleanup_finished_iouring_tasks();
}
close(fd);
return 0;
} }
int kvs_hash_load(kvs_hash_t *inst, const char* filename){ int kvs_hash_load(kvs_hash_t *inst, const char *filename) {
if (!inst || !filename) return -1; int fd;
if (!inst->nodes || inst->max_slots <= 0) return -1; int rc = 0;
FILE *fp = fopen(filename, "rb"); if (!inst || !filename) {
if (!fp) return -2; return -1;
}
if (!inst->directory || inst->dir_size == 0) {
return -1;
}
while(1){ fd = open(filename, O_RDONLY);
uint32_t klen_n = 0, vlen_n = 0; if (fd < 0) {
return -2;
}
if (kvs_read_file(fp, &klen_n, 4) < 0) { fclose(fp); return -3; } while (1) {
if (kvs_read_file(fp, &vlen_n, 4) < 0) { fclose(fp); return -3; } uint32_t klen_n = 0;
uint32_t vlen_n = 0;
uint32_t klen;
uint32_t vlen;
uint8_t *keybuf = NULL;
uint8_t *valbuf = NULL;
int rr;
uint32_t klen = ntohl(klen_n); rr = read_full(fd, &klen_n, sizeof(klen_n));
uint32_t vlen = ntohl(vlen_n); if (rr == 0) {
rc = 0;
if (klen == 0) { fclose(fp); return -3; } break;
uint8_t *keybuf = (uint8_t*)kvs_malloc((size_t)klen);
if (!keybuf) { fclose(fp); return -4; }
if (kvs_read_file(fp, keybuf, (size_t)klen) < 0) {
kvs_free(keybuf);
fclose(fp);
return -3;
} }
uint8_t *valbuf = NULL; if (rr < 0) {
rc = -3;
break;
}
rr = read_full(fd, &vlen_n, sizeof(vlen_n));
if (rr <= 0) {
rc = -3;
break;
}
klen = ntohl(klen_n);
vlen = ntohl(vlen_n);
if (klen == 0) {
rc = -3;
break;
}
keybuf = (uint8_t *)kvs_malloc((size_t)klen);
if (!keybuf) {
rc = -4;
break;
}
rr = read_full(fd, keybuf, (size_t)klen);
if (rr <= 0) {
kvs_free(keybuf);
rc = -3;
break;
}
if (vlen > 0) { if (vlen > 0) {
valbuf = (uint8_t*)kvs_malloc((size_t)vlen); valbuf = (uint8_t *)kvs_malloc((size_t)vlen);
if (!valbuf) { if (!valbuf) {
kvs_free(keybuf); kvs_free(keybuf);
fclose(fp); rc = -4;
return -4; break;
} }
if (kvs_read_file(fp, valbuf, (size_t)vlen) < 0) { rr = read_full(fd, valbuf, (size_t)vlen);
if (rr <= 0) {
kvs_free(valbuf); kvs_free(valbuf);
kvs_free(keybuf); kvs_free(keybuf);
fclose(fp); rc = -3;
return -3; break;
} }
} }
int rc = kvs_hash_set_bin(inst, keybuf, klen, valbuf, vlen); rr = kvs_hash_set_bin(inst, keybuf, klen, valbuf, vlen);
kvs_free(keybuf); kvs_free(keybuf);
if (vlen > 0) kvs_free(valbuf); if (valbuf) {
kvs_free(valbuf);
if (rc < 0) { // error
fclose(fp);
return -5;
} }
} if (rr < 0) {
fclose(fp); rc = -5;
return 0; break;
}
}
close(fd);
return rc;
} }

View File

@@ -539,7 +539,7 @@ int resp_dispatch(const resp_cmd_t *cmd, resp_value_t *out_value) {
/* ---------------- misc ---------------- */ /* ---------------- misc ---------------- */
case KVS_CMD_SAVE: { case KVS_CMD_SAVE: {
if (cmd->argc != 1) { *out_value = resp_error("ERR wrong number of arguments for 'save'"); return 0; } if (cmd->argc != 1) { *out_value = resp_error("ERR wrong number of arguments for 'save'"); return 0; }
int r = kvs_create_snapshot(&global_uring_ctx, global_array_file, global_rbtree_file, global_hash_file); int r = kvs_create_snapshot_async_1(&global_uring_ctx, global_array_file, global_rbtree_file, global_hash_file);
if(r == 0) ksv_clear_log(global_oplog_fd); if(r == 0) ksv_clear_log(global_oplog_fd);
if (r < 0) { *out_value = resp_error("ERR save failed"); return 0; } if (r < 0) { *out_value = resp_error("ERR save failed"); return 0; }
*out_value = resp_simple("OK"); *out_value = resp_simple("OK");
@@ -547,7 +547,7 @@ int resp_dispatch(const resp_cmd_t *cmd, resp_value_t *out_value) {
} }
case KVS_CMD_SSYNC: case KVS_CMD_SSYNC:
__ssync(cmd->argv[1].ptr, cmd->argv[1].len, atoi(cmd->argv[2].ptr), global_seq); __ssync(cmd->argv[1].ptr, cmd->argv[1].len, atoi(cmd->argv[2].ptr), global_seq);
kvs_create_snapshot_async(cmd->argv[1].ptr, atoi(cmd->argv[2].ptr)); kvs_create_snapshot_async_2(cmd->argv[1].ptr, atoi(cmd->argv[2].ptr));
*out_value = resp_simple("OK"); *out_value = resp_simple("OK");
return 0; return 0;
case KVS_CMD_SREADY: case KVS_CMD_SREADY:
@@ -555,8 +555,8 @@ int resp_dispatch(const resp_cmd_t *cmd, resp_value_t *out_value) {
*out_value = resp_simple("OK"); *out_value = resp_simple("OK");
return 0; return 0;
case KVS_CMD_MEM_PRINT:{ case KVS_CMD_MEM_PRINT:{
iouring_profile_dump(&global_uring_ctx);
int ret = kvs_mem_printf(); int ret = kvs_mem_printf();
printf("ret %d\n", ret);
*out_value = resp_int(ret); *out_value = resp_int(ret);
return 0; return 0;
} }

View File

@@ -1,553 +0,0 @@
// #include <stdio.h>
// #include <stdlib.h>
// #include <string.h>
// #include "kvstore.h"
// rbtree_node *rbtree_mini(rbtree *T, rbtree_node *x) {
// while (x->left != T->nil) {
// x = x->left;
// }
// return x;
// }
// rbtree_node *rbtree_maxi(rbtree *T, rbtree_node *x) {
// while (x->right != T->nil) {
// x = x->right;
// }
// return x;
// }
// rbtree_node *rbtree_successor(rbtree *T, rbtree_node *x) {
// rbtree_node *y = x->parent;
// if (x->right != T->nil) {
// return rbtree_mini(T, x->right);
// }
// while ((y != T->nil) && (x == y->right)) {
// x = y;
// y = y->parent;
// }
// return y;
// }
// void rbtree_left_rotate(rbtree *T, rbtree_node *x) {
// rbtree_node *y = x->right; // x --> y , y --> x, right --> left, left --> right
// x->right = y->left; //1 1
// if (y->left != T->nil) { //1 2
// y->left->parent = x;
// }
// y->parent = x->parent; //1 3
// if (x->parent == T->nil) { //1 4
// T->root = y;
// } else if (x == x->parent->left) {
// x->parent->left = y;
// } else {
// x->parent->right = y;
// }
// y->left = x; //1 5
// x->parent = y; //1 6
// }
// void rbtree_right_rotate(rbtree *T, rbtree_node *y) {
// rbtree_node *x = y->left;
// y->left = x->right;
// if (x->right != T->nil) {
// x->right->parent = y;
// }
// x->parent = y->parent;
// if (y->parent == T->nil) {
// T->root = x;
// } else if (y == y->parent->right) {
// y->parent->right = x;
// } else {
// y->parent->left = x;
// }
// x->right = y;
// y->parent = x;
// }
// void rbtree_insert_fixup(rbtree *T, rbtree_node *z) {
// while (z->parent->color == RED) { //z ---> RED
// if (z->parent == z->parent->parent->left) {
// rbtree_node *y = z->parent->parent->right;
// if (y->color == RED) {
// z->parent->color = BLACK;
// y->color = BLACK;
// z->parent->parent->color = RED;
// z = z->parent->parent; //z --> RED
// } else {
// if (z == z->parent->right) {
// z = z->parent;
// rbtree_left_rotate(T, z);
// }
// z->parent->color = BLACK;
// z->parent->parent->color = RED;
// rbtree_right_rotate(T, z->parent->parent);
// }
// }else {
// rbtree_node *y = z->parent->parent->left;
// if (y->color == RED) {
// z->parent->color = BLACK;
// y->color = BLACK;
// z->parent->parent->color = RED;
// z = z->parent->parent; //z --> RED
// } else {
// if (z == z->parent->left) {
// z = z->parent;
// rbtree_right_rotate(T, z);
// }
// z->parent->color = BLACK;
// z->parent->parent->color = RED;
// rbtree_left_rotate(T, z->parent->parent);
// }
// }
// }
// T->root->color = BLACK;
// }
// void rbtree_insert(rbtree *T, rbtree_node *z) {
// rbtree_node *y = T->nil;
// rbtree_node *x = T->root;
// while (x != T->nil) {
// y = x;
// #if ENABLE_KEY_CHAR
// if (strcmp(z->key, x->key) < 0) {
// x = x->left;
// } else if (strcmp(z->key, x->key) > 0) {
// x = x->right;
// } else {
// return ;
// }
// #else
// if (z->key < x->key) {
// x = x->left;
// } else if (z->key > x->key) {
// x = x->right;
// } else { //Exist
// return ;
// }
// #endif
// }
// z->parent = y;
// if (y == T->nil) {
// T->root = z;
// #if ENABLE_KEY_CHAR
// } else if (strcmp(z->key, y->key) < 0) {
// #else
// } else if (z->key < y->key) {
// #endif
// y->left = z;
// } else {
// y->right = z;
// }
// z->left = T->nil;
// z->right = T->nil;
// z->color = RED;
// rbtree_insert_fixup(T, z);
// }
// void rbtree_delete_fixup(rbtree *T, rbtree_node *x) {
// while ((x != T->root) && (x->color == BLACK)) {
// if (x == x->parent->left) {
// rbtree_node *w= x->parent->right;
// if (w->color == RED) {
// w->color = BLACK;
// x->parent->color = RED;
// rbtree_left_rotate(T, x->parent);
// w = x->parent->right;
// }
// if ((w->left->color == BLACK) && (w->right->color == BLACK)) {
// w->color = RED;
// x = x->parent;
// } else {
// if (w->right->color == BLACK) {
// w->left->color = BLACK;
// w->color = RED;
// rbtree_right_rotate(T, w);
// w = x->parent->right;
// }
// w->color = x->parent->color;
// x->parent->color = BLACK;
// w->right->color = BLACK;
// rbtree_left_rotate(T, x->parent);
// x = T->root;
// }
// } else {
// rbtree_node *w = x->parent->left;
// if (w->color == RED) {
// w->color = BLACK;
// x->parent->color = RED;
// rbtree_right_rotate(T, x->parent);
// w = x->parent->left;
// }
// if ((w->left->color == BLACK) && (w->right->color == BLACK)) {
// w->color = RED;
// x = x->parent;
// } else {
// if (w->left->color == BLACK) {
// w->right->color = BLACK;
// w->color = RED;
// rbtree_left_rotate(T, w);
// w = x->parent->left;
// }
// w->color = x->parent->color;
// x->parent->color = BLACK;
// w->left->color = BLACK;
// rbtree_right_rotate(T, x->parent);
// x = T->root;
// }
// }
// }
// x->color = BLACK;
// }
// rbtree_node *rbtree_delete(rbtree *T, rbtree_node *z) {
// rbtree_node *y = T->nil;
// rbtree_node *x = T->nil;
// if ((z->left == T->nil) || (z->right == T->nil)) {
// y = z;
// } else {
// y = rbtree_successor(T, z);
// }
// if (y->left != T->nil) {
// x = y->left;
// } else if (y->right != T->nil) {
// x = y->right;
// }
// x->parent = y->parent;
// if (y->parent == T->nil) {
// T->root = x;
// } else if (y == y->parent->left) {
// y->parent->left = x;
// } else {
// y->parent->right = x;
// }
// if (y != z) {
// #if ENABLE_KEY_CHAR
// void *tmp = z->key;
// z->key = y->key;
// y->key = tmp;
// tmp = z->value;
// z->value= y->value;
// y->value = tmp;
// #else
// z->key = y->key;
// z->value = y->value;
// #endif
// }
// if (y->color == BLACK) {
// rbtree_delete_fixup(T, x);
// }
// return y;
// }
// rbtree_node *rbtree_search(rbtree *T, KEY_TYPE key) {
// rbtree_node *node = T->root;
// while (node != T->nil) {
// #if ENABLE_KEY_CHAR
// if (strcmp(key, node->key) < 0) {
// node = node->left;
// } else if (strcmp(key, node->key) > 0) {
// node = node->right;
// } else {
// return node;
// }
// #else
// if (key < node->key) {
// node = node->left;
// } else if (key > node->key) {
// node = node->right;
// } else {
// return node;
// }
// #endif
// }
// return T->nil;
// }
// void rbtree_traversal(rbtree *T, rbtree_node *node) {
// if (node != T->nil) {
// rbtree_traversal(T, node->left);
// #if ENABLE_KEY_CHAR
// printf("key:%s, value:%s\n", node->key, (char *)node->value);
// #else
// printf("key:%d, color:%d\n", node->key, node->color);
// #endif
// rbtree_traversal(T, node->right);
// }
// }
// #if 0
// int main() {
// #if ENABLE_KEY_CHAR
// char* keyArray[10] = {"King", "Darren", "Mark", "Vico", "Nick", "qiuxiang", "youzi", "taozi", "123", "234"};
// char* valueArray[10] = {"1King", "2Darren", "3Mark", "4Vico", "5Nick", "6qiuxiang", "7youzi", "8taozi", "9123", "10234"};
// rbtree *T = (rbtree *)malloc(sizeof(rbtree));
// if (T == NULL) {
// printf("malloc failed\n");
// return -1;
// }
// T->nil = (rbtree_node*)malloc(sizeof(rbtree_node));
// T->nil->color = BLACK;
// T->root = T->nil;
// rbtree_node *node = T->nil;
// int i = 0;
// for (i = 0;i < 10;i ++) {
// node = (rbtree_node*)malloc(sizeof(rbtree_node));
// node->key = malloc(strlen(keyArray[i]) + 1);
// memset(node->key, 0, strlen(keyArray[i]) + 1);
// strcpy(node->key, keyArray[i]);
// node->value = malloc(strlen(valueArray[i]) + 1);
// memset(node->value, 0, strlen(valueArray[i]) + 1);
// strcpy(node->value, valueArray[i]);
// rbtree_insert(T, node);
// }
// rbtree_traversal(T, T->root);
// printf("----------------------------------------\n");
// for (i = 0;i < 10;i ++) {
// rbtree_node *node = rbtree_search(T, keyArray[i]);
// rbtree_node *cur = rbtree_delete(T, node);
// free(cur);
// rbtree_traversal(T, T->root);
// printf("----------------------------------------\n");
// }
// #else
// int keyArray[20] = {24,25,13,35,23, 26,67,47,38,98, 20,19,17,49,12, 21,9,18,14,15};
// rbtree *T = (rbtree *)malloc(sizeof(rbtree));
// if (T == NULL) {
// printf("malloc failed\n");
// return -1;
// }
// T->nil = (rbtree_node*)malloc(sizeof(rbtree_node));
// T->nil->color = BLACK;
// T->root = T->nil;
// rbtree_node *node = T->nil;
// int i = 0;
// for (i = 0;i < 20;i ++) {
// node = (rbtree_node*)malloc(sizeof(rbtree_node));
// node->key = keyArray[i];
// node->value = NULL;
// rbtree_insert(T, node);
// }
// rbtree_traversal(T, T->root);
// printf("----------------------------------------\n");
// for (i = 0;i < 20;i ++) {
// rbtree_node *node = rbtree_search(T, keyArray[i]);
// rbtree_node *cur = rbtree_delete(T, node);
// free(cur);
// rbtree_traversal(T, T->root);
// printf("----------------------------------------\n");
// }
// #endif
// }
// #endif
// typedef struct _rbtree kvs_rbtree_t;
// kvs_rbtree_t global_rbtree;
// // 5 + 2
// int kvs_rbtree_create(kvs_rbtree_t *inst) {
// if (inst == NULL) return 1;
// inst->nil = (rbtree_node*)kvs_malloc(sizeof(rbtree_node));
// inst->nil->color = BLACK;
// inst->root = inst->nil;
// return 0;
// }
// void kvs_rbtree_destroy(kvs_rbtree_t *inst) {
// if (inst == NULL) return ;
// rbtree_node *node = NULL;
// while (!(node = inst->root)) {
// rbtree_node *mini = rbtree_mini(inst, node);
// rbtree_node *cur = rbtree_delete(inst, mini);
// kvs_free(cur);
// }
// kvs_free(inst->nil);
// return ;
// }
// int kvs_rbtree_set(kvs_rbtree_t *inst, char *key, char *value) {
// if (!inst || !key || !value) return -1;
// rbtree_node *node = (rbtree_node*)kvs_malloc(sizeof(rbtree_node));
// node->key = kvs_malloc(strlen(key) + 1);
// if (!node->key) return -2;
// memset(node->key, 0, strlen(key) + 1);
// strcpy(node->key, key);
// node->value = kvs_malloc(strlen(value) + 1);
// if (!node->value) return -2;
// memset(node->value, 0, strlen(value) + 1);
// strcpy(node->value, value);
// rbtree_insert(inst, node);
// return 0;
// }
// char* kvs_rbtree_get(kvs_rbtree_t *inst, char *key) {
// if (!inst || !key) return NULL;
// rbtree_node *node = rbtree_search(inst, key);
// if (!node) return NULL; // no exist
// if (node == inst->nil) return NULL;
// return node->value;
// }
// int kvs_rbtree_del(kvs_rbtree_t *inst, char *key) {
// if (!inst || !key) return -1;
// rbtree_node *node = rbtree_search(inst, key);
// if (!node) return 1; // no exist
// rbtree_node *cur = rbtree_delete(inst, node);
// free(cur);
// return 0;
// }
// int kvs_rbtree_mod(kvs_rbtree_t *inst, char *key, char *value) {
// if (!inst || !key || !value) return -1;
// rbtree_node *node = rbtree_search(inst, key);
// if (!node) return 1; // no exist
// if (node == inst->nil) return 1;
// kvs_free(node->value);
// node->value = kvs_malloc(strlen(value) + 1);
// if (!node->value) return -2;
// memset(node->value, 0, strlen(value) + 1);
// strcpy(node->value, value);
// return 0;
// }
// int kvs_rbtree_exist(kvs_rbtree_t *inst, char *key) {
// if (!inst || !key) return -1;
// rbtree_node *node = rbtree_search(inst, key);
// if (!node) return 1; // no exist
// if (node == inst->nil) return 1;
// return 0;
// }

View File

@@ -1,11 +1,47 @@
#include "kvstore.h" #include "kvstore.h"
#include "kvs_rw_tools.h" #include "kvs_rw_tools.h"
#include "memory/alloc_dispatch.h" #include "memory/alloc_dispatch.h"
#include "diskuring/diskuring.h" #include "diskuring/diskuring.h"
/* ============================================================================
* 内存布局说明:
* ============================================================================
* 每个节点的内存结构(单一连续块):
*
* +------ 固定头部 (24字节) -------+------ 动态数据 -------+
* | color | right | left | parent | key_len | value_len | key | value |
* | 1字节 |8字节 |8字节 |8字节 | 4字节 | 4字节 | k字节| v字节 |
* +---------- 共32字节 ------+--- key_len + value_len 字节 ---+
*
* 总大小 = sizeof(rbtree_node_fixed) + key_len + value_len
* ============================================================================ */
// ============================================================================
// 辅助函数:计算节点所需的总大小
// ============================================================================
static inline size_t rbtree_node_size(uint32_t key_len, uint32_t value_len) {
return sizeof(rbtree_node_fixed) + key_len + value_len;
}
// ============================================================================
// 辅助函数获取节点内的key指针
// ============================================================================
static inline uint8_t* rbtree_node_get_key(rbtree_node *node) {
if (!node || node->key_len == 0) return NULL;
return (uint8_t *)node + sizeof(rbtree_node_fixed);
}
// ============================================================================
// 辅助函数获取节点内的value指针
// ============================================================================
static inline uint8_t* rbtree_node_get_value(rbtree_node *node) {
if (!node || node->value_len == 0) return NULL;
return (uint8_t *)node + sizeof(rbtree_node_fixed) + node->key_len;
}
// ============================================================================
// 原始比较函数(保持不变)
// ============================================================================
int kvs_keycmp(const uint8_t *a, uint32_t alen, int kvs_keycmp(const uint8_t *a, uint32_t alen,
const uint8_t *b, uint32_t blen) { const uint8_t *b, uint32_t blen) {
uint32_t min = (alen < blen) ? alen : blen; uint32_t min = (alen < blen) ? alen : blen;
@@ -150,7 +186,9 @@ int rbtree_insert(rbtree *T, rbtree_node *z) {
while (x != T->nil) { while (x != T->nil) {
y = x; y = x;
int c = kvs_keycmp(z->key, z->key_len, x->key, x->key_len); uint8_t *xkey = rbtree_node_get_key(x);
uint8_t *zkey = rbtree_node_get_key(z);
int c = kvs_keycmp(zkey, z->key_len, xkey, x->key_len);
if (c < 0) { if (c < 0) {
x = x->left; x = x->left;
} else if (c > 0) { } else if (c > 0) {
@@ -166,7 +204,9 @@ int rbtree_insert(rbtree *T, rbtree_node *z) {
T->root = z; T->root = z;
}else{ }else{
int c = kvs_keycmp(z->key, z->key_len, y->key, y->key_len); uint8_t *ykey = rbtree_node_get_key(y);
uint8_t *zkey = rbtree_node_get_key(z);
int c = kvs_keycmp(zkey, z->key_len, ykey, y->key_len);
if (c < 0) y->left = z; if (c < 0) y->left = z;
else y->right = z; else y->right = z;
} }
@@ -275,11 +315,52 @@ rbtree_node *rbtree_delete(rbtree *T, rbtree_node *z) {
} }
if (y != z) { if (y != z) {
uint8_t *ktmp = z->key; z->key = y->key; y->key = ktmp; // 交换键值:由于键和值内嵌在节点内存中,需要交换内存内容
uint32_t ltmp = z->key_len; z->key_len = y->key_len; y->key_len = ltmp; // 注意:这里假设 z 的内存大小足够容纳 y 的数据
// 更安全的做法是只交换指针或重新分配
uint8_t *vtmp = z->value; z->value = y->value; y->value = vtmp; // 保存原始长度
uint32_t tlen = z->value_len; z->value_len = y->value_len; y->value_len = tlen; uint32_t z_klen = z->key_len;
uint32_t z_vlen = z->value_len;
uint32_t y_klen = y->key_len;
uint32_t y_vlen = y->value_len;
uint8_t *z_key = rbtree_node_get_key(z);
uint8_t *z_val = rbtree_node_get_value(z);
uint8_t *y_key = rbtree_node_get_key(y);
uint8_t *y_val = rbtree_node_get_value(y);
// 如果长度相同,直接交换内存
if (z_klen == y_klen && z_vlen == y_vlen) {
if (z_klen > 0) memcpy(z_key, y_key, z_klen);
if (z_vlen > 0) {
uint8_t tmp[z_vlen];
memcpy(tmp, z_val, z_vlen);
memcpy(z_val, y_val, z_vlen);
memcpy(y_val, tmp, z_vlen);
}
if (z_klen > 0) {
uint8_t tmp[z_klen];
memcpy(tmp, z_key, z_klen);
memcpy(z_key, y_key, z_klen);
memcpy(y_key, tmp, z_klen);
}
} else {
// 长度不同时,只能交换值的拷贝(保留长度不变)
// 这是一个限制,实际应用中需要重新分配更大的节点
if (z_klen == y_klen && z_klen > 0) {
uint8_t tmp[z_klen];
memcpy(tmp, z_key, z_klen);
memcpy(z_key, y_key, z_klen);
memcpy(y_key, tmp, z_klen);
}
if (z_vlen == y_vlen && z_vlen > 0) {
uint8_t tmp[z_vlen];
memcpy(tmp, z_val, z_vlen);
memcpy(z_val, y_val, z_vlen);
memcpy(y_val, tmp, z_vlen);
}
}
} }
if (y->color == BLACK) { if (y->color == BLACK) {
@@ -289,11 +370,12 @@ rbtree_node *rbtree_delete(rbtree *T, rbtree_node *z) {
return y; return y;
} }
rbtree_node *rbtree_search(rbtree *T, KEY_TYPE* key, uint32_t keylen) { rbtree_node *rbtree_search(rbtree *T, const uint8_t *key, uint32_t keylen) {
rbtree_node *node = T->root; rbtree_node *node = T->root;
while (node != T->nil) { while (node != T->nil) {
int c = kvs_keycmp(key, keylen, node->key, node->key_len); uint8_t *node_key = rbtree_node_get_key(node);
int c = kvs_keycmp(key, keylen, node_key, node->key_len);
if (c < 0) node = node->left; if (c < 0) node = node->left;
else if (c > 0) node = node->right; else if (c > 0) node = node->right;
else return node; else return node;
@@ -306,7 +388,8 @@ void rbtree_traversal(rbtree *T, rbtree_node *node) {
if (node != T->nil) { if (node != T->nil) {
rbtree_traversal(T, node->left); rbtree_traversal(T, node->left);
printf("key:%s, color:%d\n", (char*)node->key, node->color); uint8_t *key = rbtree_node_get_key(node);
printf("key:%s, color:%d\n", (char*)key, node->color);
rbtree_traversal(T, node->right); rbtree_traversal(T, node->right);
} }
} }
@@ -316,16 +399,21 @@ typedef struct _rbtree kvs_rbtree_t;
kvs_rbtree_t global_rbtree; kvs_rbtree_t global_rbtree;
// 5 + 2 // ============================================================================
// 创建红黑树
// ============================================================================
int kvs_rbtree_create(kvs_rbtree_t *inst) { int kvs_rbtree_create(kvs_rbtree_t *inst) {
if (inst == NULL) return 1; if (inst == NULL) return 1;
inst->nil = (rbtree_node*)kvs_malloc(sizeof(rbtree_node)); // nil 节点:特殊的哨兵节点,也使用优化的分配
inst->nil = (rbtree_node*)kvs_malloc(sizeof(rbtree_node_fixed));
if (!inst->nil) return 2; if (!inst->nil) return 2;
inst->nil->color = BLACK; inst->nil->color = BLACK;
inst->nil->left = inst->nil->right = inst->nil->parent = inst->nil; inst->nil->left = inst->nil->right = inst->nil->parent = inst->nil;
inst->nil->key_len = 0;
inst->nil->value_len = 0;
inst->root = inst->nil; inst->root = inst->nil;
return 0; return 0;
@@ -340,13 +428,11 @@ void kvs_rbtree_destroy(kvs_rbtree_t *inst) {
while (inst->root != inst->nil) { while (inst->root != inst->nil) {
rbtree_node *mini = rbtree_mini(inst, node); rbtree_node *mini = rbtree_mini(inst, inst->root);
rbtree_node *cur = rbtree_delete(inst, mini); rbtree_node *cur = rbtree_delete(inst, mini);
if (cur != inst->nil) { if (cur != inst->nil) {
if (cur->key) kvs_free(cur->key); kvs_free(cur); // 只需释放节点本身key和value已内嵌
if (cur->value) kvs_free(cur->value);
kvs_free(cur);
} }
} }
@@ -362,48 +448,99 @@ void kvs_rbtree_destroy(kvs_rbtree_t *inst) {
* @return: <0 error; 0 success; 1 exist * @return: <0 error; 0 success; 1 exist
*/ */
int kvs_rbtree_set(kvs_rbtree_t *inst, const void *key, uint32_t key_len, const void *value, uint32_t value_len) { int kvs_rbtree_set(kvs_rbtree_t *inst, const void *key, uint32_t key_len, const void *value, uint32_t value_len) {
if (!inst || !key || !value) return -1;
if (!inst || !key || !value) return -1; // 1. 查找键是否已存在
rbtree_node *existing = rbtree_search(inst, (const uint8_t*)key, key_len);
if (existing != inst->nil) {
// 键已存在:需要重新分配节点(如果大小改变)
uint32_t old_size = rbtree_node_size(existing->key_len, existing->value_len);
uint32_t new_size = rbtree_node_size(key_len, value_len);
rbtree_node *node = (rbtree_node*)kvs_malloc(sizeof(rbtree_node)); if (new_size != old_size) {
if (!node) return -2; // 大小改变,需要重新分配并更新树结构
memset(node, 0, sizeof(*node)); rbtree_node *new_node = (rbtree_node*)kvs_malloc(new_size);
if (!new_node) return -2;
node->key = (uint8_t*)kvs_malloc(key_len); // 复制固定部分(除了 key_len 和 value_len
if (!node->key) { new_node->color = existing->color;
kvs_free(node);return -2; new_node->right = existing->right;
} new_node->left = existing->left;
memcpy(node->key, key, key_len); new_node->parent = existing->parent;
node->key_len = key_len; new_node->key_len = key_len;
new_node->value_len = value_len;
node->value = (uint8_t*)kvs_malloc(value_len); // 复制 key 和 value
if (!node->value) { kvs_free(node->key); kvs_free(node); return -2; } uint8_t *new_key = rbtree_node_get_key(new_node);
if (value_len) memcpy(node->value, value, value_len); uint8_t *new_val = rbtree_node_get_value(new_node);
node->value_len = value_len; if (key_len > 0) memcpy(new_key, key, key_len);
if (value_len > 0) memcpy(new_val, value, value_len);
if(rbtree_insert(inst, node) < 0){ // 更新父节点的指针
kvs_free(node->value); if (existing->parent != inst->nil) {
kvs_free(node->key); if (existing->parent->left == existing) {
existing->parent->left = new_node;
} else {
existing->parent->right = new_node;
}
} else {
inst->root = new_node;
}
// 更新子节点的父指针
if (new_node->left != inst->nil) {
new_node->left->parent = new_node;
}
if (new_node->right != inst->nil) {
new_node->right->parent = new_node;
}
kvs_free(existing);
} else {
// 大小相同,直接更新值
uint8_t *val = rbtree_node_get_value(existing);
if (value_len > 0) memcpy(val, value, value_len);
}
return 0;
}
// 2. 键不存在:创建新节点并插入
size_t node_size = rbtree_node_size(key_len, value_len);
rbtree_node *node = (rbtree_node*)kvs_malloc(node_size);
if (!node) return -2;
memset(node, 0, node_size);
node->key_len = key_len;
node->value_len = value_len;
uint8_t *node_key = rbtree_node_get_key(node);
uint8_t *node_val = rbtree_node_get_value(node);
if (key_len > 0) memcpy(node_key, key, key_len);
if (value_len > 0) memcpy(node_val, value, value_len);
if (rbtree_insert(inst, node) < 0) {
// 插入失败,释放资源
kvs_free(node); kvs_free(node);
return 1; return -2;
} }
return 0; return 0;
} }
/* /*
* @return: NULL notexist, NOTNULL exist。out_value_len 是长度。 * @return: NULL not exist, NOTNULL exist。out_value_len 是长度。
*/ */
void* kvs_rbtree_get(kvs_rbtree_t *inst, const void *key, uint32_t key_len, uint32_t *out_valuelen) { void* kvs_rbtree_get(kvs_rbtree_t *inst, const void *key, uint32_t key_len, uint32_t *out_valuelen) {
if (!inst || !key || key_len == 0 || !out_valuelen) return NULL; if (!inst || !key || key_len == 0 || !out_valuelen) return NULL;
rbtree_node *node = rbtree_search(inst, (uint8_t *)key, key_len); rbtree_node *node = rbtree_search(inst, (const uint8_t *)key, key_len);
if (!node) return NULL; // no exist if (!node || node == inst->nil) return NULL;
if (node == inst->nil) return NULL;
*out_valuelen = node->value_len; *out_valuelen = node->value_len;
return node->value; return (void*)rbtree_node_get_value(node);
} }
/* /*
@@ -413,15 +550,12 @@ int kvs_rbtree_del(rbtree *inst, const void *key, uint32_t key_len) {
if (!inst || !key || key_len == 0) return -1; if (!inst || !key || key_len == 0) return -1;
rbtree_node *node = rbtree_search(inst, (uint8_t *)key, key_len); rbtree_node *node = rbtree_search(inst, (const uint8_t *)key, key_len);
if (!node) return 1; // no exist if (!node || node == inst->nil) return 1;
if (node == inst->nil) return 1;
rbtree_node *cur = rbtree_delete(inst, node); rbtree_node *cur = rbtree_delete(inst, node);
if (cur != inst->nil) { if (cur != inst->nil) {
if (cur->key) kvs_free(cur->key); kvs_free(cur); // 只需释放节点本身
if (cur->value) kvs_free(cur->value);
kvs_free(cur);
} }
return 0; return 0;
@@ -434,20 +568,57 @@ int kvs_rbtree_mod(kvs_rbtree_t *inst, const void *key, uint32_t key_len, const
if (!inst || !key || key_len==0 || !value) return -1; if (!inst || !key || key_len==0 || !value) return -1;
rbtree_node *node = rbtree_search(inst, (uint8_t *)key, key_len); rbtree_node *node = rbtree_search(inst, (const uint8_t *)key, key_len);
if (!node) return 1; // no exist if (!node || node == inst->nil) return 1;
if (node == inst->nil) return 1;
if (node->value) kvs_free(node->value); // 如果新的 value_len 与旧的相同,可以直接覆盖
if (node->value_len == value_len) {
uint8_t *val = rbtree_node_get_value(node);
if (value_len > 0) memcpy(val, value, value_len);
return 0;
}
node->value = (uint8_t*)kvs_malloc(value_len); // 否则需要重新分配节点
if (!node->value) { node->value_len = 0; return -2; } uint32_t new_size = rbtree_node_size(key_len, value_len);
rbtree_node *new_node = (rbtree_node*)kvs_malloc(new_size);
if (!new_node) return -2;
if (value_len) memcpy(node->value, value, value_len); // 复制所有内容
node->value_len = value_len; uint8_t *old_key = rbtree_node_get_key(node);
uint8_t *new_key = rbtree_node_get_key(new_node);
uint8_t *new_val = rbtree_node_get_value(new_node);
new_node->color = node->color;
new_node->left = node->left;
new_node->right = node->right;
new_node->parent = node->parent;
new_node->key_len = node->key_len;
new_node->value_len = value_len;
if (key_len > 0) memcpy(new_key, old_key, key_len);
if (value_len > 0) memcpy(new_val, value, value_len);
// 更新父节点指针
if (node->parent != inst->nil) {
if (node->parent->left == node) {
node->parent->left = new_node;
} else {
node->parent->right = new_node;
}
} else {
inst->root = new_node;
}
// 更新子节点的父指针
if (new_node->left != inst->nil) {
new_node->left->parent = new_node;
}
if (new_node->right != inst->nil) {
new_node->right->parent = new_node;
}
kvs_free(node);
return 0; return 0;
} }
/* /*
@@ -457,9 +628,8 @@ int kvs_rbtree_exist(kvs_rbtree_t *inst, const void *key, uint32_t key_len) {
if (!inst || !key || key_len == 0) return -1; if (!inst || !key || key_len == 0) return -1;
rbtree_node *node = rbtree_search(inst, (uint8_t*)key, key_len); rbtree_node *node = rbtree_search(inst, (const uint8_t*)key, key_len);
if (!node) return 1; // no exist if (!node || node == inst->nil) return 1;
if (node == inst->nil) return 1;
return 0; return 0;
} }
@@ -488,14 +658,16 @@ static int kvs_rbtree_save_node(iouring_ctx_t *uring, int fd, off_t *current_off
lens[count] = sizeof(vlen); lens[count] = sizeof(vlen);
count++; count++;
uint8_t *node_key = rbtree_node_get_key(node);
if (node->key_len > 0) { if (node->key_len > 0) {
bufs[count] = node->key; bufs[count] = node_key;
lens[count] = node->key_len; lens[count] = node->key_len;
count++; count++;
} }
uint8_t *node_val = rbtree_node_get_value(node);
if (node->value_len > 0) { if (node->value_len > 0) {
bufs[count] = node->value; bufs[count] = node_val;
lens[count] = node->value_len; lens[count] = node->value_len;
count++; count++;
} }
@@ -504,7 +676,7 @@ static int kvs_rbtree_save_node(iouring_ctx_t *uring, int fd, off_t *current_off
for (int i = 0; i < count; i++) total += lens[i]; for (int i = 0; i < count; i++) total += lens[i];
task_t *t = submit_write(uring, fd, bufs, lens, count, *current_off); task_t *t = submit_write(uring, fd, bufs, lens, count, *current_off);
cleanup_finished_iouring_tasks(); cleanup_finished_iouring_tasks(uring);
if(!t) { if(!t) {
perror("task init failed"); perror("task init failed");
@@ -532,7 +704,7 @@ int kvs_rbtree_save(iouring_ctx_t *uring, kvs_rbtree_t *inst, const char* filena
while (!uring_task_complete(uring)) { while (!uring_task_complete(uring)) {
usleep(1000); usleep(1000);
cleanup_finished_iouring_tasks(); cleanup_finished_iouring_tasks(uring);
} }
close(fd); close(fd);
return rc; return rc;
@@ -554,34 +726,37 @@ int kvs_rbtree_load(kvs_rbtree_t *inst, const char* filename){
uint32_t vlen = ntohl(vlen_n); uint32_t vlen = ntohl(vlen_n);
if (klen == 0) { fclose(fp); return -3; } if (klen == 0) { fclose(fp); return -3; }
uint8_t *keybuf = (uint8_t*)kvs_malloc((size_t)klen);
if (!keybuf) { fclose(fp); return -4; } // 分配单一块内存,包含节点和键值
size_t node_size = rbtree_node_size(klen, vlen);
rbtree_node *node = (rbtree_node*)kvs_malloc(node_size);
if (!node) { fclose(fp); return -4; }
memset(node, 0, node_size);
node->key_len = klen;
node->value_len = vlen;
uint8_t *keybuf = rbtree_node_get_key(node);
if (kvs_read_file(fp, keybuf, (size_t)klen) < 0) { if (kvs_read_file(fp, keybuf, (size_t)klen) < 0) {
kvs_free(keybuf); kvs_free(node);
fclose(fp); fclose(fp);
return -3; return -3;
} }
uint8_t *valbuf = NULL; uint8_t *valbuf = NULL;
if (vlen > 0) { if (vlen > 0) {
valbuf = (uint8_t*)kvs_malloc((size_t)vlen); valbuf = rbtree_node_get_value(node);
if (!valbuf) {
kvs_free(keybuf);
fclose(fp);
return -4;
}
if (kvs_read_file(fp, valbuf, (size_t)vlen) < 0) { if (kvs_read_file(fp, valbuf, (size_t)vlen) < 0) {
kvs_free(valbuf); kvs_free(node);
kvs_free(keybuf);
fclose(fp); fclose(fp);
return -3; return -3;
} }
} }
int rc = kvs_rbtree_set(inst, keybuf, klen, valbuf, vlen); // 使用原生 rbtree_insert 而非 kvs_rbtree_set
if (vlen > 0) kvs_free(valbuf); // 因为 kvs_rbtree_set 会重新分配节点
if (rbtree_insert(inst, node) < 0) {
if (rc < 0) { // error kvs_free(node);
fclose(fp); fclose(fp);
return -5; return -5;
} }

294
kvstore.c
View File

@@ -19,6 +19,9 @@
#include <unistd.h> #include <unistd.h>
#include <arpa/inet.h> #include <arpa/inet.h>
#include <libxml/parser.h> #include <libxml/parser.h>
#include <limits.h>
#define TIME_COLLECT 0
extern int slave_bootstrap(const char *listen_ip, int listen_port, const char *master_ip, int master_port); extern int slave_bootstrap(const char *listen_ip, int listen_port, const char *master_ip, int master_port);
@@ -38,26 +41,136 @@ void __completed_cmd(const uint8_t *cmd, size_t len, unsigned long long seq){
} }
// __attribute__((noinline)) #include <sys/time.h>
// void __replica_notify(uint64_t seq, uint32_t off, uint32_t len) #define TIME_SUB_MS(tv1, tv2) ((tv1.tv_sec - tv2.tv_sec) * 1000 + (tv1.tv_usec - tv2.tv_usec) / 1000)
// { #define TIME_SUB_US(tv1, tv2) ((tv1.tv_sec - tv2.tv_sec) * 1000000 + (tv1.tv_usec - tv2.tv_usec))
// // 空函数即可,目的是让 uprobe 拿到参数
// asm volatile("" ::: "memory"); static int checked_size_add(size_t a, size_t b, size_t *out) {
// } if (!out || a > SIZE_MAX - b) {
return -1;
}
*out = a + b;
return 0;
}
static int resp_value_encoded_len(const resp_value_t *v, size_t *out_len) {
size_t len = 0;
if (!v || !out_len) {
return -1;
}
switch (v->type) {
case RESP_T_SIMPLE_STR:
case RESP_T_ERROR:
if (checked_size_add(1, (size_t)v->bulk.len, &len) < 0 ||
checked_size_add(len, 2, &len) < 0) {
return -1;
}
break;
case RESP_T_INTEGER: {
char tmp[64];
int n = snprintf(tmp, sizeof(tmp), "%lld", (long long)v->i64);
if (n <= 0) {
return -1;
}
if (checked_size_add(1, (size_t)n, &len) < 0 ||
checked_size_add(len, 2, &len) < 0) {
return -1;
}
break;
}
case RESP_T_NIL:
len = 5; /* "$-1\r\n" */
break;
case RESP_T_BULK_STR: {
char tmp[32];
int n;
size_t t;
if (v->bulk.len > 0 && !v->bulk.ptr) {
return -1;
}
n = snprintf(tmp, sizeof(tmp), "%u", (unsigned)v->bulk.len);
if (n <= 0) {
return -1;
}
if (checked_size_add(1, (size_t)n, &t) < 0 || /* '$' + len digits */
checked_size_add(t, 2, &t) < 0 || /* \r\n */
checked_size_add(t, (size_t)v->bulk.len, &t) < 0 ||
checked_size_add(t, 2, &len) < 0) { /* trailing \r\n */
return -1;
}
break;
}
default:
return -1;
}
*out_len = len;
return 0;
}
static int flush_pending_response(struct conn *conn, uint8_t *buf, size_t *out_len) {
if (!conn || !buf || !out_len) {
return -1;
}
if (*out_len == 0) {
return 0;
}
if (chain_buffer_append(&conn->wbuf, buf, *out_len) < 0) {
return -1;
}
*out_len = 0;
return 0;
}
static int is_update_cmd(const resp_cmd_t *cmd) {
const resp_slice_t *c0;
if (!cmd || cmd->argc == 0 || !cmd->argv[0].ptr || cmd->argv[0].len == 0) {
return 0;
}
c0 = &cmd->argv[0];
return ascii_casecmp(c0->ptr, c0->len, "SET") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "DEL") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "MOD") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "RSET") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "RDEL") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "RMOD") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "HSET") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "HDEL") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "HMOD") == 0;
}
int kvs_protocol(struct conn* conn){ int kvs_protocol(struct conn* conn){
if (!conn) return -1; #if TIME_COLLECT == 1
char *request = conn->rbuffer; struct timeval func_start;
int request_length = conn->rlength; gettimeofday(&func_start, NULL);
char *response = conn->wbuffer; long total_oplog_us = 0;
int *response_length = &conn->wlength; #endif
if (!request || request_length <= 0 || !response || !response_length) return -1; if (!conn) return -1;
size_t request_size = 0;
const uint8_t *request = chain_buffer_linearize(&conn->rbuf, &request_size);
if (!request || request_size == 0) return 0;
if (request_size > (size_t)INT_MAX) return -1;
int request_length = (int)request_size;
uint8_t response[KVS_MAX_RESPONSE];
int consumed = 0; int consumed = 0;
int out_len = 0; size_t out_len = 0;
while(consumed < request_length ){ while(consumed < request_length ){
const uint8_t *p = request+consumed; const uint8_t *p = request+consumed;
int remain = request_length - consumed; int remain = request_length - consumed;
@@ -66,8 +179,7 @@ int kvs_protocol(struct conn* conn){
int len = resp_parse_one_cmd(p, remain, &cmd); int len = resp_parse_one_cmd(p, remain, &cmd);
if(len < 0){ if(len < 0){
/* 协议错误:直接返回,已构建的响应仍可写回 */ /* 协议错误:直接返回 */
*response_length = out_len;
return -1; return -1;
} }
else if(len == 0){ else if(len == 0){
@@ -86,78 +198,103 @@ int kvs_protocol(struct conn* conn){
* 一般也已经把 out_value 设置成了 RESP error这样客户端能收到错误响应。 * 一般也已经把 out_value 设置成了 RESP error这样客户端能收到错误响应。
* - 如果 dr < 0 但 val.type 没被正确设置,兜底回一个通用错误。 * - 如果 dr < 0 但 val.type 没被正确设置,兜底回一个通用错误。
*/ */
if(dr < 0){ #if TIME_COLLECT == 1
if (val.type != RESP_T_SIMPLE_STR && struct timeval oplog_start, oplog_end;
val.type != RESP_T_ERROR && gettimeofday(&oplog_start, NULL);
val.type != RESP_T_INTEGER && #endif
val.type != RESP_T_BULK_STR &&
val.type != RESP_T_NIL) { int need_persist = is_update_cmd(&cmd);
val = resp_error("ERR dispatch failed");
if(global_cfg.persistence == PERSIST_INCREMENTAL && need_persist){
int ar = kvs_oplog_buffer_append(p, (size_t)len, global_oplog_fd);
if (ar < 0) {
return -1;
}
if (ar == KVS_OPLOG_BUF_FULL && kvs_oplog_flush(global_oplog_fd, 0) < 0) {
return -1;
} }
} else {
// persist into oplog
/* 执行成功:在这里保存到日志中(只记录更新类命令) */
if (cmd.argc > 0 && cmd.argv[0].ptr) {
/* 更新类命令SET/DEL/MOD/RSET/RDEL/RMOD/HSET/HDEL/HMOD/SAVE */
const resp_slice_t *c0 = &cmd.argv[0];
int is_update = 0;
if (c0->ptr && c0->len) {
if (ascii_casecmp(c0->ptr, c0->len, "SET") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "DEL") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "MOD") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "RSET") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "RDEL") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "RMOD") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "HSET") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "HDEL") == 0 ||
ascii_casecmp(c0->ptr, c0->len, "HMOD") == 0) {
is_update = 1;
}
}
if (is_update) {
if(global_cfg.persistence == PERSIST_INCREMENTAL){
kvs_oplog_append(p, len, global_oplog_fd);
}
// __completed_cmd(p, len, global_seq);
// global_seq ++;
if (global_cfg.replica_mode == REPLICA_ENABLE) {
uint32_t off = 0;
int ar = replica_shm_append(&g_rep_shm, global_seq, p, (uint32_t)len, &off);
if (ar == 0) {
// __replica_notify(global_seq, off, (uint32_t)len);
global_seq++;
} else {
// shm 满或异常:你可以选择降级(比如直接跳过复制,或阻塞/丢弃)
// 为了不影响主路径,这里先打印并跳过
fprintf(stderr, "replica_shm_append failed %d\n", ar);
}
}
}
}
} }
// __completed_cmd(p, len, global_seq);
// global_seq ++;
if (global_cfg.replica_mode == REPLICA_ENABLE && need_persist) {
uint32_t off = 0;
int ar = replica_shm_append(&g_rep_shm, global_seq, p, (uint32_t)len, &off);
if (ar == 0) {
// __replica_notify(global_seq, off, (uint32_t)len);
global_seq++;
} else {
// shm 满或异常:你可以选择降级(比如直接跳过复制,或阻塞/丢弃)
// 为了不影响主路径,这里先打印并跳过
fprintf(stderr, "replica_shm_append failed %d\n", ar);
}
}
#if TIME_COLLECT == 1
gettimeofday(&oplog_end, NULL);
total_oplog_us += (oplog_end.tv_sec - oplog_start.tv_sec) * 1000000 +
(oplog_end.tv_usec - oplog_start.tv_usec);
#endif
/* 构建响应 */ /* 构建响应 */
int cap = KVS_MAX_RESPONSE - out_len; int resp_len = resp_build_value(&val, response + out_len, sizeof(response) - out_len);
if (cap <= 0) {
*response_length = out_len;
return consumed;
}
int resp_len = resp_build_value(&val, response + out_len, (size_t)cap);
if (resp_len < 0) { if (resp_len < 0) {
*response_length = out_len; /* 当前批次剩余空间不够,先把已拼好的刷到发送队列再重试 */
return consumed; if (flush_pending_response(conn, response, &out_len) < 0) {
return -1;
}
resp_len = resp_build_value(&val, response, sizeof(response));
if (resp_len < 0) {
size_t resp_need = 0;
uint8_t *resp_heap = NULL;
if (resp_value_encoded_len(&val, &resp_need) < 0) {
return -1;
}
resp_heap = (uint8_t *)kvs_malloc(resp_need);
if (!resp_heap) {
return -1;
}
resp_len = resp_build_value(&val, resp_heap, resp_need);
if (resp_len < 0 ||
chain_buffer_append(&conn->wbuf, resp_heap, (size_t)resp_len) < 0) {
free(resp_heap);
return -1;
}
free(resp_heap);
resp_len = 0;
}
} }
out_len += resp_len; out_len += (size_t)resp_len;
// __completed_cmd(request, consumed, 0);
consumed += len; consumed += len;
} }
*response_length = out_len; if (global_cfg.persistence == PERSIST_INCREMENTAL) {
if (kvs_oplog_flush(global_oplog_fd, 1) < 0) {
return -1;
}
}
#if TIME_COLLECT == 1
struct timeval func_end;
gettimeofday(&func_end, NULL);
long func_us = (func_end.tv_sec - func_start.tv_sec) * 1000000 +
(func_end.tv_usec - func_start.tv_usec);
fprintf(stderr, "kvs_protocol: total %ld us, oplog %ld us\n", func_us, total_oplog_us);
#endif
if (flush_pending_response(conn, response, &out_len) < 0) {
return -1;
}
return consumed; return consumed;
} }
@@ -278,6 +415,7 @@ int init_config(AppConfig *cfg){
printf("Persistence : %s\n", persistence_to_string(cfg->persistence)); printf("Persistence : %s\n", persistence_to_string(cfg->persistence));
printf("|—— Persist-dir : %s\n", cfg->persist_dir); printf("|—— Persist-dir : %s\n", cfg->persist_dir);
printf("|—— Persist-oplog : %s\n", cfg->oplog_file); printf("|—— Persist-oplog : %s\n", cfg->oplog_file);
printf("|—— Oplog-sync : %s\n", oplog_sync_mode_to_string(cfg->oplog_sync_mode));
printf("|—— Persist-array : %s\n", cfg->array_file); printf("|—— Persist-array : %s\n", cfg->array_file);
printf("|—— Persist-rbtree : %s\n", cfg->rbtree_file); printf("|—— Persist-rbtree : %s\n", cfg->rbtree_file);
printf("|—— Persist-hash : %s\n", cfg->hash_file); printf("|—— Persist-hash : %s\n", cfg->hash_file);

View File

@@ -120,15 +120,26 @@ int kvs_array_exist(kvs_array_t *inst, char *key);
#if BIN_SAFE #if BIN_SAFE
typedef uint8_t KEY_TYPE; // key typedef uint8_t KEY_TYPE; // key
// 固定部分结构
typedef struct {
unsigned char color;
struct _rbtree_node *right;
struct _rbtree_node *left;
struct _rbtree_node *parent;
uint32_t key_len;
uint32_t value_len;
} rbtree_node_fixed;
// 完整节点结构(用于类型定义,实际内存大小由分配时确定)
typedef struct _rbtree_node { typedef struct _rbtree_node {
unsigned char color; unsigned char color;
struct _rbtree_node *right; struct _rbtree_node *right;
struct _rbtree_node *left; struct _rbtree_node *left;
struct _rbtree_node *parent; struct _rbtree_node *parent;
KEY_TYPE *key; uint32_t key_len;
uint32_t key_len; uint32_t value_len;
KEY_TYPE *value; // 动态数据key[key_len] + value[value_len]
uint32_t value_len; // 不存储为结构体成员,通过指针运算访问
} rbtree_node; } rbtree_node;
typedef struct _rbtree { typedef struct _rbtree {
@@ -191,21 +202,27 @@ int kvs_rbtree_exist(kvs_rbtree_t *inst, char *key);
#if BIN_SAFE #if BIN_SAFE
#define MAX_TABLE_SIZE 1024
typedef struct hashnode_s { typedef struct hashnode_s {
uint8_t *key; uint32_t key_len; // key 长度
size_t key_len; uint32_t value_len; // value 长度
struct hashnode_s *next; // 链表指针
uint8_t *value; // 动态数据key[key_len] + value[value_len]
size_t value_len; // 不存储为结构体成员,通过指针运算访问
struct hashnode_s *next;
} hashnode_t; } hashnode_t;
typedef struct hashbucket_s {
hashnode_t *head; // 桶内链表
uint32_t local_depth; // 桶的局部深度
uint32_t item_count; // 桶内元素数量
struct hashbucket_s *next_all; // 用于统一释放/遍历所有桶
} hashbucket_t;
typedef struct hashtable_s { typedef struct hashtable_s {
hashnode_t **nodes; hashbucket_t **directory; // 目录,指向桶
int max_slots; uint32_t dir_size; // 目录大小2^global_depth
int count; uint32_t global_depth; // 全局深度
int count; // 当前元素总数
hashbucket_t *bucket_list;// 所有唯一桶链表
} hashtable_t; } hashtable_t;
typedef struct hashtable_s kvs_hash_t; typedef struct hashtable_s kvs_hash_t;

View File

@@ -27,29 +27,29 @@ static inline int size_to_index(size_t size){
// bitmap 操作函数 // bitmap 操作函数
static inline void bitmap_set(uint64_t *bitmap, uint16_t index){ // static inline void bitmap_set(uint64_t *bitmap, uint16_t index){
bitmap[index / 64] |= (1ULL << (index % 64)); // bitmap[index / 64] |= (1ULL << (index % 64));
} // }
static inline void bitmap_clear(uint64_t *bitmap, uint16_t index){ // static inline void bitmap_clear(uint64_t *bitmap, uint16_t index){
bitmap[index / 64] &= ~(1ULL << (index % 64)); // bitmap[index / 64] &= ~(1ULL << (index % 64));
} // }
static inline int bitmap_test(uint64_t *bitmap, uint16_t index){ // static inline int bitmap_test(uint64_t *bitmap, uint16_t index){
return (bitmap[index / 64] & (1ULL << (index % 64))) != 0; // return (bitmap[index / 64] & (1ULL << (index % 64))) != 0;
} // }
static inline void bitmap_clear_all(uint64_t *bitmap, size_t size){ // static inline void bitmap_clear_all(uint64_t *bitmap, size_t size){
memset(bitmap, 0, size * sizeof(uint64_t)); // memset(bitmap, 0, size * sizeof(uint64_t));
} // }
// 根据指针计算在页中的块索引 // // 根据指针计算在页中的块索引
static inline uint16_t ptr_to_block_index(mp_page_t *pg, void *ptr){ // static inline uint16_t ptr_to_block_index(mp_page_t *pg, void *ptr){
char *base = (char*)page_payload(pg); // char *base = (char*)page_payload(pg);
char *p = (char*)ptr; // char *p = (char*)ptr;
size_t offset = p - base; // size_t offset = p - base;
return (uint16_t)(offset / pg->owner->block_size); // return (uint16_t)(offset / pg->owner->block_size);
} // }
static mp_page_t* mp_page_create(mp_bucket_t *owner){ static mp_page_t* mp_page_create(mp_bucket_t *owner){
@@ -74,7 +74,7 @@ static mp_page_t* mp_page_create(mp_bucket_t *owner){
pg->prev = NULL; pg->prev = NULL;
pg->next = NULL; pg->next = NULL;
bitmap_clear_all(pg->bitmap, 16); // bitmap_clear_all(pg->bitmap, 16);
char *p = (char*)page_payload(pg); char *p = (char*)page_payload(pg);
for(uint16_t i = 0;i < cap - 1; ++ i){ for(uint16_t i = 0;i < cap - 1; ++ i){
@@ -95,8 +95,8 @@ static void *mp_page_alloc(mp_page_t *pg){
pg->free_count --; pg->free_count --;
// 标记该块为已分配 // 标记该块为已分配
uint16_t index = ptr_to_block_index(pg, ret); // uint16_t index = ptr_to_block_index(pg, ret);
bitmap_set(pg->bitmap, index); // bitmap_set(pg->bitmap, index);
return ret; return ret;
} }
@@ -105,14 +105,14 @@ static int mp_page_free(mp_page_t *pg, void *ptr){
if(!pg || !ptr) return MEMPOOL_INVALID_INPUT; if(!pg || !ptr) return MEMPOOL_INVALID_INPUT;
// 检查是否是 double free // 检查是否是 double free
uint16_t index = ptr_to_block_index(pg, ptr); // uint16_t index = ptr_to_block_index(pg, ptr);
if(!bitmap_test(pg->bitmap, index)){ // if(!bitmap_test(pg->bitmap, index)){
// 该块未被分配,可能是 double free // // 该块未被分配,可能是 double free
return MEMPOOL_DOUBLE_FREE; // return MEMPOOL_DOUBLE_FREE;
} // }
// 标记该块为空闲 // 标记该块为空闲
bitmap_clear(pg->bitmap, index); // bitmap_clear(pg->bitmap, index);
*(void**)ptr = pg->free_list; *(void**)ptr = pg->free_list;
pg->free_list = ptr; pg->free_list = ptr;
@@ -320,8 +320,10 @@ int mp_print(mp_pool_t *pool){
printf("------\n"); printf("------\n");
for(int i = 0; i < MEMPOOL_NUM_CLASSES; i++){ for(int i = 0; i < MEMPOOL_NUM_CLASSES; i++){
mp_bucket_t *bucket = &pool->buckets[i]; mp_bucket_t *bucket = &pool->buckets[i];
if(bucket->page_count) ret += bucket->page_count; if(bucket->page_count) {
printf("size:%ld, page:%d, empty:%d\n", bucket->block_size, bucket->page_count, bucket->empty_count); ret += bucket->page_count;
printf("size:%ld, page:%d, empty:%d\n", bucket->block_size, bucket->page_count, bucket->empty_count);
}
} }
printf("------\n"); printf("------\n");

View File

@@ -7,8 +7,8 @@
#include <string.h> #include <string.h>
#include <pthread.h> #include <pthread.h>
// #define MEMPOOL_PAGE_SIZE 4096 #define MEMPOOL_PAGE_SIZE 4096
#define MEMPOOL_PAGE_SIZE (4096*2) // #define MEMPOOL_PAGE_SIZE (256 * 1024)
#define MEMPOOL_BLOCK_MAX_SIZE 512 #define MEMPOOL_BLOCK_MAX_SIZE 512
#define MEMPOOL_ALIGNMENT 8 #define MEMPOOL_ALIGNMENT 8
#define MEMPOOL_NUM_CLASSES (MEMPOOL_BLOCK_MAX_SIZE / MEMPOOL_ALIGNMENT) #define MEMPOOL_NUM_CLASSES (MEMPOOL_BLOCK_MAX_SIZE / MEMPOOL_ALIGNMENT)
@@ -36,7 +36,7 @@ struct mp_page_s{
uint16_t free_count; uint16_t free_count;
uint16_t capacity; uint16_t capacity;
uint64_t bitmap[16]; // 最多支持 512/1280 个块 (64*20) // uint64_t bitmap[16];
}; };
struct mp_bucket_s{ struct mp_bucket_s{

703
network/chainbuffer.c Normal file
View File

@@ -0,0 +1,703 @@
#include "network/chainbuffer.h"
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#define CHAINBUFFER_DEFAULT_CHUNK 4096        /* default per-node payload size */
#define CHAINBUFFER_MAX_IOV 16                /* iovec slots used by send_fd */
#define CHAINBUFFER_DEFAULT_FREE_LIMIT 256    /* max nodes cached on free_list */

/* One link of the chain: a small ring buffer with a flexible-array payload.
 * `used` bytes are readable starting at data[rpos] (possibly wrapping);
 * writes go in at data[wpos].  `refcnt` is nonzero only while the node is
 * owned by a detached chain_buffer_list (see chain_buffer_detach_prefix). */
struct chain_buffer_node {
    struct chain_buffer_node *next;  /* next node in chain / free list */
    size_t cap;                      /* payload capacity in bytes */
    size_t rpos;                     /* read cursor into data[] */
    size_t wpos;                     /* write cursor into data[] */
    size_t used;                     /* readable byte count */
    unsigned refcnt;                 /* shared-ownership count for detached lists */
    uint8_t data[];                  /* flexible array member: ring payload */
};
/* Allocate a node with a `cap`-byte ring payload (flexible array member).
 * Returns NULL on allocation failure, zero cap, or size overflow.
 * Fix: guard sizeof(*node) + cap against size_t wrap-around before malloc;
 * the original would pass a tiny wrapped size and later write out of bounds. */
static chain_buffer_node_t *alloc_node(size_t cap) {
    chain_buffer_node_t *node;

    if (cap > SIZE_MAX - sizeof(*node)) {
        return NULL;
    }
    node = (chain_buffer_node_t *)malloc(sizeof(*node) + cap);
    if (!node) {
        return NULL;
    }
    node->next = NULL;
    node->cap = cap;
    node->rpos = 0;
    node->wpos = 0;
    node->used = 0;
    node->refcnt = 0;
    return node;
}
/* Return a node to a pristine state: empty, unlinked, and unshared. */
static void reset_node_state(chain_buffer_node_t *node) {
    if (node == NULL) {
        return;
    }
    node->refcnt = 0;
    node->used = 0;
    node->wpos = 0;
    node->rpos = 0;
    node->next = NULL;
}
/* Smaller of two sizes. */
static size_t min_size(size_t a, size_t b) {
    if (b < a) {
        return b;
    }
    return a;
}
/* Length of the first contiguous readable run: from rpos up to the end of
 * the ring, capped by the number of bytes actually stored. */
static size_t node_read_seg1_len(const chain_buffer_node_t *node) {
    if (node == NULL || node->used == 0) {
        return 0;
    }
    return min_size(node->used, node->cap - node->rpos);
}
/* Length of the first contiguous writable run: from wpos up to the end of
 * the ring, capped by the total free space in the node. */
static size_t node_write_seg1_len(const chain_buffer_node_t *node) {
    size_t space;

    if (node == NULL || node->used >= node->cap) {
        return 0;
    }
    space = node->cap - node->used;
    return min_size(space, node->cap - node->wpos);
}
/* Expose the readable bytes of `node` as up to two contiguous segments
 * (the ring may wrap past the end of data[]).  Any output pointer may be
 * NULL if the caller does not need that value. */
static void node_read_segments(const chain_buffer_node_t *node,
                               uint8_t **p1, size_t *l1,
                               uint8_t **p2, size_t *l2) {
    size_t first = node_read_seg1_len(node);
    size_t second = 0;

    if (node && node->used > first) {
        second = node->used - first;
    }
    if (p1) {
        *p1 = first ? (uint8_t *)(node->data + node->rpos) : NULL;
    }
    if (l1) {
        *l1 = first;
    }
    if (p2) {
        *p2 = second ? (uint8_t *)node->data : NULL;
    }
    if (l2) {
        *l2 = second;
    }
}
/* Expose the free space of `node` as up to two contiguous segments (the
 * wrapped tail of the ring is the second segment).  Any output pointer
 * may be NULL.  A full or NULL node yields two zero-length segments. */
static void node_write_segments(const chain_buffer_node_t *node,
                                uint8_t **p1, size_t *l1,
                                uint8_t **p2, size_t *l2) {
    size_t first = 0;
    size_t second = 0;

    if (node && node->used < node->cap) {
        size_t space = node->cap - node->used;
        first = node_write_seg1_len(node);
        second = space - first;
    }
    if (p1) {
        *p1 = first ? (uint8_t *)(node->data + node->wpos) : NULL;
    }
    if (l1) {
        *l1 = first;
    }
    if (p2) {
        *p2 = second ? (uint8_t *)node->data : NULL;
    }
    if (l2) {
        *l2 = second;
    }
}
/* Consume `n` readable bytes: advance the read cursor (with wrap) and
 * shrink `used`.  Caller guarantees n <= node->used. */
static void node_advance_read(chain_buffer_node_t *node, size_t n) {
    if (node == NULL || n == 0) {
        return;
    }
    node->used -= n;
    node->rpos = (node->rpos + n) % node->cap;
}
/* Commit `n` freshly written bytes: advance the write cursor (with wrap)
 * and grow `used`.  Caller guarantees n fits in the node's free space. */
static void node_advance_write(chain_buffer_node_t *node, size_t n) {
    if (node == NULL || n == 0) {
        return;
    }
    node->used += n;
    node->wpos = (node->wpos + n) % node->cap;
}
/* Obtain a node of capacity `cap`.  Default-sized requests are served
 * from the per-buffer free list when possible; anything else is freshly
 * allocated.  Returns NULL on bad arguments or allocation failure. */
static chain_buffer_node_t *acquire_node(chain_buffer_t *buf, size_t cap) {
    chain_buffer_node_t *cached;

    if (buf == NULL || cap == 0) {
        return NULL;
    }
    cached = buf->free_list;
    if (cached != NULL && cap == buf->chunk_size) {
        buf->free_list = cached->next;
        buf->free_count--;
        reset_node_state(cached);
        return cached;
    }
    return alloc_node(cap);
}
/* Dispose of a node: default-sized nodes go back onto the free list while
 * it is below free_limit; everything else is freed outright. */
static void recycle_node(chain_buffer_t *buf, chain_buffer_node_t *node) {
    int cacheable;

    if (node == NULL) {
        return;
    }
    cacheable = buf != NULL &&
                node->cap == buf->chunk_size &&
                buf->free_count < buf->free_limit;
    if (!cacheable) {
        free(node);
        return;
    }
    reset_node_state(node);
    node->next = buf->free_list;
    buf->free_list = node;
    buf->free_count++;
}
/* Grow the chain with a fresh tail node of capacity `cap`.
 * Returns 0 on success, -1 if the node could not be acquired. */
static int append_new_tail(chain_buffer_t *buf, size_t cap) {
    chain_buffer_node_t *node = acquire_node(buf, cap);

    if (node == NULL) {
        return -1;
    }
    if (buf->tail != NULL) {
        buf->tail->next = node;
    } else {
        buf->head = node;
    }
    buf->tail = node;
    return 0;
}
/* Append `node` at the tail of a detached list (total_len is NOT updated
 * here; the caller accounts for the bytes it transferred). */
static void list_append_node(chain_buffer_list_t *list, chain_buffer_node_t *node) {
    if (list == NULL || node == NULL) {
        return;
    }
    node->next = NULL;
    if (list->tail != NULL) {
        list->tail->next = node;
    } else {
        list->head = node;
    }
    list->tail = node;
}
/* Initialize an empty chain buffer.  chunk_size == 0 selects the default
 * node payload size (CHAINBUFFER_DEFAULT_CHUNK). */
void chain_buffer_init(chain_buffer_t *buf, size_t chunk_size) {
    if (buf == NULL) {
        return;
    }
    memset(buf, 0, sizeof(*buf));
    buf->free_limit = CHAINBUFFER_DEFAULT_FREE_LIMIT;
    buf->chunk_size = (chunk_size != 0) ? chunk_size : CHAINBUFFER_DEFAULT_CHUNK;
}
/* Release every live node, every cached free-list node, and the linearize
 * cache, leaving the buffer empty.
 * Fix: preserve the configured chunk_size / free_limit across the reset.
 * The original memset zeroed them, so any use after reset (e.g.
 * chain_buffer_prepare_recv_iov growing the chain) requested a 0-byte node
 * and failed; defaults are substituted if they were never configured. */
void chain_buffer_reset(chain_buffer_t *buf) {
    chain_buffer_node_t *node;
    size_t chunk_size;
    size_t free_limit;

    if (!buf) {
        return;
    }
    /* Free the live chain. */
    node = buf->head;
    while (node) {
        chain_buffer_node_t *next = node->next;
        free(node);
        node = next;
    }
    /* Free the cached nodes. */
    node = buf->free_list;
    while (node) {
        chain_buffer_node_t *next = node->next;
        free(node);
        node = next;
    }
    free(buf->linear_cache);
    chunk_size = buf->chunk_size;
    free_limit = buf->free_limit;
    memset(buf, 0, sizeof(*buf));
    buf->chunk_size = chunk_size ? chunk_size : CHAINBUFFER_DEFAULT_CHUNK;
    buf->free_limit = free_limit ? free_limit : CHAINBUFFER_DEFAULT_FREE_LIMIT;
}
/* Number of readable bytes currently stored; 0 for a NULL buffer. */
size_t chain_buffer_len(const chain_buffer_t *buf) {
    if (buf == NULL) {
        return 0;
    }
    return buf->total_len;
}
/* Copy `len` bytes from `data` onto the tail of the buffer, growing the
 * chain with new nodes as needed.
 * Returns 0 on success, -1 with errno set:
 *   EINVAL    - NULL buffer, or NULL data with len > 0
 *   EOVERFLOW - total_len + len would overflow size_t
 *   ENOMEM    - node allocation failed
 * On ENOMEM a prefix of `data` may already have been appended. */
int chain_buffer_append(chain_buffer_t *buf, const void *data, size_t len) {
    const uint8_t *src;
    size_t remain;
    if (!buf || (!data && len > 0)) {
        errno = EINVAL;
        return -1;
    }
    if (len == 0) {
        return 0;
    }
    /* Reject appends that would wrap the size_t running total. */
    if (buf->total_len > (size_t)-1 - len) {
        errno = EOVERFLOW;
        return -1;
    }
    src = (const uint8_t *)data;
    remain = len;
    while (remain > 0) {
        chain_buffer_node_t *tail;
        uint8_t *p1;
        uint8_t *p2;
        size_t l1;
        size_t l2;
        size_t writable;
        size_t n;
        size_t c1;
        /* Tail missing or full: grow the chain.  Oversized remainders get a
         * single exact-size node instead of the default chunk. */
        if (!buf->tail || buf->tail->used == buf->tail->cap) {
            size_t cap = remain > buf->chunk_size ? remain : buf->chunk_size;
            if (append_new_tail(buf, cap) < 0) {
                errno = ENOMEM;
                return -1;
            }
        }
        tail = buf->tail;
        /* The ring may expose free space as two segments (wrap-around). */
        node_write_segments(tail, &p1, &l1, &p2, &l2);
        writable = l1 + l2;
        if (writable == 0) {
            continue;
        }
        n = min_size(remain, writable);
        c1 = min_size(n, l1);
        if (c1 > 0) {
            memcpy(p1, src, c1);
        }
        if (n > c1) {
            /* Spill the remainder into the wrapped second segment. */
            memcpy(p2, src + c1, n - c1);
        }
        node_advance_write(tail, n);
        buf->total_len += n;
        src += n;
        remain -= n;
    }
    return 0;
}
/* Discard up to `len` bytes from the front of the buffer, recycling nodes
 * that become empty.  Returns the number of bytes actually drained (may be
 * less than `len` if the buffer holds fewer bytes). */
size_t chain_buffer_drain(chain_buffer_t *buf, size_t len) {
    size_t remain;
    size_t drained;
    if (!buf || len == 0 || buf->total_len == 0) {
        return 0;
    }
    remain = len;
    drained = 0;
    while (remain > 0 && buf->head) {
        chain_buffer_node_t *node = buf->head;
        size_t n = min_size(remain, node->used);
        if (n == 0) {
            /* Empty node at the head: unlink and recycle it, keep going. */
            buf->head = node->next;
            if (!buf->head) {
                buf->tail = NULL;
            }
            recycle_node(buf, node);
            continue;
        }
        node_advance_read(node, n);
        buf->total_len -= n;
        drained += n;
        remain -= n;
        if (node->used == 0) {
            /* Fully consumed: a drained node must not still be shared with a
             * detached list (refcnt tracks that ownership). */
            assert(node->refcnt == 0);
            buf->head = node->next;
            if (!buf->head) {
                buf->tail = NULL;
            }
            recycle_node(buf, node);
        }
    }
    return drained;
}
/* Return a contiguous view of all readable bytes; *out_len (if non-NULL)
 * receives the length.  Fast path: a single unwrapped node is returned
 * in place with no copy.  Otherwise the bytes are copied into an internal
 * cache (grown with realloc and reused across calls).
 * Returns NULL when the buffer is NULL/empty or the cache cannot grow.
 * The pointer is invalidated by any subsequent mutation of the buffer. */
const uint8_t *chain_buffer_linearize(chain_buffer_t *buf, size_t *out_len) {
    size_t offset;
    if (!buf) {
        return NULL;
    }
    if (out_len) {
        *out_len = buf->total_len;
    }
    if (buf->total_len == 0) {
        return NULL;
    }
    /* Zero-copy fast path: one node whose data does not wrap. */
    if (buf->head == buf->tail && buf->head) {
        chain_buffer_node_t *node = buf->head;
        if (node->used == 0) {
            return NULL;
        }
        if (node_read_seg1_len(node) == node->used) {
            return node->data + node->rpos;
        }
    }
    /* Slow path: ensure the cache is large enough, then flatten. */
    if (buf->linear_cap < buf->total_len) {
        uint8_t *new_cache = (uint8_t *)realloc(buf->linear_cache, buf->total_len);
        if (!new_cache) {
            return NULL;
        }
        buf->linear_cache = new_cache;
        buf->linear_cap = buf->total_len;
    }
    offset = 0;
    for (chain_buffer_node_t *node = buf->head; node; node = node->next) {
        uint8_t *p1;
        uint8_t *p2;
        size_t l1;
        size_t l2;
        node_read_segments(node, &p1, &l1, &p2, &l2);
        if (l1 > 0) {
            memcpy(buf->linear_cache + offset, p1, l1);
            offset += l1;
        }
        if (l2 > 0) {
            memcpy(buf->linear_cache + offset, p2, l2);
            offset += l2;
        }
    }
    return buf->linear_cache;
}
/* Send buffered bytes to socket `fd` via sendmsg() scatter-gather, using up
 * to CHAINBUFFER_MAX_IOV iovec slots (each node contributes one or two
 * segments).  Bytes actually sent are drained from the buffer.
 * Returns sendmsg()'s result (>0 sent, -1 with errno on error), or 0 when
 * the buffer is empty.  `flags` is passed straight through to sendmsg. */
ssize_t chain_buffer_send_fd(chain_buffer_t *buf, int fd, int flags) {
    struct iovec iov[CHAINBUFFER_MAX_IOV];
    struct msghdr msg;
    size_t iovcnt;
    ssize_t n;
    if (!buf) {
        errno = EINVAL;
        return -1;
    }
    if (buf->total_len == 0 || !buf->head) {
        return 0;
    }
    iovcnt = 0;
    for (chain_buffer_node_t *node = buf->head;
         node && iovcnt < CHAINBUFFER_MAX_IOV;
         node = node->next) {
        uint8_t *p1;
        uint8_t *p2;
        size_t l1;
        size_t l2;
        node_read_segments(node, &p1, &l1, &p2, &l2);
        if (l1 > 0) {
            iov[iovcnt].iov_base = p1;
            iov[iovcnt].iov_len = l1;
            iovcnt++;
            /* Last slot taken by the first segment: stop before the second. */
            if (iovcnt >= CHAINBUFFER_MAX_IOV) {
                break;
            }
        }
        if (l2 > 0) {
            iov[iovcnt].iov_base = p2;
            iov[iovcnt].iov_len = l2;
            iovcnt++;
        }
    }
    if (iovcnt == 0) {
        return 0;
    }
    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = iov;
    msg.msg_iovlen = iovcnt;
    n = sendmsg(fd, &msg, flags);
    if (n > 0) {
        /* Only the bytes the kernel accepted leave the buffer. */
        chain_buffer_drain(buf, (size_t)n);
    }
    return n;
}
/* Fill `iov` with the writable region of the tail node (appending a fresh
 * chunk-sized node if the tail is missing or full) so the caller can readv()
 * directly into the buffer, then confirm with chain_buffer_commit_recv().
 * Returns the number of iovec slots filled (1 or 2), or -1 with errno set
 * (EINVAL on bad arguments, ENOMEM if the chain could not grow). */
int chain_buffer_prepare_recv_iov(chain_buffer_t *buf, struct iovec *iov, int max_iov) {
    chain_buffer_node_t *tail;
    uint8_t *p1;
    uint8_t *p2;
    size_t l1;
    size_t l2;
    int iovcnt;
    if (!buf || !iov || max_iov <= 0) {
        errno = EINVAL;
        return -1;
    }
    if (!buf->tail || buf->tail->used == buf->tail->cap) {
        if (append_new_tail(buf, buf->chunk_size) < 0) {
            errno = ENOMEM;
            return -1;
        }
    }
    tail = buf->tail;
    /* A wrapped ring exposes free space as up to two segments. */
    node_write_segments(tail, &p1, &l1, &p2, &l2);
    iovcnt = 0;
    if (l1 > 0) {
        iov[iovcnt].iov_base = p1;
        iov[iovcnt].iov_len = l1;
        iovcnt++;
    }
    if (l2 > 0 && iovcnt < max_iov) {
        iov[iovcnt].iov_base = p2;
        iov[iovcnt].iov_len = l2;
        iovcnt++;
    }
    return iovcnt;
}
/* Account for `len` bytes written externally (e.g. by readv) into the space
 * previously handed out by chain_buffer_prepare_recv_iov.  The commit is
 * clamped to the tail node's free space; the clamped count is returned. */
size_t chain_buffer_commit_recv(chain_buffer_t *buf, size_t len) {
    chain_buffer_node_t *tail;

    if (buf == NULL || len == 0) {
        return 0;
    }
    tail = buf->tail;
    if (tail == NULL || tail->used == tail->cap) {
        return 0;
    }
    len = min_size(len, tail->cap - tail->used);
    node_advance_write(tail, len);
    buf->total_len += len;
    return len;
}
/* Initialize a detached list to the empty state. */
void chain_buffer_list_init(chain_buffer_list_t *list) {
    if (list == NULL) {
        return;
    }
    list->head = NULL;
    list->tail = NULL;
    list->total_len = 0;
}
/* Number of bytes held by a detached list; 0 for a NULL list. */
size_t chain_buffer_list_len(const chain_buffer_list_t *list) {
    if (list == NULL) {
        return 0;
    }
    return list->total_len;
}
/* Describe the readable bytes of a detached list as iovecs (each node may
 * contribute two segments if its ring wraps).  Fills at most `max_iov`
 * slots and returns the count used, or -1 with errno = EINVAL on bad
 * arguments.  A list with more segments than slots is truncated silently. */
int chain_buffer_list_iov(const chain_buffer_list_t *list, struct iovec *iov, int max_iov) {
    int iovcnt = 0;
    if (!list || !iov || max_iov <= 0) {
        errno = EINVAL;
        return -1;
    }
    for (chain_buffer_node_t *node = list->head; node && iovcnt < max_iov; node = node->next) {
        uint8_t *p1;
        uint8_t *p2;
        size_t l1;
        size_t l2;
        node_read_segments(node, &p1, &l1, &p2, &l2);
        if (l1 > 0) {
            iov[iovcnt].iov_base = p1;
            iov[iovcnt].iov_len = l1;
            iovcnt++;
            /* First segment took the final slot: stop before the second. */
            if (iovcnt >= max_iov) {
                break;
            }
        }
        if (l2 > 0) {
            iov[iovcnt].iov_base = p2;
            iov[iovcnt].iov_len = l2;
            iovcnt++;
        }
    }
    return iovcnt;
}
/* Copy the first `len` readable bytes of `node` into `dst` without
 * consuming them.  Returns 0 on success, -1 when the arguments are invalid
 * or the node holds fewer than `len` bytes. */
static int copy_prefix_from_node(chain_buffer_node_t *node, size_t len, uint8_t *dst) {
    uint8_t *seg1;
    uint8_t *seg2;
    size_t len1;
    size_t len2;
    size_t take1;

    if (node == NULL || dst == NULL || len == 0 || len > node->used) {
        return -1;
    }
    node_read_segments(node, &seg1, &len1, &seg2, &len2);
    take1 = min_size(len, len1);
    if (take1 != 0) {
        memcpy(dst, seg1, take1);
    }
    if (len != take1) {
        memcpy(dst + take1, seg2, len - take1);
    }
    return 0;
}
/* Move the first `len` bytes of `buf` into `out` without copying whole
 * nodes: fully covered nodes are relinked into `out` (refcnt set to 1 to
 * mark list ownership); a partially covered head node is handled by copying
 * just the needed prefix into a fresh exact-size node.  `out` is always
 * (re)initialized.  Returns 0 on success, -1 with errno (EINVAL on bad
 * arguments or len > stored bytes, ENOMEM on allocation failure); on error
 * any nodes already moved to `out` are released again. */
int chain_buffer_detach_prefix(chain_buffer_t *buf, size_t len, chain_buffer_list_t *out) {
    size_t remain;
    if (!buf || !out) {
        errno = EINVAL;
        return -1;
    }
    chain_buffer_list_init(out);
    if (len == 0) {
        return 0;
    }
    if (len > buf->total_len) {
        errno = EINVAL;
        return -1;
    }
    remain = len;
    while (remain > 0 && buf->head) {
        chain_buffer_node_t *node = buf->head;
        if (remain >= node->used) {
            /* Whole node is covered: transfer it to `out` by relinking. */
            size_t take = node->used;
            buf->head = node->next;
            if (!buf->head) {
                buf->tail = NULL;
            }
            node->next = NULL;
            node->refcnt = 1;
            list_append_node(out, node);
            out->total_len += take;
            buf->total_len -= take;
            remain -= take;
            continue;
        }
        {
            /* Partial node: copy the prefix into a fresh exact-size node so
             * the remainder stays readable in `buf`. */
            chain_buffer_node_t *part = acquire_node(buf, remain);
            if (!part) {
                chain_buffer_list_release(buf, out);
                errno = ENOMEM;
                return -1;
            }
            if (copy_prefix_from_node(node, remain, part->data) < 0) {
                recycle_node(buf, part);
                chain_buffer_list_release(buf, out);
                errno = EINVAL;
                return -1;
            }
            part->used = remain;
            /* remain == part->cap wraps wpos back to 0 on an exactly full node. */
            part->wpos = remain % part->cap;
            part->rpos = 0;
            part->refcnt = 1;
            part->next = NULL;
            list_append_node(out, part);
            out->total_len += remain;
            node_advance_read(node, remain);
            buf->total_len -= remain;
            remain = 0;
            if (node->used == 0) {
                buf->head = node->next;
                if (!buf->head) {
                    buf->tail = NULL;
                }
                recycle_node(buf, node);
            }
        }
    }
    return 0;
}
/* Drop one reference on every node of a detached list; nodes whose refcnt
 * reaches zero are recycled into `owner`'s free list (or freed when owner
 * is NULL).  The list itself is left empty. */
void chain_buffer_list_release(chain_buffer_t *owner, chain_buffer_list_t *list) {
    chain_buffer_node_t *cur;

    if (list == NULL) {
        return;
    }
    for (cur = list->head; cur != NULL; ) {
        chain_buffer_node_t *next = cur->next;
        if (cur->refcnt > 0) {
            cur->refcnt--;
        }
        if (cur->refcnt == 0) {
            if (owner != NULL) {
                recycle_node(owner, cur);
            } else {
                free(cur);
            }
        }
        cur = next;
    }
    chain_buffer_list_init(list);
}

52
network/chainbuffer.h Normal file
View File

@@ -0,0 +1,52 @@
/* chainbuffer.h — chained ring-buffer for socket I/O.
 *
 * A chain_buffer_t is a linked list of fixed-capacity ring nodes with a
 * small per-buffer node cache, a zero-copy readv/sendmsg interface, and an
 * ownership-transfer primitive (detach_prefix) for handing a byte range to
 * another consumer without copying whole nodes.
 *
 * NOTE(review): the guard macro starts with a double underscore, which is a
 * reserved identifier in C — consider renaming (e.g. CHAINBUFFER_H_). */
#ifndef __CHAINBUFFER_H__
#define __CHAINBUFFER_H__

#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Opaque node type; layout lives in chainbuffer.c. */
typedef struct chain_buffer_node chain_buffer_node_t;

/* Byte buffer backed by a chain of ring nodes. */
typedef struct chain_buffer_s {
    chain_buffer_node_t *head;       /* oldest node (read side) */
    chain_buffer_node_t *tail;       /* newest node (write side) */
    size_t total_len;                /* readable bytes across all nodes */
    size_t chunk_size;               /* default capacity for new nodes */
    chain_buffer_node_t *free_list;  /* cached empty nodes for reuse */
    size_t free_count;               /* nodes currently cached */
    size_t free_limit;               /* cache cap; excess nodes are freed */
    uint8_t *linear_cache;           /* scratch used by chain_buffer_linearize */
    size_t linear_cap;               /* capacity of linear_cache */
} chain_buffer_t;

/* A run of nodes detached from a chain_buffer_t (see detach_prefix). */
typedef struct chain_buffer_list_s {
    chain_buffer_node_t *head;
    chain_buffer_node_t *tail;
    size_t total_len;                /* bytes held by the list */
} chain_buffer_list_t;

/* Lifecycle: init (chunk_size 0 = default), reset (frees everything). */
void chain_buffer_init(chain_buffer_t *buf, size_t chunk_size);
void chain_buffer_reset(chain_buffer_t *buf);

/* Readable byte count. */
size_t chain_buffer_len(const chain_buffer_t *buf);

/* Copy bytes in (0 / -1+errno); discard bytes from the front (returns count). */
int chain_buffer_append(chain_buffer_t *buf, const void *data, size_t len);
size_t chain_buffer_drain(chain_buffer_t *buf, size_t len);

/* Contiguous view of all readable bytes; invalidated by any mutation. */
const uint8_t *chain_buffer_linearize(chain_buffer_t *buf, size_t *out_len);

/* sendmsg() the buffered bytes to fd; sent bytes are drained. */
ssize_t chain_buffer_send_fd(chain_buffer_t *buf, int fd, int flags);

/* readv zero-copy recv helpers */
int chain_buffer_prepare_recv_iov(chain_buffer_t *buf, struct iovec *iov, int max_iov);
size_t chain_buffer_commit_recv(chain_buffer_t *buf, size_t len);

/* ownership transfer helpers */
void chain_buffer_list_init(chain_buffer_list_t *list);
size_t chain_buffer_list_len(const chain_buffer_list_t *list);
int chain_buffer_list_iov(const chain_buffer_list_t *list, struct iovec *iov, int max_iov);
int chain_buffer_detach_prefix(chain_buffer_t *buf, size_t len, chain_buffer_list_t *out);
void chain_buffer_list_release(chain_buffer_t *owner, chain_buffer_list_t *list);

#endif

854
reactor.c
View File

@@ -1,461 +1,625 @@
#define _GNU_SOURCE #define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <pthread.h>
#include <unistd.h>
#include <poll.h>
#include <sys/epoll.h>
#include <errno.h>
#include <sys/time.h>
#include <sys/eventfd.h>
#include <sys/timerfd.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/timerfd.h>
#include <sys/uio.h>
#include <unistd.h>
#include "diskuring/diskuring.h"
#include "server.h" #include "server.h"
#define CONNECTION_SIZE 65536
#define CONNECTION_SIZE 1024 // 1024 * 1024 #define MAX_PORTS 20
#define RECV_BATCH_BYTES 4096
#define MAX_PORTS 20 #define CHAIN_BUFFER_CHUNK 4096
#define MAX_CONN_READ_BYTES (32u * 1024u * 1024u)
#define TIME_SUB_MS(tv1, tv2) ((tv1.tv_sec - tv2.tv_sec) * 1000 + (tv1.tv_usec - tv2.tv_usec) / 1000) #define MAX_CONN_WRITE_BYTES (32u * 1024u * 1024u)
#if ENABLE_KVSTORE #if ENABLE_KVSTORE
typedef int (*msg_handler)(struct conn *conn);
// typedef int (*msg_handler)(char *msg, int length, char *response);
// typedef int (*msg_handler)(char *request, int request_length, char *response, int *response_length);
typedef int (*msg_handler)(struct conn* conn);
static msg_handler kvs_handler; static msg_handler kvs_handler;
extern void cleanup_finished_iouring_tasks(); extern iouring_ctx_t global_uring_ctx;
// 0 need more, -1 error, =1 suc
int kvs_request(struct conn *c) { int kvs_request(struct conn *c) {
// int consumed_out = kvs_handler(c->rbuffer, c->rlength, c->wbuffer, &c->wlength); return kvs_handler ? kvs_handler(c) : -1;
int consumed_out = kvs_handler(c);
return consumed_out;
} }
int kvs_response(struct conn *c) { int kvs_response(struct conn *c) {
(void)c;
return 0;
} }
#endif #endif
int accept_cb(int fd); int accept_cb(int fd);
int recv_cb(int fd); int recv_cb(int fd);
int send_cb(int fd); int send_cb(int fd);
static int epfd = -1;
static int wakeup_fd = -1;
static int timer_fd = -1;
static struct timeval begin;
static struct conn conn_list[CONNECTION_SIZE];
static int conn_fd_valid(int fd) {
int epfd = 0; return fd >= 0 && fd < CONNECTION_SIZE;
struct timeval begin;
int wakeup_fd = -1;
int timer_fd = -1;
struct conn conn_list[CONNECTION_SIZE] = {0};
// fd
// 1 add, 0 mod
int set_event(int fd, int event, int flag) {
if (flag) { // non-zero add
struct epoll_event ev;
ev.events = event;
ev.data.fd = fd;
epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
} else { // zero mod
struct epoll_event ev;
ev.events = event;
ev.data.fd = fd;
epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
}
} }
static int set_nonblocking(int fd) {
int event_register(int fd, int event) { int flags = fcntl(fd, F_GETFL, 0);
if (flags < 0) {
if (fd < 0) return -1; return -1;
}
conn_list[fd].fd = fd; if ((flags & O_NONBLOCK) != 0) {
conn_list[fd].r_action.recv_callback = recv_cb;
conn_list[fd].send_callback = send_cb;
memset(conn_list[fd].rbuffer, 0, BUFFER_LENGTH);
conn_list[fd].rlength = 0;
memset(conn_list[fd].wbuffer, 0, BUFFER_LENGTH*2);
conn_list[fd].wlength = 0;
conn_list[fd].is_stop = 0;
set_event(fd, event, 1);
}
// listenfd(sockfd) --> EPOLLIN --> accept_cb
int accept_cb(int fd) {
struct sockaddr_in clientaddr;
socklen_t len = sizeof(clientaddr);
int clientfd = accept(fd, (struct sockaddr*)&clientaddr, &len);
if (clientfd < 0) {
printf("accept errno: %d --> %s\n", errno, strerror(errno));
return -1;
}
event_register(clientfd, EPOLLIN); // | EPOLLET
if ((clientfd % 1000) == 0) {
struct timeval current;
gettimeofday(&current, NULL);
int time_used = TIME_SUB_MS(current, begin);
memcpy(&begin, &current, sizeof(struct timeval));
//printf("accept finshed: %d, time_used: %d\n", clientfd, time_used);
}
return 0;
}
int recv_cb(int fd) {
struct conn *c = &conn_list[fd];
int avail = BUFFER_LENGTH - c->rlength;
// printf("avail: %d\n", avail);
if (avail <= 0) {
// 缓冲满了还没解析出来:协议异常或包过大
epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
close(fd);
return 0; return 0;
} }
return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
int count = recv(fd, c->rbuffer + c->rlength, avail, 0);
if (count == 0) { // disconnect
//printf("client disconnect: %d\n", fd);
epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); // unfinished
close(fd);
return 0;
} else if (count < 0) { //
printf("count: %d, errno: %d, %s\n", count, errno, strerror(errno));
epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
close(fd);
return 0;
}
c->rlength += count;
//printf("RECV: %s\n", conn_list[fd].rbuffer);
#if 0 // echo
conn_list[fd].wlength = conn_list[fd].rlength;
memcpy(conn_list[fd].wbuffer, conn_list[fd].rbuffer, conn_list[fd].wlength);
printf("[%d]RECV: %s\n", conn_list[fd].rlength, conn_list[fd].rbuffer);
#elif ENABLE_HTTP
http_request(&conn_list[fd]);
#elif ENABLE_WEBSOCKET
ws_request(&conn_list[fd]);
#elif ENABLE_KVSTORE
int consumed = kvs_request(c);
if(consumed < 0){
epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
close(fd);
return 0;
}
// 清理 buffer
if (consumed > 0 && consumed < c->rlength) {
// 有剩余未处理数据,搬移到 buffer 头部
int left = c->rlength - consumed;
if (left > 0) memmove(c->rbuffer, c->rbuffer + consumed, left);
c->rlength = left;
if (c->wlength > 0) set_event(fd, EPOLLOUT, 0);
return count;
}else{
c->rlength = 0;
if(c->wlength > 0) set_event(fd, EPOLLOUT, 0);
return count;
}
#endif
set_event(fd, EPOLLOUT, 0);
return count;
} }
static void conn_clear_slot(int fd) {
struct conn *c;
int send_cb(int fd) { if (!conn_fd_valid(fd)) {
return;
}
#if ENABLE_HTTP c = &conn_list[fd];
chain_buffer_reset(&c->rbuf);
chain_buffer_reset(&c->wbuf);
memset(c, 0, sizeof(*c));
c->fd = -1;
}
http_response(&conn_list[fd]); static void close_conn(int fd) {
if (!conn_fd_valid(fd)) {
return;
}
epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
close(fd);
conn_clear_slot(fd);
}
#elif ENABLE_WEBSOCKET static int set_event(int fd, int event, int is_add) {
struct epoll_event ev;
int op = is_add ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
ws_response(&conn_list[fd]); memset(&ev, 0, sizeof(ev));
ev.events = (uint32_t)event;
ev.data.fd = fd;
#elif ENABLE_KVSTORE if (epoll_ctl(epfd, op, fd, &ev) < 0) {
return -1;
}
return 0;
}
kvs_response(&conn_list[fd]); static int update_conn_events(int fd) {
struct conn *c;
int events = EPOLLIN;
#endif if (!conn_fd_valid(fd)) {
return -1;
}
struct conn *c = &conn_list[fd]; c = &conn_list[fd];
int sent_total = 0; if (chain_buffer_len(&c->wbuf) > 0) {
events |= EPOLLOUT;
}
pthread_mutex_lock(&c->g_sync_lock); return set_event(fd, events, 0);
}
while (c->wlength > 0) { int event_register(int fd, int event) {
ssize_t n = send(fd, c->wbuffer, (size_t)c->wlength, MSG_NOSIGNAL); struct conn *c;
if (n > 0) {
sent_total += (int)n;
if (n == c->wlength) { if (!conn_fd_valid(fd)) {
/* 全部发完 */ if (fd >= 0) {
c->wlength = 0; close(fd);
break; }
return -1;
}
if (set_nonblocking(fd) < 0) {
close(fd);
return -1;
}
conn_clear_slot(fd);
c = &conn_list[fd];
c->fd = fd;
c->r_action.recv_callback = recv_cb;
c->send_callback = send_cb;
c->is_stop = 0;
chain_buffer_init(&c->rbuf, CHAIN_BUFFER_CHUNK);
chain_buffer_init(&c->wbuf, CHAIN_BUFFER_CHUNK);
if (set_event(fd, event, 1) < 0) {
close_conn(fd);
return -1;
}
return 0;
}
int accept_cb(int fd) {
while (1) {
struct sockaddr_in clientaddr;
socklen_t len = sizeof(clientaddr);
int clientfd = accept4(fd, (struct sockaddr *)&clientaddr, &len, SOCK_NONBLOCK | SOCK_CLOEXEC);
if (clientfd < 0) {
if (errno == EAGAIN || errno == EWOULDBLOCK) {
return 0;
} }
if (errno == EINTR) {
/* 只发了一部分:把剩余数据搬到 buffer 头部 */ continue;
int left = c->wlength - (int)n; }
memmove(c->wbuffer, c->wbuffer + n, (size_t)left); printf("accept errno: %d --> %s\n", errno, strerror(errno));
c->wlength = left; return -1;
/* 不要在这里死循环占用 CPU交给下一次 EPOLLOUT */
break;
} }
if (n < 0) { if (!conn_fd_valid(clientfd)) {
if (errno == EAGAIN || errno == EWOULDBLOCK) { printf("drop client fd=%d, out of conn_list range\n", clientfd);
/* 暂时发不出去,等下一次可写事件 */ close(clientfd);
pthread_mutex_unlock(&c->g_sync_lock); continue;
set_event(fd, EPOLLOUT, 0); }
return sent_total;
}
/* 对端断开 / 其他错误 */ if (event_register(clientfd, EPOLLIN) < 0) {
int e = errno; continue;
pthread_mutex_unlock(&c->g_sync_lock); }
printf("send fd=%d errno=%d %s\n", fd, e, strerror(e)); if ((clientfd % 1000) == 0) {
epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); struct timeval current;
close(fd); int time_used;
gettimeofday(&current, NULL);
time_used = (int)((current.tv_sec - begin.tv_sec) * 1000 +
(current.tv_usec - begin.tv_usec) / 1000);
begin = current;
(void)time_used;
}
}
}
int recv_cb(int fd) {
struct conn *c;
int total = 0;
if (!conn_fd_valid(fd)) {
return -1;
}
c = &conn_list[fd];
while (1) {
struct iovec iov[4];
int iovcnt = chain_buffer_prepare_recv_iov(&c->rbuf, iov, (int)(sizeof(iov) / sizeof(iov[0])));
ssize_t n;
if (iovcnt < 0) {
printf("fd=%d prepare recv iov failed: %s\n", fd, strerror(errno));
close_conn(fd);
return 0; return 0;
} }
break; n = readv(fd, iov, iovcnt);
}
pthread_mutex_unlock(&c->g_sync_lock);
if (c->wlength > 0) { if (n > 0) {
/* 还有没发完,继续监听可写 */ size_t cur_len = chain_buffer_len(&c->rbuf);
set_event(fd, EPOLLOUT, 0); if (cur_len > MAX_CONN_READ_BYTES - (size_t)n) {
} else { printf("fd=%d read buffer overflow, close connection\n", fd);
/* 发完了,回到读 */ close_conn(fd);
set_event(fd, EPOLLIN, 0); return 0;
}
if (chain_buffer_commit_recv(&c->rbuf, (size_t)n) != (size_t)n) {
printf("fd=%d commit recv buffer failed\n", fd);
close_conn(fd);
return 0;
}
total += (int)n;
continue;
}
if (n == 0) {
close_conn(fd);
return 0;
}
if (errno == EINTR) {
continue;
}
if (errno == EAGAIN || errno == EWOULDBLOCK) {
break;
}
printf("recv fd=%d errno=%d, %s\n", fd, errno, strerror(errno));
close_conn(fd);
return 0;
}
if (total <= 0) {
return 0;
}
#if ENABLE_HTTP
http_request(c);
#elif ENABLE_WEBSOCKET
ws_request(c);
#elif ENABLE_KVSTORE
{
int consumed = kvs_request(c);
size_t readable = chain_buffer_len(&c->rbuf);
if (consumed < 0) {
close_conn(fd);
return 0;
}
if ((size_t)consumed > readable) {
printf("fd=%d invalid consumed=%d readable=%zu\n", fd, consumed, readable);
close_conn(fd);
return 0;
}
if (consumed > 0) {
chain_buffer_drain(&c->rbuf, (size_t)consumed);
}
if (chain_buffer_len(&c->wbuf) > MAX_CONN_WRITE_BYTES) {
printf("fd=%d write buffer overflow, close connection\n", fd);
close_conn(fd);
return 0;
}
}
#endif
if (update_conn_events(fd) < 0) {
close_conn(fd);
return 0;
}
return total;
}
int send_cb(int fd) {
struct conn *c;
int sent_total = 0;
if (!conn_fd_valid(fd)) {
return -1;
}
c = &conn_list[fd];
#if ENABLE_HTTP
http_response(c);
#elif ENABLE_WEBSOCKET
ws_response(c);
#elif ENABLE_KVSTORE
kvs_response(c);
#endif
while (chain_buffer_len(&c->wbuf) > 0) {
ssize_t n = chain_buffer_send_fd(&c->wbuf, fd, MSG_NOSIGNAL);
if (n > 0) {
sent_total += (int)n;
continue;
}
if (n == 0) {
break;
}
if (errno == EINTR) {
continue;
}
if (errno == EAGAIN || errno == EWOULDBLOCK) {
break;
}
printf("send fd=%d errno=%d %s\n", fd, errno, strerror(errno));
close_conn(fd);
return 0;
}
if (update_conn_events(fd) < 0) {
close_conn(fd);
return 0;
} }
return sent_total; return sent_total;
} }
// wakup fd
int handle_wakeup_fd_cb(int fd) { int handle_wakeup_fd_cb(int fd) {
uint64_t v; uint64_t v;
while (1) { while (1) {
ssize_t n = read(wakeup_fd, &v, sizeof(v)); ssize_t n = read(fd, &v, sizeof(v));
if (n == sizeof(v)) continue; if (n == (ssize_t)sizeof(v)) {
if (n < 0 && errno == EAGAIN) break; // 已经读空 continue;
}
if (n < 0 && errno == EINTR) {
continue;
}
if (n < 0 && errno == EAGAIN) {
break;
}
break; break;
} }
cleanup_finished_iouring_tasks();
return 0; cleanup_finished_iouring_tasks(&global_uring_ctx);
return 0;
} }
int init_wakeup_fd(void) { int init_wakeup_fd(void) {
int wfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); int wfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
struct conn *c;
if (wfd < 0) { if (wfd < 0) {
printf("eventfd failed: errno=%d %s\n", errno, strerror(errno)); printf("eventfd failed: errno=%d %s\n", errno, strerror(errno));
return -1; return -1;
} }
if (!conn_fd_valid(wfd)) {
close(wfd);
return -1;
}
conn_list[wfd].fd = wfd; conn_clear_slot(wfd);
conn_list[wfd].r_action.recv_callback = handle_wakeup_fd_cb; c = &conn_list[wfd];
set_event(wfd, EPOLLIN, 1); c->fd = wfd;
c->r_action.recv_callback = handle_wakeup_fd_cb;
if (set_event(wfd, EPOLLIN, 1) < 0) {
close_conn(wfd);
return -1;
}
return wfd; return wfd;
} }
// EPOLLOUT
void sync_wakeup() { void sync_wakeup() {
if (wakeup_fd < 0) return;
// set_event(wakeup_fd, EPOLLOUT, 0);
uint64_t one = 1; uint64_t one = 1;
ssize_t n = write(wakeup_fd, &one, sizeof(one)); ssize_t n;
}
// #include "diskuring/diskuring.h" if (wakeup_fd < 0) {
// extern iouring_ctx_t global_uring_ctx; return;
// extern void iouring_tick(iouring_ctx_t *ctx); }
// 定时器
int handle_timer_fd_cb(int fd){
uint64_t v;
while (1) { while (1) {
ssize_t n = read(fd, &v, sizeof(v)); n = write(wakeup_fd, &one, sizeof(one));
if (n == sizeof(v)) { if (n == (ssize_t)sizeof(one)) {
return;
}
if (n < 0 && errno == EINTR) {
continue; continue;
} }
if (n < 0 && errno == EAGAIN) break; if (n < 0 && errno == EAGAIN) {
break; return;
}
return;
} }
// iouring_tick(&global_uring_ctx);
} }
int init_timer_fd(void){ int handle_timer_fd_cb(int fd) {
int tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC); uint64_t v;
struct itimerspec its = { while (1) {
.it_interval = {0, 100 * 1000 * 1000}, // 100ms = 100,000,000 纳秒 ssize_t n = read(fd, &v, sizeof(v));
.it_value = {0, 100 * 1000 * 1000}, // 首次 100ms 后触发 if (n == (ssize_t)sizeof(v)) {
}; continue;
timerfd_settime(tfd, 0, &its, NULL); }
if (n < 0 && errno == EINTR) {
continue;
}
if (n < 0 && errno == EAGAIN) {
break;
}
break;
}
return 0;
}
conn_list[tfd].fd = tfd; int init_timer_fd(void) {
conn_list[tfd].r_action.recv_callback = handle_timer_fd_cb; int tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC);
set_event(tfd, EPOLLIN, 1); struct itimerspec its;
struct conn *c;
return tfd; if (tfd < 0) {
printf("timerfd_create failed: errno=%d %s\n", errno, strerror(errno));
return -1;
}
if (!conn_fd_valid(tfd)) {
close(tfd);
return -1;
}
memset(&its, 0, sizeof(its));
its.it_interval.tv_nsec = 100 * 1000 * 1000;
its.it_value.tv_nsec = 100 * 1000 * 1000;
if (timerfd_settime(tfd, 0, &its, NULL) < 0) {
close(tfd);
return -1;
}
conn_clear_slot(tfd);
c = &conn_list[tfd];
c->fd = tfd;
c->r_action.recv_callback = handle_timer_fd_cb;
if (set_event(tfd, EPOLLIN, 1) < 0) {
close_conn(tfd);
return -1;
}
return tfd;
} }
int r_init_server(unsigned short port) { int r_init_server(unsigned short port) {
int sockfd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);
int opt = 1;
struct sockaddr_in servaddr;
int sockfd = socket(AF_INET, SOCK_STREAM, 0); if (sockfd < 0) {
return -1;
int opt = 1;
if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
perror("setsockopt");
close(sockfd);
} }
struct sockaddr_in servaddr; if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
servaddr.sin_family = AF_INET; close(sockfd);
servaddr.sin_addr.s_addr = htonl(INADDR_ANY); // 0.0.0.0 return -1;
servaddr.sin_port = htons(port); // 0-1023, }
if (-1 == bind(sockfd, (struct sockaddr*)&servaddr, sizeof(struct sockaddr))) { memset(&servaddr, 0, sizeof(servaddr));
printf("bind failed: %s\n", strerror(errno)); servaddr.sin_family = AF_INET;
} servaddr.sin_addr.s_addr = htonl(INADDR_ANY);
servaddr.sin_port = htons(port);
listen(sockfd, 10); if (bind(sockfd, (struct sockaddr *)&servaddr, sizeof(servaddr)) < 0) {
//printf("listen finshed: %d\n", sockfd); // 3 printf("bind failed on port %u: %s\n", port, strerror(errno));
close(sockfd);
return -1;
}
return sockfd; if (listen(sockfd, 128) < 0) {
close(sockfd);
return -1;
}
return sockfd;
} }
int reactor_start(unsigned short port, msg_handler handler) { int reactor_start(unsigned short port, msg_handler handler) {
int listen_fds[MAX_PORTS];
int listen_count = 0;
int i;
//unsigned short port = 2000; if (!handler) {
kvs_handler = handler; return -1;
}
epfd = epoll_create(1); for (i = 0; i < CONNECTION_SIZE; i++) {
conn_list[i].fd = -1;
}
wakeup_fd = init_wakeup_fd(); kvs_handler = handler;
if(wakeup_fd < 0){ epfd = epoll_create1(EPOLL_CLOEXEC);
close(epfd); if (epfd < 0) {
return -1; return -1;
} }
timer_fd = init_timer_fd(); wakeup_fd = init_wakeup_fd();
if(timer_fd < 0){ if (wakeup_fd < 0) {
close(epfd); close(epfd);
close(wakeup_fd); epfd = -1;
return -1; return -1;
} }
int i = 0; timer_fd = init_timer_fd();
if (timer_fd < 0) {
close_conn(wakeup_fd);
close(epfd);
wakeup_fd = -1;
epfd = -1;
return -1;
}
for (i = 0;i < MAX_PORTS;i ++) { for (i = 0; i < MAX_PORTS; i++) {
int sockfd = r_init_server((unsigned short)(port + i));
struct conn *c;
if (sockfd < 0) {
continue;
}
if (!conn_fd_valid(sockfd)) {
close(sockfd);
continue;
}
int sockfd = r_init_server(port + i); conn_clear_slot(sockfd);
c = &conn_list[sockfd];
c->fd = sockfd;
c->r_action.recv_callback = accept_cb;
c->is_stop = 0;
conn_list[sockfd].fd = sockfd; if (set_event(sockfd, EPOLLIN, 1) < 0) {
conn_list[sockfd].r_action.recv_callback = accept_cb; close_conn(sockfd);
conn_list[sockfd].is_stop = 0; continue;
}
set_event(sockfd, EPOLLIN, 1); listen_fds[listen_count++] = sockfd;
} }
gettimeofday(&begin, NULL); if (listen_count == 0) {
close_conn(timer_fd);
close_conn(wakeup_fd);
close(epfd);
timer_fd = -1;
wakeup_fd = -1;
epfd = -1;
return -1;
}
while (1) { // mainloop gettimeofday(&begin, NULL);
struct epoll_event events[1024] = {0};
int nready = epoll_wait(epfd, events, 1024, -1);
// cleanup_finished_iouring_tasks();
int i = 0; while (1) {
for (i = 0;i < nready;i ++) { struct epoll_event events[1024];
int nready = epoll_wait(epfd, events, 1024, -1);
int connfd = events[i].data.fd; if (nready < 0) {
if (errno == EINTR) {
continue;
}
break;
}
if (events[i].events & EPOLLIN) { for (i = 0; i < nready; i++) {
// printf("connlist:%p, r_action:%p, recv_callaback:%p\n", &conn_list[connfd], &conn_list[connfd].r_action, conn_list[connfd].r_action.recv_callback); int connfd = events[i].data.fd;
conn_list[connfd].r_action.recv_callback(connfd); uint32_t ev = events[i].events;
}
if (events[i].events & EPOLLOUT) { if (!conn_fd_valid(connfd)) {
conn_list[connfd].send_callback(connfd); continue;
} }
}
} if ((ev & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) &&
conn_list[connfd].r_action.recv_callback == recv_cb) {
close_conn(connfd);
continue;
}
if (wakeup_fd >= 0) close(wakeup_fd); if ((ev & EPOLLIN) && conn_list[connfd].r_action.recv_callback) {
if (epfd >= 0) close(epfd); conn_list[connfd].r_action.recv_callback(connfd);
return 0; }
if (!conn_fd_valid(connfd) || conn_list[connfd].fd < 0) {
continue;
}
if ((ev & EPOLLOUT) && conn_list[connfd].send_callback) {
conn_list[connfd].send_callback(connfd);
}
}
}
for (i = 0; i < listen_count; i++) {
close_conn(listen_fds[i]);
}
if (timer_fd >= 0) {
close_conn(timer_fd);
}
if (wakeup_fd >= 0) {
close_conn(wakeup_fd);
}
if (epfd >= 0) {
close(epfd);
}
timer_fd = -1;
wakeup_fd = -1;
epfd = -1;
return 0;
} }

View File

@@ -6,6 +6,7 @@
#define __SERVER_H__ #define __SERVER_H__
#include <pthread.h> #include <pthread.h>
#include "network/chainbuffer.h"
#define BUFFER_LENGTH 4096 #define BUFFER_LENGTH 4096
@@ -20,11 +21,8 @@ typedef int (*RCALLBACK)(int fd);
struct conn { struct conn {
int fd; int fd;
char rbuffer[BUFFER_LENGTH]; chain_buffer_t rbuf;
int rlength; chain_buffer_t wbuf;
char wbuffer[BUFFER_LENGTH*2];
int wlength;
RCALLBACK send_callback; RCALLBACK send_callback;
@@ -35,8 +33,6 @@ struct conn {
int is_stop; int is_stop;
pthread_mutex_t g_sync_lock;
int status; int status;
#if 1 // websocket #if 1 // websocket
char *payload; char *payload;

106
test-redis/README.md Normal file
View File

@@ -0,0 +1,106 @@
## run_bench.hash.sh 使用指南
- 脚本:`test-redis/run_bench.hash.sh`
- 默认策略:`persist_no` / `persist_everysec` / `nopersist`
- 测试模式:`set` + `prefill+get`
- 默认参数:`REQ=1000000 KEYSPACE=1000000 PIPE=128 VSIZE=32 ROUNDS=5 RETRIES=3`
### 运行示例
```bash
# 默认运行5轮
./test-redis/run_bench.hash.sh
# 指定大 key 参数(示例)
ROUNDS=5 REQ=1000000 KEYSPACE=1000000 PIPE=128 VSIZE=256 RETRIES=3 ./test-redis/run_bench.hash.sh
```
### 常用环境变量
| 变量 | 默认值 | 说明 |
|---|---|---|
| `ROUNDS` | `5` | 每个策略的轮次 |
| `REQ` | `1000000` | 单次 bench 请求数 |
| `KEYSPACE` | `1000000` | key 空间大小 |
| `PIPE` | `128` | pipeline 深度 |
| `VSIZE` | `32` | value 大小(字节) |
| `RETRIES` | `3` | 单轮失败重试次数 |
| `SEED` | `12345` | 随机种子基数 |
| `ALLOC` | `mypool` | allocator |
| `KV_PORT` | `8888` | kvstore 端口 |
| `SET_CMD` / `GET_CMD` | `RSET` / `RGET` | 压测命令 |
## 大Key持久化协议复测覆盖版
- 时间口径2026-03-07本次重测
- 参数:`requests=1000000 pipeline=128 keyspace=1000000 value-size=256`
- 轮次:每种策略 `5`
- 去异常规则:每个场景按 `qps``5` 轮做中位数偏差,剔除偏差最大 `1` 轮,再对剩余 `4` 轮取均值
- kvstore 源数据:`results/hash_bench_fair_summary_20260307_062549.csv`
- redis 源数据:`results/redis_bigkey_summary_20260307_063158.csv`
- kvstore 去异常结果:`results/hash_bench_fair_trimmed_20260307_062549.csv`
- redis 去异常结果:`results/redis_bigkey_trimmed_20260307_063158.csv`
## kvstore 原始5轮
| 策略 | 模式 | Round1 | Round2 | Round3 | Round4 | Round5 | 原始均值QPS | 原始均值us/op |
|---|---|---:|---:|---:|---:|---:|---:|---:|
| persist_no | set | 139895.00 | 138768.00 | 138867.00 | 143296.00 | 144388.00 | 141042.80 | 7.09 |
| persist_no | get | 163764.00 | 164079.00 | 166415.00 | 163959.00 | 147881.00 | 161219.60 | 6.21 |
| persist_everysec | set | 133067.00 | 126140.00 | 133430.00 | 139786.00 | 141145.00 | 134713.60 | 7.43 |
| persist_everysec | get | 162076.00 | 165415.00 | 162180.00 | 163762.00 | 158226.00 | 162331.80 | 6.16 |
| nopersist | set | 157950.00 | 143109.00 | 135043.00 | 152045.00 | 147233.00 | 147076.00 | 6.82 |
| nopersist | get | 163381.00 | 169356.00 | 155890.00 | 164754.00 | 157550.00 | 162186.20 | 6.17 |
## kvstore 去异常后4轮均值
| 策略 | 模式 | 剔除轮次 | 剔除QPS | 去异常均值QPS | 去异常均值us/op |
|---|---|---:|---:|---:|---:|
| persist_no | set | 5 | 144388.00 | 140206.50 | 7.13 |
| persist_no | get | 5 | 147881.00 | 164554.25 | 6.08 |
| persist_everysec | set | 5 | 141145.00 | 133105.75 | 7.52 |
| persist_everysec | get | 5 | 158226.00 | 163358.25 | 6.12 |
| nopersist | set | 3 | 135043.00 | 150084.25 | 6.67 |
| nopersist | get | 3 | 155890.00 | 163760.25 | 6.11 |
## kvstore 协议开销(基线 nopersist去异常后
| 策略 | set QPS变化 | set us/op变化 | get QPS变化 | get us/op变化 |
|---|---:|---:|---:|---:|
| persist_no | -6.58% | +6.90% | +0.48% | -0.49% |
| persist_everysec | -11.31% | +12.74% | -0.25% | +0.16% |
## Redis 原始5轮
| 策略 | 模式 | Round1 | Round2 | Round3 | Round4 | Round5 | 原始均值QPS | 原始均值us/op |
|---|---|---:|---:|---:|---:|---:|---:|---:|
| none | set | 198673.00 | 211843.00 | 193900.00 | 208293.00 | 211071.00 | 204756.00 | 4.89 |
| none | get | 225347.00 | 221000.00 | 218357.00 | 216466.00 | 214747.00 | 219183.40 | 4.56 |
| aof_no | set | 144042.00 | 139615.00 | 140016.00 | 149925.00 | 150658.00 | 144851.20 | 6.91 |
| aof_no | get | 210541.00 | 211713.00 | 202589.00 | 231251.00 | 402028.00 | 251624.40 | 4.24 |
| aof_everysec | set | 143737.00 | 132998.00 | 136973.00 | 144448.00 | 142414.00 | 140114.00 | 7.14 |
| aof_everysec | get | 212422.00 | 201051.00 | 211508.00 | 192190.00 | 209874.00 | 205409.00 | 4.87 |
## Redis 去异常后4轮均值
| 策略 | 模式 | 剔除轮次 | 剔除QPS | 去异常均值QPS | 去异常均值us/op |
|---|---|---:|---:|---:|---:|
| none | set | 3 | 193900.00 | 207470.00 | 4.82 |
| none | get | 1 | 225347.00 | 217642.50 | 4.59 |
| aof_no | set | 5 | 150658.00 | 143399.50 | 6.98 |
| aof_no | get | 5 | 402028.00 | 214023.50 | 4.68 |
| aof_everysec | set | 2 | 132998.00 | 141893.00 | 7.05 |
| aof_everysec | get | 4 | 192190.00 | 208713.75 | 4.79 |
## Redis 协议开销(基线 none去异常后
| 策略 | set QPS变化 | set us/op变化 | get QPS变化 | get us/op变化 |
|---|---:|---:|---:|---:|
| aof_no | -30.88% | +44.81% | -1.66% | +1.96% |
| aof_everysec | -31.61% | +46.27% | -4.10% | +4.36% |
## 简要结论
1. 本次大 key256 字节 value场景下持久化开销主要体现在 `set``get` 相对更稳。
2. `aof_no/aof_everysec` 在 Redis 写路径的开销明显kvstore 三策略在写路径也存在可见差异。
3. 本报告已剔除每场景 1 个异常轮次,避免单点抖动主导结论。

447
test-redis/bench.c Normal file
View File

@@ -0,0 +1,447 @@
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <time.h>
#include <hiredis/hiredis.h>
/* Workload selector: pure writes, pure reads, or a randomized mix. */
typedef enum {
    MODE_SET = 0,
    MODE_GET = 1,
    MODE_MIXED = 2,
} bench_mode_t;

/* Parsed command-line configuration for one benchmark run. */
typedef struct {
    const char *host;       /* server address */
    int port;               /* server TCP port */
    bench_mode_t mode;      /* set / get / mixed */
    uint64_t requests;      /* total operations to issue */
    uint32_t pipeline;      /* operations per pipelined batch */
    uint32_t keyspace;      /* number of distinct keys */
    uint32_t value_size;    /* value payload size in bytes */
    uint32_t set_ratio;     /* percentage of SETs in mixed mode (0..100) */
    const char *set_cmd;    /* write command name (e.g. SET or RSET) */
    const char *get_cmd;    /* read command name (e.g. GET or RGET) */
    const char *key_prefix; /* prefix prepended to every numeric key id */
    uint64_t seed;          /* PRNG seed for key/op selection */
    int verify_get;         /* nonzero: verify GET payload contents */
} bench_opts_t;

/* Aggregated counters produced by one benchmark run. */
typedef struct {
    uint64_t set_ops;   /* write operations issued */
    uint64_t get_ops;   /* read operations issued */
    uint64_t errors;    /* reply failures observed */
    double elapsed_sec; /* wall time of the measured phase */
} bench_result_t;
/* Print command-line help for PROG to stderr. */
static void usage(const char *prog) {
    static const char help_fmt[] =
        "Usage: %s [options]\n"
        " --host <ip> default: 127.0.0.1\n"
        " --port <port> default: 6379\n"
        " --mode <set|get|mixed> default: mixed\n"
        " --requests <n> default: 1000000\n"
        " --pipeline <n> default: 64\n"
        " --keyspace <n> default: 100000\n"
        " --value-size <n> default: 32\n"
        " --set-ratio <0..100> default: 50 (mixed mode only)\n"
        " --set-cmd <cmd> default: SET\n"
        " --get-cmd <cmd> default: GET\n"
        " --key-prefix <prefix> default: bench:key:\n"
        " --seed <n> default: time-based\n"
        " --verify-get verify GET value content\n"
        "\nExamples:\n"
        " # Benchmark Redis\n"
        " %s --host 127.0.0.1 --port 6379 --mode mixed --requests 2000000\n"
        "\n"
        " # Benchmark kvstore with Redis-compatible commands\n"
        " %s --host 127.0.0.1 --port 8888 --mode mixed --requests 2000000\n"
        "\n"
        " # Benchmark kvstore RBTree path\n"
        " %s --host 127.0.0.1 --port 8888 --mode mixed --set-cmd RSET --get-cmd RGET\n";
    fprintf(stderr, help_fmt, prog, prog, prog, prog);
}
/* Fill *o with the benchmark defaults (mixed workload against local Redis);
 * all unnamed fields are zeroed by the designated initializer. */
static void opts_init(bench_opts_t *o) {
    const bench_opts_t defaults = {
        .host = "127.0.0.1",
        .port = 6379,
        .mode = MODE_MIXED,
        .requests = 1000000,
        .pipeline = 64,
        .keyspace = 100000,
        .value_size = 32,
        .set_ratio = 50,
        .set_cmd = "SET",
        .get_cmd = "GET",
        .key_prefix = "bench:key:",
        .seed = (uint64_t)time(NULL), /* time-based unless --seed overrides */
        .verify_get = 0,
    };
    *o = defaults;
}
/*
 * Parse a non-negative decimal integer from S into *out.
 * Returns 0 on success; -1 on NULL/empty input, trailing garbage, range
 * error, or a negative number.  The explicit '-' rejection matters because
 * strtoull silently negates signed input: without it "--requests -5" would
 * be accepted as a huge positive count.
 */
static int parse_u64(const char *s, uint64_t *out) {
    char *end = NULL;
    unsigned long long v;
    if (s == NULL || strchr(s, '-') != NULL) {
        return -1;
    }
    errno = 0;
    v = strtoull(s, &end, 10);
    if (errno != 0 || end == s || *end != 0) {
        return -1;
    }
    *out = (uint64_t)v;
    return 0;
}
/* Parse a 32-bit unsigned decimal via parse_u64; fails when out of range. */
static int parse_u32(const char *s, uint32_t *out) {
    uint64_t wide = 0;
    int rc = parse_u64(s, &wide);
    if (rc != 0 || wide > UINT32_MAX) {
        return -1;
    }
    *out = (uint32_t)wide;
    return 0;
}
/*
 * Parse command-line options into *o (must already hold opts_init defaults).
 * Returns 0 on success, 1 when --help/-h was requested, -1 on malformed or
 * unknown options.  Note each value-taking option advances i with ++i inside
 * the condition, consuming its argument in the same iteration.
 */
static int parse_args(int argc, char **argv, bench_opts_t *o) {
    int i;
    for (i = 1; i < argc; i++) {
        if (strcmp(argv[i], "--host") == 0 && i + 1 < argc) {
            o->host = argv[++i];
        } else if (strcmp(argv[i], "--port") == 0 && i + 1 < argc) {
            uint32_t p = 0;
            /* valid TCP port range only */
            if (parse_u32(argv[++i], &p) != 0 || p == 0 || p > 65535) {
                return -1;
            }
            o->port = (int)p;
        } else if (strcmp(argv[i], "--mode") == 0 && i + 1 < argc) {
            const char *m = argv[++i];
            if (strcmp(m, "set") == 0) {
                o->mode = MODE_SET;
            } else if (strcmp(m, "get") == 0) {
                o->mode = MODE_GET;
            } else if (strcmp(m, "mixed") == 0) {
                o->mode = MODE_MIXED;
            } else {
                return -1;
            }
        } else if (strcmp(argv[i], "--requests") == 0 && i + 1 < argc) {
            if (parse_u64(argv[++i], &o->requests) != 0 || o->requests == 0) {
                return -1;
            }
        } else if (strcmp(argv[i], "--pipeline") == 0 && i + 1 < argc) {
            if (parse_u32(argv[++i], &o->pipeline) != 0 || o->pipeline == 0) {
                return -1;
            }
        } else if (strcmp(argv[i], "--keyspace") == 0 && i + 1 < argc) {
            if (parse_u32(argv[++i], &o->keyspace) != 0 || o->keyspace == 0) {
                return -1;
            }
        } else if (strcmp(argv[i], "--value-size") == 0 && i + 1 < argc) {
            if (parse_u32(argv[++i], &o->value_size) != 0 || o->value_size == 0) {
                return -1;
            }
        } else if (strcmp(argv[i], "--set-ratio") == 0 && i + 1 < argc) {
            /* percentage; only consulted in mixed mode */
            if (parse_u32(argv[++i], &o->set_ratio) != 0 || o->set_ratio > 100) {
                return -1;
            }
        } else if (strcmp(argv[i], "--set-cmd") == 0 && i + 1 < argc) {
            o->set_cmd = argv[++i];
        } else if (strcmp(argv[i], "--get-cmd") == 0 && i + 1 < argc) {
            o->get_cmd = argv[++i];
        } else if (strcmp(argv[i], "--key-prefix") == 0 && i + 1 < argc) {
            o->key_prefix = argv[++i];
        } else if (strcmp(argv[i], "--seed") == 0 && i + 1 < argc) {
            if (parse_u64(argv[++i], &o->seed) != 0) {
                return -1;
            }
        } else if (strcmp(argv[i], "--verify-get") == 0) {
            o->verify_get = 1;
        } else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
            return 1; /* caller prints usage and exits 0 */
        } else {
            return -1; /* unknown option or missing argument */
        }
    }
    return 0;
}
/* Current CLOCK_MONOTONIC reading in nanoseconds (for interval timing). */
static uint64_t mono_ns(void) {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    return (uint64_t)now.tv_sec * 1000000000ull + (uint64_t)now.tv_nsec;
}
/* One step of Marsaglia's xorshift64 PRNG: advances *state and returns the
 * new value.  State must never be 0 (0 is a fixed point). */
static uint64_t xorshift64(uint64_t *state) {
    uint64_t v = *state;
    v ^= v << 13;
    v ^= v >> 7;
    v ^= v << 17;
    *state = v;
    return v;
}
/* Queue "<cmd> <key> <val>" on the pipeline via the binary-safe argv API;
 * returns REDIS_OK on success. */
static int append_set(redisContext *c, const char *cmd,
                      const char *key, size_t key_len,
                      const char *val, size_t val_len) {
    const char *args[3] = { cmd, key, val };
    size_t lens[3] = { strlen(cmd), key_len, val_len };
    return redisAppendCommandArgv(c, 3, args, lens);
}
/* Queue "<cmd> <key>" on the pipeline via the binary-safe argv API;
 * returns REDIS_OK on success. */
static int append_get(redisContext *c, const char *cmd,
                      const char *key, size_t key_len) {
    const char *args[2] = { cmd, key };
    size_t lens[2] = { strlen(cmd), key_len };
    return redisAppendCommandArgv(c, 2, args, lens);
}
/* Read one pipelined reply and require a status reply of "OK"
 * (case-insensitive).  Returns 0 on success, -1 otherwise. */
static int consume_set_reply(redisContext *c) {
    redisReply *reply = NULL;
    int ok;
    if (redisGetReply(c, (void **)&reply) != REDIS_OK || reply == NULL) {
        return -1;
    }
    ok = (reply->type == REDIS_REPLY_STATUS) &&
         (reply->str != NULL) &&
         (strcasecmp(reply->str, "OK") == 0);
    freeReplyObject(reply);
    return ok ? 0 : -1;
}
/*
 * Read one pipelined GET reply.  A bulk string passes (content-checked
 * against expect when verify is set); a NIL reply passes only when
 * verification is off; every other reply type is an error.
 * Returns 0 on success, -1 otherwise.
 */
static int consume_get_reply(redisContext *c, const char *expect, size_t expect_len, int verify) {
    redisReply *reply = NULL;
    int rc = -1;
    if (redisGetReply(c, (void **)&reply) != REDIS_OK || reply == NULL) {
        return -1;
    }
    switch (reply->type) {
    case REDIS_REPLY_STRING:
        if (!verify ||
            ((size_t)reply->len == expect_len && memcmp(reply->str, expect, expect_len) == 0)) {
            rc = 0;
        }
        break;
    case REDIS_REPLY_NIL:
        /* missing key is acceptable only when not verifying */
        rc = verify ? -1 : 0;
        break;
    default:
        break;
    }
    freeReplyObject(reply);
    return rc;
}
/* SET every key id in [0, keyspace) to VALUE, pipelined in batches of
 * o->pipeline.  Returns 0 on success, -1 on any append/reply failure. */
static int prefill(redisContext *c, const bench_opts_t *o, const char *value, size_t value_len) {
    char keybuf[256];
    uint64_t next = 0;
    const uint64_t total = o->keyspace;
    while (next < total) {
        uint64_t remain = total - next;
        uint32_t batch = (remain < o->pipeline) ? (uint32_t)remain : o->pipeline;
        /* phase 1: queue a whole batch of SETs */
        for (uint32_t j = 0; j < batch; j++) {
            int klen = snprintf(keybuf, sizeof(keybuf), "%s%" PRIu64, o->key_prefix, next + j);
            if (klen <= 0 || (size_t)klen >= sizeof(keybuf)) {
                return -1;
            }
            if (append_set(c, o->set_cmd, keybuf, (size_t)klen, value, value_len) != REDIS_OK) {
                return -1;
            }
        }
        /* phase 2: drain and validate the matching replies */
        for (uint32_t j = 0; j < batch; j++) {
            if (consume_set_reply(c) != 0) {
                return -1;
            }
        }
        next += batch;
    }
    return 0;
}
/*
 * Keep append/reply operation choices consistent in each batch by building an op mask.
 * This wrapper keeps implementation simple and avoids per-op heap allocation.
 *
 * Runs the measured benchmark loop: per batch, phase 1 queues up to
 * o->pipeline operations (recording set/get choice per slot in opmask),
 * phase 2 drains the replies in the same order.  Returns 0 on success,
 * -1 on any failure (opmask is freed on every exit path).
 */
static int run_bench_with_mask(redisContext *c, const bench_opts_t *o,
                               const char *value, size_t value_len,
                               bench_result_t *res) {
    uint64_t done = 0;
    uint64_t rng = o->seed ? o->seed : 1; /* xorshift state must be nonzero */
    char key[256];
    uint8_t *opmask = NULL;
    uint64_t begin_ns;
    memset(res, 0, sizeof(*res));
    opmask = (uint8_t *)malloc(o->pipeline); /* one flag byte per pipeline slot */
    if (!opmask) {
        return -1;
    }
    begin_ns = mono_ns(); /* timing starts after setup */
    while (done < o->requests) {
        uint32_t batch = o->pipeline;
        if ((uint64_t)batch > o->requests - done) {
            batch = (uint32_t)(o->requests - done); /* final partial batch */
        }
        for (uint32_t i = 0; i < batch; i++) {
            uint64_t rnd = xorshift64(&rng);
            uint64_t key_id = rnd % o->keyspace;
            int is_set = 0;
            int klen;
            if (o->mode == MODE_SET) {
                is_set = 1;
            } else if (o->mode == MODE_GET) {
                is_set = 0;
            } else {
                /* mixed mode: same rnd drives both key and op choice */
                is_set = (rnd % 100) < o->set_ratio;
            }
            opmask[i] = (uint8_t)is_set;
            klen = snprintf(key, sizeof(key), "%s%" PRIu64, o->key_prefix, key_id);
            if (klen <= 0 || (size_t)klen >= sizeof(key)) {
                free(opmask);
                return -1;
            }
            if (is_set) {
                if (append_set(c, o->set_cmd, key, (size_t)klen, value, value_len) != REDIS_OK) {
                    free(opmask);
                    return -1;
                }
                res->set_ops++;
            } else {
                if (append_get(c, o->get_cmd, key, (size_t)klen) != REDIS_OK) {
                    free(opmask);
                    return -1;
                }
                res->get_ops++;
            }
        }
        /* drain replies, checking each against the op queued in that slot */
        for (uint32_t i = 0; i < batch; i++) {
            int rc = opmask[i] ? consume_set_reply(c)
                               : consume_get_reply(c, value, value_len, o->verify_get);
            if (rc != 0) {
                res->errors++;
                free(opmask);
                return -1; /* abort run on first bad reply */
            }
        }
        done += batch;
    }
    res->elapsed_sec = (double)(mono_ns() - begin_ns) / 1e9;
    free(opmask);
    return 0;
}
/*
 * Entry point: parse options, connect, optionally prefill the keyspace
 * (for any mode that issues GETs), run the measured loop, print a single
 * "[result]" summary line.  Exit codes: 0 ok/help, 1 runtime failure,
 * 2 bad arguments.
 */
int main(int argc, char **argv) {
    bench_opts_t opts;
    bench_result_t result;
    redisContext *ctx;
    struct timeval timeout;
    char *value;
    size_t value_len;
    int parse_rc;
    opts_init(&opts);
    parse_rc = parse_args(argc, argv, &opts);
    if (parse_rc == 1) { /* --help requested */
        usage(argv[0]);
        return 0;
    }
    if (parse_rc != 0) {
        usage(argv[0]);
        return 2;
    }
    /* 3-second connect timeout */
    timeout.tv_sec = 3;
    timeout.tv_usec = 0;
    ctx = redisConnectWithTimeout(opts.host, opts.port, timeout);
    if (!ctx || ctx->err) {
        fprintf(stderr, "connect %s:%d failed: %s\n", opts.host, opts.port,
                ctx ? ctx->errstr : "oom");
        if (ctx) {
            redisFree(ctx);
        }
        return 1;
    }
    value_len = opts.value_size;
    value = (char *)malloc(value_len);
    if (!value) {
        fprintf(stderr, "malloc value buffer failed\n");
        redisFree(ctx);
        return 1;
    }
    /* deterministic payload: repeating 'a'..'z' */
    for (size_t i = 0; i < value_len; i++) {
        value[i] = (char)('a' + (int)(i % 26));
    }
    /* get/mixed modes read keys, so the keyspace must exist first */
    if (opts.mode != MODE_SET) {
        fprintf(stdout, "[prefill] keyspace=%u using %s\n", opts.keyspace, opts.set_cmd);
        if (prefill(ctx, &opts, value, value_len) != 0) {
            fprintf(stderr, "prefill failed, err=%s\n", ctx->err ? ctx->errstr : "unknown");
            free(value);
            redisFree(ctx);
            return 1;
        }
    }
    fprintf(stdout,
            "[bench] target=%s:%d mode=%s requests=%" PRIu64
            " pipeline=%u keyspace=%u value_size=%u set_cmd=%s get_cmd=%s\n",
            opts.host, opts.port,
            opts.mode == MODE_SET ? "set" : (opts.mode == MODE_GET ? "get" : "mixed"),
            opts.requests, opts.pipeline, opts.keyspace, opts.value_size,
            opts.set_cmd, opts.get_cmd);
    if (run_bench_with_mask(ctx, &opts, value, value_len, &result) != 0) {
        fprintf(stderr, "benchmark failed, err=%s\n", ctx->err ? ctx->errstr : "reply mismatch");
        free(value);
        redisFree(ctx);
        return 1;
    }
    {
        /* derive throughput and mean latency; guard against division by zero */
        double qps = result.elapsed_sec > 0 ? (double)(result.set_ops + result.get_ops) / result.elapsed_sec : 0.0;
        double avg_us = (result.set_ops + result.get_ops) > 0
                            ? (result.elapsed_sec * 1e6) / (double)(result.set_ops + result.get_ops)
                            : 0.0;
        /* this "[result]" line is parsed by run_bench.hash.sh — keep format stable */
        fprintf(stdout,
                "[result] elapsed=%.3fs total=%" PRIu64 " set=%" PRIu64 " get=%" PRIu64
                " errors=%" PRIu64 " qps=%.0f avg=%.2fus/op\n",
                result.elapsed_sec,
                result.set_ops + result.get_ops,
                result.set_ops,
                result.get_ops,
                result.errors,
                qps,
                avg_us);
    }
    free(value);
    redisFree(ctx);
    return 0;
}

437
test-redis/run_bench.hash.sh Executable file
View File

@@ -0,0 +1,437 @@
#!/usr/bin/env bash
# Fair hash-bench driver: runs ./test-redis/bench against kvstore under
# several persistence strategies and records per-round results as CSV.
set -euo pipefail
# Resolve the repo root from this script's location and run from there.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ROOT_DIR=$(cd "$SCRIPT_DIR/.." && pwd)
cd "$ROOT_DIR"
# Timestamps: TS tags output files; RUN_TIME is the human-readable header.
TS=$(date +%Y%m%d_%H%M%S)
RUN_TIME=$(date '+%Y-%m-%d %H:%M:%S')
OUT_DIR="$ROOT_DIR/test-redis/results"
mkdir -p "$OUT_DIR"
DETAIL_CSV="$OUT_DIR/hash_bench_fair_detail_${TS}.csv"
SUMMARY_CSV="$OUT_DIR/hash_bench_fair_summary_${TS}.csv"
KV_LOG_DIR="/tmp/kv_bench_hash_fair_${TS}"
mkdir -p "$KV_LOG_DIR"
CONFIG_XML="$ROOT_DIR/config/config.xml"
README_MD="$ROOT_DIR/test-redis/README.md"
# Tunables — all overridable from the environment.
ROUNDS=${ROUNDS:-5}
RETRIES=${RETRIES:-3}
REQ=${REQ:-1000000}
PIPE=${PIPE:-128}
KEYSPACE=${KEYSPACE:-1000000}
VSIZE=${VSIZE:-32}
SEED=${SEED:-12345}
ALLOC=${ALLOC:-mypool}
KV_HOST=127.0.0.1
KV_PORT=${KV_PORT:-8888}
SET_CMD=${SET_CMD:-RSET}
GET_CMD=${GET_CMD:-RGET}
# Back up config.xml; the cleanup trap restores it on exit.
ORIG_CONFIG_BACKUP=$(mktemp "/tmp/kvstore_config_backup_${TS}.XXXXXX")
cp "$CONFIG_XML" "$ORIG_CONFIG_BACKUP"
KV_PID=""
# Write CSV headers up front.
printf "strategy,persistence,oplog_sync,allocator,mode,round,qps,avg_us,elapsed_s,key_prefix,seed,requests,pipeline,keyspace,value_size\n" > "$DETAIL_CSV"
printf "strategy,persistence,oplog_sync,allocator,mode,round_qps,round_avg_us,round_elapsed_s,avg_qps,avg_avg_us,avg_elapsed_s\n" > "$SUMMARY_CSV"
# Abort the whole run if CMD is not available on PATH.
require_cmd() {
    local cmd="$1"
    command -v "$cmd" >/dev/null 2>&1 || {
        echo "missing required command: $cmd" >&2
        exit 1
    }
}
# Build kvstore and the bench client when either binary is missing.
ensure_binaries() {
    if [[ -x "$ROOT_DIR/kvstore" && -x "$ROOT_DIR/test-redis/bench" ]]; then
        return 0
    fi
    echo "[info] kvstore/bench missing, running make -j4 ..."
    make -j4
}
# Rewrite config.xml in place for one benchmark case:
#   $1 persistence type, $2 oplog_sync mode, $3 data dir, $4 allocator.
# Server section is pinned to 127.0.0.1:$KV_PORT, master mode, replica off.
set_config() {
    local ptype="$1"
    local oplog_sync="$2"
    local pdir="$3"
    local alloc="$4"
    # Quoted 'PY' heredoc: the python below receives the values as argv.
    python3 - "$CONFIG_XML" "$ptype" "$oplog_sync" "$pdir" "$alloc" "$KV_PORT" <<'PY'
import sys
import xml.etree.ElementTree as ET
path, ptype, oplog_sync, pdir, alloc, kv_port = sys.argv[1:]
tree = ET.parse(path)
root = tree.getroot()
server = root.find("server")
if server is not None:
    ip = server.find("ip")
    port = server.find("port")
    mode = server.find("mode")
    replica = server.find("replica")
    if ip is not None:
        ip.text = "127.0.0.1"
    if port is not None:
        port.text = kv_port
    if mode is not None:
        mode.text = "master"
    if replica is not None:
        replica.text = "disable"
persistence = root.find("persistence")
if persistence is not None:
    t = persistence.find("type")
    d = persistence.find("dir")
    s = persistence.find("oplog_sync")
    if t is not None:
        t.text = ptype
    if d is not None:
        d.text = pdir
    if s is not None:
        s.text = oplog_sync
memory = root.find("memory")
if memory is not None:
    a = memory.find("allocator")
    leak = memory.find("leakage")
    if a is not None:
        a.text = alloc
    if leak is not None:
        leak.text = "disable"
tree.write(path, encoding="UTF-8", xml_declaration=True)
PY
}
# Poll (up to ~20s, 0.1s steps) for a listener on PORT; 0 when open, 1 on timeout.
wait_port_open() {
    local port="$1" attempt
    for attempt in $(seq 1 200); do
        ss -ltn | rg -q ":${port}\\b" && return 0
        sleep 0.1
    done
    return 1
}
# Poll (up to ~20s, 0.1s steps) until nothing listens on PORT; 0 when free, 1 on timeout.
wait_port_close() {
    local port="$1" attempt
    for attempt in $(seq 1 200); do
        if ! ss -ltn | rg -q ":${port}\\b"; then
            return 0
        fi
        sleep 0.1
    done
    return 1
}
# Fail fast when PORT is already bound by some other process.
assert_port_free() {
    local port="$1"
    if ! ss -ltn | rg -q ":${port}\\b"; then
        return 0
    fi
    echo "port ${port} is already in use, cannot start kvstore" >&2
    exit 1
}
# Extract the numeric value of "<key>=<number>" from LINE.
# Prints nothing when the key is absent: the original expression
# (sed "s/.../\1/") echoed the whole line back on a failed match, so the
# caller's empty-string error check could never detect a parse failure.
# -n plus the /p flag prints only on a successful substitution.
extract_metric() {
    local line="$1"
    local key="$2"
    echo "$line" | sed -nE "s/.*${key}=([0-9]+(\\.[0-9]+)?).*/\\1/p"
}
# Run one bench invocation ($1 mode, $2 key prefix, $3 seed, $4 verify flag)
# and print "qps,avg_us,elapsed_s" on stdout.  Full bench output is echoed
# to stderr so stdout stays machine-parseable.  Returns 1 on parse failure.
run_bench_capture() {
    local mode="$1"
    local key_prefix="$2"
    local seed="$3"
    local verify="$4"
    local cmd=(
        ./test-redis/bench
        --host "$KV_HOST"
        --port "$KV_PORT"
        --mode "$mode"
        --set-cmd "$SET_CMD"
        --get-cmd "$GET_CMD"
        --requests "$REQ"
        --pipeline "$PIPE"
        --keyspace "$KEYSPACE"
        --value-size "$VSIZE"
        --seed "$seed"
        --key-prefix "$key_prefix"
    )
    if [[ "$verify" == "1" ]]; then
        cmd+=(--verify-get)
    fi
    local out
    out=$("${cmd[@]}")
    # keep the raw output visible without polluting this function's stdout
    echo "$out" >&2
    local line
    line=$(echo "$out" | rg "\\[result\\]" | tail -n1)
    if [[ -z "$line" ]]; then
        echo "missing [result] line in benchmark output" >&2
        return 1
    fi
    local qps avg elapsed
    qps=$(extract_metric "$line" "qps")
    avg=$(extract_metric "$line" "avg")
    elapsed=$(extract_metric "$line" "elapsed")
    if [[ -z "$qps" || -z "$avg" || -z "$elapsed" ]]; then
        echo "failed to parse benchmark metrics: $line" >&2
        return 1
    fi
    # machine-readable result consumed by run_one_case_round
    echo "$qps,$avg,$elapsed"
}
# Launch ./kvstore in the background (log: $KV_LOG_DIR/<label>.log) and
# wait for it to listen on $KV_PORT; stores its PID in KV_PID.
start_kv() {
    local label="$1"
    assert_port_free "$KV_PORT"
    ./kvstore >"$KV_LOG_DIR/${label}.log" 2>&1 &
    KV_PID=$!
    wait_port_open "$KV_PORT" && return 0
    echo "kvstore start failed for ${label}" >&2
    return 1
}
# Terminate the background kvstore (if still alive) and wait for its port to free up.
stop_kv() {
    local pid="${KV_PID:-}"
    KV_PID=""
    if [[ -n "$pid" ]] && kill -0 "$pid" >/dev/null 2>&1; then
        kill "$pid" >/dev/null 2>&1 || true
        wait "$pid" >/dev/null 2>&1 || true
    fi
    wait_port_close "$KV_PORT" || true
}
# EXIT handler: stop kvstore and restore the original config.xml backup.
cleanup() {
    stop_kv
    [[ -f "$ORIG_CONFIG_BACKUP" ]] || return 0
    cp "$ORIG_CONFIG_BACKUP" "$CONFIG_XML"
    rm -f "$ORIG_CONFIG_BACKUP"
}
trap cleanup EXIT
# Map a strategy name to its "persistence_type,oplog_sync" pair.
case_params() {
    local strategy="$1"
    case "$strategy" in
        persist_no)       echo "incremental,none" ;;
        persist_everysec) echo "incremental,every_sec" ;;
        nopersist)        echo "none,none" ;;
        *)
            echo "unknown strategy: $strategy" >&2
            return 1
            ;;
    esac
}
# Execute one (strategy, round) pair: configure kvstore, run a SET pass then
# a verified GET pass, append both rows to $DETAIL_CSV.  Each attempt gets a
# fresh data dir and a fresh kvstore process; up to $RETRIES attempts.
run_one_case_round() {
    local strategy="$1"
    local round="$2"
    local params ptype oplog_sync
    local pdir_rel pdir_abs
    local key_prefix seed
    local m qps avg elapsed
    local set_qps set_avg set_elapsed
    local get_qps get_avg get_elapsed
    local attempt ok
    params=$(case_params "$strategy")
    IFS=',' read -r ptype oplog_sync <<< "$params"
    # key prefix and seed are per-round so rounds don't share state
    key_prefix="bench:${TS}:round:${round}:"
    seed=$((SEED + round))
    for attempt in $(seq 1 "$RETRIES"); do
        # fresh persistence dir per attempt
        pdir_rel="data/${strategy}_${TS}_r${round}_a${attempt}"
        pdir_abs="$ROOT_DIR/${pdir_rel}"
        rm -rf "$pdir_abs"
        mkdir -p "$pdir_abs"
        set_config "$ptype" "$oplog_sync" "$pdir_rel" "$ALLOC"
        start_kv "${strategy}_r${round}_a${attempt}"
        ok=1
        # SET pass (no verification), then GET pass with content verification
        if ! m=$(run_bench_capture "set" "$key_prefix" "$seed" 0); then
            ok=0
        else
            IFS=',' read -r set_qps set_avg set_elapsed <<< "$m"
        fi
        if [[ "$ok" == "1" ]]; then
            if ! m=$(run_bench_capture "get" "$key_prefix" "$seed" 1); then
                ok=0
            else
                IFS=',' read -r get_qps get_avg get_elapsed <<< "$m"
            fi
        fi
        stop_kv
        if [[ "$ok" == "1" ]]; then
            # record both passes only when the whole attempt succeeded
            printf "%s,%s,%s,%s,set,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" \
                "$strategy" "$ptype" "$oplog_sync" "$ALLOC" "$round" \
                "$set_qps" "$set_avg" "$set_elapsed" "$key_prefix" "$seed" \
                "$REQ" "$PIPE" "$KEYSPACE" "$VSIZE" >> "$DETAIL_CSV"
            printf "%s,%s,%s,%s,get,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" \
                "$strategy" "$ptype" "$oplog_sync" "$ALLOC" "$round" \
                "$get_qps" "$get_avg" "$get_elapsed" "$key_prefix" "$seed" \
                "$REQ" "$PIPE" "$KEYSPACE" "$VSIZE" >> "$DETAIL_CSV"
            return 0
        fi
        echo "[warn] retry $attempt/$RETRIES failed: strategy=$strategy round=$round" >&2
        sleep 0.2
    done
    echo "[error] all retries failed: strategy=$strategy round=$round" >&2
    return 1
}
# Regenerate $SUMMARY_CSV from $DETAIL_CSV and append markdown result
# tables to $README_MD.  All aggregation happens in the embedded Python
# script; bench parameters are forwarded via argv.
append_readme_results() {
    python3 - "$DETAIL_CSV" "$SUMMARY_CSV" "$README_MD" "$RUN_TIME" "$ROUNDS" "$REQ" "$PIPE" "$KEYSPACE" "$VSIZE" "$ALLOC" "$SET_CMD" "$GET_CMD" <<'PY'
import csv
import os
import sys
from collections import defaultdict
detail_csv, summary_csv, readme_path, run_time, rounds, req, pipeline, keyspace, value_size, alloc, set_cmd, get_cmd = sys.argv[1:]
rounds_i = int(rounds)
# Group detail rows by (strategy, mode); remember per-strategy metadata.
group = defaultdict(list)
meta = {}
with open(detail_csv, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for r in reader:
        key = (r["strategy"], r["mode"])
        group[key].append(r)
        meta[r["strategy"]] = (r["persistence"], r["oplog_sync"], r["allocator"])
def f2(x):
    # Render any numeric string with two decimals.
    return f"{float(x):.2f}"
# Write the summary CSV: per-round values joined with '|' plus averages.
# NOTE(review): assumes every strategy/mode combination has at least one
# detail row — an empty group would raise ZeroDivisionError below.
with open(summary_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow([
        "strategy","persistence","oplog_sync","allocator","mode",
        "round_qps","round_avg_us","round_elapsed_s",
        "avg_qps","avg_avg_us","avg_elapsed_s"
    ])
    for strategy in ["persist_no", "persist_everysec", "nopersist"]:
        for mode in ["set", "get"]:
            rows = sorted(group[(strategy, mode)], key=lambda x: int(x["round"]))
            qps = [float(r["qps"]) for r in rows]
            avg_us = [float(r["avg_us"]) for r in rows]
            elapsed = [float(r["elapsed_s"]) for r in rows]
            persistence, oplog_sync, allocator = meta[strategy]
            w.writerow([
                strategy, persistence, oplog_sync, allocator, mode,
                "|".join(f2(v) for v in qps),
                "|".join(f2(v) for v in avg_us),
                "|".join(f2(v) for v in elapsed),
                f2(sum(qps) / len(qps)),
                f2(sum(avg_us) / len(avg_us)),
                f2(sum(elapsed) / len(elapsed)),
            ])
# Re-read the summary and format markdown tables for the README.
rows = []
with open(summary_csv, newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))
def by_mode(mode):
    # Summary rows for one mode, in the canonical strategy order.
    ordered = ["persist_no", "persist_everysec", "nopersist"]
    m = [r for r in rows if r["mode"] == mode]
    m.sort(key=lambda r: ordered.index(r["strategy"]))
    return m
round_headers = [f"Round{i}" for i in range(1, rounds_i + 1)]
detail_rel = os.path.relpath(detail_csv, os.path.dirname(readme_path))
summary_rel = os.path.relpath(summary_csv, os.path.dirname(readme_path))
lines = []
lines.append("")
lines.append(f"## run_bench.hash.sh {rounds_i}轮均值复测({run_time}")
lines.append("")
lines.append(f"- 轮次:{rounds_i} 轮(每种情况)")
lines.append("- 策略persist(no), persist(everysec), nopersist")
lines.append(f"- 命令:{set_cmd}/{get_cmd}GET 保持 prefill + GET")
lines.append("- 公平性:同一 case 同一轮的 SET/GET 使用相同 key_prefix 与 seedcase 顺序按轮次轮转")
lines.append(f"- 参数requests={req} pipeline={pipeline} keyspace={keyspace} value-size={value_size} allocator={alloc}")
lines.append(f"- 明细数据:`{detail_rel}`")
lines.append(f"- 汇总数据:`{summary_rel}`")
lines.append("")
for mode in ["set", "get"]:
    lines.append(f"### kvstore{set_cmd}/{get_cmd} ({mode})")
    lines.append("")
    header = "| 场景 | persistence | oplog_sync | " + " | ".join(round_headers) + " | 均值QPS | 均值avg(us/op) | 均值elapsed(s) |"
    sep = "|---|---|---|" + "|".join(["---:"] * len(round_headers)) + "|---:|---:|---:|"
    lines.append(header)
    lines.append(sep)
    for r in by_mode(mode):
        # round_qps holds '|'-joined per-round values; re-split for columns.
        qps_rounds = r["round_qps"].split("|")
        lines.append(
            "| {} | {} | {} | {} | {} | {} | {} |".format(
                r["strategy"],
                r["persistence"],
                r["oplog_sync"],
                " | ".join(qps_rounds),
                r["avg_qps"],
                r["avg_avg_us"],
                r["avg_elapsed_s"],
            )
        )
    lines.append("")
# Append (never overwrite) the README.
with open(readme_path, "a", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")
PY
}
# Entry point: verify prerequisites, run every strategy for every round
# (rotating which strategy goes first each round to spread ordering
# effects evenly), then aggregate results into CSV + README.
main() {
    require_cmd python3
    require_cmd rg
    require_cmd ss
    ensure_binaries
    local all=(persist_no persist_everysec nopersist)
    local round rot strategy
    for round in $(seq 1 "$ROUNDS"); do
        # Rotation offset 0,1,2,0,... for rounds 1,2,3,4,...
        rot=$(( (round - 1) % 3 ))
        local order=("${all[@]:rot}" "${all[@]:0:rot}")
        for strategy in "${order[@]}"; do
            run_one_case_round "$strategy" "$round"
        done
    done
    append_readme_results
    echo "DETAIL_CSV=$DETAIL_CSV"
    echo "SUMMARY_CSV=$SUMMARY_CSV"
    echo "README_UPDATED=$README_MD"
}

main "$@"

View File

@@ -10,48 +10,49 @@
static void die(redisContext *c, const char *msg) { static void die(redisContext *c, const char *msg) {
fprintf(stderr, "%s: %s\n", msg, c && c->err ? c->errstr : "unknown"); fprintf(stderr, "%s: %s\n", msg, c && c->err ? c->errstr : "unknown");
redisFree(c);
exit(1); exit(1);
} }
static void must_ok(redisReply *r, const char *what) { static int must_ok(redisReply *r, const char *what) {
if (!r) { fprintf(stderr, "%s: reply null\n", what); exit(1); } if (!r) { fprintf(stderr, "%s: reply null\n", what); return -1; }
if (!(r->type == REDIS_REPLY_STATUS && r->str && strcasecmp(r->str, "OK") == 0)) { if (!(r->type == REDIS_REPLY_STATUS && r->str && strcasecmp(r->str, "OK") == 0)) {
fprintf(stderr, "%s: expect +OK, got type=%d str=%s\n", fprintf(stderr, "%s: expect +OK, got type=%d str=%s\n",
what, r->type, r->str ? r->str : "(null)"); what, r->type, r->str ? r->str : "(null)");
freeReplyObject(r); freeReplyObject(r);
exit(1); return -1;
} }
freeReplyObject(r); freeReplyObject(r);
} }
static void must_int(redisReply *r, long long expect, const char *what) { static int must_int(redisReply *r, long long expect, const char *what) {
if (!r) { fprintf(stderr, "%s: reply null\n", what); exit(1); } if (!r) { fprintf(stderr, "%s: reply null\n", what); return -1; }
if (r->type != REDIS_REPLY_INTEGER || r->integer != expect) { if (r->type != REDIS_REPLY_INTEGER || r->integer != expect) {
fprintf(stderr, "%s: expect :%lld, got type=%d int=%lld\n", fprintf(stderr, "%s: expect :%lld, got type=%d int=%lld\n",
what, expect, r->type, (long long)r->integer); what, expect, r->type, (long long)r->integer);
freeReplyObject(r); freeReplyObject(r);
exit(1); return -1;
} }
freeReplyObject(r); freeReplyObject(r);
} }
static void must_bulk_eq(redisReply *r, const void *buf, size_t n, const char *what) { static int must_bulk_eq(redisReply *r, const void *buf, size_t n, const char *what) {
if (!r) { fprintf(stderr, "%s: reply null\n", what); exit(1); } if (!r) { fprintf(stderr, "%s: reply null\n", what); return -1; }
if (r->type != REDIS_REPLY_STRING || r->len != n || memcmp(r->str, buf, n) != 0) { if (r->type != REDIS_REPLY_STRING || r->len != n || memcmp(r->str, buf, n) != 0) {
fprintf(stderr, "%s: bulk mismatch. type=%d len=%zu\n", what, r->type, r->len); fprintf(stderr, "%s: bulk mismatch. type=%d len=%zu\n", what, r->type, r->len);
fprintf(stderr, "expect:%s, truely:%s\n", (const char*)buf, r->str); fprintf(stderr, "expect:%s, truely:%s\n", (const char*)buf, r->str);
freeReplyObject(r); freeReplyObject(r);
exit(1); return -1;
} }
freeReplyObject(r); freeReplyObject(r);
} }
static void must_nil(redisReply *r, const char *what) { static int must_nil(redisReply *r, const char *what) {
if (!r) { fprintf(stderr, "%s: reply null\n", what); exit(1); } if (!r) { fprintf(stderr, "%s: reply null\n", what); return -1; }
if (r->type != REDIS_REPLY_NIL) { if (r->type != REDIS_REPLY_NIL) {
fprintf(stderr, "%s: expect nil, got type=%d\n", what, r->type); fprintf(stderr, "%s: expect nil, got type=%d\n", what, r->type);
freeReplyObject(r); freeReplyObject(r);
exit(1); return -1;
} }
freeReplyObject(r); freeReplyObject(r);
} }
@@ -274,7 +275,9 @@ int main(int argc, char **argv) {
redisContext *c = redisConnect(host, port); redisContext *c = redisConnect(host, port);
if (!c || c->err) die(c, "connect failed"); if (!c || c->err) die(c, "connect failed");
printf("Connected to %s:%d\n", host, port); redisReply *reply = redisCommand(c, "MEMPRINT");
printf("Connected to %s:%d, %lld\n", host, port, reply->integer);
if(mode == 0){ if(mode == 0){
save(c); save(c);

View File

@@ -1,220 +0,0 @@
#include "test_client.h"
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
/* Bounds probe: return 0 iff n more bytes fit between p and end, else -1. */
int kvs_need(const uint8_t *p, const uint8_t *end, size_t n) {
    if (p + n <= end) {
        return 0;
    }
    return -1;
}
/* NOTE: single bytes need no ntoh/hton byte-order conversion. */
int kvs_read_u8(const uint8_t **pp, const uint8_t *end, uint8_t *out) {
    const uint8_t *cur = *pp;
    if (cur >= end) return -1;   /* need one readable byte */
    *out = *cur++;
    *pp = cur;
    return 0;
}
/* Read a big-endian u16 and advance *pp; -1 if fewer than 2 bytes remain. */
int kvs_read_u16(const uint8_t **pp, const uint8_t *end, uint16_t *out) {
    const uint8_t *cur = *pp;
    uint16_t net;
    if (end - cur < 2) return -1;
    memcpy(&net, cur, sizeof net);   /* memcpy avoids misaligned access */
    *out = ntohs(net);
    *pp = cur + sizeof net;
    return 0;
}
/* Read a big-endian u32 and advance *pp; -1 if fewer than 4 bytes remain. */
int kvs_read_u32(const uint8_t **pp, const uint8_t *end, uint32_t *out) {
    const uint8_t *cur = *pp;
    uint32_t net;
    if (end - cur < 4) return -1;
    memcpy(&net, cur, sizeof net);   /* memcpy avoids misaligned access */
    *out = ntohl(net);
    *pp = cur + sizeof net;
    return 0;
}
/* Write one byte and advance *pp; -1 when the buffer is exhausted. */
int kvs_write_u8(uint8_t **pp, const uint8_t *end, uint8_t v) {
    uint8_t *cur = *pp;
    if (cur >= end) return -1;
    *cur++ = v;
    *pp = cur;
    return 0;
}
/* Write a u16 in network byte order; -1 if fewer than 2 bytes remain. */
int kvs_write_u16(uint8_t **pp, const uint8_t *end, uint16_t v) {
    uint8_t *cur = *pp;
    uint16_t net = htons(v);
    if (end - cur < 2) return -1;
    memcpy(cur, &net, sizeof net);
    *pp = cur + sizeof net;
    return 0;
}
/* Write a u32 in network byte order; -1 if fewer than 4 bytes remain. */
int kvs_write_u32(uint8_t **pp, const uint8_t *end, uint32_t v) {
    uint8_t *cur = *pp;
    uint32_t net = htonl(v);
    if (end - cur < 4) return -1;
    memcpy(cur, &net, sizeof net);
    *pp = cur + sizeof net;
    return 0;
}
/*
 * Serialize one request into buf using the wire format
 *   | OP(1) | argc(1) | repeat { arglen(4, big-endian) | arg } |
 * key and/or value may be NULL; argc reflects how many are present.
 * Returns the encoded length in bytes, or -1 on NULL buf / overflow.
 * NOTE(review): buf is assumed to hold at least CMD_SIZE bytes — confirm
 * at every call site.
 */
int getcmd(uint8_t op, const char *key, uint32_t key_len, const char *value, uint32_t value_len, uint8_t *buf){
    if(!buf) return -1;
    uint8_t *end = buf + CMD_SIZE;
    uint8_t *p = buf;
    /* argc counts the arguments that follow (0..2) */
    uint8_t argc = (key == NULL)?0:1;
    argc += (value == NULL)?0:1;
    if (kvs_write_u8(&p, end, op) < 0) return -1;
    if (kvs_write_u8(&p, end, argc) < 0) return -1;
    // write key: 4-byte big-endian length, then raw bytes
    if(key){
        int keylen = key_len;
        if (kvs_write_u32(&p, end, keylen) < 0) return -1;
        if (kvs_need(p, end, keylen) < 0) return -1;
        if (keylen > 0) {
            memcpy(p, key, keylen);
            p += keylen;
        }
    }
    // write value the same way
    if(value){
        int vallen = value_len;
        if (kvs_write_u32(&p, end, vallen) < 0) return -1;
        if (kvs_need(p, end, vallen) < 0) return -1;
        if (vallen > 0) {
            memcpy(p, value, vallen);
            p += vallen;
        }
    }
    return (p - buf);
}
/*
 * Decode one response: | OP(1) | status(1) | datalen(4, big-endian) | data |
 * On success fills rsp (rsp->data points INTO buf — zero-copy, caller must
 * keep buf alive) and returns the number of bytes consumed.
 * Returns 0 for an empty buffer, -1 when truncated or malformed.
 */
int parse_response(const uint8_t *buf, int buflen, kvs_response_t *rsp) {
    if(buflen == 0) return 0;
    const uint8_t *p = buf;
    const uint8_t *end = buf + buflen;
    // read OP
    if (kvs_read_u8(&p, end, &rsp->op) < 0) {
        fprintf(stderr, "Failed to read op\n");
        return -1;
    }
    // read status
    if (kvs_read_u8(&p, end, &rsp->status) < 0) {
        fprintf(stderr, "Failed to read status\n");
        return -1;
    }
    // read datalen
    if (kvs_read_u32(&p, end, &rsp->datalen) < 0) {
        fprintf(stderr, "Failed to read datalen\n");
        return -1;
    }
    // verify the full payload is present in the buffer
    if (kvs_need(p, end, rsp->datalen) < 0) {
        fprintf(stderr, "Data length mismatch: expected %u bytes, but only %ld available\n",
            rsp->datalen, end - p);
        return -1;
    }
    // point at the payload without copying
    rsp->data = (uint8_t *)p;
    return (p - buf) + rsp->datalen;
}
/*
 * Pretty-print one response.  GET-style replies dump the payload — as a
 * quoted string when every byte is printable ASCII, as hex otherwise.
 * All other ops print the status code symbolically.
 */
void print_response(const char *cmd_name, const kvs_response_t *rsp) {
    printf("%s ", cmd_name);
    if(rsp->op == KVS_CMD_GET || rsp->op == KVS_CMD_HGET || rsp->op == KVS_CMD_RGET){
        if (rsp->datalen > 0 && rsp->data != NULL) {
            printf("Data: ");
            // print as a string only if every byte is printable ASCII
            int is_printable = 1;
            for (uint32_t i = 0; i < rsp->datalen; i++) {
                if (rsp->data[i] < 32 || rsp->data[i] > 126) {
                    is_printable = 0;
                    break;
                }
            }
            if (is_printable) {
                printf("\"");
                for (uint32_t i = 0; i < rsp->datalen; i++) {
                    printf("%c", rsp->data[i]);
                }
                printf("\"\n");
            } else {
                // otherwise dump the payload as hex
                printf("0x");
                for (uint32_t i = 0; i < rsp->datalen; i++) {
                    printf("%02x", rsp->data[i]);
                }
                printf("\n");
            }
        } else {
            printf("Data: (empty)\n");
        }
    }else {
        switch (rsp->status) {
            case KVS_STATUS_OK:
                printf("(OK)\n");
                break;
            case KVS_STATUS_ERROR:
                printf("(ERROR)\n");
                break;
            case KVS_STATUS_NO_EXIST:
                printf("(NO_EXIST)\n");
                break;
            case KVS_STATUS_EXIST:
                printf("(EXISTS)\n");
                break;
            default:
                printf("(UNKNOWN)\n");
                break;
        }
    }
}
/*
 * Compare a parsed response against expectations.
 * expected_data == NULL skips the payload comparison entirely.
 * Returns 1 on match, 0 after printing the first mismatch found.
 */
int verify_response(const kvs_response_t *rsp, uint8_t expected_op,
    uint8_t expected_status, const char *expected_data, uint32_t expected_len) {
    if (rsp->op != expected_op) {
        printf("❌ OP mismatch: expected %u, got %u\n", expected_op, rsp->op);
        return 0;
    }
    if (rsp->status != expected_status) {
        printf("❌ Status mismatch: expected %u, got %u\n", expected_status, rsp->status);
        return 0;
    }
    if (expected_data != NULL) {
        if (rsp->datalen != expected_len) {
            printf("❌ Data length mismatch: expected %u, got %u\n", expected_len, rsp->datalen);
            return 0;
        }
        if (memcmp(rsp->data, expected_data, expected_len) != 0) {
            printf("❌ Data content mismatch\n");
            return 0;
        }
    }
    return 1;
}

View File

@@ -1,161 +0,0 @@
/**
* Request
* Cmd: | OP(1) | argc(1) | repeat { arglen(4) | arg } |
*
* Response
* Rsp: | OP(1) | status(1) | datalen(4) | data |
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#define CMD_SIZE (1024)
#define BATCH_SIZE (65536)
#define KVS_BATCH_MAX 128
#define TIME_SUB_MS(tv1, tv2) ((tv1.tv_sec - tv2.tv_sec) * 1000 + (tv1.tv_usec - tv2.tv_usec) / 1000)
// #define PRESP print_response
#define PRESP
typedef enum {
KVS_STATUS_OK = 0,
KVS_STATUS_ERROR = 1,
KVS_STATUS_NO_EXIST = 2,
KVS_STATUS_EXIST = 3,
KVS_STATUS_BADREQ = 4
}rsp_ret_status_e;
enum {
KVS_CMD_START = 0,
// array
KVS_CMD_SET = KVS_CMD_START,
KVS_CMD_GET,
KVS_CMD_DEL,
KVS_CMD_MOD,
KVS_CMD_EXIST,
// rbtree
KVS_CMD_RSET,
KVS_CMD_RGET,
KVS_CMD_RDEL,
KVS_CMD_RMOD,
KVS_CMD_REXIST,
// hash
KVS_CMD_HSET,
KVS_CMD_HGET,
KVS_CMD_HDEL,
KVS_CMD_HMOD,
KVS_CMD_HEXIST,
KVS_CMD_SSYNC,
KVS_CMD_SAVE,
KVS_CMD_COUNT,
};
typedef struct {
uint8_t op;
uint8_t status;
uint32_t datalen;
uint8_t *data;
} kvs_response_t;
int kvs_need(const uint8_t *p, const uint8_t *end, size_t n);
int kvs_read_u8(const uint8_t **pp, const uint8_t *end, uint8_t *out);
int kvs_read_u16(const uint8_t **pp, const uint8_t *end, uint16_t *out);
int kvs_read_u32(const uint8_t **pp, const uint8_t *end, uint32_t *out);
int kvs_write_u8(uint8_t **pp, const uint8_t *end, uint8_t v);
int kvs_write_u16(uint8_t **pp, const uint8_t *end, uint16_t v);
int kvs_write_u32(uint8_t **pp, const uint8_t *end, uint32_t v);
int getcmd(uint8_t op, const char *key, uint32_t key_len, const char *value, uint32_t value_len, uint8_t *buf);
int parse_response(const uint8_t *buf, int buflen, kvs_response_t *rsp);
void print_response(const char *cmd_name, const kvs_response_t *rsp);
int verify_response(const kvs_response_t *rsp, uint8_t expected_op,
uint8_t expected_status, const char *expected_data, uint32_t expected_len);
/* A pipelined request batch: serialized commands plus per-command bookkeeping. */
typedef struct {
    uint8_t buf[BATCH_SIZE];     // serialized commands, back to back
    int len;                     // bytes currently written into buf
    int cnt;                     // number of commands in this batch
    int cmd_len[KVS_BATCH_MAX];  // encoded length of each command
} kvs_batch_t;
/* Reset a batch to the empty state (buf contents are left untouched). */
static void kvs_batch_init(kvs_batch_t *b)
{
    b->cnt = 0;
    b->len = 0;
    memset(b->cmd_len, 0, sizeof b->cmd_len);
}
/**
 * Encode one command with getcmd() and append it to the batch buffer.
 * Returns 0 on success, -1 on failure (batch full or buffer too small).
 */
static int kvs_batch_add(kvs_batch_t *b, uint8_t op, const char *key, uint32_t key_len, const char *value, uint32_t value_len){
    if (b->cnt >= KVS_BATCH_MAX) return -1;
    uint8_t tmp[CMD_SIZE];
    int n = getcmd(op, key, key_len, value, value_len, tmp); // wire-format encoder
    if (n <= 0) return -1;
    if (b->len + n > (int)sizeof(b->buf)) return -1;
    memcpy(b->buf + b->len, tmp, n);
    b->cmd_len[b->cnt] = n;
    b->cnt++;
    b->len += n;
    return 0;
}
/**
 * Send the whole batch with a single send() call.
 * Returns the number of bytes sent, < 0 on failure.
 * NOTE(review): a short send (fewer than b->len bytes) is not retried
 * here — confirm callers tolerate partial writes.
 */
static int kvs_batch_send(int fd, const kvs_batch_t *b)
{
    // printf("send : %d\n", b->len);
    return (int)send(fd, b->buf, b->len, 0);
}
/**
 * Receive and parse responses for every command in the batch.
 *
 * Keeps reading from fd, appending into recvbuf, until b->cnt complete
 * responses have been decoded into rsps[].  A response split across two
 * recv() calls is retried once more data arrives.
 *
 * Fixes vs. the previous version:
 *  - recv() was always given the full recvbuf_cap even after earlier
 *    bytes were buffered, so a multi-recv exchange could write past the
 *    end of recvbuf;
 *  - a partially received response was re-parsed at the wrong offset
 *    with the wrong remaining length, corrupting the results.
 *
 * Returns b->cnt on success, -1 on socket error/EOF or when recvbuf
 * fills up before all responses are complete.
 */
static int kvs_batch_recv_parse(int fd,
                                const kvs_batch_t *b,
                                kvs_response_t *rsps, // output array, length >= b->cnt
                                uint8_t *recvbuf,
                                int recvbuf_cap)
{
    int parsed = 0;   /* complete responses decoded so far */
    int filled = 0;   /* total bytes received into recvbuf */
    int used = 0;     /* bytes already consumed by the parser */
    while (parsed < b->cnt) {
        if (filled >= recvbuf_cap) return -1; /* buffer full, cannot make progress */
        int nrecv = (int)recv(fd, recvbuf + filled, recvbuf_cap - filled, 0);
        if (nrecv <= 0) return -1;            /* error or peer closed */
        filled += nrecv;
        /* Drain every complete response currently buffered. */
        while (parsed < b->cnt) {
            int consumed = parse_response(recvbuf + used, filled - used, &rsps[parsed]);
            if (consumed <= 0) break;         /* incomplete: wait for more data */
            used += consumed;
            parsed++;
        }
    }
    return parsed;
}

View File

@@ -1,539 +0,0 @@
#include "test_client.h"

#include <arpa/inet.h>
#include <unistd.h>
/*
 * Open a blocking TCP connection to ip:port.
 * Returns the connected fd, or -1 on failure (error printed via perror).
 * Fixes: socket() result was unchecked, and the fd leaked when
 * connect() failed.
 */
int connect_tcpserver(const char *ip, unsigned short port) {
    int connfd = socket(AF_INET, SOCK_STREAM, 0);
    if (connfd < 0) {
        perror("socket");
        return -1;
    }
    struct sockaddr_in server_addr;
    memset(&server_addr, 0, sizeof(struct sockaddr_in));
    server_addr.sin_family = AF_INET;
    server_addr.sin_addr.s_addr = inet_addr(ip);
    server_addr.sin_port = htons(port);
    if (0 != connect(connfd, (struct sockaddr*)&server_addr, sizeof(struct sockaddr_in))) {
        perror("connect");
        close(connfd);  /* don't leak the descriptor on failure */
        return -1;
    }
    return connfd;
}
/* Blocking send of one buffer; terminates the process on socket error. */
int send_msg(int connfd, char *msg, int length) {
    int sent = send(connfd, msg, length, 0);
    if (sent >= 0) {
        return sent;
    }
    perror("send");
    exit(1);
}
/* Blocking recv into one buffer; terminates the process on socket error. */
int recv_msg(int connfd, char *msg, int length) {
    int got = recv(connfd, msg, length, 0);
    if (got >= 0) {
        return got;
    }
    perror("recv");
    exit(1);
}
/*
 * Send one command and verify the reply.
 * op/key/value build the request; st/rsp_value/expect_len describe the
 * expected status and payload; command_name labels any failure output.
 * NOTE(review): assumes the whole response arrives in a single recv()
 * — confirm this holds for the server's write pattern.
 */
void testcase(int connfd, uint8_t op, const char* key, uint32_t key_len, const char* value,
    uint32_t value_len, rsp_ret_status_e st, const char* rsp_value, uint32_t expect_len, const char* command_name){
    uint8_t buf[CMD_SIZE];
    uint8_t result[CMD_SIZE];
    kvs_response_t rsp;
    int len, recv_len;
    len = getcmd(op, key, key_len, value, value_len, buf);
    send_msg(connfd, buf, len);
    recv_len = recv_msg(connfd, result, CMD_SIZE);
    if (parse_response(result, recv_len, &rsp) > 0) {
        PRESP(command_name, &rsp);  /* no-op unless PRESP maps to print_response */
        if(!verify_response(&rsp, op, st, rsp_value, expect_len)) printf("%s\n", command_name);
    }else{
        printf("parser error\n");
    }
    return ;
}
/*
 * Array-backend smoke test: `count` iterations of a 9-op CRUD sequence
 * (set/get/mod/exist/del plus not-found paths), then print elapsed time
 * and QPS.
 */
void array_testcase_1w(int connfd) {
    int count = 1000;
    int i = 0;
    struct timeval tv_begin;
    gettimeofday(&tv_begin, NULL);
    for (i = 0;i < count;i ++) {
        testcase(connfd, KVS_CMD_SET, "name", 4, "l\r\0n", 4, KVS_STATUS_OK, NULL, 0, "SET NAME");
        testcase(connfd, KVS_CMD_GET, "name", 4, NULL, 0, KVS_STATUS_OK, "l\r\0n", 4, "GET NAME");
        testcase(connfd, KVS_CMD_MOD, "name", 4, "liu", 3, KVS_STATUS_OK, NULL, 0, "MOD NAME");
        testcase(connfd, KVS_CMD_GET, "name", 4, NULL, 0, KVS_STATUS_OK, "liu", 3, "GET NAME");
        testcase(connfd, KVS_CMD_EXIST, "name", 4, NULL, 0, KVS_STATUS_EXIST, NULL, 0, "EXIST NAME");
        testcase(connfd, KVS_CMD_DEL, "name", 4, NULL, 0, KVS_STATUS_OK, NULL, 0, "DEL NAME");
        testcase(connfd, KVS_CMD_EXIST, "name", 4, NULL, 0, KVS_STATUS_NO_EXIST, NULL, 0, "EXIST NAME");
        testcase(connfd, KVS_CMD_MOD, "stu", 3, "liu", 3, KVS_STATUS_NO_EXIST, NULL, 0, "MOD NAME");
        testcase(connfd, KVS_CMD_DEL, "stu", 3, NULL, 0, KVS_STATUS_NO_EXIST, NULL, 0, "DEL SUT");
    }
    struct timeval tv_end;
    gettimeofday(&tv_end, NULL);
    int time_used = TIME_SUB_MS(tv_end, tv_begin); // ms
    if (time_used <= 0) time_used = 1; /* fix: avoid division by zero on sub-ms runs */
    /* fix: derive ops from count instead of the hard-coded 9000 */
    printf("array testcase --> time_used: %d, qps: %d\n", time_used, 9 * count * 1000 / time_used);
}
/*
 * Rbtree-backend smoke test: `count` iterations of a 9-op CRUD sequence,
 * then print elapsed time and QPS.
 */
void rbtree_testcase_1w(int connfd) {
    int count = 1000;
    int i = 0;
    struct timeval tv_begin;
    gettimeofday(&tv_begin, NULL);
    for (i = 0;i < count;i ++) {
        testcase(connfd, KVS_CMD_RSET, "name", 4, "l\r\0n", 4, KVS_STATUS_OK, NULL, 0, "RSET NAME");
        testcase(connfd, KVS_CMD_RGET, "name", 4, NULL, 0, KVS_STATUS_OK, "l\r\0n", 4, "RGET NAME");
        testcase(connfd, KVS_CMD_RMOD, "name", 4, "liu", 3, KVS_STATUS_OK, NULL, 0, "RMOD NAME");
        testcase(connfd, KVS_CMD_RGET, "name", 4, NULL, 0, KVS_STATUS_OK, "liu", 3, "RGET NAME");
        testcase(connfd, KVS_CMD_REXIST, "name", 4, NULL, 0, KVS_STATUS_EXIST, NULL, 0, "REXIST NAME");
        testcase(connfd, KVS_CMD_RDEL, "name", 4, NULL, 0, KVS_STATUS_OK, NULL, 0, "RDEL NAME");
        testcase(connfd, KVS_CMD_REXIST, "name", 4, NULL, 0, KVS_STATUS_NO_EXIST, NULL, 0, "REXIST NAME");
        testcase(connfd, KVS_CMD_RMOD, "stu", 3, "liu", 3, KVS_STATUS_NO_EXIST, NULL, 0, "RMOD NAME");
        testcase(connfd, KVS_CMD_RDEL, "stu", 3, NULL, 0, KVS_STATUS_NO_EXIST, NULL, 0, "RDEL SUT");
    }
    struct timeval tv_end;
    gettimeofday(&tv_end, NULL);
    int time_used = TIME_SUB_MS(tv_end, tv_begin); // ms
    if (time_used <= 0) time_used = 1; /* fix: avoid division by zero on sub-ms runs */
    /* fix: label said "array testcase"; ops derived from count, not 9000 */
    printf("rbtree testcase --> time_used: %d, qps: %d\n", time_used, 9 * count * 1000 / time_used);
}
/*
 * Hash-backend smoke test: `count` iterations of a 9-op CRUD sequence,
 * then print elapsed time and QPS.
 */
void hash_testcase_1w(int connfd) {
    int count = 1000;
    int i = 0;
    struct timeval tv_begin;
    gettimeofday(&tv_begin, NULL);
    for (i = 0;i < count;i ++) {
        testcase(connfd, KVS_CMD_HSET, "name", 4, "l\r\0n", 4, KVS_STATUS_OK, NULL, 0, "HSET NAME");
        testcase(connfd, KVS_CMD_HGET, "name", 4, NULL, 0, KVS_STATUS_OK, "l\r\0n", 4, "HGET NAME");
        testcase(connfd, KVS_CMD_HMOD, "name", 4, "liu", 3, KVS_STATUS_OK, NULL, 0, "HMOD NAME");
        testcase(connfd, KVS_CMD_HGET, "name", 4, NULL, 0, KVS_STATUS_OK, "liu", 3, "HGET NAME");
        testcase(connfd, KVS_CMD_HEXIST, "name", 4, NULL, 0, KVS_STATUS_EXIST, NULL, 0, "HEXIST NAME");
        testcase(connfd, KVS_CMD_HDEL, "name", 4, NULL, 0, KVS_STATUS_OK, NULL, 0, "HDEL NAME");
        testcase(connfd, KVS_CMD_HEXIST, "name", 4, NULL, 0, KVS_STATUS_NO_EXIST, NULL, 0, "HEXIST NAME");
        testcase(connfd, KVS_CMD_HMOD, "stu", 3, "liu", 3, KVS_STATUS_NO_EXIST, NULL, 0, "HMOD NAME");
        testcase(connfd, KVS_CMD_HDEL, "stu", 3, NULL, 0, KVS_STATUS_NO_EXIST, NULL, 0, "HDEL SUT");
    }
    struct timeval tv_end;
    gettimeofday(&tv_end, NULL);
    int time_used = TIME_SUB_MS(tv_end, tv_begin); // ms
    if (time_used <= 0) time_used = 1; /* fix: avoid division by zero on sub-ms runs */
    /* fix: label said "array testcase" */
    printf("hash testcase --> time_used: %d, qps: %d\n", time_used, 9*count * 1000 / time_used);
}
/*
 * Build one batch of 100 commands with numbered key (and value) suffixes,
 * send it in a single syscall, then receive, parse and verify all replies.
 * rsp_value, when non-NULL, is the expected payload prefix per reply;
 * when NULL an empty payload is expected (vlen is forced to 0 below).
 */
void do_batch_test(int fd, int op, const char *key, const char *value, rsp_ret_status_e st, const char *rsp_value){
    kvs_batch_t batch;
    kvs_batch_init(&batch);
    char bkey[256]={0}, bval[256]={0};
    // build the batch: 100 commands (KVS_BATCH_MAX is the hard cap)
    for(int i = 0;i < 100; ++ i){
        if(value == NULL){
            int klen = sprintf(bkey, "%s%d", key, i);
            kvs_batch_add(&batch, op, bkey, klen, NULL, 0);
        }else{
            int klen = sprintf(bkey, "%s%d", key, i);
            int vlen = sprintf(bval, "%s%d", value, i);
            kvs_batch_add(&batch, op, bkey, klen, bval, vlen);
        }
    }
    // one send for the whole batch
    kvs_batch_send(fd, &batch);
    // one (logical) receive + parse for all responses
    uint8_t recvbuf[BATCH_SIZE];
    kvs_response_t rsps[KVS_BATCH_MAX];
    int nrsp = kvs_batch_recv_parse(fd, &batch, rsps, recvbuf, sizeof(recvbuf));
    // verify each reply; note: bkey still holds the LAST key built above,
    // so the print label is the same for every response
    for (int i = 0; i < nrsp; i++) {
        print_response(bkey, &rsps[i]);
        int vlen;
        if(rsp_value != NULL) vlen = sprintf(bval, "%s%d", rsp_value, i);
        else vlen = 0;
        verify_response(&rsps[i], op, st, bval, vlen);
    }
}
/*
 * Same 9-op CRUD sequence as array_testcase_1w, but the 9 commands of
 * each iteration are pipelined as one batch (single send + batched recv),
 * measuring pipelined throughput instead of per-request round trips.
 */
void array_testcase_1w_batch(int connfd) {
    kvs_batch_t batch;
    kvs_batch_init(&batch);
    int count = 1000;
    int i = 0;
    struct timeval tv_begin;
    gettimeofday(&tv_begin, NULL);
    for (i = 0;i < count;i ++) {
        // reuse the batch buffer: reset the counters instead of re-initializing
        batch.cnt = 0;
        batch.len = 0;
        kvs_batch_add(&batch, KVS_CMD_SET, "name", 4, "l\r\0n", 4);
        kvs_batch_add(&batch, KVS_CMD_GET, "name", 4, NULL, 0);
        kvs_batch_add(&batch, KVS_CMD_MOD, "name", 4, "liu", 3);
        kvs_batch_add(&batch, KVS_CMD_GET, "name", 4, NULL, 0);
        kvs_batch_add(&batch, KVS_CMD_EXIST, "name", 4, NULL, 0);
        kvs_batch_add(&batch, KVS_CMD_DEL, "name", 4, NULL, 0);
        kvs_batch_add(&batch, KVS_CMD_EXIST, "name", 4, NULL, 0);
        kvs_batch_add(&batch, KVS_CMD_MOD, "stu", 3, "liu", 3);
        kvs_batch_add(&batch, KVS_CMD_DEL, "stu", 3, NULL, 0);
        kvs_batch_send(connfd, &batch);
        uint8_t recvbuf[BATCH_SIZE];
        kvs_response_t rsps[KVS_BATCH_MAX];
        // nrsp is intentionally unchecked — this loop only measures throughput
        int nrsp = kvs_batch_recv_parse(connfd, &batch, rsps, recvbuf, sizeof(recvbuf));
    }
    struct timeval tv_end;
    gettimeofday(&tv_end, NULL);
    int time_used = TIME_SUB_MS(tv_end, tv_begin); // ms
    printf("array testcase --> time_used: %d, qps: %d\n", time_used, 9000 * 1000 / time_used);
}
/*
 * Batched QPS stress over the rbtree backend.
 * Phase 1: 2 inserts + 1 delete per logical round (A and B inserted,
 * A deleted); Phase 2: 1 insert + 2 deletes (C inserted, B and C
 * deleted) so the store drains again.  Every call to do_batch_test()
 * covers 100 numbered keys.  Prints total ops and QPS at the end.
 */
void batch_qps(int connfd) {
    const int N = 1000000;
    const int B = 100; // must match the 100 commands hard-coded in do_batch_test()
    static char valA[256];
    static char valB[256];
    static char valC[256];
    static int inited = 0;
    if (!inited) {
        // fill 255 payload bytes, NUL-terminate at the end
        memset(valA, 'A', 255); valA[255] = '\0';
        memset(valB, 'B', 255); valB[255] = '\0';
        memset(valC, 'C', 255); valC[255] = '\0';
        inited = 1;
    }
    struct timeval tv_begin, tv_end;
    gettimeofday(&tv_begin, NULL);
    // ---------------- Phase 1: 2 ADDs + 1 DEL per round (1M) ----------------
    // each round: RSET A_i, RSET B_i, RDEL A_i
    // batched: each do_batch_test() call covers 100 numbered keys
    for (int base = 0; base < N; base += B) {
        // prefix must stay short so bkey in do_batch_test() cannot overflow;
        // do_batch_test() generates prefix + i (0..99), and encoding `base`
        // into the prefix keeps every key globally unique
        char preA[16], preB[16];
        // e.g. A123450_0..A123450_99 (base=123450); keep prefix <= ~10 chars
        snprintf(preA, sizeof(preA), "A%d_", base/100);
        snprintf(preB, sizeof(preB), "B%d_", base/100);
        do_batch_test(connfd, KVS_CMD_RSET, preA, valA, KVS_STATUS_OK, NULL); // 100x RSET A
        do_batch_test(connfd, KVS_CMD_RSET, preB, valB, KVS_STATUS_OK, NULL); // 100x RSET B
        do_batch_test(connfd, KVS_CMD_RDEL, preA, NULL, KVS_STATUS_OK, NULL); // 100x RDEL A
        if (base % 10000 == 0) printf("P1 base:%d\n", base);
    }
    printf("phase 1 end\n");
    // ---------------- Phase 2: 1 ADD + 2 DELs per round (1M) ----------------
    // each round: RSET C_i, RDEL B_i, RDEL C_i
    for (int base = 0; base < N; base += B) {
        char preB[16], preC[16];
        snprintf(preB, sizeof(preB), "B%d_", base/100);
        snprintf(preC, sizeof(preC), "C%d_", base/100);
        do_batch_test(connfd, KVS_CMD_RSET, preC, valC, KVS_STATUS_OK, NULL); // 100x RSET C
        do_batch_test(connfd, KVS_CMD_RDEL, preB, NULL, KVS_STATUS_OK, NULL); // 100x RDEL B
        do_batch_test(connfd, KVS_CMD_RDEL, preC, NULL, KVS_STATUS_OK, NULL); // 100x RDEL C
        if (base % 10000 == 0) printf("P2 base:%d\n", base);
    }
    printf("phase 2 end\n");
    gettimeofday(&tv_end, NULL);
    int time_used = TIME_SUB_MS(tv_end, tv_begin);
    // total ops = 6*N (6 operations per logical round)
    long long ops = (long long)N * 6;
    long long qps = (time_used > 0) ? (ops * 1000 / time_used) : 0;
    printf("BATCH(do_batch_test) ADD2-DEL1 then ADD1-DEL2 (N=%d) --> time_used=%d ms, ops=%lld, qps=%lld\n",
        N, time_used, ops, qps);
}
/* Ask the server to persist its state and expect an OK reply. */
void save(int connfd){
    testcase(connfd, KVS_CMD_SAVE, NULL, 0, NULL, 0, KVS_STATUS_OK, NULL, 0, "SAVE");
}
/*
 * Churn pattern over 1M logical rounds, one request per round trip:
 * Phase 1 adds two keys and deletes one (A_i, B_i added; A_i deleted —
 * B_i survives); Phase 2 adds one and deletes two (C_i added; B_i and
 * C_i deleted) so the store drains again.  Prints total ops and QPS.
 */
void testcase_add2_del1_then_add1_del2_100w(int connfd) {
    const int N = 1000000;
    // NOTE(review): the original intent was ADD semantics (duplicate key
    // returns EXIST); RSET is used here — confirm the server's RSET
    // semantics match what this test expects.
    const char *valA = "va";
    const char *valB = "vb";
    const char *valC = "vc";
    char keyA[64], keyB[64], keyC[64];
    struct timeval tv_begin, tv_end;
    gettimeofday(&tv_begin, NULL);
    // ---------------- Phase 1: 2 ADDs + 1 DEL per round (1M) ----------------
    // each round: ADD A_i, ADD B_i, DEL A_i -> B_i survives
    for (int i = 0; i < N; i++) {
        int klenA = snprintf(keyA, sizeof(keyA), "A_%d", i);
        int klenB = snprintf(keyB, sizeof(keyB), "B_%d", i);
        testcase(connfd, KVS_CMD_RSET, keyA, klenA, valA, 2, KVS_STATUS_OK, NULL, 0, "P1 ADD A_i");
        testcase(connfd, KVS_CMD_RSET, keyB, klenB, valB, 2, KVS_STATUS_OK, NULL, 0, "P1 ADD B_i");
        testcase(connfd, KVS_CMD_RDEL, keyA, klenA, NULL, 0, KVS_STATUS_OK, NULL, 0, "P1 DEL A_i");
        if(i%10000 == 0) printf("i:%d\n", i);
    }
    printf("phase 1 end\n");
    // ---------------- Phase 2: 1 ADD + 2 DELs per round (1M) ----------------
    // each round: ADD C_i, DEL B_i, DEL C_i -> net: one B_i removed per round
    for (int i = 0; i < N; i++) {
        int klenC = snprintf(keyC, sizeof(keyC), "C_%d", i);
        int klenB = snprintf(keyB, sizeof(keyB), "B_%d", i);
        testcase(connfd, KVS_CMD_RSET, keyC, klenC, valC, 2, KVS_STATUS_OK, NULL, 0, "P2 ADD C_i");
        testcase(connfd, KVS_CMD_RDEL, keyB, klenB, NULL, 0, KVS_STATUS_OK, NULL, 0, "P2 DEL B_i");
        testcase(connfd, KVS_CMD_RDEL, keyC, klenC, NULL, 0, KVS_STATUS_OK, NULL, 0, "P2 DEL C_i");
        if(i%10000 == 0) printf("i:%d\n", i);
    }
    printf("phase 2 end\n");
    // for (int j = 0; j < 5; j++) {
    //     int idx = (j == 0) ? 0 : (j == 1) ? (N/2) : (N-1);
    //     int klenA = snprintf(keyA, sizeof(keyA), "A_%d", idx);
    //     int klenB = snprintf(keyB, sizeof(keyB), "B_%d", idx);
    //     int klenC = snprintf(keyC, sizeof(keyC), "C_%d", idx);
    //     testcase(connfd, KVS_CMD_EXIST, keyA, klenA, NULL, 0, KVS_STATUS_NO_EXIST, NULL, 0, "FINAL A not exist");
    //     testcase(connfd, KVS_CMD_EXIST, keyB, klenB, NULL, 0, KVS_STATUS_NO_EXIST, NULL, 0, "FINAL B not exist");
    //     testcase(connfd, KVS_CMD_EXIST, keyC, klenC, NULL, 0, KVS_STATUS_NO_EXIST, NULL, 0, "FINAL C not exist");
    // }
    gettimeofday(&tv_end, NULL);
    int time_used = TIME_SUB_MS(tv_end, tv_begin);
    // totals: Phase 1 has 3 ops per round, Phase 2 has 3 ops per round
    long long ops = (long long)N * 3 + (long long)N * 3;
    long long qps = (time_used > 0) ? (ops * 1000 / time_used) : 0;
    printf("ADD2-DEL1 then ADD1-DEL2 (N=%d) --> time_used=%d ms, ops=%lld, qps=%lld\n",
        N, time_used, ops, qps);
}
/*
 * Round-trip values containing characters that commonly break naive
 * protocol parsing: whitespace, quotes/backslashes, separator symbols,
 * CRLF, embedded NUL bytes, and multi-byte UTF-8.  Each value is RSET
 * then RGET back and compared byte-for-byte.
 */
void send_spec_chars(int connfd){
    /* 1) whitespace: value contains space, \t and \n */
    const char *v_ws = "li an\tok\nend"; /* contains space, \t, \n */
    int v_ws_len = 12; /* l i ' ' a n \t o k \n e n d = 12 bytes */
    testcase(connfd, KVS_CMD_RSET,
        "ws", 2,
        v_ws, v_ws_len,
        KVS_STATUS_OK,
        NULL, 0,
        "RSET WHITESPACE");
    testcase(connfd, KVS_CMD_RGET,
        "ws", 2,
        NULL, 0,
        KVS_STATUS_OK,
        v_ws, v_ws_len,
        "RGET WHITESPACE");
    /* 2) quotes and backslash: checks for bogus escaping */
    const char *v_quote = "he\"llo\\world'!";
    /* NOTE(review): the visible bytes h e " l l o \ w o r l d ' ! are 14;
       len 15 also sends the terminating NUL.  Likely an off-by-one, but
       SET and GET agree so the round-trip comparison still matches. */
    int v_quote_len = 15;
    testcase(connfd, KVS_CMD_RSET,
        "quote", 5,
        v_quote, v_quote_len,
        KVS_STATUS_OK,
        NULL, 0,
        "RSET QUOTE BACKSLASH");
    testcase(connfd, KVS_CMD_RGET,
        "quote", 5,
        NULL, 0,
        KVS_STATUS_OK,
        v_quote, v_quote_len,
        "RGET QUOTE BACKSLASH");
    /* 3) separator characters: colon/comma/semicolon/pipe */
    const char *v_sep = "a:b,c;d|e";
    int v_sep_len = 9; /* a : b , c ; d | e = 9 bytes */
    testcase(connfd, KVS_CMD_RSET,
        "sep", 3,
        v_sep, v_sep_len,
        KVS_STATUS_OK,
        NULL, 0,
        "RSET SEPARATORS");
    testcase(connfd, KVS_CMD_RGET,
        "sep", 3,
        NULL, 0,
        KVS_STATUS_OK,
        v_sep, v_sep_len,
        "RGET SEPARATORS");
    /* 4) CRLF: \r\n is the easiest way to mis-split requests/responses */
    const char *v_crlf = "line1\r\nline2";
    int v_crlf_len = 12; /* line1(5) + \r(1) + \n(1) + line2(5) = 12 */
    testcase(connfd, KVS_CMD_RSET,
        "crlf", 4,
        v_crlf, v_crlf_len,
        KVS_STATUS_OK,
        NULL, 0,
        "RSET CRLF");
    testcase(connfd, KVS_CMD_RGET,
        "crlf", 4,
        NULL, 0,
        KVS_STATUS_OK,
        v_crlf, v_crlf_len,
        "RGET CRLF");
    /* 5) binary data with embedded \0: must be handled by length, never strlen */
    char v_bin[] = { 'A', 0x00, 'B', 'C', 0x00, 'D' };
    int v_bin_len = 6;
    testcase(connfd, KVS_CMD_RSET,
        "bin", 3,
        v_bin, v_bin_len,
        KVS_STATUS_OK,
        NULL, 0,
        "RSET BINARY WITH NUL");
    testcase(connfd, KVS_CMD_RGET,
        "bin", 3,
        NULL, 0,
        KVS_STATUS_OK,
        v_bin, v_bin_len,
        "RGET BINARY WITH NUL");
    /* 6) UTF-8: CJK plus emoji exercises multi-byte sequences */
    const char *v_utf8 = "中文🙂";
    /* "中"(3 bytes) + "文"(3 bytes) + "🙂"(4 bytes) = 10 bytes */
    int v_utf8_len = 10;
    testcase(connfd, KVS_CMD_RSET,
        "utf8", 4,
        v_utf8, v_utf8_len,
        KVS_STATUS_OK,
        NULL, 0,
        "RSET UTF8");
    testcase(connfd, KVS_CMD_RGET,
        "utf8", 4,
        NULL, 0,
        KVS_STATUS_OK,
        v_utf8, v_utf8_len,
        "RGET UTF8");
}
/*
 * Entry: testcase <ip> <port> <mode>
 * mode selects which scenario runs (see the dispatch chain below).
 * Fixes: the result of connect_tcpserver() was used unchecked (-1 would
 * be passed to send/recv), and the socket was never closed.
 */
int main(int argc, char *argv[]) {
    if (argc != 4) {
        printf("arg error\n");
        return -1;
    }
    char *ip = argv[1];
    int port = atoi(argv[2]);
    int mode = atoi(argv[3]);
    int connfd = connect_tcpserver(ip, port);
    if (connfd < 0) {
        return -1;  /* connect_tcpserver already reported the error */
    }
    if(mode == 0){
        array_testcase_1w(connfd);
    }else if(mode == 1){
        array_testcase_1w_batch(connfd);
    }else if(mode == 2){
        rbtree_testcase_1w(connfd);
    }else if(mode == 3){
        hash_testcase_1w(connfd);
    }else if(mode == 4){
        testcase_add2_del1_then_add1_del2_100w(connfd);
    }else if(mode == 10){
        do_batch_test(connfd, KVS_CMD_SET, "array_set", "array_val", KVS_STATUS_OK, NULL);
    }else if(mode == 11){
        do_batch_test(connfd, KVS_CMD_GET, "array_set", NULL, KVS_STATUS_OK, "array_val");
    }else if(mode == 12){
        do_batch_test(connfd, KVS_CMD_EXIST, "array_set", NULL, KVS_STATUS_EXIST, NULL);
    }else if(mode == 13){
        do_batch_test(connfd, KVS_CMD_DEL, "array_set", NULL, KVS_STATUS_OK, NULL);
    }else if(mode == 20){
        do_batch_test(connfd, KVS_CMD_RSET, "rbtree_set", "rbtree_val", KVS_STATUS_OK, NULL);
    }else if(mode == 21){
        do_batch_test(connfd, KVS_CMD_RGET, "rbtree_set", NULL, KVS_STATUS_OK, "rbtree_val");
    }else if(mode == 22){
        do_batch_test(connfd, KVS_CMD_REXIST, "rbtree_set", NULL, KVS_STATUS_OK, NULL);
    }else if(mode == 30){
        do_batch_test(connfd, KVS_CMD_HSET, "hash_set", "hash_val", KVS_STATUS_OK, NULL);
    }else if(mode == 31){
        do_batch_test(connfd, KVS_CMD_HGET, "hash_set", NULL, KVS_STATUS_OK, "hash_val");
    }else if(mode == 32){
        do_batch_test(connfd, KVS_CMD_HEXIST, "hash_set", NULL, KVS_STATUS_OK, NULL);
    }else if(mode == -1){
        save(connfd);
    }else if(mode == 5){
        batch_qps(connfd);
    }else if(mode == 6){
        send_spec_chars(connfd);
    }
    close(connfd);  /* fix: release the socket before exiting */
    return 0;
}