#include "diskuring.h"
|
|
#include "memory/alloc_dispatch.h"
|
|
|
|
#include <errno.h>
|
|
#include <limits.h>
|
|
#include <sched.h>
|
|
#include <stdbool.h>
|
|
#include <sys/eventfd.h>
|
|
#include <time.h>
|
|
|
|
#define IOURING_MAX_WORKERS 16
|
|
#define IOURING_MIN_ENTRIES_PER_WORKER 128u
|
|
#define IOURING_SPSC_MIN_CAP 1024u
|
|
#define IOURING_SUBMIT_BATCH 256
|
|
|
|
extern void sync_wakeup();
|
|
|
|
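/*
 * Sampled profiling counters. When KVS_IOURING_PROFILE=1, one out of every
 * 2^KVS_IOURING_PROFILE_SHIFT submissions is timed, and the measured values
 * are multiplied by g_prof_scale so the totals below approximate the full
 * workload. All fields are nanosecond or event totals accumulated with
 * relaxed atomics.
 */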
typedef struct {
    _Atomic uint64_t submit_calls;
    _Atomic uint64_t submit_pack_ns;
    _Atomic uint64_t submit_alloc_ns;
    _Atomic uint64_t submit_copy_ns;
    _Atomic uint64_t submit_queue_ns;
    _Atomic uint64_t submit_backpressure_ns;
    _Atomic uint64_t submit_backpressure_loops;
    _Atomic uint64_t cleanup_calls;
    _Atomic uint64_t cleanup_ns;
    _Atomic uint64_t cleanup_tasks;
} iouring_profile_stats_t;

static iouring_profile_stats_t g_prof;
static _Atomic uint64_t g_prof_seq;
static int g_prof_enable = 0;
static uint64_t g_prof_sample_mask = 0;
static uint64_t g_prof_scale = 1;

static inline uint64_t mono_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static inline int prof_should_sample(void) {
    uint64_t seq;
    if (!g_prof_enable) {
        return 0;
    }
    seq = atomic_fetch_add_explicit(&g_prof_seq, 1, memory_order_relaxed);
    return (seq & g_prof_sample_mask) == 0;
}

static inline uint64_t div_u64(uint64_t a, uint64_t b) {
    return b ? (a / b) : 0;
}

static int parse_env_int(const char *name, int defv, int minv, int maxv) {
    const char *v = getenv(name);
    char *end = NULL;
    long n = 0;
    if (!v || !*v) {
        return defv;
    }
    errno = 0;
    n = strtol(v, &end, 10);
    if (errno != 0 || !end || *end != '\0') {
        return defv;
    }
    if (n < minv) {
        return minv;
    }
    if (n > maxv) {
        return maxv;
    }
    return (int)n;
}

static int default_worker_nr(void) {
    int n = 3;
    return parse_env_int("KVS_URING_WORKERS", n, 1, IOURING_MAX_WORKERS);
}

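/*
 * Per-worker submission queue: a fixed-capacity single-producer/
 * single-consumer ring. The submitting thread pushes, the worker pops.
 * One slot is always left unused so that head == tail unambiguously means
 * "empty". The separate size counter is only used to decide whether the
 * consumer might be sleeping (previous size was 0) and therefore needs an
 * eventfd wakeup.
 */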
static inline uint32_t spsc_next(const spsc_queue_t *q, uint32_t idx) {
    idx++;
    if (idx >= q->cap) {
        idx = 0;
    }
    return idx;
}

static int spsc_queue_init(spsc_queue_t *q, uint32_t cap) {
    if (!q || cap < 2) {
        return -1;
    }
    q->slots = (task_t **)calloc(cap, sizeof(task_t *));
    if (!q->slots) {
        return -1;
    }
    q->cap = cap;
    atomic_init(&q->head, 0);
    atomic_init(&q->tail, 0);
    atomic_init(&q->size, 0);
    return 0;
}

static void spsc_queue_destroy(spsc_queue_t *q) {
    if (!q) {
        return;
    }
    free(q->slots);
    q->slots = NULL;
    q->cap = 0;
}

static int spsc_try_push(spsc_queue_t *q, task_t *t, int *need_notify) {
    uint32_t tail;
    uint32_t next;
    uint32_t head;
    uint32_t prev_size;
    if (!q || !t) {
        return -1;
    }
    if (need_notify) {
        *need_notify = 0;
    }
    tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
    next = spsc_next(q, tail);
    head = atomic_load_explicit(&q->head, memory_order_acquire);
    if (next == head) {
        return -1;
    }
    q->slots[tail] = t;
    atomic_store_explicit(&q->tail, next, memory_order_release);
    prev_size = atomic_fetch_add_explicit(&q->size, 1, memory_order_release);
    if (need_notify && prev_size == 0) {
        *need_notify = 1;
    }
    return 0;
}

static task_t *spsc_try_pop(spsc_queue_t *q) {
    uint32_t head;
    uint32_t tail;
    task_t *t;
    if (!q) {
        return NULL;
    }
    head = atomic_load_explicit(&q->head, memory_order_relaxed);
    tail = atomic_load_explicit(&q->tail, memory_order_acquire);
    if (head == tail) {
        return NULL;
    }
    t = q->slots[head];
    q->slots[head] = NULL;
    atomic_store_explicit(&q->head, spsc_next(q, head), memory_order_release);
    atomic_fetch_sub_explicit(&q->size, 1, memory_order_release);
    return t;
}

static int spsc_empty(spsc_queue_t *q) {
    return atomic_load_explicit(&q->size, memory_order_acquire) == 0;
}

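/*
 * Destroy queue: completed tasks are handed back to the cleanup path through
 * a lock-free intrusive stack (Treiber-style push). Any worker may push;
 * cleanup_finished_iouring_tasks() detaches the whole list at once with an
 * atomic exchange and frees the tasks outside the hot path.
 */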
static void destroy_queue_push(iouring_ctx_t *ctx, task_t *t) {
    task_t *old_head;
    do {
        old_head = atomic_load_explicit(&ctx->destroy_queue.head, memory_order_relaxed);
        t->next = old_head;
    } while (!atomic_compare_exchange_weak_explicit(
        &ctx->destroy_queue.head, &old_head, t, memory_order_release, memory_order_relaxed));
}

static task_t *destroy_queue_steal_all(iouring_ctx_t *ctx) {
    return atomic_exchange_explicit(&ctx->destroy_queue.head, NULL, memory_order_acquire);
}

static void worker_notify(iouring_worker_t *w) {
    uint64_t one = 1;
    while (1) {
        ssize_t n = write(w->event_fd, &one, sizeof(one));
        if (n == (ssize_t)sizeof(one)) {
            return;
        }
        if (n < 0 && errno == EINTR) {
            continue;
        }
        if (n < 0 && errno == EAGAIN) {
            return;
        }
        return;
    }
}

static void worker_wait_event(iouring_worker_t *w) {
    uint64_t v;
    while (1) {
        ssize_t n = read(w->event_fd, &v, sizeof(v));
        if (n == (ssize_t)sizeof(v)) {
            return;
        }
        if (n < 0 && errno == EINTR) {
            continue;
        }
        return;
    }
}

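/*
 * Drain whatever completions are already in the CQ without blocking. Each
 * CQE's user_data is the task_t pointer attached at prep time; the task
 * result is published via task_finish() and the task is parked on the
 * destroy queue for deferred freeing.
 */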
static void worker_collect_cq(iouring_worker_t *w, int *completed) {
    while (1) {
        struct io_uring_cqe *cqe = NULL;
        int rc = io_uring_peek_cqe(&w->ring, &cqe);
        if (rc < 0 || !cqe) {
            break;
        }

        if (cqe->user_data != 0) {
            task_t *done = (task_t *)(uintptr_t)cqe->user_data;
            atomic_fetch_sub_explicit(&w->in_flight, 1, memory_order_relaxed);
            task_finish(done, cqe->res);
            if (cqe->res < 0) {
                fprintf(stderr, "io_uring op failed: wid=%d fd=%d res=%d off=%ld\n",
                        w->worker_id, done->fd, cqe->res, (long)done->off);
            }
            destroy_queue_push(w->parent, done);
            (*completed)++;
        }

        io_uring_cqe_seen(&w->ring, cqe);
    }
}

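/*
 * Worker loop. Each pass: treat CQ overflow as fatal (every CQE carries a
 * task pointer, so a dropped completion cannot be recovered), reap finished
 * CQEs (calling sync_wakeup() when anything completed), then prepare up to
 * IOURING_SUBMIT_BATCH new SQEs, bounded by max_in_flight. Tasks that cannot
 * get an SQE are parked on a local list and retried first on the next pass.
 * With nothing queued or in flight the worker sleeps on its eventfd; with
 * I/O in flight it waits in the ring for at least one completion instead.
 */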
static void *worker_main(void *arg) {
    iouring_worker_t *w = (iouring_worker_t *)arg;
    iouring_ctx_t *ctx = w->parent;
    task_t *local_head = NULL;
    task_t *local_tail = NULL;

    while (1) {
        int completed = 0;
        int prepared = 0;
        bool stop = atomic_load_explicit(&ctx->stop, memory_order_acquire) != 0;

        if ((*w->ring.sq.kflags & IORING_SQ_CQ_OVERFLOW) != 0) {
            fprintf(stderr, "FATAL: CQ overflow on worker %d\n", w->worker_id);
            abort();
        }

        worker_collect_cq(w, &completed);
        if (completed > 0) {
            sync_wakeup();
        }

        while (prepared < IOURING_SUBMIT_BATCH) {
            task_t *t = NULL;
            struct io_uring_sqe *sqe = NULL;

            if (atomic_load_explicit(&w->in_flight, memory_order_relaxed) >= w->max_in_flight) {
                break;
            }

            if (local_head) {
                t = local_head;
                local_head = local_head->next;
                if (!local_head) {
                    local_tail = NULL;
                }
                t->next = NULL;
            } else {
                t = spsc_try_pop(&w->submit_q);
            }

            if (!t) {
                break;
            }

            sqe = io_uring_get_sqe(&w->ring);
            if (!sqe) {
                if (local_tail) {
                    local_tail->next = t;
                    local_tail = t;
                } else {
                    local_head = t;
                    local_tail = t;
                }
                break;
            }

            if (t->op == TASK_WRITE) {
                io_uring_prep_writev(sqe, t->fd, t->iovs, t->iovcnt, t->off);
            } else if (t->op == TASK_FSYNC) {
                io_uring_prep_fsync(sqe, t->fd, t->fsync_flags);
            } else {
                task_finish(t, -EINVAL);
                destroy_queue_push(w->parent, t);
                continue;
            }
            sqe->flags |= (unsigned char)t->sqe_flags;
            sqe->user_data = (uint64_t)(uintptr_t)t;
            prepared++;
        }

        if (prepared > 0) {
            int submitted = io_uring_submit(&w->ring);
            if (submitted < 0) {
                if (submitted != -EINTR && submitted != -EAGAIN) {
                    fprintf(stderr, "io_uring_submit worker=%d ret=%d\n", w->worker_id, submitted);
                }
            } else if (submitted > 0) {
                atomic_fetch_add_explicit(&w->in_flight, submitted, memory_order_relaxed);
                continue;
            }
        }

        if (stop &&
            atomic_load_explicit(&w->in_flight, memory_order_relaxed) == 0 &&
            spsc_empty(&w->submit_q) &&
            local_head == NULL) {
            break;
        }

        if (atomic_load_explicit(&w->in_flight, memory_order_relaxed) > 0) {
            io_uring_submit_and_wait(&w->ring, 1);
            continue;
        }

        if (!spsc_empty(&w->submit_q) || local_head) {
            continue;
        }

        worker_wait_event(w);
    }

    return NULL;
}

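/*
 * Task lifecycle: a task is allocated and filled in by a submit_* call,
 * executed and completed by a worker (task_finish publishes res and done),
 * optionally waited on with task_wait(), and finally freed by task_destroy()
 * once the cleanup path drains it from the destroy queue.
 */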
void task_init(task_t *t) {
    if (!t) {
        return;
    }
    t->op = TASK_WRITE;
    t->fd = -1;
    t->off = 0;
    t->fsync_flags = 0;
    t->sqe_flags = 0;
    t->done = 0;
    t->res = 0;
    t->iovs = NULL;
    t->iovcnt = 0;
    t->free_iov_bases = 1;
    t->on_destroy = NULL;
    t->on_destroy_arg = NULL;
    t->next = NULL;
}

void task_finish(task_t *t, int res) {
    if (!t) {
        return;
    }
    t->res = res;
    atomic_store_explicit(&t->done, 1, memory_order_release);
}

int task_wait(task_t *t) {
    if (!t) {
        return -EINVAL;
    }
    while (atomic_load_explicit(&t->done, memory_order_acquire) == 0) {
        sched_yield();
    }
    return t->res;
}

void task_destroy(task_t *t) {
    if (!t) {
        return;
    }

    if (t->on_destroy) {
        t->on_destroy(t, t->on_destroy_arg);
    }

    if (t->iovs) {
        if (t->free_iov_bases) {
            for (int i = 0; i < t->iovcnt; i++) {
                if (t->iovs[i].iov_base) {
                    kvs_free(t->iovs[i].iov_base);
                }
            }
        }
        kvs_free(t->iovs);
    }

    kvs_free(t);
}

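/*
 * Per-worker setup: an eventfd for wakeups, a private ring, and the SPSC
 * submission queue. max_in_flight is 80% of the CQ size (at least 64),
 * keeping in-flight completions below the CQ capacity; the SPSC capacity is
 * twice that (at least IOURING_SPSC_MIN_CAP), plus the one slot the ring
 * always leaves unused.
 */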
static int init_worker(iouring_ctx_t *ctx, iouring_worker_t *w, int worker_id, unsigned entries) {
    struct io_uring_params params;
    unsigned cq_size = 0;
    uint32_t spsc_cap = 0;
    int ret = 0;

    memset(w, 0, sizeof(*w));
    w->worker_id = worker_id;
    w->parent = ctx;
    atomic_init(&w->in_flight, 0);

    w->event_fd = eventfd(0, EFD_CLOEXEC);
    if (w->event_fd < 0) {
        return -errno;
    }

    memset(&params, 0, sizeof(params));
    ret = io_uring_queue_init_params(entries, &w->ring, &params);
    if (ret < 0) {
        close(w->event_fd);
        w->event_fd = -1;
        return ret;
    }

    cq_size = *w->ring.cq.kring_entries;
    w->max_in_flight = (int)((cq_size * 8u) / 10u);
    if (w->max_in_flight < 64) {
        w->max_in_flight = 64;
    }

    spsc_cap = (uint32_t)(w->max_in_flight * 2);
    if (spsc_cap < IOURING_SPSC_MIN_CAP) {
        spsc_cap = IOURING_SPSC_MIN_CAP;
    }
    spsc_cap += 1;
    if (spsc_queue_init(&w->submit_q, spsc_cap) != 0) {
        io_uring_queue_exit(&w->ring);
        close(w->event_fd);
        w->event_fd = -1;
        return -ENOMEM;
    }

    ret = pthread_create(&w->th, NULL, worker_main, w);
    if (ret != 0) {
        spsc_queue_destroy(&w->submit_q);
        io_uring_queue_exit(&w->ring);
        close(w->event_fd);
        w->event_fd = -1;
        return -ret;
    }

    printf("io_uring worker[%d]: entries=%u cq=%u max_in_flight=%d queue_cap=%u\n",
           worker_id, entries, cq_size, w->max_in_flight, spsc_cap - 1);
    return 0;
}

int iouring_init(iouring_ctx_t *ctx, unsigned entries) {
    unsigned per_worker_entries;
    int worker_nr;
    int i;

    if (!ctx) {
        return -EINVAL;
    }

    memset(ctx, 0, sizeof(*ctx));
    atomic_init(&ctx->stop, 0);
    atomic_init(&ctx->rr_next, 0);
    atomic_init(&ctx->destroy_queue.head, NULL);
    memset(&g_prof, 0, sizeof(g_prof));
    atomic_init(&g_prof_seq, 0);

    g_prof_enable = parse_env_int("KVS_IOURING_PROFILE", 0, 0, 1);
    if (g_prof_enable) {
        int shift = parse_env_int("KVS_IOURING_PROFILE_SHIFT", 6, 0, 12);
        g_prof_sample_mask = ((uint64_t)1 << (uint64_t)shift) - 1;
        g_prof_scale = (uint64_t)1 << (uint64_t)shift;
        printf("io_uring profile enabled: sample=1/%llu\n",
               (unsigned long long)g_prof_scale);
    } else {
        g_prof_sample_mask = 0;
        g_prof_scale = 1;
    }

    worker_nr = default_worker_nr();
    if (worker_nr < 1) {
        worker_nr = 1;
    }

    if (entries < (unsigned)worker_nr * IOURING_MIN_ENTRIES_PER_WORKER) {
        per_worker_entries = IOURING_MIN_ENTRIES_PER_WORKER;
    } else {
        per_worker_entries = entries / (unsigned)worker_nr;
    }
    if (per_worker_entries < IOURING_MIN_ENTRIES_PER_WORKER) {
        per_worker_entries = IOURING_MIN_ENTRIES_PER_WORKER;
    }

    ctx->workers = (iouring_worker_t *)calloc((size_t)worker_nr, sizeof(iouring_worker_t));
    if (!ctx->workers) {
        return -ENOMEM;
    }
    ctx->worker_nr = worker_nr;
    ctx->entries_per_worker = per_worker_entries;

    for (i = 0; i < worker_nr; i++) {
        int rc = init_worker(ctx, &ctx->workers[i], i, per_worker_entries);
        if (rc != 0) {
            ctx->worker_nr = i;
            iouring_shutdown(ctx);
            return rc;
        }
    }

    printf("io_uring initialized with %d workers (n*SPSC)\n", worker_nr);
    return 0;
}

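/*
 * Illustrative usage sketch (not part of this file; fd, data and data_len are
 * placeholders supplied by the caller):
 *
 *     iouring_ctx_t ctx;
 *     if (iouring_init(&ctx, 4096) == 0) {
 *         void *bufs[1] = { data };
 *         size_t lens[1] = { data_len };
 *         task_t *t = submit_write(&ctx, fd, bufs, lens, 1, 0);
 *         if (t) {
 *             int res = task_wait(t);              // bytes written or -errno
 *             (void)res;
 *         }
 *         cleanup_finished_iouring_tasks(&ctx);    // free completed tasks
 *         iouring_shutdown(&ctx);
 *     }
 */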
static void wake_all_workers(iouring_ctx_t *ctx) {
    if (!ctx || !ctx->workers) {
        return;
    }
    for (int i = 0; i < ctx->worker_nr; i++) {
        worker_notify(&ctx->workers[i]);
    }
}

void cleanup_finished_iouring_tasks(iouring_ctx_t *ctx) {
    task_t *list;
    uint64_t start_ns = 0;
    uint64_t tasks = 0;
    if (!ctx) {
        return;
    }

    if (g_prof_enable) {
        start_ns = mono_ns();
    }

    list = destroy_queue_steal_all(ctx);
    while (list) {
        task_t *next = list->next;
        task_destroy(list);
        tasks++;
        list = next;
    }

    if (g_prof_enable) {
        uint64_t ns = mono_ns() - start_ns;
        atomic_fetch_add_explicit(&g_prof.cleanup_calls, 1, memory_order_relaxed);
        atomic_fetch_add_explicit(&g_prof.cleanup_ns, ns, memory_order_relaxed);
        atomic_fetch_add_explicit(&g_prof.cleanup_tasks, tasks, memory_order_relaxed);
    }
}

void iouring_profile_dump(iouring_ctx_t *ctx) {
    uint64_t submit_calls;
    uint64_t submit_pack_ns;
    uint64_t submit_alloc_ns;
    uint64_t submit_copy_ns;
    uint64_t submit_queue_ns;
    uint64_t submit_bp_ns;
    uint64_t submit_bp_loops;
    uint64_t cleanup_calls;
    uint64_t cleanup_ns;
    uint64_t cleanup_tasks;
    uint64_t submit_total_ns;
    uint64_t main_total_ns;
    (void)ctx;

    if (!g_prof_enable) {
        printf("[iouring-prof] disabled (set KVS_IOURING_PROFILE=1)\n");
        return;
    }

    submit_calls = atomic_load_explicit(&g_prof.submit_calls, memory_order_relaxed);
    submit_pack_ns = atomic_load_explicit(&g_prof.submit_pack_ns, memory_order_relaxed);
    submit_alloc_ns = atomic_load_explicit(&g_prof.submit_alloc_ns, memory_order_relaxed);
    submit_copy_ns = atomic_load_explicit(&g_prof.submit_copy_ns, memory_order_relaxed);
    submit_queue_ns = atomic_load_explicit(&g_prof.submit_queue_ns, memory_order_relaxed);
    submit_bp_ns = atomic_load_explicit(&g_prof.submit_backpressure_ns, memory_order_relaxed);
    submit_bp_loops = atomic_load_explicit(&g_prof.submit_backpressure_loops, memory_order_relaxed);
    cleanup_calls = atomic_load_explicit(&g_prof.cleanup_calls, memory_order_relaxed);
    cleanup_ns = atomic_load_explicit(&g_prof.cleanup_ns, memory_order_relaxed);
    cleanup_tasks = atomic_load_explicit(&g_prof.cleanup_tasks, memory_order_relaxed);
    submit_total_ns = submit_pack_ns + submit_queue_ns;
    main_total_ns = submit_total_ns + cleanup_ns;

    printf("[iouring-prof] submits=%llu cleanup_calls=%llu cleanup_tasks=%llu\n",
           (unsigned long long)submit_calls,
           (unsigned long long)cleanup_calls,
           (unsigned long long)cleanup_tasks);
    printf("[iouring-prof] submit_pack=%llums (alloc=%llums copy=%llums) submit_queue=%llums cleanup=%llums total_main=%llums\n",
           (unsigned long long)(submit_pack_ns / 1000000ull),
           (unsigned long long)(submit_alloc_ns / 1000000ull),
           (unsigned long long)(submit_copy_ns / 1000000ull),
           (unsigned long long)(submit_queue_ns / 1000000ull),
           (unsigned long long)(cleanup_ns / 1000000ull),
           (unsigned long long)(main_total_ns / 1000000ull));
    printf("[iouring-prof] per_submit(ns): pack=%llu alloc=%llu copy=%llu queue=%llu backpressure=%llu loops=%llu\n",
           (unsigned long long)div_u64(submit_pack_ns, submit_calls),
           (unsigned long long)div_u64(submit_alloc_ns, submit_calls),
           (unsigned long long)div_u64(submit_copy_ns, submit_calls),
           (unsigned long long)div_u64(submit_queue_ns, submit_calls),
           (unsigned long long)div_u64(submit_bp_ns, submit_calls),
           (unsigned long long)div_u64(submit_bp_loops, submit_calls));
    printf("[iouring-prof] per_cleanup(ns)=%llu per_task_free(ns)=%llu\n",
           (unsigned long long)div_u64(cleanup_ns, cleanup_calls),
           (unsigned long long)div_u64(cleanup_ns, cleanup_tasks));
    if (main_total_ns > 0) {
        printf("[iouring-prof] main-share: pack=%.1f%% queue=%.1f%% cleanup=%.1f%%\n",
               (double)submit_pack_ns * 100.0 / (double)main_total_ns,
               (double)submit_queue_ns * 100.0 / (double)main_total_ns,
               (double)cleanup_ns * 100.0 / (double)main_total_ns);
        if (submit_pack_ns > 0) {
            double other_pct = 100.0 -
                ((double)submit_alloc_ns * 100.0 / (double)submit_pack_ns) -
                ((double)submit_copy_ns * 100.0 / (double)submit_pack_ns);
            if (other_pct < 0.0) {
                other_pct = 0.0;
            }
            printf("[iouring-prof] pack-share: alloc=%.1f%% copy=%.1f%% other=%.1f%%\n",
                   (double)submit_alloc_ns * 100.0 / (double)submit_pack_ns,
                   (double)submit_copy_ns * 100.0 / (double)submit_pack_ns,
                   other_pct);
        }
    }
}

void iouring_shutdown(iouring_ctx_t *ctx) {
    int i;
    if (!ctx || !ctx->workers) {
        return;
    }

    atomic_store_explicit(&ctx->stop, 1, memory_order_release);
    wake_all_workers(ctx);

    for (i = 0; i < ctx->worker_nr; i++) {
        iouring_worker_t *w = &ctx->workers[i];
        if (w->th) {
            pthread_join(w->th, NULL);
        }
        if (w->event_fd >= 0) {
            close(w->event_fd);
            w->event_fd = -1;
        }
        io_uring_queue_exit(&w->ring);
        spsc_queue_destroy(&w->submit_q);
    }

    cleanup_finished_iouring_tasks(ctx);
    free(ctx->workers);
    ctx->workers = NULL;
    ctx->worker_nr = 0;
}

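/*
 * Round-robin task placement with backpressure. Starting from the next
 * round-robin index, every worker queue is tried once; if all of them are
 * full, finished tasks are reclaimed and the thread yields before retrying,
 * so the submitter cannot grow memory without bound. Returns -1 only once
 * shutdown has been requested.
 */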
static int queue_task_with_backpressure(iouring_ctx_t *ctx, task_t *t) {
    uint32_t rr;
    int n;
    uint64_t start_ns = 0;
    uint64_t loops = 0;
    int sampled = prof_should_sample();
    if (!ctx || !ctx->workers || !t) {
        return -1;
    }

    if (sampled) {
        start_ns = mono_ns();
    }

    n = ctx->worker_nr;
    rr = atomic_fetch_add_explicit(&ctx->rr_next, 1, memory_order_relaxed);

    while (atomic_load_explicit(&ctx->stop, memory_order_acquire) == 0) {
        loops++;
        for (int i = 0; i < n; i++) {
            int idx = (int)((rr + (uint32_t)i) % (uint32_t)n);
            iouring_worker_t *w = &ctx->workers[idx];
            int need_notify = 0;
            if (spsc_try_push(&w->submit_q, t, &need_notify) == 0) {
                if (need_notify) {
                    worker_notify(w);
                }
                if (sampled) {
                    uint64_t ns = mono_ns() - start_ns;
                    atomic_fetch_add_explicit(&g_prof.submit_backpressure_ns, ns * g_prof_scale,
                                              memory_order_relaxed);
                    atomic_fetch_add_explicit(&g_prof.submit_backpressure_loops, loops * g_prof_scale,
                                              memory_order_relaxed);
                }
                return 0;
            }
        }

        /* All queues are full: reclaim finished tasks so the submitting thread
         * does not grow memory without bound, then yield and retry. */
        cleanup_finished_iouring_tasks(ctx);
        sched_yield();
    }

    if (sampled) {
        uint64_t ns = mono_ns() - start_ns;
        atomic_fetch_add_explicit(&g_prof.submit_backpressure_ns, ns * g_prof_scale,
                                  memory_order_relaxed);
        atomic_fetch_add_explicit(&g_prof.submit_backpressure_loops, loops * g_prof_scale,
                                  memory_order_relaxed);
    }

    return -1;
}

static int queue_task_to_worker_with_backpressure(iouring_ctx_t *ctx, task_t *t, int worker_id) {
    iouring_worker_t *w;

    if (!ctx || !ctx->workers || !t) {
        return -1;
    }
    if (worker_id < 0 || worker_id >= ctx->worker_nr) {
        return -1;
    }

    w = &ctx->workers[worker_id];
    while (atomic_load_explicit(&ctx->stop, memory_order_acquire) == 0) {
        int need_notify = 0;
        if (spsc_try_push(&w->submit_q, t, &need_notify) == 0) {
            if (need_notify) {
                worker_notify(w);
            }
            return 0;
        }

        cleanup_finished_iouring_tasks(ctx);
        sched_yield();
    }

    return -1;
}

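/*
 * Copying write path: the caller's buffers are packed into one contiguous
 * allocation and written as a single iovec, so the caller keeps ownership of
 * its own buffers. The resulting task is queued round-robin and is freed via
 * the destroy queue after completion. Returns NULL on bad arguments,
 * allocation failure, zero total length, or shutdown.
 */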
task_t *submit_write(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off) {
    task_t *t;
    size_t total = 0;
    uint8_t *packed = NULL;
    size_t copied = 0;
    uint64_t pack_start = 0;
    uint64_t alloc_start = 0;
    uint64_t copy_start = 0;
    uint64_t queue_start = 0;
    int sampled = prof_should_sample();

    if (!ctx || !ctx->workers || !bufs || !lens || count <= 0) {
        return NULL;
    }
    atomic_fetch_add_explicit(&g_prof.submit_calls, 1, memory_order_relaxed);

    if (sampled) {
        pack_start = mono_ns();
        alloc_start = pack_start;
    }

    t = (task_t *)kvs_malloc(sizeof(task_t));
    if (!t) {
        return NULL;
    }
    task_init(t);
    t->op = TASK_WRITE;
    t->fd = fd;
    t->off = off;
    t->iovcnt = 1;
    t->iovs = (struct iovec *)kvs_malloc(sizeof(struct iovec));
    if (!t->iovs) {
        kvs_free(t);
        return NULL;
    }

    for (int i = 0; i < count; ++i) {
        if (lens[i] > SIZE_MAX - total) {
            kvs_free(t->iovs);
            kvs_free(t);
            return NULL;
        }
        total += lens[i];
    }

    if (total == 0) {
        kvs_free(t->iovs);
        kvs_free(t);
        return NULL;
    }

    packed = (uint8_t *)kvs_malloc(total);
    if (!packed) {
        kvs_free(t->iovs);
        kvs_free(t);
        return NULL;
    }

    if (sampled) {
        uint64_t alloc_ns = mono_ns() - alloc_start;
        atomic_fetch_add_explicit(&g_prof.submit_alloc_ns, alloc_ns * g_prof_scale,
                                  memory_order_relaxed);
        copy_start = mono_ns();
    }

    for (int i = 0; i < count; ++i) {
        size_t len = lens[i];
        if (len == 0) {
            continue;
        }
        memcpy(packed + copied, bufs[i], len);
        copied += len;
    }

    if (sampled) {
        uint64_t copy_ns = mono_ns() - copy_start;
        atomic_fetch_add_explicit(&g_prof.submit_copy_ns, copy_ns * g_prof_scale,
                                  memory_order_relaxed);
    }

    t->iovs[0].iov_base = packed;
    t->iovs[0].iov_len = copied;

    if (sampled) {
        uint64_t pack_ns = mono_ns() - pack_start;
        atomic_fetch_add_explicit(&g_prof.submit_pack_ns, pack_ns * g_prof_scale,
                                  memory_order_relaxed);
        queue_start = mono_ns();
    }

    if (queue_task_with_backpressure(ctx, t) != 0) {
        task_destroy(t);
        return NULL;
    }

    if (sampled) {
        uint64_t queue_ns = mono_ns() - queue_start;
        atomic_fetch_add_explicit(&g_prof.submit_queue_ns, queue_ns * g_prof_scale,
                                  memory_order_relaxed);
    }

    return t;
}

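/*
 * Zero-copy write path: the caller's buffers are referenced directly as the
 * iovec array, so they must stay valid until the task completes.
 * free_iov_bases controls whether task_destroy() frees the buffers with
 * kvs_free(); on_destroy (if set) runs first and can release them by other
 * means.
 */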
task_t *submit_write_ref(iouring_ctx_t *ctx, int fd, void **bufs, size_t *lens, int count, off_t off,
                         int free_iov_bases, task_destroy_cb_t on_destroy, void *on_destroy_arg) {
    task_t *t;
    size_t total = 0;

    if (!ctx || !ctx->workers || !bufs || !lens || count <= 0) {
        return NULL;
    }

    t = (task_t *)kvs_malloc(sizeof(task_t));
    if (!t) {
        return NULL;
    }
    task_init(t);
    t->op = TASK_WRITE;
    t->fd = fd;
    t->off = off;
    t->iovcnt = count;
    t->free_iov_bases = free_iov_bases ? 1 : 0;
    t->on_destroy = on_destroy;
    t->on_destroy_arg = on_destroy_arg;
    t->iovs = (struct iovec *)kvs_malloc(sizeof(struct iovec) * (size_t)count);
    if (!t->iovs) {
        kvs_free(t);
        return NULL;
    }
    /* Zero the iovec array so task_destroy() on an early-failure path below
     * never frees uninitialized iov_base pointers. */
    memset(t->iovs, 0, sizeof(struct iovec) * (size_t)count);

    for (int i = 0; i < count; ++i) {
        if (!bufs[i] || lens[i] == 0) {
            task_destroy(t);
            return NULL;
        }
        if (lens[i] > SIZE_MAX - total) {
            task_destroy(t);
            return NULL;
        }
        t->iovs[i].iov_base = bufs[i];
        t->iovs[i].iov_len = lens[i];
        total += lens[i];
    }

    if (total == 0) {
        task_destroy(t);
        return NULL;
    }

    if (queue_task_with_backpressure(ctx, t) != 0) {
        task_destroy(t);
        return NULL;
    }

    return t;
}

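/*
 * Queues an fsync on a specific worker's ring. With drain set, the SQE
 * carries IOSQE_IO_DRAIN, so it only starts after all earlier SQEs on that
 * same ring have completed; ordering is per worker, not across workers.
 */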
task_t *submit_fsync_ref(iouring_ctx_t *ctx, int fd, int worker_id, int drain,
                         task_destroy_cb_t on_destroy, void *on_destroy_arg) {
    task_t *t;

    if (!ctx || !ctx->workers || fd < 0) {
        return NULL;
    }
    if (worker_id < 0 || worker_id >= ctx->worker_nr) {
        return NULL;
    }

    t = (task_t *)kvs_malloc(sizeof(task_t));
    if (!t) {
        return NULL;
    }

    task_init(t);
    t->op = TASK_FSYNC;
    t->fd = fd;
    t->fsync_flags = 0;
    t->sqe_flags = drain ? IOSQE_IO_DRAIN : 0;
    t->on_destroy = on_destroy;
    t->on_destroy_arg = on_destroy_arg;

    if (queue_task_to_worker_with_backpressure(ctx, t, worker_id) != 0) {
        task_destroy(t);
        return NULL;
    }

    return t;
}

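/*
 * Returns nonzero when every worker's submit queue is empty, nothing is in
 * flight, and the destroy queue has been drained; callers can use it as an
 * idle/quiesce check.
 */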
int uring_task_complete(iouring_ctx_t *ctx) {
    if (!ctx || !ctx->workers) {
        return 1;
    }

    for (int i = 0; i < ctx->worker_nr; i++) {
        iouring_worker_t *w = &ctx->workers[i];
        if (!spsc_empty(&w->submit_q)) {
            return 0;
        }
        if (atomic_load_explicit(&w->in_flight, memory_order_relaxed) > 0) {
            return 0;
        }
    }

    return atomic_load_explicit(&ctx->destroy_queue.head, memory_order_acquire) == NULL;
}