diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index addfcc74d851..4fe7af8a4907 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -2,6 +2,7 @@ #define IO_URING_TYPES_H #include +#include #include #include #include @@ -247,6 +248,7 @@ struct io_ring_ctx { struct percpu_ref refs; enum task_work_notify_mode notify_method; + unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; /* submission data */ @@ -410,7 +412,18 @@ struct io_ring_ctx { struct callback_head poll_wq_task_work; struct list_head defer_list; - unsigned sq_thread_idle; + +#ifdef CONFIG_NET_RX_BUSY_POLL + struct list_head napi_list; /* track busy poll napi_id */ + spinlock_t napi_lock; /* napi_list lock */ + + /* napi busy poll default timeout */ + unsigned int napi_busy_poll_to; + bool napi_prefer_busy_poll; + + DECLARE_HASHTABLE(napi_ht, 4); +#endif + /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; diff --git a/io_uring/Makefile b/io_uring/Makefile index 42afe4a1ff31..2e1d4e03799c 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ notif.o waitid.o register.o truncate.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o +obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5311f19faa4c..479f610e314f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "notif.h" #include "waitid.h" #include "futex.h" +#include "napi.h" #include "timeout.h" #include "poll.h" @@ -349,6 +350,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); + io_napi_init(ctx); + return ctx; err: kfree(ctx->cancel_table.hbs); @@ -2602,9 +2605,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (get_timespec64(&ts, uts)) return -EFAULT; + iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + io_napi_adjust_timeout(ctx, &iowq, &ts); } + io_napi_busy_loop(ctx, &iowq); + trace_io_uring_cqring_wait(ctx, min_events); do { int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); @@ -2897,6 +2904,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); + io_napi_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 859f6e0580e3..1ca99522811b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -42,6 +42,10 @@ struct io_wait_queue { unsigned nr_timeouts; ktime_t timeout; +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_busy_poll_to; + bool napi_prefer_busy_poll; +#endif }; static inline bool io_should_wake(struct io_wait_queue *iowq) diff --git a/io_uring/napi.c b/io_uring/napi.c new file mode 100644 index 000000000000..1112cc39153c --- /dev/null +++ b/io_uring/napi.c @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "io_uring.h" +#include "napi.h" + +#ifdef CONFIG_NET_RX_BUSY_POLL + +/* Timeout for cleanout of stale entries. */ +#define NAPI_TIMEOUT (60 * SEC_CONVERSION) + +struct io_napi_entry { + unsigned int napi_id; + struct list_head list; + + unsigned long timeout; + struct hlist_node node; + + struct rcu_head rcu; +}; + +static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, + unsigned int napi_id) +{ + struct io_napi_entry *e; + + hlist_for_each_entry_rcu(e, hash_list, node) { + if (e->napi_id != napi_id) + continue; + e->timeout = jiffies + NAPI_TIMEOUT; + return e; + } + + return NULL; +} + +void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) +{ + struct hlist_head *hash_list; + unsigned int napi_id; + struct sock *sk; + struct io_napi_entry *e; + + sk = sock->sk; + if (!sk) + return; + + napi_id = READ_ONCE(sk->sk_napi_id); + + /* Non-NAPI IDs can be rejected. */ + if (napi_id < MIN_NAPI_ID) + return; + + hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; + + rcu_read_lock(); + e = io_napi_hash_find(hash_list, napi_id); + if (e) { + e->timeout = jiffies + NAPI_TIMEOUT; + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + e = kmalloc(sizeof(*e), GFP_NOWAIT); + if (!e) + return; + + e->napi_id = napi_id; + e->timeout = jiffies + NAPI_TIMEOUT; + + spin_lock(&ctx->napi_lock); + if (unlikely(io_napi_hash_find(hash_list, napi_id))) { + spin_unlock(&ctx->napi_lock); + kfree(e); + return; + } + + hlist_add_tail_rcu(&e->node, hash_list); + list_add_tail(&e->list, &ctx->napi_list); + spin_unlock(&ctx->napi_lock); +} + +static void __io_napi_remove_stale(struct io_ring_ctx *ctx) +{ + struct io_napi_entry *e; + unsigned int i; + + spin_lock(&ctx->napi_lock); + hash_for_each(ctx->napi_ht, i, e, node) { + if (time_after(jiffies, e->timeout)) { + list_del(&e->list); + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + } + } + spin_unlock(&ctx->napi_lock); +} + +static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) +{ + if (is_stale) + __io_napi_remove_stale(ctx); +} + +static inline bool io_napi_busy_loop_timeout(unsigned long start_time, + unsigned long bp_usec) +{ + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); + + return time_after(now, end_time); + } + + return true; +} + +static bool io_napi_busy_loop_should_end(void *data, + unsigned long start_time) +{ + struct io_wait_queue *iowq = data; + + if (signal_pending(current)) + return true; + if (io_should_wake(iowq)) + return true; + if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to)) + return true; + + return false; +} + +static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, + void *loop_end_arg) +{ + struct io_napi_entry *e; + bool (*loop_end)(void *, unsigned long) = NULL; + bool is_stale = false; + + if (loop_end_arg) + loop_end = io_napi_busy_loop_should_end; + + list_for_each_entry_rcu(e, &ctx->napi_list, list) { + napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, + ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); + + if (time_after(jiffies, e->timeout)) + is_stale = true; + } + + return is_stale; +} + +static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +{ + unsigned long start_time = busy_loop_current_time(); + void *loop_end_arg = NULL; + bool is_stale = false; + + /* Singular lists use a different napi loop end check function and are + * only executed once. + */ + if (list_is_singular(&ctx->napi_list)) + loop_end_arg = iowq; + + rcu_read_lock(); + do { + is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg); + } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg); + rcu_read_unlock(); + + io_napi_remove_stale(ctx, is_stale); +} + +/* + * io_napi_init() - Init napi settings + * @ctx: pointer to io-uring context structure + * + * Init napi settings in the io-uring context. + */ +void io_napi_init(struct io_ring_ctx *ctx) +{ + INIT_LIST_HEAD(&ctx->napi_list); + spin_lock_init(&ctx->napi_lock); + ctx->napi_prefer_busy_poll = false; + ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll); +} + +/* + * io_napi_free() - Deallocate napi + * @ctx: pointer to io-uring context structure + * + * Free the napi list and the hash table in the io-uring context. + */ +void io_napi_free(struct io_ring_ctx *ctx) +{ + struct io_napi_entry *e; + LIST_HEAD(napi_list); + unsigned int i; + + spin_lock(&ctx->napi_lock); + hash_for_each(ctx->napi_ht, i, e, node) { + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + } + spin_unlock(&ctx->napi_lock); +} + +/* + * __io_napi_adjust_timeout() - Add napi id to the busy poll list + * @ctx: pointer to io-uring context structure + * @iowq: pointer to io wait queue + * @ts: pointer to timespec or NULL + * + * Adjust the busy loop timeout according to timespec and busy poll timeout. + */ +void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct timespec64 *ts) +{ + unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); + + if (ts) { + struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); + + if (timespec64_compare(ts, &poll_to_ts) > 0) { + *ts = timespec64_sub(*ts, poll_to_ts); + } else { + u64 to = timespec64_to_ns(ts); + + do_div(to, 1000); + ts->tv_sec = 0; + ts->tv_nsec = 0; + } + } + + iowq->napi_busy_poll_to = poll_to; +} + +/* + * __io_napi_busy_loop() - execute busy poll loop + * @ctx: pointer to io-uring context structure + * @iowq: pointer to io wait queue + * + * Execute the busy poll loop and merge the spliced off list. + */ +void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) +{ + iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); + + if (!(ctx->flags & IORING_SETUP_SQPOLL) && iowq->napi_busy_poll_to) + io_napi_blocking_busy_loop(ctx, iowq); +} + +#endif diff --git a/io_uring/napi.h b/io_uring/napi.h new file mode 100644 index 000000000000..be8aa8ee32d9 --- /dev/null +++ b/io_uring/napi.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef IOU_NAPI_H +#define IOU_NAPI_H + +#include +#include +#include + +#ifdef CONFIG_NET_RX_BUSY_POLL + +void io_napi_init(struct io_ring_ctx *ctx); +void io_napi_free(struct io_ring_ctx *ctx); + +void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); + +void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, struct timespec64 *ts); +void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); + +static inline bool io_napi(struct io_ring_ctx *ctx) +{ + return !list_empty(&ctx->napi_list); +} + +static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct timespec64 *ts) +{ + if (!io_napi(ctx)) + return; + __io_napi_adjust_timeout(ctx, iowq, ts); +} + +static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +{ + if (!io_napi(ctx)) + return; + __io_napi_busy_loop(ctx, iowq); +} + +/* + * io_napi_add() - Add napi id to the busy poll list + * @req: pointer to io_kiocb request + * + * Add the napi id of the socket to the napi busy poll list and hash table. + */ +static inline void io_napi_add(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct socket *sock; + + if (!READ_ONCE(ctx->napi_busy_poll_to)) + return; + + sock = sock_from_file(req->file); + if (sock) + __io_napi_add(ctx, sock); +} + +#else + +static inline void io_napi_init(struct io_ring_ctx *ctx) +{ +} +static inline void io_napi_free(struct io_ring_ctx *ctx) +{ +} +static inline bool io_napi(struct io_ring_ctx *ctx) +{ + return false; +} +static inline void io_napi_add(struct io_kiocb *req) +{ +} +static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq, + struct timespec64 *ts) +{ +} +static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) +{ +} + +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +#endif diff --git a/io_uring/poll.c b/io_uring/poll.c index 8e01c2672df4..053d738c330c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -15,6 +15,7 @@ #include "io_uring.h" #include "refs.h" +#include "napi.h" #include "opdef.h" #include "kbuf.h" #include "poll.h" @@ -649,6 +650,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, __io_poll_execute(req, mask); return 0; } + io_napi_add(req); if (ipt->owning) { /*