for-5.20/io_uring-2022-07-29

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmLkm5gQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpmKMD/4l3QIrLbjYIxlfrzQcHbmYuUkbQtj3SbZg
 6ejbnGVhCs1P9DdXH8MgE2BxgpiXQE0CqOK7vbSoo5ep2n2UTLI2DIxAl74SMIo7
 0wmJXtUJySuViKr3NYVHqlN180MkQYddBz0nGElhkQBPBCMhW8CrtPCeURr/YyHp
 2RxSYBXiUx2gRyig+klnp6oPEqelcBZJUyNHdA9yVrgl/RhB/t2rKj7D++8ukQM3
 Zuyh8WIkTeTfUz9hdGG7fuCEdZN4DlO2CCEc7uy0cKi6VRCKH4hYUCqClJ+/cfd2
 43dUI2O7B6D1t/ObFh8AGIDXBDqVA6ePQohQU6gooRkfQiBPKkc9d0ts4yIhRqca
 AjkzNM+0Eve3A01loJ8J84w8oZnvNpYEv5n8/sZVLWcyU3UIs0I88nC2OBiFtoRq
 d77CtFLwOTo+r3STtAhnZOqez90rhS6BqKtqlUP346PCuFItl6/MbGtwdTbLYEFj
 CVNIb2pERWSr2NxGv4lFyXaX/cRwruxojWH7yc3rRYjr4Ykevd1pe/fMGNiMAnKw
 5em/3QU3qq0ZVcXLMihksKeHHFIQwGDRMuyuv/fktV10+yYXQ0t16WzkJT3aR8Xo
 cqs0r8+6Jnj3uYcOMzj/FoLcpEPr21hnwAtzLto1mG1Wh4JRn/D7Nx5zqxPLxcW+
 NiU6VihPOw==
 =gxeV
 -----END PGP SIGNATURE-----

Merge tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - As per (valid) complaint in the last merge window, fs/io_uring.c has
   grown quite large these days. io_uring isn't really tied to fs
   either, as it supports a wide variety of functionality outside of
   that.

   Move the code to io_uring/ and split it into files that either
   implement a specific request type, and split some code into helpers
   as well. The code is organized a lot better like this, and io_uring.c
   is now < 4K LOC (me).

 - Deprecate the epoll_ctl opcode. It'll still work, just trigger a
   warning once if used. If we don't get any complaints on this, and I
   don't expect any, then we can fully remove it in a future release
   (me).

 - Improve the cancel hash locking (Hao)

 - kbuf cleanups (Hao)

 - Efficiency improvements to the task_work handling (Dylan, Pavel)

 - Provided buffer improvements (Dylan)

 - Add support for recv/recvmsg multishot support. This is similar to
   the accept (or poll) support for have for multishot, where a single
   SQE can trigger everytime data is received. For applications that
   expect to do more than a few receives on an instantiated socket, this
   greatly improves efficiency (Dylan).

 - Efficiency improvements for poll handling (Pavel)

 - Poll cancelation improvements (Pavel)

 - Allow specifiying a range for direct descriptor allocations (Pavel)

 - Cleanup the cqe32 handling (Pavel)

 - Move io_uring types to greatly cleanup the tracing (Pavel)

 - Tons of great code cleanups and improvements (Pavel)

 - Add a way to do sync cancelations rather than through the sqe -> cqe
   interface, as that's a lot easier to use for some use cases (me).

 - Add support to IORING_OP_MSG_RING for sending direct descriptors to a
   different ring. This avoids the usually problematic SCM case, as we
   disallow those. (me)

 - Make the per-command alloc cache we use for apoll generic, place
   limits on it, and use it for netmsg as well (me).

 - Various cleanups (me, Michal, Gustavo, Uros)

* tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block: (172 commits)
  io_uring: ensure REQ_F_ISREG is set async offload
  net: fix compat pointer in get_compat_msghdr()
  io_uring: Don't require reinitable percpu_ref
  io_uring: fix types in io_recvmsg_multishot_overflow
  io_uring: Use atomic_long_try_cmpxchg in __io_account_mem
  io_uring: support multishot in recvmsg
  net: copy from user before calling __get_compat_msghdr
  net: copy from user before calling __copy_msghdr
  io_uring: support 0 length iov in buffer select in compat
  io_uring: fix multishot ending when not polled
  io_uring: add netmsg cache
  io_uring: impose max limit on apoll cache
  io_uring: add abstraction around apoll cache
  io_uring: move apoll cache to poll.c
  io_uring: consolidate hash_locked io-wq handling
  io_uring: clear REQ_F_HASH_LOCKED on hash removal
  io_uring: don't race double poll setting REQ_F_ASYNC_DATA
  io_uring: don't miss setting REQ_F_DOUBLE_POLL
  io_uring: disable multishot recvmsg
  io_uring: only trace one of complete or overflow
  ...
This commit is contained in:
Linus Torvalds 2022-08-02 13:20:44 -07:00
commit b349b1181d
66 changed files with 15141 additions and 13529 deletions

View File

@ -7811,9 +7811,6 @@ F: include/linux/fs.h
F: include/linux/fs_types.h
F: include/uapi/linux/fs.h
F: include/uapi/linux/openat2.h
X: fs/io-wq.c
X: fs/io-wq.h
X: fs/io_uring.c
FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
M: Riku Voipio <riku.voipio@iki.fi>
@ -10521,9 +10518,7 @@ L: io-uring@vger.kernel.org
S: Maintained
T: git git://git.kernel.dk/linux-block
T: git git://git.kernel.dk/liburing
F: fs/io-wq.c
F: fs/io-wq.h
F: fs/io_uring.c
F: io_uring/
F: include/linux/io_uring.h
F: include/uapi/linux/io_uring.h
F: tools/io_uring/

View File

@ -1097,6 +1097,7 @@ export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps
ifeq ($(KBUILD_EXTMOD),)
core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/
core-$(CONFIG_BLOCK) += block/
core-$(CONFIG_IO_URING) += io_uring/
vmlinux-dirs := $(patsubst %/,%,$(filter %/, \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \

View File

@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,544 @@
#ifndef IO_URING_TYPES_H
#define IO_URING_TYPES_H
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <uapi/linux/io_uring.h>
struct io_wq_work_node {
struct io_wq_work_node *next;
};
struct io_wq_work_list {
struct io_wq_work_node *first;
struct io_wq_work_node *last;
};
struct io_wq_work {
struct io_wq_work_node list;
unsigned flags;
/* place it here instead of io_kiocb as it fills padding and saves 4B */
int cancel_seq;
};
struct io_fixed_file {
/* file * with additional FFS_* flags */
unsigned long file_ptr;
};
struct io_file_table {
struct io_fixed_file *files;
unsigned long *bitmap;
unsigned int alloc_hint;
};
struct io_hash_bucket {
spinlock_t lock;
struct hlist_head list;
} ____cacheline_aligned_in_smp;
struct io_hash_table {
struct io_hash_bucket *hbs;
unsigned hash_bits;
};
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
};
/*
* This data is shared with the application through the mmap at offsets
* IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
*
* The offsets to the member fields are published through struct
* io_sqring_offsets when calling io_uring_setup.
*/
struct io_rings {
/*
* Head and tail offsets into the ring; the offsets need to be
* masked to get valid indices.
*
* The kernel controls head of the sq ring and the tail of the cq ring,
* and the application controls tail of the sq ring and the head of the
* cq ring.
*/
struct io_uring sq, cq;
/*
* Bitmasks to apply to head and tail offsets (constant, equals
* ring_entries - 1)
*/
u32 sq_ring_mask, cq_ring_mask;
/* Ring sizes (constant, power of 2) */
u32 sq_ring_entries, cq_ring_entries;
/*
* Number of invalid entries dropped by the kernel due to
* invalid index stored in array
*
* Written by the kernel, shouldn't be modified by the
* application (i.e. get number of "new events" by comparing to
* cached value).
*
* After a new SQ head value was read by the application this
* counter includes all submissions that were dropped reaching
* the new SQ head (and possibly more).
*/
u32 sq_dropped;
/*
* Runtime SQ flags
*
* Written by the kernel, shouldn't be modified by the
* application.
*
* The application needs a full memory barrier before checking
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
atomic_t sq_flags;
/*
* Runtime CQ flags
*
* Written by the application, shouldn't be modified by the
* kernel.
*/
u32 cq_flags;
/*
* Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure
* there are not more requests pending than there is space in
* the completion queue.
*
* Written by the kernel, shouldn't be modified by the
* application (i.e. get number of "new events" by comparing to
* cached value).
*
* As completion events come in out of order this counter is not
* ordered with any other data.
*/
u32 cq_overflow;
/*
* Ring buffer of completion events.
*
* The kernel writes completion events fresh every time they are
* produced, so the application is allowed to modify pending
* entries.
*/
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
struct io_restriction {
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
u8 sqe_flags_allowed;
u8 sqe_flags_required;
bool registered;
};
struct io_submit_link {
struct io_kiocb *head;
struct io_kiocb *last;
};
struct io_submit_state {
/* inline/task_work completion list, under ->uring_lock */
struct io_wq_work_node free_list;
/* batch completion logic */
struct io_wq_work_list compl_reqs;
struct io_submit_link link;
bool plug_started;
bool need_plug;
unsigned short submit_nr;
struct blk_plug plug;
};
struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1;
struct rcu_head rcu;
};
struct io_alloc_cache {
struct hlist_head list;
unsigned int nr_cached;
};
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
struct percpu_ref refs;
struct io_rings *rings;
unsigned int flags;
enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
unsigned int has_evfd: 1;
unsigned int syscall_iopoll: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
struct {
struct mutex uring_lock;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
* mmapped by the application using the IORING_OFF_SQES offset.
*
* This indirection could e.g. be used to assign fixed
* io_uring_sqe entries to operations and only submit them to
* the queue when needed.
*
* The kernel modifies neither the indices array nor the entries
* array.
*/
u32 *sq_array;
struct io_uring_sqe *sq_sqes;
unsigned cached_sq_head;
unsigned sq_entries;
/*
* Fixed resources fast path, should be accessed only under
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
int rsrc_cached_refs;
atomic_t cancel_seq;
struct io_file_table file_table;
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
struct io_submit_state submit_state;
struct io_buffer_list *io_bl;
struct xarray io_bl_xa;
struct list_head io_buffers_cache;
struct io_hash_table cancel_table_locked;
struct list_head cq_overflow_list;
struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache;
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
unsigned long check_cq;
unsigned int file_alloc_start;
unsigned int file_alloc_end;
struct xarray personalities;
u32 pers_next;
struct {
/*
* We cache a range of free CQEs we can use, once exhausted it
* should go through a slower range setup, see __io_get_cqe()
*/
struct io_uring_cqe *cqe_cached;
struct io_uring_cqe *cqe_sentinel;
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
} ____cacheline_aligned_in_smp;
struct {
spinlock_t completion_lock;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
struct io_wq_work_list iopoll_list;
struct io_hash_table cancel_table;
bool poll_multi_queue;
struct list_head io_buffers_comp;
} ____cacheline_aligned_in_smp;
/* timeouts */
struct {
spinlock_t timeout_lock;
atomic_t cq_timeouts;
struct list_head timeout_list;
struct list_head ltimeout_list;
unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp;
/* Keep this last, we don't need it for the fast path */
struct io_restriction restrictions;
struct task_struct *submitter_task;
/* slow path rsrc auxilary data, used by update/register */
struct io_rsrc_node *rsrc_backup_node;
struct io_mapped_ubuf *dummy_ubuf;
struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data;
struct delayed_work rsrc_put_work;
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
struct list_head io_buffers_pages;
#if defined(CONFIG_UNIX)
struct socket *ring_sock;
#endif
/* hashed buffered write serialization */
struct io_wq_hash *hash_map;
/* Only used for accounting purposes */
struct user_struct *user;
struct mm_struct *mm_account;
/* ctx exit and cancelation */
struct llist_head fallback_llist;
struct delayed_work fallback_work;
struct work_struct exit_work;
struct list_head tctx_list;
struct completion ref_comp;
/* io-wq management, e.g. thread count */
u32 iowq_limits[2];
bool iowq_limits_set;
struct list_head defer_list;
unsigned sq_thread_idle;
/* protected by ->completion_lock */
unsigned evfd_last_cq_tail;
};
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT,
/* first byte is taken by user flags, shift it to not overlap */
REQ_F_FAIL_BIT = 8,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_BUFFER_RING_BIT,
REQ_F_REISSUE_BIT,
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
REQ_F_ASYNC_DATA_BIT,
REQ_F_SKIP_LINK_CQES_BIT,
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
REQ_F_CQE32_INIT_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT,
REQ_F_HASH_LOCKED_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
};
enum {
/* ctx owns file */
REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
/* drain existing IO first */
REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
/* linked sqes */
REQ_F_LINK = BIT(REQ_F_LINK_BIT),
/* doesn't sever on completion < 0 */
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
/* IOSQE_CQE_SKIP_SUCCESS */
REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
/* fail rest of links */
REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
/* on inflight list, should be cancelled and waited on exit reliably */
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has or had linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* needs cleanup */
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* buffer selected from ring, needs commit */
REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
/* supports async reads/writes */
REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* has creds assigned */
REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
/* skip refcounting if not set */
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
/* ->async_data allocated */
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
/* don't post CQEs while failing linked requests */
REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
/* single poll may be active */
REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
/* double poll may active */
REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
/* request has already done partial IO */
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
/* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
/* ->extra1 and ->extra2 are initialised */
REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
/* recvmsg special flag, clear EPOLLIN */
REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
struct io_task_work {
struct llist_node node;
io_req_tw_func_t func;
};
struct io_cqe {
__u64 user_data;
__s32 res;
/* fd initially, then cflags for completion */
union {
__u32 flags;
int fd;
};
};
/*
* Each request type overlays its private data structure on top of this one.
* They must not exceed this one in size.
*/
struct io_cmd_data {
struct file *file;
/* each command gets 56 bytes of data */
__u8 data[56];
};
#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd)
#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr)
struct io_kiocb {
union {
/*
* NOTE! Each of the io_kiocb union members has the file pointer
* as the first entry in their struct definition. So you can
* access the file pointer through any of the sub-structs,
* or directly as just 'file' in this struct.
*/
struct file *file;
struct io_cmd_data cmd;
};
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
/*
* Can be either a fixed buffer index, or used with provided buffers.
* For the latter, before issue it points to the buffer group ID,
* and after selection it points to the buffer ID itself.
*/
u16 buf_index;
unsigned int flags;
struct io_cqe cqe;
struct io_ring_ctx *ctx;
struct task_struct *task;
struct io_rsrc_node *rsrc_node;
union {
/* store used ubuf, so we can prevent reloading */
struct io_mapped_ubuf *imu;
/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
struct io_buffer *kbuf;
/*
* stores buffer ID for ring provided buffers, valid IFF
* REQ_F_BUFFER_RING is set.
*/
struct io_buffer_list *buf_list;
};
union {
/* used by request caches, completion batching and iopoll */
struct io_wq_work_node comp_list;
/* cache ->apoll->events */
__poll_t apoll_events;
};
atomic_t refs;
atomic_t poll_refs;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
union {
struct hlist_node hash_node;
struct {
u64 extra1;
u64 extra2;
};
};
/* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
struct io_kiocb *link;
/* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
struct io_wq_work work;
};
struct io_overflow_cqe {
struct list_head list;
struct io_uring_cqe cqe;
};
#endif

View File

@ -416,10 +416,9 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg,
struct user_msghdr __user *umsg, unsigned flags,
struct sockaddr __user **uaddr,
struct iovec **iov);
extern int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec __user **uiov, size_t *nsegs);
extern int __copy_msghdr(struct msghdr *kmsg,
struct user_msghdr *umsg,
struct sockaddr __user **save_addr);
/* helpers which do the actual work for syscalls */
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,

View File

@ -46,9 +46,8 @@ struct compat_rtentry {
unsigned short rt_irtt; /* Initial RTT */
};
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr, compat_uptr_t *ptr,
compat_size_t *len);
int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr *msg,
struct sockaddr __user **save_addr);
int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *,
struct sockaddr __user **, struct iovec **);
int put_cmsg_compat(struct msghdr*, int, int, int, void *);

View File

@ -7,6 +7,7 @@
#include <linux/tracepoint.h>
#include <uapi/linux/io_uring.h>
#include <linux/io_uring_types.h>
#include <linux/io_uring.h>
struct io_wq_work;
@ -97,9 +98,7 @@ TRACE_EVENT(io_uring_register,
/**
* io_uring_file_get - called before getting references to an SQE file
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @user_data: user data associated with the request
* @fd: SQE file descriptor
*
* Allows to trace out how often an SQE file reference is obtained, which can
@ -108,9 +107,9 @@ TRACE_EVENT(io_uring_register,
*/
TRACE_EVENT(io_uring_file_get,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd),
TP_PROTO(struct io_kiocb *req, int fd),
TP_ARGS(ctx, req, user_data, fd),
TP_ARGS(req, fd),
TP_STRUCT__entry (
__field( void *, ctx )
@ -120,9 +119,9 @@ TRACE_EVENT(io_uring_file_get,
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->user_data = req->cqe.user_data;
__entry->fd = fd;
),
@ -133,22 +132,16 @@ TRACE_EVENT(io_uring_file_get,
/**
* io_uring_queue_async_work - called before submitting a new async work
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @flags request flags
* @work: pointer to a submitted io_wq_work
* @rw: type of workqueue, hashed or normal
*
* Allows to trace asynchronous work submission.
*/
TRACE_EVENT(io_uring_queue_async_work,
TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode,
unsigned int flags, struct io_wq_work *work, int rw),
TP_PROTO(struct io_kiocb *req, int rw),
TP_ARGS(ctx, req, user_data, opcode, flags, work, rw),
TP_ARGS(req, rw),
TP_STRUCT__entry (
__field( void *, ctx )
@ -159,19 +152,19 @@ TRACE_EVENT(io_uring_queue_async_work,
__field( struct io_wq_work *, work )
__field( int, rw )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->flags = flags;
__entry->opcode = opcode;
__entry->work = work;
__entry->user_data = req->cqe.user_data;
__entry->flags = req->flags;
__entry->opcode = req->opcode;
__entry->work = &req->work;
__entry->rw = rw;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
@ -183,19 +176,16 @@ TRACE_EVENT(io_uring_queue_async_work,
/**
* io_uring_defer - called when an io_uring request is deferred
*
* @ctx: pointer to a ring context structure
* @req: pointer to a deferred request
* @user_data: user data associated with the request
* @opcode: opcode of request
*
* Allows to track deferred requests, to get an insight about what requests are
* not started immediately.
*/
TRACE_EVENT(io_uring_defer,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode),
TP_PROTO(struct io_kiocb *req),
TP_ARGS(ctx, req, user_data, opcode),
TP_ARGS(req),
TP_STRUCT__entry (
__field( void *, ctx )
@ -203,16 +193,16 @@ TRACE_EVENT(io_uring_defer,
__field( unsigned long long, data )
__field( u8, opcode )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->data = user_data;
__entry->opcode = opcode;
__entry->data = req->cqe.user_data;
__entry->opcode = req->opcode;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
@ -224,7 +214,6 @@ TRACE_EVENT(io_uring_defer,
* io_uring_link - called before the io_uring request added into link_list of
* another request
*
* @ctx: pointer to a ring context structure
* @req: pointer to a linked request
* @target_req: pointer to a previous request, that would contain @req
*
@ -233,9 +222,9 @@ TRACE_EVENT(io_uring_defer,
*/
TRACE_EVENT(io_uring_link,
TP_PROTO(void *ctx, void *req, void *target_req),
TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req),
TP_ARGS(ctx, req, target_req),
TP_ARGS(req, target_req),
TP_STRUCT__entry (
__field( void *, ctx )
@ -244,7 +233,7 @@ TRACE_EVENT(io_uring_link,
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->target_req = target_req;
),
@ -285,10 +274,7 @@ TRACE_EVENT(io_uring_cqring_wait,
/**
* io_uring_fail_link - called before failing a linked request
*
* @ctx: pointer to a ring context structure
* @req: request, which links were cancelled
* @user_data: user data associated with the request
* @opcode: opcode of request
* @link: cancelled link
*
* Allows to track linked requests cancellation, to see not only that some work
@ -296,9 +282,9 @@ TRACE_EVENT(io_uring_cqring_wait,
*/
TRACE_EVENT(io_uring_fail_link,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link),
TP_PROTO(struct io_kiocb *req, struct io_kiocb *link),
TP_ARGS(ctx, req, user_data, opcode, link),
TP_ARGS(req, link),
TP_STRUCT__entry (
__field( void *, ctx )
@ -307,17 +293,17 @@ TRACE_EVENT(io_uring_fail_link,
__field( u8, opcode )
__field( void *, link )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->link = link;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
@ -376,23 +362,17 @@ TRACE_EVENT(io_uring_complete,
/**
* io_uring_submit_sqe - called before submitting one SQE
*
* @ctx: pointer to a ring context structure
* @req: pointer to a submitted request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @flags request flags
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
*
* Allows to track SQE submitting, to understand what was the source of it, SQ
* thread or io_uring_enter call.
*/
TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags,
bool force_nonblock, bool sq_thread),
TP_PROTO(struct io_kiocb *req, bool force_nonblock),
TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread),
TP_ARGS(req, force_nonblock),
TP_STRUCT__entry (
__field( void *, ctx )
@ -403,19 +383,19 @@ TRACE_EVENT(io_uring_submit_sqe,
__field( bool, force_nonblock )
__field( bool, sq_thread )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->flags = flags;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->flags = req->flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
@ -427,10 +407,7 @@ TRACE_EVENT(io_uring_submit_sqe,
/*
* io_uring_poll_arm - called after arming a poll wait if successful
*
* @ctx: pointer to a ring context structure
* @req: pointer to the armed request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @mask: request poll events mask
* @events: registered events of interest
*
@ -439,10 +416,9 @@ TRACE_EVENT(io_uring_submit_sqe,
*/
TRACE_EVENT(io_uring_poll_arm,
TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode,
int mask, int events),
TP_PROTO(struct io_kiocb *req, int mask, int events),
TP_ARGS(ctx, req, user_data, opcode, mask, events),
TP_ARGS(req, mask, events),
TP_STRUCT__entry (
__field( void *, ctx )
@ -452,18 +428,18 @@ TRACE_EVENT(io_uring_poll_arm,
__field( int, mask )
__field( int, events )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->mask = mask;
__entry->events = events;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
@ -475,18 +451,15 @@ TRACE_EVENT(io_uring_poll_arm,
/*
* io_uring_task_add - called after adding a task
*
* @ctx: pointer to a ring context structure
* @req: pointer to request
* @user_data: user data associated with the request
* @opcode: opcode of request
* @mask: request poll events mask
*
*/
TRACE_EVENT(io_uring_task_add,
TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask),
TP_PROTO(struct io_kiocb *req, int mask),
TP_ARGS(ctx, req, user_data, opcode, mask),
TP_ARGS(req, mask),
TP_STRUCT__entry (
__field( void *, ctx )
@ -495,17 +468,17 @@ TRACE_EVENT(io_uring_task_add,
__field( u8, opcode )
__field( int, mask )
__string( op_str, io_uring_get_opcode(opcode) )
__string( op_str, io_uring_get_opcode(req->opcode) )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = user_data;
__entry->opcode = opcode;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->mask = mask;
__assign_str(op_str, io_uring_get_opcode(opcode));
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
@ -518,7 +491,6 @@ TRACE_EVENT(io_uring_task_add,
* io_uring_req_failed - called when an sqe is errored dring submission
*
* @sqe: pointer to the io_uring_sqe that failed
* @ctx: pointer to a ring context structure
* @req: pointer to request
* @error: error it failed with
*
@ -526,9 +498,9 @@ TRACE_EVENT(io_uring_task_add,
*/
TRACE_EVENT(io_uring_req_failed,
TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error),
TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error),
TP_ARGS(sqe, ctx, req, error),
TP_ARGS(sqe, req, error),
TP_STRUCT__entry (
__field( void *, ctx )
@ -552,7 +524,7 @@ TRACE_EVENT(io_uring_req_failed,
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = sqe->user_data;
__entry->opcode = sqe->opcode;
@ -622,12 +594,42 @@ TRACE_EVENT(io_uring_cqe_overflow,
__entry->ocqe = ocqe;
),
TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, "
TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, "
"overflow_cqe %p",
__entry->ctx, __entry->user_data, __entry->res,
__entry->cflags, __entry->ocqe)
);
/*
* io_uring_task_work_run - ran task work
*
* @tctx: pointer to a io_uring_task
* @count: how many functions it ran
* @loops: how many loops it ran
*
*/
TRACE_EVENT(io_uring_task_work_run,
TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
TP_ARGS(tctx, count, loops),
TP_STRUCT__entry (
__field( void *, tctx )
__field( unsigned int, count )
__field( unsigned int, loops )
),
TP_fast_assign(
__entry->tctx = tctx;
__entry->count = count;
__entry->loops = loops;
),
TP_printk("tctx %p, count %u, loops %u",
__entry->tctx, __entry->count, __entry->loops)
);
#endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */

View File

@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/time_types.h>
/*
* IO submission data structure (Submission Queue Entry)
@ -50,6 +51,7 @@ struct io_uring_sqe {
__u32 unlink_flags;
__u32 hardlink_flags;
__u32 xattr_flags;
__u32 msg_ring_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@ -140,9 +142,12 @@ enum {
* IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
*/
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */
/*
* Only one task is allowed to submit requests
*/
#define IORING_SETUP_SINGLE_ISSUER (1U << 12)
enum io_uring_op {
IORING_OP_NOP,
@ -229,10 +234,13 @@ enum io_uring_op {
*
* IORING_POLL_UPDATE Update existing poll request, matching
* sqe->addr as the old user_data field.
*
* IORING_POLL_LEVEL Level triggered poll.
*/
#define IORING_POLL_ADD_MULTI (1U << 0)
#define IORING_POLL_UPDATE_EVENTS (1U << 1)
#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
#define IORING_POLL_ADD_LEVEL (1U << 3)
/*
* ASYNC_CANCEL flags.
@ -241,10 +249,12 @@ enum io_uring_op {
* IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the
* request 'user_data'
* IORING_ASYNC_CANCEL_ANY Match any request
* IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor
*/
#define IORING_ASYNC_CANCEL_ALL (1U << 0)
#define IORING_ASYNC_CANCEL_FD (1U << 1)
#define IORING_ASYNC_CANCEL_ANY (1U << 2)
#define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3)
/*
* send/sendmsg and recv/recvmsg flags (sqe->ioprio)
@ -253,14 +263,35 @@ enum io_uring_op {
* or receive and arm poll if that yields an
* -EAGAIN result, arm poll upfront and skip
* the initial transfer attempt.
*
* IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if
* the handler will continue to report
* CQEs on behalf of the same SQE.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
#define IORING_RECV_MULTISHOT (1U << 1)
/*
* accept flags stored in sqe->ioprio
*/
#define IORING_ACCEPT_MULTISHOT (1U << 0)
/*
* IORING_OP_MSG_RING command types, stored in sqe->addr
*/
enum {
IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */
IORING_MSG_SEND_FD, /* send a registered fd to another ring */
};
/*
* IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
*
* IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not
* applicable for IORING_MSG_DATA, obviously.
*/
#define IORING_MSG_RING_CQE_SKIP (1U << 0)
/*
* IO completion data structure (Completion Queue Entry)
*/
@ -420,6 +451,12 @@ enum {
IORING_REGISTER_PBUF_RING = 22,
IORING_UNREGISTER_PBUF_RING = 23,
/* sync cancelation API */
IORING_REGISTER_SYNC_CANCEL = 24,
/* register a range of fixed file slots for automatic slot allocation */
IORING_REGISTER_FILE_ALLOC_RANGE = 25,
/* this goes last */
IORING_REGISTER_LAST
};
@ -483,7 +520,7 @@ struct io_uring_probe {
__u8 ops_len; /* length of ops[] array below */
__u16 resv;
__u32 resv2[3];
struct io_uring_probe_op ops[0];
struct io_uring_probe_op ops[];
};
struct io_uring_restriction {
@ -555,4 +592,32 @@ struct io_uring_getevents_arg {
__u64 ts;
};
/*
* Argument for IORING_REGISTER_SYNC_CANCEL
*/
struct io_uring_sync_cancel_reg {
__u64 addr;
__s32 fd;
__u32 flags;
struct __kernel_timespec timeout;
__u64 pad[4];
};
/*
* Argument for IORING_REGISTER_FILE_ALLOC_RANGE
* The range is specified as [off, off + len)
*/
struct io_uring_file_index_range {
__u32 off;
__u32 len;
__u64 resv;
};
struct io_uring_recvmsg_out {
__u32 namelen;
__u32 controllen;
__u32 payloadlen;
__u32 flags;
};
#endif

11
io_uring/Makefile Normal file
View File

@ -0,0 +1,11 @@
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for io_uring
obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
sync.o advise.o filetable.o \
openclose.o uring_cmd.o epoll.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o
obj-$(CONFIG_IO_WQ) += io-wq.o

99
io_uring/advise.c Normal file
View File

@ -0,0 +1,99 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/io_uring.h>
#include <uapi/linux/fadvise.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "advise.h"
struct io_fadvise {
struct file *file;
u64 offset;
u32 len;
u32 advice;
};
struct io_madvise {
struct file *file;
u64 addr;
u32 len;
u32 advice;
};
int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = io_kiocb_to_cmd(req);
if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
return -EINVAL;
ma->addr = READ_ONCE(sqe->addr);
ma->len = READ_ONCE(sqe->len);
ma->advice = READ_ONCE(sqe->fadvise_advice);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
io_req_set_res(req, ret, 0);
return IOU_OK;
#else
return -EOPNOTSUPP;
#endif
}
int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_fadvise *fa = io_kiocb_to_cmd(req);
if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
return -EINVAL;
fa->offset = READ_ONCE(sqe->off);
fa->len = READ_ONCE(sqe->len);
fa->advice = READ_ONCE(sqe->fadvise_advice);
return 0;
}
int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_fadvise *fa = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK) {
switch (fa->advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
case POSIX_FADV_SEQUENTIAL:
break;
default:
return -EAGAIN;
}
}
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}

7
io_uring/advise.h Normal file
View File

@ -0,0 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_madvise(struct io_kiocb *req, unsigned int issue_flags);
int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_fadvise(struct io_kiocb *req, unsigned int issue_flags);

53
io_uring/alloc_cache.h Normal file
View File

@ -0,0 +1,53 @@
#ifndef IOU_ALLOC_CACHE_H
#define IOU_ALLOC_CACHE_H
/*
* Don't allow the cache to grow beyond this size.
*/
#define IO_ALLOC_CACHE_MAX 512
struct io_cache_entry {
struct hlist_node node;
};
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
struct io_cache_entry *entry)
{
if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
cache->nr_cached++;
hlist_add_head(&entry->node, &cache->list);
return true;
}
return false;
}
static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
{
if (!hlist_empty(&cache->list)) {
struct hlist_node *node = cache->list.first;
hlist_del(node);
return container_of(node, struct io_cache_entry, node);
}
return NULL;
}
static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
{
INIT_HLIST_HEAD(&cache->list);
cache->nr_cached = 0;
}
static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
void (*free)(struct io_cache_entry *))
{
while (!hlist_empty(&cache->list)) {
struct hlist_node *node = cache->list.first;
hlist_del(node);
free(container_of(node, struct io_cache_entry, node));
}
cache->nr_cached = 0;
}
#endif

315
io_uring/cancel.c Normal file
View File

@ -0,0 +1,315 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "tctx.h"
#include "poll.h"
#include "timeout.h"
#include "cancel.h"
struct io_cancel {
struct file *file;
u64 addr;
u32 flags;
s32 fd;
};
#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED)
static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_cancel_data *cd = data;
if (req->ctx != cd->ctx)
return false;
if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
;
} else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
if (req->file != cd->file)
return false;
} else {
if (req->cqe.user_data != cd->data)
return false;
}
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
if (cd->seq == req->work.cancel_seq)
return false;
req->work.cancel_seq = cd->seq;
}
return true;
}
static int io_async_cancel_one(struct io_uring_task *tctx,
struct io_cancel_data *cd)
{
enum io_wq_cancel cancel_ret;
int ret = 0;
bool all;
if (!tctx || !tctx->io_wq)
return -ENOENT;
all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all);
switch (cancel_ret) {
case IO_WQ_CANCEL_OK:
ret = 0;
break;
case IO_WQ_CANCEL_RUNNING:
ret = -EALREADY;
break;
case IO_WQ_CANCEL_NOTFOUND:
ret = -ENOENT;
break;
}
return ret;
}
int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
unsigned issue_flags)
{
struct io_ring_ctx *ctx = cd->ctx;
int ret;
WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring);
ret = io_async_cancel_one(tctx, cd);
/*
* Fall-through even for -EALREADY, as we may have poll armed
* that need unarming.
*/
if (!ret)
return 0;
ret = io_poll_cancel(ctx, cd, issue_flags);
if (ret != -ENOENT)
return ret;
spin_lock(&ctx->completion_lock);
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
ret = io_timeout_cancel(ctx, cd);
spin_unlock(&ctx->completion_lock);
return ret;
}
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_cancel *cancel = io_kiocb_to_cmd(req);
if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (sqe->off || sqe->len || sqe->splice_fd_in)
return -EINVAL;
cancel->addr = READ_ONCE(sqe->addr);
cancel->flags = READ_ONCE(sqe->cancel_flags);
if (cancel->flags & ~CANCEL_FLAGS)
return -EINVAL;
if (cancel->flags & IORING_ASYNC_CANCEL_FD) {
if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
return -EINVAL;
cancel->fd = READ_ONCE(sqe->fd);
}
return 0;
}
static int __io_async_cancel(struct io_cancel_data *cd,
struct io_uring_task *tctx,
unsigned int issue_flags)
{
bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY);
struct io_ring_ctx *ctx = cd->ctx;
struct io_tctx_node *node;
int ret, nr = 0;
do {
ret = io_try_cancel(tctx, cd, issue_flags);
if (ret == -ENOENT)
break;
if (!all)
return ret;
nr++;
} while (1);
/* slow path, try all io-wq's */
io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
ret = io_async_cancel_one(tctx, cd);
if (ret != -ENOENT) {
if (!all)
break;
nr++;
}
}
io_ring_submit_unlock(ctx, issue_flags);
return all ? nr : ret;
}
int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_cancel *cancel = io_kiocb_to_cmd(req);
struct io_cancel_data cd = {
.ctx = req->ctx,
.data = cancel->addr,
.flags = cancel->flags,
.seq = atomic_inc_return(&req->ctx->cancel_seq),
};
struct io_uring_task *tctx = req->task->io_uring;
int ret;
if (cd.flags & IORING_ASYNC_CANCEL_FD) {
if (req->flags & REQ_F_FIXED_FILE ||
cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) {
req->flags |= REQ_F_FIXED_FILE;
req->file = io_file_get_fixed(req, cancel->fd,
issue_flags);
} else {
req->file = io_file_get_normal(req, cancel->fd);
}
if (!req->file) {
ret = -EBADF;
goto done;
}
cd.file = req->file;
}
ret = __io_async_cancel(&cd, tctx, issue_flags);
done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void init_hash_table(struct io_hash_table *table, unsigned size)
{
unsigned int i;
for (i = 0; i < size; i++) {
spin_lock_init(&table->hbs[i].lock);
INIT_HLIST_HEAD(&table->hbs[i].list);
}
}
static int __io_sync_cancel(struct io_uring_task *tctx,
struct io_cancel_data *cd, int fd)
{
struct io_ring_ctx *ctx = cd->ctx;
/* fixed must be grabbed every time since we drop the uring_lock */
if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
(cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
unsigned long file_ptr;
if (unlikely(fd > ctx->nr_user_files))
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
cd->file = (struct file *) (file_ptr & FFS_MASK);
if (!cd->file)
return -EBADF;
}
return __io_async_cancel(cd, tctx, 0);
}
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
__must_hold(&ctx->uring_lock)
{
struct io_cancel_data cd = {
.ctx = ctx,
.seq = atomic_inc_return(&ctx->cancel_seq),
};
ktime_t timeout = KTIME_MAX;
struct io_uring_sync_cancel_reg sc;
struct fd f = { };
DEFINE_WAIT(wait);
int ret;
if (copy_from_user(&sc, arg, sizeof(sc)))
return -EFAULT;
if (sc.flags & ~CANCEL_FLAGS)
return -EINVAL;
if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3])
return -EINVAL;
cd.data = sc.addr;
cd.flags = sc.flags;
/* we can grab a normal file descriptor upfront */
if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
!(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
f = fdget(sc.fd);
if (!f.file)
return -EBADF;
cd.file = f.file;
}
ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
/* found something, done! */
if (ret != -EALREADY)
goto out;
if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) {
struct timespec64 ts = {
.tv_sec = sc.timeout.tv_sec,
.tv_nsec = sc.timeout.tv_nsec
};
timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
}
/*
* Keep looking until we get -ENOENT. we'll get woken everytime
* every time a request completes and will retry the cancelation.
*/
do {
cd.seq = atomic_inc_return(&ctx->cancel_seq);
prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE);
ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
if (ret != -EALREADY)
break;
mutex_unlock(&ctx->uring_lock);
ret = io_run_task_work_sig();
if (ret < 0) {
mutex_lock(&ctx->uring_lock);
break;
}
ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS);
mutex_lock(&ctx->uring_lock);
if (!ret) {
ret = -ETIME;
break;
}
} while (1);
finish_wait(&ctx->cq_wait, &wait);
if (ret == -ENOENT || ret > 0)
ret = 0;
out:
fdput(f);
return ret;
}

23
io_uring/cancel.h Normal file
View File

@ -0,0 +1,23 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/io_uring_types.h>
struct io_cancel_data {
struct io_ring_ctx *ctx;
union {
u64 data;
struct file *file;
};
u32 flags;
int seq;
};
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
unsigned int issue_flags);
void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);

65
io_uring/epoll.c Normal file
View File

@ -0,0 +1,65 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/io_uring.h>
#include <linux/eventpoll.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "epoll.h"
#if defined(CONFIG_EPOLL)
struct io_epoll {
struct file *file;
int epfd;
int op;
int fd;
struct epoll_event event;
};
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_epoll *epoll = io_kiocb_to_cmd(req);
pr_warn_once("%s: epoll_ctl support in io_uring is deprecated and will "
"be removed in a future Linux kernel version.\n",
current->comm);
if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
epoll->epfd = READ_ONCE(sqe->fd);
epoll->op = READ_ONCE(sqe->len);
epoll->fd = READ_ONCE(sqe->off);
if (ep_op_has_event(epoll->op)) {
struct epoll_event __user *ev;
ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
if (copy_from_user(&epoll->event, ev, sizeof(*ev)))
return -EFAULT;
}
return 0;
}
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_epoll *ie = io_kiocb_to_cmd(req);
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
#endif

6
io_uring/epoll.h Normal file
View File

@ -0,0 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#if defined(CONFIG_EPOLL)
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags);
#endif

194
io_uring/fdinfo.c Normal file
View File

@ -0,0 +1,194 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "cancel.h"
#include "rsrc.h"
#ifdef CONFIG_PROC_FS
static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
const struct cred *cred)
{
struct user_namespace *uns = seq_user_ns(m);
struct group_info *gi;
kernel_cap_t cap;
unsigned __capi;
int g;
seq_printf(m, "%5d\n", id);
seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
seq_puts(m, "\n\tGroups:\t");
gi = cred->group_info;
for (g = 0; g < gi->ngroups; g++) {
seq_put_decimal_ull(m, g ? " " : "",
from_kgid_munged(uns, gi->gid[g]));
}
seq_puts(m, "\n\tCapEff:\t");
cap = cred->cap_effective;
CAP_FOR_EACH_U32(__capi)
seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
seq_putc(m, '\n');
return 0;
}
static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
struct seq_file *m)
{
struct io_sq_data *sq = NULL;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
unsigned int sq_head = READ_ONCE(r->sq.head);
unsigned int sq_tail = READ_ONCE(r->sq.tail);
unsigned int cq_head = READ_ONCE(r->cq.head);
unsigned int cq_tail = READ_ONCE(r->cq.tail);
unsigned int cq_shift = 0;
unsigned int sq_entries, cq_entries;
bool has_lock;
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
unsigned int i;
if (is_cqe32)
cq_shift = 1;
/*
* we may get imprecise sqe and cqe info if uring is actively running
* since we get cached_sq_head and cached_cq_tail without uring_lock
* and sq_tail and cq_head are changed by userspace. But it's ok since
* we usually use these info when it is stuck.
*/
seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
seq_printf(m, "SqHead:\t%u\n", sq_head);
seq_printf(m, "SqTail:\t%u\n", sq_tail);
seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
seq_printf(m, "CqHead:\t%u\n", cq_head);
seq_printf(m, "CqTail:\t%u\n", cq_tail);
seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
for (i = 0; i < sq_entries; i++) {
unsigned int entry = i + sq_head;
unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
struct io_uring_sqe *sqe;
if (sq_idx > sq_mask)
continue;
sqe = &ctx->sq_sqes[sq_idx];
seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
sq_idx, sqe->opcode, sqe->fd, sqe->flags,
sqe->user_data);
}
seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
for (i = 0; i < cq_entries; i++) {
unsigned int entry = i + cq_head;
struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
if (!is_cqe32) {
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags);
} else {
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
"extra1:%llu, extra2:%llu\n",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
}
}
/*
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
* since fdinfo case grabs it in the opposite direction of normal use
* cases. If we fail to get the lock, we just don't iterate any
* structures that could be going away outside the io_uring mutex.
*/
has_lock = mutex_trylock(&ctx->uring_lock);
if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
sq = ctx->sq_data;
if (!sq->thread)
sq = NULL;
}
seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
struct file *f = io_file_from_index(&ctx->file_table, i);
if (f)
seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
else
seq_printf(m, "%5u: <none>\n", i);
}
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
struct io_mapped_ubuf *buf = ctx->user_bufs[i];
unsigned int len = buf->ubuf_end - buf->ubuf;
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
}
if (has_lock && !xa_empty(&ctx->personalities)) {
unsigned long index;
const struct cred *cred;
seq_printf(m, "Personalities:\n");
xa_for_each(&ctx->personalities, index, cred)
io_uring_show_cred(m, index, cred);
}
if (has_lock)
mutex_unlock(&ctx->uring_lock);
seq_puts(m, "PollList:\n");
for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
struct io_kiocb *req;
spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
task_work_pending(req->task));
spin_unlock(&hb->lock);
}
seq_puts(m, "CqOverflowList:\n");
spin_lock(&ctx->completion_lock);
list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
struct io_uring_cqe *cqe = &ocqe->cqe;
seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
cqe->user_data, cqe->res, cqe->flags);
}
spin_unlock(&ctx->completion_lock);
}
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
if (percpu_ref_tryget(&ctx->refs)) {
__io_uring_show_fdinfo(ctx, m);
percpu_ref_put(&ctx->refs);
}
}
#endif

3
io_uring/fdinfo.h Normal file
View File

@ -0,0 +1,3 @@
// SPDX-License-Identifier: GPL-2.0
void io_uring_show_fdinfo(struct seq_file *m, struct file *f);

193
io_uring/filetable.c Normal file
View File

@ -0,0 +1,193 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "rsrc.h"
#include "filetable.h"
static int io_file_bitmap_get(struct io_ring_ctx *ctx)
{
struct io_file_table *table = &ctx->file_table;
unsigned long nr = ctx->file_alloc_end;
int ret;
do {
ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
if (ret != nr)
return ret;
if (table->alloc_hint == ctx->file_alloc_start)
break;
nr = table->alloc_hint;
table->alloc_hint = ctx->file_alloc_start;
} while (1);
return -ENFILE;
}
bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
table->files = kvcalloc(nr_files, sizeof(table->files[0]),
GFP_KERNEL_ACCOUNT);
if (unlikely(!table->files))
return false;
table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
if (unlikely(!table->bitmap)) {
kvfree(table->files);
return false;
}
return true;
}
void io_free_file_tables(struct io_file_table *table)
{
kvfree(table->files);
bitmap_free(table->bitmap);
table->files = NULL;
table->bitmap = NULL;
}
static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
u32 slot_index)
__must_hold(&req->ctx->uring_lock)
{
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret;
if (io_is_uring_fops(file))
return -EBADF;
if (!ctx->file_data)
return -ENXIO;
if (slot_index >= ctx->nr_user_files)
return -EINVAL;
slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
if (file_slot->file_ptr) {
struct file *old_file;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
goto err;
old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
ctx->rsrc_node, old_file);
if (ret)
goto err;
file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, slot_index);
needs_switch = true;
}
ret = io_scm_file_account(ctx, file);
if (!ret) {
*io_get_tag_slot(ctx->file_data, slot_index) = 0;
io_fixed_file_set(file_slot, file);
io_file_bitmap_set(&ctx->file_table, slot_index);
}
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
if (ret)
fput(file);
return ret;
}
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
unsigned int file_slot)
{
bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
int ret;
if (alloc_slot) {
ret = io_file_bitmap_get(ctx);
if (unlikely(ret < 0))
return ret;
file_slot = ret;
} else {
file_slot--;
}
ret = io_install_fixed_file(ctx, file, file_slot);
if (!ret && alloc_slot)
ret = file_slot;
return ret;
}
/*
* Note when io_fixed_fd_install() returns error value, it will ensure
* fput() is called correspondingly.
*/
int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
struct file *file, unsigned int file_slot)
{
struct io_ring_ctx *ctx = req->ctx;
int ret;
io_ring_submit_lock(ctx, issue_flags);
ret = __io_fixed_fd_install(ctx, file, file_slot);
io_ring_submit_unlock(ctx, issue_flags);
if (unlikely(ret < 0))
fput(file);
return ret;
}
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
{
struct io_fixed_file *file_slot;
struct file *file;
int ret;
if (unlikely(!ctx->file_data))
return -ENXIO;
if (offset >= ctx->nr_user_files)
return -EINVAL;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
offset = array_index_nospec(offset, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, offset);
if (!file_slot->file_ptr)
return -EBADF;
file = (struct file *)(file_slot->file_ptr & FFS_MASK);
ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
if (ret)
return ret;
file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, offset);
io_rsrc_node_switch(ctx, ctx->file_data);
return 0;
}
int io_register_file_alloc_range(struct io_ring_ctx *ctx,
struct io_uring_file_index_range __user *arg)
{
struct io_uring_file_index_range range;
u32 end;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
if (check_add_overflow(range.off, range.len, &end))
return -EOVERFLOW;
if (range.resv || end > ctx->nr_user_files)
return -EINVAL;
io_file_table_set_alloc_range(ctx, range.off, range.len);
return 0;
}

88
io_uring/filetable.h Normal file
View File

@ -0,0 +1,88 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_FILE_TABLE_H
#define IOU_FILE_TABLE_H
#include <linux/file.h>
#include <linux/io_uring_types.h>
/*
* FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0
* and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we
* can't safely always dereference the file when the task has exited and ring
* cleanup is done. If a file is tracked and part of SCM, then unix gc on
* process exit may reap it before __io_sqe_files_unregister() is run.
*/
#define FFS_NOWAIT 0x1UL
#define FFS_ISREG 0x2UL
#if defined(CONFIG_64BIT)
#define FFS_SCM 0x4UL
#else
#define IO_URING_SCM_ALL
#define FFS_SCM 0x0UL
#endif
#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
void io_free_file_tables(struct io_file_table *table);
int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
struct file *file, unsigned int file_slot);
int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
unsigned int file_slot);
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);
int io_register_file_alloc_range(struct io_ring_ctx *ctx,
struct io_uring_file_index_range __user *arg);
unsigned int io_file_get_flags(struct file *file);
static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
{
__clear_bit(bit, table->bitmap);
table->alloc_hint = bit;
}
static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
{
WARN_ON_ONCE(test_bit(bit, table->bitmap));
__set_bit(bit, table->bitmap);
table->alloc_hint = bit + 1;
}
static inline struct io_fixed_file *
io_fixed_file_slot(struct io_file_table *table, unsigned i)
{
return &table->files[i];
}
static inline struct file *io_file_from_index(struct io_file_table *table,
int index)
{
struct io_fixed_file *slot = io_fixed_file_slot(table, index);
return (struct file *) (slot->file_ptr & FFS_MASK);
}
static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
struct file *file)
{
unsigned long file_ptr = (unsigned long) file;
file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
}
static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx)
{
ctx->file_table.alloc_hint = ctx->file_alloc_start;
}
static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx,
unsigned off, unsigned len)
{
ctx->file_alloc_start = off;
ctx->file_alloc_end = off + len;
io_reset_alloc_hint(ctx);
}
#endif

293
io_uring/fs.c Normal file
View File

@ -0,0 +1,293 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "../fs/internal.h"
#include "io_uring.h"
#include "fs.h"
struct io_rename {
struct file *file;
int old_dfd;
int new_dfd;
struct filename *oldpath;
struct filename *newpath;
int flags;
};
struct io_unlink {
struct file *file;
int dfd;
int flags;
struct filename *filename;
};
struct io_mkdir {
struct file *file;
int dfd;
umode_t mode;
struct filename *filename;
};
struct io_link {
struct file *file;
int old_dfd;
int new_dfd;
struct filename *oldpath;
struct filename *newpath;
int flags;
};
int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_rename *ren = io_kiocb_to_cmd(req);
const char __user *oldf, *newf;
if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
ren->old_dfd = READ_ONCE(sqe->fd);
oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
ren->new_dfd = READ_ONCE(sqe->len);
ren->flags = READ_ONCE(sqe->rename_flags);
ren->oldpath = getname(oldf);
if (IS_ERR(ren->oldpath))
return PTR_ERR(ren->oldpath);
ren->newpath = getname(newf);
if (IS_ERR(ren->newpath)) {
putname(ren->oldpath);
return PTR_ERR(ren->newpath);
}
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rename *ren = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
ren->newpath, ren->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_renameat_cleanup(struct io_kiocb *req)
{
struct io_rename *ren = io_kiocb_to_cmd(req);
putname(ren->oldpath);
putname(ren->newpath);
}
int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_unlink *un = io_kiocb_to_cmd(req);
const char __user *fname;
if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
un->dfd = READ_ONCE(sqe->fd);
un->flags = READ_ONCE(sqe->unlink_flags);
if (un->flags & ~AT_REMOVEDIR)
return -EINVAL;
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
un->filename = getname(fname);
if (IS_ERR(un->filename))
return PTR_ERR(un->filename);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_unlink *un = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
if (un->flags & AT_REMOVEDIR)
ret = do_rmdir(un->dfd, un->filename);
else
ret = do_unlinkat(un->dfd, un->filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_unlinkat_cleanup(struct io_kiocb *req)
{
struct io_unlink *ul = io_kiocb_to_cmd(req);
putname(ul->filename);
}
int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_mkdir *mkd = io_kiocb_to_cmd(req);
const char __user *fname;
if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
mkd->dfd = READ_ONCE(sqe->fd);
mkd->mode = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
mkd->filename = getname(fname);
if (IS_ERR(mkd->filename))
return PTR_ERR(mkd->filename);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_mkdir *mkd = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_mkdirat_cleanup(struct io_kiocb *req)
{
struct io_mkdir *md = io_kiocb_to_cmd(req);
putname(md->filename);
}
int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_link *sl = io_kiocb_to_cmd(req);
const char __user *oldpath, *newpath;
if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
sl->new_dfd = READ_ONCE(sqe->fd);
oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
sl->oldpath = getname(oldpath);
if (IS_ERR(sl->oldpath))
return PTR_ERR(sl->oldpath);
sl->newpath = getname(newpath);
if (IS_ERR(sl->newpath)) {
putname(sl->oldpath);
return PTR_ERR(sl->newpath);
}
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_link *sl = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_link *lnk = io_kiocb_to_cmd(req);
const char __user *oldf, *newf;
if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
lnk->old_dfd = READ_ONCE(sqe->fd);
lnk->new_dfd = READ_ONCE(sqe->len);
oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
lnk->flags = READ_ONCE(sqe->hardlink_flags);
lnk->oldpath = getname(oldf);
if (IS_ERR(lnk->oldpath))
return PTR_ERR(lnk->oldpath);
lnk->newpath = getname(newf);
if (IS_ERR(lnk->newpath)) {
putname(lnk->oldpath);
return PTR_ERR(lnk->newpath);
}
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_link *lnk = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
lnk->newpath, lnk->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_link_cleanup(struct io_kiocb *req)
{
struct io_link *sl = io_kiocb_to_cmd(req);
putname(sl->oldpath);
putname(sl->newpath);
}

20
io_uring/fs.h Normal file
View File

@ -0,0 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_renameat(struct io_kiocb *req, unsigned int issue_flags);
void io_renameat_cleanup(struct io_kiocb *req);
int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags);
void io_unlinkat_cleanup(struct io_kiocb *req);
int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags);
void io_mkdirat_cleanup(struct io_kiocb *req);
int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags);
int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_linkat(struct io_kiocb *req, unsigned int issue_flags);
void io_link_cleanup(struct io_kiocb *req);

View File

@ -18,6 +18,8 @@
#include <uapi/linux/io_uring.h>
#include "io-wq.h"
#include "slist.h"
#include "io_uring.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ)
@ -518,23 +520,11 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
return NULL;
}
static bool io_flush_signals(void)
{
if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
__set_current_state(TASK_RUNNING);
clear_notify_signal();
if (task_work_pending(current))
task_work_run();
return true;
}
return false;
}
static void io_assign_current_work(struct io_worker *worker,
struct io_wq_work *work)
{
if (work) {
io_flush_signals();
io_run_task_work();
cond_resched();
}
@ -654,7 +644,7 @@ static int io_wqe_worker(void *data)
last_timeout = false;
__io_worker_idle(wqe, worker);
raw_spin_unlock(&wqe->lock);
if (io_flush_signals())
if (io_run_task_work())
continue;
ret = schedule_timeout(WORKER_IDLE_TIMEOUT);
if (signal_pending(current)) {

83
io_uring/io-wq.h Normal file
View File

@ -0,0 +1,83 @@
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
#include <linux/refcount.h>
#include <linux/io_uring_types.h>
struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HASHED = 2,
IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
enum io_wq_cancel {
IO_WQ_CANCEL_OK, /* cancelled before started */
IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_hash {
refcount_t refs;
unsigned long map;
struct wait_queue_head wait;
};
static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
if (refcount_dec_and_test(&hash->refs))
kfree(hash);
}
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
}
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data, bool cancel_all);
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif
static inline bool io_wq_current_is_worker(void)
{
return in_task() && (current->flags & PF_IO_WORKER) &&
current->worker_private;
}
#endif

3980
io_uring/io_uring.c Normal file

File diff suppressed because it is too large Load Diff

261
io_uring/io_uring.h Normal file
View File

@ -0,0 +1,261 @@
#ifndef IOU_CORE_H
#define IOU_CORE_H
#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/io_uring_types.h>
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"
#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#endif
enum {
IOU_OK = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
/*
* Intended only when both REQ_F_POLLED and REQ_F_APOLL_MULTISHOT
* are set to indicate to the poll runner that multishot should be
* removed and the result is set on req->cqe.res.
*/
IOU_STOP_MULTISHOT = -ECANCELED,
};
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx);
bool io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(void);
void io_req_complete_failed(struct io_kiocb *req, s32 res);
void __io_req_complete(struct io_kiocb *req, unsigned issue_flags);
void io_req_complete_post(struct io_kiocb *req);
void __io_req_complete_post(struct io_kiocb *req);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
bool allow_overflow);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
return req->flags & REQ_F_FIXED_FILE;
}
bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_work_add(struct io_kiocb *req);
void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
void io_req_task_complete(struct io_kiocb *req, bool *locked);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, bool *locked);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx);
int io_poll_issue(struct io_kiocb *req, bool *locked);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
int io_req_prep_async(struct io_kiocb *req);
struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
void io_wq_submit_work(struct io_wq_work *work);
void io_free_req(struct io_kiocb *req);
void io_queue_next(struct io_kiocb *req);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
static inline void io_cq_lock(struct io_ring_ctx *ctx)
__acquires(ctx->completion_lock)
{
spin_lock(&ctx->completion_lock);
}
void io_cq_unlock_post(struct io_ring_ctx *ctx);
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
struct io_uring_cqe *cqe = ctx->cqe_cached;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return cqe;
}
return __io_get_cqe(ctx);
}
static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
struct io_uring_cqe *cqe;
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
cqe = io_get_cqe(ctx);
if (unlikely(!cqe))
return io_req_cqe_overflow(req);
trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
req->cqe.res, req->cqe.flags,
(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0,
(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0);
memcpy(cqe, &req->cqe, sizeof(*cqe));
if (ctx->flags & IORING_SETUP_CQE32) {
u64 extra1 = 0, extra2 = 0;
if (req->flags & REQ_F_CQE32_INIT) {
extra1 = req->extra1;
extra2 = req->extra2;
}
WRITE_ONCE(cqe->big_cqe[0], extra1);
WRITE_ONCE(cqe->big_cqe[1], extra2);
}
return true;
}
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
if (req->flags & REQ_F_CQE_SKIP) {
req->flags &= ~REQ_F_CQE_SKIP;
req->flags |= REQ_F_SKIP_LINK_CQES;
}
}
static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
{
req->cqe.res = res;
req->cqe.flags = cflags;
}
static inline bool req_has_async_data(struct io_kiocb *req)
{
return req->flags & REQ_F_ASYNC_DATA;
}
static inline void io_put_file(struct file *file)
{
if (file)
fput(file);
}
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
unsigned issue_flags)
{
lockdep_assert_held(&ctx->uring_lock);
if (issue_flags & IO_URING_F_UNLOCKED)
mutex_unlock(&ctx->uring_lock);
}
static inline void io_ring_submit_lock(struct io_ring_ctx *ctx,
unsigned issue_flags)
{
/*
* "Normal" inline submissions always hold the uring_lock, since we
* grab it from the system call. Same is true for the SQPOLL offload.
* The only exception is when we've detached the request and issue it
* from an async worker thread, grab the lock for that case.
*/
if (issue_flags & IO_URING_F_UNLOCKED)
mutex_lock(&ctx->uring_lock);
lockdep_assert_held(&ctx->uring_lock);
}
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
* wake_up_all() may seem excessive, but io_wake_function() and
* io_should_wake() handle the termination of the loop and only
* wake as many waiters as we need to.
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
}
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
/* make sure SQ entry isn't read before tail */
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
static inline bool io_run_task_work(void)
{
if (test_thread_flag(TIF_NOTIFY_SIGNAL)) {
__set_current_state(TASK_RUNNING);
clear_notify_signal();
if (task_work_pending(current))
task_work_run();
return true;
}
return false;
}
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
if (!*locked) {
mutex_lock(&ctx->uring_lock);
*locked = true;
}
}
/*
* Don't complete immediately but use deferred completion infrastructure.
* Protected by ->uring_lock and can only be used either with
* IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex.
*/
static inline void io_req_complete_defer(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
struct io_submit_state *state = &req->ctx->submit_state;
lockdep_assert_held(&req->ctx->uring_lock);
wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd))
__io_commit_cqring_flush(ctx);
}
#endif

549
io_uring/kbuf.c Normal file
View File

@ -0,0 +1,549 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
#define BGID_ARRAY 64
struct io_provide_buf {
struct file *file;
__u64 addr;
__u32 len;
__u32 bgid;
__u16 nbufs;
__u16 bid;
};
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
if (ctx->io_bl && bgid < BGID_ARRAY)
return &ctx->io_bl[bgid];
return xa_load(&ctx->io_bl_xa, bgid);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned int bgid)
{
bl->bgid = bgid;
if (bgid < BGID_ARRAY)
return 0;
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
struct io_buffer *buf;
/*
* For legacy provided buffer mode, don't recycle if we already did
* IO to this buffer. For ring-mapped provided buffer mode, we should
* increment ring->head to explicitly monopolize the buffer to avoid
* multiple use.
*/
if (req->flags & REQ_F_PARTIAL_IO)
return;
io_ring_submit_lock(ctx, issue_flags);
buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
req->buf_index = buf->bgid;
io_ring_submit_unlock(ctx, issue_flags);
return;
}
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
unsigned int cflags;
/*
* We can add this buffer back to two lists:
*
* 1) The io_buffers_cache list. This one is protected by the
* ctx->uring_lock. If we already hold this lock, add back to this
* list as we can grab it from issue as well.
* 2) The io_buffers_comp list. This one is protected by the
* ctx->completion_lock.
*
* We migrate buffers from the comp_list to the issue cache list
* when we need one.
*/
if (req->flags & REQ_F_BUFFER_RING) {
/* no buffers to recycle for this case */
cflags = __io_put_kbuf_list(req, NULL);
} else if (issue_flags & IO_URING_F_UNLOCKED) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
spin_unlock(&ctx->completion_lock);
} else {
lockdep_assert_held(&req->ctx->uring_lock);
cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
}
return cflags;
}
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl)
{
if (!list_empty(&bl->buf_list)) {
struct io_buffer *kbuf;
kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&kbuf->list);
if (*len == 0 || *len > kbuf->len)
*len = kbuf->len;
req->flags |= REQ_F_BUFFER_SELECTED;
req->kbuf = kbuf;
req->buf_index = kbuf->bid;
return u64_to_user_ptr(kbuf->addr);
}
return NULL;
}
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl,
unsigned int issue_flags)
{
struct io_uring_buf_ring *br = bl->buf_ring;
struct io_uring_buf *buf;
__u16 head = bl->head;
if (unlikely(smp_load_acquire(&br->tail) == head))
return NULL;
head &= bl->mask;
if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
buf = &br->bufs[head];
} else {
int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
buf = page_address(bl->buf_pages[index]);
buf += off;
}
if (*len == 0 || *len > buf->len)
*len = buf->len;
req->flags |= REQ_F_BUFFER_RING;
req->buf_list = bl;
req->buf_index = buf->bid;
if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
/*
* If we came in unlocked, we have no choice but to consume the
* buffer here, otherwise nothing ensures that the buffer won't
* get used by others. This does mean it'll be pinned until the
* IO completes, coming in unlocked means we're being called from
* io-wq context and there may be further retries in async hybrid
* mode. For the locked case, the caller must call commit when
* the transfer completes (or if we get -EAGAIN and must poll of
* retry).
*/
req->buf_list = NULL;
bl->head++;
}
return u64_to_user_ptr(buf->addr);
}
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
void __user *ret = NULL;
io_ring_submit_lock(req->ctx, issue_flags);
bl = io_buffer_get_list(ctx, req->buf_index);
if (likely(bl)) {
if (bl->buf_nr_pages)
ret = io_ring_buffer_select(req, len, bl, issue_flags);
else
ret = io_provided_buffer_select(req, len, bl);
}
io_ring_submit_unlock(req->ctx, issue_flags);
return ret;
}
static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
{
int i;
ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
GFP_KERNEL);
if (!ctx->io_bl)
return -ENOMEM;
for (i = 0; i < BGID_ARRAY; i++) {
INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
ctx->io_bl[i].bgid = i;
}
return 0;
}
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;
/* shouldn't happen */
if (!nbufs)
return 0;
if (bl->buf_nr_pages) {
int j;
i = bl->buf_ring->tail - bl->head;
for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]);
kvfree(bl->buf_pages);
bl->buf_pages = NULL;
bl->buf_nr_pages = 0;
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
return i;
}
/* the head kbuf is the list itself */
while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
if (++i == nbufs)
return i;
cond_resched();
}
i++;
return i;
}
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
struct io_buffer_list *bl;
unsigned long index;
int i;
for (i = 0; i < BGID_ARRAY; i++) {
if (!ctx->io_bl)
break;
__io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
}
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
__io_remove_buffers(ctx, bl, -1U);
kfree(bl);
}
while (!list_empty(&ctx->io_buffers_pages)) {
struct page *page;
page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
list_del_init(&page->lru);
__free_page(page);
}
}
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req);
u64 tmp;
if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
if (!tmp || tmp > USHRT_MAX)
return -EINVAL;
memset(p, 0, sizeof(*p));
p->nbufs = tmp;
p->bgid = READ_ONCE(sqe->buf_group);
return 0;
}
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
io_ring_submit_lock(ctx, issue_flags);
ret = -ENOENT;
bl = io_buffer_get_list(ctx, p->bgid);
if (bl) {
ret = -EINVAL;
/* can't use provide/remove buffers command on mapped buffers */
if (!bl->buf_nr_pages)
ret = __io_remove_buffers(ctx, bl, p->nbufs);
}
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
io_req_set_res(req, ret, 0);
__io_req_complete(req, issue_flags);
io_ring_submit_unlock(ctx, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
unsigned long size, tmp_check;
struct io_provide_buf *p = io_kiocb_to_cmd(req);
u64 tmp;
if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
if (!tmp || tmp > USHRT_MAX)
return -E2BIG;
p->nbufs = tmp;
p->addr = READ_ONCE(sqe->addr);
p->len = READ_ONCE(sqe->len);
if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
&size))
return -EOVERFLOW;
if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
return -EOVERFLOW;
size = (unsigned long)p->len * p->nbufs;
if (!access_ok(u64_to_user_ptr(p->addr), size))
return -EFAULT;
p->bgid = READ_ONCE(sqe->buf_group);
tmp = READ_ONCE(sqe->off);
if (tmp > USHRT_MAX)
return -E2BIG;
p->bid = tmp;
return 0;
}
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
struct io_buffer *buf;
struct page *page;
int bufs_in_page;
/*
* Completions that don't happen inline (eg not under uring_lock) will
* add to ->io_buffers_comp. If we don't have any free buffers, check
* the completion list and splice those entries first.
*/
if (!list_empty_careful(&ctx->io_buffers_comp)) {
spin_lock(&ctx->completion_lock);
if (!list_empty(&ctx->io_buffers_comp)) {
list_splice_init(&ctx->io_buffers_comp,
&ctx->io_buffers_cache);
spin_unlock(&ctx->completion_lock);
return 0;
}
spin_unlock(&ctx->completion_lock);
}
/*
* No free buffers and no completion entries either. Allocate a new
* page worth of buffer entries and add those to our freelist.
*/
page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
return -ENOMEM;
list_add(&page->lru, &ctx->io_buffers_pages);
buf = page_address(page);
bufs_in_page = PAGE_SIZE / sizeof(*buf);
while (bufs_in_page) {
list_add_tail(&buf->list, &ctx->io_buffers_cache);
buf++;
bufs_in_page--;
}
return 0;
}
static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
struct io_buffer_list *bl)
{
struct io_buffer *buf;
u64 addr = pbuf->addr;
int i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
if (list_empty(&ctx->io_buffers_cache) &&
io_refill_buffer_cache(ctx))
break;
buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
list);
list_move_tail(&buf->list, &bl->buf_list);
buf->addr = addr;
buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
buf->bid = bid;
buf->bgid = pbuf->bgid;
addr += pbuf->len;
bid++;
cond_resched();
}
return i ? 0 : -ENOMEM;
}
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
int ret = 0;
io_ring_submit_lock(ctx, issue_flags);
if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
ret = io_init_bl_list(ctx);
if (ret)
goto err;
}
bl = io_buffer_get_list(ctx, p->bgid);
if (unlikely(!bl)) {
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl) {
ret = -ENOMEM;
goto err;
}
INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(ctx, bl, p->bgid);
if (ret) {
kfree(bl);
goto err;
}
}
/* can't add buffers via this command for a mapped buffer ring */
if (bl->buf_nr_pages) {
ret = -EINVAL;
goto err;
}
ret = io_add_buffers(ctx, p, bl);
err:
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
io_req_set_res(req, ret, 0);
__io_req_complete(req, issue_flags);
io_ring_submit_unlock(ctx, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_ring *br;
struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL;
struct page **pages;
int nr_pages;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL;
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
/* cannot disambiguate full vs empty due to head/tail size */
if (reg.ring_entries >= 65536)
return -EINVAL;
if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
int ret = io_init_bl_list(ctx);
if (ret)
return ret;
}
bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
return -EEXIST;
} else {
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return -ENOMEM;
}
pages = io_pin_pages(reg.ring_addr,
struct_size(br, bufs, reg.ring_entries),
&nr_pages);
if (IS_ERR(pages)) {
kfree(free_bl);
return PTR_ERR(pages);
}
br = page_address(pages[0]);
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
bl->nr_entries = reg.ring_entries;
bl->buf_ring = br;
bl->mask = reg.ring_entries - 1;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
}
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl)
return -ENOENT;
if (!bl->buf_nr_pages)
return -EINVAL;
__io_remove_buffers(ctx, bl, -1U);
if (bl->bgid >= BGID_ARRAY) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
kfree(bl);
}
return 0;
}

140
io_uring/kbuf.h Normal file
View File

@ -0,0 +1,140 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_KBUF_H
#define IOU_KBUF_H
#include <uapi/linux/io_uring.h>
struct io_buffer_list {
/*
* If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
* then these are classic provided buffers and ->buf_list is used.
*/
union {
struct list_head buf_list;
struct {
struct page **buf_pages;
struct io_uring_buf_ring *buf_ring;
};
};
__u16 bgid;
/* below is for ring provided buffers */
__u16 buf_nr_pages;
__u16 nr_entries;
__u16 head;
__u16 mask;
};
struct io_buffer {
struct list_head list;
__u64 addr;
__u32 len;
__u16 bid;
__u16 bgid;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags);
void io_destroy_buffers(struct io_ring_ctx *ctx);
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
{
/*
* We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
* the flag and hence ensure that bl->head doesn't get incremented.
* If the tail has already been incremented, hang on to it.
* The exception is partial io, that case we should increment bl->head
* to monopolize the buffer.
*/
if (req->buf_list) {
if (req->flags & REQ_F_PARTIAL_IO) {
/*
* If we end up here, then the io_uring_lock has
* been kept held since we retrieved the buffer.
* For the io-wq case, we already cleared
* req->buf_list when the buffer was retrieved,
* hence it cannot be set here for that case.
*/
req->buf_list->head++;
req->buf_list = NULL;
} else {
req->buf_index = req->buf_list->bgid;
req->flags &= ~REQ_F_BUFFER_RING;
}
}
}
static inline bool io_do_buffer_select(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_BUFFER_SELECT))
return false;
return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
}
static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
/*
* READV uses fields in `struct io_rw` (len/addr) to stash the selected
* buffer data. However if that buffer is recycled the original request
* data stored in addr is lost. Therefore forbid recycling for now.
*/
if (req->opcode == IORING_OP_READV)
return;
if (req->flags & REQ_F_BUFFER_SELECTED)
io_kbuf_recycle_legacy(req, issue_flags);
if (req->flags & REQ_F_BUFFER_RING)
io_kbuf_recycle_ring(req);
}
static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req,
struct list_head *list)
{
unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
if (req->flags & REQ_F_BUFFER_RING) {
if (req->buf_list) {
req->buf_index = req->buf_list->bgid;
req->buf_list->head++;
}
req->flags &= ~REQ_F_BUFFER_RING;
} else {
req->buf_index = req->kbuf->bgid;
list_add(&req->kbuf->list, list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
}
return ret;
}
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
lockdep_assert_held(&req->ctx->completion_lock);
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
}
static inline unsigned int io_put_kbuf(struct io_kiocb *req,
unsigned issue_flags)
{
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
return __io_put_kbuf(req, issue_flags);
}
#endif

171
io_uring/msg_ring.c Normal file
View File

@ -0,0 +1,171 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "rsrc.h"
#include "filetable.h"
#include "msg_ring.h"
struct io_msg {
struct file *file;
u64 user_data;
u32 len;
u32 cmd;
u32 src_fd;
u32 dst_fd;
u32 flags;
};
static int io_msg_ring_data(struct io_kiocb *req)
{
struct io_ring_ctx *target_ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req);
if (msg->src_fd || msg->dst_fd || msg->flags)
return -EINVAL;
if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
return 0;
return -EOVERFLOW;
}
static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
struct io_ring_ctx *octx,
unsigned int issue_flags)
{
if (issue_flags & IO_URING_F_UNLOCKED)
mutex_unlock(&ctx->uring_lock);
mutex_unlock(&octx->uring_lock);
}
static int io_double_lock_ctx(struct io_ring_ctx *ctx,
struct io_ring_ctx *octx,
unsigned int issue_flags)
{
/*
* To ensure proper ordering between the two ctxs, we can only
* attempt a trylock on the target. If that fails and we already have
* the source ctx lock, punt to io-wq.
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
if (!mutex_trylock(&octx->uring_lock))
return -EAGAIN;
return 0;
}
/* Always grab smallest value ctx first. We know ctx != octx. */
if (ctx < octx) {
mutex_lock(&ctx->uring_lock);
mutex_lock(&octx->uring_lock);
} else {
mutex_lock(&octx->uring_lock);
mutex_lock(&ctx->uring_lock);
}
return 0;
}
static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *target_ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
unsigned long file_ptr;
struct file *src_file;
int ret;
if (target_ctx == ctx)
return -EINVAL;
ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
if (unlikely(ret))
return ret;
ret = -EBADF;
if (unlikely(msg->src_fd >= ctx->nr_user_files))
goto out_unlock;
msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
src_file = (struct file *) (file_ptr & FFS_MASK);
get_file(src_file);
ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
if (ret < 0) {
fput(src_file);
goto out_unlock;
}
if (msg->flags & IORING_MSG_RING_CQE_SKIP)
goto out_unlock;
/*
* If this fails, the target still received the file descriptor but
* wasn't notified of the fact. This means that if this request
* completes with -EOVERFLOW, then the sender must ensure that a
* later IORING_OP_MSG_RING delivers the message.
*/
if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true))
ret = -EOVERFLOW;
out_unlock:
io_double_unlock_ctx(ctx, target_ctx, issue_flags);
return ret;
}
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
if (unlikely(sqe->buf_index || sqe->personality))
return -EINVAL;
msg->user_data = READ_ONCE(sqe->off);
msg->len = READ_ONCE(sqe->len);
msg->cmd = READ_ONCE(sqe->addr);
msg->src_fd = READ_ONCE(sqe->addr3);
msg->dst_fd = READ_ONCE(sqe->file_index);
msg->flags = READ_ONCE(sqe->msg_ring_flags);
if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
return -EINVAL;
return 0;
}
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
int ret;
ret = -EBADFD;
if (!io_is_uring_fops(req->file))
goto done;
switch (msg->cmd) {
case IORING_MSG_DATA:
ret = io_msg_ring_data(req);
break;
case IORING_MSG_SEND_FD:
ret = io_msg_send_fd(req, issue_flags);
break;
default:
ret = -EINVAL;
break;
}
done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
/* put file to avoid an attempt to IOPOLL the req */
io_put_file(req->file);
req->file = NULL;
return IOU_OK;
}

4
io_uring/msg_ring.h Normal file
View File

@ -0,0 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);

1047
io_uring/net.c Normal file

File diff suppressed because it is too large Load Diff

60
io_uring/net.h Normal file
View File

@ -0,0 +1,60 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/net.h>
#include <linux/uio.h>
#include "alloc_cache.h"
#if defined(CONFIG_NET)
struct io_async_msghdr {
union {
struct iovec fast_iov[UIO_FASTIOV];
struct {
struct iovec fast_iov_one;
__kernel_size_t controllen;
int namelen;
__kernel_size_t payloadlen;
};
struct io_cache_entry cache;
};
/* points to an allocated iov, if NULL we use fast_iov instead */
struct iovec *free_iov;
struct sockaddr __user *uaddr;
struct msghdr msg;
struct sockaddr_storage addr;
};
struct io_async_connect {
struct sockaddr_storage address;
};
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_shutdown(struct io_kiocb *req, unsigned int issue_flags);
int io_sendmsg_prep_async(struct io_kiocb *req);
void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req);
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags);
int io_send(struct io_kiocb *req, unsigned int issue_flags);
int io_recvmsg_prep_async(struct io_kiocb *req);
int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags);
int io_recv(struct io_kiocb *req, unsigned int issue_flags);
int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_accept(struct io_kiocb *req, unsigned int issue_flags);
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_socket(struct io_kiocb *req, unsigned int issue_flags);
int io_connect_prep_async(struct io_kiocb *req);
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
void io_netmsg_cache_free(struct io_cache_entry *entry);
#else
static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
{
}
#endif

25
io_uring/nop.c Normal file
View File

@ -0,0 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "nop.h"
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
return 0;
}
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
io_req_set_res(req, 0, 0);
return IOU_OK;
}

4
io_uring/nop.h Normal file
View File

@ -0,0 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_nop(struct io_kiocb *req, unsigned int issue_flags);

494
io_uring/opdef.c Normal file
View File

@ -0,0 +1,494 @@
// SPDX-License-Identifier: GPL-2.0
/*
* io_uring opcode handling table
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/io_uring.h>
#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"
#include "xattr.h"
#include "nop.h"
#include "fs.h"
#include "splice.h"
#include "sync.h"
#include "advise.h"
#include "openclose.h"
#include "uring_cmd.h"
#include "epoll.h"
#include "statx.h"
#include "net.h"
#include "msg_ring.h"
#include "timeout.h"
#include "poll.h"
#include "cancel.h"
#include "rw.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
WARN_ON_ONCE(1);
return -ECANCELED;
}
static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb,
const struct io_uring_sqe *sqe)
{
return -EOPNOTSUPP;
}
const struct io_op_def io_op_defs[] = {
[IORING_OP_NOP] = {
.audit_skip = 1,
.iopoll = 1,
.name = "NOP",
.prep = io_nop_prep,
.issue = io_nop,
},
[IORING_OP_READV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READV",
.prep = io_prep_rw,
.issue = io_read,
.prep_async = io_readv_prep_async,
.cleanup = io_readv_writev_cleanup,
},
[IORING_OP_WRITEV] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITEV",
.prep = io_prep_rw,
.issue = io_write,
.prep_async = io_writev_prep_async,
.cleanup = io_readv_writev_cleanup,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
.audit_skip = 1,
.name = "FSYNC",
.prep = io_fsync_prep,
.issue = io_fsync,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READ_FIXED",
.prep = io_prep_rw,
.issue = io_read,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITE_FIXED",
.prep = io_prep_rw,
.issue = io_write,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "POLL_ADD",
.prep = io_poll_add_prep,
.issue = io_poll_add,
},
[IORING_OP_POLL_REMOVE] = {
.audit_skip = 1,
.name = "POLL_REMOVE",
.prep = io_poll_remove_prep,
.issue = io_poll_remove,
},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
.audit_skip = 1,
.name = "SYNC_FILE_RANGE",
.prep = io_sfr_prep,
.issue = io_sync_file_range,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.ioprio = 1,
.name = "SENDMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_sendmsg_prep,
.issue = io_sendmsg,
.prep_async = io_sendmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.ioprio = 1,
.name = "RECVMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_recvmsg_prep,
.issue = io_recvmsg,
.prep_async = io_recvmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_TIMEOUT] = {
.audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
.name = "TIMEOUT",
.prep = io_timeout_prep,
.issue = io_timeout,
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
.audit_skip = 1,
.name = "TIMEOUT_REMOVE",
.prep = io_timeout_remove_prep,
.issue = io_timeout_remove,
},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.poll_exclusive = 1,
.ioprio = 1, /* used for flags */
.name = "ACCEPT",
#if defined(CONFIG_NET)
.prep = io_accept_prep,
.issue = io_accept,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_ASYNC_CANCEL] = {
.audit_skip = 1,
.name = "ASYNC_CANCEL",
.prep = io_async_cancel_prep,
.issue = io_async_cancel,
},
[IORING_OP_LINK_TIMEOUT] = {
.audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
.name = "LINK_TIMEOUT",
.prep = io_link_timeout_prep,
.issue = io_no_issue,
},
[IORING_OP_CONNECT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.name = "CONNECT",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_connect),
.prep = io_connect_prep,
.issue = io_connect,
.prep_async = io_connect_prep_async,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
.name = "FALLOCATE",
.prep = io_fallocate_prep,
.issue = io_fallocate,
},
[IORING_OP_OPENAT] = {
.name = "OPENAT",
.prep = io_openat_prep,
.issue = io_openat,
.cleanup = io_open_cleanup,
},
[IORING_OP_CLOSE] = {
.name = "CLOSE",
.prep = io_close_prep,
.issue = io_close,
},
[IORING_OP_FILES_UPDATE] = {
.audit_skip = 1,
.iopoll = 1,
.name = "FILES_UPDATE",
.prep = io_files_update_prep,
.issue = io_files_update,
},
[IORING_OP_STATX] = {
.audit_skip = 1,
.name = "STATX",
.prep = io_statx_prep,
.issue = io_statx,
.cleanup = io_statx_cleanup,
},
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READ",
.prep = io_prep_rw,
.issue = io_read,
},
[IORING_OP_WRITE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITE",
.prep = io_prep_rw,
.issue = io_write,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
.audit_skip = 1,
.name = "FADVISE",
.prep = io_fadvise_prep,
.issue = io_fadvise,
},
[IORING_OP_MADVISE] = {
.name = "MADVISE",
.prep = io_madvise_prep,
.issue = io_madvise,
},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.audit_skip = 1,
.ioprio = 1,
.name = "SEND",
#if defined(CONFIG_NET)
.prep = io_sendmsg_prep,
.issue = io_send,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.audit_skip = 1,
.ioprio = 1,
.name = "RECV",
#if defined(CONFIG_NET)
.prep = io_recvmsg_prep,
.issue = io_recv,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_OPENAT2] = {
.name = "OPENAT2",
.prep = io_openat2_prep,
.issue = io_openat2,
.cleanup = io_open_cleanup,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "EPOLL",
#if defined(CONFIG_EPOLL)
.prep = io_epoll_ctl_prep,
.issue = io_epoll_ctl,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "SPLICE",
.prep = io_splice_prep,
.issue = io_splice,
},
[IORING_OP_PROVIDE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.name = "PROVIDE_BUFFERS",
.prep = io_provide_buffers_prep,
.issue = io_provide_buffers,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.name = "REMOVE_BUFFERS",
.prep = io_remove_buffers_prep,
.issue = io_remove_buffers,
},
[IORING_OP_TEE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "TEE",
.prep = io_tee_prep,
.issue = io_tee,
},
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
.name = "SHUTDOWN",
#if defined(CONFIG_NET)
.prep = io_shutdown_prep,
.issue = io_shutdown,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_RENAMEAT] = {
.name = "RENAMEAT",
.prep = io_renameat_prep,
.issue = io_renameat,
.cleanup = io_renameat_cleanup,
},
[IORING_OP_UNLINKAT] = {
.name = "UNLINKAT",
.prep = io_unlinkat_prep,
.issue = io_unlinkat,
.cleanup = io_unlinkat_cleanup,
},
[IORING_OP_MKDIRAT] = {
.name = "MKDIRAT",
.prep = io_mkdirat_prep,
.issue = io_mkdirat,
.cleanup = io_mkdirat_cleanup,
},
[IORING_OP_SYMLINKAT] = {
.name = "SYMLINKAT",
.prep = io_symlinkat_prep,
.issue = io_symlinkat,
.cleanup = io_link_cleanup,
},
[IORING_OP_LINKAT] = {
.name = "LINKAT",
.prep = io_linkat_prep,
.issue = io_linkat,
.cleanup = io_link_cleanup,
},
[IORING_OP_MSG_RING] = {
.needs_file = 1,
.iopoll = 1,
.name = "MSG_RING",
.prep = io_msg_ring_prep,
.issue = io_msg_ring,
},
[IORING_OP_FSETXATTR] = {
.needs_file = 1,
.name = "FSETXATTR",
.prep = io_fsetxattr_prep,
.issue = io_fsetxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SETXATTR] = {
.name = "SETXATTR",
.prep = io_setxattr_prep,
.issue = io_setxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_FGETXATTR] = {
.needs_file = 1,
.name = "FGETXATTR",
.prep = io_fgetxattr_prep,
.issue = io_fgetxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_GETXATTR] = {
.name = "GETXATTR",
.prep = io_getxattr_prep,
.issue = io_getxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SOCKET] = {
.audit_skip = 1,
.name = "SOCKET",
#if defined(CONFIG_NET)
.prep = io_socket_prep,
.issue = io_socket,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_URING_CMD] = {
.needs_file = 1,
.plug = 1,
.name = "URING_CMD",
.async_size = uring_cmd_pdu_size(1),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
.prep_async = io_uring_cmd_prep_async,
},
};
const char *io_uring_get_opcode(u8 opcode)
{
if (opcode < IORING_OP_LAST)
return io_op_defs[opcode].name;
return "INVALID";
}
void __init io_uring_optable_init(void)
{
int i;
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) {
BUG_ON(!io_op_defs[i].prep);
if (io_op_defs[i].prep != io_eopnotsupp_prep)
BUG_ON(!io_op_defs[i].issue);
WARN_ON_ONCE(!io_op_defs[i].name);
}
}

42
io_uring/opdef.h Normal file
View File

@ -0,0 +1,42 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_OP_DEF_H
#define IOU_OP_DEF_H
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
/* should block plug */
unsigned plug : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
unsigned poll_exclusive : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
/* skip auditing */
unsigned audit_skip : 1;
/* supports ioprio */
unsigned ioprio : 1;
/* supports iopoll */
unsigned iopoll : 1;
/* size of async data needed, if any */
unsigned short async_size;
const char *name;
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep_async)(struct io_kiocb *);
void (*cleanup)(struct io_kiocb *);
};
extern const struct io_op_def io_op_defs[];
void io_uring_optable_init(void);
#endif

256
io_uring/openclose.c Normal file
View File

@ -0,0 +1,256 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/namei.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "../fs/internal.h"
#include "io_uring.h"
#include "rsrc.h"
#include "openclose.h"
struct io_open {
struct file *file;
int dfd;
u32 file_slot;
struct filename *filename;
struct open_how how;
unsigned long nofile;
};
struct io_close {
struct file *file;
int fd;
u32 file_slot;
};
static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_open *open = io_kiocb_to_cmd(req);
const char __user *fname;
int ret;
if (unlikely(sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
/* open.how should be already initialised */
if (!(open->how.flags & O_PATH) && force_o_largefile())
open->how.flags |= O_LARGEFILE;
open->dfd = READ_ONCE(sqe->fd);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
open->filename = getname(fname);
if (IS_ERR(open->filename)) {
ret = PTR_ERR(open->filename);
open->filename = NULL;
return ret;
}
open->file_slot = READ_ONCE(sqe->file_index);
if (open->file_slot && (open->how.flags & O_CLOEXEC))
return -EINVAL;
open->nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_open *open = io_kiocb_to_cmd(req);
u64 mode = READ_ONCE(sqe->len);
u64 flags = READ_ONCE(sqe->open_flags);
open->how = build_open_how(flags, mode);
return __io_openat_prep(req, sqe);
}
int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_open *open = io_kiocb_to_cmd(req);
struct open_how __user *how;
size_t len;
int ret;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
if (len < OPEN_HOW_SIZE_VER0)
return -EINVAL;
ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len);
if (ret)
return ret;
return __io_openat_prep(req, sqe);
}
int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_open *open = io_kiocb_to_cmd(req);
struct open_flags op;
struct file *file;
bool resolve_nonblock, nonblock_set;
bool fixed = !!open->file_slot;
int ret;
ret = build_open_flags(&open->how, &op);
if (ret)
goto err;
nonblock_set = op.open_flag & O_NONBLOCK;
resolve_nonblock = open->how.resolve & RESOLVE_CACHED;
if (issue_flags & IO_URING_F_NONBLOCK) {
/*
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
* it'll always -EAGAIN
*/
if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
return -EAGAIN;
op.lookup_flags |= LOOKUP_CACHED;
op.open_flag |= O_NONBLOCK;
}
if (!fixed) {
ret = __get_unused_fd_flags(open->how.flags, open->nofile);
if (ret < 0)
goto err;
}
file = do_filp_open(open->dfd, open->filename, &op);
if (IS_ERR(file)) {
/*
* We could hang on to this 'fd' on retrying, but seems like
* marginal gain for something that is now known to be a slower
* path. So just put it, and we'll get a new one when we retry.
*/
if (!fixed)
put_unused_fd(ret);
ret = PTR_ERR(file);
/* only retry if RESOLVE_CACHED wasn't already set by application */
if (ret == -EAGAIN &&
(!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
return -EAGAIN;
goto err;
}
if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
file->f_flags &= ~O_NONBLOCK;
fsnotify_open(file);
if (!fixed)
fd_install(ret, file);
else
ret = io_fixed_fd_install(req, issue_flags, file,
open->file_slot);
err:
putname(open->filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
return io_openat2(req, issue_flags);
}
void io_open_cleanup(struct io_kiocb *req)
{
struct io_open *open = io_kiocb_to_cmd(req);
if (open->filename)
putname(open->filename);
}
int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
unsigned int offset)
{
int ret;
io_ring_submit_lock(ctx, issue_flags);
ret = io_fixed_fd_remove(ctx, offset);
io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_close *close = io_kiocb_to_cmd(req);
return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1);
}
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_close *close = io_kiocb_to_cmd(req);
if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
close->fd = READ_ONCE(sqe->fd);
close->file_slot = READ_ONCE(sqe->file_index);
if (close->file_slot && close->fd)
return -EINVAL;
return 0;
}
int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
struct files_struct *files = current->files;
struct io_close *close = io_kiocb_to_cmd(req);
struct fdtable *fdt;
struct file *file;
int ret = -EBADF;
if (close->file_slot) {
ret = io_close_fixed(req, issue_flags);
goto err;
}
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (close->fd >= fdt->max_fds) {
spin_unlock(&files->file_lock);
goto err;
}
file = rcu_dereference_protected(fdt->fd[close->fd],
lockdep_is_held(&files->file_lock));
if (!file || io_is_uring_fops(file)) {
spin_unlock(&files->file_lock);
goto err;
}
/* if the file has a flush method, be safe and punt to async */
if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
spin_unlock(&files->file_lock);
return -EAGAIN;
}
file = __close_fd_get_file(close->fd);
spin_unlock(&files->file_lock);
if (!file)
goto err;
/* No ->flush() or already async, safely close from here */
ret = filp_close(file, current->files);
err:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}

14
io_uring/openclose.h Normal file
View File

@ -0,0 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
unsigned int offset);
int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_openat(struct io_kiocb *req, unsigned int issue_flags);
void io_open_cleanup(struct io_kiocb *req);
int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_close(struct io_kiocb *req, unsigned int issue_flags);

965
io_uring/poll.c Normal file
View File

@ -0,0 +1,965 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/hashtable.h>
#include <linux/io_uring.h>
#include <trace/events/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "refs.h"
#include "opdef.h"
#include "kbuf.h"
#include "poll.h"
#include "cancel.h"
struct io_poll_update {
struct file *file;
u64 old_user_data;
u64 new_user_data;
__poll_t events;
bool update_events;
bool update_user_data;
};
struct io_poll_table {
struct poll_table_struct pt;
struct io_kiocb *req;
int nr_entries;
int error;
bool owning;
/* output value, set only if arm poll returns >0 */
__poll_t result_mask;
};
#define IO_POLL_CANCEL_FLAG BIT(31)
#define IO_POLL_REF_MASK GENMASK(30, 0)
#define IO_WQE_F_DOUBLE 1
static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
{
unsigned long priv = (unsigned long)wqe->private;
return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE);
}
static inline bool wqe_is_double(struct wait_queue_entry *wqe)
{
unsigned long priv = (unsigned long)wqe->private;
return priv & IO_WQE_F_DOUBLE;
}
/*
* If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
* bump it and acquire ownership. It's disallowed to modify requests while not
* owning it, that prevents from races for enqueueing task_work's and b/w
* arming poll and wakeups.
*/
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
static void io_poll_mark_cancelled(struct io_kiocb *req)
{
atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}
static struct io_poll *io_poll_get_double(struct io_kiocb *req)
{
/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
if (req->opcode == IORING_OP_POLL_ADD)
return req->async_data;
return req->apoll->double_poll;
}
static struct io_poll *io_poll_get_single(struct io_kiocb *req)
{
if (req->opcode == IORING_OP_POLL_ADD)
return io_kiocb_to_cmd(req);
return &req->apoll->poll;
}
static void io_poll_req_insert(struct io_kiocb *req)
{
struct io_hash_table *table = &req->ctx->cancel_table;
u32 index = hash_long(req->cqe.user_data, table->hash_bits);
struct io_hash_bucket *hb = &table->hbs[index];
spin_lock(&hb->lock);
hlist_add_head(&req->hash_node, &hb->list);
spin_unlock(&hb->lock);
}
static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
struct io_hash_table *table = &req->ctx->cancel_table;
u32 index = hash_long(req->cqe.user_data, table->hash_bits);
spinlock_t *lock = &table->hbs[index].lock;
spin_lock(lock);
hash_del(&req->hash_node);
spin_unlock(lock);
}
static void io_poll_req_insert_locked(struct io_kiocb *req)
{
struct io_hash_table *table = &req->ctx->cancel_table_locked;
u32 index = hash_long(req->cqe.user_data, table->hash_bits);
hlist_add_head(&req->hash_node, &table->hbs[index].list);
}
static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
if (req->flags & REQ_F_HASH_LOCKED) {
/*
* ->cancel_table_locked is protected by ->uring_lock in
* contrast to per bucket spinlocks. Likely, tctx_task_work()
* already grabbed the mutex for us, but there is a chance it
* failed.
*/
io_tw_lock(ctx, locked);
hash_del(&req->hash_node);
req->flags &= ~REQ_F_HASH_LOCKED;
} else {
io_poll_req_delete(req, ctx);
}
}
static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
wait_queue_func_t wake_func)
{
poll->head = NULL;
#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
/* mask in events that we always want/need */
poll->events = events | IO_POLL_UNMASK;
INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, wake_func);
}
static inline void io_poll_remove_entry(struct io_poll *poll)
{
struct wait_queue_head *head = smp_load_acquire(&poll->head);
if (head) {
spin_lock_irq(&head->lock);
list_del_init(&poll->wait.entry);
poll->head = NULL;
spin_unlock_irq(&head->lock);
}
}
static void io_poll_remove_entries(struct io_kiocb *req)
{
/*
* Nothing to do if neither of those flags are set. Avoid dipping
* into the poll/apoll/double cachelines if we can.
*/
if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
return;
/*
* While we hold the waitqueue lock and the waitqueue is nonempty,
* wake_up_pollfree() will wait for us. However, taking the waitqueue
* lock in the first place can race with the waitqueue being freed.
*
* We solve this as eventpoll does: by taking advantage of the fact that
* all users of wake_up_pollfree() will RCU-delay the actual free. If
* we enter rcu_read_lock() and see that the pointer to the queue is
* non-NULL, we can then lock it without the memory being freed out from
* under us.
*
* Keep holding rcu_read_lock() as long as we hold the queue lock, in
* case the caller deletes the entry from the queue, leaving it empty.
* In that case, only RCU prevents the queue memory from being freed.
*/
rcu_read_lock();
if (req->flags & REQ_F_SINGLE_POLL)
io_poll_remove_entry(io_poll_get_single(req));
if (req->flags & REQ_F_DOUBLE_POLL)
io_poll_remove_entry(io_poll_get_double(req));
rcu_read_unlock();
}
enum {
IOU_POLL_DONE = 0,
IOU_POLL_NO_ACTION = 1,
IOU_POLL_REMOVE_POLL_USE_RES = 2,
};
/*
* All poll tw should go through this. Checks for poll events, manages
* references, does rewait, etc.
*
* Returns a negative error on failure. IOU_POLL_NO_ACTION when no action require,
* which is either spurious wakeup or multishot CQE is served.
* IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res.
* IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot poll and that the result
* is stored in req->cqe.
*/
static int io_poll_check_events(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
int v, ret;
/* req->task == current here, checking PF_EXITING is safe */
if (unlikely(req->task->flags & PF_EXITING))
return -ECANCELED;
do {
v = atomic_read(&req->poll_refs);
/* tw handler should be the owner, and so have some references */
if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
return IOU_POLL_DONE;
if (v & IO_POLL_CANCEL_FLAG)
return -ECANCELED;
/* the mask was stashed in __io_poll_execute */
if (!req->cqe.res) {
struct poll_table_struct pt = { ._key = req->apoll_events };
req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
}
if ((unlikely(!req->cqe.res)))
continue;
if (req->apoll_events & EPOLLONESHOT)
return IOU_POLL_DONE;
/* multishot, just fill a CQE and proceed */
if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
__poll_t mask = mangle_poll(req->cqe.res &
req->apoll_events);
if (!io_post_aux_cqe(ctx, req->cqe.user_data,
mask, IORING_CQE_F_MORE, false)) {
io_req_set_res(req, mask, 0);
return IOU_POLL_REMOVE_POLL_USE_RES;
}
} else {
ret = io_poll_issue(req, locked);
if (ret == IOU_STOP_MULTISHOT)
return IOU_POLL_REMOVE_POLL_USE_RES;
if (ret < 0)
return ret;
}
/*
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
*/
} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
return IOU_POLL_NO_ACTION;
}
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
int ret;
ret = io_poll_check_events(req, locked);
if (ret == IOU_POLL_NO_ACTION)
return;
if (ret == IOU_POLL_DONE) {
struct io_poll *poll = io_kiocb_to_cmd(req);
req->cqe.res = mangle_poll(req->cqe.res & poll->events);
} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
req->cqe.res = ret;
req_set_fail(req);
}
io_poll_remove_entries(req);
io_poll_tw_hash_eject(req, locked);
io_req_set_res(req, req->cqe.res, 0);
io_req_task_complete(req, locked);
}
static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
{
int ret;
ret = io_poll_check_events(req, locked);
if (ret == IOU_POLL_NO_ACTION)
return;
io_poll_remove_entries(req);
io_poll_tw_hash_eject(req, locked);
if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
io_req_complete_post(req);
else if (ret == IOU_POLL_DONE)
io_req_task_submit(req, locked);
else
io_req_complete_failed(req, ret);
}
static void __io_poll_execute(struct io_kiocb *req, int mask)
{
io_req_set_res(req, mask, 0);
/*
* This is useful for poll that is armed on behalf of another
* request, and where the wakeup path could be on a different
* CPU. We want to avoid pulling in req->apoll->events for that
* case.
*/
if (req->opcode == IORING_OP_POLL_ADD)
req->io_task_work.func = io_poll_task_func;
else
req->io_task_work.func = io_apoll_task_func;
trace_io_uring_task_add(req, mask);
io_req_task_work_add(req);
}
static inline void io_poll_execute(struct io_kiocb *req, int res)
{
if (io_poll_get_ownership(req))
__io_poll_execute(req, res);
}
static void io_poll_cancel_req(struct io_kiocb *req)
{
io_poll_mark_cancelled(req);
/* kick tw, which should complete the request */
io_poll_execute(req, 0);
}
#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI)
static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
{
io_poll_mark_cancelled(req);
/* we have to kick tw in case it's not already */
io_poll_execute(req, 0);
/*
* If the waitqueue is being freed early but someone is already
* holds ownership over it, we have to tear down the request as
* best we can. That means immediately removing the request from
* its waitqueue and preventing all further accesses to the
* waitqueue via the request.
*/
list_del_init(&poll->wait.entry);
/*
* Careful: this *must* be the last step, since as soon
* as req->head is NULL'ed out, the request can be
* completed and freed, since aio_poll_complete_work()
* will no longer need to take the waitqueue lock.
*/
smp_store_release(&poll->head, NULL);
return 1;
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
struct io_kiocb *req = wqe_to_req(wait);
struct io_poll *poll = container_of(wait, struct io_poll, wait);
__poll_t mask = key_to_poll(key);
if (unlikely(mask & POLLFREE))
return io_pollfree_wake(req, poll);
/* for instances that support it check for an event match first */
if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
return 0;
if (io_poll_get_ownership(req)) {
/* optional, saves extra locking for removal in tw handler */
if (mask && poll->events & EPOLLONESHOT) {
list_del_init(&poll->wait.entry);
poll->head = NULL;
if (wqe_is_double(wait))
req->flags &= ~REQ_F_DOUBLE_POLL;
else
req->flags &= ~REQ_F_SINGLE_POLL;
}
__io_poll_execute(req, mask);
}
return 1;
}
static void io_poll_double_prepare(struct io_kiocb *req)
{
struct wait_queue_head *head;
struct io_poll *poll = io_poll_get_single(req);
/* head is RCU protected, see io_poll_remove_entries() comments */
rcu_read_lock();
head = smp_load_acquire(&poll->head);
/*
* poll arm may not hold ownership and so race with
* io_poll_wake() by modifying req->flags. There is only one
* poll entry queued, serialise with it by taking its head lock.
*/
if (head)
spin_lock_irq(&head->lock);
req->flags |= REQ_F_DOUBLE_POLL;
if (req->opcode == IORING_OP_POLL_ADD)
req->flags |= REQ_F_ASYNC_DATA;
if (head)
spin_unlock_irq(&head->lock);
rcu_read_unlock();
}
static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
struct wait_queue_head *head,
struct io_poll **poll_ptr)
{
struct io_kiocb *req = pt->req;
unsigned long wqe_private = (unsigned long) req;
/*
* The file being polled uses multiple waitqueues for poll handling
* (e.g. one for read, one for write). Setup a separate io_poll
* if this happens.
*/
if (unlikely(pt->nr_entries)) {
struct io_poll *first = poll;
/* double add on the same waitqueue head, ignore */
if (first->head == head)
return;
/* already have a 2nd entry, fail a third attempt */
if (*poll_ptr) {
if ((*poll_ptr)->head == head)
return;
pt->error = -EINVAL;
return;
}
poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
if (!poll) {
pt->error = -ENOMEM;
return;
}
/* mark as double wq entry */
wqe_private |= IO_WQE_F_DOUBLE;
io_init_poll_iocb(poll, first->events, first->wait.func);
io_poll_double_prepare(req);
*poll_ptr = poll;
} else {
/* fine to modify, there is no poll queued to race with us */
req->flags |= REQ_F_SINGLE_POLL;
}
pt->nr_entries++;
poll->head = head;
poll->wait.private = (void *) wqe_private;
if (poll->events & EPOLLEXCLUSIVE)
add_wait_queue_exclusive(head, &poll->wait);
else
add_wait_queue(head, &poll->wait);
}
static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
struct io_poll *poll = io_kiocb_to_cmd(pt->req);
__io_queue_proc(poll, pt, head,
(struct io_poll **) &pt->req->async_data);
}
static bool io_poll_can_finish_inline(struct io_kiocb *req,
struct io_poll_table *pt)
{
return pt->owning || io_poll_get_ownership(req);
}
/*
* Returns 0 when it's handed over for polling. The caller owns the requests if
* it returns non-zero, but otherwise should not touch it. Negative values
* contain an error code. When the result is >0, the polling has completed
* inline and ipt.result_mask is set to the mask.
*/
static int __io_arm_poll_handler(struct io_kiocb *req,
struct io_poll *poll,
struct io_poll_table *ipt, __poll_t mask,
unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
int v;
INIT_HLIST_NODE(&req->hash_node);
req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
io_init_poll_iocb(poll, mask, io_poll_wake);
poll->file = req->file;
req->apoll_events = poll->events;
ipt->pt._key = mask;
ipt->req = req;
ipt->error = 0;
ipt->nr_entries = 0;
/*
* Polling is either completed here or via task_work, so if we're in the
* task context we're naturally serialised with tw by merit of running
* the same task. When it's io-wq, take the ownership to prevent tw
* from running. However, when we're in the task context, skip taking
* it as an optimisation.
*
* Note: even though the request won't be completed/freed, without
* ownership we still can race with io_poll_wake().
* io_poll_can_finish_inline() tries to deal with that.
*/
ipt->owning = issue_flags & IO_URING_F_UNLOCKED;
atomic_set(&req->poll_refs, (int)ipt->owning);
/* io-wq doesn't hold uring_lock */
if (issue_flags & IO_URING_F_UNLOCKED)
req->flags &= ~REQ_F_HASH_LOCKED;
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
if (unlikely(ipt->error || !ipt->nr_entries)) {
io_poll_remove_entries(req);
if (!io_poll_can_finish_inline(req, ipt)) {
io_poll_mark_cancelled(req);
return 0;
} else if (mask && (poll->events & EPOLLET)) {
ipt->result_mask = mask;
return 1;
}
return ipt->error ?: -EINVAL;
}
if (mask &&
((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
if (!io_poll_can_finish_inline(req, ipt))
return 0;
io_poll_remove_entries(req);
ipt->result_mask = mask;
/* no one else has access to the req, forget about the ref */
return 1;
}
if (req->flags & REQ_F_HASH_LOCKED)
io_poll_req_insert_locked(req);
else
io_poll_req_insert(req);
if (mask && (poll->events & EPOLLET) &&
io_poll_can_finish_inline(req, ipt)) {
__io_poll_execute(req, mask);
return 0;
}
if (ipt->owning) {
/*
* Release ownership. If someone tried to queue a tw while it was
* locked, kick it off for them.
*/
v = atomic_dec_return(&req->poll_refs);
if (unlikely(v & IO_POLL_REF_MASK))
__io_poll_execute(req, 0);
}
return 0;
}
static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
struct poll_table_struct *p)
{
struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
struct async_poll *apoll = pt->req->apoll;
__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}
static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_cache_entry *entry;
struct async_poll *apoll;
if (req->flags & REQ_F_POLLED) {
apoll = req->apoll;
kfree(apoll->double_poll);
} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
(entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) {
apoll = container_of(entry, struct async_poll, cache);
} else {
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
return NULL;
}
apoll->double_poll = NULL;
req->apoll = apoll;
return apoll;
}
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask = POLLPRI | POLLERR | EPOLLET;
int ret;
/*
* apoll requests already grab the mutex to complete in the tw handler,
* so removal from the mutex-backed hash is free, use it by default.
*/
req->flags |= REQ_F_HASH_LOCKED;
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
if (!file_can_poll(req->file))
return IO_APOLL_ABORTED;
if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
return IO_APOLL_ABORTED;
if (!(req->flags & REQ_F_APOLL_MULTISHOT))
mask |= EPOLLONESHOT;
if (def->pollin) {
mask |= EPOLLIN | EPOLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
if (req->flags & REQ_F_CLEAR_POLLIN)
mask &= ~EPOLLIN;
} else {
mask |= EPOLLOUT | EPOLLWRNORM;
}
if (def->poll_exclusive)
mask |= EPOLLEXCLUSIVE;
apoll = io_req_alloc_apoll(req, issue_flags);
if (!apoll)
return IO_APOLL_ABORTED;
req->flags |= REQ_F_POLLED;
ipt.pt._qproc = io_async_queue_proc;
io_kbuf_recycle(req, issue_flags);
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags);
if (ret)
return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED;
trace_io_uring_poll_arm(req, mask, apoll->poll.events);
return IO_APOLL_OK;
}
static __cold bool io_poll_remove_all_table(struct task_struct *tsk,
struct io_hash_table *table,
bool cancel_all)
{
unsigned nr_buckets = 1U << table->hash_bits;
struct hlist_node *tmp;
struct io_kiocb *req;
bool found = false;
int i;
for (i = 0; i < nr_buckets; i++) {
struct io_hash_bucket *hb = &table->hbs[i];
spin_lock(&hb->lock);
hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
if (io_match_task_safe(req, tsk, cancel_all)) {
hlist_del_init(&req->hash_node);
io_poll_cancel_req(req);
found = true;
}
}
spin_unlock(&hb->lock);
}
return found;
}
/*
* Returns true if we found and killed one or more poll requests
*/
__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
bool cancel_all)
__must_hold(&ctx->uring_lock)
{
bool ret;
ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all);
ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all);
return ret;
}
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
struct io_cancel_data *cd,
struct io_hash_table *table,
struct io_hash_bucket **out_bucket)
{
struct io_kiocb *req;
u32 index = hash_long(cd->data, table->hash_bits);
struct io_hash_bucket *hb = &table->hbs[index];
*out_bucket = NULL;
spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node) {
if (cd->data != req->cqe.user_data)
continue;
if (poll_only && req->opcode != IORING_OP_POLL_ADD)
continue;
if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
if (cd->seq == req->work.cancel_seq)
continue;
req->work.cancel_seq = cd->seq;
}
*out_bucket = hb;
return req;
}
spin_unlock(&hb->lock);
return NULL;
}
static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
struct io_cancel_data *cd,
struct io_hash_table *table,
struct io_hash_bucket **out_bucket)
{
unsigned nr_buckets = 1U << table->hash_bits;
struct io_kiocb *req;
int i;
*out_bucket = NULL;
for (i = 0; i < nr_buckets; i++) {
struct io_hash_bucket *hb = &table->hbs[i];
spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node) {
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
req->file != cd->file)
continue;
if (cd->seq == req->work.cancel_seq)
continue;
req->work.cancel_seq = cd->seq;
*out_bucket = hb;
return req;
}
spin_unlock(&hb->lock);
}
return NULL;
}
static int io_poll_disarm(struct io_kiocb *req)
{
if (!req)
return -ENOENT;
if (!io_poll_get_ownership(req))
return -EALREADY;
io_poll_remove_entries(req);
hash_del(&req->hash_node);
return 0;
}
static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
struct io_hash_table *table)
{
struct io_hash_bucket *bucket;
struct io_kiocb *req;
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
req = io_poll_file_find(ctx, cd, table, &bucket);
else
req = io_poll_find(ctx, false, cd, table, &bucket);
if (req)
io_poll_cancel_req(req);
if (bucket)
spin_unlock(&bucket->lock);
return req ? 0 : -ENOENT;
}
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned issue_flags)
{
int ret;
ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table);
if (ret != -ENOENT)
return ret;
io_ring_submit_lock(ctx, issue_flags);
ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked);
io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
unsigned int flags)
{
u32 events;
events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
events = swahw32(events);
#endif
if (!(flags & IORING_POLL_ADD_MULTI))
events |= EPOLLONESHOT;
if (!(flags & IORING_POLL_ADD_LEVEL))
events |= EPOLLET;
return demangle_poll(events) |
(events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET));
}
int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll_update *upd = io_kiocb_to_cmd(req);
u32 flags;
if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
IORING_POLL_ADD_MULTI))
return -EINVAL;
/* meaningless without update */
if (flags == IORING_POLL_ADD_MULTI)
return -EINVAL;
upd->old_user_data = READ_ONCE(sqe->addr);
upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
upd->new_user_data = READ_ONCE(sqe->off);
if (!upd->update_user_data && upd->new_user_data)
return -EINVAL;
if (upd->update_events)
upd->events = io_poll_parse_events(sqe, flags);
else if (sqe->poll32_events)
return -EINVAL;
return 0;
}
int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll *poll = io_kiocb_to_cmd(req);
u32 flags;
if (sqe->buf_index || sqe->off || sqe->addr)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_ADD_LEVEL))
return -EINVAL;
if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
return -EINVAL;
poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_poll *poll = io_kiocb_to_cmd(req);
struct io_poll_table ipt;
int ret;
ipt.pt._qproc = io_poll_queue_proc;
/*
* If sqpoll or single issuer, there is no contention for ->uring_lock
* and we'll end up holding it in tw handlers anyway.
*/
if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER))
req->flags |= REQ_F_HASH_LOCKED;
ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
if (ret > 0) {
io_req_set_res(req, ipt.result_mask, 0);
return IOU_OK;
}
return ret ?: IOU_ISSUE_SKIP_COMPLETE;
}
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_poll_update *poll_update = io_kiocb_to_cmd(req);
struct io_cancel_data cd = { .data = poll_update->old_user_data, };
struct io_ring_ctx *ctx = req->ctx;
struct io_hash_bucket *bucket;
struct io_kiocb *preq;
int ret2, ret = 0;
bool locked;
preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
ret2 = io_poll_disarm(preq);
if (bucket)
spin_unlock(&bucket->lock);
if (!ret2)
goto found;
if (ret2 != -ENOENT) {
ret = ret2;
goto out;
}
io_ring_submit_lock(ctx, issue_flags);
preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket);
ret2 = io_poll_disarm(preq);
if (bucket)
spin_unlock(&bucket->lock);
io_ring_submit_unlock(ctx, issue_flags);
if (ret2) {
ret = ret2;
goto out;
}
found:
if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) {
ret = -EFAULT;
goto out;
}
if (poll_update->update_events || poll_update->update_user_data) {
/* only mask one event flags, keep behavior flags */
if (poll_update->update_events) {
struct io_poll *poll = io_kiocb_to_cmd(preq);
poll->events &= ~0xffff;
poll->events |= poll_update->events & 0xffff;
poll->events |= IO_POLL_UNMASK;
}
if (poll_update->update_user_data)
preq->cqe.user_data = poll_update->new_user_data;
ret2 = io_poll_add(preq, issue_flags);
/* successfully updated, don't complete poll request */
if (!ret2 || ret2 == -EIOCBQUEUED)
goto out;
}
req_set_fail(preq);
io_req_set_res(preq, -ECANCELED, 0);
locked = !(issue_flags & IO_URING_F_UNLOCKED);
io_req_task_complete(preq, &locked);
out:
if (ret < 0) {
req_set_fail(req);
return ret;
}
/* complete update request, we're done with it */
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_apoll_cache_free(struct io_cache_entry *entry)
{
kfree(container_of(entry, struct async_poll, cache));
}

39
io_uring/poll.h Normal file
View File

@ -0,0 +1,39 @@
// SPDX-License-Identifier: GPL-2.0
#include "alloc_cache.h"
enum {
IO_APOLL_OK,
IO_APOLL_ABORTED,
IO_APOLL_READY
};
struct io_poll {
struct file *file;
struct wait_queue_head *head;
__poll_t events;
struct wait_queue_entry wait;
};
struct async_poll {
union {
struct io_poll poll;
struct io_cache_entry cache;
};
struct io_poll *double_poll;
};
int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_poll_add(struct io_kiocb *req, unsigned int issue_flags);
int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags);
struct io_cancel_data;
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned issue_flags);
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
bool cancel_all);
void io_apoll_cache_free(struct io_cache_entry *entry);

48
io_uring/refs.h Normal file
View File

@ -0,0 +1,48 @@
#ifndef IOU_REQ_REF_H
#define IOU_REQ_REF_H
#include <linux/atomic.h>
#include <linux/io_uring_types.h>
/*
* Shamelessly stolen from the mm implementation of page reference checking,
* see commit f958d7b528b1 for details.
*/
#define req_ref_zero_or_close_to_overflow(req) \
((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
{
WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
return atomic_inc_not_zero(&req->refs);
}
static inline bool req_ref_put_and_test(struct io_kiocb *req)
{
if (likely(!(req->flags & REQ_F_REFCOUNT)))
return true;
WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
return atomic_dec_and_test(&req->refs);
}
static inline void req_ref_get(struct io_kiocb *req)
{
WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
atomic_inc(&req->refs);
}
static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
if (!(req->flags & REQ_F_REFCOUNT)) {
req->flags |= REQ_F_REFCOUNT;
atomic_set(&req->refs, nr);
}
}
static inline void io_req_set_refcount(struct io_kiocb *req)
{
__io_req_set_refcount(req, 1);
}
#endif

1373
io_uring/rsrc.c Normal file

File diff suppressed because it is too large Load Diff

166
io_uring/rsrc.h Normal file
View File

@ -0,0 +1,166 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_RSRC_H
#define IOU_RSRC_H
#include <net/af_unix.h>
#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
enum {
IORING_RSRC_FILE = 0,
IORING_RSRC_BUFFER = 1,
};
struct io_rsrc_put {
struct list_head list;
u64 tag;
union {
void *rsrc;
struct file *file;
struct io_mapped_ubuf *buf;
};
};
typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
struct io_rsrc_data {
struct io_ring_ctx *ctx;
u64 **tags;
unsigned int nr;
rsrc_put_fn *do_put;
atomic_t refs;
struct completion done;
bool quiesce;
};
struct io_rsrc_node {
struct percpu_ref refs;
struct list_head node;
struct list_head rsrc_list;
struct io_rsrc_data *rsrc_data;
struct llist_node llist;
bool done;
};
struct io_mapped_ubuf {
u64 ubuf;
u64 ubuf_end;
unsigned int nr_bvecs;
unsigned long acct_pages;
struct bio_vec bvec[];
};
void io_rsrc_put_work(struct work_struct *work);
void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
void io_wait_rsrc_data(struct io_rsrc_data *data);
void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
void io_rsrc_refs_drop(struct io_ring_ctx *ctx);
int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc);
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill);
int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu,
u64 buf_addr, size_t len);
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags);
void __io_sqe_files_unregister(struct io_ring_ctx *ctx);
int io_sqe_files_unregister(struct io_ring_ctx *ctx);
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags);
int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file);
#if defined(CONFIG_UNIX)
static inline bool io_file_need_scm(struct file *filp)
{
#if defined(IO_URING_SCM_ALL)
return true;
#else
return !!unix_get_socket(filp);
#endif
}
#else
static inline bool io_file_need_scm(struct file *filp)
{
return false;
}
#endif
static inline int io_scm_file_account(struct io_ring_ctx *ctx,
struct file *file)
{
if (likely(!io_file_need_scm(file)))
return 0;
return __io_scm_file_account(ctx, file);
}
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args);
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned size, unsigned type);
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
{
percpu_ref_put_many(&node->refs, nr);
}
static inline void io_req_put_rsrc(struct io_kiocb *req)
{
if (req->rsrc_node)
io_rsrc_put_node(req->rsrc_node, 1);
}
static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_rsrc_node *node = req->rsrc_node;
if (node) {
if (node == ctx->rsrc_node)
ctx->rsrc_cached_refs++;
else
io_rsrc_put_node(node, 1);
}
}
static inline void io_req_set_rsrc_node(struct io_kiocb *req,
struct io_ring_ctx *ctx,
unsigned int issue_flags)
{
if (!req->rsrc_node) {
req->rsrc_node = ctx->rsrc_node;
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
lockdep_assert_held(&ctx->uring_lock);
ctx->rsrc_cached_refs--;
if (unlikely(ctx->rsrc_cached_refs < 0))
io_rsrc_refs_refill(ctx);
} else {
percpu_ref_get(&req->rsrc_node->refs);
}
}
}
static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
{
unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
return &data->tags[table_idx][off];
}
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
#endif

1020
io_uring/rw.c Normal file

File diff suppressed because it is too large Load Diff

23
io_uring/rw.h Normal file
View File

@ -0,0 +1,23 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagemap.h>
struct io_rw_state {
struct iov_iter iter;
struct iov_iter_state iter_state;
struct iovec fast_iov[UIO_FASTIOV];
};
struct io_async_rw {
struct io_rw_state s;
const struct iovec *free_iovec;
size_t bytes_done;
struct wait_page_queue wpq;
};
int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_read(struct io_kiocb *req, unsigned int issue_flags);
int io_readv_prep_async(struct io_kiocb *req);
int io_write(struct io_kiocb *req, unsigned int issue_flags);
int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);

View File

@ -1,33 +1,7 @@
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
#ifndef INTERNAL_IO_SLIST_H
#define INTERNAL_IO_SLIST_H
#include <linux/refcount.h>
struct io_wq;
enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HASHED = 2,
IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
enum io_wq_cancel {
IO_WQ_CANCEL_OK, /* cancelled before started */
IO_WQ_CANCEL_RUNNING, /* found, running, and attempted cancelled */
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
struct io_wq_work_node {
struct io_wq_work_node *next;
};
struct io_wq_work_list {
struct io_wq_work_node *first;
struct io_wq_work_node *last;
};
#include <linux/io_uring_types.h>
#define wq_list_for_each(pos, prv, head) \
for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
@ -36,6 +10,7 @@ struct io_wq_work_list {
for (; pos; prv = pos, pos = (pos)->next)
#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
#define INIT_WQ_LIST(list) do { \
(list)->first = NULL; \
} while (0)
@ -152,12 +127,6 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
return node;
}
struct io_wq_work {
struct io_wq_work_node list;
unsigned flags;
int cancel_seq;
};
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
{
if (!work->list.next)
@ -166,63 +135,4 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
return container_of(work->list.next, struct io_wq_work, list);
}
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_hash {
refcount_t refs;
unsigned long map;
struct wait_queue_head wait;
};
static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
if (refcount_dec_and_test(&hash->refs))
kfree(hash);
}
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
void io_wq_exit_start(struct io_wq *wq);
void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
}
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data, bool cancel_all);
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif
static inline bool io_wq_current_is_worker(void)
{
return in_task() && (current->flags & PF_IO_WORKER) &&
current->worker_private;
}
#endif
#endif // INTERNAL_IO_SLIST_H

122
io_uring/splice.c Normal file
View File

@ -0,0 +1,122 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/io_uring.h>
#include <linux/splice.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "splice.h"
struct io_splice {
struct file *file_out;
loff_t off_out;
loff_t off_in;
u64 len;
int splice_fd_in;
unsigned int flags;
};
static int __io_splice_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_splice *sp = io_kiocb_to_cmd(req);
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
return 0;
}
int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
return -EINVAL;
return __io_splice_prep(req, sqe);
}
int io_tee(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = io_kiocb_to_cmd(req);
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
if (sp->flags & SPLICE_F_FD_IN_FIXED)
in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
else
in = io_file_get_normal(req, sp->splice_fd_in);
if (!in) {
ret = -EBADF;
goto done;
}
if (sp->len)
ret = do_tee(in, out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
done:
if (ret != sp->len)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_splice *sp = io_kiocb_to_cmd(req);
sp->off_in = READ_ONCE(sqe->splice_off_in);
sp->off_out = READ_ONCE(sqe->off);
return __io_splice_prep(req, sqe);
}
int io_splice(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = io_kiocb_to_cmd(req);
struct file *out = sp->file_out;
unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
loff_t *poff_in, *poff_out;
struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
if (sp->flags & SPLICE_F_FD_IN_FIXED)
in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
else
in = io_file_get_normal(req, sp->splice_fd_in);
if (!in) {
ret = -EBADF;
goto done;
}
poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
if (sp->len)
ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
done:
if (ret != sp->len)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}

7
io_uring/splice.h Normal file
View File

@ -0,0 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_tee(struct io_kiocb *req, unsigned int issue_flags);
int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_splice(struct io_kiocb *req, unsigned int issue_flags);

425
io_uring/sqpoll.c Normal file
View File

@ -0,0 +1,425 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Contains the core associated with submission side polling of the SQ
* ring, offloading submissions from the application to a kernel thread.
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "sqpoll.h"
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
enum {
IO_SQ_THREAD_SHOULD_STOP = 0,
IO_SQ_THREAD_SHOULD_PARK,
};
void io_sq_thread_unpark(struct io_sq_data *sqd)
__releases(&sqd->lock)
{
WARN_ON_ONCE(sqd->thread == current);
/*
* Do the dance but not conditional clear_bit() because it'd race with
* other threads incrementing park_pending and setting the bit.
*/
clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
if (atomic_dec_return(&sqd->park_pending))
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
mutex_unlock(&sqd->lock);
}
void io_sq_thread_park(struct io_sq_data *sqd)
__acquires(&sqd->lock)
{
WARN_ON_ONCE(sqd->thread == current);
atomic_inc(&sqd->park_pending);
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
mutex_lock(&sqd->lock);
if (sqd->thread)
wake_up_process(sqd->thread);
}
void io_sq_thread_stop(struct io_sq_data *sqd)
{
WARN_ON_ONCE(sqd->thread == current);
WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
mutex_lock(&sqd->lock);
if (sqd->thread)
wake_up_process(sqd->thread);
mutex_unlock(&sqd->lock);
wait_for_completion(&sqd->exited);
}
void io_put_sq_data(struct io_sq_data *sqd)
{
if (refcount_dec_and_test(&sqd->refs)) {
WARN_ON_ONCE(atomic_read(&sqd->park_pending));
io_sq_thread_stop(sqd);
kfree(sqd);
}
}
static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
{
struct io_ring_ctx *ctx;
unsigned sq_thread_idle = 0;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
sqd->sq_thread_idle = sq_thread_idle;
}
void io_sq_thread_finish(struct io_ring_ctx *ctx)
{
struct io_sq_data *sqd = ctx->sq_data;
if (sqd) {
io_sq_thread_park(sqd);
list_del_init(&ctx->sqd_list);
io_sqd_update_thread_idle(sqd);
io_sq_thread_unpark(sqd);
io_put_sq_data(sqd);
ctx->sq_data = NULL;
}
}
static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
{
struct io_ring_ctx *ctx_attach;
struct io_sq_data *sqd;
struct fd f;
f = fdget(p->wq_fd);
if (!f.file)
return ERR_PTR(-ENXIO);
if (!io_is_uring_fops(f.file)) {
fdput(f);
return ERR_PTR(-EINVAL);
}
ctx_attach = f.file->private_data;
sqd = ctx_attach->sq_data;
if (!sqd) {
fdput(f);
return ERR_PTR(-EINVAL);
}
if (sqd->task_tgid != current->tgid) {
fdput(f);
return ERR_PTR(-EPERM);
}
refcount_inc(&sqd->refs);
fdput(f);
return sqd;
}
static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
bool *attached)
{
struct io_sq_data *sqd;
*attached = false;
if (p->flags & IORING_SETUP_ATTACH_WQ) {
sqd = io_attach_sq_data(p);
if (!IS_ERR(sqd)) {
*attached = true;
return sqd;
}
/* fall through for EPERM case, setup new sqd/task */
if (PTR_ERR(sqd) != -EPERM)
return sqd;
}
sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
if (!sqd)
return ERR_PTR(-ENOMEM);
atomic_set(&sqd->park_pending, 0);
refcount_set(&sqd->refs, 1);
INIT_LIST_HEAD(&sqd->ctx_list);
mutex_init(&sqd->lock);
init_waitqueue_head(&sqd->wait);
init_completion(&sqd->exited);
return sqd;
}
static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
{
return READ_ONCE(sqd->state);
}
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
unsigned int to_submit;
int ret = 0;
to_submit = io_sqring_entries(ctx);
/* if we're handling multiple rings, cap submit size for fairness */
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
creds = override_creds(ctx->sq_creds);
mutex_lock(&ctx->uring_lock);
if (!wq_list_empty(&ctx->iopoll_list))
io_do_iopoll(ctx, true);
/*
* Don't submit if refs are dying, good for io_uring_register(),
* but also it is relied upon by io_ring_exit_work()
*/
if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
!(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
wake_up(&ctx->sqo_sq_wait);
if (creds)
revert_creds(creds);
}
return ret;
}
static bool io_sqd_handle_event(struct io_sq_data *sqd)
{
bool did_sig = false;
struct ksignal ksig;
if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
signal_pending(current)) {
mutex_unlock(&sqd->lock);
if (signal_pending(current))
did_sig = get_signal(&ksig);
cond_resched();
mutex_lock(&sqd->lock);
}
return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
}
static int io_sq_thread(void *data)
{
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
unsigned long timeout = 0;
char buf[TASK_COMM_LEN];
DEFINE_WAIT(wait);
snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
set_task_comm(current, buf);
if (sqd->sq_cpu != -1)
set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
else
set_cpus_allowed_ptr(current, cpu_online_mask);
current->flags |= PF_NO_SETAFFINITY;
audit_alloc_kernel(current);
mutex_lock(&sqd->lock);
while (1) {
bool cap_entries, sqt_spin = false;
if (io_sqd_events_pending(sqd) || signal_pending(current)) {
if (io_sqd_handle_event(sqd))
break;
timeout = jiffies + sqd->sq_thread_idle;
}
cap_entries = !list_is_singular(&sqd->ctx_list);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
if (io_run_task_work())
sqt_spin = true;
if (sqt_spin || !time_after(jiffies, timeout)) {
cond_resched();
if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle;
continue;
}
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
bool needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
atomic_or(IORING_SQ_NEED_WAKEUP,
&ctx->rings->sq_flags);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
!wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
break;
}
/*
* Ensure the store of the wakeup flag is not
* reordered with the load of the SQ tail
*/
smp_mb__after_atomic();
if (io_sqring_entries(ctx)) {
needs_sched = false;
break;
}
}
if (needs_sched) {
mutex_unlock(&sqd->lock);
schedule();
mutex_lock(&sqd->lock);
}
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
atomic_andnot(IORING_SQ_NEED_WAKEUP,
&ctx->rings->sq_flags);
}
finish_wait(&sqd->wait, &wait);
timeout = jiffies + sqd->sq_thread_idle;
}
io_uring_cancel_generic(true, sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
io_run_task_work();
mutex_unlock(&sqd->lock);
audit_free(current);
complete(&sqd->exited);
do_exit(0);
}
int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
DEFINE_WAIT(wait);
do {
if (!io_sqring_full(ctx))
break;
prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
if (!io_sqring_full(ctx))
break;
schedule();
} while (!signal_pending(current));
finish_wait(&ctx->sqo_sq_wait, &wait);
return 0;
}
__cold int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
int ret;
/* Retain compatibility with failing for an invalid attach attempt */
if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
IORING_SETUP_ATTACH_WQ) {
struct fd f;
f = fdget(p->wq_fd);
if (!f.file)
return -ENXIO;
if (!io_is_uring_fops(f.file)) {
fdput(f);
return -EINVAL;
}
fdput(f);
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct task_struct *tsk;
struct io_sq_data *sqd;
bool attached;
ret = security_uring_sqpoll();
if (ret)
return ret;
sqd = io_get_sq_data(p, &attached);
if (IS_ERR(sqd)) {
ret = PTR_ERR(sqd);
goto err;
}
ctx->sq_creds = get_current_cred();
ctx->sq_data = sqd;
ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
if (!ctx->sq_thread_idle)
ctx->sq_thread_idle = HZ;
io_sq_thread_park(sqd);
list_add(&ctx->sqd_list, &sqd->ctx_list);
io_sqd_update_thread_idle(sqd);
/* don't attach to a dying SQPOLL thread, would be racy */
ret = (attached && !sqd->thread) ? -ENXIO : 0;
io_sq_thread_unpark(sqd);
if (ret < 0)
goto err;
if (attached)
return 0;
if (p->flags & IORING_SETUP_SQ_AFF) {
int cpu = p->sq_thread_cpu;
ret = -EINVAL;
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
goto err_sqpoll;
sqd->sq_cpu = cpu;
} else {
sqd->sq_cpu = -1;
}
sqd->task_pid = current->pid;
sqd->task_tgid = current->tgid;
tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
if (IS_ERR(tsk)) {
ret = PTR_ERR(tsk);
goto err_sqpoll;
}
sqd->thread = tsk;
ret = io_uring_alloc_task_context(tsk, ctx);
wake_up_new_task(tsk);
if (ret)
goto err;
} else if (p->flags & IORING_SETUP_SQ_AFF) {
/* Can't have SQ_AFF without SQPOLL */
ret = -EINVAL;
goto err;
}
return 0;
err_sqpoll:
complete(&ctx->sq_data->exited);
err:
io_sq_thread_finish(ctx);
return ret;
}

29
io_uring/sqpoll.h Normal file
View File

@ -0,0 +1,29 @@
// SPDX-License-Identifier: GPL-2.0
struct io_sq_data {
refcount_t refs;
atomic_t park_pending;
struct mutex lock;
/* ctx's that are using this sqd */
struct list_head ctx_list;
struct task_struct *thread;
struct wait_queue_head wait;
unsigned sq_thread_idle;
int sq_cpu;
pid_t task_pid;
pid_t task_tgid;
unsigned long state;
struct completion exited;
};
int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p);
void io_sq_thread_finish(struct io_ring_ctx *ctx);
void io_sq_thread_stop(struct io_sq_data *sqd);
void io_sq_thread_park(struct io_sq_data *sqd);
void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd);
int io_sqpoll_wait_sq(struct io_ring_ctx *ctx);

73
io_uring/statx.c Normal file
View File

@ -0,0 +1,73 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "../fs/internal.h"
#include "io_uring.h"
#include "statx.h"
struct io_statx {
struct file *file;
int dfd;
unsigned int mask;
unsigned int flags;
struct filename *filename;
struct statx __user *buffer;
};
int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_statx *sx = io_kiocb_to_cmd(req);
const char __user *path;
if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
sx->dfd = READ_ONCE(sqe->fd);
sx->mask = READ_ONCE(sqe->len);
path = u64_to_user_ptr(READ_ONCE(sqe->addr));
sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
sx->flags = READ_ONCE(sqe->statx_flags);
sx->filename = getname_flags(path,
getname_statx_lookup_flags(sx->flags),
NULL);
if (IS_ERR(sx->filename)) {
int ret = PTR_ERR(sx->filename);
sx->filename = NULL;
return ret;
}
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_statx(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_statx *sx = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_statx_cleanup(struct io_kiocb *req)
{
struct io_statx *sx = io_kiocb_to_cmd(req);
if (sx->filename)
putname(sx->filename);
}

5
io_uring/statx.h Normal file
View File

@ -0,0 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_statx(struct io_kiocb *req, unsigned int issue_flags);
void io_statx_cleanup(struct io_kiocb *req);

110
io_uring/sync.c Normal file
View File

@ -0,0 +1,110 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/io_uring.h>
#include <linux/fsnotify.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "sync.h"
struct io_sync {
struct file *file;
loff_t len;
loff_t off;
int flags;
int mode;
};
int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sync *sync = io_kiocb_to_cmd(req);
if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
sync->off = READ_ONCE(sqe->off);
sync->len = READ_ONCE(sqe->len);
sync->flags = READ_ONCE(sqe->sync_range_flags);
return 0;
}
int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sync *sync = io_kiocb_to_cmd(req);
int ret;
/* sync_file_range always requires a blocking context */
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = sync_file_range(req->file, sync->off, sync->len, sync->flags);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sync *sync = io_kiocb_to_cmd(req);
if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
sync->flags = READ_ONCE(sqe->fsync_flags);
if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC))
return -EINVAL;
sync->off = READ_ONCE(sqe->off);
sync->len = READ_ONCE(sqe->len);
return 0;
}
int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sync *sync = io_kiocb_to_cmd(req);
loff_t end = sync->off + sync->len;
int ret;
/* fsync always requires a blocking context */
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX,
sync->flags & IORING_FSYNC_DATASYNC);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sync *sync = io_kiocb_to_cmd(req);
if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
sync->off = READ_ONCE(sqe->off);
sync->len = READ_ONCE(sqe->addr);
sync->mode = READ_ONCE(sqe->len);
return 0;
}
int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sync *sync = io_kiocb_to_cmd(req);
int ret;
/* fallocate always requiring blocking context */
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len);
if (ret >= 0)
fsnotify_modify(req->file);
io_req_set_res(req, ret, 0);
return IOU_OK;
}

10
io_uring/sync.h Normal file
View File

@ -0,0 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags);
int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_fsync(struct io_kiocb *req, unsigned int issue_flags);
int io_fallocate(struct io_kiocb *req, unsigned int issue_flags);
int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);

340
io_uring/tctx.c Normal file
View File

@ -0,0 +1,340 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "tctx.h"
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
struct task_struct *task)
{
struct io_wq_hash *hash;
struct io_wq_data data;
unsigned int concurrency;
mutex_lock(&ctx->uring_lock);
hash = ctx->hash_map;
if (!hash) {
hash = kzalloc(sizeof(*hash), GFP_KERNEL);
if (!hash) {
mutex_unlock(&ctx->uring_lock);
return ERR_PTR(-ENOMEM);
}
refcount_set(&hash->refs, 1);
init_waitqueue_head(&hash->wait);
ctx->hash_map = hash;
}
mutex_unlock(&ctx->uring_lock);
data.hash = hash;
data.task = task;
data.free_work = io_wq_free_work;
data.do_work = io_wq_submit_work;
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
return io_wq_create(concurrency, &data);
}
void __io_uring_free(struct task_struct *tsk)
{
struct io_uring_task *tctx = tsk->io_uring;
WARN_ON_ONCE(!xa_empty(&tctx->xa));
WARN_ON_ONCE(tctx->io_wq);
WARN_ON_ONCE(tctx->cached_refs);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
}
__cold int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx;
int ret;
tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
if (unlikely(!tctx))
return -ENOMEM;
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
if (unlikely(ret)) {
kfree(tctx);
return ret;
}
tctx->io_wq = io_init_wq_offload(ctx, task);
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
return ret;
}
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_idle, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
init_llist_head(&tctx->task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
static int io_register_submitter(struct io_ring_ctx *ctx)
{
int ret = 0;
mutex_lock(&ctx->uring_lock);
if (!ctx->submitter_task)
ctx->submitter_task = get_task_struct(current);
else if (ctx->submitter_task != current)
ret = -EEXIST;
mutex_unlock(&ctx->uring_lock);
return ret;
}
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
int ret;
if ((ctx->flags & IORING_SETUP_SINGLE_ISSUER) && submitter) {
ret = io_register_submitter(ctx);
if (ret)
return ret;
}
if (unlikely(!tctx)) {
ret = io_uring_alloc_task_context(current, ctx);
if (unlikely(ret))
return ret;
tctx = current->io_uring;
if (ctx->iowq_limits_set) {
unsigned int limits[2] = { ctx->iowq_limits[0],
ctx->iowq_limits[1], };
ret = io_wq_max_workers(tctx->io_wq, limits);
if (ret)
return ret;
}
}
if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
node = kmalloc(sizeof(*node), GFP_KERNEL);
if (!node)
return -ENOMEM;
node->ctx = ctx;
node->task = current;
ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
node, GFP_KERNEL));
if (ret) {
kfree(node);
return ret;
}
mutex_lock(&ctx->uring_lock);
list_add(&node->ctx_node, &ctx->tctx_list);
mutex_unlock(&ctx->uring_lock);
}
if (submitter)
tctx->last = ctx;
return 0;
}
/*
* Remove this io_uring_file -> task mapping.
*/
__cold void io_uring_del_tctx_node(unsigned long index)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
if (!tctx)
return;
node = xa_erase(&tctx->xa, index);
if (!node)
return;
WARN_ON_ONCE(current != node->task);
WARN_ON_ONCE(list_empty(&node->ctx_node));
mutex_lock(&node->ctx->uring_lock);
list_del(&node->ctx_node);
mutex_unlock(&node->ctx->uring_lock);
if (tctx->last == node->ctx)
tctx->last = NULL;
kfree(node);
}
__cold void io_uring_clean_tctx(struct io_uring_task *tctx)
{
struct io_wq *wq = tctx->io_wq;
struct io_tctx_node *node;
unsigned long index;
xa_for_each(&tctx->xa, index, node) {
io_uring_del_tctx_node(index);
cond_resched();
}
if (wq) {
/*
* Must be after io_uring_del_tctx_node() (removes nodes under
* uring_lock) to avoid race with io_uring_try_cancel_iowq().
*/
io_wq_put_and_exit(wq);
tctx->io_wq = NULL;
}
}
void io_uring_unreg_ringfd(void)
{
struct io_uring_task *tctx = current->io_uring;
int i;
for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
if (tctx->registered_rings[i]) {
fput(tctx->registered_rings[i]);
tctx->registered_rings[i] = NULL;
}
}
}
static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
int start, int end)
{
struct file *file;
int offset;
for (offset = start; offset < end; offset++) {
offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
if (tctx->registered_rings[offset])
continue;
file = fget(fd);
if (!file) {
return -EBADF;
} else if (!io_is_uring_fops(file)) {
fput(file);
return -EOPNOTSUPP;
}
tctx->registered_rings[offset] = file;
return offset;
}
return -EBUSY;
}
/*
* Register a ring fd to avoid fdget/fdput for each io_uring_enter()
* invocation. User passes in an array of struct io_uring_rsrc_update
* with ->data set to the ring_fd, and ->offset given for the desired
* index. If no index is desired, application may set ->offset == -1U
* and we'll find an available index. Returns number of entries
* successfully processed, or < 0 on error if none were processed.
*/
int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
unsigned nr_args)
{
struct io_uring_rsrc_update __user *arg = __arg;
struct io_uring_rsrc_update reg;
struct io_uring_task *tctx;
int ret, i;
if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
return -EINVAL;
mutex_unlock(&ctx->uring_lock);
ret = __io_uring_add_tctx_node(ctx, false);
mutex_lock(&ctx->uring_lock);
if (ret)
return ret;
tctx = current->io_uring;
for (i = 0; i < nr_args; i++) {
int start, end;
if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
ret = -EFAULT;
break;
}
if (reg.resv) {
ret = -EINVAL;
break;
}
if (reg.offset == -1U) {
start = 0;
end = IO_RINGFD_REG_MAX;
} else {
if (reg.offset >= IO_RINGFD_REG_MAX) {
ret = -EINVAL;
break;
}
start = reg.offset;
end = start + 1;
}
ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
if (ret < 0)
break;
reg.offset = ret;
if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
fput(tctx->registered_rings[reg.offset]);
tctx->registered_rings[reg.offset] = NULL;
ret = -EFAULT;
break;
}
}
return i ? i : ret;
}
int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
unsigned nr_args)
{
struct io_uring_rsrc_update __user *arg = __arg;
struct io_uring_task *tctx = current->io_uring;
struct io_uring_rsrc_update reg;
int ret = 0, i;
if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
return -EINVAL;
if (!tctx)
return 0;
for (i = 0; i < nr_args; i++) {
if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
ret = -EFAULT;
break;
}
if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
ret = -EINVAL;
break;
}
reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
if (tctx->registered_rings[reg.offset]) {
fput(tctx->registered_rings[reg.offset]);
tctx->registered_rings[reg.offset] = NULL;
}
}
return i ? i : ret;
}

57
io_uring/tctx.h Normal file
View File

@ -0,0 +1,57 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/llist.h>
/*
* Arbitrary limit, can be raised if need be
*/
#define IO_RINGFD_REG_MAX 16
struct io_uring_task {
/* submission side */
int cached_refs;
const struct io_ring_ctx *last;
struct io_wq *io_wq;
struct file *registered_rings[IO_RINGFD_REG_MAX];
struct xarray xa;
struct wait_queue_head wait;
atomic_t in_idle;
atomic_t inflight_tracked;
struct percpu_counter inflight;
struct { /* task_work */
struct llist_head task_list;
struct callback_head task_work;
} ____cacheline_aligned_in_smp;
};
struct io_tctx_node {
struct list_head ctx_node;
struct task_struct *task;
struct io_ring_ctx *ctx;
};
int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx);
void io_uring_del_tctx_node(unsigned long index);
int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter);
void io_uring_clean_tctx(struct io_uring_task *tctx);
void io_uring_unreg_ringfd(void);
int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
unsigned nr_args);
int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
unsigned nr_args);
/*
* Note that this task has used io_uring. We use it for cancelation purposes.
*/
static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
if (likely(tctx && tctx->last == ctx))
return 0;
return __io_uring_add_tctx_node(ctx, true);
}

644
io_uring/timeout.c Normal file
View File

@ -0,0 +1,644 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring.h>
#include <trace/events/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "refs.h"
#include "cancel.h"
#include "timeout.h"
struct io_timeout {
struct file *file;
u32 off;
u32 target_seq;
struct list_head list;
/* head of the link, used by linked timeouts only */
struct io_kiocb *head;
/* for linked completions */
struct io_kiocb *prev;
};
struct io_timeout_rem {
struct file *file;
u64 addr;
/* timeout update */
struct timespec64 ts;
u32 flags;
bool ltimeout;
};
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req);
return !timeout->off;
}
static inline void io_put_req(struct io_kiocb *req)
{
if (req_ref_put_and_test(req)) {
io_queue_next(req);
io_free_req(req);
}
}
static bool io_kill_timeout(struct io_kiocb *req, int status)
__must_hold(&req->ctx->completion_lock)
__must_hold(&req->ctx->timeout_lock)
{
struct io_timeout_data *io = req->async_data;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
struct io_timeout *timeout = io_kiocb_to_cmd(req);
if (status)
req_set_fail(req);
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&timeout->list);
io_req_tw_post_queue(req, status, 0);
return true;
}
return false;
}
__cold void io_flush_timeouts(struct io_ring_ctx *ctx)
__must_hold(&ctx->completion_lock)
{
u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
struct io_timeout *timeout, *tmp;
spin_lock_irq(&ctx->timeout_lock);
list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) {
struct io_kiocb *req = cmd_to_io_kiocb(timeout);
u32 events_needed, events_got;
if (io_is_timeout_noseq(req))
break;
/*
* Since seq can easily wrap around over time, subtract
* the last seq at which timeouts were flushed before comparing.
* Assuming not more than 2^31-1 events have happened since,
* these subtractions won't have wrapped, so we can check if
* target is in [last_seq, current_seq] by comparing the two.
*/
events_needed = timeout->target_seq - ctx->cq_last_tm_flush;
events_got = seq - ctx->cq_last_tm_flush;
if (events_got < events_needed)
break;
io_kill_timeout(req, 0);
}
ctx->cq_last_tm_flush = seq;
spin_unlock_irq(&ctx->timeout_lock);
}
static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked)
{
io_tw_lock(link->ctx, locked);
while (link) {
struct io_kiocb *nxt = link->link;
long res = -ECANCELED;
if (link->flags & REQ_F_FAIL)
res = link->cqe.res;
link->link = NULL;
io_req_set_res(link, res, 0);
io_req_task_complete(link, locked);
link = nxt;
}
}
static void io_fail_links(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
{
struct io_kiocb *link = req->link;
bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
if (!link)
return;
while (link) {
if (ignore_cqes)
link->flags |= REQ_F_CQE_SKIP;
else
link->flags &= ~REQ_F_CQE_SKIP;
trace_io_uring_fail_link(req, link);
link = link->link;
}
link = req->link;
link->io_task_work.func = io_req_tw_fail_links;
io_req_task_work_add(link);
req->link = NULL;
}
static inline void io_remove_next_linked(struct io_kiocb *req)
{
struct io_kiocb *nxt = req->link;
req->link = nxt->link;
nxt->link = NULL;
}
bool io_disarm_next(struct io_kiocb *req)
__must_hold(&req->ctx->completion_lock)
{
struct io_kiocb *link = NULL;
bool posted = false;
if (req->flags & REQ_F_ARM_LTIMEOUT) {
link = req->link;
req->flags &= ~REQ_F_ARM_LTIMEOUT;
if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
io_remove_next_linked(req);
io_req_tw_post_queue(link, -ECANCELED, 0);
posted = true;
}
} else if (req->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock_irq(&ctx->timeout_lock);
link = io_disarm_linked_timeout(req);
spin_unlock_irq(&ctx->timeout_lock);
if (link) {
posted = true;
io_req_tw_post_queue(link, -ECANCELED, 0);
}
}
if (unlikely((req->flags & REQ_F_FAIL) &&
!(req->flags & REQ_F_HARDLINK))) {
posted |= (req->link != NULL);
io_fail_links(req);
}
return posted;
}
struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
struct io_kiocb *link)
__must_hold(&req->ctx->completion_lock)
__must_hold(&req->ctx->timeout_lock)
{
struct io_timeout_data *io = link->async_data;
struct io_timeout *timeout = io_kiocb_to_cmd(link);
io_remove_next_linked(req);
timeout->head = NULL;
if (hrtimer_try_to_cancel(&io->timer) != -1) {
list_del(&timeout->list);
return link;
}
return NULL;
}
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
struct io_timeout_data, timer);
struct io_kiocb *req = data->req;
struct io_timeout *timeout = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->timeout_lock, flags);
list_del_init(&timeout->list);
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
req_set_fail(req);
io_req_set_res(req, -ETIME, 0);
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
struct io_cancel_data *cd)
__must_hold(&ctx->timeout_lock)
{
struct io_timeout *timeout;
struct io_timeout_data *io;
struct io_kiocb *req = NULL;
list_for_each_entry(timeout, &ctx->timeout_list, list) {
struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
cd->data != tmp->cqe.user_data)
continue;
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
if (cd->seq == tmp->work.cancel_seq)
continue;
tmp->work.cancel_seq = cd->seq;
}
req = tmp;
break;
}
if (!req)
return ERR_PTR(-ENOENT);
io = req->async_data;
if (hrtimer_try_to_cancel(&io->timer) == -1)
return ERR_PTR(-EALREADY);
timeout = io_kiocb_to_cmd(req);
list_del_init(&timeout->list);
return req;
}
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
__must_hold(&ctx->completion_lock)
{
struct io_kiocb *req;
spin_lock_irq(&ctx->timeout_lock);
req = io_timeout_extract(ctx, cd);
spin_unlock_irq(&ctx->timeout_lock);
if (IS_ERR(req))
return PTR_ERR(req);
io_req_task_queue_fail(req, -ECANCELED);
return 0;
}
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
{
unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
struct io_timeout *timeout = io_kiocb_to_cmd(req);
struct io_kiocb *prev = timeout->prev;
int ret = -ENOENT;
if (prev) {
if (!(req->task->flags & PF_EXITING)) {
struct io_cancel_data cd = {
.ctx = req->ctx,
.data = prev->cqe.user_data,
};
ret = io_try_cancel(req->task->io_uring, &cd, issue_flags);
}
io_req_set_res(req, ret ?: -ETIME, 0);
io_req_complete_post(req);
io_put_req(prev);
} else {
io_req_set_res(req, -ETIME, 0);
io_req_complete_post(req);
}
}
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
struct io_timeout_data, timer);
struct io_kiocb *prev, *req = data->req;
struct io_timeout *timeout = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->timeout_lock, flags);
prev = timeout->head;
timeout->head = NULL;
/*
* We don't expect the list to be empty, that will only happen if we
* race with the completion of the linked work.
*/
if (prev) {
io_remove_next_linked(prev);
if (!req_ref_inc_not_zero(prev))
prev = NULL;
}
list_del(&timeout->list);
timeout->prev = prev;
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
req->io_task_work.func = io_req_task_link_timeout;
io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
{
switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
case IORING_TIMEOUT_BOOTTIME:
return CLOCK_BOOTTIME;
case IORING_TIMEOUT_REALTIME:
return CLOCK_REALTIME;
default:
/* can't happen, vetted at prep time */
WARN_ON_ONCE(1);
fallthrough;
case 0:
return CLOCK_MONOTONIC;
}
}
static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
struct io_timeout_data *io;
struct io_timeout *timeout;
struct io_kiocb *req = NULL;
list_for_each_entry(timeout, &ctx->ltimeout_list, list) {
struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
if (user_data == tmp->cqe.user_data) {
req = tmp;
break;
}
}
if (!req)
return -ENOENT;
io = req->async_data;
if (hrtimer_try_to_cancel(&io->timer) == -1)
return -EALREADY;
hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
io->timer.function = io_link_timeout_fn;
hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
return 0;
}
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
struct io_cancel_data cd = { .data = user_data, };
struct io_kiocb *req = io_timeout_extract(ctx, &cd);
struct io_timeout *timeout = io_kiocb_to_cmd(req);
struct io_timeout_data *data;
if (IS_ERR(req))
return PTR_ERR(req);
timeout->off = 0; /* noseq */
data = req->async_data;
list_add_tail(&timeout->list, &ctx->timeout_list);
hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
return 0;
}
int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
return -EINVAL;
tr->ltimeout = false;
tr->addr = READ_ONCE(sqe->addr);
tr->flags = READ_ONCE(sqe->timeout_flags);
if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
return -EINVAL;
if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
tr->ltimeout = true;
if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
return -EINVAL;
if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
return -EFAULT;
if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
return -EINVAL;
} else if (tr->flags) {
/* timeout removal doesn't support flags */
return -EINVAL;
}
return 0;
}
static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
{
return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
: HRTIMER_MODE_REL;
}
/*
* Remove or update an existing timeout command
*/
int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_timeout_rem *tr = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
int ret;
if (!(tr->flags & IORING_TIMEOUT_UPDATE)) {
struct io_cancel_data cd = { .data = tr->addr, };
spin_lock(&ctx->completion_lock);
ret = io_timeout_cancel(ctx, &cd);
spin_unlock(&ctx->completion_lock);
} else {
enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
spin_lock_irq(&ctx->timeout_lock);
if (tr->ltimeout)
ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
else
ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
spin_unlock_irq(&ctx->timeout_lock);
}
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
static int __io_timeout_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe,
bool is_timeout_link)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req);
struct io_timeout_data *data;
unsigned flags;
u32 off = READ_ONCE(sqe->off);
if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
return -EINVAL;
if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
IORING_TIMEOUT_ETIME_SUCCESS))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
return -EINVAL;
INIT_LIST_HEAD(&timeout->list);
timeout->off = off;
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
if (io_alloc_async_data(req))
return -ENOMEM;
data = req->async_data;
data->req = req;
data->flags = flags;
if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
return -EINVAL;
INIT_LIST_HEAD(&timeout->list);
data->mode = io_translate_timeout_mode(flags);
hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
if (is_timeout_link) {
struct io_submit_link *link = &req->ctx->submit_state.link;
if (!link->head)
return -EINVAL;
if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
return -EINVAL;
timeout->head = link->last;
link->last->flags |= REQ_F_ARM_LTIMEOUT;
}
return 0;
}
int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
return __io_timeout_prep(req, sqe, false);
}
int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
return __io_timeout_prep(req, sqe, true);
}
int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
struct io_timeout_data *data = req->async_data;
struct list_head *entry;
u32 tail, off = timeout->off;
spin_lock_irq(&ctx->timeout_lock);
/*
* sqe->off holds how many events that need to occur for this
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
if (io_is_timeout_noseq(req)) {
entry = ctx->timeout_list.prev;
goto add;
}
tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
timeout->target_seq = tail + off;
/* Update the last seq here in case io_flush_timeouts() hasn't.
* This is safe because ->completion_lock is held, and submissions
* and completions are never mixed in the same ->completion_lock section.
*/
ctx->cq_last_tm_flush = tail;
/*
* Insertion sort, ensuring the first entry in the list is always
* the one we need first.
*/
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_timeout *nextt = list_entry(entry, struct io_timeout, list);
struct io_kiocb *nxt = cmd_to_io_kiocb(nextt);
if (io_is_timeout_noseq(nxt))
continue;
/* nxt.seq is behind @tail, otherwise would've been completed */
if (off >= nextt->target_seq - tail)
break;
}
add:
list_add(&timeout->list, entry);
data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
spin_unlock_irq(&ctx->timeout_lock);
return IOU_ISSUE_SKIP_COMPLETE;
}
void io_queue_linked_timeout(struct io_kiocb *req)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
spin_lock_irq(&ctx->timeout_lock);
/*
* If the back reference is NULL, then our linked request finished
* before we got a chance to setup the timer
*/
if (timeout->head) {
struct io_timeout_data *data = req->async_data;
data->timer.function = io_link_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
data->mode);
list_add_tail(&timeout->list, &ctx->ltimeout_list);
}
spin_unlock_irq(&ctx->timeout_lock);
/* drop submission reference */
io_put_req(req);
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
bool cancel_all)
__must_hold(&req->ctx->timeout_lock)
{
struct io_kiocb *req;
if (task && head->task != task)
return false;
if (cancel_all)
return true;
io_for_each_link(req, head) {
if (req->flags & REQ_F_INFLIGHT)
return true;
}
return false;
}
/* Returns true if we found and killed one or more timeouts */
__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
bool cancel_all)
{
struct io_timeout *timeout, *tmp;
int canceled = 0;
io_cq_lock(ctx);
spin_lock_irq(&ctx->timeout_lock);
list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) {
struct io_kiocb *req = cmd_to_io_kiocb(timeout);
if (io_match_task(req, tsk, cancel_all) &&
io_kill_timeout(req, -ECANCELED))
canceled++;
}
spin_unlock_irq(&ctx->timeout_lock);
io_cq_unlock_post(ctx);
return canceled != 0;
}

36
io_uring/timeout.h Normal file
View File

@ -0,0 +1,36 @@
// SPDX-License-Identifier: GPL-2.0
struct io_timeout_data {
struct io_kiocb *req;
struct hrtimer timer;
struct timespec64 ts;
enum hrtimer_mode mode;
u32 flags;
};
struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
struct io_kiocb *link);
static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
{
struct io_kiocb *link = req->link;
if (link && link->opcode == IORING_OP_LINK_TIMEOUT)
return __io_disarm_linked_timeout(req, link);
return NULL;
}
__cold void io_flush_timeouts(struct io_ring_ctx *ctx);
struct io_cancel_data;
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd);
__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
bool cancel_all);
void io_queue_linked_timeout(struct io_kiocb *req);
bool io_disarm_next(struct io_kiocb *req);
int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_timeout(struct io_kiocb *req, unsigned int issue_flags);
int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags);

114
io_uring/uring_cmd.c Normal file
View File

@ -0,0 +1,114 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "uring_cmd.h"
static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
ioucmd->task_work_cb(ioucmd);
}
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *))
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
ioucmd->task_work_cb = task_work_cb;
req->io_task_work.func = io_uring_cmd_work;
io_req_task_work_add(req);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
u64 extra1, u64 extra2)
{
req->extra1 = extra1;
req->extra2 = extra2;
req->flags |= REQ_F_CQE32_INIT;
}
/*
* Called by consumers of io_uring_cmd, if they originally returned
* -EIOCBQUEUED upon receiving the command.
*/
void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, 0, ret);
if (req->ctx->flags & IORING_SETUP_CQE32)
io_req_set_cqe32_extra(req, res2, 0);
__io_req_complete(req, 0);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
int io_uring_cmd_prep_async(struct io_kiocb *req)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
size_t cmd_size;
cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
memcpy(req->async_data, ioucmd->cmd, cmd_size);
return 0;
}
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
if (sqe->rw_flags || sqe->__pad1)
return -EINVAL;
ioucmd->cmd = sqe->cmd;
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
return 0;
}
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req);
struct io_ring_ctx *ctx = req->ctx;
struct file *file = req->file;
int ret;
if (!req->file->f_op->uring_cmd)
return -EOPNOTSUPP;
if (ctx->flags & IORING_SETUP_SQE128)
issue_flags |= IO_URING_F_SQE128;
if (ctx->flags & IORING_SETUP_CQE32)
issue_flags |= IO_URING_F_CQE32;
if (ctx->flags & IORING_SETUP_IOPOLL)
issue_flags |= IO_URING_F_IOPOLL;
if (req_has_async_data(req))
ioucmd->cmd = req->async_data;
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
if (ret == -EAGAIN) {
if (!req_has_async_data(req)) {
if (io_alloc_async_data(req))
return -ENOMEM;
io_uring_cmd_prep_async(req);
}
return -EAGAIN;
}
if (ret != -EIOCBQUEUED) {
io_uring_cmd_done(ioucmd, ret, 0);
return IOU_OK;
}
return IOU_ISSUE_SKIP_COMPLETE;
}

13
io_uring/uring_cmd.h Normal file
View File

@ -0,0 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_uring_cmd_prep_async(struct io_kiocb *req);
/*
* The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
* the following sqe if SQE128 is used.
*/
#define uring_cmd_pdu_size(is_sqe128) \
((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \
offsetof(struct io_uring_sqe, cmd))

258
io_uring/xattr.c Normal file
View File

@ -0,0 +1,258 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/io_uring.h>
#include <linux/xattr.h>
#include <uapi/linux/io_uring.h>
#include "../fs/internal.h"
#include "io_uring.h"
#include "xattr.h"
struct io_xattr {
struct file *file;
struct xattr_ctx ctx;
struct filename *filename;
};
void io_xattr_cleanup(struct io_kiocb *req)
{
struct io_xattr *ix = io_kiocb_to_cmd(req);
if (ix->filename)
putname(ix->filename);
kfree(ix->ctx.kname);
kvfree(ix->ctx.kvalue);
}
static void io_xattr_finish(struct io_kiocb *req, int ret)
{
req->flags &= ~REQ_F_NEED_CLEANUP;
io_xattr_cleanup(req);
io_req_set_res(req, ret, 0);
}
static int __io_getxattr_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_xattr *ix = io_kiocb_to_cmd(req);
const char __user *name;
int ret;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
ix->filename = NULL;
ix->ctx.kvalue = NULL;
name = u64_to_user_ptr(READ_ONCE(sqe->addr));
ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
ix->ctx.size = READ_ONCE(sqe->len);
ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
if (ix->ctx.flags)
return -EINVAL;
ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
if (!ix->ctx.kname)
return -ENOMEM;
ret = strncpy_from_user(ix->ctx.kname->name, name,
sizeof(ix->ctx.kname->name));
if (!ret || ret == sizeof(ix->ctx.kname->name))
ret = -ERANGE;
if (ret < 0) {
kfree(ix->ctx.kname);
return ret;
}
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
return __io_getxattr_prep(req, sqe);
}
int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_xattr *ix = io_kiocb_to_cmd(req);
const char __user *path;
int ret;
ret = __io_getxattr_prep(req, sqe);
if (ret)
return ret;
path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
if (IS_ERR(ix->filename)) {
ret = PTR_ERR(ix->filename);
ix->filename = NULL;
}
return ret;
}
int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_xattr *ix = io_kiocb_to_cmd(req);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
req->file->f_path.dentry,
&ix->ctx);
io_xattr_finish(req, ret);
return IOU_OK;
}
int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_xattr *ix = io_kiocb_to_cmd(req);
unsigned int lookup_flags = LOOKUP_FOLLOW;
struct path path;
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
retry:
ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
if (!ret) {
ret = do_getxattr(mnt_user_ns(path.mnt),
path.dentry,
&ix->ctx);
path_put(&path);
if (retry_estale(ret, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
}
io_xattr_finish(req, ret);
return IOU_OK;
}
static int __io_setxattr_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_xattr *ix = io_kiocb_to_cmd(req);
const char __user *name;
int ret;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
ix->filename = NULL;
name = u64_to_user_ptr(READ_ONCE(sqe->addr));
ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
ix->ctx.kvalue = NULL;
ix->ctx.size = READ_ONCE(sqe->len);
ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
if (!ix->ctx.kname)
return -ENOMEM;
ret = setxattr_copy(name, &ix->ctx);
if (ret) {
kfree(ix->ctx.kname);
return ret;
}
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_xattr *ix = io_kiocb_to_cmd(req);
const char __user *path;
int ret;
ret = __io_setxattr_prep(req, sqe);
if (ret)
return ret;
path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
if (IS_ERR(ix->filename)) {
ret = PTR_ERR(ix->filename);
ix->filename = NULL;
}
return ret;
}
int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
return __io_setxattr_prep(req, sqe);
}
static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
struct path *path)
{
struct io_xattr *ix = io_kiocb_to_cmd(req);
int ret;
ret = mnt_want_write(path->mnt);
if (!ret) {
ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
mnt_drop_write(path->mnt);
}
return ret;
}
int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
{
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
ret = __io_setxattr(req, issue_flags, &req->file->f_path);
io_xattr_finish(req, ret);
return IOU_OK;
}
int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_xattr *ix = io_kiocb_to_cmd(req);
unsigned int lookup_flags = LOOKUP_FOLLOW;
struct path path;
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
retry:
ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
if (!ret) {
ret = __io_setxattr(req, issue_flags, &path);
path_put(&path);
if (retry_estale(ret, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
}
io_xattr_finish(req, ret);
return IOU_OK;
}

15
io_uring/xattr.h Normal file
View File

@ -0,0 +1,15 @@
// SPDX-License-Identifier: GPL-2.0
void io_xattr_cleanup(struct io_kiocb *req);
int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags);
int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_setxattr(struct io_kiocb *req, unsigned int issue_flags);
int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags);
int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_getxattr(struct io_kiocb *req, unsigned int issue_flags);

View File

@ -91,7 +91,7 @@
#include "stats.h"
#include "../workqueue_internal.h"
#include "../../fs/io-wq.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
/*

View File

@ -34,20 +34,15 @@
#include <net/compat.h>
int __get_compat_msghdr(struct msghdr *kmsg,
struct compat_msghdr __user *umsg,
struct sockaddr __user **save_addr,
compat_uptr_t *ptr, compat_size_t *len)
struct compat_msghdr *msg,
struct sockaddr __user **save_addr)
{
struct compat_msghdr msg;
ssize_t err;
if (copy_from_user(&msg, umsg, sizeof(*umsg)))
return -EFAULT;
kmsg->msg_flags = msg->msg_flags;
kmsg->msg_namelen = msg->msg_namelen;
kmsg->msg_flags = msg.msg_flags;
kmsg->msg_namelen = msg.msg_namelen;
if (!msg.msg_name)
if (!msg->msg_name)
kmsg->msg_namelen = 0;
if (kmsg->msg_namelen < 0)
@ -57,15 +52,15 @@ int __get_compat_msghdr(struct msghdr *kmsg,
kmsg->msg_namelen = sizeof(struct sockaddr_storage);
kmsg->msg_control_is_user = true;
kmsg->msg_control_user = compat_ptr(msg.msg_control);
kmsg->msg_controllen = msg.msg_controllen;
kmsg->msg_control_user = compat_ptr(msg->msg_control);
kmsg->msg_controllen = msg->msg_controllen;
if (save_addr)
*save_addr = compat_ptr(msg.msg_name);
*save_addr = compat_ptr(msg->msg_name);
if (msg.msg_name && kmsg->msg_namelen) {
if (msg->msg_name && kmsg->msg_namelen) {
if (!save_addr) {
err = move_addr_to_kernel(compat_ptr(msg.msg_name),
err = move_addr_to_kernel(compat_ptr(msg->msg_name),
kmsg->msg_namelen,
kmsg->msg_name);
if (err < 0)
@ -76,12 +71,10 @@ int __get_compat_msghdr(struct msghdr *kmsg,
kmsg->msg_namelen = 0;
}
if (msg.msg_iovlen > UIO_MAXIOV)
if (msg->msg_iovlen > UIO_MAXIOV)
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
*ptr = msg.msg_iov;
*len = msg.msg_iovlen;
return 0;
}
@ -90,15 +83,17 @@ int get_compat_msghdr(struct msghdr *kmsg,
struct sockaddr __user **save_addr,
struct iovec **iov)
{
compat_uptr_t ptr;
compat_size_t len;
struct compat_msghdr msg;
ssize_t err;
err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
if (copy_from_user(&msg, umsg, sizeof(*umsg)))
return -EFAULT;
err = __get_compat_msghdr(kmsg, &msg, save_addr);
if (err)
return err;
err = import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr), len,
err = import_iovec(save_addr ? READ : WRITE, compat_ptr(msg.msg_iov), msg.msg_iovlen,
UIO_FASTIOV, iov, &kmsg->msg_iter);
return err < 0 ? err : 0;
}

View File

@ -2358,25 +2358,20 @@ struct used_address {
unsigned int name_len;
};
int __copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr __user *umsg,
struct sockaddr __user **save_addr,
struct iovec __user **uiov, size_t *nsegs)
int __copy_msghdr(struct msghdr *kmsg,
struct user_msghdr *msg,
struct sockaddr __user **save_addr)
{
struct user_msghdr msg;
ssize_t err;
if (copy_from_user(&msg, umsg, sizeof(*umsg)))
return -EFAULT;
kmsg->msg_control_is_user = true;
kmsg->msg_get_inq = 0;
kmsg->msg_control_user = msg.msg_control;
kmsg->msg_controllen = msg.msg_controllen;
kmsg->msg_flags = msg.msg_flags;
kmsg->msg_control_user = msg->msg_control;
kmsg->msg_controllen = msg->msg_controllen;
kmsg->msg_flags = msg->msg_flags;
kmsg->msg_namelen = msg.msg_namelen;
if (!msg.msg_name)
kmsg->msg_namelen = msg->msg_namelen;
if (!msg->msg_name)
kmsg->msg_namelen = 0;
if (kmsg->msg_namelen < 0)
@ -2386,11 +2381,11 @@ int __copy_msghdr_from_user(struct msghdr *kmsg,
kmsg->msg_namelen = sizeof(struct sockaddr_storage);
if (save_addr)
*save_addr = msg.msg_name;
*save_addr = msg->msg_name;
if (msg.msg_name && kmsg->msg_namelen) {
if (msg->msg_name && kmsg->msg_namelen) {
if (!save_addr) {
err = move_addr_to_kernel(msg.msg_name,
err = move_addr_to_kernel(msg->msg_name,
kmsg->msg_namelen,
kmsg->msg_name);
if (err < 0)
@ -2401,12 +2396,10 @@ int __copy_msghdr_from_user(struct msghdr *kmsg,
kmsg->msg_namelen = 0;
}
if (msg.msg_iovlen > UIO_MAXIOV)
if (msg->msg_iovlen > UIO_MAXIOV)
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
*uiov = msg.msg_iov;
*nsegs = msg.msg_iovlen;
return 0;
}
@ -2418,8 +2411,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
struct user_msghdr msg;
ssize_t err;
err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
&msg.msg_iovlen);
if (copy_from_user(&msg, umsg, sizeof(*umsg)))
return -EFAULT;
err = __copy_msghdr(kmsg, &msg, save_addr);
if (err)
return err;