io_uring: add io_uring_types.h
author Jens Axboe <axboe@kernel.dk>
Tue, 24 May 2022 16:56:14 +0000 (10:56 -0600)
committer Jens Axboe <axboe@kernel.dk>
Mon, 25 Jul 2022 00:39:11 +0000 (18:39 -0600)
This adds definitions of structs that both the core and the various
opcode handlers need to know about.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
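
Editor's illustration, not part of the patch: once these definitions live in io_uring_types.h, an opcode handler only needs this header (plus the uapi header for struct io_uring_sqe) to reach struct io_kiocb and the io_kiocb_to_cmd() helper added below. The struct io_example, its fields and io_example_prep() are hypothetical names made up for the sketch; what comes from the patch itself is the overlay rule (each per-opcode struct starts with the file pointer and must fit in struct io_cmd_data, i.e. a file pointer plus 56 bytes of data) and the one-argument io_kiocb_to_cmd().

	#include <linux/kernel.h>
	#include <uapi/linux/io_uring.h>

	#include "io_uring_types.h"

	/*
	 * Hypothetical per-opcode data, overlaid on req->cmd. Like every
	 * io_kiocb union member it has the file pointer first, and it may
	 * not exceed struct io_cmd_data in size.
	 */
	struct io_example {
		struct file	*file;
		__u32		len;
	};

	static int io_example_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
	{
		/* overlay the per-opcode struct on req->cmd */
		struct io_example *ex = io_kiocb_to_cmd(req);

		ex->len = READ_ONCE(sqe->len);
		return 0;
	}
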
io_uring/io_uring.c
io_uring/io_uring_types.h [new file with mode: 0644]

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 75d8c31..ff7886c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -90,6 +90,8 @@
 #include "../fs/internal.h"
 #include "io-wq.h"
 
+#include "io_uring_types.h"
+
 #define IORING_MAX_ENTRIES     32768
 #define IORING_MAX_CQ_ENTRIES  (2 * IORING_MAX_ENTRIES)
 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
 
 #define IO_TCTX_REFS_CACHE_NR  (1U << 10)
 
-struct io_uring {
-       u32 head ____cacheline_aligned_in_smp;
-       u32 tail ____cacheline_aligned_in_smp;
-};
-
-/*
- * This data is shared with the application through the mmap at offsets
- * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
- *
- * The offsets to the member fields are published through struct
- * io_sqring_offsets when calling io_uring_setup.
- */
-struct io_rings {
-       /*
-        * Head and tail offsets into the ring; the offsets need to be
-        * masked to get valid indices.
-        *
-        * The kernel controls head of the sq ring and the tail of the cq ring,
-        * and the application controls tail of the sq ring and the head of the
-        * cq ring.
-        */
-       struct io_uring         sq, cq;
-       /*
-        * Bitmasks to apply to head and tail offsets (constant, equals
-        * ring_entries - 1)
-        */
-       u32                     sq_ring_mask, cq_ring_mask;
-       /* Ring sizes (constant, power of 2) */
-       u32                     sq_ring_entries, cq_ring_entries;
-       /*
-        * Number of invalid entries dropped by the kernel due to
-        * invalid index stored in array
-        *
-        * Written by the kernel, shouldn't be modified by the
-        * application (i.e. get number of "new events" by comparing to
-        * cached value).
-        *
-        * After a new SQ head value was read by the application this
-        * counter includes all submissions that were dropped reaching
-        * the new SQ head (and possibly more).
-        */
-       u32                     sq_dropped;
-       /*
-        * Runtime SQ flags
-        *
-        * Written by the kernel, shouldn't be modified by the
-        * application.
-        *
-        * The application needs a full memory barrier before checking
-        * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
-        */
-       atomic_t                sq_flags;
-       /*
-        * Runtime CQ flags
-        *
-        * Written by the application, shouldn't be modified by the
-        * kernel.
-        */
-       u32                     cq_flags;
-       /*
-        * Number of completion events lost because the queue was full;
-        * this should be avoided by the application by making sure
-        * there are not more requests pending than there is space in
-        * the completion queue.
-        *
-        * Written by the kernel, shouldn't be modified by the
-        * application (i.e. get number of "new events" by comparing to
-        * cached value).
-        *
-        * As completion events come in out of order this counter is not
-        * ordered with any other data.
-        */
-       u32                     cq_overflow;
-       /*
-        * Ring buffer of completion events.
-        *
-        * The kernel writes completion events fresh every time they are
-        * produced, so the application is allowed to modify pending
-        * entries.
-        */
-       struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
-};
-
 struct io_mapped_ubuf {
        u64             ubuf;
        u64             ubuf_end;
@@ -252,12 +171,6 @@ struct io_rsrc_put {
        };
 };
 
-struct io_file_table {
-       struct io_fixed_file *files;
-       unsigned long *bitmap;
-       unsigned int alloc_hint;
-};
-
 struct io_rsrc_node {
        struct percpu_ref               refs;
        struct list_head                node;
@@ -310,14 +223,6 @@ struct io_buffer {
        __u16 bgid;
 };
 
-struct io_restriction {
-       DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
-       DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
-       u8 sqe_flags_allowed;
-       u8 sqe_flags_required;
-       bool registered;
-};
-
 enum {
        IO_SQ_THREAD_SHOULD_STOP = 0,
        IO_SQ_THREAD_SHOULD_PARK,
@@ -347,186 +252,7 @@ struct io_sq_data {
 #define IO_REQ_CACHE_SIZE              32
 #define IO_REQ_ALLOC_BATCH             8
 
-struct io_submit_link {
-       struct io_kiocb         *head;
-       struct io_kiocb         *last;
-};
-
-struct io_submit_state {
-       /* inline/task_work completion list, under ->uring_lock */
-       struct io_wq_work_node  free_list;
-       /* batch completion logic */
-       struct io_wq_work_list  compl_reqs;
-       struct io_submit_link   link;
-
-       bool                    plug_started;
-       bool                    need_plug;
-       bool                    flush_cqes;
-       unsigned short          submit_nr;
-       struct blk_plug         plug;
-};
-
-struct io_ev_fd {
-       struct eventfd_ctx      *cq_ev_fd;
-       unsigned int            eventfd_async: 1;
-       struct rcu_head         rcu;
-};
-
-#define BGID_ARRAY     64
-
-struct io_ring_ctx {
-       /* const or read-mostly hot data */
-       struct {
-               struct percpu_ref       refs;
-
-               struct io_rings         *rings;
-               unsigned int            flags;
-               enum task_work_notify_mode      notify_method;
-               unsigned int            compat: 1;
-               unsigned int            drain_next: 1;
-               unsigned int            restricted: 1;
-               unsigned int            off_timeout_used: 1;
-               unsigned int            drain_active: 1;
-               unsigned int            drain_disabled: 1;
-               unsigned int            has_evfd: 1;
-               unsigned int            syscall_iopoll: 1;
-       } ____cacheline_aligned_in_smp;
-
-       /* submission data */
-       struct {
-               struct mutex            uring_lock;
-
-               /*
-                * Ring buffer of indices into array of io_uring_sqe, which is
-                * mmapped by the application using the IORING_OFF_SQES offset.
-                *
-                * This indirection could e.g. be used to assign fixed
-                * io_uring_sqe entries to operations and only submit them to
-                * the queue when needed.
-                *
-                * The kernel modifies neither the indices array nor the entries
-                * array.
-                */
-               u32                     *sq_array;
-               struct io_uring_sqe     *sq_sqes;
-               unsigned                cached_sq_head;
-               unsigned                sq_entries;
-               struct list_head        defer_list;
-
-               /*
-                * Fixed resources fast path, should be accessed only under
-                * uring_lock, and updated through io_uring_register(2)
-                */
-               struct io_rsrc_node     *rsrc_node;
-               int                     rsrc_cached_refs;
-               atomic_t                cancel_seq;
-               struct io_file_table    file_table;
-               unsigned                nr_user_files;
-               unsigned                nr_user_bufs;
-               struct io_mapped_ubuf   **user_bufs;
-
-               struct io_submit_state  submit_state;
-
-               struct io_buffer_list   *io_bl;
-               struct xarray           io_bl_xa;
-               struct list_head        io_buffers_cache;
-
-               struct list_head        timeout_list;
-               struct list_head        ltimeout_list;
-               struct list_head        cq_overflow_list;
-               struct list_head        apoll_cache;
-               struct xarray           personalities;
-               u32                     pers_next;
-               unsigned                sq_thread_idle;
-       } ____cacheline_aligned_in_smp;
-
-       /* IRQ completion list, under ->completion_lock */
-       struct io_wq_work_list  locked_free_list;
-       unsigned int            locked_free_nr;
-
-       const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
-       struct io_sq_data       *sq_data;       /* if using sq thread polling */
-
-       struct wait_queue_head  sqo_sq_wait;
-       struct list_head        sqd_list;
-
-       unsigned long           check_cq;
-
-       struct {
-               /*
-                * We cache a range of free CQEs we can use, once exhausted it
-                * should go through a slower range setup, see __io_get_cqe()
-                */
-               struct io_uring_cqe     *cqe_cached;
-               struct io_uring_cqe     *cqe_sentinel;
-
-               unsigned                cached_cq_tail;
-               unsigned                cq_entries;
-               struct io_ev_fd __rcu   *io_ev_fd;
-               struct wait_queue_head  cq_wait;
-               unsigned                cq_extra;
-               atomic_t                cq_timeouts;
-               unsigned                cq_last_tm_flush;
-       } ____cacheline_aligned_in_smp;
-
-       struct {
-               spinlock_t              completion_lock;
-
-               spinlock_t              timeout_lock;
-
-               /*
-                * ->iopoll_list is protected by the ctx->uring_lock for
-                * io_uring instances that don't use IORING_SETUP_SQPOLL.
-                * For SQPOLL, only the single threaded io_sq_thread() will
-                * manipulate the list, hence no extra locking is needed there.
-                */
-               struct io_wq_work_list  iopoll_list;
-               struct hlist_head       *cancel_hash;
-               unsigned                cancel_hash_bits;
-               bool                    poll_multi_queue;
-
-               struct list_head        io_buffers_comp;
-       } ____cacheline_aligned_in_smp;
-
-       struct io_restriction           restrictions;
-
-       /* slow path rsrc auxilary data, used by update/register */
-       struct {
-               struct io_rsrc_node             *rsrc_backup_node;
-               struct io_mapped_ubuf           *dummy_ubuf;
-               struct io_rsrc_data             *file_data;
-               struct io_rsrc_data             *buf_data;
-
-               struct delayed_work             rsrc_put_work;
-               struct llist_head               rsrc_put_llist;
-               struct list_head                rsrc_ref_list;
-               spinlock_t                      rsrc_ref_lock;
-
-               struct list_head        io_buffers_pages;
-       };
-
-       /* Keep this last, we don't need it for the fast path */
-       struct {
-               #if defined(CONFIG_UNIX)
-                       struct socket           *ring_sock;
-               #endif
-               /* hashed buffered write serialization */
-               struct io_wq_hash               *hash_map;
-
-               /* Only used for accounting purposes */
-               struct user_struct              *user;
-               struct mm_struct                *mm_account;
-
-               /* ctx exit and cancelation */
-               struct llist_head               fallback_llist;
-               struct delayed_work             fallback_work;
-               struct work_struct              exit_work;
-               struct list_head                tctx_list;
-               struct completion               ref_comp;
-               u32                             iowq_limits[2];
-               bool                            iowq_limits_set;
-       };
-};
+#define BGID_ARRAY                     64
 
 /*
  * Arbitrary limit, can be raised if need be
@@ -808,232 +534,21 @@ struct io_xattr {
        struct filename                 *filename;
 };
 
-enum {
-       REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
-       REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
-       REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
-       REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
-       REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
-       REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
-       REQ_F_CQE_SKIP_BIT      = IOSQE_CQE_SKIP_SUCCESS_BIT,
-
-       /* first byte is taken by user flags, shift it to not overlap */
-       REQ_F_FAIL_BIT          = 8,
-       REQ_F_INFLIGHT_BIT,
-       REQ_F_CUR_POS_BIT,
-       REQ_F_NOWAIT_BIT,
-       REQ_F_LINK_TIMEOUT_BIT,
-       REQ_F_NEED_CLEANUP_BIT,
-       REQ_F_POLLED_BIT,
-       REQ_F_BUFFER_SELECTED_BIT,
-       REQ_F_BUFFER_RING_BIT,
-       REQ_F_COMPLETE_INLINE_BIT,
-       REQ_F_REISSUE_BIT,
-       REQ_F_CREDS_BIT,
-       REQ_F_REFCOUNT_BIT,
-       REQ_F_ARM_LTIMEOUT_BIT,
-       REQ_F_ASYNC_DATA_BIT,
-       REQ_F_SKIP_LINK_CQES_BIT,
-       REQ_F_SINGLE_POLL_BIT,
-       REQ_F_DOUBLE_POLL_BIT,
-       REQ_F_PARTIAL_IO_BIT,
-       REQ_F_CQE32_INIT_BIT,
-       REQ_F_APOLL_MULTISHOT_BIT,
-       REQ_F_CLEAR_POLLIN_BIT,
-       /* keep async read/write and isreg together and in order */
-       REQ_F_SUPPORT_NOWAIT_BIT,
-       REQ_F_ISREG_BIT,
-
-       /* not a real bit, just to check we're not overflowing the space */
-       __REQ_F_LAST_BIT,
-};
-
-enum {
-       /* ctx owns file */
-       REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
-       /* drain existing IO first */
-       REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
-       /* linked sqes */
-       REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
-       /* doesn't sever on completion < 0 */
-       REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
-       /* IOSQE_ASYNC */
-       REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
-       /* IOSQE_BUFFER_SELECT */
-       REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
-       /* IOSQE_CQE_SKIP_SUCCESS */
-       REQ_F_CQE_SKIP          = BIT(REQ_F_CQE_SKIP_BIT),
-
-       /* fail rest of links */
-       REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
-       /* on inflight list, should be cancelled and waited on exit reliably */
-       REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
-       /* read/write uses file position */
-       REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
-       /* must not punt to workers */
-       REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
-       /* has or had linked timeout */
-       REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
-       /* needs cleanup */
-       REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
-       /* already went through poll handler */
-       REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
-       /* buffer already selected */
-       REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
-       /* buffer selected from ring, needs commit */
-       REQ_F_BUFFER_RING       = BIT(REQ_F_BUFFER_RING_BIT),
-       /* completion is deferred through io_comp_state */
-       REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
-       /* caller should reissue async */
-       REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
-       /* supports async reads/writes */
-       REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
-       /* regular file */
-       REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
-       /* has creds assigned */
-       REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
-       /* skip refcounting if not set */
-       REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
-       /* there is a linked timeout that has to be armed */
-       REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
-       /* ->async_data allocated */
-       REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
-       /* don't post CQEs while failing linked requests */
-       REQ_F_SKIP_LINK_CQES    = BIT(REQ_F_SKIP_LINK_CQES_BIT),
-       /* single poll may be active */
-       REQ_F_SINGLE_POLL       = BIT(REQ_F_SINGLE_POLL_BIT),
-       /* double poll may active */
-       REQ_F_DOUBLE_POLL       = BIT(REQ_F_DOUBLE_POLL_BIT),
-       /* request has already done partial IO */
-       REQ_F_PARTIAL_IO        = BIT(REQ_F_PARTIAL_IO_BIT),
-       /* fast poll multishot mode */
-       REQ_F_APOLL_MULTISHOT   = BIT(REQ_F_APOLL_MULTISHOT_BIT),
-       /* ->extra1 and ->extra2 are initialised */
-       REQ_F_CQE32_INIT        = BIT(REQ_F_CQE32_INIT_BIT),
-       /* recvmsg special flag, clear EPOLLIN */
-       REQ_F_CLEAR_POLLIN      = BIT(REQ_F_CLEAR_POLLIN_BIT),
-};
-
 struct async_poll {
        struct io_poll          poll;
        struct io_poll          *double_poll;
 };
 
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
-
-struct io_task_work {
-       union {
-               struct io_wq_work_node  node;
-               struct llist_node       fallback_node;
-       };
-       io_req_tw_func_t                func;
-};
-
 enum {
        IORING_RSRC_FILE                = 0,
        IORING_RSRC_BUFFER              = 1,
 };
 
-struct io_cqe {
-       __u64   user_data;
-       __s32   res;
-       /* fd initially, then cflags for completion */
-       union {
-               __u32   flags;
-               int     fd;
-       };
-};
-
 enum {
        IO_CHECK_CQ_OVERFLOW_BIT,
        IO_CHECK_CQ_DROPPED_BIT,
 };
 
-/*
- * Each request type overlays its private data structure on top of this one.
- * They must not exceed this one in size.
- */
-struct io_cmd_data {
-       struct file             *file;
-       /* each command gets 56 bytes of data */
-       __u8                    data[56];
-};
-
-#define io_kiocb_to_cmd(req)   ((void *) &(req)->cmd)
-#define cmd_to_io_kiocb(ptr)   ((struct io_kiocb *) ptr)
-
-struct io_kiocb {
-       union {
-               /*
-                * NOTE! Each of the io_kiocb union members has the file pointer
-                * as the first entry in their struct definition. So you can
-                * access the file pointer through any of the sub-structs,
-                * or directly as just 'file' in this struct.
-                */
-               struct file             *file;
-               struct io_cmd_data      cmd;
-       };
-
-       u8                              opcode;
-       /* polled IO has completed */
-       u8                              iopoll_completed;
-       /*
-        * Can be either a fixed buffer index, or used with provided buffers.
-        * For the latter, before issue it points to the buffer group ID,
-        * and after selection it points to the buffer ID itself.
-        */
-       u16                             buf_index;
-       unsigned int                    flags;
-
-       struct io_cqe                   cqe;
-
-       struct io_ring_ctx              *ctx;
-       struct task_struct              *task;
-
-       struct io_rsrc_node             *rsrc_node;
-
-       union {
-               /* store used ubuf, so we can prevent reloading */
-               struct io_mapped_ubuf   *imu;
-
-               /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
-               struct io_buffer        *kbuf;
-
-               /*
-                * stores buffer ID for ring provided buffers, valid IFF
-                * REQ_F_BUFFER_RING is set.
-                */
-               struct io_buffer_list   *buf_list;
-       };
-
-       union {
-               /* used by request caches, completion batching and iopoll */
-               struct io_wq_work_node  comp_list;
-               /* cache ->apoll->events */
-               __poll_t apoll_events;
-       };
-       atomic_t                        refs;
-       atomic_t                        poll_refs;
-       struct io_task_work             io_task_work;
-       /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
-       union {
-               struct hlist_node       hash_node;
-               struct {
-                       u64             extra1;
-                       u64             extra2;
-               };
-       };
-       /* internal polling, see IORING_FEAT_FAST_POLL */
-       struct async_poll               *apoll;
-       /* opcode allocated if it needs to store data for async defer */
-       void                            *async_data;
-       /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
-       struct io_kiocb                 *link;
-       /* custom credentials, valid IFF REQ_F_CREDS is set */
-       const struct cred               *creds;
-       struct io_wq_work               work;
-};
-
 struct io_tctx_node {
        struct list_head        ctx_node;
        struct task_struct      *task;
diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h
new file mode 100644
index 0000000..1a0f592
--- /dev/null
+++ b/io_uring/io_uring_types.h
@@ -0,0 +1,496 @@
+#ifndef IO_URING_TYPES_H
+#define IO_URING_TYPES_H
+
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
+
+#include "io-wq.h"
+
+struct io_uring {
+       u32 head ____cacheline_aligned_in_smp;
+       u32 tail ____cacheline_aligned_in_smp;
+};
+
+/*
+ * This data is shared with the application through the mmap at offsets
+ * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_sqring_offsets when calling io_uring_setup.
+ */
+struct io_rings {
+       /*
+        * Head and tail offsets into the ring; the offsets need to be
+        * masked to get valid indices.
+        *
+        * The kernel controls head of the sq ring and the tail of the cq ring,
+        * and the application controls tail of the sq ring and the head of the
+        * cq ring.
+        */
+       struct io_uring         sq, cq;
+       /*
+        * Bitmasks to apply to head and tail offsets (constant, equals
+        * ring_entries - 1)
+        */
+       u32                     sq_ring_mask, cq_ring_mask;
+       /* Ring sizes (constant, power of 2) */
+       u32                     sq_ring_entries, cq_ring_entries;
+       /*
+        * Number of invalid entries dropped by the kernel due to
+        * invalid index stored in array
+        *
+        * Written by the kernel, shouldn't be modified by the
+        * application (i.e. get number of "new events" by comparing to
+        * cached value).
+        *
+        * After a new SQ head value was read by the application this
+        * counter includes all submissions that were dropped reaching
+        * the new SQ head (and possibly more).
+        */
+       u32                     sq_dropped;
+       /*
+        * Runtime SQ flags
+        *
+        * Written by the kernel, shouldn't be modified by the
+        * application.
+        *
+        * The application needs a full memory barrier before checking
+        * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
+        */
+       atomic_t                sq_flags;
+       /*
+        * Runtime CQ flags
+        *
+        * Written by the application, shouldn't be modified by the
+        * kernel.
+        */
+       u32                     cq_flags;
+       /*
+        * Number of completion events lost because the queue was full;
+        * this should be avoided by the application by making sure
+        * there are not more requests pending than there is space in
+        * the completion queue.
+        *
+        * Written by the kernel, shouldn't be modified by the
+        * application (i.e. get number of "new events" by comparing to
+        * cached value).
+        *
+        * As completion events come in out of order this counter is not
+        * ordered with any other data.
+        */
+       u32                     cq_overflow;
+       /*
+        * Ring buffer of completion events.
+        *
+        * The kernel writes completion events fresh every time they are
+        * produced, so the application is allowed to modify pending
+        * entries.
+        */
+       struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
+};
+
+struct io_restriction {
+       DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
+       DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+       u8 sqe_flags_allowed;
+       u8 sqe_flags_required;
+       bool registered;
+};
+
+struct io_submit_link {
+       struct io_kiocb         *head;
+       struct io_kiocb         *last;
+};
+
+struct io_submit_state {
+       /* inline/task_work completion list, under ->uring_lock */
+       struct io_wq_work_node  free_list;
+       /* batch completion logic */
+       struct io_wq_work_list  compl_reqs;
+       struct io_submit_link   link;
+
+       bool                    plug_started;
+       bool                    need_plug;
+       bool                    flush_cqes;
+       unsigned short          submit_nr;
+       struct blk_plug         plug;
+};
+
+struct io_ev_fd {
+       struct eventfd_ctx      *cq_ev_fd;
+       unsigned int            eventfd_async: 1;
+       struct rcu_head         rcu;
+};
+
+struct io_file_table {
+       struct io_fixed_file *files;
+       unsigned long *bitmap;
+       unsigned int alloc_hint;
+};
+
+struct io_ring_ctx {
+       /* const or read-mostly hot data */
+       struct {
+               struct percpu_ref       refs;
+
+               struct io_rings         *rings;
+               unsigned int            flags;
+               enum task_work_notify_mode      notify_method;
+               unsigned int            compat: 1;
+               unsigned int            drain_next: 1;
+               unsigned int            restricted: 1;
+               unsigned int            off_timeout_used: 1;
+               unsigned int            drain_active: 1;
+               unsigned int            drain_disabled: 1;
+               unsigned int            has_evfd: 1;
+               unsigned int            syscall_iopoll: 1;
+       } ____cacheline_aligned_in_smp;
+
+       /* submission data */
+       struct {
+               struct mutex            uring_lock;
+
+               /*
+                * Ring buffer of indices into array of io_uring_sqe, which is
+                * mmapped by the application using the IORING_OFF_SQES offset.
+                *
+                * This indirection could e.g. be used to assign fixed
+                * io_uring_sqe entries to operations and only submit them to
+                * the queue when needed.
+                *
+                * The kernel modifies neither the indices array nor the entries
+                * array.
+                */
+               u32                     *sq_array;
+               struct io_uring_sqe     *sq_sqes;
+               unsigned                cached_sq_head;
+               unsigned                sq_entries;
+               struct list_head        defer_list;
+
+               /*
+                * Fixed resources fast path, should be accessed only under
+                * uring_lock, and updated through io_uring_register(2)
+                */
+               struct io_rsrc_node     *rsrc_node;
+               int                     rsrc_cached_refs;
+               atomic_t                cancel_seq;
+               struct io_file_table    file_table;
+               unsigned                nr_user_files;
+               unsigned                nr_user_bufs;
+               struct io_mapped_ubuf   **user_bufs;
+
+               struct io_submit_state  submit_state;
+
+               struct io_buffer_list   *io_bl;
+               struct xarray           io_bl_xa;
+               struct list_head        io_buffers_cache;
+
+               struct list_head        timeout_list;
+               struct list_head        ltimeout_list;
+               struct list_head        cq_overflow_list;
+               struct list_head        apoll_cache;
+               struct xarray           personalities;
+               u32                     pers_next;
+               unsigned                sq_thread_idle;
+       } ____cacheline_aligned_in_smp;
+
+       /* IRQ completion list, under ->completion_lock */
+       struct io_wq_work_list  locked_free_list;
+       unsigned int            locked_free_nr;
+
+       const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
+       struct io_sq_data       *sq_data;       /* if using sq thread polling */
+
+       struct wait_queue_head  sqo_sq_wait;
+       struct list_head        sqd_list;
+
+       unsigned long           check_cq;
+
+       struct {
+               /*
+                * We cache a range of free CQEs we can use, once exhausted it
+                * should go through a slower range setup, see __io_get_cqe()
+                */
+               struct io_uring_cqe     *cqe_cached;
+               struct io_uring_cqe     *cqe_sentinel;
+
+               unsigned                cached_cq_tail;
+               unsigned                cq_entries;
+               struct io_ev_fd __rcu   *io_ev_fd;
+               struct wait_queue_head  cq_wait;
+               unsigned                cq_extra;
+               atomic_t                cq_timeouts;
+               unsigned                cq_last_tm_flush;
+       } ____cacheline_aligned_in_smp;
+
+       struct {
+               spinlock_t              completion_lock;
+
+               spinlock_t              timeout_lock;
+
+               /*
+                * ->iopoll_list is protected by the ctx->uring_lock for
+                * io_uring instances that don't use IORING_SETUP_SQPOLL.
+                * For SQPOLL, only the single threaded io_sq_thread() will
+                * manipulate the list, hence no extra locking is needed there.
+                */
+               struct io_wq_work_list  iopoll_list;
+               struct hlist_head       *cancel_hash;
+               unsigned                cancel_hash_bits;
+               bool                    poll_multi_queue;
+
+               struct list_head        io_buffers_comp;
+       } ____cacheline_aligned_in_smp;
+
+       struct io_restriction           restrictions;
+
+       /* slow path rsrc auxilary data, used by update/register */
+       struct {
+               struct io_rsrc_node             *rsrc_backup_node;
+               struct io_mapped_ubuf           *dummy_ubuf;
+               struct io_rsrc_data             *file_data;
+               struct io_rsrc_data             *buf_data;
+
+               struct delayed_work             rsrc_put_work;
+               struct llist_head               rsrc_put_llist;
+               struct list_head                rsrc_ref_list;
+               spinlock_t                      rsrc_ref_lock;
+
+               struct list_head        io_buffers_pages;
+       };
+
+       /* Keep this last, we don't need it for the fast path */
+       struct {
+               #if defined(CONFIG_UNIX)
+                       struct socket           *ring_sock;
+               #endif
+               /* hashed buffered write serialization */
+               struct io_wq_hash               *hash_map;
+
+               /* Only used for accounting purposes */
+               struct user_struct              *user;
+               struct mm_struct                *mm_account;
+
+               /* ctx exit and cancelation */
+               struct llist_head               fallback_llist;
+               struct delayed_work             fallback_work;
+               struct work_struct              exit_work;
+               struct list_head                tctx_list;
+               struct completion               ref_comp;
+               u32                             iowq_limits[2];
+               bool                            iowq_limits_set;
+       };
+};
+
+enum {
+       REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
+       REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
+       REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
+       REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
+       REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
+       REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+       REQ_F_CQE_SKIP_BIT      = IOSQE_CQE_SKIP_SUCCESS_BIT,
+
+       /* first byte is taken by user flags, shift it to not overlap */
+       REQ_F_FAIL_BIT          = 8,
+       REQ_F_INFLIGHT_BIT,
+       REQ_F_CUR_POS_BIT,
+       REQ_F_NOWAIT_BIT,
+       REQ_F_LINK_TIMEOUT_BIT,
+       REQ_F_NEED_CLEANUP_BIT,
+       REQ_F_POLLED_BIT,
+       REQ_F_BUFFER_SELECTED_BIT,
+       REQ_F_BUFFER_RING_BIT,
+       REQ_F_COMPLETE_INLINE_BIT,
+       REQ_F_REISSUE_BIT,
+       REQ_F_CREDS_BIT,
+       REQ_F_REFCOUNT_BIT,
+       REQ_F_ARM_LTIMEOUT_BIT,
+       REQ_F_ASYNC_DATA_BIT,
+       REQ_F_SKIP_LINK_CQES_BIT,
+       REQ_F_SINGLE_POLL_BIT,
+       REQ_F_DOUBLE_POLL_BIT,
+       REQ_F_PARTIAL_IO_BIT,
+       REQ_F_CQE32_INIT_BIT,
+       REQ_F_APOLL_MULTISHOT_BIT,
+       REQ_F_CLEAR_POLLIN_BIT,
+       /* keep async read/write and isreg together and in order */
+       REQ_F_SUPPORT_NOWAIT_BIT,
+       REQ_F_ISREG_BIT,
+
+       /* not a real bit, just to check we're not overflowing the space */
+       __REQ_F_LAST_BIT,
+};
+
+enum {
+       /* ctx owns file */
+       REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
+       /* drain existing IO first */
+       REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
+       /* linked sqes */
+       REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
+       /* doesn't sever on completion < 0 */
+       REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
+       /* IOSQE_ASYNC */
+       REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
+       /* IOSQE_BUFFER_SELECT */
+       REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
+       /* IOSQE_CQE_SKIP_SUCCESS */
+       REQ_F_CQE_SKIP          = BIT(REQ_F_CQE_SKIP_BIT),
+
+       /* fail rest of links */
+       REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
+       /* on inflight list, should be cancelled and waited on exit reliably */
+       REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
+       /* read/write uses file position */
+       REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
+       /* must not punt to workers */
+       REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
+       /* has or had linked timeout */
+       REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
+       /* needs cleanup */
+       REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
+       /* already went through poll handler */
+       REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
+       /* buffer already selected */
+       REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
+       /* buffer selected from ring, needs commit */
+       REQ_F_BUFFER_RING       = BIT(REQ_F_BUFFER_RING_BIT),
+       /* completion is deferred through io_comp_state */
+       REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
+       /* caller should reissue async */
+       REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
+       /* supports async reads/writes */
+       REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
+       /* regular file */
+       REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
+       /* has creds assigned */
+       REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
+       /* skip refcounting if not set */
+       REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
+       /* there is a linked timeout that has to be armed */
+       REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+       /* ->async_data allocated */
+       REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
+       /* don't post CQEs while failing linked requests */
+       REQ_F_SKIP_LINK_CQES    = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+       /* single poll may be active */
+       REQ_F_SINGLE_POLL       = BIT(REQ_F_SINGLE_POLL_BIT),
+       /* double poll may active */
+       REQ_F_DOUBLE_POLL       = BIT(REQ_F_DOUBLE_POLL_BIT),
+       /* request has already done partial IO */
+       REQ_F_PARTIAL_IO        = BIT(REQ_F_PARTIAL_IO_BIT),
+       /* fast poll multishot mode */
+       REQ_F_APOLL_MULTISHOT   = BIT(REQ_F_APOLL_MULTISHOT_BIT),
+       /* ->extra1 and ->extra2 are initialised */
+       REQ_F_CQE32_INIT        = BIT(REQ_F_CQE32_INIT_BIT),
+       /* recvmsg special flag, clear EPOLLIN */
+       REQ_F_CLEAR_POLLIN      = BIT(REQ_F_CLEAR_POLLIN_BIT),
+};
+
+typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
+
+struct io_task_work {
+       union {
+               struct io_wq_work_node  node;
+               struct llist_node       fallback_node;
+       };
+       io_req_tw_func_t                func;
+};
+
+struct io_cqe {
+       __u64   user_data;
+       __s32   res;
+       /* fd initially, then cflags for completion */
+       union {
+               __u32   flags;
+               int     fd;
+       };
+};
+
+/*
+ * Each request type overlays its private data structure on top of this one.
+ * They must not exceed this one in size.
+ */
+struct io_cmd_data {
+       struct file             *file;
+       /* each command gets 56 bytes of data */
+       __u8                    data[56];
+};
+
+#define io_kiocb_to_cmd(req)   ((void *) &(req)->cmd)
+#define cmd_to_io_kiocb(ptr)   ((struct io_kiocb *) ptr)
+
+struct io_kiocb {
+       union {
+               /*
+                * NOTE! Each of the io_kiocb union members has the file pointer
+                * as the first entry in their struct definition. So you can
+                * access the file pointer through any of the sub-structs,
+                * or directly as just 'file' in this struct.
+                */
+               struct file             *file;
+               struct io_cmd_data      cmd;
+       };
+
+       u8                              opcode;
+       /* polled IO has completed */
+       u8                              iopoll_completed;
+       /*
+        * Can be either a fixed buffer index, or used with provided buffers.
+        * For the latter, before issue it points to the buffer group ID,
+        * and after selection it points to the buffer ID itself.
+        */
+       u16                             buf_index;
+       unsigned int                    flags;
+
+       struct io_cqe                   cqe;
+
+       struct io_ring_ctx              *ctx;
+       struct task_struct              *task;
+
+       struct io_rsrc_node             *rsrc_node;
+
+       union {
+               /* store used ubuf, so we can prevent reloading */
+               struct io_mapped_ubuf   *imu;
+
+               /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+               struct io_buffer        *kbuf;
+
+               /*
+                * stores buffer ID for ring provided buffers, valid IFF
+                * REQ_F_BUFFER_RING is set.
+                */
+               struct io_buffer_list   *buf_list;
+       };
+
+       union {
+               /* used by request caches, completion batching and iopoll */
+               struct io_wq_work_node  comp_list;
+               /* cache ->apoll->events */
+               __poll_t apoll_events;
+       };
+       atomic_t                        refs;
+       atomic_t                        poll_refs;
+       struct io_task_work             io_task_work;
+       /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+       union {
+               struct hlist_node       hash_node;
+               struct {
+                       u64             extra1;
+                       u64             extra2;
+               };
+       };
+       /* internal polling, see IORING_FEAT_FAST_POLL */
+       struct async_poll               *apoll;
+       /* opcode allocated if it needs to store data for async defer */
+       void                            *async_data;
+       /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
+       struct io_kiocb                 *link;
+       /* custom credentials, valid IFF REQ_F_CREDS is set */
+       const struct cred               *creds;
+       struct io_wq_work               work;
+};
+
+#endif
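
Appended for illustration, not part of the commit: the io_rings comment moved into this header describes the area shared with the application through mmap() at IORING_OFF_SQ_RING and IORING_OFF_CQ_RING, with field offsets published via struct io_sqring_offsets at io_uring_setup(2) time. A minimal userspace sketch of that consumer side follows, using raw syscalls rather than liburing; map_sq_ring() and its signature are invented for the example, while the syscall, the mmap offset and the sq_off fields are the regular uapi. The caller is assumed to pass a zero-initialized io_uring_params.

	#include <linux/io_uring.h>
	#include <stddef.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* map the SQ ring and return pointers to the shared head/tail words */
	static int map_sq_ring(unsigned entries, struct io_uring_params *p,
			       unsigned **head, unsigned **tail)
	{
		unsigned char *ring;
		size_t sz;
		int fd;

		fd = syscall(__NR_io_uring_setup, entries, p);
		if (fd < 0)
			return -1;

		/* the index array sits at the end of the SQ ring area */
		sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
		ring = mmap(NULL, sz, PROT_READ | PROT_WRITE,
			    MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
		if (ring == MAP_FAILED) {
			close(fd);
			return -1;
		}

		/*
		 * Per the io_rings comment: the kernel writes the SQ head,
		 * the application writes the SQ tail.
		 */
		*head = (unsigned *)(ring + p->sq_off.head);
		*tail = (unsigned *)(ring + p->sq_off.tail);
		return fd;
	}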