Merge tag 'for-5.13/io_uring-2021-04-27' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 28 Apr 2021 21:56:09 +0000 (14:56 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 28 Apr 2021 21:56:09 +0000 (14:56 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 28 Apr 2021 21:56:09 +0000 (14:56 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 28 Apr 2021 21:56:09 +0000 (14:56 -0700)
diff --combined fs/io_uring.c

index dff3497,63ff705..360f813
--- 1/fs/io_uring.c
--- 2/fs/io_uring.c
+++ b/fs/io_uring.c
@@@ -194,43 -194,56 +194,56 @@@ enum io_uring_cmd_flags 
   
   struct io_mapped_ubuf {
         u64             ubuf;
-       size_t          len;
-       struct          bio_vec *bvec;
+       u64             ubuf_end;
         unsigned int    nr_bvecs;
         unsigned long   acct_pages;
+       struct bio_vec  bvec[];
   };
   
   struct io_ring_ctx;
   
+ struct io_overflow_cqe {
+       struct io_uring_cqe cqe;
+       struct list_head list;
+ };
+ 
+ struct io_fixed_file {
+       /* file * with additional FFS_* flags */
+       unsigned long file_ptr;
+ };
+ 
   struct io_rsrc_put {
         struct list_head list;
+       u64 tag;
         union {
                 void *rsrc;
                 struct file *file;
+               struct io_mapped_ubuf *buf;
         };
   };
   
- struct fixed_rsrc_table {
-       struct file             **files;
+ struct io_file_table {
+       /* two level table */
+       struct io_fixed_file **files;
   };
   
- struct fixed_rsrc_ref_node {
+ struct io_rsrc_node {
         struct percpu_ref               refs;
         struct list_head                node;
         struct list_head                rsrc_list;
-       struct fixed_rsrc_data          *rsrc_data;
-       void                            (*rsrc_put)(struct io_ring_ctx *ctx,
-                                                   struct io_rsrc_put *prsrc);
+       struct io_rsrc_data             *rsrc_data;
         struct llist_node               llist;
         bool                            done;
   };
   
- struct fixed_rsrc_data {
-       struct fixed_rsrc_table         *table;
+ typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
+ 
+ struct io_rsrc_data {
         struct io_ring_ctx              *ctx;
   
-       struct fixed_rsrc_ref_node      *node;
-       struct percpu_ref               refs;
+       u64                             *tags;
+       rsrc_put_fn                     *do_put;
+       atomic_t                        refs;
         struct completion               done;
         bool                            quiesce;
   };
@@@ -330,7 -343,6 +343,6 @@@ struct io_ring_ctx 
         struct {
                 unsigned int            flags;
                 unsigned int            compat: 1;
-               unsigned int            cq_overflow_flushed: 1;
                 unsigned int            drain_next: 1;
                 unsigned int            eventfd_async: 1;
                 unsigned int            restricted: 1;
@@@ -388,12 -400,14 +400,14 @@@
          * readers must ensure that ->refs is alive as long as the file* is
          * used. Only updated through io_uring_register(2).
          */
-       struct fixed_rsrc_data  *file_data;
+       struct io_rsrc_data     *file_data;
+       struct io_file_table    file_table;
         unsigned                nr_user_files;
   
         /* if used, fixed mapped user buffers */
+       struct io_rsrc_data     *buf_data;
         unsigned                nr_user_bufs;
-       struct io_mapped_ubuf   *user_bufs;
+       struct io_mapped_ubuf   **user_bufs;
   
         struct user_struct      *user;
   
@@@ -414,6 -428,7 +428,7 @@@
                 unsigned                cq_mask;
                 atomic_t                cq_timeouts;
                 unsigned                cq_last_tm_flush;
+               unsigned                cq_extra;
                 unsigned long           cq_check_overflow;
                 struct wait_queue_head  cq_wait;
                 struct fasync_struct    *cq_fasync;
@@@ -433,23 -448,20 +448,20 @@@
                 struct hlist_head       *cancel_hash;
                 unsigned                cancel_hash_bits;
                 bool                    poll_multi_file;
- 
-               spinlock_t              inflight_lock;
-               struct list_head        inflight_list;
         } ____cacheline_aligned_in_smp;
   
         struct delayed_work             rsrc_put_work;
         struct llist_head               rsrc_put_llist;
         struct list_head                rsrc_ref_list;
         spinlock_t                      rsrc_ref_lock;
+       struct io_rsrc_node             *rsrc_node;
+       struct io_rsrc_node             *rsrc_backup_node;
   
         struct io_restriction           restrictions;
   
         /* exit task_work */
         struct callback_head            *exit_task_work;
   
-       struct wait_queue_head          hash_wait;
- 
         /* Keep this last, we don't need it for the fast path */
         struct work_struct              exit_work;
         struct list_head                tctx_list;
@@@ -462,8 -474,8 +474,8 @@@ struct io_uring_task 
         const struct io_ring_ctx *last;
         struct io_wq            *io_wq;
         struct percpu_counter   inflight;
+       atomic_t                inflight_tracked;
         atomic_t                in_idle;
-       bool                    sqpoll;
   
         spinlock_t              task_lock;
         struct io_wq_work_list  task_list;
@@@ -484,9 -496,13 +496,13 @@@ struct io_poll_iocb 
         struct wait_queue_entry         wait;
   };
   
- struct io_poll_remove {
+ struct io_poll_update {
         struct file                     *file;
-       u64                             addr;
+       u64                             old_user_data;
+       u64                             new_user_data;
+       __poll_t                        events;
+       bool                            update_events;
+       bool                            update_user_data;
   };
   
   struct io_close {
@@@ -556,8 -572,9 +572,9 @@@ struct io_connect 
   struct io_sr_msg {
         struct file                     *file;
         union {
-               struct user_msghdr __user *umsg;
-               void __user             *buf;
+               struct compat_msghdr __user     *umsg_compat;
+               struct user_msghdr __user       *umsg;
+               void __user                     *buf;
         };
         int                             msg_flags;
         int                             bgid;
@@@ -614,7 -631,7 +631,7 @@@ struct io_splice 
   struct io_provide_buf {
         struct file                     *file;
         __u64                           addr;
-       __s32                           len;
+       __u32                           len;
         __u32                           bgid;
         __u16                           nbufs;
         __u16                           bid;
@@@ -653,7 -670,7 +670,7 @@@ struct io_unlink 
   struct io_completion {
         struct file                     *file;
         struct list_head                list;
-       int                             cflags;
+       u32                             cflags;
   };
   
   struct io_async_connect {
@@@ -690,14 -707,17 +707,17 @@@ enum 
         REQ_F_CUR_POS_BIT,
         REQ_F_NOWAIT_BIT,
         REQ_F_LINK_TIMEOUT_BIT,
-       REQ_F_ISREG_BIT,
         REQ_F_NEED_CLEANUP_BIT,
         REQ_F_POLLED_BIT,
         REQ_F_BUFFER_SELECTED_BIT,
-       REQ_F_NO_FILE_TABLE_BIT,
         REQ_F_LTIMEOUT_ACTIVE_BIT,
         REQ_F_COMPLETE_INLINE_BIT,
         REQ_F_REISSUE_BIT,
+       REQ_F_DONT_REISSUE_BIT,
+       /* keep async read/write and isreg together and in order */
+       REQ_F_ASYNC_READ_BIT,
+       REQ_F_ASYNC_WRITE_BIT,
+       REQ_F_ISREG_BIT,
   
         /* not a real bit, just to check we're not overflowing the space */
         __REQ_F_LAST_BIT,
@@@ -727,22 -747,26 +747,26 @@@ enum 
         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
         /* has or had linked timeout */
         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
-       /* regular file */
-       REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
         /* needs cleanup */
         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
         /* already went through poll handler */
         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
         /* buffer already selected */
         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
-       /* doesn't need file table for this request */
-       REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
         /* linked timeout is active, i.e. prepared by link's head */
         REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
         /* completion is deferred through io_comp_state */
         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
         /* caller should reissue async */
         REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
+       /* don't attempt request reissue, see io_rw_reissue() */
+       REQ_F_DONT_REISSUE      = BIT(REQ_F_DONT_REISSUE_BIT),
+       /* supports async reads */
+       REQ_F_ASYNC_READ        = BIT(REQ_F_ASYNC_READ_BIT),
+       /* supports async writes */
+       REQ_F_ASYNC_WRITE       = BIT(REQ_F_ASYNC_WRITE_BIT),
+       /* regular file */
+       REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
   };
   
   struct async_poll {
@@@ -766,7 -790,7 +790,7 @@@ struct io_kiocb 
                 struct file             *file;
                 struct io_rw            rw;
                 struct io_poll_iocb     poll;
-               struct io_poll_remove   poll_remove;
+               struct io_poll_update   poll_update;
                 struct io_accept        accept;
                 struct io_sync          sync;
                 struct io_cancel        cancel;
@@@ -801,17 -825,14 +825,14 @@@
   
         struct io_ring_ctx              *ctx;
         unsigned int                    flags;
-       refcount_t                      refs;
+       atomic_t                        refs;
         struct task_struct              *task;
         u64                             user_data;
   
         struct io_kiocb                 *link;
         struct percpu_ref               *fixed_rsrc_refs;
   
-       /*
-        * 1. used with ctx->iopoll_list with reads/writes
-        * 2. to track reqs with ->files (see io_op_def::file_table)
-        */
+       /* used with ctx->iopoll_list with reads/writes */
         struct list_head                inflight_entry;
         union {
                 struct io_task_work     io_task_work;
@@@ -821,6 -842,8 +842,8 @@@
         struct hlist_node               hash_node;
         struct async_poll               *apoll;
         struct io_wq_work               work;
+       /* store used ubuf, so we can prevent reloading */
+       struct io_mapped_ubuf           *imu;
   };
   
   struct io_tctx_node {
@@@ -849,8 -872,8 +872,8 @@@ struct io_op_def 
         unsigned                pollout : 1;
         /* op supports buffer selection */
         unsigned                buffer_select : 1;
-       /* must always have async data allocated */
-       unsigned                needs_async_data : 1;
+       /* do prep async if is going to be punted */
+       unsigned                needs_async_setup : 1;
         /* should block plug */
         unsigned                plug : 1;
         /* size of async data needed, if any */
@@@ -864,7 -887,7 +887,7 @@@ static const struct io_op_def io_op_def
                 .unbound_nonreg_file    = 1,
                 .pollin                 = 1,
                 .buffer_select          = 1,
-               .needs_async_data       = 1,
+               .needs_async_setup      = 1,
                 .plug                   = 1,
                 .async_size             = sizeof(struct io_async_rw),
         },
@@@ -873,7 -896,7 +896,7 @@@
                 .hash_reg_file          = 1,
                 .unbound_nonreg_file    = 1,
                 .pollout                = 1,
-               .needs_async_data       = 1,
+               .needs_async_setup      = 1,
                 .plug                   = 1,
                 .async_size             = sizeof(struct io_async_rw),
         },
@@@ -907,7 -930,7 +930,7 @@@
                 .needs_file             = 1,
                 .unbound_nonreg_file    = 1,
                 .pollout                = 1,
-               .needs_async_data       = 1,
+               .needs_async_setup      = 1,
                 .async_size             = sizeof(struct io_async_msghdr),
         },
         [IORING_OP_RECVMSG] = {
@@@ -915,11 -938,10 +938,10 @@@
                 .unbound_nonreg_file    = 1,
                 .pollin                 = 1,
                 .buffer_select          = 1,
-               .needs_async_data       = 1,
+               .needs_async_setup      = 1,
                 .async_size             = sizeof(struct io_async_msghdr),
         },
         [IORING_OP_TIMEOUT] = {
-               .needs_async_data       = 1,
                 .async_size             = sizeof(struct io_timeout_data),
         },
         [IORING_OP_TIMEOUT_REMOVE] = {
@@@ -932,14 -954,13 +954,13 @@@
         },
         [IORING_OP_ASYNC_CANCEL] = {},
         [IORING_OP_LINK_TIMEOUT] = {
-               .needs_async_data       = 1,
                 .async_size             = sizeof(struct io_timeout_data),
         },
         [IORING_OP_CONNECT] = {
                 .needs_file             = 1,
                 .unbound_nonreg_file    = 1,
                 .pollout                = 1,
-               .needs_async_data       = 1,
+               .needs_async_setup      = 1,
                 .async_size             = sizeof(struct io_async_connect),
         },
         [IORING_OP_FALLOCATE] = {
@@@ -1008,40 -1029,31 +1029,31 @@@ static void io_uring_del_task_file(unsi
   static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                          struct task_struct *task,
                                          struct files_struct *files);
- static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
- static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
- static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
-                       struct io_ring_ctx *ctx);
- static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
- 
- static bool io_rw_reissue(struct io_kiocb *req);
- static void io_cqring_fill_event(struct io_kiocb *req, long res);
+ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
+ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
+ 
+ static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
+                                long res, unsigned int cflags);
   static void io_put_req(struct io_kiocb *req);
   static void io_put_req_deferred(struct io_kiocb *req, int nr);
- static void io_double_put_req(struct io_kiocb *req);
   static void io_dismantle_req(struct io_kiocb *req);
   static void io_put_task(struct task_struct *task, int nr);
- static void io_queue_next(struct io_kiocb *req);
   static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
- static void __io_queue_linked_timeout(struct io_kiocb *req);
   static void io_queue_linked_timeout(struct io_kiocb *req);
- static int __io_sqe_files_update(struct io_ring_ctx *ctx,
-                                struct io_uring_rsrc_update *ip,
-                                unsigned nr_args);
- static void __io_clean_op(struct io_kiocb *req);
+ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
+                                    struct io_uring_rsrc_update2 *up,
+                                    unsigned nr_args);
+ static void io_clean_op(struct io_kiocb *req);
   static struct file *io_file_get(struct io_submit_state *state,
                                 struct io_kiocb *req, int fd, bool fixed);
   static void __io_queue_sqe(struct io_kiocb *req);
   static void io_rsrc_put_work(struct work_struct *work);
   
- static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
-                          struct iov_iter *iter, bool needs_lock);
- static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
-                            const struct iovec *fast_iov,
-                            struct iov_iter *iter, bool force);
   static void io_req_task_queue(struct io_kiocb *req);
   static void io_submit_flush_completions(struct io_comp_state *cs,
                                         struct io_ring_ctx *ctx);
+ static bool io_poll_remove_waitqs(struct io_kiocb *req);
+ static int io_req_prep_async(struct io_kiocb *req);
   
   static struct kmem_cache *req_cachep;
   
@@@ -1063,34 -1075,36 +1075,36 @@@ EXPORT_SYMBOL(io_uring_get_socket)
   #define io_for_each_link(pos, head) \
         for (pos = (head); pos; pos = pos->link)
   
- static inline void io_clean_op(struct io_kiocb *req)
- {
-       if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
-               __io_clean_op(req);
- }
- 
- static inline void io_set_resource_node(struct io_kiocb *req)
+ static inline void io_req_set_rsrc_node(struct io_kiocb *req)
   {
         struct io_ring_ctx *ctx = req->ctx;
   
         if (!req->fixed_rsrc_refs) {
-               req->fixed_rsrc_refs = &ctx->file_data->node->refs;
+               req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
                 percpu_ref_get(req->fixed_rsrc_refs);
         }
   }
   
+ static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+ {
+       bool got = percpu_ref_tryget(ref);
+ 
+       /* already at zero, wait for ->release() */
+       if (!got)
+               wait_for_completion(compl);
+       percpu_ref_resurrect(ref);
+       if (got)
+               percpu_ref_put(ref);
+ }
+ 
   static bool io_match_task(struct io_kiocb *head,
                           struct task_struct *task,
                           struct files_struct *files)
   {
         struct io_kiocb *req;
   
-       if (task && head->task != task) {
-               /* in terms of cancelation, always match if req task is dead */
-               if (head->task->flags & PF_EXITING)
-                       return true;
+       if (task && head->task != task)
                 return false;
-       }
         if (!files)
                 return true;
   
@@@ -1103,7 -1117,7 +1117,7 @@@
   
   static inline void req_set_fail_links(struct io_kiocb *req)
   {
-       if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+       if (req->flags & REQ_F_LINK)
                 req->flags |= REQ_F_FAIL_LINK;
   }
   
@@@ -1161,8 -1175,6 +1175,6 @@@ static struct io_ring_ctx *io_ring_ctx_
         INIT_LIST_HEAD(&ctx->iopoll_list);
         INIT_LIST_HEAD(&ctx->defer_list);
         INIT_LIST_HEAD(&ctx->timeout_list);
-       spin_lock_init(&ctx->inflight_lock);
-       INIT_LIST_HEAD(&ctx->inflight_list);
         spin_lock_init(&ctx->rsrc_ref_lock);
         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
@@@ -1182,7 -1194,7 +1194,7 @@@ static bool req_need_defer(struct io_ki
         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
                 struct io_ring_ctx *ctx = req->ctx;
   
-               return seq != ctx->cached_cq_tail
+               return seq + ctx->cq_extra != ctx->cached_cq_tail
                                 + READ_ONCE(ctx->cached_cq_overflow);
         }
   
@@@ -1191,14 -1203,9 +1203,9 @@@
   
   static void io_req_track_inflight(struct io_kiocb *req)
   {
-       struct io_ring_ctx *ctx = req->ctx;
- 
         if (!(req->flags & REQ_F_INFLIGHT)) {
                 req->flags |= REQ_F_INFLIGHT;
- 
-               spin_lock_irq(&ctx->inflight_lock);
-               list_add(&req->inflight_entry, &ctx->inflight_list);
-               spin_unlock_irq(&ctx->inflight_lock);
+               atomic_inc(&current->io_uring->inflight_tracked);
         }
   }
   
@@@ -1210,6 -1217,8 +1217,8 @@@ static void io_prep_async_work(struct i
         if (!req->work.creds)
                 req->work.creds = get_current_cred();
   
+       req->work.list.next = NULL;
+       req->work.flags = 0;
         if (req->flags & REQ_F_FORCE_ASYNC)
                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
   
@@@ -1220,6 -1229,14 +1229,14 @@@
                 if (def->unbound_nonreg_file)
                         req->work.flags |= IO_WQ_WORK_UNBOUND;
         }
+ 
+       switch (req->opcode) {
+       case IORING_OP_SPLICE:
+       case IORING_OP_TEE:
+               if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
+                       req->work.flags |= IO_WQ_WORK_UNBOUND;
+               break;
+       }
   }
   
   static void io_prep_async_link(struct io_kiocb *req)
@@@ -1249,16 -1266,15 +1266,15 @@@ static void io_queue_async_work(struct 
   }
   
   static void io_kill_timeout(struct io_kiocb *req, int status)
+       __must_hold(&req->ctx->completion_lock)
   {
         struct io_timeout_data *io = req->async_data;
-       int ret;
   
-       ret = hrtimer_try_to_cancel(&io->timer);
-       if (ret != -1) {
+       if (hrtimer_try_to_cancel(&io->timer) != -1) {
                 atomic_set(&req->ctx->cq_timeouts,
                         atomic_read(&req->ctx->cq_timeouts) + 1);
                 list_del_init(&req->timeout.list);
-               io_cqring_fill_event(req, status);
+               io_cqring_fill_event(req->ctx, req->user_data, status, 0);
                 io_put_req_deferred(req, 1);
         }
   }
@@@ -1336,7 -1352,7 +1352,7 @@@ static inline unsigned int __io_cqring_
         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
   }
   
- static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+ static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
   {
         struct io_rings *rings = ctx->rings;
         unsigned tail;
@@@ -1355,13 -1371,11 +1371,11 @@@
   
   static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
   {
-       if (!ctx->cq_ev_fd)
+       if (likely(!ctx->cq_ev_fd))
                 return false;
         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
                 return false;
-       if (!ctx->eventfd_async)
-               return true;
-       return io_wq_current_is_worker();
+       return !ctx->eventfd_async || io_wq_current_is_worker();
   }
   
   static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
@@@ -1399,41 -1413,33 +1413,33 @@@ static void io_cqring_ev_posted_iopoll(
   }
   
   /* Returns true if there are no backlogged entries after the flush */
- static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
-                                      struct task_struct *tsk,
-                                      struct files_struct *files)
+ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
   {
         struct io_rings *rings = ctx->rings;
-       struct io_kiocb *req, *tmp;
-       struct io_uring_cqe *cqe;
         unsigned long flags;
         bool all_flushed, posted;
-       LIST_HEAD(list);
   
         if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
                 return false;
   
         posted = false;
         spin_lock_irqsave(&ctx->completion_lock, flags);
-       list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
-               if (!io_match_task(req, tsk, files))
-                       continue;
+       while (!list_empty(&ctx->cq_overflow_list)) {
+               struct io_uring_cqe *cqe = io_get_cqring(ctx);
+               struct io_overflow_cqe *ocqe;
   
-               cqe = io_get_cqring(ctx);
                 if (!cqe && !force)
                         break;
- 
-               list_move(&req->compl.list, &list);
-               if (cqe) {
-                       WRITE_ONCE(cqe->user_data, req->user_data);
-                       WRITE_ONCE(cqe->res, req->result);
-                       WRITE_ONCE(cqe->flags, req->compl.cflags);
-               } else {
-                       ctx->cached_cq_overflow++;
+               ocqe = list_first_entry(&ctx->cq_overflow_list,
+                                       struct io_overflow_cqe, list);
+               if (cqe)
+                       memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+               else
                         WRITE_ONCE(ctx->rings->cq_overflow,
-                                  ctx->cached_cq_overflow);
-               }
+                                  ++ctx->cached_cq_overflow);
                 posted = true;
+               list_del(&ocqe->list);
+               kfree(ocqe);
         }
   
         all_flushed = list_empty(&ctx->cq_overflow_list);
@@@ -1448,19 -1454,10 +1454,10 @@@
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
         if (posted)
                 io_cqring_ev_posted(ctx);
- 
-       while (!list_empty(&list)) {
-               req = list_first_entry(&list, struct io_kiocb, compl.list);
-               list_del(&req->compl.list);
-               io_put_req(req);
-       }
- 
         return all_flushed;
   }
   
- static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
-                                    struct task_struct *tsk,
-                                    struct files_struct *files)
+ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
   {
         bool ret = true;
   
@@@ -1468,7 -1465,7 +1465,7 @@@
                 /* iopoll syncs against uring_lock, not completion_lock */
                 if (ctx->flags & IORING_SETUP_IOPOLL)
                         mutex_lock(&ctx->uring_lock);
-               ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
+               ret = __io_cqring_overflow_flush(ctx, force);
                 if (ctx->flags & IORING_SETUP_IOPOLL)
                         mutex_unlock(&ctx->uring_lock);
         }
@@@ -1476,12 -1473,74 +1473,74 @@@
         return ret;
   }
   
- static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
+ /*
+  * Shamelessly stolen from the mm implementation of page reference checking,
+  * see commit f958d7b528b1 for details.
+  */
+ #define req_ref_zero_or_close_to_overflow(req)        \
+       ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
+ 
+ static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
+ {
+       return atomic_inc_not_zero(&req->refs);
+ }
+ 
+ static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
+ {
+       WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+       return atomic_sub_and_test(refs, &req->refs);
+ }
+ 
+ static inline bool req_ref_put_and_test(struct io_kiocb *req)
+ {
+       WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+       return atomic_dec_and_test(&req->refs);
+ }
+ 
+ static inline void req_ref_put(struct io_kiocb *req)
+ {
+       WARN_ON_ONCE(req_ref_put_and_test(req));
+ }
+ 
+ static inline void req_ref_get(struct io_kiocb *req)
+ {
+       WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+       atomic_inc(&req->refs);
+ }
+ 
+ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
+                                    long res, unsigned int cflags)
+ {
+       struct io_overflow_cqe *ocqe;
+ 
+       ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+       if (!ocqe) {
+               /*
+                * If we're in ring overflow flush mode, or in task cancel mode,
+                * or cannot allocate an overflow entry, then we need to drop it
+                * on the floor.
+                */
+               WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
+               return false;
+       }
+       if (list_empty(&ctx->cq_overflow_list)) {
+               set_bit(0, &ctx->sq_check_overflow);
+               set_bit(0, &ctx->cq_check_overflow);
+               ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+       }
+       ocqe->cqe.user_data = user_data;
+       ocqe->cqe.res = res;
+       ocqe->cqe.flags = cflags;
+       list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
+       return true;
+ }
+ 
+ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
+                                         long res, unsigned int cflags)
   {
-       struct io_ring_ctx *ctx = req->ctx;
         struct io_uring_cqe *cqe;
   
-       trace_io_uring_complete(ctx, req->user_data, res);
+       trace_io_uring_complete(ctx, user_data, res, cflags);
   
         /*
          * If we can't get a cq entry, userspace overflowed the
@@@ -1490,35 -1549,19 +1549,19 @@@
          */
         cqe = io_get_cqring(ctx);
         if (likely(cqe)) {
-               WRITE_ONCE(cqe->user_data, req->user_data);
+               WRITE_ONCE(cqe->user_data, user_data);
                 WRITE_ONCE(cqe->res, res);
                 WRITE_ONCE(cqe->flags, cflags);
-       } else if (ctx->cq_overflow_flushed ||
-                  atomic_read(&req->task->io_uring->in_idle)) {
-               /*
-                * If we're in ring overflow flush mode, or in task cancel mode,
-                * then we cannot store the request for later flushing, we need
-                * to drop it on the floor.
-                */
-               ctx->cached_cq_overflow++;
-               WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
-       } else {
-               if (list_empty(&ctx->cq_overflow_list)) {
-                       set_bit(0, &ctx->sq_check_overflow);
-                       set_bit(0, &ctx->cq_check_overflow);
-                       ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
-               }
-               io_clean_op(req);
-               req->result = res;
-               req->compl.cflags = cflags;
-               refcount_inc(&req->refs);
-               list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
+               return true;
         }
+       return io_cqring_event_overflow(ctx, user_data, res, cflags);
   }
   
- static void io_cqring_fill_event(struct io_kiocb *req, long res)
+ /* not as hot to bloat with inlining */
+ static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
+                                         long res, unsigned int cflags)
   {
-       __io_cqring_fill_event(req, res, 0);
+       return __io_cqring_fill_event(ctx, user_data, res, cflags);
   }
   
   static void io_req_complete_post(struct io_kiocb *req, long res,
@@@ -1528,12 -1571,12 +1571,12 @@@
         unsigned long flags;
   
         spin_lock_irqsave(&ctx->completion_lock, flags);
-       __io_cqring_fill_event(req, res, cflags);
+       __io_cqring_fill_event(ctx, req->user_data, res, cflags);
         /*
          * If we're the last reference to this request, add to our locked
          * free_list cache.
          */
-       if (refcount_dec_and_test(&req->refs)) {
+       if (req_ref_put_and_test(req)) {
                 struct io_comp_state *cs = &ctx->submit_state.comp;
   
                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
@@@ -1561,10 -1604,17 +1604,17 @@@
         }
   }
   
+ static inline bool io_req_needs_clean(struct io_kiocb *req)
+ {
+       return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP |
+                               REQ_F_POLLED | REQ_F_INFLIGHT);
+ }
+ 
   static void io_req_complete_state(struct io_kiocb *req, long res,
                                   unsigned int cflags)
   {
-       io_clean_op(req);
+       if (io_req_needs_clean(req))
+               io_clean_op(req);
         req->result = res;
         req->compl.cflags = cflags;
         req->flags |= REQ_F_COMPLETE_INLINE;
@@@ -1584,34 -1634,50 +1634,50 @@@ static inline void io_req_complete(stru
         __io_req_complete(req, 0, res, 0);
   }
   
+ static void io_req_complete_failed(struct io_kiocb *req, long res)
+ {
+       req_set_fail_links(req);
+       io_put_req(req);
+       io_req_complete_post(req, res, 0);
+ }
+ 
+ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
+                                       struct io_comp_state *cs)
+ {
+       spin_lock_irq(&ctx->completion_lock);
+       list_splice_init(&cs->locked_free_list, &cs->free_list);
+       cs->locked_free_nr = 0;
+       spin_unlock_irq(&ctx->completion_lock);
+ }
+ 
+ /* Returns true IFF there are requests in the cache */
   static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
   {
         struct io_submit_state *state = &ctx->submit_state;
         struct io_comp_state *cs = &state->comp;
-       struct io_kiocb *req = NULL;
+       int nr;
   
         /*
          * If we have more than a batch's worth of requests in our IRQ side
          * locked cache, grab the lock and move them over to our submission
          * side cache.
          */
-       if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
-               spin_lock_irq(&ctx->completion_lock);
-               list_splice_init(&cs->locked_free_list, &cs->free_list);
-               cs->locked_free_nr = 0;
-               spin_unlock_irq(&ctx->completion_lock);
-       }
+       if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH)
+               io_flush_cached_locked_reqs(ctx, cs);
   
+       nr = state->free_reqs;
         while (!list_empty(&cs->free_list)) {
-               req = list_first_entry(&cs->free_list, struct io_kiocb,
-                                       compl.list);
+               struct io_kiocb *req = list_first_entry(&cs->free_list,
+                                               struct io_kiocb, compl.list);
+ 
                 list_del(&req->compl.list);
-               state->reqs[state->free_reqs++] = req;
-               if (state->free_reqs == ARRAY_SIZE(state->reqs))
+               state->reqs[nr++] = req;
+               if (nr == ARRAY_SIZE(state->reqs))
                         break;
         }
   
-       return req != NULL;
+       state->free_reqs = nr;
+       return nr != 0;
   }
   
   static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
@@@ -1647,37 -1713,28 +1713,28 @@@ got_req
         return state->reqs[state->free_reqs];
   }
   
- static inline void io_put_file(struct io_kiocb *req, struct file *file,
-                         bool fixed)
+ static inline void io_put_file(struct file *file)
   {
-       if (!fixed)
+       if (file)
                 fput(file);
   }
   
   static void io_dismantle_req(struct io_kiocb *req)
   {
-       io_clean_op(req);
+       unsigned int flags = req->flags;
   
-       if (req->async_data)
-               kfree(req->async_data);
-       if (req->file)
-               io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
+       if (io_req_needs_clean(req))
+               io_clean_op(req);
+       if (!(flags & REQ_F_FIXED_FILE))
+               io_put_file(req->file);
         if (req->fixed_rsrc_refs)
                 percpu_ref_put(req->fixed_rsrc_refs);
+       if (req->async_data)
+               kfree(req->async_data);
         if (req->work.creds) {
                 put_cred(req->work.creds);
                 req->work.creds = NULL;
         }
- 
-       if (req->flags & REQ_F_INFLIGHT) {
-               struct io_ring_ctx *ctx = req->ctx;
-               unsigned long flags;
- 
-               spin_lock_irqsave(&ctx->inflight_lock, flags);
-               list_del(&req->inflight_entry);
-               spin_unlock_irqrestore(&ctx->inflight_lock, flags);
-               req->flags &= ~REQ_F_INFLIGHT;
-       }
   }
   
   /* must to be called somewhat shortly after putting a request */
@@@ -1714,7 -1771,6 +1771,6 @@@ static bool io_kill_linked_timeout(stru
         __must_hold(&req->ctx->completion_lock)
   {
         struct io_kiocb *link = req->link;
-       bool cancelled = false;
   
         /*
          * Can happen if a linked timeout fired and link had been like
@@@ -1722,19 -1778,17 +1778,17 @@@
          */
         if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
                 struct io_timeout_data *io = link->async_data;
-               int ret;
   
                 io_remove_next_linked(req);
                 link->timeout.head = NULL;
-               ret = hrtimer_try_to_cancel(&io->timer);
-               if (ret != -1) {
-                       io_cqring_fill_event(link, -ECANCELED);
+               if (hrtimer_try_to_cancel(&io->timer) != -1) {
+                       io_cqring_fill_event(link->ctx, link->user_data,
+                                            -ECANCELED, 0);
                         io_put_req_deferred(link, 1);
-                       cancelled = true;
+                       return true;
                 }
         }
-       req->flags &= ~REQ_F_LINK_TIMEOUT;
-       return cancelled;
+       return false;
   }
   
   static void io_fail_links(struct io_kiocb *req)
@@@ -1748,7 -1802,7 +1802,7 @@@
                 link->link = NULL;
   
                 trace_io_uring_fail_link(req, link);
-               io_cqring_fill_event(link, -ECANCELED);
+               io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
                 io_put_req_deferred(link, 2);
                 link = nxt;
         }
@@@ -1761,7 -1815,8 +1815,8 @@@ static bool io_disarm_next(struct io_ki
   
         if (likely(req->flags & REQ_F_LINK_TIMEOUT))
                 posted = io_kill_linked_timeout(req);
-       if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+       if (unlikely((req->flags & REQ_F_FAIL_LINK) &&
+                    !(req->flags & REQ_F_HARDLINK))) {
                 posted |= (req->link != NULL);
                 io_fail_links(req);
         }
@@@ -1859,13 -1914,17 +1914,17 @@@ static void tctx_task_work(struct callb
                 cond_resched();
   }
   
- static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
-                           enum task_work_notify_mode notify)
+ static int io_req_task_work_add(struct io_kiocb *req)
   {
+       struct task_struct *tsk = req->task;
         struct io_uring_task *tctx = tsk->io_uring;
+       enum task_work_notify_mode notify;
         struct io_wq_work_node *node, *prev;
         unsigned long flags;
-       int ret;
+       int ret = 0;
+ 
+       if (unlikely(tsk->flags & PF_EXITING))
+               return -ESRCH;
   
         WARN_ON_ONCE(!tctx);
   
@@@ -1878,14 -1937,23 +1937,23 @@@
             test_and_set_bit(0, &tctx->task_state))
                 return 0;
   
-       if (!task_work_add(tsk, &tctx->task_work, notify))
+       /*
+        * SQPOLL kernel thread doesn't need notification, just a wakeup. For
+        * all other cases, use TWA_SIGNAL unconditionally to ensure we're
+        * processing task_work. There's no reliable way to tell if TWA_RESUME
+        * will do the job.
+        */
+       notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
+ 
+       if (!task_work_add(tsk, &tctx->task_work, notify)) {
+               wake_up_process(tsk);
                 return 0;
+       }
   
         /*
          * Slow path - we failed, find and delete work. if the work is not
          * in the list, it got run and we're fine.
          */
-       ret = 0;
         spin_lock_irqsave(&tctx->task_lock, flags);
         wq_list_for_each(node, prev, &tctx->task_list) {
                 if (&req->io_task_work.node == node) {
@@@ -1899,33 -1967,6 +1967,6 @@@
         return ret;
   }
   
- static int io_req_task_work_add(struct io_kiocb *req)
- {
-       struct task_struct *tsk = req->task;
-       struct io_ring_ctx *ctx = req->ctx;
-       enum task_work_notify_mode notify;
-       int ret;
- 
-       if (tsk->flags & PF_EXITING)
-               return -ESRCH;
- 
-       /*
-        * SQPOLL kernel thread doesn't need notification, just a wakeup. For
-        * all other cases, use TWA_SIGNAL unconditionally to ensure we're
-        * processing task_work. There's no reliable way to tell if TWA_RESUME
-        * will do the job.
-        */
-       notify = TWA_NONE;
-       if (!(ctx->flags & IORING_SETUP_SQPOLL))
-               notify = TWA_SIGNAL;
- 
-       ret = io_task_work_add(tsk, req, notify);
-       if (!ret)
-               wake_up_process(tsk);
- 
-       return ret;
- }
- 
   static bool io_run_task_work_head(struct callback_head **work_head)
   {
         struct callback_head *work, *next;
@@@ -1966,29 -2007,15 +2007,15 @@@ static void io_req_task_work_add_fallba
         io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work);
   }
   
- static void __io_req_task_cancel(struct io_kiocb *req, int error)
- {
-       struct io_ring_ctx *ctx = req->ctx;
- 
-       spin_lock_irq(&ctx->completion_lock);
-       io_cqring_fill_event(req, error);
-       io_commit_cqring(ctx);
-       spin_unlock_irq(&ctx->completion_lock);
- 
-       io_cqring_ev_posted(ctx);
-       req_set_fail_links(req);
-       io_double_put_req(req);
- }
- 
   static void io_req_task_cancel(struct callback_head *cb)
   {
         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
         struct io_ring_ctx *ctx = req->ctx;
   
+       /* ctx is guaranteed to stay alive while we hold uring_lock */
         mutex_lock(&ctx->uring_lock);
-       __io_req_task_cancel(req, req->result);
+       io_req_complete_failed(req, req->result);
         mutex_unlock(&ctx->uring_lock);
-       percpu_ref_put(&ctx->refs);
   }
   
   static void __io_req_task_submit(struct io_kiocb *req)
@@@ -2000,7 -2027,7 +2027,7 @@@
         if (!(current->flags & PF_EXITING) && !current->in_execve)
                 __io_queue_sqe(req);
         else
-               __io_req_task_cancel(req, -EFAULT);
+               io_req_complete_failed(req, -EFAULT);
         mutex_unlock(&ctx->uring_lock);
   }
   
@@@ -2011,27 -2038,21 +2038,21 @@@ static void io_req_task_submit(struct c
         __io_req_task_submit(req);
   }
   
- static void io_req_task_queue(struct io_kiocb *req)
+ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
   {
-       int ret;
+       req->result = ret;
+       req->task_work.func = io_req_task_cancel;
   
-       req->task_work.func = io_req_task_submit;
-       ret = io_req_task_work_add(req);
-       if (unlikely(ret)) {
-               req->result = -ECANCELED;
-               percpu_ref_get(&req->ctx->refs);
+       if (unlikely(io_req_task_work_add(req)))
                 io_req_task_work_add_fallback(req, io_req_task_cancel);
-       }
   }
   
- static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
+ static void io_req_task_queue(struct io_kiocb *req)
   {
-       percpu_ref_get(&req->ctx->refs);
-       req->result = ret;
-       req->task_work.func = io_req_task_cancel;
+       req->task_work.func = io_req_task_submit;
   
         if (unlikely(io_req_task_work_add(req)))
-               io_req_task_work_add_fallback(req, io_req_task_cancel);
+               io_req_task_queue_fail(req, -ECANCELED);
   }
   
   static inline void io_queue_next(struct io_kiocb *req)
@@@ -2074,6 -2095,7 +2095,7 @@@ static void io_req_free_batch(struct re
                               struct io_submit_state *state)
   {
         io_queue_next(req);
+       io_dismantle_req(req);
   
         if (req->task != rb->task) {
                 if (rb->task)
@@@ -2084,7 -2106,6 +2106,6 @@@
         rb->task_refs++;
         rb->ctx_refs++;
   
-       io_dismantle_req(req);
         if (state->free_reqs != ARRAY_SIZE(state->reqs))
                 state->reqs[state->free_reqs++] = req;
         else
@@@ -2102,7 -2123,8 +2123,8 @@@ static void io_submit_flush_completions
         spin_lock_irq(&ctx->completion_lock);
         for (i = 0; i < nr; i++) {
                 req = cs->reqs[i];
-               __io_cqring_fill_event(req, req->result, req->compl.cflags);
+               __io_cqring_fill_event(ctx, req->user_data, req->result,
+                                       req->compl.cflags);
         }
         io_commit_cqring(ctx);
         spin_unlock_irq(&ctx->completion_lock);
@@@ -2112,7 -2134,7 +2134,7 @@@
                 req = cs->reqs[i];
   
                 /* submission and completion refs */
-               if (refcount_sub_and_test(2, &req->refs))
+               if (req_ref_sub_and_test(req, 2))
                         io_req_free_batch(&rb, req, &ctx->submit_state);
         }
   
@@@ -2124,20 -2146,20 +2146,20 @@@
    * Drop reference to request, return next in chain (if there is one) if this
    * was the last reference to this request.
    */
- static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
+ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
   {
         struct io_kiocb *nxt = NULL;
   
-       if (refcount_dec_and_test(&req->refs)) {
+       if (req_ref_put_and_test(req)) {
                 nxt = io_req_find_next(req);
                 __io_free_req(req);
         }
         return nxt;
   }
   
- static void io_put_req(struct io_kiocb *req)
+ static inline void io_put_req(struct io_kiocb *req)
   {
-       if (refcount_dec_and_test(&req->refs))
+       if (req_ref_put_and_test(req))
                 io_free_req(req);
   }
   
@@@ -2150,27 -2172,17 +2172,17 @@@ static void io_put_req_deferred_cb(stru
   
   static void io_free_req_deferred(struct io_kiocb *req)
   {
-       int ret;
- 
         req->task_work.func = io_put_req_deferred_cb;
-       ret = io_req_task_work_add(req);
-       if (unlikely(ret))
+       if (unlikely(io_req_task_work_add(req)))
                 io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
   }
   
   static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
   {
-       if (refcount_sub_and_test(refs, &req->refs))
+       if (req_ref_sub_and_test(req, refs))
                 io_free_req_deferred(req);
   }
   
- static void io_double_put_req(struct io_kiocb *req)
- {
-       /* drop both submit and complete references */
-       if (refcount_sub_and_test(2, &req->refs))
-               io_free_req(req);
- }
- 
   static unsigned io_cqring_events(struct io_ring_ctx *ctx)
   {
         /* See comment at the top of this file */
@@@ -2241,19 -2253,21 +2253,21 @@@ static void io_iopoll_complete(struct i
                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
                 list_del(&req->inflight_entry);
   
-               if (READ_ONCE(req->result) == -EAGAIN) {
+               if (READ_ONCE(req->result) == -EAGAIN &&
+                   !(req->flags & REQ_F_DONT_REISSUE)) {
                         req->iopoll_completed = 0;
-                       if (io_rw_reissue(req))
-                               continue;
+                       req_ref_get(req);
+                       io_queue_async_work(req);
+                       continue;
                 }
   
                 if (req->flags & REQ_F_BUFFER_SELECTED)
                         cflags = io_put_rw_kbuf(req);
   
-               __io_cqring_fill_event(req, req->result, cflags);
+               __io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
                 (*nr_events)++;
   
-               if (refcount_dec_and_test(&req->refs))
+               if (req_ref_put_and_test(req))
                         io_req_free_batch(&rb, req, &ctx->submit_state);
         }
   
@@@ -2311,27 -2325,6 +2325,6 @@@ static int io_do_iopoll(struct io_ring_
         return ret;
   }
   
- /*
-  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
-  * non-spinning poll check - we'll still enter the driver poll loop, but only
-  * as a non-spinning completion check.
-  */
- static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
-                               long min)
- {
-       while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
-               int ret;
- 
-               ret = io_do_iopoll(ctx, nr_events, min);
-               if (ret < 0)
-                       return ret;
-               if (*nr_events >= min)
-                       return 0;
-       }
- 
-       return 1;
- }
- 
   /*
    * We can't just wait for polled events to come to us, we have to actively
    * find and complete them.
@@@ -2367,7 -2360,7 +2360,7 @@@ static void io_iopoll_try_reap_events(s
   static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
   {
         unsigned int nr_events = 0;
-       int iters = 0, ret = 0;
+       int ret = 0;
   
         /*
          * We disallow the app entering submit/complete with polling, but we
@@@ -2375,17 -2368,16 +2368,16 @@@
          * that got punted to a workqueue.
          */
         mutex_lock(&ctx->uring_lock);
+       /*
+        * Don't enter poll loop if we already have events pending.
+        * If we do, we can potentially be spinning for commands that
+        * already triggered a CQE (eg in error).
+        */
+       if (test_bit(0, &ctx->cq_check_overflow))
+               __io_cqring_overflow_flush(ctx, false);
+       if (io_cqring_events(ctx))
+               goto out;
         do {
-               /*
-                * Don't enter poll loop if we already have events pending.
-                * If we do, we can potentially be spinning for commands that
-                * already triggered a CQE (eg in error).
-                */
-               if (test_bit(0, &ctx->cq_check_overflow))
-                       __io_cqring_overflow_flush(ctx, false, NULL, NULL);
-               if (io_cqring_events(ctx))
-                       break;
- 
                 /*
                  * If a submit got punted to a workqueue, we can have the
                  * application entering polling for a command before it gets
@@@ -2396,18 -2388,17 +2388,17 @@@
                  * forever, while the workqueue is stuck trying to acquire the
                  * very same mutex.
                  */
-               if (!(++iters & 7)) {
+               if (list_empty(&ctx->iopoll_list)) {
                         mutex_unlock(&ctx->uring_lock);
                         io_run_task_work();
                         mutex_lock(&ctx->uring_lock);
-               }
- 
-               ret = io_iopoll_getevents(ctx, &nr_events, min);
-               if (ret <= 0)
-                       break;
-               ret = 0;
-       } while (min && !nr_events && !need_resched());
   
+                       if (list_empty(&ctx->iopoll_list))
+                               break;
+               }
+               ret = io_do_iopoll(ctx, &nr_events, min);
+       } while (!ret && nr_events < min && !need_resched());
+ out:
         mutex_unlock(&ctx->uring_lock);
         return ret;
   }
@@@ -2419,46 -2410,24 +2410,24 @@@ static void kiocb_end_write(struct io_k
          * thread.
          */
         if (req->flags & REQ_F_ISREG) {
-               struct inode *inode = file_inode(req->file);
+               struct super_block *sb = file_inode(req->file)->i_sb;
   
-               __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+               __sb_writers_acquired(sb, SB_FREEZE_WRITE);
+               sb_end_write(sb);
         }
-       file_end_write(req->file);
   }
   
   #ifdef CONFIG_BLOCK
   static bool io_resubmit_prep(struct io_kiocb *req)
   {
-       struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
-       int rw, ret;
-       struct iov_iter iter;
+       struct io_async_rw *rw = req->async_data;
   
-       /* already prepared */
-       if (req->async_data)
-               return true;
- 
-       switch (req->opcode) {
-       case IORING_OP_READV:
-       case IORING_OP_READ_FIXED:
-       case IORING_OP_READ:
-               rw = READ;
-               break;
-       case IORING_OP_WRITEV:
-       case IORING_OP_WRITE_FIXED:
-       case IORING_OP_WRITE:
-               rw = WRITE;
-               break;
-       default:
-               printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
-                               req->opcode);
-               return false;
-       }
- 
-       ret = io_import_iovec(rw, req, &iovec, &iter, false);
-       if (ret < 0)
-               return false;
-       return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
- }
+       if (!rw)
+               return !io_req_prep_async(req);
+       /* may have left rw->iter inconsistent on -EIOCBQUEUED */
+       iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
+       return true;
+ }
   
   static bool io_rw_should_reissue(struct io_kiocb *req)
   {
@@@ -2480,29 -2449,15 +2449,15 @@@
         return true;
   }
   #else
- static bool io_rw_should_reissue(struct io_kiocb *req)
+ static bool io_resubmit_prep(struct io_kiocb *req)
   {
         return false;
   }
- #endif
- 
- static bool io_rw_reissue(struct io_kiocb *req)
+ static bool io_rw_should_reissue(struct io_kiocb *req)
   {
- #ifdef CONFIG_BLOCK
-       if (!io_rw_should_reissue(req))
-               return false;
- 
-       lockdep_assert_held(&req->ctx->uring_lock);
- 
-       if (io_resubmit_prep(req)) {
-               refcount_inc(&req->refs);
-               io_queue_async_work(req);
-               return true;
-       }
-       req_set_fail_links(req);
- #endif
         return false;
   }
+ #endif
   
   static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
                              unsigned int issue_flags)
@@@ -2511,12 -2466,14 +2466,14 @@@
   
         if (req->rw.kiocb.ki_flags & IOCB_WRITE)
                 kiocb_end_write(req);
-       if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) {
-               req->flags |= REQ_F_REISSUE;
-               return;
-       }
-       if (res != req->result)
+       if (res != req->result) {
+               if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
+                   io_rw_should_reissue(req)) {
+                       req->flags |= REQ_F_REISSUE;
+                       return;
+               }
                 req_set_fail_links(req);
+       }
         if (req->flags & REQ_F_BUFFER_SELECTED)
                 cflags = io_put_rw_kbuf(req);
         __io_req_complete(req, issue_flags, res, cflags);
@@@ -2533,27 -2490,18 +2490,18 @@@ static void io_complete_rw_iopoll(struc
   {
         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
   
- #ifdef CONFIG_BLOCK
-       /* Rewind iter, if we have one. iopoll path resubmits as usual */
-       if (res == -EAGAIN && io_rw_should_reissue(req)) {
-               struct io_async_rw *rw = req->async_data;
- 
-               if (rw)
-                       iov_iter_revert(&rw->iter,
-                                       req->result - iov_iter_count(&rw->iter));
-               else if (!io_resubmit_prep(req))
-                       res = -EIO;
-       }
- #endif
- 
         if (kiocb->ki_flags & IOCB_WRITE)
                 kiocb_end_write(req);
- 
-       if (res != -EAGAIN && res != req->result)
-               req_set_fail_links(req);
+       if (unlikely(res != req->result)) {
+               if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
+                   io_resubmit_prep(req))) {
+                       req_set_fail_links(req);
+                       req->flags |= REQ_F_DONT_REISSUE;
+               }
+       }
   
         WRITE_ONCE(req->result, res);
-       /* order with io_poll_complete() checking ->result */
+       /* order with io_iopoll_complete() checking ->result */
         smp_wmb();
         WRITE_ONCE(req->iopoll_completed, 1);
   }
@@@ -2561,7 -2509,7 +2509,7 @@@
   /*
    * After the iocb has been issued, it's safe to be found on the poll list.
    * Adding the kiocb to the list AFTER submission ensures that we don't
-  * find it from a io_iopoll_getevents() thread before the issuer is done
+  * find it from a io_do_iopoll() thread before the issuer is done
    * accessing the kiocb cookie.
    */
   static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
@@@ -2647,7 -2595,7 +2595,7 @@@ static bool io_bdev_nowait(struct block
    * any file. For now, just ensure that anything potentially problematic is done
    * inline.
    */
- static bool io_file_supports_async(struct file *file, int rw)
+ static bool __io_file_supports_async(struct file *file, int rw)
   {
         umode_t mode = file_inode(file)->i_mode;
   
@@@ -2680,6 -2628,16 +2628,16 @@@
         return file->f_op->write_iter != NULL;
   }
   
+ static bool io_file_supports_async(struct io_kiocb *req, int rw)
+ {
+       if (rw == READ && (req->flags & REQ_F_ASYNC_READ))
+               return true;
+       else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE))
+               return true;
+ 
+       return __io_file_supports_async(req->file, rw);
+ }
+ 
   static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   {
         struct io_ring_ctx *ctx = req->ctx;
@@@ -2688,7 -2646,7 +2646,7 @@@
         unsigned ioprio;
         int ret;
   
-       if (S_ISREG(file_inode(file)->i_mode))
+       if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode))
                 req->flags |= REQ_F_ISREG;
   
         kiocb->ki_pos = READ_ONCE(sqe->off);
@@@ -2730,6 -2688,12 +2688,12 @@@
                 kiocb->ki_complete = io_complete_rw;
         }
   
+       if (req->opcode == IORING_OP_READ_FIXED ||
+           req->opcode == IORING_OP_WRITE_FIXED) {
+               req->imu = NULL;
+               io_req_set_rsrc_node(req);
+       }
+ 
         req->rw.addr = READ_ONCE(sqe->addr);
         req->rw.len = READ_ONCE(sqe->len);
         req->buf_index = READ_ONCE(sqe->buf_index);
@@@ -2781,7 -2745,10 +2745,10 @@@ static void kiocb_done(struct kiocb *ki
   
         if (check_reissue && req->flags & REQ_F_REISSUE) {
                 req->flags &= ~REQ_F_REISSUE;
-               if (!io_rw_reissue(req)) {
+               if (io_resubmit_prep(req)) {
+                       req_ref_get(req);
+                       io_queue_async_work(req);
+               } else {
                         int cflags = 0;
   
                         req_set_fail_links(req);
@@@ -2792,26 -2759,17 +2759,17 @@@
         }
   }
   
- static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
+                            struct io_mapped_ubuf *imu)
   {
-       struct io_ring_ctx *ctx = req->ctx;
         size_t len = req->rw.len;
-       struct io_mapped_ubuf *imu;
-       u16 index, buf_index = req->buf_index;
+       u64 buf_end, buf_addr = req->rw.addr;
         size_t offset;
-       u64 buf_addr;
- 
-       if (unlikely(buf_index >= ctx->nr_user_bufs))
-               return -EFAULT;
-       index = array_index_nospec(buf_index, ctx->nr_user_bufs);
-       imu = &ctx->user_bufs[index];
-       buf_addr = req->rw.addr;
   
-       /* overflow */
-       if (buf_addr + len < buf_addr)
+       if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
                 return -EFAULT;
         /* not inside the mapped region */
-       if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+       if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
                 return -EFAULT;
   
         /*
@@@ -2859,6 -2817,22 +2817,22 @@@
         return 0;
   }
   
+ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+ {
+       struct io_ring_ctx *ctx = req->ctx;
+       struct io_mapped_ubuf *imu = req->imu;
+       u16 index, buf_index = req->buf_index;
+ 
+       if (likely(!imu)) {
+               if (unlikely(buf_index >= ctx->nr_user_bufs))
+                       return -EFAULT;
+               index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+               imu = READ_ONCE(ctx->user_bufs[index]);
+               req->imu = imu;
+       }
+       return __io_import_fixed(req, rw, iter, imu);
+ }
+ 
   static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
   {
         if (needs_lock)
@@@ -3126,29 -3100,21 +3100,21 @@@ static void io_req_map_rw(struct io_kio
         }
   }
   
- static inline int __io_alloc_async_data(struct io_kiocb *req)
+ static inline int io_alloc_async_data(struct io_kiocb *req)
   {
         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
         return req->async_data == NULL;
   }
   
- static int io_alloc_async_data(struct io_kiocb *req)
- {
-       if (!io_op_defs[req->opcode].needs_async_data)
-               return 0;
- 
-       return  __io_alloc_async_data(req);
- }
- 
   static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
                              const struct iovec *fast_iov,
                              struct iov_iter *iter, bool force)
   {
-       if (!force && !io_op_defs[req->opcode].needs_async_data)
+       if (!force && !io_op_defs[req->opcode].needs_async_setup)
                 return 0;
         if (!req->async_data) {
-               if (__io_alloc_async_data(req)) {
+               if (io_alloc_async_data(req)) {
                         kfree(iovec);
                         return -ENOMEM;
                 }
@@@ -3208,7 -3174,7 +3174,7 @@@ static int io_async_buf_func(struct wai
         list_del_init(&wait->entry);
   
         /* submit ref gets dropped, acquire a new one */
-       refcount_inc(&req->refs);
+       req_ref_get(req);
         io_req_task_queue(req);
         return 1;
   }
@@@ -3293,7 -3259,7 +3259,7 @@@ static int io_read(struct io_kiocb *req
                 kiocb->ki_flags |= IOCB_NOWAIT;
   
         /* If the file doesn't support async, just async punt */
-       if (force_nonblock && !io_file_supports_async(req->file, READ)) {
+       if (force_nonblock && !io_file_supports_async(req, READ)) {
                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
                 return ret ?: -EAGAIN;
         }
@@@ -3398,7 -3364,7 +3364,7 @@@ static int io_write(struct io_kiocb *re
                 kiocb->ki_flags |= IOCB_NOWAIT;
   
         /* If the file doesn't support async, just async punt */
-       if (force_nonblock && !io_file_supports_async(req->file, WRITE))
+       if (force_nonblock && !io_file_supports_async(req, WRITE))
                 goto copy_iov;
   
         /* file path doesn't support NOWAIT for non-direct_IO */
@@@ -3617,15 -3583,6 +3583,6 @@@ static int __io_splice_prep(struct io_k
         if (!sp->file_in)
                 return -EBADF;
         req->flags |= REQ_F_NEED_CLEANUP;
- 
-       if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
-               /*
-                * Splice operation will be punted aync, and here need to
-                * modify io_wq_work.flags, so initialize io_wq_work firstly.
-                */
-               req->work.flags |= IO_WQ_WORK_UNBOUND;
-       }
- 
         return 0;
   }
   
@@@ -3650,7 -3607,8 +3607,8 @@@ static int io_tee(struct io_kiocb *req
         if (sp->len)
                 ret = do_tee(in, out, sp->len, flags);
   
-       io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+       if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
+               io_put_file(in);
         req->flags &= ~REQ_F_NEED_CLEANUP;
   
         if (ret != sp->len)
@@@ -3686,7 -3644,8 +3644,8 @@@ static int io_splice(struct io_kiocb *r
         if (sp->len)
                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
   
-       io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+       if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
+               io_put_file(in);
         req->flags &= ~REQ_F_NEED_CLEANUP;
   
         if (ret != sp->len)
@@@ -3892,7 -3851,7 +3851,7 @@@ err
         req->flags &= ~REQ_F_NEED_CLEANUP;
         if (ret < 0)
                 req_set_fail_links(req);
-       io_req_complete(req, ret);
+       __io_req_complete(req, issue_flags, ret, 0);
         return 0;
   }
   
@@@ -3965,21 -3924,16 +3924,16 @@@ static int io_remove_buffers(struct io_
         if (ret < 0)
                 req_set_fail_links(req);
   
-       /* need to hold the lock to complete IOPOLL requests */
-       if (ctx->flags & IORING_SETUP_IOPOLL) {
-               __io_req_complete(req, issue_flags, ret, 0);
-               io_ring_submit_unlock(ctx, !force_nonblock);
-       } else {
-               io_ring_submit_unlock(ctx, !force_nonblock);
-               __io_req_complete(req, issue_flags, ret, 0);
-       }
+       /* complete before unlock, IOPOLL may need the lock */
+       __io_req_complete(req, issue_flags, ret, 0);
+       io_ring_submit_unlock(ctx, !force_nonblock);
         return 0;
   }
   
   static int io_provide_buffers_prep(struct io_kiocb *req,
                                    const struct io_uring_sqe *sqe)
   {
-       unsigned long size;
+       unsigned long size, tmp_check;
         struct io_provide_buf *p = &req->pbuf;
         u64 tmp;
   
@@@ -3993,6 -3947,12 +3947,12 @@@
         p->addr = READ_ONCE(sqe->addr);
         p->len = READ_ONCE(sqe->len);
   
+       if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
+                               &size))
+               return -EOVERFLOW;
+       if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
+               return -EOVERFLOW;
+ 
         size = (unsigned long)p->len * p->nbufs;
         if (!access_ok(u64_to_user_ptr(p->addr), size))
                 return -EFAULT;
@@@ -4054,15 -4014,9 +4014,9 @@@ static int io_provide_buffers(struct io
         }
         if (ret < 0)
                 req_set_fail_links(req);
- 
-       /* need to hold the lock to complete IOPOLL requests */
-       if (ctx->flags & IORING_SETUP_IOPOLL) {
-               __io_req_complete(req, issue_flags, ret, 0);
-               io_ring_submit_unlock(ctx, !force_nonblock);
-       } else {
-               io_ring_submit_unlock(ctx, !force_nonblock);
-               __io_req_complete(req, issue_flags, ret, 0);
-       }
+       /* complete before unlock, IOPOLL may need the lock */
+       __io_req_complete(req, issue_flags, ret, 0);
+       io_ring_submit_unlock(ctx, !force_nonblock);
         return 0;
   }
   
@@@ -4181,7 -4135,7 +4135,7 @@@ static int io_fadvise(struct io_kiocb *
         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
         if (ret < 0)
                 req_set_fail_links(req);
-       io_req_complete(req, ret);
+       __io_req_complete(req, issue_flags, ret, 0);
         return 0;
   }
   
@@@ -4208,12 -4162,8 +4162,8 @@@ static int io_statx(struct io_kiocb *re
         struct io_statx *ctx = &req->statx;
         int ret;
   
-       if (issue_flags & IO_URING_F_NONBLOCK) {
-               /* only need file table for an actual valid fd */
-               if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
-                       req->flags |= REQ_F_NO_FILE_TABLE;
+       if (issue_flags & IO_URING_F_NONBLOCK)
                 return -EAGAIN;
-       }
   
         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
                        ctx->buffer);
@@@ -4243,11 -4193,9 +4193,9 @@@ static int io_close(struct io_kiocb *re
         struct files_struct *files = current->files;
         struct io_close *close = &req->close;
         struct fdtable *fdt;
-       struct file *file;
-       int ret;
+       struct file *file = NULL;
+       int ret = -EBADF;
   
-       file = NULL;
-       ret = -EBADF;
         spin_lock(&files->file_lock);
         fdt = files_fdtable(files);
         if (close->fd >= fdt->max_fds) {
@@@ -4255,12 -4203,7 +4203,7 @@@
                 goto err;
         }
         file = fdt->fd[close->fd];
-       if (!file) {
-               spin_unlock(&files->file_lock);
-               goto err;
-       }
- 
-       if (file->f_op == &io_uring_fops) {
+       if (!file || file->f_op == &io_uring_fops) {
                 spin_unlock(&files->file_lock);
                 file = NULL;
                 goto err;
@@@ -4358,8 -4301,6 +4301,6 @@@ static int io_sendmsg_prep_async(struc
   {
         int ret;
   
-       if (!io_op_defs[req->opcode].needs_async_data)
-               return 0;
         ret = io_sendmsg_copy_hdr(req, req->async_data);
         if (!ret)
                 req->flags |= REQ_F_NEED_CLEANUP;
@@@ -4373,9 -4314,11 +4314,11 @@@ static int io_sendmsg_prep(struct io_ki
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
   
-       sr->msg_flags = READ_ONCE(sqe->msg_flags);
         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
         sr->len = READ_ONCE(sqe->len);
+       sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+       if (sr->msg_flags & MSG_DONTWAIT)
+               req->flags |= REQ_F_NOWAIT;
   
   #ifdef CONFIG_COMPAT
         if (req->ctx->compat)
@@@ -4404,12 -4347,9 +4347,9 @@@ static int io_sendmsg(struct io_kiocb *
                 kmsg = &iomsg;
         }
   
-       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
-       if (flags & MSG_DONTWAIT)
-               req->flags |= REQ_F_NOWAIT;
-       else if (issue_flags & IO_URING_F_NONBLOCK)
+       flags = req->sr_msg.msg_flags;
+       if (issue_flags & IO_URING_F_NONBLOCK)
                 flags |= MSG_DONTWAIT;
- 
         if (flags & MSG_WAITALL)
                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
   
@@@ -4452,12 -4392,9 +4392,9 @@@ static int io_send(struct io_kiocb *req
         msg.msg_controllen = 0;
         msg.msg_namelen = 0;
   
-       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
-       if (flags & MSG_DONTWAIT)
-               req->flags |= REQ_F_NOWAIT;
-       else if (issue_flags & IO_URING_F_NONBLOCK)
+       flags = req->sr_msg.msg_flags;
+       if (issue_flags & IO_URING_F_NONBLOCK)
                 flags |= MSG_DONTWAIT;
- 
         if (flags & MSG_WAITALL)
                 min_ret = iov_iter_count(&msg.msg_iter);
   
@@@ -4510,16 -4447,14 +4447,14 @@@ static int __io_recvmsg_copy_hdr(struc
   static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
                                         struct io_async_msghdr *iomsg)
   {
-       struct compat_msghdr __user *msg_compat;
         struct io_sr_msg *sr = &req->sr_msg;
         struct compat_iovec __user *uiov;
         compat_uptr_t ptr;
         compat_size_t len;
         int ret;
   
-       msg_compat = (struct compat_msghdr __user *) sr->umsg;
-       ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
-                                       &ptr, &len);
+       ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
+                                 &ptr, &len);
         if (ret)
                 return ret;
   
@@@ -4587,8 -4522,6 +4522,6 @@@ static int io_recvmsg_prep_async(struc
   {
         int ret;
   
-       if (!io_op_defs[req->opcode].needs_async_data)
-               return 0;
         ret = io_recvmsg_copy_hdr(req, req->async_data);
         if (!ret)
                 req->flags |= REQ_F_NEED_CLEANUP;
@@@ -4602,10 -4535,12 +4535,12 @@@ static int io_recvmsg_prep(struct io_ki
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
   
-       sr->msg_flags = READ_ONCE(sqe->msg_flags);
         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
         sr->len = READ_ONCE(sqe->len);
         sr->bgid = READ_ONCE(sqe->buf_group);
+       sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+       if (sr->msg_flags & MSG_DONTWAIT)
+               req->flags |= REQ_F_NOWAIT;
   
   #ifdef CONFIG_COMPAT
         if (req->ctx->compat)
@@@ -4646,12 -4581,9 +4581,9 @@@ static int io_recvmsg(struct io_kiocb *
                                 1, req->sr_msg.len);
         }
   
-       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
-       if (flags & MSG_DONTWAIT)
-               req->flags |= REQ_F_NOWAIT;
-       else if (force_nonblock)
+       flags = req->sr_msg.msg_flags;
+       if (force_nonblock)
                 flags |= MSG_DONTWAIT;
- 
         if (flags & MSG_WAITALL)
                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
   
@@@ -4709,12 -4641,9 +4641,9 @@@ static int io_recv(struct io_kiocb *req
         msg.msg_iocb = NULL;
         msg.msg_flags = 0;
   
-       flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
-       if (flags & MSG_DONTWAIT)
-               req->flags |= REQ_F_NOWAIT;
-       else if (force_nonblock)
+       flags = req->sr_msg.msg_flags;
+       if (force_nonblock)
                 flags |= MSG_DONTWAIT;
- 
         if (flags & MSG_WAITALL)
                 min_ret = iov_iter_count(&msg.msg_iter);
   
@@@ -4884,7 -4813,6 +4813,6 @@@ static int __io_async_wake(struct io_ki
   
         req->result = mask;
         req->task_work.func = func;
-       percpu_ref_get(&req->ctx->refs);
   
         /*
          * If this fails, then the task is exiting. When a task exits, the
@@@ -4936,6 -4864,7 +4864,7 @@@ static struct io_poll_iocb *io_poll_get
   }
   
   static void io_poll_remove_double(struct io_kiocb *req)
+       __must_hold(&req->ctx->completion_lock)
   {
         struct io_poll_iocb *poll = io_poll_get_double(req);
   
@@@ -4947,20 -4876,37 +4876,37 @@@
                 spin_lock(&head->lock);
                 list_del_init(&poll->wait.entry);
                 if (poll->wait.private)
-                       refcount_dec(&req->refs);
+                       req_ref_put(req);
                 poll->head = NULL;
                 spin_unlock(&head->lock);
         }
   }
   
- static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
+ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+       __must_hold(&req->ctx->completion_lock)
   {
         struct io_ring_ctx *ctx = req->ctx;
+       unsigned flags = IORING_CQE_F_MORE;
+       int error;
+ 
+       if (READ_ONCE(req->poll.canceled)) {
+               error = -ECANCELED;
+               req->poll.events |= EPOLLONESHOT;
+       } else {
+               error = mangle_poll(mask);
+       }
+       if (req->poll.events & EPOLLONESHOT)
+               flags = 0;
+       if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
+               io_poll_remove_waitqs(req);
+               req->poll.done = true;
+               flags = 0;
+       }
+       if (flags & IORING_CQE_F_MORE)
+               ctx->cq_extra++;
   
-       io_poll_remove_double(req);
-       req->poll.done = true;
-       io_cqring_fill_event(req, error ? error : mangle_poll(mask));
         io_commit_cqring(ctx);
+       return !(flags & IORING_CQE_F_MORE);
   }
   
   static void io_poll_task_func(struct callback_head *cb)
@@@ -4972,17 -4918,24 +4918,24 @@@
         if (io_poll_rewait(req, &req->poll)) {
                 spin_unlock_irq(&ctx->completion_lock);
         } else {
-               hash_del(&req->hash_node);
-               io_poll_complete(req, req->result, 0);
-               spin_unlock_irq(&ctx->completion_lock);
+               bool done;
   
-               nxt = io_put_req_find_next(req);
+               done = io_poll_complete(req, req->result);
+               if (done) {
+                       hash_del(&req->hash_node);
+               } else {
+                       req->result = 0;
+                       add_wait_queue(req->poll.head, &req->poll.wait);
+               }
+               spin_unlock_irq(&ctx->completion_lock);
                 io_cqring_ev_posted(ctx);
-               if (nxt)
-                       __io_req_task_submit(nxt);
-       }
   
-       percpu_ref_put(&ctx->refs);
+               if (done) {
+                       nxt = io_put_req_find_next(req);
+                       if (nxt)
+                               __io_req_task_submit(nxt);
+               }
+       }
   }
   
   static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
@@@ -4995,6 -4948,8 +4948,8 @@@
         /* for instances that support it check for an event match first: */
         if (mask && !(mask & poll->events))
                 return 0;
+       if (!(poll->events & EPOLLONESHOT))
+               return poll->wait.func(&poll->wait, mode, sync, key);
   
         list_del_init(&wait->entry);
   
@@@ -5013,7 -4968,7 +4968,7 @@@
                         poll->wait.func(&poll->wait, mode, sync, key);
                 }
         }
-       refcount_dec(&req->refs);
+       req_ref_put(req);
         return 1;
   }
   
@@@ -5023,7 -4978,9 +4978,9 @@@ static void io_init_poll_iocb(struct io
         poll->head = NULL;
         poll->done = false;
         poll->canceled = false;
-       poll->events = events;
+ #define IO_POLL_UNMASK        (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
+       /* mask in events that we always want/need */
+       poll->events = events | IO_POLL_UNMASK;
         INIT_LIST_HEAD(&poll->wait.entry);
         init_waitqueue_func_entry(&poll->wait, wake_func);
   }
@@@ -5047,6 -5004,12 +5004,12 @@@ static void __io_queue_proc(struct io_p
                         pt->error = -EINVAL;
                         return;
                 }
+               /*
+                * Can't handle multishot for double wait for now, turn it
+                * into one-shot mode.
+                */
+               if (!(req->poll.events & EPOLLONESHOT))
+                       req->poll.events |= EPOLLONESHOT;
                 /* double add on the same waitqueue head, ignore */
                 if (poll->head == head)
                         return;
@@@ -5056,7 -5019,7 +5019,7 @@@
                         return;
                 }
                 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
-               refcount_inc(&req->refs);
+               req_ref_get(req);
                 poll->wait.private = req;
                 *poll_ptr = poll;
         }
@@@ -5089,25 -5052,17 +5052,17 @@@ static void io_async_task_func(struct c
   
         if (io_poll_rewait(req, &apoll->poll)) {
                 spin_unlock_irq(&ctx->completion_lock);
-               percpu_ref_put(&ctx->refs);
                 return;
         }
   
-       /* If req is still hashed, it cannot have been canceled. Don't check. */
-       if (hash_hashed(&req->hash_node))
-               hash_del(&req->hash_node);
- 
+       hash_del(&req->hash_node);
         io_poll_remove_double(req);
         spin_unlock_irq(&ctx->completion_lock);
   
         if (!READ_ONCE(apoll->poll.canceled))
                 __io_req_task_submit(req);
         else
-               __io_req_task_cancel(req, -ECANCELED);
- 
-       percpu_ref_put(&ctx->refs);
-       kfree(apoll->double_poll);
-       kfree(apoll);
+               io_req_complete_failed(req, -ECANCELED);
   }
   
   static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@@ -5160,7 -5115,7 +5115,7 @@@ static __poll_t __io_arm_poll_handler(s
                         ipt->error = 0;
                         mask = 0;
                 }
-               if (mask || ipt->error)
+               if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
                         list_del_init(&poll->wait.entry);
                 else if (cancel)
                         WRITE_ONCE(poll->canceled, true);
@@@ -5192,7 -5147,7 +5147,7 @@@ static bool io_arm_poll_handler(struct 
         else
                 return false;
         /* if we can't nonblock try, then no point in arming a poll handler */
-       if (!io_file_supports_async(req->file, rw))
+       if (!io_file_supports_async(req, rw))
                 return false;
   
         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
@@@ -5203,7 -5158,7 +5158,7 @@@
         req->flags |= REQ_F_POLLED;
         req->apoll = apoll;
   
-       mask = 0;
+       mask = EPOLLONESHOT;
         if (def->pollin)
                 mask |= POLLIN | POLLRDNORM;
         if (def->pollout)
@@@ -5223,8 -5178,6 +5178,6 @@@
         if (ret || ipt.error) {
                 io_poll_remove_double(req);
                 spin_unlock_irq(&ctx->completion_lock);
-               kfree(apoll->double_poll);
-               kfree(apoll);
                 return false;
         }
         spin_unlock_irq(&ctx->completion_lock);
@@@ -5234,12 -5187,16 +5187,16 @@@
   }
   
   static bool __io_poll_remove_one(struct io_kiocb *req,
-                                struct io_poll_iocb *poll)
+                                struct io_poll_iocb *poll, bool do_cancel)
+       __must_hold(&req->ctx->completion_lock)
   {
         bool do_complete = false;
   
+       if (!poll->head)
+               return false;
         spin_lock(&poll->head->lock);
-       WRITE_ONCE(poll->canceled, true);
+       if (do_cancel)
+               WRITE_ONCE(poll->canceled, true);
         if (!list_empty(&poll->wait.entry)) {
                 list_del_init(&poll->wait.entry);
                 do_complete = true;
@@@ -5249,28 -5206,29 +5206,29 @@@
         return do_complete;
   }
   
- static bool io_poll_remove_one(struct io_kiocb *req)
+ static bool io_poll_remove_waitqs(struct io_kiocb *req)
+       __must_hold(&req->ctx->completion_lock)
   {
         bool do_complete;
   
         io_poll_remove_double(req);
+       do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
   
-       if (req->opcode == IORING_OP_POLL_ADD) {
-               do_complete = __io_poll_remove_one(req, &req->poll);
-       } else {
-               struct async_poll *apoll = req->apoll;
- 
+       if (req->opcode != IORING_OP_POLL_ADD && do_complete) {
                 /* non-poll requests have submit ref still */
-               do_complete = __io_poll_remove_one(req, &apoll->poll);
-               if (do_complete) {
-                       io_put_req(req);
-                       kfree(apoll->double_poll);
-                       kfree(apoll);
-               }
+               req_ref_put(req);
         }
+       return do_complete;
+ }
+ 
+ static bool io_poll_remove_one(struct io_kiocb *req)
+       __must_hold(&req->ctx->completion_lock)
+ {
+       bool do_complete;
   
+       do_complete = io_poll_remove_waitqs(req);
         if (do_complete) {
-               io_cqring_fill_event(req, -ECANCELED);
+               io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
                 io_commit_cqring(req->ctx);
                 req_set_fail_links(req);
                 io_put_req_deferred(req, 1);
@@@ -5307,7 -5265,9 +5265,9 @@@ static bool io_poll_remove_all(struct i
         return posted != 0;
   }
   
- static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
+                                    bool poll_only)
+       __must_hold(&ctx->completion_lock)
   {
         struct hlist_head *list;
         struct io_kiocb *req;
@@@ -5316,43 -5276,72 +5276,72 @@@
         hlist_for_each_entry(req, list, hash_node) {
                 if (sqe_addr != req->user_data)
                         continue;
-               if (io_poll_remove_one(req))
-                       return 0;
-               return -EALREADY;
+               if (poll_only && req->opcode != IORING_OP_POLL_ADD)
+                       continue;
+               return req;
         }
+       return NULL;
+ }
+ 
+ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
+                         bool poll_only)
+       __must_hold(&ctx->completion_lock)
+ {
+       struct io_kiocb *req;
+ 
+       req = io_poll_find(ctx, sqe_addr, poll_only);
+       if (!req)
+               return -ENOENT;
+       if (io_poll_remove_one(req))
+               return 0;
+ 
+       return -EALREADY;
+ }
+ 
+ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
+                                    unsigned int flags)
+ {
+       u32 events;
   
-       return -ENOENT;
+       events = READ_ONCE(sqe->poll32_events);
+ #ifdef __BIG_ENDIAN
+       events = swahw32(events);
+ #endif
+       if (!(flags & IORING_POLL_ADD_MULTI))
+               events |= EPOLLONESHOT;
+       return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
   }
   
- static int io_poll_remove_prep(struct io_kiocb *req,
+ static int io_poll_update_prep(struct io_kiocb *req,
                                const struct io_uring_sqe *sqe)
   {
+       struct io_poll_update *upd = &req->poll_update;
+       u32 flags;
+ 
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
-       if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
-           sqe->poll_events)
+       if (sqe->ioprio || sqe->buf_index)
+               return -EINVAL;
+       flags = READ_ONCE(sqe->len);
+       if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
+                     IORING_POLL_ADD_MULTI))
+               return -EINVAL;
+       /* meaningless without update */
+       if (flags == IORING_POLL_ADD_MULTI)
                 return -EINVAL;
   
-       req->poll_remove.addr = READ_ONCE(sqe->addr);
-       return 0;
- }
- 
- /*
-  * Find a running poll command that matches one specified in sqe->addr,
-  * and remove it if found.
-  */
- static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
- {
-       struct io_ring_ctx *ctx = req->ctx;
-       int ret;
+       upd->old_user_data = READ_ONCE(sqe->addr);
+       upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
+       upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
   
-       spin_lock_irq(&ctx->completion_lock);
-       ret = io_poll_cancel(ctx, req->poll_remove.addr);
-       spin_unlock_irq(&ctx->completion_lock);
+       upd->new_user_data = READ_ONCE(sqe->off);
+       if (!upd->update_user_data && upd->new_user_data)
+               return -EINVAL;
+       if (upd->update_events)
+               upd->events = io_poll_parse_events(sqe, flags);
+       else if (sqe->poll32_events)
+               return -EINVAL;
   
-       if (ret < 0)
-               req_set_fail_links(req);
-       io_req_complete(req, ret);
         return 0;
   }
   
@@@ -5376,19 -5365,17 +5365,17 @@@ static void io_poll_queue_proc(struct f
   static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   {
         struct io_poll_iocb *poll = &req->poll;
-       u32 events;
+       u32 flags;
   
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
-       if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+       if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
+               return -EINVAL;
+       flags = READ_ONCE(sqe->len);
+       if (flags & ~IORING_POLL_ADD_MULTI)
                 return -EINVAL;
   
-       events = READ_ONCE(sqe->poll32_events);
- #ifdef __BIG_ENDIAN
-       events = swahw32(events);
- #endif
-       poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
-                      (events & EPOLLEXCLUSIVE);
+       poll->events = io_poll_parse_events(sqe, flags);
         return 0;
   }
   
@@@ -5406,17 -5393,80 +5393,80 @@@ static int io_poll_add(struct io_kiocb 
   
         if (mask) { /* no async, we'd stolen it */
                 ipt.error = 0;
-               io_poll_complete(req, mask, 0);
+               io_poll_complete(req, mask);
         }
         spin_unlock_irq(&ctx->completion_lock);
   
         if (mask) {
                 io_cqring_ev_posted(ctx);
-               io_put_req(req);
+               if (poll->events & EPOLLONESHOT)
+                       io_put_req(req);
         }
         return ipt.error;
   }
   
+ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
+ {
+       struct io_ring_ctx *ctx = req->ctx;
+       struct io_kiocb *preq;
+       bool completing;
+       int ret;
+ 
+       spin_lock_irq(&ctx->completion_lock);
+       preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
+       if (!preq) {
+               ret = -ENOENT;
+               goto err;
+       }
+ 
+       if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
+               completing = true;
+               ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
+               goto err;
+       }
+ 
+       /*
+        * Don't allow racy completion with singleshot, as we cannot safely
+        * update those. For multishot, if we're racing with completion, just
+        * let completion re-add it.
+        */
+       completing = !__io_poll_remove_one(preq, &preq->poll, false);
+       if (completing && (preq->poll.events & EPOLLONESHOT)) {
+               ret = -EALREADY;
+               goto err;
+       }
+       /* we now have a detached poll request. reissue. */
+       ret = 0;
+ err:
+       if (ret < 0) {
+               spin_unlock_irq(&ctx->completion_lock);
+               req_set_fail_links(req);
+               io_req_complete(req, ret);
+               return 0;
+       }
+       /* only mask one event flags, keep behavior flags */
+       if (req->poll_update.update_events) {
+               preq->poll.events &= ~0xffff;
+               preq->poll.events |= req->poll_update.events & 0xffff;
+               preq->poll.events |= IO_POLL_UNMASK;
+       }
+       if (req->poll_update.update_user_data)
+               preq->user_data = req->poll_update.new_user_data;
+       spin_unlock_irq(&ctx->completion_lock);
+ 
+       /* complete update request, we're done with it */
+       io_req_complete(req, ret);
+ 
+       if (!completing) {
+               ret = io_poll_add(preq, issue_flags);
+               if (ret < 0) {
+                       req_set_fail_links(preq);
+                       io_req_complete(preq, ret);
+               }
+       }
+       return 0;
+ }
+ 
   static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
   {
         struct io_timeout_data *data = container_of(timer,
@@@ -5430,7 -5480,7 +5480,7 @@@
         atomic_set(&req->ctx->cq_timeouts,
                 atomic_read(&req->ctx->cq_timeouts) + 1);
   
-       io_cqring_fill_event(req, -ETIME);
+       io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
         io_commit_cqring(ctx);
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
   
@@@ -5442,30 -5492,29 +5492,29 @@@
   
   static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
                                            __u64 user_data)
+       __must_hold(&ctx->completion_lock)
   {
         struct io_timeout_data *io;
         struct io_kiocb *req;
-       int ret = -ENOENT;
+       bool found = false;
   
         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
-               if (user_data == req->user_data) {
-                       ret = 0;
+               found = user_data == req->user_data;
+               if (found)
                         break;
-               }
         }
- 
-       if (ret == -ENOENT)
-               return ERR_PTR(ret);
+       if (!found)
+               return ERR_PTR(-ENOENT);
   
         io = req->async_data;
-       ret = hrtimer_try_to_cancel(&io->timer);
-       if (ret == -1)
+       if (hrtimer_try_to_cancel(&io->timer) == -1)
                 return ERR_PTR(-EALREADY);
         list_del_init(&req->timeout.list);
         return req;
   }
   
   static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+       __must_hold(&ctx->completion_lock)
   {
         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
   
@@@ -5473,13 -5522,14 +5522,14 @@@
                 return PTR_ERR(req);
   
         req_set_fail_links(req);
-       io_cqring_fill_event(req, -ECANCELED);
+       io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
         io_put_req_deferred(req, 1);
         return 0;
   }
   
   static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
                              struct timespec64 *ts, enum hrtimer_mode mode)
+       __must_hold(&ctx->completion_lock)
   {
         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
         struct io_timeout_data *data;
@@@ -5545,7 -5595,7 +5595,7 @@@ static int io_timeout_remove(struct io_
                 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
                                         io_translate_timeout_mode(tr->flags));
   
-       io_cqring_fill_event(req, ret);
+       io_cqring_fill_event(ctx, req->user_data, ret, 0);
         io_commit_cqring(ctx);
         spin_unlock_irq(&ctx->completion_lock);
         io_cqring_ev_posted(ctx);
@@@ -5687,27 -5737,23 +5737,23 @@@ static void io_async_find_and_cancel(st
         int ret;
   
         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
-       if (ret != -ENOENT) {
-               spin_lock_irqsave(&ctx->completion_lock, flags);
-               goto done;
-       }
- 
         spin_lock_irqsave(&ctx->completion_lock, flags);
+       if (ret != -ENOENT)
+               goto done;
         ret = io_timeout_cancel(ctx, sqe_addr);
         if (ret != -ENOENT)
                 goto done;
-       ret = io_poll_cancel(ctx, sqe_addr);
+       ret = io_poll_cancel(ctx, sqe_addr, false);
   done:
         if (!ret)
                 ret = success_ret;
-       io_cqring_fill_event(req, ret);
+       io_cqring_fill_event(ctx, req->user_data, ret, 0);
         io_commit_cqring(ctx);
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
         io_cqring_ev_posted(ctx);
   
         if (ret < 0)
                 req_set_fail_links(req);
-       io_put_req(req);
   }
   
   static int io_async_cancel_prep(struct io_kiocb *req,
@@@ -5739,7 -5785,7 +5785,7 @@@ static int io_async_cancel(struct io_ki
         ret = io_timeout_cancel(ctx, sqe_addr);
         if (ret != -ENOENT)
                 goto done;
-       ret = io_poll_cancel(ctx, sqe_addr);
+       ret = io_poll_cancel(ctx, sqe_addr, false);
         if (ret != -ENOENT)
                 goto done;
         spin_unlock_irq(&ctx->completion_lock);
@@@ -5750,8 -5796,6 +5796,6 @@@
         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
                 struct io_uring_task *tctx = node->task->io_uring;
   
-               if (!tctx || !tctx->io_wq)
-                       continue;
                 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
                 if (ret != -ENOENT)
                         break;
@@@ -5760,7 -5804,7 +5804,7 @@@
   
         spin_lock_irq(&ctx->completion_lock);
   done:
-       io_cqring_fill_event(req, ret);
+       io_cqring_fill_event(ctx, req->user_data, ret, 0);
         io_commit_cqring(ctx);
         spin_unlock_irq(&ctx->completion_lock);
         io_cqring_ev_posted(ctx);
@@@ -5792,7 -5836,7 +5836,7 @@@ static int io_rsrc_update_prep(struct i
   static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
   {
         struct io_ring_ctx *ctx = req->ctx;
-       struct io_uring_rsrc_update up;
+       struct io_uring_rsrc_update2 up;
         int ret;
   
         if (issue_flags & IO_URING_F_NONBLOCK)
@@@ -5800,9 -5844,13 +5844,13 @@@
   
         up.offset = req->rsrc_update.offset;
         up.data = req->rsrc_update.arg;
+       up.nr = 0;
+       up.tags = 0;
+       up.resv = 0;
   
         mutex_lock(&ctx->uring_lock);
-       ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
+       ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
+                                       &up, req->rsrc_update.nr_args);
         mutex_unlock(&ctx->uring_lock);
   
         if (ret < 0)
@@@ -5827,7 -5875,7 +5875,7 @@@ static int io_req_prep(struct io_kiocb 
         case IORING_OP_POLL_ADD:
                 return io_poll_add_prep(req, sqe);
         case IORING_OP_POLL_REMOVE:
-               return io_poll_remove_prep(req, sqe);
+               return io_poll_update_prep(req, sqe);
         case IORING_OP_FSYNC:
                 return io_fsync_prep(req, sqe);
         case IORING_OP_SYNC_FILE_RANGE:
@@@ -5886,42 -5934,33 +5934,33 @@@
   
         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
                         req->opcode);
-       return-EINVAL;
+       return -EINVAL;
   }
   
   static int io_req_prep_async(struct io_kiocb *req)
   {
-       switch (req->opcode) {
+       if (!io_op_defs[req->opcode].needs_async_setup)
+               return 0;
+       if (WARN_ON_ONCE(req->async_data))
+               return -EFAULT;
+       if (io_alloc_async_data(req))
+               return -EAGAIN;
+ 
+       switch (req->opcode) {
         case IORING_OP_READV:
-       case IORING_OP_READ_FIXED:
-       case IORING_OP_READ:
                 return io_rw_prep_async(req, READ);
         case IORING_OP_WRITEV:
-       case IORING_OP_WRITE_FIXED:
-       case IORING_OP_WRITE:
                 return io_rw_prep_async(req, WRITE);
         case IORING_OP_SENDMSG:
-       case IORING_OP_SEND:
                 return io_sendmsg_prep_async(req);
         case IORING_OP_RECVMSG:
-       case IORING_OP_RECV:
                 return io_recvmsg_prep_async(req);
         case IORING_OP_CONNECT:
                 return io_connect_prep_async(req);
         }
-       return 0;
- }
- 
- static int io_req_defer_prep(struct io_kiocb *req)
- {
-       if (!io_op_defs[req->opcode].needs_async_data)
-               return 0;
-       /* some opcodes init it during the inital prep */
-       if (req->async_data)
-               return 0;
-       if (__io_alloc_async_data(req))
-               return -EAGAIN;
-       return io_req_prep_async(req);
+       printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
+                   req->opcode);
+       return -EFAULT;
   }
   
   static u32 io_get_sequence(struct io_kiocb *req)
@@@ -5954,7 -5993,7 +5993,7 @@@ static int io_req_defer(struct io_kioc
         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
                 return 0;
   
-       ret = io_req_defer_prep(req);
+       ret = io_req_prep_async(req);
         if (ret)
                 return ret;
         io_prep_async_link(req);
@@@ -5978,7 -6017,7 +6017,7 @@@
         return -EIOCBQUEUED;
   }
   
- static void __io_clean_op(struct io_kiocb *req)
+ static void io_clean_op(struct io_kiocb *req)
   {
         if (req->flags & REQ_F_BUFFER_SELECTED) {
                 switch (req->opcode) {
@@@ -6017,8 -6056,8 +6056,8 @@@
                         }
                 case IORING_OP_SPLICE:
                 case IORING_OP_TEE:
-                       io_put_file(req, req->splice.file_in,
-                                   (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+                       if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
+                               io_put_file(req->splice.file_in);
                         break;
                 case IORING_OP_OPENAT:
                 case IORING_OP_OPENAT2:
@@@ -6035,6 -6074,17 +6074,17 @@@
                 }
                 req->flags &= ~REQ_F_NEED_CLEANUP;
         }
+       if ((req->flags & REQ_F_POLLED) && req->apoll) {
+               kfree(req->apoll->double_poll);
+               kfree(req->apoll);
+               req->apoll = NULL;
+       }
+       if (req->flags & REQ_F_INFLIGHT) {
+               struct io_uring_task *tctx = req->task->io_uring;
+ 
+               atomic_dec(&tctx->inflight_tracked);
+               req->flags &= ~REQ_F_INFLIGHT;
+       }
   }
   
   static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
@@@ -6067,7 -6117,7 +6117,7 @@@
                 ret = io_poll_add(req, issue_flags);
                 break;
         case IORING_OP_POLL_REMOVE:
-               ret = io_poll_remove(req, issue_flags);
+               ret = io_poll_update(req, issue_flags);
                 break;
         case IORING_OP_SYNC_FILE_RANGE:
                 ret = io_sync_file_range(req, issue_flags);
@@@ -6205,18 -6255,48 +6255,48 @@@ static void io_wq_submit_work(struct io
         /* avoid locking problems by failing it from a clean context */
         if (ret) {
                 /* io-wq is going to take one down */
-               refcount_inc(&req->refs);
+               req_ref_get(req);
                 io_req_task_queue_fail(req, ret);
         }
   }
   
+ #define FFS_ASYNC_READ                0x1UL
+ #define FFS_ASYNC_WRITE               0x2UL
+ #ifdef CONFIG_64BIT
+ #define FFS_ISREG             0x4UL
+ #else
+ #define FFS_ISREG             0x0UL
+ #endif
+ #define FFS_MASK              ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
+ 
+ static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
+                                                     unsigned i)
+ {
+       struct io_fixed_file *table_l2;
+ 
+       table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT];
+       return &table_l2[i & IORING_FILE_TABLE_MASK];
+ }
+ 
   static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
                                               int index)
   {
-       struct fixed_rsrc_table *table;
+       struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
   
-       table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
-       return table->files[index & IORING_FILE_TABLE_MASK];
+       return (struct file *) (slot->file_ptr & FFS_MASK);
+ }
+ 
+ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
+ {
+       unsigned long file_ptr = (unsigned long) file;
+ 
+       if (__io_file_supports_async(file, READ))
+               file_ptr |= FFS_ASYNC_READ;
+       if (__io_file_supports_async(file, WRITE))
+               file_ptr |= FFS_ASYNC_WRITE;
+       if (S_ISREG(file_inode(file)->i_mode))
+               file_ptr |= FFS_ISREG;
+       file_slot->file_ptr = file_ptr;
   }
   
   static struct file *io_file_get(struct io_submit_state *state,
@@@ -6226,18 -6306,26 +6306,26 @@@
         struct file *file;
   
         if (fixed) {
+               unsigned long file_ptr;
+ 
                 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
                         return NULL;
                 fd = array_index_nospec(fd, ctx->nr_user_files);
-               file = io_file_from_index(ctx, fd);
-               io_set_resource_node(req);
+               file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
+               file = (struct file *) (file_ptr & FFS_MASK);
+               file_ptr &= ~FFS_MASK;
+               /* mask in overlapping REQ_F and FFS bits */
+               req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
+               io_req_set_rsrc_node(req);
         } else {
                 trace_io_uring_file_get(ctx, fd);
                 file = __io_file_get(state, fd);
+ 
+               /* we don't allow fixed io_uring files */
+               if (file && unlikely(file->f_op == &io_uring_fops))
+                       io_req_track_inflight(req);
         }
   
-       if (file && unlikely(file->f_op == &io_uring_fops))
-               io_req_track_inflight(req);
         return file;
   }
   
@@@ -6257,7 -6345,7 +6345,7 @@@ static enum hrtimer_restart io_link_tim
          * We don't expect the list to be empty, that will only happen if we
          * race with the completion of the linked work.
          */
-       if (prev && refcount_inc_not_zero(&prev->refs))
+       if (prev && req_ref_inc_not_zero(prev))
                 io_remove_next_linked(prev);
         else
                 prev = NULL;
@@@ -6268,13 -6356,16 +6356,16 @@@
                 io_put_req_deferred(prev, 1);
         } else {
                 io_req_complete_post(req, -ETIME, 0);
-               io_put_req_deferred(req, 1);
         }
+       io_put_req_deferred(req, 1);
         return HRTIMER_NORESTART;
   }
   
- static void __io_queue_linked_timeout(struct io_kiocb *req)
+ static void io_queue_linked_timeout(struct io_kiocb *req)
   {
+       struct io_ring_ctx *ctx = req->ctx;
+ 
+       spin_lock_irq(&ctx->completion_lock);
         /*
          * If the back reference is NULL, then our linked request finished
          * before we got a chance to setup the timer
@@@ -6286,16 -6377,7 +6377,7 @@@
                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
                                 data->mode);
         }
- }
- 
- static void io_queue_linked_timeout(struct io_kiocb *req)
- {
-       struct io_ring_ctx *ctx = req->ctx;
- 
-       spin_lock_irq(&ctx->completion_lock);
-       __io_queue_linked_timeout(req);
         spin_unlock_irq(&ctx->completion_lock);
- 
         /* drop submission reference */
         io_put_req(req);
   }
@@@ -6325,15 -6407,7 +6407,7 @@@ static void __io_queue_sqe(struct io_ki
          * We async punt it if the file wasn't marked NOWAIT, or if the file
          * doesn't support non-blocking read/write attempts
          */
-       if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
-               if (!io_arm_poll_handler(req)) {
-                       /*
-                        * Queued up for async execution, worker will release
-                        * submit reference when the iocb is actually submitted.
-                        */
-                       io_queue_async_work(req);
-               }
-       } else if (likely(!ret)) {
+       if (likely(!ret)) {
                 /* drop submission reference */
                 if (req->flags & REQ_F_COMPLETE_INLINE) {
                         struct io_ring_ctx *ctx = req->ctx;
@@@ -6345,10 -6419,16 +6419,16 @@@
                 } else {
                         io_put_req(req);
                 }
+       } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+               if (!io_arm_poll_handler(req)) {
+                       /*
+                        * Queued up for async execution, worker will release
+                        * submit reference when the iocb is actually submitted.
+                        */
+                       io_queue_async_work(req);
+               }
         } else {
-               req_set_fail_links(req);
-               io_put_req(req);
-               io_req_complete(req, ret);
+               io_req_complete_failed(req, ret);
         }
         if (linked_timeout)
                 io_queue_linked_timeout(linked_timeout);
@@@ -6362,12 -6442,10 +6442,10 @@@ static void io_queue_sqe(struct io_kioc
         if (ret) {
                 if (ret != -EIOCBQUEUED) {
   fail_req:
-                       req_set_fail_links(req);
-                       io_put_req(req);
-                       io_req_complete(req, ret);
+                       io_req_complete_failed(req, ret);
                 }
         } else if (req->flags & REQ_F_FORCE_ASYNC) {
-               ret = io_req_defer_prep(req);
+               ret = io_req_prep_async(req);
                 if (unlikely(ret))
                         goto fail_req;
                 io_queue_async_work(req);
@@@ -6419,12 -6497,10 +6497,10 @@@ static int io_init_req(struct io_ring_c
         req->link = NULL;
         req->fixed_rsrc_refs = NULL;
         /* one is dropped after submission, the other at completion */
-       refcount_set(&req->refs, 2);
+       atomic_set(&req->refs, 2);
         req->task = current;
         req->result = 0;
-       req->work.list.next = NULL;
         req->work.creds = NULL;
-       req->work.flags = 0;
   
         /* enforce forwards compatibility on users */
         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
@@@ -6485,12 -6561,10 +6561,10 @@@ fail_req
                 if (link->head) {
                         /* fail even hard links since we don't submit */
                         link->head->flags |= REQ_F_FAIL_LINK;
-                       io_put_req(link->head);
-                       io_req_complete(link->head, -ECANCELED);
+                       io_req_complete_failed(link->head, -ECANCELED);
                         link->head = NULL;
                 }
-               io_put_req(req);
-               io_req_complete(req, ret);
+               io_req_complete_failed(req, ret);
                 return ret;
         }
         ret = io_req_prep(req, sqe);
@@@ -6522,7 -6596,7 +6596,7 @@@
                         head->flags |= REQ_F_IO_DRAIN;
                         ctx->drain_next = 1;
                 }
-               ret = io_req_defer_prep(req);
+               ret = io_req_prep_async(req);
                 if (unlikely(ret))
                         goto fail_req;
                 trace_io_uring_link(ctx, req, head);
@@@ -6624,12 -6698,6 +6698,6 @@@ static int io_submit_sqes(struct io_rin
   {
         int submitted = 0;
   
-       /* if we have a backlog and couldn't flush it all, return BUSY */
-       if (test_bit(0, &ctx->sq_check_overflow)) {
-               if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
-                       return -EBUSY;
-       }
- 
         /* make sure SQ entry isn't read before tail */
         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
   
@@@ -6710,6 -6778,10 +6778,10 @@@ static int __io_sq_thread(struct io_rin
                 if (!list_empty(&ctx->iopoll_list))
                         io_do_iopoll(ctx, &nr_events, 0);
   
+               /*
+                * Don't submit if refs are dying, good for io_uring_register(),
+                * but also it is relied upon by io_ring_exit_work()
+                */
                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
                     !(ctx->flags & IORING_SETUP_R_DISABLED))
                         ret = io_submit_sqes(ctx, to_submit);
@@@ -6727,11 -6799,8 +6799,8 @@@ static void io_sqd_update_thread_idle(s
         struct io_ring_ctx *ctx;
         unsigned sq_thread_idle = 0;
   
-       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-               if (sq_thread_idle < ctx->sq_thread_idle)
-                       sq_thread_idle = ctx->sq_thread_idle;
-       }
- 
+       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+               sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
         sqd->sq_thread_idle = sq_thread_idle;
   }
   
@@@ -6745,7 -6814,6 +6814,6 @@@ static int io_sq_thread(void *data
   
         snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
         set_task_comm(current, buf);
-       current->pf_io_worker = NULL;
   
         if (sqd->sq_cpu != -1)
                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
@@@ -6754,9 -6822,6 +6822,9 @@@
         current->flags |= PF_NO_SETAFFINITY;
   
         mutex_lock(&sqd->lock);
+ +      /* a user may had exited before the thread started */
+ +      io_run_task_work_head(&sqd->park_task_work);
+ +
         while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
                 int ret;
                 bool cap_entries, sqt_spin, needs_sched;
@@@ -6773,10 -6838,10 +6841,10 @@@
                         }
                         cond_resched();
                         mutex_lock(&sqd->lock);
- -                      if (did_sig)
- -                              break;
                         io_run_task_work();
                         io_run_task_work_head(&sqd->park_task_work);
+ +                      if (did_sig)
+ +                              break;
                         timeout = jiffies + sqd->sq_thread_idle;
                         continue;
                 }
@@@ -6802,27 -6867,29 +6870,29 @@@
                         continue;
                 }
   
-               needs_sched = true;
                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
-               list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-                       if ((ctx->flags & IORING_SETUP_IOPOLL) &&
-                           !list_empty_careful(&ctx->iopoll_list)) {
-                               needs_sched = false;
-                               break;
-                       }
-                       if (io_sqring_entries(ctx)) {
-                               needs_sched = false;
-                               break;
-                       }
-               }
- 
-               if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
+               if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                 io_ring_set_wakeup_flag(ctx);
   
-                       mutex_unlock(&sqd->lock);
-                       schedule();
-                       mutex_lock(&sqd->lock);
+                       needs_sched = true;
+                       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+                               if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+                                   !list_empty_careful(&ctx->iopoll_list)) {
+                                       needs_sched = false;
+                                       break;
+                               }
+                               if (io_sqring_entries(ctx)) {
+                                       needs_sched = false;
+                                       break;
+                               }
+                       }
+ 
+                       if (needs_sched) {
+                               mutex_unlock(&sqd->lock);
+                               schedule();
+                               mutex_lock(&sqd->lock);
+                       }
                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                 io_ring_clear_wakeup_flag(ctx);
                 }
@@@ -6832,15 -6899,14 +6902,14 @@@
                 timeout = jiffies + sqd->sq_thread_idle;
         }
   
-       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-               io_uring_cancel_sqpoll(ctx);
+       io_uring_cancel_sqpoll(sqd);
         sqd->thread = NULL;
         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                 io_ring_set_wakeup_flag(ctx);
-       mutex_unlock(&sqd->lock);
- 
         io_run_task_work();
         io_run_task_work_head(&sqd->park_task_work);
+       mutex_unlock(&sqd->lock);
+ 
         complete(&sqd->exited);
         do_exit(0);
   }
@@@ -6932,7 -6998,7 +7001,7 @@@ static int io_cqring_wait(struct io_rin
         int ret;
   
         do {
-               io_cqring_overflow_flush(ctx, false, NULL, NULL);
+               io_cqring_overflow_flush(ctx, false);
                 if (io_cqring_events(ctx) >= min_events)
                         return 0;
                 if (!io_run_task_work())
@@@ -6964,7 -7030,7 +7033,7 @@@
         trace_io_uring_cqring_wait(ctx, min_events);
         do {
                 /* if we can't even flush overflow, don't wait for more */
-               if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
+               if (!io_cqring_overflow_flush(ctx, false)) {
                         ret = -EBUSY;
                         break;
                 }
@@@ -6980,35 -7046,14 +7049,14 @@@
         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
   }
   
- static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
- {
- #if defined(CONFIG_UNIX)
-       if (ctx->ring_sock) {
-               struct sock *sock = ctx->ring_sock->sk;
-               struct sk_buff *skb;
- 
-               while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
-                       kfree_skb(skb);
-       }
- #else
-       int i;
- 
-       for (i = 0; i < ctx->nr_user_files; i++) {
-               struct file *file;
- 
-               file = io_file_from_index(ctx, i);
-               if (file)
-                       fput(file);
-       }
- #endif
- }
- 
- static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
+ static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
   {
-       struct fixed_rsrc_data *data;
+       unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
   
-       data = container_of(ref, struct fixed_rsrc_data, refs);
-       complete(&data->done);
+       for (i = 0; i < nr_tables; i++)
+               kfree(table->files[i]);
+       kfree(table->files);
+       table->files = NULL;
   }
   
   static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
@@@ -7021,122 -7066,148 +7069,148 @@@ static inline void io_rsrc_ref_unlock(s
         spin_unlock_bh(&ctx->rsrc_ref_lock);
   }
   
- static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
-                                struct fixed_rsrc_data *rsrc_data,
-                                struct fixed_rsrc_ref_node *ref_node)
+ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
   {
-       io_rsrc_ref_lock(ctx);
-       rsrc_data->node = ref_node;
-       list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
-       io_rsrc_ref_unlock(ctx);
-       percpu_ref_get(&rsrc_data->refs);
+       percpu_ref_exit(&ref_node->refs);
+       kfree(ref_node);
   }
   
- static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
+ static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
+                               struct io_rsrc_data *data_to_kill)
   {
-       struct fixed_rsrc_ref_node *ref_node = NULL;
+       WARN_ON_ONCE(!ctx->rsrc_backup_node);
+       WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
   
-       io_rsrc_ref_lock(ctx);
-       ref_node = data->node;
-       data->node = NULL;
-       io_rsrc_ref_unlock(ctx);
-       if (ref_node)
-               percpu_ref_kill(&ref_node->refs);
+       if (data_to_kill) {
+               struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
+ 
+               rsrc_node->rsrc_data = data_to_kill;
+               io_rsrc_ref_lock(ctx);
+               list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
+               io_rsrc_ref_unlock(ctx);
+ 
+               atomic_inc(&data_to_kill->refs);
+               percpu_ref_kill(&rsrc_node->refs);
+               ctx->rsrc_node = NULL;
+       }
+ 
+       if (!ctx->rsrc_node) {
+               ctx->rsrc_node = ctx->rsrc_backup_node;
+               ctx->rsrc_backup_node = NULL;
+       }
+ }
+ 
+ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
+ {
+       if (ctx->rsrc_backup_node)
+               return 0;
+       ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
+       return ctx->rsrc_backup_node ? 0 : -ENOMEM;
   }
   
- static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
-                              struct io_ring_ctx *ctx,
-                              void (*rsrc_put)(struct io_ring_ctx *ctx,
-                                               struct io_rsrc_put *prsrc))
+ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
   {
-       struct fixed_rsrc_ref_node *backup_node;
         int ret;
   
+       /* As we may drop ->uring_lock, other task may have started quiesce */
         if (data->quiesce)
                 return -ENXIO;
   
         data->quiesce = true;
         do {
-               ret = -ENOMEM;
-               backup_node = alloc_fixed_rsrc_ref_node(ctx);
-               if (!backup_node)
+               ret = io_rsrc_node_switch_start(ctx);
+               if (ret)
                         break;
-               backup_node->rsrc_data = data;
-               backup_node->rsrc_put = rsrc_put;
+               io_rsrc_node_switch(ctx, data);
   
-               io_sqe_rsrc_kill_node(ctx, data);
-               percpu_ref_kill(&data->refs);
+               /* kill initial ref, already quiesced if zero */
+               if (atomic_dec_and_test(&data->refs))
+                       break;
                 flush_delayed_work(&ctx->rsrc_put_work);
- 
                 ret = wait_for_completion_interruptible(&data->done);
                 if (!ret)
                         break;
   
-               percpu_ref_resurrect(&data->refs);
-               io_sqe_rsrc_set_node(ctx, data, backup_node);
-               backup_node = NULL;
+               atomic_inc(&data->refs);
+               /* wait for all works potentially completing data->done */
+               flush_delayed_work(&ctx->rsrc_put_work);
                 reinit_completion(&data->done);
+ 
                 mutex_unlock(&ctx->uring_lock);
                 ret = io_run_task_work_sig();
                 mutex_lock(&ctx->uring_lock);
         } while (ret >= 0);
         data->quiesce = false;
   
-       if (backup_node)
-               destroy_fixed_rsrc_ref_node(backup_node);
         return ret;
   }
   
- static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
+ static void io_rsrc_data_free(struct io_rsrc_data *data)
   {
-       struct fixed_rsrc_data *data;
+       kvfree(data->tags);
+       kfree(data);
+ }
+ 
+ static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
+                                              rsrc_put_fn *do_put,
+                                              unsigned nr)
+ {
+       struct io_rsrc_data *data;
   
         data = kzalloc(sizeof(*data), GFP_KERNEL);
         if (!data)
                 return NULL;
   
-       if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
-                           PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+       data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
+       if (!data->tags) {
                 kfree(data);
                 return NULL;
         }
+ 
+       atomic_set(&data->refs, 1);
         data->ctx = ctx;
+       data->do_put = do_put;
         init_completion(&data->done);
         return data;
   }
   
- static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
+ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
   {
-       percpu_ref_exit(&data->refs);
-       kfree(data->table);
-       kfree(data);
+ #if defined(CONFIG_UNIX)
+       if (ctx->ring_sock) {
+               struct sock *sock = ctx->ring_sock->sk;
+               struct sk_buff *skb;
+ 
+               while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+                       kfree_skb(skb);
+       }
+ #else
+       int i;
+ 
+       for (i = 0; i < ctx->nr_user_files; i++) {
+               struct file *file;
+ 
+               file = io_file_from_index(ctx, i);
+               if (file)
+                       fput(file);
+       }
+ #endif
+       io_free_file_tables(&ctx->file_table, ctx->nr_user_files);
+       io_rsrc_data_free(ctx->file_data);
+       ctx->file_data = NULL;
+       ctx->nr_user_files = 0;
   }
   
   static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
   {
-       struct fixed_rsrc_data *data = ctx->file_data;
-       unsigned nr_tables, i;
         int ret;
   
-       /*
-        * percpu_ref_is_dying() is to stop parallel files unregister
-        * Since we possibly drop uring lock later in this function to
-        * run task work.
-        */
-       if (!data || percpu_ref_is_dying(&data->refs))
+       if (!ctx->file_data)
                 return -ENXIO;
-       ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
-       if (ret)
-               return ret;
- 
-       __io_sqe_files_unregister(ctx);
-       nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
-       for (i = 0; i < nr_tables; i++)
-               kfree(data->table[i].files);
-       free_fixed_rsrc_data(data);
-       ctx->file_data = NULL;
-       ctx->nr_user_files = 0;
-       return 0;
+       ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
+       if (!ret)
+               __io_sqe_files_unregister(ctx);
+       return ret;
   }
   
   static void io_sq_thread_unpark(struct io_sq_data *sqd)
@@@ -7169,9 -7240,10 +7243,10 @@@ static void io_sq_thread_park(struct io
   static void io_sq_thread_stop(struct io_sq_data *sqd)
   {
         WARN_ON_ONCE(sqd->thread == current);
+       WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
   
-       mutex_lock(&sqd->lock);
         set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+       mutex_lock(&sqd->lock);
         if (sqd->thread)
                 wake_up_process(sqd->thread);
         mutex_unlock(&sqd->lock);
@@@ -7200,8 -7272,6 +7275,6 @@@ static void io_sq_thread_finish(struct 
   
                 io_put_sq_data(sqd);
                 ctx->sq_data = NULL;
-               if (ctx->sq_creds)
-                       put_cred(ctx->sq_creds);
         }
   }
   
@@@ -7362,34 -7432,32 +7435,32 @@@ static int io_sqe_files_scm(struct io_r
   }
   #endif
   
- static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
-                                   unsigned nr_tables, unsigned nr_files)
+ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
   {
-       int i;
+       unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
+ 
+       table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
+       if (!table->files)
+               return false;
   
         for (i = 0; i < nr_tables; i++) {
-               struct fixed_rsrc_table *table = &file_data->table[i];
-               unsigned this_files;
+               unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
   
-               this_files = min(nr_files, IORING_MAX_FILES_TABLE);
-               table->files = kcalloc(this_files, sizeof(struct file *),
+               table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
                                         GFP_KERNEL);
-               if (!table->files)
+               if (!table->files[i])
                         break;
                 nr_files -= this_files;
         }
   
         if (i == nr_tables)
-               return 0;
+               return true;
   
-       for (i = 0; i < nr_tables; i++) {
-               struct fixed_rsrc_table *table = &file_data->table[i];
-               kfree(table->files);
-       }
-       return 1;
+       io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
+       return false;
   }
   
- static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
   {
         struct file *file = prsrc->file;
   #if defined(CONFIG_UNIX)
@@@ -7452,21 -7520,35 +7523,35 @@@
   #endif
   }
   
- static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
+ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
   {
-       struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
+       struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
         struct io_ring_ctx *ctx = rsrc_data->ctx;
         struct io_rsrc_put *prsrc, *tmp;
   
         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
                 list_del(&prsrc->list);
-               ref_node->rsrc_put(ctx, prsrc);
+ 
+               if (prsrc->tag) {
+                       bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
+                       unsigned long flags;
+ 
+                       io_ring_submit_lock(ctx, lock_ring);
+                       spin_lock_irqsave(&ctx->completion_lock, flags);
+                       io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
+                       io_commit_cqring(ctx);
+                       spin_unlock_irqrestore(&ctx->completion_lock, flags);
+                       io_cqring_ev_posted(ctx);
+                       io_ring_submit_unlock(ctx, lock_ring);
+               }
+ 
+               rsrc_data->do_put(ctx, prsrc);
                 kfree(prsrc);
         }
   
-       percpu_ref_exit(&ref_node->refs);
-       kfree(ref_node);
-       percpu_ref_put(&rsrc_data->refs);
+       io_rsrc_node_destroy(ref_node);
+       if (atomic_dec_and_test(&rsrc_data->refs))
+               complete(&rsrc_data->done);
   }
   
   static void io_rsrc_put_work(struct work_struct *work)
@@@ -7478,63 -7560,42 +7563,42 @@@
         node = llist_del_all(&ctx->rsrc_put_llist);
   
         while (node) {
-               struct fixed_rsrc_ref_node *ref_node;
+               struct io_rsrc_node *ref_node;
                 struct llist_node *next = node->next;
   
-               ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
+               ref_node = llist_entry(node, struct io_rsrc_node, llist);
                 __io_rsrc_put_work(ref_node);
                 node = next;
         }
   }
   
- static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
-                                       unsigned i)
- {
-       struct fixed_rsrc_table *table;
- 
-       table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-       return &table->files[i & IORING_FILE_TABLE_MASK];
- }
- 
   static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
   {
-       struct fixed_rsrc_ref_node *ref_node;
-       struct fixed_rsrc_data *data;
-       struct io_ring_ctx *ctx;
+       struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
+       struct io_ring_ctx *ctx = node->rsrc_data->ctx;
         bool first_add = false;
-       int delay = HZ;
- 
-       ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
-       data = ref_node->rsrc_data;
-       ctx = data->ctx;
   
         io_rsrc_ref_lock(ctx);
-       ref_node->done = true;
+       node->done = true;
   
         while (!list_empty(&ctx->rsrc_ref_list)) {
-               ref_node = list_first_entry(&ctx->rsrc_ref_list,
-                                       struct fixed_rsrc_ref_node, node);
+               node = list_first_entry(&ctx->rsrc_ref_list,
+                                           struct io_rsrc_node, node);
                 /* recycle ref nodes in order */
-               if (!ref_node->done)
+               if (!node->done)
                         break;
-               list_del(&ref_node->node);
-               first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
+               list_del(&node->node);
+               first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
         }
         io_rsrc_ref_unlock(ctx);
   
-       if (percpu_ref_is_dying(&data->refs))
-               delay = 0;
- 
-       if (!delay)
-               mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
-       else if (first_add)
-               queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
+       if (first_add)
+               mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
   }
   
- static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
-                       struct io_ring_ctx *ctx)
+ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
   {
-       struct fixed_rsrc_ref_node *ref_node;
+       struct io_rsrc_node *ref_node;
   
         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
         if (!ref_node)
@@@ -7551,29 -7612,14 +7615,14 @@@
         return ref_node;
   }
   
- static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
-                                    struct fixed_rsrc_ref_node *ref_node)
- {
-       ref_node->rsrc_data = ctx->file_data;
-       ref_node->rsrc_put = io_ring_file_put;
- }
- 
- static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
- {
-       percpu_ref_exit(&ref_node->refs);
-       kfree(ref_node);
- }
- 
- 
   static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
-                                unsigned nr_args)
+                                unsigned nr_args, u64 __user *tags)
   {
         __s32 __user *fds = (__s32 __user *) arg;
-       unsigned nr_tables, i;
         struct file *file;
-       int fd, ret = -ENOMEM;
-       struct fixed_rsrc_ref_node *ref_node;
-       struct fixed_rsrc_data *file_data;
+       int fd, ret;
+       unsigned i;
+       struct io_rsrc_data *file_data;
   
         if (ctx->file_data)
                 return -EBUSY;
@@@ -7581,33 -7627,37 +7630,37 @@@
                 return -EINVAL;
         if (nr_args > IORING_MAX_FIXED_FILES)
                 return -EMFILE;
+       ret = io_rsrc_node_switch_start(ctx);
+       if (ret)
+               return ret;
   
-       file_data = alloc_fixed_rsrc_data(ctx);
+       file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
         if (!file_data)
                 return -ENOMEM;
         ctx->file_data = file_data;
- 
-       nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
-       file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
-                                  GFP_KERNEL);
-       if (!file_data->table)
-               goto out_free;
- 
-       if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
+       ret = -ENOMEM;
+       if (!io_alloc_file_tables(&ctx->file_table, nr_args))
                 goto out_free;
   
         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-               if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
+               u64 tag = 0;
+ 
+               if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
+                   copy_from_user(&fd, &fds[i], sizeof(fd))) {
                         ret = -EFAULT;
                         goto out_fput;
                 }
                 /* allow sparse sets */
-               if (fd == -1)
+               if (fd == -1) {
+                       ret = -EINVAL;
+                       if (unlikely(tag))
+                               goto out_fput;
                         continue;
+               }
   
                 file = fget(fd);
                 ret = -EBADF;
-               if (!file)
+               if (unlikely(!file))
                         goto out_fput;
   
                 /*
@@@ -7621,23 -7671,17 +7674,17 @@@
                         fput(file);
                         goto out_fput;
                 }
-               *io_fixed_file_slot(file_data, i) = file;
+               ctx->file_data->tags[i] = tag;
+               io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
         }
   
         ret = io_sqe_files_scm(ctx);
         if (ret) {
-               io_sqe_files_unregister(ctx);
+               __io_sqe_files_unregister(ctx);
                 return ret;
         }
   
-       ref_node = alloc_fixed_rsrc_ref_node(ctx);
-       if (!ref_node) {
-               io_sqe_files_unregister(ctx);
-               return -ENOMEM;
-       }
-       init_fixed_file_ref_node(ctx, ref_node);
- 
-       io_sqe_rsrc_set_node(ctx, file_data, ref_node);
+       io_rsrc_node_switch(ctx, NULL);
         return ret;
   out_fput:
         for (i = 0; i < ctx->nr_user_files; i++) {
@@@ -7645,11 -7689,10 +7692,10 @@@
                 if (file)
                         fput(file);
         }
-       for (i = 0; i < nr_tables; i++)
-               kfree(file_data->table[i].files);
+       io_free_file_tables(&ctx->file_table, nr_args);
         ctx->nr_user_files = 0;
   out_free:
-       free_fixed_rsrc_data(ctx->file_data);
+       io_rsrc_data_free(ctx->file_data);
         ctx->file_data = NULL;
         return ret;
   }
@@@ -7697,67 -7740,64 +7743,64 @@@ static int io_sqe_file_register(struct 
   #endif
   }
   
- static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
+ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
+                                struct io_rsrc_node *node, void *rsrc)
   {
         struct io_rsrc_put *prsrc;
-       struct fixed_rsrc_ref_node *ref_node = data->node;
   
         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
         if (!prsrc)
                 return -ENOMEM;
   
+       prsrc->tag = data->tags[idx];
         prsrc->rsrc = rsrc;
-       list_add(&prsrc->list, &ref_node->rsrc_list);
- 
+       list_add(&prsrc->list, &node->rsrc_list);
         return 0;
   }
   
- static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
-                                       struct file *file)
- {
-       return io_queue_rsrc_removal(data, (void *)file);
- }
- 
   static int __io_sqe_files_update(struct io_ring_ctx *ctx,
-                                struct io_uring_rsrc_update *up,
+                                struct io_uring_rsrc_update2 *up,
                                  unsigned nr_args)
   {
-       struct fixed_rsrc_data *data = ctx->file_data;
-       struct fixed_rsrc_ref_node *ref_node;
-       struct file *file, **file_slot;
-       __s32 __user *fds;
-       int fd, i, err;
-       __u32 done;
+       u64 __user *tags = u64_to_user_ptr(up->tags);
+       __s32 __user *fds = u64_to_user_ptr(up->data);
+       struct io_rsrc_data *data = ctx->file_data;
+       struct io_fixed_file *file_slot;
+       struct file *file;
+       int fd, i, err = 0;
+       unsigned int done;
         bool needs_switch = false;
   
-       if (check_add_overflow(up->offset, nr_args, &done))
-               return -EOVERFLOW;
-       if (done > ctx->nr_user_files)
+       if (!ctx->file_data)
+               return -ENXIO;
+       if (up->offset + nr_args > ctx->nr_user_files)
                 return -EINVAL;
   
-       ref_node = alloc_fixed_rsrc_ref_node(ctx);
-       if (!ref_node)
-               return -ENOMEM;
-       init_fixed_file_ref_node(ctx, ref_node);
- 
-       fds = u64_to_user_ptr(up->data);
         for (done = 0; done < nr_args; done++) {
-               err = 0;
-               if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+               u64 tag = 0;
+ 
+               if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
+                   copy_from_user(&fd, &fds[done], sizeof(fd))) {
                         err = -EFAULT;
                         break;
                 }
+               if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
+                       err = -EINVAL;
+                       break;
+               }
                 if (fd == IORING_REGISTER_FILES_SKIP)
                         continue;
   
                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-               file_slot = io_fixed_file_slot(ctx->file_data, i);
+               file_slot = io_fixed_file_slot(&ctx->file_table, i);
   
-               if (*file_slot) {
-                       err = io_queue_file_removal(data, *file_slot);
+               if (file_slot->file_ptr) {
+                       file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+                       err = io_queue_rsrc_removal(data, up->offset + done,
+                                                   ctx->rsrc_node, file);
                         if (err)
                                 break;
-                       *file_slot = NULL;
+                       file_slot->file_ptr = 0;
                         needs_switch = true;
                 }
                 if (fd != -1) {
@@@ -7779,42 -7819,22 +7822,22 @@@
                                 err = -EBADF;
                                 break;
                         }
-                       *file_slot = file;
+                       data->tags[up->offset + done] = tag;
+                       io_fixed_file_set(file_slot, file);
                         err = io_sqe_file_register(ctx, file, i);
                         if (err) {
-                               *file_slot = NULL;
+                               file_slot->file_ptr = 0;
                                 fput(file);
                                 break;
                         }
                 }
         }
   
-       if (needs_switch) {
-               percpu_ref_kill(&data->node->refs);
-               io_sqe_rsrc_set_node(ctx, data, ref_node);
-       } else
-               destroy_fixed_rsrc_ref_node(ref_node);
- 
+       if (needs_switch)
+               io_rsrc_node_switch(ctx, data);
         return done ? done : err;
   }
   
- static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
-                              unsigned nr_args)
- {
-       struct io_uring_rsrc_update up;
- 
-       if (!ctx->file_data)
-               return -ENXIO;
-       if (!nr_args)
-               return -EINVAL;
-       if (copy_from_user(&up, arg, sizeof(up)))
-               return -EFAULT;
-       if (up.resv)
-               return -EINVAL;
- 
-       return __io_sqe_files_update(ctx, &up, nr_args);
- }
- 
   static struct io_wq_work *io_free_work(struct io_wq_work *work)
   {
         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
@@@ -7823,7 -7843,8 +7846,8 @@@
         return req ? &req->work : NULL;
   }
   
- static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
+ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
+                                       struct task_struct *task)
   {
         struct io_wq_hash *hash;
         struct io_wq_data data;
@@@ -7840,6 -7861,7 +7864,7 @@@
         }
   
         data.hash = hash;
+       data.task = task;
         data.free_work = io_free_work;
         data.do_work = io_wq_submit_work;
   
@@@ -7865,7 -7887,7 +7890,7 @@@ static int io_uring_alloc_task_context(
                 return ret;
         }
   
-       tctx->io_wq = io_init_wq_offload(ctx);
+       tctx->io_wq = io_init_wq_offload(ctx, task);
         if (IS_ERR(tctx->io_wq)) {
                 ret = PTR_ERR(tctx->io_wq);
                 percpu_counter_destroy(&tctx->inflight);
@@@ -7877,6 -7899,7 +7902,7 @@@
         init_waitqueue_head(&tctx->wait);
         tctx->last = NULL;
         atomic_set(&tctx->in_idle, 0);
+       atomic_set(&tctx->inflight_tracked, 0);
         task->io_uring = tctx;
         spin_lock_init(&tctx->task_lock);
         INIT_WQ_LIST(&tctx->task_list);
@@@ -7910,21 -7933,15 +7936,15 @@@ static int io_sq_offload_create(struct 
                 f = fdget(p->wq_fd);
                 if (!f.file)
                         return -ENXIO;
-               if (f.file->f_op != &io_uring_fops) {
-                       fdput(f);
-                       return -EINVAL;
-               }
                 fdput(f);
+               if (f.file->f_op != &io_uring_fops)
+                       return -EINVAL;
         }
         if (ctx->flags & IORING_SETUP_SQPOLL) {
                 struct task_struct *tsk;
                 struct io_sq_data *sqd;
                 bool attached;
   
-               ret = -EPERM;
-               if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
-                       goto err;
- 
                 sqd = io_get_sq_data(p, &attached);
                 if (IS_ERR(sqd)) {
                         ret = PTR_ERR(sqd);
@@@ -7937,13 -7954,11 +7957,11 @@@
                 if (!ctx->sq_thread_idle)
                         ctx->sq_thread_idle = HZ;
   
-               ret = 0;
                 io_sq_thread_park(sqd);
                 list_add(&ctx->sqd_list, &sqd->ctx_list);
                 io_sqd_update_thread_idle(sqd);
                 /* don't attach to a dying SQPOLL thread, would be racy */
-               if (attached && !sqd->thread)
-                       ret = -ENXIO;
+               ret = (attached && !sqd->thread) ? -ENXIO : 0;
                 io_sq_thread_unpark(sqd);
   
                 if (ret < 0)
@@@ -7955,11 -7970,8 +7973,8 @@@
                         int cpu = p->sq_thread_cpu;
   
                         ret = -EINVAL;
-                       if (cpu >= nr_cpu_ids)
-                               goto err_sqpoll;
-                       if (!cpu_online(cpu))
+                       if (cpu >= nr_cpu_ids || !cpu_online(cpu))
                                 goto err_sqpoll;
- 
                         sqd->sq_cpu = cpu;
                 } else {
                         sqd->sq_cpu = -1;
@@@ -7985,12 -7997,11 +8000,11 @@@
         }
   
         return 0;
+ err_sqpoll:
+       complete(&ctx->sq_data->exited);
   err:
         io_sq_thread_finish(ctx);
         return ret;
- err_sqpoll:
-       complete(&ctx->sq_data->exited);
-       goto err;
   }
   
   static inline void __io_unaccount_mem(struct user_struct *user,
@@@ -8092,29 -8103,49 +8106,49 @@@ static unsigned long rings_size(unsigne
         return off;
   }
   
+ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
+ {
+       struct io_mapped_ubuf *imu = *slot;
+       unsigned int i;
+ 
+       for (i = 0; i < imu->nr_bvecs; i++)
+               unpin_user_page(imu->bvec[i].bv_page);
+       if (imu->acct_pages)
+               io_unaccount_mem(ctx, imu->acct_pages);
+       kvfree(imu);
+       *slot = NULL;
+ }
+ 
+ static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+ {
+       io_buffer_unmap(ctx, &prsrc->buf);
+       prsrc->buf = NULL;
+ }
+ 
+ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+ {
+       unsigned int i;
+ 
+       for (i = 0; i < ctx->nr_user_bufs; i++)
+               io_buffer_unmap(ctx, &ctx->user_bufs[i]);
+       kfree(ctx->user_bufs);
+       kfree(ctx->buf_data);
+       ctx->user_bufs = NULL;
+       ctx->buf_data = NULL;
+       ctx->nr_user_bufs = 0;
+ }
+ 
   static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
   {
-       int i, j;
+       int ret;
   
-       if (!ctx->user_bufs)
+       if (!ctx->buf_data)
                 return -ENXIO;
   
-       for (i = 0; i < ctx->nr_user_bufs; i++) {
-               struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
- 
-               for (j = 0; j < imu->nr_bvecs; j++)
-                       unpin_user_page(imu->bvec[j].bv_page);
- 
-               if (imu->acct_pages)
-                       io_unaccount_mem(ctx, imu->acct_pages);
-               kvfree(imu->bvec);
-               imu->nr_bvecs = 0;
-       }
- 
-       kfree(ctx->user_bufs);
-       ctx->user_bufs = NULL;
-       ctx->nr_user_bufs = 0;
-       return 0;
+       ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
+       if (!ret)
+               __io_sqe_buffers_unregister(ctx);
+       return ret;
   }
   
   static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
@@@ -8166,7 -8197,7 +8200,7 @@@ static bool headpage_already_acct(struc
   
         /* check previously registered pages */
         for (i = 0; i < ctx->nr_user_bufs; i++) {
-               struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+               struct io_mapped_ubuf *imu = ctx->user_bufs[i];
   
                 for (j = 0; j < imu->nr_bvecs; j++) {
                         if (!PageCompound(imu->bvec[j].bv_page))
@@@ -8211,9 -8242,10 +8245,10 @@@ static int io_buffer_account_pin(struc
   }
   
   static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-                                 struct io_mapped_ubuf *imu,
+                                 struct io_mapped_ubuf **pimu,
                                   struct page **last_hpage)
   {
+       struct io_mapped_ubuf *imu = NULL;
         struct vm_area_struct **vmas = NULL;
         struct page **pages = NULL;
         unsigned long off, start, end, ubuf;
@@@ -8225,6 -8257,7 +8260,7 @@@
         start = ubuf >> PAGE_SHIFT;
         nr_pages = end - start;
   
+       *pimu = NULL;
         ret = -ENOMEM;
   
         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
@@@ -8236,9 -8269,8 +8272,8 @@@
         if (!vmas)
                 goto done;
   
-       imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
-                                  GFP_KERNEL);
-       if (!imu->bvec)
+       imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+       if (!imu)
                 goto done;
   
         ret = 0;
@@@ -8267,14 -8299,12 +8302,12 @@@
                  */
                 if (pret > 0)
                         unpin_user_pages(pages, pret);
-               kvfree(imu->bvec);
                 goto done;
         }
   
         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
         if (ret) {
                 unpin_user_pages(pages, pret);
-               kvfree(imu->bvec);
                 goto done;
         }
   
@@@ -8292,10 -8322,13 +8325,13 @@@
         }
         /* store original address for later verification */
         imu->ubuf = ubuf;
-       imu->len = iov->iov_len;
+       imu->ubuf_end = ubuf + iov->iov_len;
         imu->nr_bvecs = nr_pages;
+       *pimu = imu;
         ret = 0;
   done:
+       if (ret)
+               kvfree(imu);
         kvfree(pages);
         kvfree(vmas);
         return ret;
@@@ -8303,21 -8336,14 +8339,14 @@@
   
   static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
   {
-       if (ctx->user_bufs)
-               return -EBUSY;
-       if (!nr_args || nr_args > UIO_MAXIOV)
-               return -EINVAL;
- 
-       ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
-                                       GFP_KERNEL);
-       if (!ctx->user_bufs)
-               return -ENOMEM;
- 
-       return 0;
+       ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
+       return ctx->user_bufs ? 0 : -ENOMEM;
   }
   
   static int io_buffer_validate(struct iovec *iov)
   {
+       unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
+ 
         /*
          * Don't impose further limits on the size and buffer
          * constraints here, we'll -EINVAL later when IO is
@@@ -8330,44 -8356,123 +8359,123 @@@
         if (iov->iov_len > SZ_1G)
                 return -EFAULT;
   
+       if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
+               return -EOVERFLOW;
+ 
         return 0;
   }
   
   static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
-                                  unsigned int nr_args)
+                                  unsigned int nr_args, u64 __user *tags)
   {
+       struct page *last_hpage = NULL;
+       struct io_rsrc_data *data;
         int i, ret;
         struct iovec iov;
-       struct page *last_hpage = NULL;
   
-       ret = io_buffers_map_alloc(ctx, nr_args);
+       if (ctx->user_bufs)
+               return -EBUSY;
+       if (!nr_args || nr_args > UIO_MAXIOV)
+               return -EINVAL;
+       ret = io_rsrc_node_switch_start(ctx);
         if (ret)
                 return ret;
+       data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
+       if (!data)
+               return -ENOMEM;
+       ret = io_buffers_map_alloc(ctx, nr_args);
+       if (ret) {
+               kfree(data);
+               return ret;
+       }
   
-       for (i = 0; i < nr_args; i++) {
-               struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+       for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+               u64 tag = 0;
   
+               if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
+                       ret = -EFAULT;
+                       break;
+               }
                 ret = io_copy_iov(ctx, &iov, arg, i);
                 if (ret)
                         break;
- 
                 ret = io_buffer_validate(&iov);
                 if (ret)
                         break;
   
-               ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
+               ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
+                                            &last_hpage);
                 if (ret)
                         break;
- 
-               ctx->nr_user_bufs++;
+               data->tags[i] = tag;
         }
   
-       if (ret)
-               io_sqe_buffers_unregister(ctx);
+       WARN_ON_ONCE(ctx->buf_data);
   
+       ctx->buf_data = data;
+       if (ret)
+               __io_sqe_buffers_unregister(ctx);
+       else
+               io_rsrc_node_switch(ctx, NULL);
         return ret;
   }
   
+ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+                                  struct io_uring_rsrc_update2 *up,
+                                  unsigned int nr_args)
+ {
+       u64 __user *tags = u64_to_user_ptr(up->tags);
+       struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
+       struct page *last_hpage = NULL;
+       bool needs_switch = false;
+       __u32 done;
+       int i, err;
+ 
+       if (!ctx->buf_data)
+               return -ENXIO;
+       if (up->offset + nr_args > ctx->nr_user_bufs)
+               return -EINVAL;
+ 
+       for (done = 0; done < nr_args; done++) {
+               struct io_mapped_ubuf *imu;
+               int offset = up->offset + done;
+               u64 tag = 0;
+ 
+               err = io_copy_iov(ctx, &iov, iovs, done);
+               if (err)
+                       break;
+               if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
+                       err = -EFAULT;
+                       break;
+               }
+               err = io_buffer_validate(&iov);
+               if (err)
+                       break;
+               err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
+               if (err)
+                       break;
+ 
+               i = array_index_nospec(offset, ctx->nr_user_bufs);
+               if (ctx->user_bufs[i]) {
+                       err = io_queue_rsrc_removal(ctx->buf_data, offset,
+                                                   ctx->rsrc_node, ctx->user_bufs[i]);
+                       if (unlikely(err)) {
+                               io_buffer_unmap(ctx, &imu);
+                               break;
+                       }
+                       ctx->user_bufs[i] = NULL;
+                       needs_switch = true;
+               }
+ 
+               ctx->user_bufs[i] = imu;
+               ctx->buf_data->tags[offset] = tag;
+       }
+ 
+       if (needs_switch)
+               io_rsrc_node_switch(ctx, ctx->buf_data);
+       return done ? done : err;
+ }
+ 
   static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
   {
         __s32 __user *fds = arg;
@@@ -8434,30 -8539,23 +8542,23 @@@ static void io_req_caches_free(struct i
                 submit_state->free_reqs = 0;
         }
   
-       spin_lock_irq(&ctx->completion_lock);
-       list_splice_init(&cs->locked_free_list, &cs->free_list);
-       cs->locked_free_nr = 0;
-       spin_unlock_irq(&ctx->completion_lock);
- 
+       io_flush_cached_locked_reqs(ctx, cs);
         io_req_cache_free(&cs->free_list, NULL);
- 
         mutex_unlock(&ctx->uring_lock);
   }
   
- static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+ static bool io_wait_rsrc_data(struct io_rsrc_data *data)
   {
-       /*
-        * Some may use context even when all refs and requests have been put,
-        * and they are free to do so while still holding uring_lock or
-        * completion_lock, see __io_req_task_submit(). Wait for them to finish.
-        */
-       mutex_lock(&ctx->uring_lock);
-       mutex_unlock(&ctx->uring_lock);
-       spin_lock_irq(&ctx->completion_lock);
-       spin_unlock_irq(&ctx->completion_lock);
+       if (!data)
+               return false;
+       if (!atomic_dec_and_test(&data->refs))
+               wait_for_completion(&data->done);
+       return true;
+ }
   
+ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+ {
         io_sq_thread_finish(ctx);
-       io_sqe_buffers_unregister(ctx);
   
         if (ctx->mm_account) {
                 mmdrop(ctx->mm_account);
@@@ -8465,10 -8563,27 +8566,27 @@@
         }
   
         mutex_lock(&ctx->uring_lock);
-       io_sqe_files_unregister(ctx);
+       if (io_wait_rsrc_data(ctx->buf_data))
+               __io_sqe_buffers_unregister(ctx);
+       if (io_wait_rsrc_data(ctx->file_data))
+               __io_sqe_files_unregister(ctx);
+       if (ctx->rings)
+               __io_cqring_overflow_flush(ctx, true);
         mutex_unlock(&ctx->uring_lock);
         io_eventfd_unregister(ctx);
         io_destroy_buffers(ctx);
+       if (ctx->sq_creds)
+               put_cred(ctx->sq_creds);
+ 
+       /* there are no registered resources left, nobody uses it */
+       if (ctx->rsrc_node)
+               io_rsrc_node_destroy(ctx->rsrc_node);
+       if (ctx->rsrc_backup_node)
+               io_rsrc_node_destroy(ctx->rsrc_backup_node);
+       flush_delayed_work(&ctx->rsrc_put_work);
+ 
+       WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
+       WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
   
   #if defined(CONFIG_UNIX)
         if (ctx->ring_sock) {
@@@ -8568,6 -8683,13 +8686,13 @@@ static void io_tctx_exit_cb(struct call
         complete(&work->completion);
   }
   
+ static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+ {
+       struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+ 
+       return req->ctx == data;
+ }
+ 
   static void io_ring_exit_work(struct work_struct *work)
   {
         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
@@@ -8576,14 -8698,6 +8701,6 @@@
         struct io_tctx_node *node;
         int ret;
   
-       /* prevent SQPOLL from submitting new requests */
-       if (ctx->sq_data) {
-               io_sq_thread_park(ctx->sq_data);
-               list_del_init(&ctx->sqd_list);
-               io_sqd_update_thread_idle(ctx->sq_data);
-               io_sq_thread_unpark(ctx->sq_data);
-       }
- 
         /*
          * If we're doing polled IO and end up having requests being
          * submitted async (out-of-line), then completions can come in while
@@@ -8592,19 -8706,38 +8709,38 @@@
          */
         do {
                 io_uring_try_cancel_requests(ctx, NULL, NULL);
+               if (ctx->sq_data) {
+                       struct io_sq_data *sqd = ctx->sq_data;
+                       struct task_struct *tsk;
+ 
+                       io_sq_thread_park(sqd);
+                       tsk = sqd->thread;
+                       if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
+                               io_wq_cancel_cb(tsk->io_uring->io_wq,
+                                               io_cancel_ctx_cb, ctx, true);
+                       io_sq_thread_unpark(sqd);
+               }
   
                 WARN_ON_ONCE(time_after(jiffies, timeout));
         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
   
+       init_completion(&exit.completion);
+       init_task_work(&exit.task_work, io_tctx_exit_cb);
+       exit.ctx = ctx;
+       /*
+        * Some may use context even when all refs and requests have been put,
+        * and they are free to do so while still holding uring_lock or
+        * completion_lock, see __io_req_task_submit(). Apart from other work,
+        * this lock/unlock section also waits them to finish.
+        */
         mutex_lock(&ctx->uring_lock);
         while (!list_empty(&ctx->tctx_list)) {
                 WARN_ON_ONCE(time_after(jiffies, timeout));
   
                 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
                                         ctx_node);
-               exit.ctx = ctx;
-               init_completion(&exit.completion);
-               init_task_work(&exit.task_work, io_tctx_exit_cb);
+               /* don't spin on a single task if cancellation failed */
+               list_rotate_left(&ctx->tctx_list);
                 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
                 if (WARN_ON_ONCE(ret))
                         continue;
@@@ -8612,10 -8745,11 +8748,11 @@@
   
                 mutex_unlock(&ctx->uring_lock);
                 wait_for_completion(&exit.completion);
-               cond_resched();
                 mutex_lock(&ctx->uring_lock);
         }
         mutex_unlock(&ctx->uring_lock);
+       spin_lock_irq(&ctx->completion_lock);
+       spin_unlock_irq(&ctx->completion_lock);
   
         io_ring_ctx_free(ctx);
   }
@@@ -8649,10 -8783,8 +8786,8 @@@ static void io_ring_ctx_wait_and_kill(s
   
         mutex_lock(&ctx->uring_lock);
         percpu_ref_kill(&ctx->refs);
-       /* if force is set, the ring is going away. always drop after that */
-       ctx->cq_overflow_flushed = 1;
         if (ctx->rings)
-               __io_cqring_overflow_flush(ctx, true, NULL, NULL);
+               __io_cqring_overflow_flush(ctx, true);
         xa_for_each(&ctx->personalities, index, creds)
                 io_unregister_personality(ctx, index);
         mutex_unlock(&ctx->uring_lock);
@@@ -8728,21 -8860,12 +8863,12 @@@ static bool io_cancel_defer_files(struc
         while (!list_empty(&list)) {
                 de = list_first_entry(&list, struct io_defer_entry, list);
                 list_del_init(&de->list);
-               req_set_fail_links(de->req);
-               io_put_req(de->req);
-               io_req_complete(de->req, -ECANCELED);
+               io_req_complete_failed(de->req, -ECANCELED);
                 kfree(de);
         }
         return true;
   }
   
- static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
- {
-       struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- 
-       return req->ctx == data;
- }
- 
   static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
   {
         struct io_tctx_node *node;
@@@ -8804,53 -8927,13 +8930,13 @@@ static void io_uring_try_cancel_request
                 ret |= io_kill_timeouts(ctx, task, files);
                 ret |= io_run_task_work();
                 ret |= io_run_ctx_fallback(ctx);
-               io_cqring_overflow_flush(ctx, true, task, files);
                 if (!ret)
                         break;
                 cond_resched();
         }
   }
   
- static int io_uring_count_inflight(struct io_ring_ctx *ctx,
-                                  struct task_struct *task,
-                                  struct files_struct *files)
- {
-       struct io_kiocb *req;
-       int cnt = 0;
- 
-       spin_lock_irq(&ctx->inflight_lock);
-       list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
-               cnt += io_match_task(req, task, files);
-       spin_unlock_irq(&ctx->inflight_lock);
-       return cnt;
- }
- 
- static void io_uring_cancel_files(struct io_ring_ctx *ctx,
-                                 struct task_struct *task,
-                                 struct files_struct *files)
- {
-       while (!list_empty_careful(&ctx->inflight_list)) {
-               DEFINE_WAIT(wait);
-               int inflight;
- 
-               inflight = io_uring_count_inflight(ctx, task, files);
-               if (!inflight)
-                       break;
- 
-               io_uring_try_cancel_requests(ctx, task, files);
- 
-               prepare_to_wait(&task->io_uring->wait, &wait,
-                               TASK_UNINTERRUPTIBLE);
-               if (inflight == io_uring_count_inflight(ctx, task, files))
-                       schedule();
-               finish_wait(&task->io_uring->wait, &wait);
-       }
- }
- 
- /*
-  * Note that this task has used io_uring. We use it for cancelation purposes.
-  */
- static int io_uring_add_task_file(struct io_ring_ctx *ctx)
+ static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
   {
         struct io_uring_task *tctx = current->io_uring;
         struct io_tctx_node *node;
@@@ -8862,32 -8945,40 +8948,40 @@@
                         return ret;
                 tctx = current->io_uring;
         }
-       if (tctx->last != ctx) {
-               void *old = xa_load(&tctx->xa, (unsigned long)ctx);
- 
-               if (!old) {
-                       node = kmalloc(sizeof(*node), GFP_KERNEL);
-                       if (!node)
-                               return -ENOMEM;
-                       node->ctx = ctx;
-                       node->task = current;
- 
-                       ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
-                                               node, GFP_KERNEL));
-                       if (ret) {
-                               kfree(node);
-                               return ret;
-                       }
+       if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
+               node = kmalloc(sizeof(*node), GFP_KERNEL);
+               if (!node)
+                       return -ENOMEM;
+               node->ctx = ctx;
+               node->task = current;
   
-                       mutex_lock(&ctx->uring_lock);
-                       list_add(&node->ctx_node, &ctx->tctx_list);
-                       mutex_unlock(&ctx->uring_lock);
+               ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
+                                       node, GFP_KERNEL));
+               if (ret) {
+                       kfree(node);
+                       return ret;
                 }
-               tctx->last = ctx;
+ 
+               mutex_lock(&ctx->uring_lock);
+               list_add(&node->ctx_node, &ctx->tctx_list);
+               mutex_unlock(&ctx->uring_lock);
         }
+       tctx->last = ctx;
         return 0;
   }
   
+ /*
+  * Note that this task has used io_uring. We use it for cancelation purposes.
+  */
+ static inline int io_uring_add_task_file(struct io_ring_ctx *ctx)
+ {
+       struct io_uring_task *tctx = current->io_uring;
+ 
+       if (likely(tctx && tctx->last == ctx))
+               return 0;
+       return __io_uring_add_task_file(ctx);
+ }
+ 
   /*
    * Remove this io_uring_file -> task mapping.
    */
@@@ -8927,86 -9018,48 +9021,48 @@@ static void io_uring_clean_tctx(struct 
         }
   }
   
- static s64 tctx_inflight(struct io_uring_task *tctx)
+ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
   {
+       if (tracked)
+               return atomic_read(&tctx->inflight_tracked);
         return percpu_counter_sum(&tctx->inflight);
   }
   
- static void io_sqpoll_cancel_cb(struct callback_head *cb)
- {
-       struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
-       struct io_ring_ctx *ctx = work->ctx;
-       struct io_sq_data *sqd = ctx->sq_data;
- 
-       if (sqd->thread)
-               io_uring_cancel_sqpoll(ctx);
-       complete(&work->completion);
- }
- 
- static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
- {
-       struct io_sq_data *sqd = ctx->sq_data;
-       struct io_tctx_exit work = { .ctx = ctx, };
-       struct task_struct *task;
- 
-       io_sq_thread_park(sqd);
-       list_del_init(&ctx->sqd_list);
-       io_sqd_update_thread_idle(sqd);
-       task = sqd->thread;
-       if (task) {
-               init_completion(&work.completion);
-               init_task_work(&work.task_work, io_sqpoll_cancel_cb);
-               io_task_work_add_head(&sqd->park_task_work, &work.task_work);
-               wake_up_process(task);
-       }
-       io_sq_thread_unpark(sqd);
- 
-       if (task)
-               wait_for_completion(&work.completion);
- }
- 
- void __io_uring_files_cancel(struct files_struct *files)
+ static void io_uring_try_cancel(struct files_struct *files)
   {
         struct io_uring_task *tctx = current->io_uring;
         struct io_tctx_node *node;
         unsigned long index;
   
-       /* make sure overflow events are dropped */
-       atomic_inc(&tctx->in_idle);
         xa_for_each(&tctx->xa, index, node) {
                 struct io_ring_ctx *ctx = node->ctx;
   
-               if (ctx->sq_data) {
-                       io_sqpoll_cancel_sync(ctx);
-                       continue;
-               }
-               io_uring_cancel_files(ctx, current, files);
-               if (!files)
-                       io_uring_try_cancel_requests(ctx, current, NULL);
+               /* sqpoll task will cancel all its requests */
+               if (!ctx->sq_data)
+                       io_uring_try_cancel_requests(ctx, current, files);
         }
-       atomic_dec(&tctx->in_idle);
- 
-       if (files)
-               io_uring_clean_tctx(tctx);
   }
   
   /* should only be called by SQPOLL task */
- static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
+ static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
   {
-       struct io_sq_data *sqd = ctx->sq_data;
         struct io_uring_task *tctx = current->io_uring;
+       struct io_ring_ctx *ctx;
         s64 inflight;
         DEFINE_WAIT(wait);
   
-       WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current);
+       if (!current->io_uring)
+               return;
+       WARN_ON_ONCE(!sqd || sqd->thread != current);
   
         atomic_inc(&tctx->in_idle);
         do {
                 /* read completions before cancelations */
-               inflight = tctx_inflight(tctx);
+               inflight = tctx_inflight(tctx, false);
                 if (!inflight)
                         break;
-               io_uring_try_cancel_requests(ctx, current, NULL);
+               list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+                       io_uring_try_cancel_requests(ctx, current, NULL);
   
                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
                 /*
@@@ -9014,7 -9067,7 +9070,7 @@@
                  * avoids a race where a completion comes in before we did
                  * prepare_to_wait().
                  */
-               if (inflight == tctx_inflight(tctx))
+               if (inflight == tctx_inflight(tctx, false))
                         schedule();
                 finish_wait(&tctx->wait, &wait);
         } while (1);
@@@ -9025,7 -9078,7 +9081,7 @@@
    * Find any io_uring fd that this task has registered or done IO on, and cancel
    * requests.
    */
- void __io_uring_task_cancel(void)
+ void __io_uring_cancel(struct files_struct *files)
   {
         struct io_uring_task *tctx = current->io_uring;
         DEFINE_WAIT(wait);
@@@ -9033,15 -9086,12 +9089,12 @@@
   
         /* make sure overflow events are dropped */
         atomic_inc(&tctx->in_idle);
-       __io_uring_files_cancel(NULL);
- 
         do {
                 /* read completions before cancelations */
-               inflight = tctx_inflight(tctx);
+               inflight = tctx_inflight(tctx, !!files);
                 if (!inflight)
                         break;
-               __io_uring_files_cancel(NULL);
- 
+               io_uring_try_cancel(files);
                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
   
                 /*
@@@ -9049,16 -9099,17 +9102,17 @@@
                  * avoids a race where a completion comes in before we did
                  * prepare_to_wait().
                  */
-               if (inflight == tctx_inflight(tctx))
+               if (inflight == tctx_inflight(tctx, !!files))
                         schedule();
                 finish_wait(&tctx->wait, &wait);
         } while (1);
- 
         atomic_dec(&tctx->in_idle);
   
         io_uring_clean_tctx(tctx);
-       /* all current's requests should be gone, we can kill tctx */
-       __io_uring_free(current);
+       if (!files) {
+               /* for exec all current's requests should be gone, kill tctx */
+               __io_uring_free(current);
+       }
   }
   
   static void *io_uring_validate_mmap_request(struct file *file,
@@@ -9184,31 -9235,31 +9238,31 @@@ SYSCALL_DEFINE6(io_uring_enter, unsigne
                 size_t, argsz)
   {
         struct io_ring_ctx *ctx;
-       long ret = -EBADF;
         int submitted = 0;
         struct fd f;
+       long ret;
   
         io_run_task_work();
   
-       if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-                       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
+       if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
+                              IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
                 return -EINVAL;
   
         f = fdget(fd);
-       if (!f.file)
+       if (unlikely(!f.file))
                 return -EBADF;
   
         ret = -EOPNOTSUPP;
-       if (f.file->f_op != &io_uring_fops)
+       if (unlikely(f.file->f_op != &io_uring_fops))
                 goto out_fput;
   
         ret = -ENXIO;
         ctx = f.file->private_data;
-       if (!percpu_ref_tryget(&ctx->refs))
+       if (unlikely(!percpu_ref_tryget(&ctx->refs)))
                 goto out_fput;
   
         ret = -EBADFD;
-       if (ctx->flags & IORING_SETUP_R_DISABLED)
+       if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
                 goto out;
   
         /*
@@@ -9218,7 -9269,7 +9272,7 @@@
          */
         ret = 0;
         if (ctx->flags & IORING_SETUP_SQPOLL) {
-               io_cqring_overflow_flush(ctx, false, NULL, NULL);
+               io_cqring_overflow_flush(ctx, false);
   
                 ret = -EOWNERDEAD;
                 if (unlikely(ctx->sq_data->thread == NULL)) {
@@@ -9331,7 -9382,7 +9385,7 @@@ static void __io_uring_show_fdinfo(stru
         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
-               struct file *f = *io_fixed_file_slot(ctx->file_data, i);
+               struct file *f = io_file_from_index(ctx, i);
   
                 if (f)
                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
@@@ -9340,10 -9391,10 +9394,10 @@@
         }
         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
-               struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
+               struct io_mapped_ubuf *buf = ctx->user_bufs[i];
+               unsigned int len = buf->ubuf_end - buf->ubuf;
   
-               seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
-                                               (unsigned int) buf->len);
+               seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
         }
         if (has_lock && !xa_empty(&ctx->personalities)) {
                 unsigned long index;
@@@ -9552,6 -9603,9 +9606,9 @@@ static int io_uring_create(unsigned ent
         ret = io_sq_offload_create(ctx, p);
         if (ret)
                 goto err;
+       /* always set a rsrc node */
+       io_rsrc_node_switch_start(ctx);
+       io_rsrc_node_switch(ctx, NULL);
   
         memset(&p->sq_off, 0, sizeof(p->sq_off));
         p->sq_off.head = offsetof(struct io_rings, sq.head);
@@@ -9777,14 -9831,96 +9834,96 @@@ static int io_register_enable_rings(str
         return 0;
   }
   
+ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
+                                    struct io_uring_rsrc_update2 *up,
+                                    unsigned nr_args)
+ {
+       __u32 tmp;
+       int err;
+ 
+       if (up->resv)
+               return -EINVAL;
+       if (check_add_overflow(up->offset, nr_args, &tmp))
+               return -EOVERFLOW;
+       err = io_rsrc_node_switch_start(ctx);
+       if (err)
+               return err;
+ 
+       switch (type) {
+       case IORING_RSRC_FILE:
+               return __io_sqe_files_update(ctx, up, nr_args);
+       case IORING_RSRC_BUFFER:
+               return __io_sqe_buffers_update(ctx, up, nr_args);
+       }
+       return -EINVAL;
+ }
+ 
+ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
+                                   unsigned nr_args)
+ {
+       struct io_uring_rsrc_update2 up;
+ 
+       if (!nr_args)
+               return -EINVAL;
+       memset(&up, 0, sizeof(up));
+       if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
+               return -EFAULT;
+       return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
+ }
+ 
+ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
+                                  unsigned size)
+ {
+       struct io_uring_rsrc_update2 up;
+ 
+       if (size != sizeof(up))
+               return -EINVAL;
+       if (copy_from_user(&up, arg, sizeof(up)))
+               return -EFAULT;
+       if (!up.nr)
+               return -EINVAL;
+       return __io_register_rsrc_update(ctx, up.type, &up, up.nr);
+ }
+ 
+ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+                           unsigned int size)
+ {
+       struct io_uring_rsrc_register rr;
+ 
+       /* keep it extendible */
+       if (size != sizeof(rr))
+               return -EINVAL;
+ 
+       memset(&rr, 0, sizeof(rr));
+       if (copy_from_user(&rr, arg, size))
+               return -EFAULT;
+       if (!rr.nr)
+               return -EINVAL;
+ 
+       switch (rr.type) {
+       case IORING_RSRC_FILE:
+               return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
+                                            rr.nr, u64_to_user_ptr(rr.tags));
+       case IORING_RSRC_BUFFER:
+               return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
+                                              rr.nr, u64_to_user_ptr(rr.tags));
+       }
+       return -EINVAL;
+ }
+ 
   static bool io_register_op_must_quiesce(int op)
   {
         switch (op) {
+       case IORING_REGISTER_BUFFERS:
+       case IORING_UNREGISTER_BUFFERS:
+       case IORING_REGISTER_FILES:
         case IORING_UNREGISTER_FILES:
         case IORING_REGISTER_FILES_UPDATE:
         case IORING_REGISTER_PROBE:
         case IORING_REGISTER_PERSONALITY:
         case IORING_UNREGISTER_PERSONALITY:
+       case IORING_REGISTER_RSRC:
+       case IORING_REGISTER_RSRC_UPDATE:
                 return false;
         default:
                 return true;
@@@ -9806,6 -9942,14 +9945,14 @@@ static int __io_uring_register(struct i
         if (percpu_ref_is_dying(&ctx->refs))
                 return -ENXIO;
   
+       if (ctx->restricted) {
+               if (opcode >= IORING_REGISTER_LAST)
+                       return -EINVAL;
+               opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
+               if (!test_bit(opcode, ctx->restrictions.register_op))
+                       return -EACCES;
+       }
+ 
         if (io_register_op_must_quiesce(opcode)) {
                 percpu_ref_kill(&ctx->refs);
   
@@@ -9826,30 -9970,17 +9973,17 @@@
                         if (ret < 0)
                                 break;
                 } while (1);
- 
                 mutex_lock(&ctx->uring_lock);
   
                 if (ret) {
-                       percpu_ref_resurrect(&ctx->refs);
-                       goto out_quiesce;
-               }
-       }
- 
-       if (ctx->restricted) {
-               if (opcode >= IORING_REGISTER_LAST) {
-                       ret = -EINVAL;
-                       goto out;
-               }
- 
-               if (!test_bit(opcode, ctx->restrictions.register_op)) {
-                       ret = -EACCES;
-                       goto out;
+                       io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
+                       return ret;
                 }
         }
   
         switch (opcode) {
         case IORING_REGISTER_BUFFERS:
-               ret = io_sqe_buffers_register(ctx, arg, nr_args);
+               ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
                 break;
         case IORING_UNREGISTER_BUFFERS:
                 ret = -EINVAL;
@@@ -9858,7 -9989,7 +9992,7 @@@
                 ret = io_sqe_buffers_unregister(ctx);
                 break;
         case IORING_REGISTER_FILES:
-               ret = io_sqe_files_register(ctx, arg, nr_args);
+               ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
                 break;
         case IORING_UNREGISTER_FILES:
                 ret = -EINVAL;
@@@ -9867,7 -9998,7 +10001,7 @@@
                 ret = io_sqe_files_unregister(ctx);
                 break;
         case IORING_REGISTER_FILES_UPDATE:
-               ret = io_sqe_files_update(ctx, arg, nr_args);
+               ret = io_register_files_update(ctx, arg, nr_args);
                 break;
         case IORING_REGISTER_EVENTFD:
         case IORING_REGISTER_EVENTFD_ASYNC:
@@@ -9915,16 -10046,20 +10049,20 @@@
         case IORING_REGISTER_RESTRICTIONS:
                 ret = io_register_restrictions(ctx, arg, nr_args);
                 break;
+       case IORING_REGISTER_RSRC:
+               ret = io_register_rsrc(ctx, arg, nr_args);
+               break;
+       case IORING_REGISTER_RSRC_UPDATE:
+               ret = io_register_rsrc_update(ctx, arg, nr_args);
+               break;
         default:
                 ret = -EINVAL;
                 break;
         }
   
- out:
         if (io_register_op_must_quiesce(opcode)) {
                 /* bring the ctx back to life */
                 percpu_ref_reinit(&ctx->refs);
- out_quiesce:
                 reinit_completion(&ctx->ref_comp);
         }
         return ret;
diff --combined kernel/fork.c

index f98de49,224c831..0f1992d
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -927,6 -927,7 +927,7 @@@ static struct task_struct *dup_task_str
         tsk->splice_pipe = NULL;
         tsk->task_frag.page = NULL;
         tsk->wake_q.next = NULL;
+       tsk->pf_io_worker = NULL;
   
         account_kernel_stack(tsk, 1);
   
@@@ -1941,7 -1942,7 +1942,7 @@@ static __latent_entropy struct task_str
         recalc_sigpending();
         spin_unlock_irq(&current->sighand->siglock);
         retval = -ERESTARTNOINTR;
-       if (signal_pending(current))
+       if (task_sigpending(current))
                 goto fork_out;
   
         retval = -ENOMEM;
@@@ -2009,7 -2010,6 +2010,7 @@@
         spin_lock_init(&p->alloc_lock);
   
         init_sigpending(&p->pending);
+ +      p->sigqueue_cache = NULL;
   
         p->utime = p->stime = p->gtime = 0;
   #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
@@@ -2085,7 -2085,7 +2086,7 @@@
         if (retval)
                 goto bad_fork_cleanup_policy;
   
- -      retval = perf_event_init_task(p);
+ +      retval = perf_event_init_task(p, clone_flags);
         if (retval)
                 goto bad_fork_cleanup_policy;
         retval = audit_alloc(p);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 28 Apr 2021 21:56:09 +0000 (14:56 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 28 Apr 2021 21:56:09 +0000 (14:56 -0700)
		1	2
fs/io_uring.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history