io_uring: always wait for sqd exited when stopping SQPOLL thread

[linux-2.6-microblaze.git] / fs / io_uring.c
diff --git a/fs/io_uring.c b/fs/io_uring.c

index 904bf0f..62f998b 100644 (file)
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -74,13 +74,11 @@
  #include <linux/fsnotify.h>
  #include <linux/fadvise.h>
  #include <linux/eventpoll.h>
-#include <linux/fs_struct.h>
  #include <linux/splice.h>
  #include <linux/task_work.h>
  #include <linux/pagemap.h>
  #include <linux/io_uring.h>
-#include <linux/blk-cgroup.h>
-#include <linux/audit.h>
+#include <linux/freezer.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/io_uring.h>
@@ -260,12 +258,11 @@ enum {
  
  struct io_sq_data {
         refcount_t              refs;
-       struct mutex            lock;
+       struct rw_semaphore     rw_lock;
  
         /* ctx's that are using this sqd */
         struct list_head        ctx_list;
         struct list_head        ctx_new_list;
-       struct mutex            ctx_lock;
  
         struct task_struct      *thread;
         struct wait_queue_head  wait;
@@ -276,7 +273,6 @@ struct io_sq_data {
  
         unsigned long           state;
         struct completion       startup;
-       struct completion       completion;
         struct completion       exited;
  };
  
@@ -338,8 +334,6 @@ struct io_ring_ctx {
                 unsigned int            drain_next: 1;
                 unsigned int            eventfd_async: 1;
                 unsigned int            restricted: 1;
-               unsigned int            sqo_dead: 1;
-               unsigned int            sqo_exec: 1;
  
                 /*
                  * Ring buffer of indices into array of io_uring_sqe, which is
@@ -380,14 +374,10 @@ struct io_ring_ctx {
  
         struct io_rings *rings;
  
-       /*
-        * For SQPOLL usage
-        */
-       struct task_struct      *sqo_task;
-
         /* Only used for accounting purposes */
         struct mm_struct        *mm_account;
  
+       const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
         struct io_sq_data       *sq_data;       /* if using sq thread polling */
  
         struct wait_queue_head  sqo_sq_wait;
@@ -416,7 +406,8 @@ struct io_ring_ctx {
  
         struct idr              io_buffer_idr;
  
-       struct idr              personality_idr;
+       struct xarray           personalities;
+       u32                     pers_next;
  
         struct {
                 unsigned                cached_cq_tail;
@@ -462,6 +453,7 @@ struct io_ring_ctx {
  
         /* Keep this last, we don't need it for the fast path */
         struct work_struct              exit_work;
+       struct list_head                tctx_list;
  };
  
  /*
@@ -688,7 +680,6 @@ enum {
         REQ_F_POLLED_BIT,
         REQ_F_BUFFER_SELECTED_BIT,
         REQ_F_NO_FILE_TABLE_BIT,
-       REQ_F_WORK_INITIALIZED_BIT,
         REQ_F_LTIMEOUT_ACTIVE_BIT,
         REQ_F_COMPLETE_INLINE_BIT,
  
@@ -712,7 +703,7 @@ enum {
  
         /* fail rest of links */
         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
-       /* on inflight list */
+       /* on inflight list, should be cancelled and waited on exit reliably */
         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
         /* read/write uses file position */
         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
@@ -730,8 +721,6 @@ enum {
         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
         /* doesn't need file table for this request */
         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
-       /* io_wq_work is initialized */
-       REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
         /* linked timeout is active, i.e. prepared by link's head */
         REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
         /* completion is deferred through io_comp_state */
@@ -816,6 +805,12 @@ struct io_kiocb {
         struct io_wq_work               work;
  };
  
+struct io_tctx_node {
+       struct list_head        ctx_node;
+       struct task_struct      *task;
+       struct io_ring_ctx      *ctx;
+};
+
  struct io_defer_entry {
         struct list_head        list;
         struct io_kiocb         *req;
@@ -990,6 +985,8 @@ static const struct io_op_def io_op_defs[] = {
         [IORING_OP_UNLINKAT] = {},
  };
  
+static bool io_disarm_next(struct io_kiocb *req);
+static void io_uring_del_task_file(unsigned long index);
  static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                          struct task_struct *task,
                                          struct files_struct *files);
@@ -1080,9 +1077,7 @@ static bool io_match_task(struct io_kiocb *head,
                 return true;
  
         io_for_each_link(req, head) {
-               if (!(req->flags & REQ_F_WORK_INITIALIZED))
-                       continue;
-               if (req->file && req->file->f_op == &io_uring_fops)
+               if (req->flags & REQ_F_INFLIGHT)
                         return true;
                 if (req->task->files == files)
                         return true;
@@ -1096,24 +1091,6 @@ static inline void req_set_fail_links(struct io_kiocb *req)
                 req->flags |= REQ_F_FAIL_LINK;
  }
  
-static inline void __io_req_init_async(struct io_kiocb *req)
-{
-       memset(&req->work, 0, sizeof(req->work));
-       req->flags |= REQ_F_WORK_INITIALIZED;
-}
-
-/*
- * Note: must call io_req_init_async() for the first time you
- * touch any members of io_wq_work.
- */
-static inline void io_req_init_async(struct io_kiocb *req)
-{
-       if (req->flags & REQ_F_WORK_INITIALIZED)
-               return;
-
-       __io_req_init_async(req);
-}
-
  static void io_ring_ctx_ref_free(struct percpu_ref *ref)
  {
         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1162,7 +1139,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
         init_completion(&ctx->ref_comp);
         init_completion(&ctx->sq_thread_comp);
         idr_init(&ctx->io_buffer_idr);
-       idr_init(&ctx->personality_idr);
+       xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
         mutex_init(&ctx->uring_lock);
         init_waitqueue_head(&ctx->wait);
         spin_lock_init(&ctx->completion_lock);
@@ -1175,6 +1152,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
         init_llist_head(&ctx->rsrc_put_llist);
+       INIT_LIST_HEAD(&ctx->tctx_list);
         INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
         INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
         return ctx;
@@ -1196,37 +1174,11 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
         return false;
  }
  
-static void io_req_clean_work(struct io_kiocb *req)
-{
-       if (!(req->flags & REQ_F_WORK_INITIALIZED))
-               return;
-
-       if (req->work.creds) {
-               put_cred(req->work.creds);
-               req->work.creds = NULL;
-       }
-       if (req->flags & REQ_F_INFLIGHT) {
-               struct io_ring_ctx *ctx = req->ctx;
-               struct io_uring_task *tctx = req->task->io_uring;
-               unsigned long flags;
-
-               spin_lock_irqsave(&ctx->inflight_lock, flags);
-               list_del(&req->inflight_entry);
-               spin_unlock_irqrestore(&ctx->inflight_lock, flags);
-               req->flags &= ~REQ_F_INFLIGHT;
-               if (atomic_read(&tctx->in_idle))
-                       wake_up(&tctx->wait);
-       }
-
-       req->flags &= ~REQ_F_WORK_INITIALIZED;
-}
-
  static void io_req_track_inflight(struct io_kiocb *req)
  {
         struct io_ring_ctx *ctx = req->ctx;
  
         if (!(req->flags & REQ_F_INFLIGHT)) {
-               io_req_init_async(req);
                 req->flags |= REQ_F_INFLIGHT;
  
                 spin_lock_irq(&ctx->inflight_lock);
@@ -1240,7 +1192,8 @@ static void io_prep_async_work(struct io_kiocb *req)
         const struct io_op_def *def = &io_op_defs[req->opcode];
         struct io_ring_ctx *ctx = req->ctx;
  
-       io_req_init_async(req);
+       if (!req->work.creds)
+               req->work.creds = get_current_cred();
  
         if (req->flags & REQ_F_FORCE_ASYNC)
                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
@@ -1252,8 +1205,6 @@ static void io_prep_async_work(struct io_kiocb *req)
                 if (def->unbound_nonreg_file)
                         req->work.flags |= IO_WQ_WORK_UNBOUND;
         }
-       if (!req->work.creds)
-               req->work.creds = get_current_cred();
  }
  
  static void io_prep_async_link(struct io_kiocb *req)
@@ -1264,7 +1215,7 @@ static void io_prep_async_link(struct io_kiocb *req)
                 io_prep_async_work(cur);
  }
  
-static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
+static void io_queue_async_work(struct io_kiocb *req)
  {
         struct io_ring_ctx *ctx = req->ctx;
         struct io_kiocb *link = io_prep_linked_timeout(req);
@@ -1275,18 +1226,9 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
  
         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
                                         &req->work, req->flags);
-       io_wq_enqueue(tctx->io_wq, &req->work);
-       return link;
-}
-
-static void io_queue_async_work(struct io_kiocb *req)
-{
-       struct io_kiocb *link;
-
         /* init ->work of the whole link before punting */
         io_prep_async_link(req);
-       link = __io_queue_async_work(req);
-
+       io_wq_enqueue(tctx->io_wq, &req->work);
         if (link)
                 io_queue_linked_timeout(link);
  }
@@ -1521,18 +1463,22 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
         return all_flushed;
  }
  
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
                                      struct task_struct *tsk,
                                      struct files_struct *files)
  {
+       bool ret = true;
+
         if (test_bit(0, &ctx->cq_check_overflow)) {
                 /* iopoll syncs against uring_lock, not completion_lock */
                 if (ctx->flags & IORING_SETUP_IOPOLL)
                         mutex_lock(&ctx->uring_lock);
-               __io_cqring_overflow_flush(ctx, force, tsk, files);
+               ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
                 if (ctx->flags & IORING_SETUP_IOPOLL)
                         mutex_unlock(&ctx->uring_lock);
         }
+
+       return ret;
  }
  
  static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
@@ -1580,15 +1526,14 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
         __io_cqring_fill_event(req, res, 0);
  }
  
-static inline void io_req_complete_post(struct io_kiocb *req, long res,
-                                       unsigned int cflags)
+static void io_req_complete_post(struct io_kiocb *req, long res,
+                                unsigned int cflags)
  {
         struct io_ring_ctx *ctx = req->ctx;
         unsigned long flags;
  
         spin_lock_irqsave(&ctx->completion_lock, flags);
         __io_cqring_fill_event(req, res, cflags);
-       io_commit_cqring(ctx);
         /*
          * If we're the last reference to this request, add to our locked
          * free_list cache.
@@ -1596,19 +1541,26 @@ static inline void io_req_complete_post(struct io_kiocb *req, long res,
         if (refcount_dec_and_test(&req->refs)) {
                 struct io_comp_state *cs = &ctx->submit_state.comp;
  
+               if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+                       if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK))
+                               io_disarm_next(req);
+                       if (req->link) {
+                               io_req_task_queue(req->link);
+                               req->link = NULL;
+                       }
+               }
                 io_dismantle_req(req);
                 io_put_task(req->task, 1);
                 list_add(&req->compl.list, &cs->locked_free_list);
                 cs->locked_free_nr++;
         } else
                 req = NULL;
+       io_commit_cqring(ctx);
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
-
         io_cqring_ev_posted(ctx);
-       if (req) {
-               io_queue_next(req);
+
+       if (req)
                 percpu_ref_put(&ctx->refs);
-       }
  }
  
  static void io_req_complete_state(struct io_kiocb *req, long res,
@@ -1714,9 +1666,23 @@ static void io_dismantle_req(struct io_kiocb *req)
                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
         if (req->fixed_rsrc_refs)
                 percpu_ref_put(req->fixed_rsrc_refs);
-       io_req_clean_work(req);
+       if (req->work.creds) {
+               put_cred(req->work.creds);
+               req->work.creds = NULL;
+       }
+
+       if (req->flags & REQ_F_INFLIGHT) {
+               struct io_ring_ctx *ctx = req->ctx;
+               unsigned long flags;
+
+               spin_lock_irqsave(&ctx->inflight_lock, flags);
+               list_del(&req->inflight_entry);
+               spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+               req->flags &= ~REQ_F_INFLIGHT;
+       }
  }
  
+/* must be called somewhat shortly after putting a request */
  static inline void io_put_task(struct task_struct *task, int nr)
  {
         struct io_uring_task *tctx = task->io_uring;
@@ -1746,15 +1712,11 @@ static inline void io_remove_next_linked(struct io_kiocb *req)
         nxt->link = NULL;
  }
  
-static void io_kill_linked_timeout(struct io_kiocb *req)
+static bool io_kill_linked_timeout(struct io_kiocb *req)
+       __must_hold(&req->ctx->completion_lock)
  {
-       struct io_ring_ctx *ctx = req->ctx;
-       struct io_kiocb *link;
+       struct io_kiocb *link = req->link;
         bool cancelled = false;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ctx->completion_lock, flags);
-       link = req->link;
  
         /*
          * Can happen if a linked timeout fired and link had been like
@@ -1769,58 +1731,48 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
                 ret = hrtimer_try_to_cancel(&io->timer);
                 if (ret != -1) {
                         io_cqring_fill_event(link, -ECANCELED);
-                       io_commit_cqring(ctx);
+                       io_put_req_deferred(link, 1);
                         cancelled = true;
                 }
         }
         req->flags &= ~REQ_F_LINK_TIMEOUT;
-       spin_unlock_irqrestore(&ctx->completion_lock, flags);
-
-       if (cancelled) {
-               io_cqring_ev_posted(ctx);
-               io_put_req(link);
-       }
+       return cancelled;
  }
  
-
  static void io_fail_links(struct io_kiocb *req)
+       __must_hold(&req->ctx->completion_lock)
  {
-       struct io_kiocb *link, *nxt;
-       struct io_ring_ctx *ctx = req->ctx;
-       unsigned long flags;
+       struct io_kiocb *nxt, *link = req->link;
  
-       spin_lock_irqsave(&ctx->completion_lock, flags);
-       link = req->link;
         req->link = NULL;
-
         while (link) {
                 nxt = link->link;
                 link->link = NULL;
  
                 trace_io_uring_fail_link(req, link);
                 io_cqring_fill_event(link, -ECANCELED);
-
-               /*
-                * It's ok to free under spinlock as they're not linked anymore,
-                * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
-                * work.fs->lock.
-                */
-               if (link->flags & REQ_F_WORK_INITIALIZED)
-                       io_put_req_deferred(link, 2);
-               else
-                       io_double_put_req(link);
+               io_put_req_deferred(link, 2);
                 link = nxt;
         }
-       io_commit_cqring(ctx);
-       spin_unlock_irqrestore(&ctx->completion_lock, flags);
+}
  
-       io_cqring_ev_posted(ctx);
+static bool io_disarm_next(struct io_kiocb *req)
+       __must_hold(&req->ctx->completion_lock)
+{
+       bool posted = false;
+
+       if (likely(req->flags & REQ_F_LINK_TIMEOUT))
+               posted = io_kill_linked_timeout(req);
+       if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+               posted |= (req->link != NULL);
+               io_fail_links(req);
+       }
+       return posted;
  }
  
  static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
  {
-       if (req->flags & REQ_F_LINK_TIMEOUT)
-               io_kill_linked_timeout(req);
+       struct io_kiocb *nxt;
  
         /*
          * If LINK is set, we have dependent requests in this chain. If we
@@ -1828,14 +1780,22 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
          * dependencies to the next request. In case of failure, fail the rest
          * of the chain.
          */
-       if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
-               struct io_kiocb *nxt = req->link;
+       if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) {
+               struct io_ring_ctx *ctx = req->ctx;
+               unsigned long flags;
+               bool posted;
  
-               req->link = NULL;
-               return nxt;
+               spin_lock_irqsave(&ctx->completion_lock, flags);
+               posted = io_disarm_next(req);
+               if (posted)
+                       io_commit_cqring(req->ctx);
+               spin_unlock_irqrestore(&ctx->completion_lock, flags);
+               if (posted)
+                       io_cqring_ev_posted(ctx);
         }
-       io_fail_links(req);
-       return NULL;
+       nxt = req->link;
+       req->link = NULL;
+       return nxt;
  }
  
  static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
@@ -1845,6 +1805,18 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
         return __io_req_find_next(req);
  }
  
+static void ctx_flush_and_put(struct io_ring_ctx *ctx)
+{
+       if (!ctx)
+               return;
+       if (ctx->submit_state.comp.nr) {
+               mutex_lock(&ctx->uring_lock);
+               io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+               mutex_unlock(&ctx->uring_lock);
+       }
+       percpu_ref_put(&ctx->refs);
+}
+
  static bool __tctx_task_work(struct io_uring_task *tctx)
  {
         struct io_ring_ctx *ctx = NULL;
@@ -1862,30 +1834,20 @@ static bool __tctx_task_work(struct io_uring_task *tctx)
         node = list.first;
         while (node) {
                 struct io_wq_work_node *next = node->next;
-               struct io_ring_ctx *this_ctx;
                 struct io_kiocb *req;
  
                 req = container_of(node, struct io_kiocb, io_task_work.node);
-               this_ctx = req->ctx;
-               req->task_work.func(&req->task_work);
-               node = next;
-
-               if (!ctx) {
-                       ctx = this_ctx;
-               } else if (ctx != this_ctx) {
-                       mutex_lock(&ctx->uring_lock);
-                       io_submit_flush_completions(&ctx->submit_state.comp, ctx);
-                       mutex_unlock(&ctx->uring_lock);
-                       ctx = this_ctx;
+               if (req->ctx != ctx) {
+                       ctx_flush_and_put(ctx);
+                       ctx = req->ctx;
+                       percpu_ref_get(&ctx->refs);
                 }
-       }
  
-       if (ctx && ctx->submit_state.comp.nr) {
-               mutex_lock(&ctx->uring_lock);
-               io_submit_flush_completions(&ctx->submit_state.comp, ctx);
-               mutex_unlock(&ctx->uring_lock);
+               req->task_work.func(&req->task_work);
+               node = next;
         }
  
+       ctx_flush_and_put(ctx);
         return list.first != NULL;
  }
  
@@ -1893,10 +1855,10 @@ static void tctx_task_work(struct callback_head *cb)
  {
         struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
  
+       clear_bit(0, &tctx->task_state);
+
         while (__tctx_task_work(tctx))
                 cond_resched();
-
-       clear_bit(0, &tctx->task_state);
  }
  
  static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
@@ -2010,7 +1972,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
  
         /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
         mutex_lock(&ctx->uring_lock);
-       if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve)
+       if (!(current->flags & PF_EXITING) && !current->in_execve)
                 __io_queue_sqe(req);
         else
                 __io_req_task_cancel(req, -EFAULT);
@@ -2472,23 +2434,32 @@ static bool io_resubmit_prep(struct io_kiocb *req)
                 return false;
         return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
  }
-#endif
  
-static bool io_rw_reissue(struct io_kiocb *req)
+static bool io_rw_should_reissue(struct io_kiocb *req)
  {
-#ifdef CONFIG_BLOCK
         umode_t mode = file_inode(req->file)->i_mode;
+       struct io_ring_ctx *ctx = req->ctx;
  
         if (!S_ISBLK(mode) && !S_ISREG(mode))
                 return false;
-       if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
+       if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
+           !(ctx->flags & IORING_SETUP_IOPOLL)))
                 return false;
         /*
          * If ref is dying, we might be running poll reap from the exit work.
          * Don't attempt to reissue from that path, just let it fail with
          * -EAGAIN.
          */
-       if (percpu_ref_is_dying(&req->ctx->refs))
+       if (percpu_ref_is_dying(&ctx->refs))
+               return false;
+       return true;
+}
+#endif
+
+static bool io_rw_reissue(struct io_kiocb *req)
+{
+#ifdef CONFIG_BLOCK
+       if (!io_rw_should_reissue(req))
                 return false;
  
         lockdep_assert_held(&req->ctx->uring_lock);
@@ -2531,6 +2502,19 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  {
         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
  
+#ifdef CONFIG_BLOCK
+       /* Rewind iter, if we have one. iopoll path resubmits as usual */
+       if (res == -EAGAIN && io_rw_should_reissue(req)) {
+               struct io_async_rw *rw = req->async_data;
+
+               if (rw)
+                       iov_iter_revert(&rw->iter,
+                                       req->result - iov_iter_count(&rw->iter));
+               else if (!io_resubmit_prep(req))
+                       res = -EIO;
+       }
+#endif
+
         if (kiocb->ki_flags & IOCB_WRITE)
                 kiocb_end_write(req);
  
@@ -3279,6 +3263,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
         ret = io_iter_do_read(req, iter);
  
         if (ret == -EIOCBQUEUED) {
+               if (req->async_data)
+                       iov_iter_revert(iter, io_size - iov_iter_count(iter));
                 goto out_free;
         } else if (ret == -EAGAIN) {
                 /* IOPOLL retry should happen for io-wq threads */
@@ -3324,6 +3310,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
                 if (ret == -EIOCBQUEUED)
                         return 0;
                 /* we got some bytes, but not all. retry. */
+               kiocb->ki_flags &= ~IOCB_WAITQ;
         } while (ret > 0 && ret < io_size);
  done:
         kiocb_done(kiocb, ret, issue_flags);
@@ -3410,6 +3397,8 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
         /* no retry on NONBLOCK nor RWF_NOWAIT */
         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
                 goto done;
+       if (ret2 == -EIOCBQUEUED && req->async_data)
+               iov_iter_revert(iter, io_size - iov_iter_count(iter));
         if (!force_nonblock || ret2 != -EAGAIN) {
                 /* IOPOLL retry should happen for io-wq threads */
                 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
@@ -3588,7 +3577,6 @@ static int __io_splice_prep(struct io_kiocb *req,
                  * Splice operation will be punted aync, and here need to
                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
                  */
-               io_req_init_async(req);
                 req->work.flags |= IO_WQ_WORK_UNBOUND;
         }
  
@@ -3864,7 +3852,7 @@ err:
  
  static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
  {
-       return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK);
+       return io_openat2(req, issue_flags);
  }
  
  static int io_remove_buffers_prep(struct io_kiocb *req,
@@ -5003,6 +4991,9 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                         pt->error = -EINVAL;
                         return;
                 }
+               /* double add on the same waitqueue head, ignore */
+               if (poll->head == head)
+                       return;
                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
                 if (!poll) {
                         pt->error = -ENOMEM;
@@ -5538,6 +5529,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
  
         data->mode = io_translate_timeout_mode(flags);
         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+       io_req_track_inflight(req);
         return 0;
  }
  
@@ -5591,22 +5583,30 @@ add:
         return 0;
  }
  
+struct io_cancel_data {
+       struct io_ring_ctx *ctx;
+       u64 user_data;
+};
+
  static bool io_cancel_cb(struct io_wq_work *work, void *data)
  {
         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+       struct io_cancel_data *cd = data;
  
-       return req->user_data == (unsigned long) data;
+       return req->ctx == cd->ctx && req->user_data == cd->user_data;
  }
  
-static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
+static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
+                              struct io_ring_ctx *ctx)
  {
+       struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
         enum io_wq_cancel cancel_ret;
         int ret = 0;
  
-       if (!tctx->io_wq)
+       if (!tctx || !tctx->io_wq)
                 return -ENOENT;
  
-       cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
+       cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
         switch (cancel_ret) {
         case IO_WQ_CANCEL_OK:
                 ret = 0;
@@ -5629,8 +5629,7 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
         unsigned long flags;
         int ret;
  
-       ret = io_async_cancel_one(req->task->io_uring,
-                                       (void *) (unsigned long) sqe_addr);
+       ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
         if (ret != -ENOENT) {
                 spin_lock_irqsave(&ctx->completion_lock, flags);
                 goto done;
@@ -5945,8 +5944,12 @@ static void __io_clean_op(struct io_kiocb *req)
  static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
  {
         struct io_ring_ctx *ctx = req->ctx;
+       const struct cred *creds = NULL;
         int ret;
  
+       if (req->work.creds && req->work.creds != current_cred())
+               creds = override_creds(req->work.creds);
+
         switch (req->opcode) {
         case IORING_OP_NOP:
                 ret = io_nop(req, issue_flags);
@@ -6053,6 +6056,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
                 break;
         }
  
+       if (creds)
+               revert_creds(creds);
+
         if (ret)
                 return ret;
  
@@ -6216,18 +6222,10 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
  static void __io_queue_sqe(struct io_kiocb *req)
  {
         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
-       const struct cred *old_creds = NULL;
         int ret;
  
-       if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
-           req->work.creds != current_cred())
-               old_creds = override_creds(req->work.creds);
-
         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
  
-       if (old_creds)
-               revert_creds(old_creds);
-
         /*
          * We async punt it if the file wasn't marked NOWAIT, or if the file
          * doesn't support non-blocking read/write attempts
@@ -6314,7 +6312,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
  {
         struct io_submit_state *state;
         unsigned int sqe_flags;
-       int id, ret = 0;
+       int personality, ret = 0;
  
         req->opcode = READ_ONCE(sqe->opcode);
         /* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -6329,6 +6327,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
         refcount_set(&req->refs, 2);
         req->task = current;
         req->result = 0;
+       req->work.list.next = NULL;
+       req->work.creds = NULL;
+       req->work.flags = 0;
  
         /* enforce forwards compatibility on users */
         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
@@ -6346,15 +6347,13 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
             !io_op_defs[req->opcode].buffer_select)
                 return -EOPNOTSUPP;
  
-       id = READ_ONCE(sqe->personality);
-       if (id) {
-               __io_req_init_async(req);
-               req->work.creds = idr_find(&ctx->personality_idr, id);
-               if (unlikely(!req->work.creds))
+       personality = READ_ONCE(sqe->personality);
+       if (personality) {
+               req->work.creds = xa_load(&ctx->personalities, personality);
+               if (!req->work.creds)
                         return -EINVAL;
                 get_cred(req->work.creds);
         }
-
         state = &ctx->submit_state;
  
         /*
@@ -6616,8 +6615,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
                 if (!list_empty(&ctx->iopoll_list))
                         io_do_iopoll(ctx, &nr_events, 0);
  
-               if (to_submit && !ctx->sqo_dead &&
-                   likely(!percpu_ref_is_dying(&ctx->refs)))
+               if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
+                   !(ctx->flags & IORING_SETUP_R_DISABLED))
                         ret = io_submit_sqes(ctx, to_submit);
                 mutex_unlock(&ctx->uring_lock);
         }
@@ -6654,45 +6653,6 @@ static void io_sqd_init_new(struct io_sq_data *sqd)
         io_sqd_update_thread_idle(sqd);
  }
  
-static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
-{
-       return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
-}
-
-static bool io_sq_thread_should_park(struct io_sq_data *sqd)
-{
-       return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
-}
-
-static void io_sq_thread_parkme(struct io_sq_data *sqd)
-{
-       for (;;) {
-               /*
-                * TASK_PARKED is a special state; we must serialize against
-                * possible pending wakeups to avoid store-store collisions on
-                * task->state.
-                *
-                * Such a collision might possibly result in the task state
-                * changin from TASK_PARKED and us failing the
-                * wait_task_inactive() in kthread_park().
-                */
-               set_special_state(TASK_PARKED);
-               if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
-                       break;
-
-               /*
-                * Thread is going to call schedule(), do not preempt it,
-                * or the caller of kthread_park() may spend more time in
-                * wait_task_inactive().
-                */
-               preempt_disable();
-               complete(&sqd->completion);
-               schedule_preempt_disabled();
-               preempt_enable();
-       }
-       __set_current_state(TASK_RUNNING);
-}
-
  static int io_sq_thread(void *data)
  {
         struct io_sq_data *sqd = data;
@@ -6703,7 +6663,6 @@ static int io_sq_thread(void *data)
  
         sprintf(buf, "iou-sqp-%d", sqd->task_pid);
         set_task_comm(current, buf);
-       sqd->thread = current;
         current->pf_io_worker = NULL;
  
         if (sqd->sq_cpu != -1)
@@ -6712,21 +6671,18 @@ static int io_sq_thread(void *data)
                 set_cpus_allowed_ptr(current, cpu_online_mask);
         current->flags |= PF_NO_SETAFFINITY;
  
-       complete(&sqd->completion);
-
         wait_for_completion(&sqd->startup);
  
-       while (!io_sq_thread_should_stop(sqd)) {
+       down_read(&sqd->rw_lock);
+
+       while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
                 int ret;
                 bool cap_entries, sqt_spin, needs_sched;
  
-               /*
-                * Any changes to the sqd lists are synchronized through the
-                * thread parking. This synchronizes the thread vs users,
-                * the users are synchronized on the sqd->ctx_lock.
-                */
-               if (io_sq_thread_should_park(sqd)) {
-                       io_sq_thread_parkme(sqd);
+               if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
+                       up_read(&sqd->rw_lock);
+                       cond_resched();
+                       down_read(&sqd->rw_lock);
                         continue;
                 }
                 if (unlikely(!list_empty(&sqd->ctx_new_list))) {
@@ -6738,7 +6694,13 @@ static int io_sq_thread(void *data)
                 sqt_spin = false;
                 cap_entries = !list_is_singular(&sqd->ctx_list);
                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+                       const struct cred *creds = NULL;
+
+                       if (ctx->sq_creds != current_cred())
+                               creds = override_creds(ctx->sq_creds);
                         ret = __io_sq_thread(ctx, cap_entries);
+                       if (creds)
+                               revert_creds(creds);
                         if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
                                 sqt_spin = true;
                 }
@@ -6765,11 +6727,14 @@ static int io_sq_thread(void *data)
                         }
                 }
  
-               if (needs_sched && !io_sq_thread_should_park(sqd)) {
+               if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                 io_ring_set_wakeup_flag(ctx);
  
+                       up_read(&sqd->rw_lock);
                         schedule();
+                       try_to_freeze();
+                       down_read(&sqd->rw_lock);
                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                 io_ring_clear_wakeup_flag(ctx);
                 }
@@ -6780,25 +6745,16 @@ static int io_sq_thread(void *data)
  
         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                 io_uring_cancel_sqpoll(ctx);
+       up_read(&sqd->rw_lock);
  
         io_run_task_work();
  
-       if (io_sq_thread_should_park(sqd))
-               io_sq_thread_parkme(sqd);
-
-       /*
-        * Clear thread under lock so that concurrent parks work correctly
-        */
-       complete(&sqd->completion);
-       mutex_lock(&sqd->lock);
+       down_write(&sqd->rw_lock);
         sqd->thread = NULL;
-       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-               ctx->sqo_exec = 1;
+       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                 io_ring_set_wakeup_flag(ctx);
-       }
-
+       up_write(&sqd->rw_lock);
         complete(&sqd->exited);
-       mutex_unlock(&sqd->lock);
         do_exit(0);
  }
  
@@ -6920,11 +6876,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
         trace_io_uring_cqring_wait(ctx, min_events);
         do {
-               io_cqring_overflow_flush(ctx, false, NULL, NULL);
+               /* if we can't even flush overflow, don't wait for more */
+               if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
+                       ret = -EBUSY;
+                       break;
+               }
                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
                                                 TASK_INTERRUPTIBLE);
                 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
                 finish_wait(&ctx->wait, &iowq.wq);
+               cond_resched();
         } while (ret > 0);
  
         restore_saved_sigmask_unless(ret == -EINTR);
@@ -7092,48 +7053,37 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
  }
  
  static void io_sq_thread_unpark(struct io_sq_data *sqd)
-       __releases(&sqd->lock)
+       __releases(&sqd->rw_lock)
  {
-       if (!sqd->thread)
-               return;
         if (sqd->thread == current)
                 return;
         clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
-       wake_up_state(sqd->thread, TASK_PARKED);
-       mutex_unlock(&sqd->lock);
+       up_write(&sqd->rw_lock);
  }
  
-static bool io_sq_thread_park(struct io_sq_data *sqd)
-       __acquires(&sqd->lock)
+static void io_sq_thread_park(struct io_sq_data *sqd)
+       __acquires(&sqd->rw_lock)
  {
         if (sqd->thread == current)
-               return true;
-       mutex_lock(&sqd->lock);
-       if (!sqd->thread) {
-               mutex_unlock(&sqd->lock);
-               return false;
-       }
+               return;
         set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
-       wake_up_process(sqd->thread);
-       wait_for_completion(&sqd->completion);
-       return true;
+       down_write(&sqd->rw_lock);
+       /* set again: a concurrent unpark may have cleared it while we waited */
+       set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+       if (sqd->thread)
+               wake_up_process(sqd->thread);
  }
  
  static void io_sq_thread_stop(struct io_sq_data *sqd)
  {
         if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state))
                 return;
-       mutex_lock(&sqd->lock);
-       if (sqd->thread) {
-               set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
-               WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
+       down_write(&sqd->rw_lock);
+       set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+       if (sqd->thread)
                 wake_up_process(sqd->thread);
-               mutex_unlock(&sqd->lock);
-               wait_for_completion(&sqd->exited);
-               WARN_ON_ONCE(sqd->thread);
-       } else {
-               mutex_unlock(&sqd->lock);
-       }
+       up_write(&sqd->rw_lock);
+       wait_for_completion(&sqd->exited);
  }
  
  static void io_put_sq_data(struct io_sq_data *sqd)
@@ -7150,21 +7100,18 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
  
         if (sqd) {
                 complete(&sqd->startup);
-               if (sqd->thread) {
+               if (sqd->thread)
                         wait_for_completion(&ctx->sq_thread_comp);
-                       io_sq_thread_park(sqd);
-               }
  
-               mutex_lock(&sqd->ctx_lock);
+               io_sq_thread_park(sqd);
                 list_del(&ctx->sqd_list);
                 io_sqd_update_thread_idle(sqd);
-               mutex_unlock(&sqd->ctx_lock);
-
-               if (sqd->thread)
-                       io_sq_thread_unpark(sqd);
+               io_sq_thread_unpark(sqd);
  
                 io_put_sq_data(sqd);
                 ctx->sq_data = NULL;
+               if (ctx->sq_creds)
+                       put_cred(ctx->sq_creds);
         }
  }
  
@@ -7208,11 +7155,9 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
         refcount_set(&sqd->refs, 1);
         INIT_LIST_HEAD(&sqd->ctx_list);
         INIT_LIST_HEAD(&sqd->ctx_new_list);
-       mutex_init(&sqd->ctx_lock);
-       mutex_init(&sqd->lock);
+       init_rwsem(&sqd->rw_lock);
         init_waitqueue_head(&sqd->wait);
         init_completion(&sqd->startup);
-       init_completion(&sqd->completion);
         init_completion(&sqd->exited);
         return sqd;
  }
@@ -7843,30 +7788,13 @@ void __io_uring_free(struct task_struct *tsk)
         struct io_uring_task *tctx = tsk->io_uring;
  
         WARN_ON_ONCE(!xa_empty(&tctx->xa));
+       WARN_ON_ONCE(tctx->io_wq);
+
         percpu_counter_destroy(&tctx->inflight);
         kfree(tctx);
         tsk->io_uring = NULL;
  }
  
-static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
-{
-       int ret;
-
-       clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
-       reinit_completion(&sqd->completion);
-       ctx->sqo_dead = ctx->sqo_exec = 0;
-       sqd->task_pid = current->pid;
-       current->flags |= PF_IO_WORKER;
-       ret = io_wq_fork_thread(io_sq_thread, sqd);
-       current->flags &= ~PF_IO_WORKER;
-       if (ret < 0) {
-               sqd->thread = NULL;
-               return ret;
-       }
-       wait_for_completion(&sqd->completion);
-       return io_uring_alloc_task_context(sqd->thread, ctx);
-}
-
  static int io_sq_offload_create(struct io_ring_ctx *ctx,
                                 struct io_uring_params *p)
  {
@@ -7887,6 +7815,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                 fdput(f);
         }
         if (ctx->flags & IORING_SETUP_SQPOLL) {
+               struct task_struct *tsk;
                 struct io_sq_data *sqd;
  
                 ret = -EPERM;
@@ -7899,11 +7828,10 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                         goto err;
                 }
  
+               ctx->sq_creds = get_current_cred();
                 ctx->sq_data = sqd;
                 io_sq_thread_park(sqd);
-               mutex_lock(&sqd->ctx_lock);
                 list_add(&ctx->sqd_list, &sqd->ctx_new_list);
-               mutex_unlock(&sqd->ctx_lock);
                 io_sq_thread_unpark(sqd);
  
                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
@@ -7918,9 +7846,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
  
                         ret = -EINVAL;
                         if (cpu >= nr_cpu_ids)
-                               goto err;
+                               goto err_sqpoll;
                         if (!cpu_online(cpu))
-                               goto err;
+                               goto err_sqpoll;
  
                         sqd->sq_cpu = cpu;
                 } else {
@@ -7928,17 +7856,18 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                 }
  
                 sqd->task_pid = current->pid;
-               current->flags |= PF_IO_WORKER;
-               ret = io_wq_fork_thread(io_sq_thread, sqd);
-               current->flags &= ~PF_IO_WORKER;
-               if (ret < 0) {
-                       sqd->thread = NULL;
-                       goto err;
+               tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
+               if (IS_ERR(tsk)) {
+                       ret = PTR_ERR(tsk);
+                       goto err_sqpoll;
                 }
-               wait_for_completion(&sqd->completion);
-               ret = io_uring_alloc_task_context(sqd->thread, ctx);
+
+               sqd->thread = tsk;
+               ret = io_uring_alloc_task_context(tsk, ctx);
+               wake_up_new_task(tsk);
                 if (ret)
                         goto err;
+               complete(&sqd->startup);
         } else if (p->flags & IORING_SETUP_SQ_AFF) {
                 /* Can't have SQ_AFF without SQPOLL */
                 ret = -EINVAL;
@@ -7949,14 +7878,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
  err:
         io_sq_thread_finish(ctx);
         return ret;
-}
-
-static void io_sq_offload_start(struct io_ring_ctx *ctx)
-{
-       struct io_sq_data *sqd = ctx->sq_data;
-
-       if (ctx->flags & IORING_SETUP_SQPOLL)
-               complete(&sqd->startup);
+err_sqpoll:
+       complete(&ctx->sq_data->exited);
+       goto err;
  }
  
  static inline void __io_unaccount_mem(struct user_struct *user,
@@ -8393,7 +8317,7 @@ static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
         }
  }
  
-static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static void io_req_caches_free(struct io_ring_ctx *ctx)
  {
         struct io_submit_state *submit_state = &ctx->submit_state;
         struct io_comp_state *cs = &ctx->submit_state.comp;
@@ -8439,7 +8363,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
         mutex_unlock(&ctx->uring_lock);
         io_eventfd_unregister(ctx);
         io_destroy_buffers(ctx);
-       idr_destroy(&ctx->personality_idr);
  
  #if defined(CONFIG_UNIX)
         if (ctx->ring_sock) {
@@ -8453,7 +8376,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
  
         percpu_ref_exit(&ctx->refs);
         free_uid(ctx->user);
-       io_req_caches_free(ctx, NULL);
+       io_req_caches_free(ctx);
         if (ctx->hash_map)
                 io_wq_put_hash(ctx->hash_map);
         kfree(ctx->cancel_hash);
@@ -8504,7 +8427,7 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
  {
         const struct cred *creds;
  
-       creds = idr_remove(&ctx->personality_idr, id);
+       creds = xa_erase(&ctx->personalities, id);
         if (creds) {
                 put_cred(creds);
                 return 0;
@@ -8513,25 +8436,13 @@ static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
         return -EINVAL;
  }
  
-static int io_remove_personalities(int id, void *p, void *data)
-{
-       struct io_ring_ctx *ctx = data;
-
-       io_unregister_personality(ctx, id);
-       return 0;
-}
-
  static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
  {
-       struct callback_head *work, *head, *next;
+       struct callback_head *work, *next;
         bool executed = false;
  
         do {
-               do {
-                       head = NULL;
-                       work = READ_ONCE(ctx->exit_task_work);
-               } while (cmpxchg(&ctx->exit_task_work, work, head) != work);
-
+               work = xchg(&ctx->exit_task_work, NULL);
                 if (!work)
                         break;
  
@@ -8547,10 +8458,34 @@ static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
         return executed;
  }
  
+struct io_tctx_exit {
+       struct callback_head            task_work;
+       struct completion               completion;
+       struct io_ring_ctx              *ctx;
+};
+
+static void io_tctx_exit_cb(struct callback_head *cb)
+{
+       struct io_uring_task *tctx = current->io_uring;
+       struct io_tctx_exit *work;
+
+       work = container_of(cb, struct io_tctx_exit, task_work);
+       /*
+        * When tctx->in_idle is set we are in cancellation and removing the node
+        * here is racy. It will be removed by the end of cancellation; skip it.
+        */
+       if (!atomic_read(&tctx->in_idle))
+               io_uring_del_task_file((unsigned long)work->ctx);
+       complete(&work->completion);
+}
+
  static void io_ring_exit_work(struct work_struct *work)
  {
-       struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
-                                              exit_work);
+       struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
+       unsigned long timeout = jiffies + HZ * 60 * 5;
+       struct io_tctx_exit exit;
+       struct io_tctx_node *node;
+       int ret;
  
         /*
          * If we're doing polled IO and end up having requests being
@@ -8560,24 +8495,47 @@ static void io_ring_exit_work(struct work_struct *work)
          */
         do {
                 io_uring_try_cancel_requests(ctx, NULL, NULL);
-               io_run_ctx_fallback(ctx);
+
+               WARN_ON_ONCE(time_after(jiffies, timeout));
         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
+
+       mutex_lock(&ctx->uring_lock);
+       while (!list_empty(&ctx->tctx_list)) {
+               WARN_ON_ONCE(time_after(jiffies, timeout));
+
+               node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
+                                       ctx_node);
+               exit.ctx = ctx;
+               init_completion(&exit.completion);
+               init_task_work(&exit.task_work, io_tctx_exit_cb);
+               ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
+               if (WARN_ON_ONCE(ret))
+                       continue;
+               wake_up_process(node->task);
+
+               mutex_unlock(&ctx->uring_lock);
+               wait_for_completion(&exit.completion);
+               cond_resched();
+               mutex_lock(&ctx->uring_lock);
+       }
+       mutex_unlock(&ctx->uring_lock);
+
         io_ring_ctx_free(ctx);
  }
  
  static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
  {
+       unsigned long index;
+       struct creds *creds;
+
         mutex_lock(&ctx->uring_lock);
         percpu_ref_kill(&ctx->refs);
-
-       if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead))
-               ctx->sqo_dead = 1;
-
         /* if force is set, the ring is going away. always drop after that */
         ctx->cq_overflow_flushed = 1;
         if (ctx->rings)
                 __io_cqring_overflow_flush(ctx, true, NULL, NULL);
-       idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
+       xa_for_each(&ctx->personalities, index, creds)
+               io_unregister_personality(ctx, index);
         mutex_unlock(&ctx->uring_lock);
  
         io_kill_timeouts(ctx, NULL, NULL);
@@ -8656,18 +8614,55 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
         }
  }
  
+static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
+{
+       struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+       return req->ctx == data;
+}
+
+static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
+{
+       struct io_tctx_node *node;
+       enum io_wq_cancel cret;
+       bool ret = false;
+
+       mutex_lock(&ctx->uring_lock);
+       list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
+               struct io_uring_task *tctx = node->task->io_uring;
+
+               /*
+                * io_wq will stay alive while we hold uring_lock, because it's
+                * killed after the ctx nodes, whose removal requires the lock.
+                */
+               if (!tctx || !tctx->io_wq)
+                       continue;
+               cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
+               ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
+       }
+       mutex_unlock(&ctx->uring_lock);
+
+       return ret;
+}
+
  static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                          struct task_struct *task,
                                          struct files_struct *files)
  {
         struct io_task_cancel cancel = { .task = task, .files = files, };
-       struct io_uring_task *tctx = current->io_uring;
+       struct io_uring_task *tctx = task ? task->io_uring : NULL;
  
         while (1) {
                 enum io_wq_cancel cret;
                 bool ret = false;
  
-               if (tctx && tctx->io_wq) {
+               if (!task) {
+                       ret |= io_uring_try_cancel_iowq(ctx);
+               } else if (tctx && tctx->io_wq) {
+                       /*
+                        * Cancels requests of all rings, not only @ctx, but
+                        * it's fine as the task is in exit/exec.
+                        */
                         cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
                                                &cancel, true);
                         ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
@@ -8732,17 +8727,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
         }
  }
  
-static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
-{
-       mutex_lock(&ctx->uring_lock);
-       ctx->sqo_dead = 1;
-       mutex_unlock(&ctx->uring_lock);
-
-       /* make sure callers enter the ring to get error */
-       if (ctx->rings)
-               io_ring_set_wakeup_flag(ctx);
-}
-
  /*
   * We need to iteratively cancel requests, in case a request has dependent
   * hard links. These persist even for failure of cancelations, hence keep
@@ -8752,15 +8736,12 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
                                           struct files_struct *files)
  {
         struct task_struct *task = current;
-       bool did_park = false;
  
         if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
-               io_disable_sqo_submit(ctx);
-               did_park = io_sq_thread_park(ctx->sq_data);
-               if (did_park) {
-                       task = ctx->sq_data->thread;
+               io_sq_thread_park(ctx->sq_data);
+               task = ctx->sq_data->thread;
+               if (task)
                         atomic_inc(&task->io_uring->in_idle);
-               }
         }
  
         io_cancel_defer_files(ctx, task, files);
@@ -8769,18 +8750,19 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
         if (!files)
                 io_uring_try_cancel_requests(ctx, task, NULL);
  
-       if (did_park) {
+       if (task)
                 atomic_dec(&task->io_uring->in_idle);
+       if (ctx->sq_data)
                 io_sq_thread_unpark(ctx->sq_data);
-       }
  }
  
  /*
   * Note that this task has used io_uring. We use it for cancelation purposes.
   */
-static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
+static int io_uring_add_task_file(struct io_ring_ctx *ctx)
  {
         struct io_uring_task *tctx = current->io_uring;
+       struct io_tctx_node *node;
         int ret;
  
         if (unlikely(!tctx)) {
@@ -8789,23 +8771,28 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
                         return ret;
                 tctx = current->io_uring;
         }
-       if (tctx->last != file) {
-               void *old = xa_load(&tctx->xa, (unsigned long)file);
+       if (tctx->last != ctx) {
+               void *old = xa_load(&tctx->xa, (unsigned long)ctx);
  
                 if (!old) {
-                       get_file(file);
-                       ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
-                                               file, GFP_KERNEL));
+                       node = kmalloc(sizeof(*node), GFP_KERNEL);
+                       if (!node)
+                               return -ENOMEM;
+                       node->ctx = ctx;
+                       node->task = current;
+
+                       ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
+                                               node, GFP_KERNEL));
                         if (ret) {
-                               fput(file);
+                               kfree(node);
                                 return ret;
                         }
  
-                       /* one and only SQPOLL file note, held by sqo_task */
-                       WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) &&
-                                    current != ctx->sqo_task);
+                       mutex_lock(&ctx->uring_lock);
+                       list_add(&node->ctx_node, &ctx->tctx_list);
+                       mutex_unlock(&ctx->uring_lock);
                 }
-               tctx->last = file;
+               tctx->last = ctx;
         }
  
         /*
@@ -8822,45 +8809,56 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
  /*
   * Remove this io_uring_file -> task mapping.
   */
-static void io_uring_del_task_file(struct file *file)
+static void io_uring_del_task_file(unsigned long index)
  {
         struct io_uring_task *tctx = current->io_uring;
+       struct io_tctx_node *node;
+
+       if (!tctx)
+               return;
+       node = xa_erase(&tctx->xa, index);
+       if (!node)
+               return;
  
-       if (tctx->last == file)
+       WARN_ON_ONCE(current != node->task);
+       WARN_ON_ONCE(list_empty(&node->ctx_node));
+
+       mutex_lock(&node->ctx->uring_lock);
+       list_del(&node->ctx_node);
+       mutex_unlock(&node->ctx->uring_lock);
+
+       if (tctx->last == node->ctx)
                 tctx->last = NULL;
-       file = xa_erase(&tctx->xa, (unsigned long)file);
-       if (file)
-               fput(file);
+       kfree(node);
  }
  
-static void io_uring_remove_task_files(struct io_uring_task *tctx)
+static void io_uring_clean_tctx(struct io_uring_task *tctx)
  {
-       struct file *file;
+       struct io_tctx_node *node;
         unsigned long index;
  
-       xa_for_each(&tctx->xa, index, file)
-               io_uring_del_task_file(file);
+       xa_for_each(&tctx->xa, index, node)
+               io_uring_del_task_file(index);
+       if (tctx->io_wq) {
+               io_wq_put_and_exit(tctx->io_wq);
+               tctx->io_wq = NULL;
+       }
  }
  
  void __io_uring_files_cancel(struct files_struct *files)
  {
         struct io_uring_task *tctx = current->io_uring;
-       struct file *file;
+       struct io_tctx_node *node;
         unsigned long index;
  
         /* make sure overflow events are dropped */
         atomic_inc(&tctx->in_idle);
-       xa_for_each(&tctx->xa, index, file)
-               io_uring_cancel_task_requests(file->private_data, files);
+       xa_for_each(&tctx->xa, index, node)
+               io_uring_cancel_task_requests(node->ctx, files);
         atomic_dec(&tctx->in_idle);
  
-       if (files) {
-               io_uring_remove_task_files(tctx);
-               if (tctx->io_wq) {
-                       io_wq_put(tctx->io_wq);
-                       tctx->io_wq = NULL;
-               }
-       }
+       if (files)
+               io_uring_clean_tctx(tctx);
  }
  
  static s64 tctx_inflight(struct io_uring_task *tctx)
@@ -8877,16 +8875,12 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
  
         if (!sqd)
                 return;
-       io_disable_sqo_submit(ctx);
-       if (!io_sq_thread_park(sqd))
-               return;
-       tctx = ctx->sq_data->thread->io_uring;
-       /* can happen on fork/alloc failure, just ignore that state */
-       if (!tctx) {
+       io_sq_thread_park(sqd);
+       if (!sqd->thread || !sqd->thread->io_uring) {
                 io_sq_thread_unpark(sqd);
                 return;
         }
-
+       tctx = ctx->sq_data->thread->io_uring;
         atomic_inc(&tctx->in_idle);
         do {
                 /* read completions before cancelations */
@@ -8922,13 +8916,12 @@ void __io_uring_task_cancel(void)
         /* make sure overflow events are dropped */
         atomic_inc(&tctx->in_idle);
  
-       /* trigger io_disable_sqo_submit() */
         if (tctx->sqpoll) {
-               struct file *file;
+               struct io_tctx_node *node;
                 unsigned long index;
  
-               xa_for_each(&tctx->xa, index, file)
-                       io_uring_cancel_sqpoll(file->private_data);
+               xa_for_each(&tctx->xa, index, node)
+                       io_uring_cancel_sqpoll(node->ctx);
         }
  
         do {
@@ -8952,53 +8945,9 @@ void __io_uring_task_cancel(void)
  
         atomic_dec(&tctx->in_idle);
  
-       io_uring_remove_task_files(tctx);
-}
-
-static int io_uring_flush(struct file *file, void *data)
-{
-       struct io_uring_task *tctx = current->io_uring;
-       struct io_ring_ctx *ctx = file->private_data;
-
-       /* Ignore helper thread files exit */
-       if (current->flags & PF_IO_WORKER)
-               return 0;
-
-       if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
-               io_uring_cancel_task_requests(ctx, NULL);
-               io_req_caches_free(ctx, current);
-       }
-
-       io_run_ctx_fallback(ctx);
-
-       if (!tctx)
-               return 0;
-
-       /* we should have cancelled and erased it before PF_EXITING */
-       WARN_ON_ONCE((current->flags & PF_EXITING) &&
-                    xa_load(&tctx->xa, (unsigned long)file));
-
-       /*
-        * fput() is pending, will be 2 if the only other ref is our potential
-        * task file note. If the task is exiting, drop regardless of count.
-        */
-       if (atomic_long_read(&file->f_count) != 2)
-               return 0;
-
-       if (ctx->flags & IORING_SETUP_SQPOLL) {
-               /* there is only one file note, which is owned by sqo_task */
-               WARN_ON_ONCE(ctx->sqo_task != current &&
-                            xa_load(&tctx->xa, (unsigned long)file));
-               /* sqo_dead check is for when this happens after cancellation */
-               WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead &&
-                            !xa_load(&tctx->xa, (unsigned long)file));
-
-               io_disable_sqo_submit(ctx);
-       }
-
-       if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current)
-               io_uring_del_task_file(file);
-       return 0;
+       io_uring_clean_tctx(tctx);
+       /* all current's requests should be gone, we can kill tctx */
+       __io_uring_free(current);
  }
  
  static void *io_uring_validate_mmap_request(struct file *file,
@@ -9073,29 +9022,20 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
  
  static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
  {
-       int ret = 0;
         DEFINE_WAIT(wait);
  
         do {
                 if (!io_sqring_full(ctx))
                         break;
-
                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
  
-               if (unlikely(ctx->sqo_dead)) {
-                       ret = -EOWNERDEAD;
-                       goto out;
-               }
-
                 if (!io_sqring_full(ctx))
                         break;
-
                 schedule();
         } while (!signal_pending(current));
  
         finish_wait(&ctx->sqo_sq_wait, &wait);
-out:
-       return ret;
+       return 0;
  }
  
  static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
@@ -9169,15 +9109,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
         if (ctx->flags & IORING_SETUP_SQPOLL) {
                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
  
-               if (unlikely(ctx->sqo_exec)) {
-                       ret = io_sq_thread_fork(ctx->sq_data, ctx);
-                       if (ret)
-                               goto out;
-                       ctx->sqo_exec = 0;
-               }
                 ret = -EOWNERDEAD;
-               if (unlikely(ctx->sqo_dead))
+               if (unlikely(ctx->sq_data->thread == NULL)) {
                         goto out;
+               }
                 if (flags & IORING_ENTER_SQ_WAKEUP)
                         wake_up(&ctx->sq_data->wait);
                 if (flags & IORING_ENTER_SQ_WAIT) {
@@ -9187,7 +9122,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                 }
                 submitted = to_submit;
         } else if (to_submit) {
-               ret = io_uring_add_task_file(ctx, f.file);
+               ret = io_uring_add_task_file(ctx);
                 if (unlikely(ret))
                         goto out;
                 mutex_lock(&ctx->uring_lock);
@@ -9229,10 +9164,9 @@ out_fput:
  }
  
  #ifdef CONFIG_PROC_FS
-static int io_uring_show_cred(int id, void *p, void *data)
+static int io_uring_show_cred(struct seq_file *m, unsigned int id,
+               const struct cred *cred)
  {
-       const struct cred *cred = p;
-       struct seq_file *m = data;
         struct user_namespace *uns = seq_user_ns(m);
         struct group_info *gi;
         kernel_cap_t cap;
@@ -9300,9 +9234,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
                                                 (unsigned int) buf->len);
         }
-       if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
+       if (has_lock && !xa_empty(&ctx->personalities)) {
+               unsigned long index;
+               const struct cred *cred;
+
                 seq_printf(m, "Personalities:\n");
-               idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
+               xa_for_each(&ctx->personalities, index, cred)
+                       io_uring_show_cred(m, index, cred);
         }
         seq_printf(m, "PollList:\n");
         spin_lock_irq(&ctx->completion_lock);
@@ -9332,7 +9270,6 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
  
  static const struct file_operations io_uring_fops = {
         .release        = io_uring_release,
-       .flush          = io_uring_flush,
         .mmap           = io_uring_mmap,
  #ifndef CONFIG_MMU
         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
@@ -9397,7 +9334,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
         if (fd < 0)
                 return fd;
  
-       ret = io_uring_add_task_file(ctx, file);
+       ret = io_uring_add_task_file(ctx);
         if (ret) {
                 put_unused_fd(fd);
                 return ret;
@@ -9487,7 +9424,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
         ctx->compat = in_compat_syscall();
         if (!capable(CAP_IPC_LOCK))
                 ctx->user = get_uid(current_user());
-       ctx->sqo_task = current;
  
         /*
          * This is just grabbed for accounting purposes. When a process exits,
@@ -9506,9 +9442,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
         if (ret)
                 goto err;
  
-       if (!(p->flags & IORING_SETUP_R_DISABLED))
-               io_sq_offload_start(ctx);
-
         memset(&p->sq_off, 0, sizeof(p->sq_off));
         p->sq_off.head = offsetof(struct io_rings, sq.head);
         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
@@ -9550,7 +9483,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
          */
         ret = io_uring_install_fd(ctx, file);
         if (ret < 0) {
-               io_disable_sqo_submit(ctx);
                 /* fput will clean it up */
                 fput(file);
                 return ret;
@@ -9559,7 +9491,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
         return ret;
  err:
-       io_disable_sqo_submit(ctx);
         io_ring_ctx_wait_and_kill(ctx);
         return ret;
  }
@@ -9638,14 +9569,16 @@ out:
  static int io_register_personality(struct io_ring_ctx *ctx)
  {
         const struct cred *creds;
+       u32 id;
         int ret;
  
         creds = get_current_cred();
  
-       ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
-                               USHRT_MAX, GFP_KERNEL);
-       if (ret < 0)
-               put_cred(creds);
+       ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
+                       XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
+       if (!ret)
+               return id;
+       put_cred(creds);
         return ret;
  }
  
@@ -9728,9 +9661,8 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
                 ctx->restricted = 1;
  
         ctx->flags &= ~IORING_SETUP_R_DISABLED;
-
-       io_sq_offload_start(ctx);
-
+       if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
+               wake_up(&ctx->sq_data->wait);
         return 0;
  }