io_uring: fix race condition in task_work add and clear
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 14ce789..83973f6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -57,7 +57,6 @@
 #include <linux/mman.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
-#include <linux/kthread.h>
 #include <linux/blkdev.h>
 #include <linux/bvec.h>
 #include <linux/net.h>
 #define IORING_MAX_RESTRICTIONS        (IORING_RESTRICTION_LAST + \
                                 IORING_REGISTER_LAST + IORING_OP_LAST)
 
+#define SQE_VALID_FLAGS        (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+                               IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+                               IOSQE_BUFFER_SELECT)
+
 struct io_uring {
        u32 head ____cacheline_aligned_in_smp;
        u32 tail ____cacheline_aligned_in_smp;
@@ -232,6 +235,7 @@ struct fixed_rsrc_data {
        struct fixed_rsrc_ref_node      *node;
        struct percpu_ref               refs;
        struct completion               done;
+       bool                            quiesce;
 };
 
 struct io_buffer {
@@ -249,6 +253,11 @@ struct io_restriction {
        bool registered;
 };
 
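+/* bit numbers for io_sq_data->state, controlling SQPOLL thread stop/park */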
+enum {
+       IO_SQ_THREAD_SHOULD_STOP = 0,
+       IO_SQ_THREAD_SHOULD_PARK,
+};
+
 struct io_sq_data {
        refcount_t              refs;
        struct mutex            lock;
@@ -262,6 +271,13 @@ struct io_sq_data {
        struct wait_queue_head  wait;
 
        unsigned                sq_thread_idle;
+       int                     sq_cpu;
+       pid_t                   task_pid;
+
+       unsigned long           state;
+       struct completion       startup;
+       struct completion       completion;
+       struct completion       exited;
 };
 
 #define IO_IOPOLL_BATCH                        8
@@ -279,8 +295,14 @@ struct io_comp_state {
        struct list_head        locked_free_list;
 };
 
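+/* first and last request of the link chain being built during submission */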
+struct io_submit_link {
+       struct io_kiocb         *head;
+       struct io_kiocb         *last;
+};
+
 struct io_submit_state {
        struct blk_plug         plug;
+       struct io_submit_link   link;
 
        /*
         * io_kiocb alloc cache
@@ -312,12 +334,12 @@ struct io_ring_ctx {
        struct {
                unsigned int            flags;
                unsigned int            compat: 1;
-               unsigned int            limit_mem: 1;
                unsigned int            cq_overflow_flushed: 1;
                unsigned int            drain_next: 1;
                unsigned int            eventfd_async: 1;
                unsigned int            restricted: 1;
                unsigned int            sqo_dead: 1;
+               unsigned int            sqo_exec: 1;
 
                /*
                 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -339,6 +361,9 @@ struct io_ring_ctx {
                unsigned                cached_cq_overflow;
                unsigned long           sq_check_overflow;
 
+               /* hashed buffered write serialization */
+               struct io_wq_hash       *hash_map;
+
                struct list_head        defer_list;
                struct list_head        timeout_list;
                struct list_head        cq_overflow_list;
@@ -355,22 +380,14 @@ struct io_ring_ctx {
 
        struct io_rings *rings;
 
-       /* IO offload */
-       struct io_wq            *io_wq;
-
        /*
-        * For SQPOLL usage - we hold a reference to the parent task, so we
-        * have access to the ->files
+        * For SQPOLL usage
         */
        struct task_struct      *sqo_task;
 
        /* Only used for accounting purposes */
        struct mm_struct        *mm_account;
 
-#ifdef CONFIG_BLK_CGROUP
-       struct cgroup_subsys_state      *sqo_blkcg_css;
-#endif
-
        struct io_sq_data       *sq_data;       /* if using sq thread polling */
 
        struct wait_queue_head  sqo_sq_wait;
@@ -390,13 +407,6 @@ struct io_ring_ctx {
 
        struct user_struct      *user;
 
-       const struct cred       *creds;
-
-#ifdef CONFIG_AUDIT
-       kuid_t                  loginuid;
-       unsigned int            sessionid;
-#endif
-
        struct completion       ref_comp;
        struct completion       sq_thread_comp;
 
@@ -445,6 +455,11 @@ struct io_ring_ctx {
 
        struct io_restriction           restrictions;
 
+       /* exit task_work */
+       struct callback_head            *exit_task_work;
+
+       struct wait_queue_head          hash_wait;
+
        /* Keep this last, we don't need it for the fast path */
        struct work_struct              exit_work;
 };
@@ -827,7 +842,6 @@ struct io_op_def {
        unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
-       unsigned                work_flags;
 };
 
 static const struct io_op_def io_op_defs[] = {
@@ -840,7 +854,6 @@ static const struct io_op_def io_op_defs[] = {
                .needs_async_data       = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_WRITEV] = {
                .needs_file             = 1,
@@ -850,12 +863,9 @@ static const struct io_op_def io_op_defs[] = {
                .needs_async_data       = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FSIZE,
        },
        [IORING_OP_FSYNC] = {
                .needs_file             = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_READ_FIXED] = {
                .needs_file             = 1,
@@ -863,7 +873,6 @@ static const struct io_op_def io_op_defs[] = {
                .pollin                 = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
        [IORING_OP_WRITE_FIXED] = {
                .needs_file             = 1,
@@ -872,8 +881,6 @@ static const struct io_op_def io_op_defs[] = {
                .pollout                = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
-                                               IO_WQ_WORK_MM,
        },
        [IORING_OP_POLL_ADD] = {
                .needs_file             = 1,
@@ -882,7 +889,6 @@ static const struct io_op_def io_op_defs[] = {
        [IORING_OP_POLL_REMOVE] = {},
        [IORING_OP_SYNC_FILE_RANGE] = {
                .needs_file             = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_SENDMSG] = {
                .needs_file             = 1,
@@ -890,8 +896,6 @@ static const struct io_op_def io_op_defs[] = {
                .pollout                = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FS,
        },
        [IORING_OP_RECVMSG] = {
                .needs_file             = 1,
@@ -900,29 +904,23 @@ static const struct io_op_def io_op_defs[] = {
                .buffer_select          = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FS,
        },
        [IORING_OP_TIMEOUT] = {
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_timeout_data),
-               .work_flags             = IO_WQ_WORK_MM,
        },
        [IORING_OP_TIMEOUT_REMOVE] = {
                /* used by timeout updates' prep() */
-               .work_flags             = IO_WQ_WORK_MM,
        },
        [IORING_OP_ACCEPT] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
        },
        [IORING_OP_ASYNC_CANCEL] = {},
        [IORING_OP_LINK_TIMEOUT] = {
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_timeout_data),
-               .work_flags             = IO_WQ_WORK_MM,
        },
        [IORING_OP_CONNECT] = {
                .needs_file             = 1,
@@ -930,26 +928,14 @@ static const struct io_op_def io_op_defs[] = {
                .pollout                = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_connect),
-               .work_flags             = IO_WQ_WORK_MM,
        },
        [IORING_OP_FALLOCATE] = {
                .needs_file             = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
-       },
-       [IORING_OP_OPENAT] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FS | IO_WQ_WORK_MM,
-       },
-       [IORING_OP_CLOSE] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
-       },
-       [IORING_OP_FILES_UPDATE] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
-       },
-       [IORING_OP_STATX] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
-                                               IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
        },
+       [IORING_OP_OPENAT] = {},
+       [IORING_OP_CLOSE] = {},
+       [IORING_OP_FILES_UPDATE] = {},
+       [IORING_OP_STATX] = {},
        [IORING_OP_READ] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
@@ -957,7 +943,6 @@ static const struct io_op_def io_op_defs[] = {
                .buffer_select          = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_WRITE] = {
                .needs_file             = 1,
@@ -965,42 +950,31 @@ static const struct io_op_def io_op_defs[] = {
                .pollout                = 1,
                .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FSIZE,
        },
        [IORING_OP_FADVISE] = {
                .needs_file             = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG,
-       },
-       [IORING_OP_MADVISE] = {
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
+       [IORING_OP_MADVISE] = {},
        [IORING_OP_SEND] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_RECV] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .buffer_select          = 1,
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_OPENAT2] = {
-               .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
-                                               IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
        [IORING_OP_EPOLL_CTL] = {
                .unbound_nonreg_file    = 1,
-               .work_flags             = IO_WQ_WORK_FILES,
        },
        [IORING_OP_SPLICE] = {
                .needs_file             = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
-               .work_flags             = IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_PROVIDE_BUFFERS] = {},
        [IORING_OP_REMOVE_BUFFERS] = {},
@@ -1012,24 +986,18 @@ static const struct io_op_def io_op_defs[] = {
        [IORING_OP_SHUTDOWN] = {
                .needs_file             = 1,
        },
-       [IORING_OP_RENAMEAT] = {
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
-                                               IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
-       },
-       [IORING_OP_UNLINKAT] = {
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
-                                               IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
-       },
+       [IORING_OP_RENAMEAT] = {},
+       [IORING_OP_UNLINKAT] = {},
 };
 
 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         struct files_struct *files);
+static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
                        struct io_ring_ctx *ctx);
-static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
-                                    struct fixed_rsrc_ref_node *ref_node);
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
 
 static bool io_rw_reissue(struct io_kiocb *req);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
@@ -1116,161 +1084,18 @@ static bool io_match_task(struct io_kiocb *head,
                        continue;
                if (req->file && req->file->f_op == &io_uring_fops)
                        return true;
-               if ((req->work.flags & IO_WQ_WORK_FILES) &&
-                   req->work.identity->files == files)
+               if (req->task->files == files)
                        return true;
        }
        return false;
 }
 
-static void io_sq_thread_drop_mm_files(void)
-{
-       struct files_struct *files = current->files;
-       struct mm_struct *mm = current->mm;
-
-       if (mm) {
-               kthread_unuse_mm(mm);
-               mmput(mm);
-               current->mm = NULL;
-       }
-       if (files) {
-               struct nsproxy *nsproxy = current->nsproxy;
-
-               task_lock(current);
-               current->files = NULL;
-               current->nsproxy = NULL;
-               task_unlock(current);
-               put_files_struct(files);
-               put_nsproxy(nsproxy);
-       }
-}
-
-static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
-{
-       if (!current->files) {
-               struct files_struct *files;
-               struct nsproxy *nsproxy;
-
-               task_lock(ctx->sqo_task);
-               files = ctx->sqo_task->files;
-               if (!files) {
-                       task_unlock(ctx->sqo_task);
-                       return -EOWNERDEAD;
-               }
-               atomic_inc(&files->count);
-               get_nsproxy(ctx->sqo_task->nsproxy);
-               nsproxy = ctx->sqo_task->nsproxy;
-               task_unlock(ctx->sqo_task);
-
-               task_lock(current);
-               current->files = files;
-               current->nsproxy = nsproxy;
-               task_unlock(current);
-       }
-       return 0;
-}
-
-static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
-{
-       struct mm_struct *mm;
-
-       if (current->mm)
-               return 0;
-
-       task_lock(ctx->sqo_task);
-       mm = ctx->sqo_task->mm;
-       if (unlikely(!mm || !mmget_not_zero(mm)))
-               mm = NULL;
-       task_unlock(ctx->sqo_task);
-
-       if (mm) {
-               kthread_use_mm(mm);
-               return 0;
-       }
-
-       return -EFAULT;
-}
-
-static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
-                                          struct io_kiocb *req)
-{
-       const struct io_op_def *def = &io_op_defs[req->opcode];
-       int ret;
-
-       if (def->work_flags & IO_WQ_WORK_MM) {
-               ret = __io_sq_thread_acquire_mm(ctx);
-               if (unlikely(ret))
-                       return ret;
-       }
-
-       if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
-               ret = __io_sq_thread_acquire_files(ctx);
-               if (unlikely(ret))
-                       return ret;
-       }
-
-       return 0;
-}
-
-static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
-                                               struct io_kiocb *req)
-{
-       if (!(ctx->flags & IORING_SETUP_SQPOLL))
-               return 0;
-       return __io_sq_thread_acquire_mm_files(ctx, req);
-}
-
-static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
-                                        struct cgroup_subsys_state **cur_css)
-
-{
-#ifdef CONFIG_BLK_CGROUP
-       /* puts the old one when swapping */
-       if (*cur_css != ctx->sqo_blkcg_css) {
-               kthread_associate_blkcg(ctx->sqo_blkcg_css);
-               *cur_css = ctx->sqo_blkcg_css;
-       }
-#endif
-}
-
-static void io_sq_thread_unassociate_blkcg(void)
-{
-#ifdef CONFIG_BLK_CGROUP
-       kthread_associate_blkcg(NULL);
-#endif
-}
-
 static inline void req_set_fail_links(struct io_kiocb *req)
 {
        if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
                req->flags |= REQ_F_FAIL_LINK;
 }
 
-/*
- * None of these are dereferenced, they are simply used to check if any of
- * them have changed. If we're under current and check they are still the
- * same, we're fine to grab references to them for actual out-of-line use.
- */
-static void io_init_identity(struct io_identity *id)
-{
-       id->files = current->files;
-       id->mm = current->mm;
-#ifdef CONFIG_BLK_CGROUP
-       rcu_read_lock();
-       id->blkcg_css = blkcg_css();
-       rcu_read_unlock();
-#endif
-       id->creds = current_cred();
-       id->nsproxy = current->nsproxy;
-       id->fs = current->fs;
-       id->fsize = rlimit(RLIMIT_FSIZE);
-#ifdef CONFIG_AUDIT
-       id->loginuid = current->loginuid;
-       id->sessionid = current->sessionid;
-#endif
-       refcount_set(&id->count, 1);
-}
-
 static inline void __io_req_init_async(struct io_kiocb *req)
 {
        memset(&req->work, 0, sizeof(req->work));
@@ -1283,17 +1108,10 @@ static inline void __io_req_init_async(struct io_kiocb *req)
  */
 static inline void io_req_init_async(struct io_kiocb *req)
 {
-       struct io_uring_task *tctx = current->io_uring;
-
        if (req->flags & REQ_F_WORK_INITIALIZED)
                return;
 
        __io_req_init_async(req);
-
-       /* Grab a ref if this isn't our static identity */
-       req->work.identity = tctx->identity;
-       if (tctx->identity != &tctx->__identity)
-               refcount_inc(&req->work.identity->count);
 }
 
 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
@@ -1378,40 +1196,14 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
        return false;
 }
 
-static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
-{
-       if (req->work.identity == &tctx->__identity)
-               return;
-       if (refcount_dec_and_test(&req->work.identity->count))
-               kfree(req->work.identity);
-}
-
 static void io_req_clean_work(struct io_kiocb *req)
 {
        if (!(req->flags & REQ_F_WORK_INITIALIZED))
                return;
 
-       if (req->work.flags & IO_WQ_WORK_MM)
-               mmdrop(req->work.identity->mm);
-#ifdef CONFIG_BLK_CGROUP
-       if (req->work.flags & IO_WQ_WORK_BLKCG)
-               css_put(req->work.identity->blkcg_css);
-#endif
-       if (req->work.flags & IO_WQ_WORK_CREDS)
-               put_cred(req->work.identity->creds);
-       if (req->work.flags & IO_WQ_WORK_FS) {
-               struct fs_struct *fs = req->work.identity->fs;
-
-               spin_lock(&req->work.identity->fs->lock);
-               if (--fs->users)
-                       fs = NULL;
-               spin_unlock(&req->work.identity->fs->lock);
-               if (fs)
-                       free_fs_struct(fs);
-       }
-       if (req->work.flags & IO_WQ_WORK_FILES) {
-               put_files_struct(req->work.identity->files);
-               put_nsproxy(req->work.identity->nsproxy);
+       if (req->work.creds) {
+               put_cred(req->work.creds);
+               req->work.creds = NULL;
        }
        if (req->flags & REQ_F_INFLIGHT) {
                struct io_ring_ctx *ctx = req->ctx;
@@ -1427,54 +1219,6 @@ static void io_req_clean_work(struct io_kiocb *req)
        }
 
        req->flags &= ~REQ_F_WORK_INITIALIZED;
-       req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS |
-                            IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES);
-       io_put_identity(req->task->io_uring, req);
-}
-
-/*
- * Create a private copy of io_identity, since some fields don't match
- * the current context.
- */
-static bool io_identity_cow(struct io_kiocb *req)
-{
-       struct io_uring_task *tctx = current->io_uring;
-       const struct cred *creds = NULL;
-       struct io_identity *id;
-
-       if (req->work.flags & IO_WQ_WORK_CREDS)
-               creds = req->work.identity->creds;
-
-       id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
-       if (unlikely(!id)) {
-               req->work.flags |= IO_WQ_WORK_CANCEL;
-               return false;
-       }
-
-       /*
-        * We can safely just re-init the creds we copied  Either the field
-        * matches the current one, or we haven't grabbed it yet. The only
-        * exception is ->creds, through registered personalities, so handle
-        * that one separately.
-        */
-       io_init_identity(id);
-       if (creds)
-               id->creds = creds;
-
-       /* add one for this request */
-       refcount_inc(&id->count);
-
-       /* drop tctx and req identity references, if needed */
-       if (tctx->identity != &tctx->__identity &&
-           refcount_dec_and_test(&tctx->identity->count))
-               kfree(tctx->identity);
-       if (req->work.identity != &tctx->__identity &&
-           refcount_dec_and_test(&req->work.identity->count))
-               kfree(req->work.identity);
-
-       req->work.identity = id;
-       tctx->identity = id;
-       return true;
 }
 
 static void io_req_track_inflight(struct io_kiocb *req)
@@ -1491,79 +1235,6 @@ static void io_req_track_inflight(struct io_kiocb *req)
        }
 }
 
-static bool io_grab_identity(struct io_kiocb *req)
-{
-       const struct io_op_def *def = &io_op_defs[req->opcode];
-       struct io_identity *id = req->work.identity;
-
-       if (def->work_flags & IO_WQ_WORK_FSIZE) {
-               if (id->fsize != rlimit(RLIMIT_FSIZE))
-                       return false;
-               req->work.flags |= IO_WQ_WORK_FSIZE;
-       }
-#ifdef CONFIG_BLK_CGROUP
-       if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
-           (def->work_flags & IO_WQ_WORK_BLKCG)) {
-               rcu_read_lock();
-               if (id->blkcg_css != blkcg_css()) {
-                       rcu_read_unlock();
-                       return false;
-               }
-               /*
-                * This should be rare, either the cgroup is dying or the task
-                * is moving cgroups. Just punt to root for the handful of ios.
-                */
-               if (css_tryget_online(id->blkcg_css))
-                       req->work.flags |= IO_WQ_WORK_BLKCG;
-               rcu_read_unlock();
-       }
-#endif
-       if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
-               if (id->creds != current_cred())
-                       return false;
-               get_cred(id->creds);
-               req->work.flags |= IO_WQ_WORK_CREDS;
-       }
-#ifdef CONFIG_AUDIT
-       if (!uid_eq(current->loginuid, id->loginuid) ||
-           current->sessionid != id->sessionid)
-               return false;
-#endif
-       if (!(req->work.flags & IO_WQ_WORK_FS) &&
-           (def->work_flags & IO_WQ_WORK_FS)) {
-               if (current->fs != id->fs)
-                       return false;
-               spin_lock(&id->fs->lock);
-               if (!id->fs->in_exec) {
-                       id->fs->users++;
-                       req->work.flags |= IO_WQ_WORK_FS;
-               } else {
-                       req->work.flags |= IO_WQ_WORK_CANCEL;
-               }
-               spin_unlock(&current->fs->lock);
-       }
-       if (!(req->work.flags & IO_WQ_WORK_FILES) &&
-           (def->work_flags & IO_WQ_WORK_FILES) &&
-           !(req->flags & REQ_F_NO_FILE_TABLE)) {
-               if (id->files != current->files ||
-                   id->nsproxy != current->nsproxy)
-                       return false;
-               atomic_inc(&id->files->count);
-               get_nsproxy(id->nsproxy);
-               req->work.flags |= IO_WQ_WORK_FILES;
-               io_req_track_inflight(req);
-       }
-       if (!(req->work.flags & IO_WQ_WORK_MM) &&
-           (def->work_flags & IO_WQ_WORK_MM)) {
-               if (id->mm != current->mm)
-                       return false;
-               mmgrab(id->mm);
-               req->work.flags |= IO_WQ_WORK_MM;
-       }
-
-       return true;
-}
-
 static void io_prep_async_work(struct io_kiocb *req)
 {
        const struct io_op_def *def = &io_op_defs[req->opcode];
@@ -1581,17 +1252,8 @@ static void io_prep_async_work(struct io_kiocb *req)
                if (def->unbound_nonreg_file)
                        req->work.flags |= IO_WQ_WORK_UNBOUND;
        }
-
-       /* if we fail grabbing identity, we must COW, regrab, and retry */
-       if (io_grab_identity(req))
-               return;
-
-       if (!io_identity_cow(req))
-               return;
-
-       /* can't fail at this point */
-       if (!io_grab_identity(req))
-               WARN_ON(1);
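+       /* creds are the only piece of request identity still captured here */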
+       if (!req->work.creds)
+               req->work.creds = get_current_cred();
 }
 
 static void io_prep_async_link(struct io_kiocb *req)
@@ -1606,10 +1268,14 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link = io_prep_linked_timeout(req);
+       struct io_uring_task *tctx = req->task->io_uring;
+
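+       /* async work is now queued on the task's io-wq, not a per-ctx one */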
+       BUG_ON(!tctx);
+       BUG_ON(!tctx->io_wq);
 
        trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
                                        &req->work, req->flags);
-       io_wq_enqueue(ctx->io_wq, &req->work);
+       io_wq_enqueue(tctx->io_wq, &req->work);
        return link;
 }
 
@@ -2227,10 +1893,10 @@ static void tctx_task_work(struct callback_head *cb)
 {
        struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
 
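+       /*
+        * Clear task_state before running the work list: a racing
+        * io_req_task_work_add() will then queue fresh task_work for any
+        * request added while we drain, rather than seeing the bit set and
+        * assuming this iteration will pick it up.
+        */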
+       clear_bit(0, &tctx->task_state);
+
        while (__tctx_task_work(tctx))
                cond_resched();
-
-       clear_bit(0, &tctx->task_state);
 }
 
 static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
@@ -2303,11 +1969,14 @@ static int io_req_task_work_add(struct io_kiocb *req)
 static void io_req_task_work_add_fallback(struct io_kiocb *req,
                                          task_work_func_t cb)
 {
-       struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
+       struct io_ring_ctx *ctx = req->ctx;
+       struct callback_head *head;
 
        init_task_work(&req->task_work, cb);
-       task_work_add(tsk, &req->task_work, TWA_NONE);
-       wake_up_process(tsk);
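+       /* lock-free push onto ctx->exit_task_work, run when the ring exits */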
+       do {
+               head = READ_ONCE(ctx->exit_task_work);
+               req->task_work.next = head;
+       } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
 }
 
 static void __io_req_task_cancel(struct io_kiocb *req, int error)
@@ -2329,7 +1998,9 @@ static void io_req_task_cancel(struct callback_head *cb)
        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
        struct io_ring_ctx *ctx = req->ctx;
 
-       __io_req_task_cancel(req, -ECANCELED);
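+       /* take uring_lock since IOPOLL completions run under it */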
+       mutex_lock(&ctx->uring_lock);
+       __io_req_task_cancel(req, req->result);
+       mutex_unlock(&ctx->uring_lock);
        percpu_ref_put(&ctx->refs);
 }
 
@@ -2339,15 +2010,11 @@ static void __io_req_task_submit(struct io_kiocb *req)
 
        /* ctx stays valid until unlock, even if we drop all our ctx->refs */
        mutex_lock(&ctx->uring_lock);
-       if (!ctx->sqo_dead && !(current->flags & PF_EXITING) &&
-           !io_sq_thread_acquire_mm_files(ctx, req))
+       if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve)
                __io_queue_sqe(req);
        else
                __io_req_task_cancel(req, -EFAULT);
        mutex_unlock(&ctx->uring_lock);
-
-       if (ctx->flags & IORING_SETUP_SQPOLL)
-               io_sq_thread_drop_mm_files();
 }
 
 static void io_req_task_submit(struct callback_head *cb)
@@ -2364,11 +2031,22 @@ static void io_req_task_queue(struct io_kiocb *req)
        req->task_work.func = io_req_task_submit;
        ret = io_req_task_work_add(req);
        if (unlikely(ret)) {
+               req->result = -ECANCELED;
                percpu_ref_get(&req->ctx->refs);
                io_req_task_work_add_fallback(req, io_req_task_cancel);
        }
 }
 
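+/* punt to task context and fail the request there with the given error */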
+static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
+{
+       percpu_ref_get(&req->ctx->refs);
+       req->result = ret;
+       req->task_work.func = io_req_task_cancel;
+
+       if (unlikely(io_req_task_work_add(req)))
+               io_req_task_work_add_fallback(req, io_req_task_cancel);
+}
+
 static inline void io_queue_next(struct io_kiocb *req)
 {
        struct io_kiocb *nxt = io_req_find_next(req);
@@ -2800,18 +2478,22 @@ static bool io_rw_reissue(struct io_kiocb *req)
 {
 #ifdef CONFIG_BLOCK
        umode_t mode = file_inode(req->file)->i_mode;
-       int ret;
 
        if (!S_ISBLK(mode) && !S_ISREG(mode))
                return false;
        if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
                return false;
+       /*
+        * If ref is dying, we might be running poll reap from the exit work.
+        * Don't attempt to reissue from that path, just let it fail with
+        * -EAGAIN.
+        */
+       if (percpu_ref_is_dying(&req->ctx->refs))
+               return false;
 
        lockdep_assert_held(&req->ctx->uring_lock);
 
-       ret = io_sq_thread_acquire_mm_files(req->ctx, req);
-
-       if (!ret && io_resubmit_prep(req)) {
+       if (io_resubmit_prep(req)) {
                refcount_inc(&req->refs);
                io_queue_async_work(req);
                return true;
@@ -3467,19 +3149,9 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
 
 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-       ssize_t ret;
-
-       ret = io_prep_rw(req, sqe);
-       if (ret)
-               return ret;
-
        if (unlikely(!(req->file->f_mode & FMODE_READ)))
                return -EBADF;
-
-       /* either don't need iovec imported or already have it */
-       if (!req->async_data)
-               return 0;
-       return io_rw_prep_async(req, READ);
+       return io_prep_rw(req, sqe);
 }
 
 /*
@@ -3607,10 +3279,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
        ret = io_iter_do_read(req, iter);
 
        if (ret == -EIOCBQUEUED) {
-               /* it's faster to check here then delegate to kfree */
-               if (iovec)
-                       kfree(iovec);
-               return 0;
+               goto out_free;
        } else if (ret == -EAGAIN) {
                /* IOPOLL retry should happen for io-wq threads */
                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -3631,6 +3300,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
        if (ret2)
                return ret2;
 
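+       /* the iovec was copied into req->async_data, don't free it below */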
+       iovec = NULL;
        rw = req->async_data;
        /* now use our persistent iterator, if we aren't already */
        iter = &rw->iter;
@@ -3657,24 +3327,18 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
        } while (ret > 0 && ret < io_size);
 done:
        kiocb_done(kiocb, ret, issue_flags);
+out_free:
+       /* it's faster to check here than to delegate to kfree */
+       if (iovec)
+               kfree(iovec);
        return 0;
 }
 
 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-       ssize_t ret;
-
-       ret = io_prep_rw(req, sqe);
-       if (ret)
-               return ret;
-
        if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
                return -EBADF;
-
-       /* either don't need iovec imported or already have it */
-       if (!req->async_data)
-               return 0;
-       return io_rw_prep_async(req, WRITE);
+       return io_prep_rw(req, sqe);
 }
 
 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
@@ -4011,7 +3675,7 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
        return 0;
 }
 
-static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
@@ -4598,13 +4262,10 @@ err:
        return 0;
 }
 
-static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
-       if (!req->file)
-               return -EBADF;
-
        if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
@@ -4664,11 +4325,21 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
                                   req->sr_msg.msg_flags, &iomsg->free_iov);
 }
 
+static int io_sendmsg_prep_async(struct io_kiocb *req)
+{
+       int ret;
+
+       if (!io_op_defs[req->opcode].needs_async_data)
+               return 0;
+       ret = io_sendmsg_copy_hdr(req, req->async_data);
+       if (!ret)
+               req->flags |= REQ_F_NEED_CLEANUP;
+       return ret;
+}
+
 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-       struct io_async_msghdr *async_msg = req->async_data;
        struct io_sr_msg *sr = &req->sr_msg;
-       int ret;
 
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
@@ -4681,13 +4352,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        if (req->ctx->compat)
                sr->msg_flags |= MSG_CMSG_COMPAT;
 #endif
-
-       if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
-               return 0;
-       ret = io_sendmsg_copy_hdr(req, async_msg);
-       if (!ret)
-               req->flags |= REQ_F_NEED_CLEANUP;
-       return ret;
+       return 0;
 }
 
 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
@@ -4881,13 +4546,22 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
        return io_put_kbuf(req, req->sr_msg.kbuf);
 }
 
-static int io_recvmsg_prep(struct io_kiocb *req,
-                          const struct io_uring_sqe *sqe)
+static int io_recvmsg_prep_async(struct io_kiocb *req)
 {
-       struct io_async_msghdr *async_msg = req->async_data;
-       struct io_sr_msg *sr = &req->sr_msg;
        int ret;
 
+       if (!io_op_defs[req->opcode].needs_async_data)
+               return 0;
+       ret = io_recvmsg_copy_hdr(req, req->async_data);
+       if (!ret)
+               req->flags |= REQ_F_NEED_CLEANUP;
+       return ret;
+}
+
+static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+       struct io_sr_msg *sr = &req->sr_msg;
+
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
 
@@ -4900,13 +4574,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
        if (req->ctx->compat)
                sr->msg_flags |= MSG_CMSG_COMPAT;
 #endif
-
-       if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
-               return 0;
-       ret = io_recvmsg_copy_hdr(req, async_msg);
-       if (!ret)
-               req->flags |= REQ_F_NEED_CLEANUP;
-       return ret;
+       return 0;
 }
 
 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
@@ -5059,10 +4727,17 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
        return 0;
 }
 
+static int io_connect_prep_async(struct io_kiocb *req)
+{
+       struct io_async_connect *io = req->async_data;
+       struct io_connect *conn = &req->connect;
+
+       return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
+}
+
 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct io_connect *conn = &req->connect;
-       struct io_async_connect *io = req->async_data;
 
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
@@ -5071,12 +4746,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
        conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
        conn->addr_len =  READ_ONCE(sqe->addr2);
-
-       if (!io)
-               return 0;
-
-       return move_addr_to_kernel(conn->addr, conn->addr_len,
-                                       &io->address);
+       return 0;
 }
 
 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
@@ -5121,72 +4791,48 @@ out:
        return 0;
 }
 #else /* !CONFIG_NET */
-static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-       return -EOPNOTSUPP;
-}
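+/* networking compiled out: generate stubs that return -EOPNOTSUPP */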
+#define IO_NETOP_FN(op)                                                        \
+static int io_##op(struct io_kiocb *req, unsigned int issue_flags)     \
+{                                                                      \
+       return -EOPNOTSUPP;                                             \
+}
+
+#define IO_NETOP_PREP(op)                                              \
+IO_NETOP_FN(op)                                                                \
+static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
+{                                                                      \
+       return -EOPNOTSUPP;                                             \
+}                                                                      \
+
+#define IO_NETOP_PREP_ASYNC(op)                                                \
+IO_NETOP_PREP(op)                                                      \
+static int io_##op##_prep_async(struct io_kiocb *req)                  \
+{                                                                      \
+       return -EOPNOTSUPP;                                             \
+}
+
+IO_NETOP_PREP_ASYNC(sendmsg);
+IO_NETOP_PREP_ASYNC(recvmsg);
+IO_NETOP_PREP_ASYNC(connect);
+IO_NETOP_PREP(accept);
+IO_NETOP_FN(send);
+IO_NETOP_FN(recv);
+#endif /* CONFIG_NET */
 
-static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
-       return -EOPNOTSUPP;
-}
+struct io_poll_table {
+       struct poll_table_struct pt;
+       struct io_kiocb *req;
+       int error;
+};
 
-static int io_send(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
+                          __poll_t mask, task_work_func_t func)
 {
-       return -EOPNOTSUPP;
-}
+       int ret;
 
-static int io_recvmsg_prep(struct io_kiocb *req,
-                          const struct io_uring_sqe *sqe)
-{
-       return -EOPNOTSUPP;
-}
-
-static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
-{
-       return -EOPNOTSUPP;
-}
-
-static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
-{
-       return -EOPNOTSUPP;
-}
-
-static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-       return -EOPNOTSUPP;
-}
-
-static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
-{
-       return -EOPNOTSUPP;
-}
-
-static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
-{
-       return -EOPNOTSUPP;
-}
-
-static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
-{
-       return -EOPNOTSUPP;
-}
-#endif /* CONFIG_NET */
-
-struct io_poll_table {
-       struct poll_table_struct pt;
-       struct io_kiocb *req;
-       int error;
-};
-
-static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
-                          __poll_t mask, task_work_func_t func)
-{
-       int ret;
-
-       /* for instances that support it check for an event match first: */
-       if (mask && !(mask & poll->events))
-               return 0;
+       /* for instances that support it check for an event match first: */
+       if (mask && !(mask & poll->events))
+               return 0;
 
        trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
 
@@ -5952,12 +5598,15 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data)
        return req->user_data == (unsigned long) data;
 }
 
-static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
+static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
 {
        enum io_wq_cancel cancel_ret;
        int ret = 0;
 
-       cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
+       if (!tctx->io_wq)
+               return -ENOENT;
+
+       cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
        switch (cancel_ret) {
        case IO_WQ_CANCEL_OK:
                ret = 0;
@@ -5980,7 +5629,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
        unsigned long flags;
        int ret;
 
-       ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+       ret = io_async_cancel_one(req->task->io_uring,
+                                       (void *) (unsigned long) sqe_addr);
        if (ret != -ENOENT) {
                spin_lock_irqsave(&ctx->completion_lock, flags);
                goto done;
@@ -6084,9 +5734,9 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        case IORING_OP_POLL_REMOVE:
                return io_poll_remove_prep(req, sqe);
        case IORING_OP_FSYNC:
-               return io_prep_fsync(req, sqe);
+               return io_fsync_prep(req, sqe);
        case IORING_OP_SYNC_FILE_RANGE:
-               return io_prep_sfr(req, sqe);
+               return io_sfr_prep(req, sqe);
        case IORING_OP_SENDMSG:
        case IORING_OP_SEND:
                return io_sendmsg_prep(req, sqe);
@@ -6144,14 +5794,39 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        return -EINVAL;
 }
 
-static int io_req_defer_prep(struct io_kiocb *req,
-                            const struct io_uring_sqe *sqe)
+static int io_req_prep_async(struct io_kiocb *req)
+{
+       switch (req->opcode) {
+       case IORING_OP_READV:
+       case IORING_OP_READ_FIXED:
+       case IORING_OP_READ:
+               return io_rw_prep_async(req, READ);
+       case IORING_OP_WRITEV:
+       case IORING_OP_WRITE_FIXED:
+       case IORING_OP_WRITE:
+               return io_rw_prep_async(req, WRITE);
+       case IORING_OP_SENDMSG:
+       case IORING_OP_SEND:
+               return io_sendmsg_prep_async(req);
+       case IORING_OP_RECVMSG:
+       case IORING_OP_RECV:
+               return io_recvmsg_prep_async(req);
+       case IORING_OP_CONNECT:
+               return io_connect_prep_async(req);
+       }
+       return 0;
+}
+
+static int io_req_defer_prep(struct io_kiocb *req)
 {
-       if (!sqe)
+       if (!io_op_defs[req->opcode].needs_async_data)
+               return 0;
+       /* some opcodes init it during the initial prep */
+       if (req->async_data)
                return 0;
-       if (io_alloc_async_data(req))
+       if (__io_alloc_async_data(req))
                return -EAGAIN;
-       return io_req_prep(req, sqe);
+       return io_req_prep_async(req);
 }
 
 static u32 io_get_sequence(struct io_kiocb *req)
@@ -6167,7 +5842,7 @@ static u32 io_get_sequence(struct io_kiocb *req)
        return total_submitted - nr_reqs;
 }
 
-static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_req_defer(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
        struct io_defer_entry *de;
@@ -6184,11 +5859,9 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
                return 0;
 
-       if (!req->async_data) {
-               ret = io_req_defer_prep(req, sqe);
-               if (ret)
-                       return ret;
-       }
+       ret = io_req_defer_prep(req);
+       if (ret)
+               return ret;
        io_prep_async_link(req);
        de = kmalloc(sizeof(*de), GFP_KERNEL);
        if (!de)
@@ -6427,29 +6100,11 @@ static void io_wq_submit_work(struct io_wq_work *work)
                } while (1);
        }
 
+       /* avoid locking problems by failing it from a clean context */
        if (ret) {
-               struct io_ring_ctx *lock_ctx = NULL;
-
-               if (req->ctx->flags & IORING_SETUP_IOPOLL)
-                       lock_ctx = req->ctx;
-
-               /*
-                * io_iopoll_complete() does not hold completion_lock to
-                * complete polled io, so here for polled io, we can not call
-                * io_req_complete() directly, otherwise there maybe concurrent
-                * access to cqring, defer_list, etc, which is not safe. Given
-                * that io_iopoll_complete() is always called under uring_lock,
-                * so here for polled io, we also get uring_lock to complete
-                * it.
-                */
-               if (lock_ctx)
-                       mutex_lock(&lock_ctx->uring_lock);
-
-               req_set_fail_links(req);
-               io_req_complete(req, ret);
-
-               if (lock_ctx)
-                       mutex_unlock(&lock_ctx->uring_lock);
+               /* io-wq is going to take one down */
+               refcount_inc(&req->refs);
+               io_req_task_queue_fail(req, ret);
        }
 }
 
@@ -6564,10 +6219,9 @@ static void __io_queue_sqe(struct io_kiocb *req)
        const struct cred *old_creds = NULL;
        int ret;
 
-       if ((req->flags & REQ_F_WORK_INITIALIZED) &&
-           (req->work.flags & IO_WQ_WORK_CREDS) &&
-           req->work.identity->creds != current_cred())
-               old_creds = override_creds(req->work.identity->creds);
+       if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+           req->work.creds != current_cred())
+               old_creds = override_creds(req->work.creds);
 
        ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 
@@ -6607,11 +6261,11 @@ static void __io_queue_sqe(struct io_kiocb *req)
                io_queue_linked_timeout(linked_timeout);
 }
 
-static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void io_queue_sqe(struct io_kiocb *req)
 {
        int ret;
 
-       ret = io_req_defer(req, sqe);
+       ret = io_req_defer(req);
        if (ret) {
                if (ret != -EIOCBQUEUED) {
 fail_req:
@@ -6620,42 +6274,139 @@ fail_req:
                        io_req_complete(req, ret);
                }
        } else if (req->flags & REQ_F_FORCE_ASYNC) {
-               if (!req->async_data) {
-                       ret = io_req_defer_prep(req, sqe);
-                       if (unlikely(ret))
-                               goto fail_req;
-               }
+               ret = io_req_defer_prep(req);
+               if (unlikely(ret))
+                       goto fail_req;
                io_queue_async_work(req);
        } else {
-               if (sqe) {
-                       ret = io_req_prep(req, sqe);
-                       if (unlikely(ret))
-                               goto fail_req;
-               }
                __io_queue_sqe(req);
        }
 }
 
-static inline void io_queue_link_head(struct io_kiocb *req)
+/*
+ * Check SQE restrictions (opcode and flags).
+ *
+ * Returns 'true' if SQE is allowed, 'false' otherwise.
+ */
+static inline bool io_check_restriction(struct io_ring_ctx *ctx,
+                                       struct io_kiocb *req,
+                                       unsigned int sqe_flags)
 {
-       if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
-               io_put_req(req);
-               io_req_complete(req, -ECANCELED);
-       } else
-               io_queue_sqe(req, NULL);
+       if (!ctx->restricted)
+               return true;
+
+       if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
+               return false;
+
+       if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
+           ctx->restrictions.sqe_flags_required)
+               return false;
+
+       if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
+                         ctx->restrictions.sqe_flags_required))
+               return false;
+
+       return true;
 }
 
-struct io_submit_link {
-       struct io_kiocb *head;
-       struct io_kiocb *last;
-};
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+                      const struct io_uring_sqe *sqe)
+{
+       struct io_submit_state *state;
+       unsigned int sqe_flags;
+       int id, ret = 0;
+
+       req->opcode = READ_ONCE(sqe->opcode);
+       /* same numerical values with corresponding REQ_F_*, safe to copy */
+       req->flags = sqe_flags = READ_ONCE(sqe->flags);
+       req->user_data = READ_ONCE(sqe->user_data);
+       req->async_data = NULL;
+       req->file = NULL;
+       req->ctx = ctx;
+       req->link = NULL;
+       req->fixed_rsrc_refs = NULL;
+       /* one is dropped after submission, the other at completion */
+       refcount_set(&req->refs, 2);
+       req->task = current;
+       req->result = 0;
+
+       /* enforce forwards compatibility on users */
+       if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
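+               /* clear copied flags so invalid bits can't reach the fail path */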
+               req->flags = 0;
+               return -EINVAL;
+       }
+
+       if (unlikely(req->opcode >= IORING_OP_LAST))
+               return -EINVAL;
+
+       if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
+               return -EACCES;
+
+       if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+           !io_op_defs[req->opcode].buffer_select)
+               return -EOPNOTSUPP;
+
+       id = READ_ONCE(sqe->personality);
+       if (id) {
+               __io_req_init_async(req);
+               req->work.creds = idr_find(&ctx->personality_idr, id);
+               if (unlikely(!req->work.creds))
+                       return -EINVAL;
+               get_cred(req->work.creds);
+       }
+
+       state = &ctx->submit_state;
+
+       /*
+        * Plug now if we have more than 1 IO left after this, and the target
+        * is potentially a read/write to block based storage.
+        */
+       if (!state->plug_started && state->ios_left > 1 &&
+           io_op_defs[req->opcode].plug) {
+               blk_start_plug(&state->plug);
+               state->plug_started = true;
+       }
+
+       if (io_op_defs[req->opcode].needs_file) {
+               bool fixed = req->flags & REQ_F_FIXED_FILE;
+
+               req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+               if (unlikely(!req->file))
+                       ret = -EBADF;
+       }
+
+       state->ios_left--;
+       return ret;
+}
 
-static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-                        struct io_submit_link *link)
+static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+                        const struct io_uring_sqe *sqe)
 {
-       struct io_ring_ctx *ctx = req->ctx;
+       struct io_submit_link *link = &ctx->submit_state.link;
        int ret;
 
+       ret = io_init_req(ctx, req, sqe);
+       if (unlikely(ret)) {
+fail_req:
+               io_put_req(req);
+               io_req_complete(req, ret);
+               if (link->head) {
+                       /* fail even hard links since we don't submit */
+                       link->head->flags |= REQ_F_FAIL_LINK;
+                       io_put_req(link->head);
+                       io_req_complete(link->head, -ECANCELED);
+                       link->head = NULL;
+               }
+               return ret;
+       }
+       ret = io_req_prep(req, sqe);
+       if (unlikely(ret))
+               goto fail_req;
+
+       /* don't need @sqe from now on */
+       trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
+                               true, ctx->flags & IORING_SETUP_SQPOLL);
+
        /*
         * If we already have a head request, queue this one for async
         * submittal once the head completes. If we don't have a head but
@@ -6677,19 +6428,16 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        head->flags |= REQ_F_IO_DRAIN;
                        ctx->drain_next = 1;
                }
-               ret = io_req_defer_prep(req, sqe);
-               if (unlikely(ret)) {
-                       /* fail even hard links since we don't submit */
-                       head->flags |= REQ_F_FAIL_LINK;
-                       return ret;
-               }
+               ret = io_req_defer_prep(req);
+               if (unlikely(ret))
+                       goto fail_req;
                trace_io_uring_link(ctx, req, head);
                link->last->link = req;
                link->last = req;
 
                /* last request of a link, enqueue the link */
                if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
-                       io_queue_link_head(head);
+                       io_queue_sqe(head);
                        link->head = NULL;
                }
        } else {
@@ -6698,13 +6446,10 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        ctx->drain_next = 0;
                }
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
-                       ret = io_req_defer_prep(req, sqe);
-                       if (unlikely(ret))
-                               req->flags |= REQ_F_FAIL_LINK;
                        link->head = req;
                        link->last = req;
                } else {
-                       io_queue_sqe(req, sqe);
+                       io_queue_sqe(req);
                }
        }
 
@@ -6717,6 +6462,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 static void io_submit_state_end(struct io_submit_state *state,
                                struct io_ring_ctx *ctx)
 {
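+       /* flush a partially assembled link chain, if any */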
+       if (state->link.head)
+               io_queue_sqe(state->link.head);
        if (state->comp.nr)
                io_submit_flush_completions(&state->comp, ctx);
        if (state->plug_started)
@@ -6732,6 +6479,8 @@ static void io_submit_state_start(struct io_submit_state *state,
 {
        state->plug_started = false;
        state->ios_left = max_ios;
+       /* set only head, no need to init link_last in advance */
+       state->link.head = NULL;
 }
 
 static void io_commit_sqring(struct io_ring_ctx *ctx)
@@ -6777,117 +6526,9 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
        return NULL;
 }
 
-/*
- * Check SQE restrictions (opcode and flags).
- *
- * Returns 'true' if SQE is allowed, 'false' otherwise.
- */
-static inline bool io_check_restriction(struct io_ring_ctx *ctx,
-                                       struct io_kiocb *req,
-                                       unsigned int sqe_flags)
-{
-       if (!ctx->restricted)
-               return true;
-
-       if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
-               return false;
-
-       if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
-           ctx->restrictions.sqe_flags_required)
-               return false;
-
-       if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
-                         ctx->restrictions.sqe_flags_required))
-               return false;
-
-       return true;
-}
-
-#define SQE_VALID_FLAGS        (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
-                               IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
-                               IOSQE_BUFFER_SELECT)
-
-static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
-                      const struct io_uring_sqe *sqe)
-{
-       struct io_submit_state *state;
-       unsigned int sqe_flags;
-       int id, ret = 0;
-
-       req->opcode = READ_ONCE(sqe->opcode);
-       /* same numerical values with corresponding REQ_F_*, safe to copy */
-       req->flags = sqe_flags = READ_ONCE(sqe->flags);
-       req->user_data = READ_ONCE(sqe->user_data);
-       req->async_data = NULL;
-       req->file = NULL;
-       req->ctx = ctx;
-       req->link = NULL;
-       req->fixed_rsrc_refs = NULL;
-       /* one is dropped after submission, the other at completion */
-       refcount_set(&req->refs, 2);
-       req->task = current;
-       req->result = 0;
-
-       /* enforce forwards compatibility on users */
-       if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
-               return -EINVAL;
-
-       if (unlikely(req->opcode >= IORING_OP_LAST))
-               return -EINVAL;
-
-       if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
-               return -EFAULT;
-
-       if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
-               return -EACCES;
-
-       if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
-           !io_op_defs[req->opcode].buffer_select)
-               return -EOPNOTSUPP;
-
-       id = READ_ONCE(sqe->personality);
-       if (id) {
-               struct io_identity *iod;
-
-               iod = idr_find(&ctx->personality_idr, id);
-               if (unlikely(!iod))
-                       return -EINVAL;
-               refcount_inc(&iod->count);
-
-               __io_req_init_async(req);
-               get_cred(iod->creds);
-               req->work.identity = iod;
-               req->work.flags |= IO_WQ_WORK_CREDS;
-       }
-
-       state = &ctx->submit_state;
-
-       /*
-        * Plug now if we have more than 1 IO left after this, and the target
-        * is potentially a read/write to block based storage.
-        */
-       if (!state->plug_started && state->ios_left > 1 &&
-           io_op_defs[req->opcode].plug) {
-               blk_start_plug(&state->plug);
-               state->plug_started = true;
-       }
-
-       if (io_op_defs[req->opcode].needs_file) {
-               bool fixed = req->flags & REQ_F_FIXED_FILE;
-
-               req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
-               if (unlikely(!req->file))
-                       ret = -EBADF;
-       }
-
-       state->ios_left--;
-       return ret;
-}
-
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 {
-       struct io_submit_link link;
-       int i, submitted = 0;
+       int submitted = 0;
 
        /* if we have a backlog and couldn't flush it all, return BUSY */
        if (test_bit(0, &ctx->sq_check_overflow)) {
@@ -6903,14 +6544,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
        percpu_counter_add(&current->io_uring->inflight, nr);
        refcount_add(nr, &current->usage);
-
        io_submit_state_start(&ctx->submit_state, nr);
-       link.head = NULL;
 
-       for (i = 0; i < nr; i++) {
+       while (submitted < nr) {
                const struct io_uring_sqe *sqe;
                struct io_kiocb *req;
-               int err;
 
                req = io_alloc_req(ctx);
                if (unlikely(!req)) {
@@ -6925,20 +6563,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
                }
                /* will complete beyond this point, count as submitted */
                submitted++;
-
-               err = io_init_req(ctx, req, sqe);
-               if (unlikely(err)) {
-fail_req:
-                       io_put_req(req);
-                       io_req_complete(req, err);
+               if (io_submit_sqe(ctx, req, sqe))
                        break;
-               }
-
-               trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
-                                       true, ctx->flags & IORING_SETUP_SQPOLL);
-               err = io_submit_sqe(req, sqe, &link);
-               if (err)
-                       goto fail_req;
        }
 
        if (unlikely(submitted != nr)) {
@@ -6950,10 +6576,8 @@ fail_req:
                percpu_counter_sub(&tctx->inflight, unused);
                put_task_struct_many(current, unused);
        }
-       if (link.head)
-               io_queue_link_head(link.head);
-       io_submit_state_end(&ctx->submit_state, ctx);
 
+       io_submit_state_end(&ctx->submit_state, ctx);
         /* Commit SQ ring head once we've consumed and submitted all SQEs */
        io_commit_sqring(ctx);
 
@@ -7030,71 +6654,97 @@ static void io_sqd_init_new(struct io_sq_data *sqd)
        io_sqd_update_thread_idle(sqd);
 }
 
+static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
+{
+       return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+}
+
+static bool io_sq_thread_should_park(struct io_sq_data *sqd)
+{
+       return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+}
+
+static void io_sq_thread_parkme(struct io_sq_data *sqd)
+{
+       for (;;) {
+               /*
+                * TASK_PARKED is a special state; we must serialize against
+                * possible pending wakeups to avoid store-store collisions on
+                * task->state.
+                *
+                * Such a collision might possibly result in the task state
+                * changing from TASK_PARKED and us failing the
+                * wait_task_inactive() in kthread_park().
+                */
+               set_special_state(TASK_PARKED);
+               if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
+                       break;
+
+               /*
+                * Thread is going to call schedule(), do not preempt it,
+                * or the caller of kthread_park() may spend more time in
+                * wait_task_inactive().
+                */
+               preempt_disable();
+               complete(&sqd->completion);
+               schedule_preempt_disabled();
+               preempt_enable();
+       }
+       __set_current_state(TASK_RUNNING);
+}
+
 static int io_sq_thread(void *data)
 {
-       struct cgroup_subsys_state *cur_css = NULL;
-       struct files_struct *old_files = current->files;
-       struct nsproxy *old_nsproxy = current->nsproxy;
-       const struct cred *old_cred = NULL;
        struct io_sq_data *sqd = data;
        struct io_ring_ctx *ctx;
        unsigned long timeout = 0;
+       char buf[TASK_COMM_LEN];
        DEFINE_WAIT(wait);
 
-       task_lock(current);
-       current->files = NULL;
-       current->nsproxy = NULL;
-       task_unlock(current);
+       sprintf(buf, "iou-sqp-%d", sqd->task_pid);
+       set_task_comm(current, buf);
+       sqd->thread = current;
+       current->pf_io_worker = NULL;
 
-       while (!kthread_should_stop()) {
+       if (sqd->sq_cpu != -1)
+               set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
+       else
+               set_cpus_allowed_ptr(current, cpu_online_mask);
+       current->flags |= PF_NO_SETAFFINITY;
+
+       complete(&sqd->completion);
+
+       wait_for_completion(&sqd->startup);
+
+       while (!io_sq_thread_should_stop(sqd)) {
                int ret;
                bool cap_entries, sqt_spin, needs_sched;
 
                /*
                 * Any changes to the sqd lists are synchronized through the
-                * kthread parking. This synchronizes the thread vs users,
+                * thread parking. This synchronizes the thread vs users,
                 * the users are synchronized on the sqd->ctx_lock.
                 */
-               if (kthread_should_park()) {
-                       kthread_parkme();
-                       /*
-                        * When sq thread is unparked, in case the previous park operation
-                        * comes from io_put_sq_data(), which means that sq thread is going
-                        * to be stopped, so here needs to have a check.
-                        */
-                       if (kthread_should_stop())
-                               break;
+               if (io_sq_thread_should_park(sqd)) {
+                       io_sq_thread_parkme(sqd);
+                       continue;
                }
-
                if (unlikely(!list_empty(&sqd->ctx_new_list))) {
                        io_sqd_init_new(sqd);
                        timeout = jiffies + sqd->sq_thread_idle;
                }
-
+               if (fatal_signal_pending(current))
+                       break;
                sqt_spin = false;
                cap_entries = !list_is_singular(&sqd->ctx_list);
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-                       if (current->cred != ctx->creds) {
-                               if (old_cred)
-                                       revert_creds(old_cred);
-                               old_cred = override_creds(ctx->creds);
-                       }
-                       io_sq_thread_associate_blkcg(ctx, &cur_css);
-#ifdef CONFIG_AUDIT
-                       current->loginuid = ctx->loginuid;
-                       current->sessionid = ctx->sessionid;
-#endif
-
                        ret = __io_sq_thread(ctx, cap_entries);
                        if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
                                sqt_spin = true;
-
-                       io_sq_thread_drop_mm_files();
                }
 
                if (sqt_spin || !time_after(jiffies, timeout)) {
                        io_run_task_work();
-                       io_sq_thread_drop_mm_files();
                        cond_resched();
                        if (sqt_spin)
                                timeout = jiffies + sqd->sq_thread_idle;
@@ -7115,7 +6765,7 @@ static int io_sq_thread(void *data)
                        }
                }
 
-               if (needs_sched && !kthread_should_park()) {
+               if (needs_sched && !io_sq_thread_should_park(sqd)) {
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_set_wakeup_flag(ctx);
 
@@ -7128,22 +6778,28 @@ static int io_sq_thread(void *data)
                timeout = jiffies + sqd->sq_thread_idle;
        }
 
-       io_run_task_work();
-       io_sq_thread_drop_mm_files();
+       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+               io_uring_cancel_sqpoll(ctx);
 
-       if (cur_css)
-               io_sq_thread_unassociate_blkcg();
-       if (old_cred)
-               revert_creds(old_cred);
+       io_run_task_work();
 
-       task_lock(current);
-       current->files = old_files;
-       current->nsproxy = old_nsproxy;
-       task_unlock(current);
+       if (io_sq_thread_should_park(sqd))
+               io_sq_thread_parkme(sqd);
 
-       kthread_parkme();
+       /*
+        * Clear thread under lock so that concurrent parks work correctly
+        */
+       complete(&sqd->completion);
+       mutex_lock(&sqd->lock);
+       sqd->thread = NULL;
+       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+               ctx->sqo_exec = 1;
+               io_ring_set_wakeup_flag(ctx);
+       }
 
-       return 0;
+       complete(&sqd->exited);
+       mutex_unlock(&sqd->lock);
+       do_exit(0);
 }
 
 struct io_wait_queue {
@@ -7328,38 +6984,59 @@ static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
        percpu_ref_get(&rsrc_data->refs);
 }
 
-static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
-                              struct io_ring_ctx *ctx,
-                              struct fixed_rsrc_ref_node *backup_node)
+static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
 {
-       struct fixed_rsrc_ref_node *ref_node;
-       int ret;
+       struct fixed_rsrc_ref_node *ref_node = NULL;
 
        io_rsrc_ref_lock(ctx);
        ref_node = data->node;
+       data->node = NULL;
        io_rsrc_ref_unlock(ctx);
        if (ref_node)
                percpu_ref_kill(&ref_node->refs);
+}
+
+static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
+                              struct io_ring_ctx *ctx,
+                              void (*rsrc_put)(struct io_ring_ctx *ctx,
+                                               struct io_rsrc_put *prsrc))
+{
+       struct fixed_rsrc_ref_node *backup_node;
+       int ret;
 
-       percpu_ref_kill(&data->refs);
+       if (data->quiesce)
+               return -ENXIO;
 
-       /* wait for all refs nodes to complete */
-       flush_delayed_work(&ctx->rsrc_put_work);
+       data->quiesce = true;
        do {
+               ret = -ENOMEM;
+               backup_node = alloc_fixed_rsrc_ref_node(ctx);
+               if (!backup_node)
+                       break;
+               backup_node->rsrc_data = data;
+               backup_node->rsrc_put = rsrc_put;
+
+               io_sqe_rsrc_kill_node(ctx, data);
+               percpu_ref_kill(&data->refs);
+               flush_delayed_work(&ctx->rsrc_put_work);
+
                ret = wait_for_completion_interruptible(&data->done);
                if (!ret)
                        break;
+
+               percpu_ref_resurrect(&data->refs);
+               io_sqe_rsrc_set_node(ctx, data, backup_node);
+               backup_node = NULL;
+               reinit_completion(&data->done);
+               mutex_unlock(&ctx->uring_lock);
                ret = io_run_task_work_sig();
-               if (ret < 0) {
-                       percpu_ref_resurrect(&data->refs);
-                       reinit_completion(&data->done);
-                       io_sqe_rsrc_set_node(ctx, data, backup_node);
-                       return ret;
-               }
-       } while (1);
+               mutex_lock(&ctx->uring_lock);
+       } while (ret >= 0);
+       data->quiesce = false;
 
-       destroy_fixed_rsrc_ref_node(backup_node);
-       return 0;
+       if (backup_node)
+               destroy_fixed_rsrc_ref_node(backup_node);
+       return ret;
 }
 
 static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
@@ -7390,18 +7067,17 @@ static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
        struct fixed_rsrc_data *data = ctx->file_data;
-       struct fixed_rsrc_ref_node *backup_node;
        unsigned nr_tables, i;
        int ret;
 
-       if (!data)
+       /*
+        * percpu_ref_is_dying() is used to stop parallel files unregister,
+        * since we may drop the uring lock later in this function to
+        * run task work.
+        */
+       if (!data || percpu_ref_is_dying(&data->refs))
                return -ENXIO;
-       backup_node = alloc_fixed_rsrc_ref_node(ctx);
-       if (!backup_node)
-               return -ENOMEM;
-       init_fixed_file_ref_node(ctx, backup_node);
-
-       ret = io_rsrc_ref_quiesce(data, ctx, backup_node);
+       ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
        if (ret)
                return ret;
 
@@ -7415,20 +7091,80 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
        return 0;
 }
 
+static void io_sq_thread_unpark(struct io_sq_data *sqd)
+       __releases(&sqd->lock)
+{
+       if (!sqd->thread)
+               return;
+       if (sqd->thread == current)
+               return;
+       clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+       wake_up_state(sqd->thread, TASK_PARKED);
+       mutex_unlock(&sqd->lock);
+}
+
+static bool io_sq_thread_park(struct io_sq_data *sqd)
+       __acquires(&sqd->lock)
+{
+       if (sqd->thread == current)
+               return true;
+       mutex_lock(&sqd->lock);
+       if (!sqd->thread) {
+               mutex_unlock(&sqd->lock);
+               return false;
+       }
+       set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+       wake_up_process(sqd->thread);
+       wait_for_completion(&sqd->completion);
+       return true;
+}
+
+static void io_sq_thread_stop(struct io_sq_data *sqd)
+{
+       if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state))
+               return;
+       mutex_lock(&sqd->lock);
+       if (sqd->thread) {
+               set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+               WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
+               wake_up_process(sqd->thread);
+               mutex_unlock(&sqd->lock);
+               wait_for_completion(&sqd->exited);
+               WARN_ON_ONCE(sqd->thread);
+       } else {
+               mutex_unlock(&sqd->lock);
+       }
+}
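
Since the SQ thread is no longer a kthread, kthread_park()/kthread_stop() are replaced by the SHOULD_PARK/SHOULD_STOP bits plus completions: the parker sets the bit and wakes the thread, the thread signals quiescence through sqd->completion and sleeps in TASK_PARKED until the bit is cleared. A rough userspace analogue of that handshake with pthreads (purely illustrative; all names are made up and none of this is kernel API):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct sq_data {
            atomic_bool should_park;
            bool parked;
            pthread_mutex_t lock;
            pthread_cond_t unpark;          /* wakes the parked thread */
            pthread_cond_t completion;      /* tells the parker "I'm parked" */
    };

    /* called by the worker from its main loop, like io_sq_thread_parkme() */
    static void parkme(struct sq_data *sqd)
    {
            pthread_mutex_lock(&sqd->lock);
            while (atomic_load(&sqd->should_park)) {
                    sqd->parked = true;
                    pthread_cond_signal(&sqd->completion);
                    pthread_cond_wait(&sqd->unpark, &sqd->lock);
            }
            sqd->parked = false;
            pthread_mutex_unlock(&sqd->lock);
    }

    /*
     * Caller side, like io_sq_thread_park(); assumes the worker polls
     * should_park in its main loop and calls parkme(). Returns once the
     * worker is known to be idle.
     */
    static void park(struct sq_data *sqd)
    {
            atomic_store(&sqd->should_park, true);
            pthread_mutex_lock(&sqd->lock);
            while (!sqd->parked)
                    pthread_cond_wait(&sqd->completion, &sqd->lock);
            pthread_mutex_unlock(&sqd->lock);
    }

    /* like io_sq_thread_unpark(): clear the bit, then wake the thread */
    static void unpark(struct sq_data *sqd)
    {
            pthread_mutex_lock(&sqd->lock);
            atomic_store(&sqd->should_park, false);
            pthread_cond_signal(&sqd->unpark);
            pthread_mutex_unlock(&sqd->lock);
    }
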
+
 static void io_put_sq_data(struct io_sq_data *sqd)
 {
        if (refcount_dec_and_test(&sqd->refs)) {
-               /*
-                * The park is a bit of a work-around, without it we get
-                * warning spews on shutdown with SQPOLL set and affinity
-                * set to a single CPU.
-                */
+               io_sq_thread_stop(sqd);
+               kfree(sqd);
+       }
+}
+
+static void io_sq_thread_finish(struct io_ring_ctx *ctx)
+{
+       struct io_sq_data *sqd = ctx->sq_data;
+
+       if (sqd) {
+               complete(&sqd->startup);
                if (sqd->thread) {
-                       kthread_park(sqd->thread);
-                       kthread_stop(sqd->thread);
+                       wait_for_completion(&ctx->sq_thread_comp);
+                       io_sq_thread_park(sqd);
                }
 
-               kfree(sqd);
+               mutex_lock(&sqd->ctx_lock);
+               list_del(&ctx->sqd_list);
+               io_sqd_update_thread_idle(sqd);
+               mutex_unlock(&sqd->ctx_lock);
+
+               if (sqd->thread)
+                       io_sq_thread_unpark(sqd);
+
+               io_put_sq_data(sqd);
+               ctx->sq_data = NULL;
        }
 }
 
@@ -7475,68 +7211,12 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
        mutex_init(&sqd->ctx_lock);
        mutex_init(&sqd->lock);
        init_waitqueue_head(&sqd->wait);
+       init_completion(&sqd->startup);
+       init_completion(&sqd->completion);
+       init_completion(&sqd->exited);
        return sqd;
 }
 
-static void io_sq_thread_unpark(struct io_sq_data *sqd)
-       __releases(&sqd->lock)
-{
-       if (!sqd->thread)
-               return;
-       kthread_unpark(sqd->thread);
-       mutex_unlock(&sqd->lock);
-}
-
-static void io_sq_thread_park(struct io_sq_data *sqd)
-       __acquires(&sqd->lock)
-{
-       if (!sqd->thread)
-               return;
-       mutex_lock(&sqd->lock);
-       kthread_park(sqd->thread);
-}
-
-static void io_sq_thread_stop(struct io_ring_ctx *ctx)
-{
-       struct io_sq_data *sqd = ctx->sq_data;
-
-       if (sqd) {
-               if (sqd->thread) {
-                       /*
-                        * We may arrive here from the error branch in
-                        * io_sq_offload_create() where the kthread is created
-                        * without being waked up, thus wake it up now to make
-                        * sure the wait will complete.
-                        */
-                       wake_up_process(sqd->thread);
-                       wait_for_completion(&ctx->sq_thread_comp);
-
-                       io_sq_thread_park(sqd);
-               }
-
-               mutex_lock(&sqd->ctx_lock);
-               list_del(&ctx->sqd_list);
-               io_sqd_update_thread_idle(sqd);
-               mutex_unlock(&sqd->ctx_lock);
-
-               if (sqd->thread)
-                       io_sq_thread_unpark(sqd);
-
-               io_put_sq_data(sqd);
-               ctx->sq_data = NULL;
-       }
-}
-
-static void io_finish_async(struct io_ring_ctx *ctx)
-{
-       io_sq_thread_stop(ctx);
-
-       if (ctx->io_wq) {
-               io_wq_destroy(ctx->io_wq);
-               ctx->io_wq = NULL;
-       }
-}
-
 #if defined(CONFIG_UNIX)
 /*
  * Ensure the UNIX gc is aware of our file set, so we are certain that
@@ -7563,7 +7243,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
        skb->sk = sk;
 
        nr_files = 0;
-       fpl->user = get_uid(ctx->user);
+       fpl->user = get_uid(current_user());
        for (i = 0; i < nr; i++) {
                struct file *file = io_file_from_index(ctx, i + offset);
 
@@ -8095,54 +7775,34 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work)
        return req ? &req->work : NULL;
 }
 
-static int io_init_wq_offload(struct io_ring_ctx *ctx,
-                             struct io_uring_params *p)
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
 {
+       struct io_wq_hash *hash;
        struct io_wq_data data;
-       struct fd f;
-       struct io_ring_ctx *ctx_attach;
        unsigned int concurrency;
-       int ret = 0;
-
-       data.user = ctx->user;
-       data.free_work = io_free_work;
-       data.do_work = io_wq_submit_work;
-
-       if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
-               /* Do QD, or 4 * CPUS, whatever is smallest */
-               concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
 
-               ctx->io_wq = io_wq_create(concurrency, &data);
-               if (IS_ERR(ctx->io_wq)) {
-                       ret = PTR_ERR(ctx->io_wq);
-                       ctx->io_wq = NULL;
-               }
-               return ret;
+       hash = ctx->hash_map;
+       if (!hash) {
+               hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+               if (!hash)
+                       return ERR_PTR(-ENOMEM);
+               refcount_set(&hash->refs, 1);
+               init_waitqueue_head(&hash->wait);
+               ctx->hash_map = hash;
        }
 
-       f = fdget(p->wq_fd);
-       if (!f.file)
-               return -EBADF;
-
-       if (f.file->f_op != &io_uring_fops) {
-               ret = -EINVAL;
-               goto out_fput;
-       }
+       data.hash = hash;
+       data.free_work = io_free_work;
+       data.do_work = io_wq_submit_work;
 
-       ctx_attach = f.file->private_data;
-       /* @io_wq is protected by holding the fd */
-       if (!io_wq_get(ctx_attach->io_wq, &data)) {
-               ret = -EINVAL;
-               goto out_fput;
-       }
+       /* Do QD, or 4 * CPUS, whatever is smallest */
+       concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
 
-       ctx->io_wq = ctx_attach->io_wq;
-out_fput:
-       fdput(f);
-       return ret;
+       return io_wq_create(concurrency, &data);
 }
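
io-wq is now per task (hung off tctx->io_wq) with only the hashed buffered-write map shared through ctx->hash_map, so IORING_SETUP_ATTACH_WQ no longer shares a whole workqueue; the flag is merely validated against a real io_uring fd for compatibility in io_sq_offload_create() below. Userspace usage is unchanged; a liburing sketch (the helper name is made up):

    #include <liburing.h>
    #include <string.h>

    /* Create a second ring, passing the first ring's fd as wq_fd. The
     * fd must refer to an io_uring instance, otherwise setup fails. */
    int make_attached_ring(struct io_uring *first, struct io_uring *second)
    {
            struct io_uring_params p;

            memset(&p, 0, sizeof(p));
            p.flags = IORING_SETUP_ATTACH_WQ;
            p.wq_fd = first->ring_fd;

            return io_uring_queue_init_params(64, second, &p);
    }
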
 
-static int io_uring_alloc_task_context(struct task_struct *task)
+static int io_uring_alloc_task_context(struct task_struct *task,
+                                      struct io_ring_ctx *ctx)
 {
        struct io_uring_task *tctx;
        int ret;
@@ -8157,13 +7817,19 @@ static int io_uring_alloc_task_context(struct task_struct *task)
                return ret;
        }
 
+       tctx->io_wq = io_init_wq_offload(ctx);
+       if (IS_ERR(tctx->io_wq)) {
+               ret = PTR_ERR(tctx->io_wq);
+               percpu_counter_destroy(&tctx->inflight);
+               kfree(tctx);
+               return ret;
+       }
+
        xa_init(&tctx->xa);
        init_waitqueue_head(&tctx->wait);
        tctx->last = NULL;
        atomic_set(&tctx->in_idle, 0);
        tctx->sqpoll = false;
-       io_init_identity(&tctx->__identity);
-       tctx->identity = &tctx->__identity;
        task->io_uring = tctx;
        spin_lock_init(&tctx->task_lock);
        INIT_WQ_LIST(&tctx->task_list);
@@ -8177,19 +7843,49 @@ void __io_uring_free(struct task_struct *tsk)
        struct io_uring_task *tctx = tsk->io_uring;
 
        WARN_ON_ONCE(!xa_empty(&tctx->xa));
-       WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
-       if (tctx->identity != &tctx->__identity)
-               kfree(tctx->identity);
        percpu_counter_destroy(&tctx->inflight);
        kfree(tctx);
        tsk->io_uring = NULL;
 }
 
+static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
+{
+       int ret;
+
+       clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+       reinit_completion(&sqd->completion);
+       ctx->sqo_dead = ctx->sqo_exec = 0;
+       sqd->task_pid = current->pid;
+       current->flags |= PF_IO_WORKER;
+       ret = io_wq_fork_thread(io_sq_thread, sqd);
+       current->flags &= ~PF_IO_WORKER;
+       if (ret < 0) {
+               sqd->thread = NULL;
+               return ret;
+       }
+       wait_for_completion(&sqd->completion);
+       return io_uring_alloc_task_context(sqd->thread, ctx);
+}
+
 static int io_sq_offload_create(struct io_ring_ctx *ctx,
                                struct io_uring_params *p)
 {
        int ret;
 
+       /* Retain compatibility with failing for an invalid attach attempt */
+       if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
+                               IORING_SETUP_ATTACH_WQ) {
+               struct fd f;
+
+               f = fdget(p->wq_fd);
+               if (!f.file)
+                       return -ENXIO;
+               if (f.file->f_op != &io_uring_fops) {
+                       fdput(f);
+                       return -EINVAL;
+               }
+               fdput(f);
+       }
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                struct io_sq_data *sqd;
 
@@ -8215,7 +7911,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                        ctx->sq_thread_idle = HZ;
 
                if (sqd->thread)
-                       goto done;
+                       return 0;
 
                if (p->flags & IORING_SETUP_SQ_AFF) {
                        int cpu = p->sq_thread_cpu;
@@ -8226,18 +7922,21 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                        if (!cpu_online(cpu))
                                goto err;
 
-                       sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
-                                                       cpu, "io_uring-sq");
+                       sqd->sq_cpu = cpu;
                } else {
-                       sqd->thread = kthread_create(io_sq_thread, sqd,
-                                                       "io_uring-sq");
+                       sqd->sq_cpu = -1;
                }
-               if (IS_ERR(sqd->thread)) {
-                       ret = PTR_ERR(sqd->thread);
+
+               sqd->task_pid = current->pid;
+               current->flags |= PF_IO_WORKER;
+               ret = io_wq_fork_thread(io_sq_thread, sqd);
+               current->flags &= ~PF_IO_WORKER;
+               if (ret < 0) {
                        sqd->thread = NULL;
                        goto err;
                }
-               ret = io_uring_alloc_task_context(sqd->thread);
+               wait_for_completion(&sqd->completion);
+               ret = io_uring_alloc_task_context(sqd->thread, ctx);
                if (ret)
                        goto err;
        } else if (p->flags & IORING_SETUP_SQ_AFF) {
@@ -8246,14 +7945,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                goto err;
        }
 
-done:
-       ret = io_init_wq_offload(ctx, p);
-       if (ret)
-               goto err;
-
        return 0;
 err:
-       io_finish_async(ctx);
+       io_sq_thread_finish(ctx);
        return ret;
 }
 
@@ -8261,8 +7955,8 @@ static void io_sq_offload_start(struct io_ring_ctx *ctx)
 {
        struct io_sq_data *sqd = ctx->sq_data;
 
-       if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
-               wake_up_process(sqd->thread);
+       if (ctx->flags & IORING_SETUP_SQPOLL)
+               complete(&sqd->startup);
 }
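
The SQPOLL thread is now forked from the task that created the ring (it shows up as iou-sqp-<pid> rather than io_uring-sq), and IORING_FEAT_NATIVE_WORKERS advertises the new scheme. Ring setup from userspace is unchanged; a liburing sketch with illustrative values:

    #include <liburing.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            struct io_uring_params p;
            struct io_uring ring;

            memset(&p, 0, sizeof(p));
            p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
            p.sq_thread_cpu = 0;       /* pin the iou-sqp thread to CPU 0 */
            p.sq_thread_idle = 2000;   /* idle time in msec before it sleeps */

            if (io_uring_queue_init_params(64, &ring, &p) < 0) {
                    perror("io_uring_queue_init_params");
                    return 1;
            }
            if (p.features & IORING_FEAT_NATIVE_WORKERS)
                    printf("native SQPOLL/io-wq workers in use\n");

            io_uring_queue_exit(&ring);
            return 0;
    }
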
 
 static inline void __io_unaccount_mem(struct user_struct *user,
@@ -8292,7 +7986,7 @@ static inline int __io_account_mem(struct user_struct *user,
 
 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 {
-       if (ctx->limit_mem)
+       if (ctx->user)
                __io_unaccount_mem(ctx->user, nr_pages);
 
        if (ctx->mm_account)
@@ -8303,7 +7997,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 {
        int ret;
 
-       if (ctx->limit_mem) {
+       if (ctx->user) {
                ret = __io_account_mem(ctx->user, nr_pages);
                if (ret)
                        return ret;
@@ -8702,19 +8396,23 @@ static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
 static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
 {
        struct io_submit_state *submit_state = &ctx->submit_state;
+       struct io_comp_state *cs = &ctx->submit_state.comp;
 
        mutex_lock(&ctx->uring_lock);
 
-       if (submit_state->free_reqs)
+       if (submit_state->free_reqs) {
                kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
                                     submit_state->reqs);
-
-       io_req_cache_free(&submit_state->comp.free_list, NULL);
+               submit_state->free_reqs = 0;
+       }
 
        spin_lock_irq(&ctx->completion_lock);
-       io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
+       list_splice_init(&cs->locked_free_list, &cs->free_list);
+       cs->locked_free_nr = 0;
        spin_unlock_irq(&ctx->completion_lock);
 
+       io_req_cache_free(&cs->free_list, NULL);
+
        mutex_unlock(&ctx->uring_lock);
 }
 
@@ -8728,22 +8426,17 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
        mutex_lock(&ctx->uring_lock);
        mutex_unlock(&ctx->uring_lock);
 
-       io_finish_async(ctx);
+       io_sq_thread_finish(ctx);
        io_sqe_buffers_unregister(ctx);
 
-       if (ctx->sqo_task) {
-               put_task_struct(ctx->sqo_task);
-               ctx->sqo_task = NULL;
+       if (ctx->mm_account) {
                mmdrop(ctx->mm_account);
                ctx->mm_account = NULL;
        }
 
-#ifdef CONFIG_BLK_CGROUP
-       if (ctx->sqo_blkcg_css)
-               css_put(ctx->sqo_blkcg_css);
-#endif
-
+       mutex_lock(&ctx->uring_lock);
        io_sqe_files_unregister(ctx);
+       mutex_unlock(&ctx->uring_lock);
        io_eventfd_unregister(ctx);
        io_destroy_buffers(ctx);
        idr_destroy(&ctx->personality_idr);
@@ -8760,8 +8453,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 
        percpu_ref_exit(&ctx->refs);
        free_uid(ctx->user);
-       put_cred(ctx->creds);
        io_req_caches_free(ctx, NULL);
+       if (ctx->hash_map)
+               io_wq_put_hash(ctx->hash_map);
        kfree(ctx->cancel_hash);
        kfree(ctx);
 }
@@ -8808,13 +8502,11 @@ static int io_uring_fasync(int fd, struct file *file, int on)
 
 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 {
-       struct io_identity *iod;
+       const struct cred *creds;
 
-       iod = idr_remove(&ctx->personality_idr, id);
-       if (iod) {
-               put_cred(iod->creds);
-               if (refcount_dec_and_test(&iod->count))
-                       kfree(iod);
+       creds = idr_remove(&ctx->personality_idr, id);
+       if (creds) {
+               put_cred(creds);
                return 0;
        }
 
@@ -8829,6 +8521,32 @@ static int io_remove_personalities(int id, void *p, void *data)
        return 0;
 }
 
+static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
+{
+       struct callback_head *work, *head, *next;
+       bool executed = false;
+
+       do {
+               do {
+                       head = NULL;
+                       work = READ_ONCE(ctx->exit_task_work);
+               } while (cmpxchg(&ctx->exit_task_work, work, head) != work);
+
+               if (!work)
+                       break;
+
+               do {
+                       next = work->next;
+                       work->func(work);
+                       work = next;
+                       cond_resched();
+               } while (work);
+               executed = true;
+       } while (1);
+
+       return executed;
+}
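
io_run_ctx_fallback() detaches the entire exit_task_work list atomically before walking it, so entries queued concurrently land on a fresh list and are caught by the next pass of the outer loop. The same steal-the-whole-list idea in portable C11 (a sketch with made-up names; the kernel open-codes it with cmpxchg(), but a single exchange is equivalent for swapping in NULL):

    #include <stdatomic.h>
    #include <stddef.h>

    struct cb {
            struct cb *next;
            void (*func)(struct cb *);
    };

    static _Atomic(struct cb *) work_list;

    static void run_all_work(void)
    {
            /* atomically take everything queued so far */
            struct cb *work = atomic_exchange(&work_list, NULL);

            while (work) {
                    struct cb *next = work->next;

                    work->func(work);
                    work = next;
            }
    }
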
+
 static void io_ring_exit_work(struct work_struct *work)
 {
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
@@ -8842,17 +8560,11 @@ static void io_ring_exit_work(struct work_struct *work)
         */
        do {
                io_uring_try_cancel_requests(ctx, NULL, NULL);
+               io_run_ctx_fallback(ctx);
        } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
        io_ring_ctx_free(ctx);
 }
 
-static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
-{
-       struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
-       return req->ctx == data;
-}
-
 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
        mutex_lock(&ctx->uring_lock);
@@ -8871,9 +8583,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
        io_kill_timeouts(ctx, NULL, NULL);
        io_poll_remove_all(ctx, NULL, NULL);
 
-       if (ctx->io_wq)
-               io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true);
-
        /* if we failed setting up the ctx, we might not have any rings */
        io_iopoll_try_reap_events(ctx);
 
@@ -8952,13 +8661,14 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct files_struct *files)
 {
        struct io_task_cancel cancel = { .task = task, .files = files, };
+       struct io_uring_task *tctx = current->io_uring;
 
        while (1) {
                enum io_wq_cancel cret;
                bool ret = false;
 
-               if (ctx->io_wq) {
-                       cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+               if (tctx && tctx->io_wq) {
+                       cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
                                               &cancel, true);
                        ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
                }
@@ -8974,6 +8684,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                ret |= io_poll_remove_all(ctx, task, files);
                ret |= io_kill_timeouts(ctx, task, files);
                ret |= io_run_task_work();
+               ret |= io_run_ctx_fallback(ctx);
                io_cqring_overflow_flush(ctx, true, task, files);
                if (!ret)
                        break;
@@ -9041,12 +8752,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
                                          struct files_struct *files)
 {
        struct task_struct *task = current;
+       bool did_park = false;
 
        if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
                io_disable_sqo_submit(ctx);
-               task = ctx->sq_data->thread;
-               atomic_inc(&task->io_uring->in_idle);
-               io_sq_thread_park(ctx->sq_data);
+               did_park = io_sq_thread_park(ctx->sq_data);
+               if (did_park) {
+                       task = ctx->sq_data->thread;
+                       atomic_inc(&task->io_uring->in_idle);
+               }
        }
 
        io_cancel_defer_files(ctx, task, files);
@@ -9055,7 +8769,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
        if (!files)
                io_uring_try_cancel_requests(ctx, task, NULL);
 
-       if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
+       if (did_park) {
                atomic_dec(&task->io_uring->in_idle);
                io_sq_thread_unpark(ctx->sq_data);
        }
@@ -9070,7 +8784,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
        int ret;
 
        if (unlikely(!tctx)) {
-               ret = io_uring_alloc_task_context(current);
+               ret = io_uring_alloc_task_context(current, ctx);
                if (unlikely(ret))
                        return ret;
                tctx = current->io_uring;
@@ -9140,8 +8854,13 @@ void __io_uring_files_cancel(struct files_struct *files)
                io_uring_cancel_task_requests(file->private_data, files);
        atomic_dec(&tctx->in_idle);
 
-       if (files)
+       if (files) {
                io_uring_remove_task_files(tctx);
+               if (tctx->io_wq) {
+                       io_wq_put_and_exit(tctx->io_wq);
+                       tctx->io_wq = NULL;
+               }
+       }
 }
 
 static s64 tctx_inflight(struct io_uring_task *tctx)
@@ -9151,14 +8870,22 @@ static s64 tctx_inflight(struct io_uring_task *tctx)
 
 static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
 {
+       struct io_sq_data *sqd = ctx->sq_data;
        struct io_uring_task *tctx;
        s64 inflight;
        DEFINE_WAIT(wait);
 
-       if (!ctx->sq_data)
+       if (!sqd)
                return;
-       tctx = ctx->sq_data->thread->io_uring;
        io_disable_sqo_submit(ctx);
+       if (!io_sq_thread_park(sqd))
+               return;
+       tctx = ctx->sq_data->thread->io_uring;
+       /* can happen on fork/alloc failure, just ignore that state */
+       if (!tctx) {
+               io_sq_thread_unpark(sqd);
+               return;
+       }
 
        atomic_inc(&tctx->in_idle);
        do {
@@ -9179,6 +8906,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
                finish_wait(&tctx->wait, &wait);
        } while (1);
        atomic_dec(&tctx->in_idle);
+       io_sq_thread_unpark(sqd);
 }
 
 /*
@@ -9232,11 +8960,17 @@ static int io_uring_flush(struct file *file, void *data)
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx = file->private_data;
 
+       /* Ignore helper thread files exit */
+       if (current->flags & PF_IO_WORKER)
+               return 0;
+
        if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
                io_uring_cancel_task_requests(ctx, NULL);
                io_req_caches_free(ctx, current);
        }
 
+       io_run_ctx_fallback(ctx);
+
        if (!tctx)
                return 0;
 
@@ -9435,6 +9169,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                io_cqring_overflow_flush(ctx, false, NULL, NULL);
 
+               if (unlikely(ctx->sqo_exec)) {
+                       ret = io_sq_thread_fork(ctx->sq_data, ctx);
+                       if (ret)
+                               goto out;
+                       ctx->sqo_exec = 0;
+               }
                ret = -EOWNERDEAD;
                if (unlikely(ctx->sqo_dead))
                        goto out;
@@ -9491,8 +9231,7 @@ out_fput:
 #ifdef CONFIG_PROC_FS
 static int io_uring_show_cred(int id, void *p, void *data)
 {
-       struct io_identity *iod = p;
-       const struct cred *cred = iod->creds;
+       const struct cred *cred = p;
        struct seq_file *m = data;
        struct user_namespace *uns = seq_user_ns(m);
        struct group_info *gi;
@@ -9537,8 +9276,11 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
         */
        has_lock = mutex_trylock(&ctx->uring_lock);
 
-       if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
+       if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
                sq = ctx->sq_data;
+               if (!sq->thread)
+                       sq = NULL;
+       }
 
        seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
        seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
@@ -9698,7 +9440,6 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
 static int io_uring_create(unsigned entries, struct io_uring_params *p,
                           struct io_uring_params __user *params)
 {
-       struct user_struct *user = NULL;
        struct io_ring_ctx *ctx;
        struct file *file;
        int ret;
@@ -9740,22 +9481,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
                p->cq_entries = 2 * p->sq_entries;
        }
 
-       user = get_uid(current_user());
-
        ctx = io_ring_ctx_alloc(p);
-       if (!ctx) {
-               free_uid(user);
+       if (!ctx)
                return -ENOMEM;
-       }
        ctx->compat = in_compat_syscall();
-       ctx->limit_mem = !capable(CAP_IPC_LOCK);
-       ctx->user = user;
-       ctx->creds = get_current_cred();
-#ifdef CONFIG_AUDIT
-       ctx->loginuid = current->loginuid;
-       ctx->sessionid = current->sessionid;
-#endif
-       ctx->sqo_task = get_task_struct(current);
+       if (!capable(CAP_IPC_LOCK))
+               ctx->user = get_uid(current_user());
+       ctx->sqo_task = current;
 
        /*
         * This is just grabbed for accounting purposes. When a process exits,
@@ -9766,24 +9498,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
        mmgrab(current->mm);
        ctx->mm_account = current->mm;
 
-#ifdef CONFIG_BLK_CGROUP
-       /*
-        * The sq thread will belong to the original cgroup it was inited in.
-        * If the cgroup goes offline (e.g. disabling the io controller), then
-        * issued bios will be associated with the closest cgroup later in the
-        * block layer.
-        */
-       rcu_read_lock();
-       ctx->sqo_blkcg_css = blkcg_css();
-       ret = css_tryget_online(ctx->sqo_blkcg_css);
-       rcu_read_unlock();
-       if (!ret) {
-               /* don't init against a dying cgroup, have the user try again */
-               ctx->sqo_blkcg_css = NULL;
-               ret = -ENODEV;
-               goto err;
-       }
-#endif
        ret = io_allocate_scq_urings(ctx, p);
        if (ret)
                goto err;
@@ -9817,7 +9531,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
                        IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
-                       IORING_FEAT_EXT_ARG;
+                       IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
 
        if (copy_to_user(params, p, sizeof(*p))) {
                ret = -EFAULT;
@@ -9923,21 +9637,15 @@ out:
 
 static int io_register_personality(struct io_ring_ctx *ctx)
 {
-       struct io_identity *id;
+       const struct cred *creds;
        int ret;
 
-       id = kmalloc(sizeof(*id), GFP_KERNEL);
-       if (unlikely(!id))
-               return -ENOMEM;
-
-       io_init_identity(id);
-       id->creds = get_current_cred();
+       creds = get_current_cred();
 
-       ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
-       if (ret < 0) {
-               put_cred(id->creds);
-               kfree(id);
-       }
+       ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
+                               USHRT_MAX, GFP_KERNEL);
+       if (ret < 0)
+               put_cred(creds);
        return ret;
 }
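
A personality is now just the registered struct cred, but the userspace contract is untouched: register once, put the returned id into sqe->personality for requests that should run with those credentials, unregister when done. With liburing (a sketch, error handling trimmed):

    #include <liburing.h>

    int main(void)
    {
            struct io_uring ring;
            struct io_uring_sqe *sqe;
            int id;

            if (io_uring_queue_init(8, &ring, 0) < 0)
                    return 1;

            /* snapshot the current credentials as a personality */
            id = io_uring_register_personality(&ring);
            if (id < 0)
                    return 1;

            sqe = io_uring_get_sqe(&ring);
            io_uring_prep_nop(sqe);
            sqe->personality = id;  /* issue with the saved credentials */

            io_uring_submit(&ring);
            io_uring_unregister_personality(&ring, id);
            io_uring_queue_exit(&ring);
            return 0;
    }
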
 
@@ -10196,6 +9904,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 
        ctx = f.file->private_data;
 
+       io_run_task_work();
+
        mutex_lock(&ctx->uring_lock);
        ret = __io_uring_register(ctx, opcode, arg, nr_args);
        mutex_unlock(&ctx->uring_lock);