Merge tag 'for-5.11/io_uring-2020-12-14' of git://git.kernel.dk/linux-block
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0e8902b..22e3105 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -245,6 +245,8 @@ struct io_sq_data {
 
        struct task_struct      *thread;
        struct wait_queue_head  wait;
+
+       unsigned                sq_thread_idle;
 };
 
 struct io_ring_ctx {
@@ -285,7 +287,6 @@ struct io_ring_ctx {
                struct list_head        timeout_list;
                struct list_head        cq_overflow_list;
 
-               wait_queue_head_t       inflight_wait;
                struct io_uring_sqe     *sq_sqes;
        } ____cacheline_aligned_in_smp;
 
@@ -310,7 +311,6 @@ struct io_ring_ctx {
        struct io_sq_data       *sq_data;       /* if using sq thread polling */
 
        struct wait_queue_head  sqo_sq_wait;
-       struct wait_queue_entry sqo_wait_entry;
        struct list_head        sqd_list;
 
        /*
@@ -395,16 +395,18 @@ struct io_ring_ctx {
  */
 struct io_poll_iocb {
        struct file                     *file;
-       union {
-               struct wait_queue_head  *head;
-               u64                     addr;
-       };
+       struct wait_queue_head          *head;
        __poll_t                        events;
        bool                            done;
        bool                            canceled;
        struct wait_queue_entry         wait;
 };
 
+struct io_poll_remove {
+       struct file                     *file;
+       u64                             addr;
+};
+
 struct io_close {
        struct file                     *file;
        struct file                     *put_file;
@@ -444,11 +446,17 @@ struct io_timeout {
        u32                             off;
        u32                             target_seq;
        struct list_head                list;
+       /* head of the link, used by linked timeouts only */
+       struct io_kiocb                 *head;
 };
 
 struct io_timeout_rem {
        struct file                     *file;
        u64                             addr;
+
+       /* timeout update */
+       struct timespec64               ts;
+       u32                             flags;
 };
 
 struct io_rw {
@@ -541,6 +549,27 @@ struct io_statx {
        struct statx __user             *buffer;
 };
 
+struct io_shutdown {
+       struct file                     *file;
+       int                             how;
+};
+
+struct io_rename {
+       struct file                     *file;
+       int                             old_dfd;
+       int                             new_dfd;
+       struct filename                 *oldpath;
+       struct filename                 *newpath;
+       int                             flags;
+};
+
+struct io_unlink {
+       struct file                     *file;
+       int                             dfd;
+       int                             flags;
+       struct filename                 *filename;
+};
+
 struct io_completion {
        struct file                     *file;
        struct list_head                list;
@@ -575,7 +604,6 @@ enum {
        REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
        REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 
-       REQ_F_LINK_HEAD_BIT,
        REQ_F_FAIL_LINK_BIT,
        REQ_F_INFLIGHT_BIT,
        REQ_F_CUR_POS_BIT,
@@ -607,8 +635,6 @@ enum {
        /* IOSQE_BUFFER_SELECT */
        REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 
-       /* head of a link */
-       REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
        /* fail rest of links */
        REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
        /* on inflight list */
@@ -651,6 +677,7 @@ struct io_kiocb {
                struct file             *file;
                struct io_rw            rw;
                struct io_poll_iocb     poll;
+               struct io_poll_remove   poll_remove;
                struct io_accept        accept;
                struct io_sync          sync;
                struct io_cancel        cancel;
@@ -667,6 +694,9 @@ struct io_kiocb {
                struct io_splice        splice;
                struct io_provide_buf   pbuf;
                struct io_statx         statx;
+               struct io_shutdown      shutdown;
+               struct io_rename        rename;
+               struct io_unlink        unlink;
                /* use only after cleaning per-op data, see io_clean_op() */
                struct io_completion    compl;
        };
@@ -686,15 +716,14 @@ struct io_kiocb {
        struct task_struct              *task;
        u64                             user_data;
 
-       struct list_head                link_list;
+       struct io_kiocb                 *link;
+       struct percpu_ref               *fixed_file_refs;
 
        /*
         * 1. used with ctx->iopoll_list with reads/writes
         * 2. to track reqs with ->files (see io_op_def::file_table)
         */
        struct list_head                inflight_entry;
-
-       struct percpu_ref               *fixed_file_refs;
        struct callback_head            task_work;
        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
        struct hlist_node               hash_node;
@@ -725,6 +754,8 @@ struct io_submit_state {
        void                    *reqs[IO_IOPOLL_BATCH];
        unsigned int            free_reqs;
 
+       bool                    plug_started;
+
        /*
         * Batch completion logic
         */
@@ -735,7 +766,7 @@ struct io_submit_state {
         */
        struct file             *file;
        unsigned int            fd;
-       unsigned int            has_refs;
+       unsigned int            file_refs;
        unsigned int            ios_left;
 };
 
@@ -757,6 +788,8 @@ struct io_op_def {
        unsigned                buffer_select : 1;
        /* must always have async data allocated */
        unsigned                needs_async_data : 1;
+       /* should block plug */
+       unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
        unsigned                work_flags;
@@ -770,6 +803,7 @@ static const struct io_op_def io_op_defs[] = {
                .pollin                 = 1,
                .buffer_select          = 1,
                .needs_async_data       = 1,
+               .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
@@ -779,6 +813,7 @@ static const struct io_op_def io_op_defs[] = {
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
                .needs_async_data       = 1,
+               .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
                                                IO_WQ_WORK_FSIZE,
@@ -791,6 +826,7 @@ static const struct io_op_def io_op_defs[] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
+               .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
@@ -799,6 +835,7 @@ static const struct io_op_def io_op_defs[] = {
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
+               .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
                                                IO_WQ_WORK_MM,
@@ -818,8 +855,7 @@ static const struct io_op_def io_op_defs[] = {
                .pollout                = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FS,
+               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_RECVMSG] = {
                .needs_file             = 1,
@@ -828,15 +864,17 @@ static const struct io_op_def io_op_defs[] = {
                .buffer_select          = 1,
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_async_msghdr),
-               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FS,
+               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
        [IORING_OP_TIMEOUT] = {
                .needs_async_data       = 1,
                .async_size             = sizeof(struct io_timeout_data),
                .work_flags             = IO_WQ_WORK_MM,
        },
-       [IORING_OP_TIMEOUT_REMOVE] = {},
+       [IORING_OP_TIMEOUT_REMOVE] = {
+               /* used by timeout updates' prep() */
+               .work_flags             = IO_WQ_WORK_MM,
+       },
        [IORING_OP_ACCEPT] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
@@ -863,7 +901,7 @@ static const struct io_op_def io_op_defs[] = {
        },
        [IORING_OP_OPENAT] = {
                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
-                                               IO_WQ_WORK_FS,
+                                               IO_WQ_WORK_FS | IO_WQ_WORK_MM,
        },
        [IORING_OP_CLOSE] = {
                .needs_file             = 1,
@@ -882,6 +920,7 @@ static const struct io_op_def io_op_defs[] = {
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .buffer_select          = 1,
+               .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
        },
@@ -889,6 +928,7 @@ static const struct io_op_def io_op_defs[] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
+               .plug                   = 1,
                .async_size             = sizeof(struct io_async_rw),
                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
                                                IO_WQ_WORK_FSIZE,
@@ -915,7 +955,7 @@ static const struct io_op_def io_op_defs[] = {
        },
        [IORING_OP_OPENAT2] = {
                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
-                                               IO_WQ_WORK_BLKCG,
+                                               IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
        },
        [IORING_OP_EPOLL_CTL] = {
                .unbound_nonreg_file    = 1,
@@ -934,6 +974,17 @@ static const struct io_op_def io_op_defs[] = {
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
        },
+       [IORING_OP_SHUTDOWN] = {
+               .needs_file             = 1,
+       },
+       [IORING_OP_RENAMEAT] = {
+               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+                                               IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+       },
+       [IORING_OP_UNLINKAT] = {
+               .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES |
+                                               IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
+       },
 };
 
 enum io_mem_account {
@@ -983,6 +1034,9 @@ struct sock *io_uring_get_socket(struct file *file)
 }
 EXPORT_SYMBOL(io_uring_get_socket);
 
+#define io_for_each_link(pos, head) \
+       for (pos = (head); pos; pos = pos->link)
+
 static inline void io_clean_op(struct io_kiocb *req)
 {
        if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
@@ -990,8 +1044,39 @@ static inline void io_clean_op(struct io_kiocb *req)
                __io_clean_op(req);
 }
 
-static void io_sq_thread_drop_mm(void)
+static inline void io_set_resource_node(struct io_kiocb *req)
+{
+       struct io_ring_ctx *ctx = req->ctx;
+
+       if (!req->fixed_file_refs) {
+               req->fixed_file_refs = &ctx->file_data->node->refs;
+               percpu_ref_get(req->fixed_file_refs);
+       }
+}
+
+static bool io_match_task(struct io_kiocb *head,
+                         struct task_struct *task,
+                         struct files_struct *files)
+{
+       struct io_kiocb *req;
+
+       if (task && head->task != task)
+               return false;
+       if (!files)
+               return true;
+
+       io_for_each_link(req, head) {
+               if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+                   (req->work.flags & IO_WQ_WORK_FILES) &&
+                   req->work.identity->files == files)
+                       return true;
+       }
+       return false;
+}
+
+static void io_sq_thread_drop_mm_files(void)
 {
+       struct files_struct *files = current->files;
        struct mm_struct *mm = current->mm;
 
        if (mm) {
@@ -999,6 +1084,41 @@ static void io_sq_thread_drop_mm(void)
                mmput(mm);
                current->mm = NULL;
        }
+       if (files) {
+               struct nsproxy *nsproxy = current->nsproxy;
+
+               task_lock(current);
+               current->files = NULL;
+               current->nsproxy = NULL;
+               task_unlock(current);
+               put_files_struct(files);
+               put_nsproxy(nsproxy);
+       }
+}
+
+static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
+{
+       if (!current->files) {
+               struct files_struct *files;
+               struct nsproxy *nsproxy;
+
+               task_lock(ctx->sqo_task);
+               files = ctx->sqo_task->files;
+               if (!files) {
+                       task_unlock(ctx->sqo_task);
+                       return -EOWNERDEAD;
+               }
+               atomic_inc(&files->count);
+               get_nsproxy(ctx->sqo_task->nsproxy);
+               nsproxy = ctx->sqo_task->nsproxy;
+               task_unlock(ctx->sqo_task);
+
+               task_lock(current);
+               current->files = files;
+               current->nsproxy = nsproxy;
+               task_unlock(current);
+       }
+       return 0;
 }
 
 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
@@ -1026,12 +1146,25 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
        return -EFAULT;
 }
 
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
-                                  struct io_kiocb *req)
+static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+                                        struct io_kiocb *req)
 {
-       if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
-               return 0;
-       return __io_sq_thread_acquire_mm(ctx);
+       const struct io_op_def *def = &io_op_defs[req->opcode];
+       int ret;
+
+       if (def->work_flags & IO_WQ_WORK_MM) {
+               ret = __io_sq_thread_acquire_mm(ctx);
+               if (unlikely(ret))
+                       return ret;
+       }
+
+       if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) {
+               ret = __io_sq_thread_acquire_files(ctx);
+               if (unlikely(ret))
+                       return ret;
+       }
+
+       return 0;
 }
 
 static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
@@ -1174,7 +1307,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        INIT_LIST_HEAD(&ctx->iopoll_list);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
-       init_waitqueue_head(&ctx->inflight_wait);
        spin_lock_init(&ctx->inflight_lock);
        INIT_LIST_HEAD(&ctx->inflight_list);
        INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
@@ -1416,10 +1548,8 @@ static void io_prep_async_link(struct io_kiocb *req)
 {
        struct io_kiocb *cur;
 
-       io_prep_async_work(req);
-       if (req->flags & REQ_F_LINK_HEAD)
-               list_for_each_entry(cur, &req->link_list, link_list)
-                       io_prep_async_work(cur);
+       io_for_each_link(cur, req)
+               io_prep_async_work(cur);
 }
 
 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
@@ -1460,30 +1590,18 @@ static void io_kill_timeout(struct io_kiocb *req)
        }
 }
 
-static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
-{
-       struct io_ring_ctx *ctx = req->ctx;
-
-       if (!tsk || req->task == tsk)
-               return true;
-       if (ctx->flags & IORING_SETUP_SQPOLL) {
-               if (ctx->sq_data && req->task == ctx->sq_data->thread)
-                       return true;
-       }
-       return false;
-}
-
 /*
  * Returns true if we found and killed one or more timeouts
  */
-static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+                            struct files_struct *files)
 {
        struct io_kiocb *req, *tmp;
        int canceled = 0;
 
        spin_lock_irq(&ctx->completion_lock);
        list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
-               if (io_task_match(req, tsk)) {
+               if (io_match_task(req, tsk, files)) {
                        io_kill_timeout(req);
                        canceled++;
                }
@@ -1594,32 +1712,6 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
        }
 }
 
-static inline bool __io_match_files(struct io_kiocb *req,
-                                   struct files_struct *files)
-{
-       return ((req->flags & REQ_F_WORK_INITIALIZED) &&
-               (req->work.flags & IO_WQ_WORK_FILES)) &&
-               req->work.identity->files == files;
-}
-
-static bool io_match_files(struct io_kiocb *req,
-                          struct files_struct *files)
-{
-       struct io_kiocb *link;
-
-       if (!files)
-               return true;
-       if (__io_match_files(req, files))
-               return true;
-       if (req->flags & REQ_F_LINK_HEAD) {
-               list_for_each_entry(link, &req->link_list, link_list) {
-                       if (__io_match_files(link, files))
-                               return true;
-               }
-       }
-       return false;
-}
-
 /* Returns true if there are no backlogged entries after the flush */
 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
                                     struct task_struct *tsk,
@@ -1647,9 +1739,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
 
        cqe = NULL;
        list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
-               if (tsk && req->task != tsk)
-                       continue;
-               if (!io_match_files(req, files))
+               if (!io_match_task(req, tsk, files))
                        continue;
 
                cqe = io_get_cqring(ctx);
@@ -1845,9 +1935,7 @@ fallback:
 static inline void io_put_file(struct io_kiocb *req, struct file *file,
                          bool fixed)
 {
-       if (fixed)
-               percpu_ref_put(req->fixed_file_refs);
-       else
+       if (!fixed)
                fput(file);
 }
 
@@ -1859,7 +1947,8 @@ static void io_dismantle_req(struct io_kiocb *req)
                kfree(req->async_data);
        if (req->file)
                io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-
+       if (req->fixed_file_refs)
+               percpu_ref_put(req->fixed_file_refs);
        io_req_clean_work(req);
 }
 
@@ -1882,6 +1971,14 @@ static void __io_free_req(struct io_kiocb *req)
        percpu_ref_put(&ctx->refs);
 }
 
+static inline void io_remove_next_linked(struct io_kiocb *req)
+{
+       struct io_kiocb *nxt = req->link;
+
+       req->link = nxt->link;
+       nxt->link = NULL;
+}
+
 static void io_kill_linked_timeout(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
@@ -1890,8 +1987,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
        unsigned long flags;
 
        spin_lock_irqsave(&ctx->completion_lock, flags);
-       link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
-                                       link_list);
+       link = req->link;
+
        /*
         * Can happen if a linked timeout fired and link had been like
         * req -> link t-out -> link t-out [-> ...]
@@ -1900,7 +1997,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
                struct io_timeout_data *io = link->async_data;
                int ret;
 
-               list_del_init(&link->link_list);
+               io_remove_next_linked(req);
+               link->timeout.head = NULL;
                ret = hrtimer_try_to_cancel(&io->timer);
                if (ret != -1) {
                        io_cqring_fill_event(link, -ECANCELED);
@@ -1917,41 +2015,22 @@ static void io_kill_linked_timeout(struct io_kiocb *req)
        }
 }
 
-static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
-{
-       struct io_kiocb *nxt;
-
-       /*
-        * The list should never be empty when we are called here. But could
-        * potentially happen if the chain is messed up, check to be on the
-        * safe side.
-        */
-       if (unlikely(list_empty(&req->link_list)))
-               return NULL;
-
-       nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
-       list_del_init(&req->link_list);
-       if (!list_empty(&nxt->link_list))
-               nxt->flags |= REQ_F_LINK_HEAD;
-       return nxt;
-}
 
-/*
- * Called if REQ_F_LINK_HEAD is set, and we fail the head request
- */
 static void io_fail_links(struct io_kiocb *req)
 {
+       struct io_kiocb *link, *nxt;
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;
 
        spin_lock_irqsave(&ctx->completion_lock, flags);
-       while (!list_empty(&req->link_list)) {
-               struct io_kiocb *link = list_first_entry(&req->link_list,
-                                               struct io_kiocb, link_list);
+       link = req->link;
+       req->link = NULL;
 
-               list_del_init(&link->link_list);
-               trace_io_uring_fail_link(req, link);
+       while (link) {
+               nxt = link->link;
+               link->link = NULL;
 
+               trace_io_uring_fail_link(req, link);
                io_cqring_fill_event(link, -ECANCELED);
 
                /*
@@ -1963,8 +2042,8 @@ static void io_fail_links(struct io_kiocb *req)
                        io_put_req_deferred(link, 2);
                else
                        io_double_put_req(link);
+               link = nxt;
        }
-
        io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
@@ -1973,7 +2052,6 @@ static void io_fail_links(struct io_kiocb *req)
 
 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
 {
-       req->flags &= ~REQ_F_LINK_HEAD;
        if (req->flags & REQ_F_LINK_TIMEOUT)
                io_kill_linked_timeout(req);
 
@@ -1983,15 +2061,19 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
         * dependencies to the next request. In case of failure, fail the rest
         * of the chain.
         */
-       if (likely(!(req->flags & REQ_F_FAIL_LINK)))
-               return io_req_link_next(req);
+       if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
+               struct io_kiocb *nxt = req->link;
+
+               req->link = NULL;
+               return nxt;
+       }
        io_fail_links(req);
        return NULL;
 }
 
-static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 {
-       if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+       if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
                return NULL;
        return __io_req_find_next(req);
 }
@@ -2050,7 +2132,8 @@ static void __io_req_task_submit(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
-       if (!__io_sq_thread_acquire_mm(ctx)) {
+       if (!__io_sq_thread_acquire_mm(ctx) &&
+           !__io_sq_thread_acquire_files(ctx)) {
                mutex_lock(&ctx->uring_lock);
                __io_queue_sqe(req, NULL);
                mutex_unlock(&ctx->uring_lock);
@@ -2086,7 +2169,7 @@ static void io_req_task_queue(struct io_kiocb *req)
        }
 }
 
-static void io_queue_next(struct io_kiocb *req)
+static inline void io_queue_next(struct io_kiocb *req)
 {
        struct io_kiocb *nxt = io_req_find_next(req);
 
@@ -2143,8 +2226,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
                io_free_req(req);
                return;
        }
-       if (req->flags & REQ_F_LINK_HEAD)
-               io_queue_next(req);
+       io_queue_next(req);
 
        if (req->task != rb->task) {
                if (rb->task) {
@@ -2246,7 +2328,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
                 * we wake up the task, and the next invocation will flush the
                 * entries. We cannot safely do it from here.
                 */
-               if (noflush && !list_empty(&ctx->cq_overflow_list))
+               if (noflush)
                        return -1U;
 
                io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -2593,7 +2675,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
        if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
                return false;
 
-       ret = io_sq_thread_acquire_mm(req->ctx, req);
+       ret = io_sq_thread_acquire_mm_files(req->ctx, req);
 
        if (io_resubmit_prep(req, ret)) {
                refcount_inc(&req->refs);
@@ -2641,7 +2723,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  * find it from a io_iopoll_getevents() thread before the issuer is done
  * accessing the kiocb cookie.
  */
-static void io_iopoll_req_issued(struct io_kiocb *req)
+static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
@@ -2670,21 +2752,25 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
        else
                list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
 
-       if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+       /*
+        * If IORING_SETUP_SQPOLL is enabled, sqes are handled either in sq thread
+        * task context or in io worker task context. If the current task context is
+        * the sq thread, we don't need to check whether to wake up the sq thread.
+        */
+       if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
            wq_has_sleeper(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
 }
 
-static void __io_state_file_put(struct io_submit_state *state)
+static inline void __io_state_file_put(struct io_submit_state *state)
 {
-       if (state->has_refs)
-               fput_many(state->file, state->has_refs);
-       state->file = NULL;
+       fput_many(state->file, state->file_refs);
+       state->file_refs = 0;
 }
 
 static inline void io_state_file_put(struct io_submit_state *state)
 {
-       if (state->file)
+       if (state->file_refs)
                __io_state_file_put(state);
 }
 
@@ -2698,19 +2784,19 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
        if (!state)
                return fget(fd);
 
-       if (state->file) {
+       if (state->file_refs) {
                if (state->fd == fd) {
-                       state->has_refs--;
+                       state->file_refs--;
                        return state->file;
                }
                __io_state_file_put(state);
        }
        state->file = fget_many(fd, state->ios_left);
-       if (!state->file)
+       if (unlikely(!state->file))
                return NULL;
 
        state->fd = fd;
-       state->has_refs = state->ios_left - 1;
+       state->file_refs = state->ios_left - 1;
        return state->file;
 }
 
@@ -3065,7 +3151,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
        return __io_iov_buffer_select(req, iov, needs_lock);
 }
 
-static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
                                 struct iovec **iovec, struct iov_iter *iter,
                                 bool needs_lock)
 {
@@ -3094,7 +3180,7 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
 
                ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
                *iovec = NULL;
-               return ret < 0 ? ret : sqe_len;
+               return ret;
        }
 
        if (req->flags & REQ_F_BUFFER_SELECT) {
@@ -3111,18 +3197,6 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
                              req->ctx->compat);
 }
 
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-                              struct iovec **iovec, struct iov_iter *iter,
-                              bool needs_lock)
-{
-       struct io_async_rw *iorw = req->async_data;
-
-       if (!iorw)
-               return __io_import_iovec(rw, req, iovec, iter, needs_lock);
-       *iovec = NULL;
-       return iov_iter_count(&iorw->iter);
-}
-
 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
 {
        return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@ -3246,7 +3320,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
        struct iovec *iov = iorw->fast_iov;
        ssize_t ret;
 
-       ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
+       ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
        if (unlikely(ret < 0))
                return ret;
 
@@ -3379,17 +3453,17 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        ssize_t io_size, ret, ret2;
-       size_t iov_count;
        bool no_async;
 
-       if (rw)
+       if (rw) {
                iter = &rw->iter;
-
-       ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
-       if (ret < 0)
-               return ret;
-       iov_count = iov_iter_count(iter);
-       io_size = ret;
+               iovec = NULL;
+       } else {
+               ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
+               if (ret < 0)
+                       return ret;
+       }
+       io_size = iov_iter_count(iter);
        req->result = io_size;
        ret = 0;
 
@@ -3405,7 +3479,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
        if (no_async)
                goto copy_iov;
 
-       ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
+       ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret))
                goto out_free;
 
@@ -3424,7 +3498,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
                if (req->file->f_flags & O_NONBLOCK)
                        goto done;
                /* some cases will consume bytes even on error returns */
-               iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+               iov_iter_revert(iter, io_size - iov_iter_count(iter));
                ret = 0;
                goto copy_iov;
        } else if (ret < 0) {
@@ -3507,17 +3581,17 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
-       size_t iov_count;
        ssize_t ret, ret2, io_size;
 
-       if (rw)
+       if (rw) {
                iter = &rw->iter;
-
-       ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
-       if (ret < 0)
-               return ret;
-       iov_count = iov_iter_count(iter);
-       io_size = ret;
+               iovec = NULL;
+       } else {
+               ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
+               if (ret < 0)
+                       return ret;
+       }
+       io_size = iov_iter_count(iter);
        req->result = io_size;
 
        /* Ensure we clear previously set non-block flag */
@@ -3535,7 +3609,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
            (req->flags & REQ_F_ISREG))
                goto copy_iov;
 
-       ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
+       ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret))
                goto out_free;
 
@@ -3578,7 +3652,7 @@ done:
        } else {
 copy_iov:
                /* some cases will consume bytes even on error returns */
-               iov_iter_revert(iter, iov_count - iov_iter_count(iter));
+               iov_iter_revert(iter, io_size - iov_iter_count(iter));
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                if (!ret)
                        return -EAGAIN;
@@ -3590,6 +3664,135 @@ out_free:
        return ret;
 }
 
+static int io_renameat_prep(struct io_kiocb *req,
+                           const struct io_uring_sqe *sqe)
+{
+       struct io_rename *ren = &req->rename;
+       const char __user *oldf, *newf;
+
+       if (unlikely(req->flags & REQ_F_FIXED_FILE))
+               return -EBADF;
+
+       ren->old_dfd = READ_ONCE(sqe->fd);
+       oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+       newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+       ren->new_dfd = READ_ONCE(sqe->len);
+       ren->flags = READ_ONCE(sqe->rename_flags);
+
+       ren->oldpath = getname(oldf);
+       if (IS_ERR(ren->oldpath))
+               return PTR_ERR(ren->oldpath);
+
+       ren->newpath = getname(newf);
+       if (IS_ERR(ren->newpath)) {
+               putname(ren->oldpath);
+               return PTR_ERR(ren->newpath);
+       }
+
+       req->flags |= REQ_F_NEED_CLEANUP;
+       return 0;
+}
+
+static int io_renameat(struct io_kiocb *req, bool force_nonblock)
+{
+       struct io_rename *ren = &req->rename;
+       int ret;
+
+       if (force_nonblock)
+               return -EAGAIN;
+
+       ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
+                               ren->newpath, ren->flags);
+
+       req->flags &= ~REQ_F_NEED_CLEANUP;
+       if (ret < 0)
+               req_set_fail_links(req);
+       io_req_complete(req, ret);
+       return 0;
+}
+
+static int io_unlinkat_prep(struct io_kiocb *req,
+                           const struct io_uring_sqe *sqe)
+{
+       struct io_unlink *un = &req->unlink;
+       const char __user *fname;
+
+       if (unlikely(req->flags & REQ_F_FIXED_FILE))
+               return -EBADF;
+
+       un->dfd = READ_ONCE(sqe->fd);
+
+       un->flags = READ_ONCE(sqe->unlink_flags);
+       if (un->flags & ~AT_REMOVEDIR)
+               return -EINVAL;
+
+       fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+       un->filename = getname(fname);
+       if (IS_ERR(un->filename))
+               return PTR_ERR(un->filename);
+
+       req->flags |= REQ_F_NEED_CLEANUP;
+       return 0;
+}
+
+static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
+{
+       struct io_unlink *un = &req->unlink;
+       int ret;
+
+       if (force_nonblock)
+               return -EAGAIN;
+
+       if (un->flags & AT_REMOVEDIR)
+               ret = do_rmdir(un->dfd, un->filename);
+       else
+               ret = do_unlinkat(un->dfd, un->filename);
+
+       req->flags &= ~REQ_F_NEED_CLEANUP;
+       if (ret < 0)
+               req_set_fail_links(req);
+       io_req_complete(req, ret);
+       return 0;
+}
+
+static int io_shutdown_prep(struct io_kiocb *req,
+                           const struct io_uring_sqe *sqe)
+{
+#if defined(CONFIG_NET)
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
+       if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+           sqe->buf_index)
+               return -EINVAL;
+
+       req->shutdown.how = READ_ONCE(sqe->len);
+       return 0;
+#else
+       return -EOPNOTSUPP;
+#endif
+}
+
+static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+       struct socket *sock;
+       int ret;
+
+       if (force_nonblock)
+               return -EAGAIN;
+
+       sock = sock_from_file(req->file);
+       if (unlikely(!sock))
+               return -ENOTSOCK;
+
+       ret = __sys_shutdown_sock(sock, req->shutdown.how);
+       io_req_complete(req, ret);
+       return 0;
+#else
+       return -EOPNOTSUPP;
+#endif
+}
+
 static int __io_splice_prep(struct io_kiocb *req,
                            const struct io_uring_sqe *sqe)
 {
@@ -3804,7 +4007,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        u64 flags, mode;
 
-       if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        mode = READ_ONCE(sqe->len);
        flags = READ_ONCE(sqe->open_flags);
@@ -3818,7 +4021,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        size_t len;
        int ret;
 
-       if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        len = READ_ONCE(sqe->len);
@@ -3948,15 +4151,21 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
        head = idr_find(&ctx->io_buffer_idr, p->bgid);
        if (head)
                ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
-
-       io_ring_submit_lock(ctx, !force_nonblock);
        if (ret < 0)
                req_set_fail_links(req);
-       __io_req_complete(req, ret, 0, cs);
-       return 0;
-}
 
-static int io_provide_buffers_prep(struct io_kiocb *req,
+       /* need to hold the lock to complete IOPOLL requests */
+       if (ctx->flags & IORING_SETUP_IOPOLL) {
+               __io_req_complete(req, ret, 0, cs);
+               io_ring_submit_unlock(ctx, !force_nonblock);
+       } else {
+               io_ring_submit_unlock(ctx, !force_nonblock);
+               __io_req_complete(req, ret, 0, cs);
+       }
+       return 0;
+}
+
+static int io_provide_buffers_prep(struct io_kiocb *req,
                                   const struct io_uring_sqe *sqe)
 {
        struct io_provide_buf *p = &req->pbuf;
@@ -4037,10 +4246,17 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
                }
        }
 out:
-       io_ring_submit_unlock(ctx, !force_nonblock);
        if (ret < 0)
                req_set_fail_links(req);
-       __io_req_complete(req, ret, 0, cs);
+
+       /* need to hold the lock to complete IOPOLL requests */
+       if (ctx->flags & IORING_SETUP_IOPOLL) {
+               __io_req_complete(req, ret, 0, cs);
+               io_ring_submit_unlock(ctx, !force_nonblock);
+       } else {
+               io_ring_submit_unlock(ctx, !force_nonblock);
+               __io_req_complete(req, ret, 0, cs);
+       }
        return 0;
 }
 
@@ -4212,7 +4428,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        io_req_init_async(req);
        req->work.flags |= IO_WQ_WORK_NO_CANCEL;
 
-       if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
            sqe->rw_flags || sqe->buf_index)
@@ -4694,7 +4910,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct io_accept *accept = &req->accept;
 
-       if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index)
                return -EINVAL;
@@ -4735,7 +4951,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        struct io_connect *conn = &req->connect;
        struct io_async_connect *io = req->async_data;
 
-       if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
                return -EINVAL;
@@ -5270,7 +5486,8 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 /*
  * Returns true if we found and killed one or more poll requests
  */
-static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
+static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
+                              struct files_struct *files)
 {
        struct hlist_node *tmp;
        struct io_kiocb *req;
@@ -5282,7 +5499,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
 
                list = &ctx->cancel_hash[i];
                hlist_for_each_entry_safe(req, tmp, list, hash_node) {
-                       if (io_task_match(req, tsk))
+                       if (io_match_task(req, tsk, files))
                                posted += io_poll_remove_one(req);
                }
        }
@@ -5320,7 +5537,7 @@ static int io_poll_remove_prep(struct io_kiocb *req,
            sqe->poll_events)
                return -EINVAL;
 
-       req->poll.addr = READ_ONCE(sqe->addr);
+       req->poll_remove.addr = READ_ONCE(sqe->addr);
        return 0;
 }
 
@@ -5331,12 +5548,10 @@ static int io_poll_remove_prep(struct io_kiocb *req,
 static int io_poll_remove(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
-       u64 addr;
        int ret;
 
-       addr = req->poll.addr;
        spin_lock_irq(&ctx->completion_lock);
-       ret = io_poll_cancel(ctx, addr);
+       ret = io_poll_cancel(ctx, req->poll_remove.addr);
        spin_unlock_irq(&ctx->completion_lock);
 
        if (ret < 0)
@@ -5429,15 +5644,37 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
        return HRTIMER_NORESTART;
 }
 
-static int __io_timeout_cancel(struct io_kiocb *req)
+static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
+                                          __u64 user_data)
 {
-       struct io_timeout_data *io = req->async_data;
-       int ret;
+       struct io_timeout_data *io;
+       struct io_kiocb *req;
+       int ret = -ENOENT;
 
+       list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
+               if (user_data == req->user_data) {
+                       ret = 0;
+                       break;
+               }
+       }
+
+       if (ret == -ENOENT)
+               return ERR_PTR(ret);
+
+       io = req->async_data;
        ret = hrtimer_try_to_cancel(&io->timer);
        if (ret == -1)
-               return -EALREADY;
+               return ERR_PTR(-EALREADY);
        list_del_init(&req->timeout.list);
+       return req;
+}
+
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+{
+       struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+
+       if (IS_ERR(req))
+               return PTR_ERR(req);
 
        req_set_fail_links(req);
        io_cqring_fill_event(req, -ECANCELED);
@@ -5445,35 +5682,48 @@ static int __io_timeout_cancel(struct io_kiocb *req)
        return 0;
 }
 
-static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+                            struct timespec64 *ts, enum hrtimer_mode mode)
 {
-       struct io_kiocb *req;
-       int ret = -ENOENT;
-
-       list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
-               if (user_data == req->user_data) {
-                       ret = 0;
-                       break;
-               }
-       }
+       struct io_kiocb *req = io_timeout_extract(ctx, user_data);
+       struct io_timeout_data *data;
 
-       if (ret == -ENOENT)
-               return ret;
+       if (IS_ERR(req))
+               return PTR_ERR(req);
 
-       return __io_timeout_cancel(req);
+       req->timeout.off = 0; /* noseq */
+       data = req->async_data;
+       list_add_tail(&req->timeout.list, &ctx->timeout_list);
+       hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
+       data->timer.function = io_timeout_fn;
+       hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
+       return 0;
 }
 
 static int io_timeout_remove_prep(struct io_kiocb *req,
                                  const struct io_uring_sqe *sqe)
 {
+       struct io_timeout_rem *tr = &req->timeout_rem;
+
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                return -EINVAL;
-       if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
+       if (sqe->ioprio || sqe->buf_index || sqe->len)
                return -EINVAL;
 
-       req->timeout_rem.addr = READ_ONCE(sqe->addr);
+       tr->addr = READ_ONCE(sqe->addr);
+       tr->flags = READ_ONCE(sqe->timeout_flags);
+       if (tr->flags & IORING_TIMEOUT_UPDATE) {
+               if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
+                       return -EINVAL;
+               if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
+                       return -EFAULT;
+       } else if (tr->flags) {
+               /* timeout removal doesn't support flags */
+               return -EINVAL;
+       }
+
        return 0;
 }
 
@@ -5482,11 +5732,19 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
  */
 static int io_timeout_remove(struct io_kiocb *req)
 {
+       struct io_timeout_rem *tr = &req->timeout_rem;
        struct io_ring_ctx *ctx = req->ctx;
        int ret;
 
        spin_lock_irq(&ctx->completion_lock);
-       ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
+       if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) {
+               enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS)
+                                       ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
+
+               ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
+       } else {
+               ret = io_timeout_cancel(ctx, tr->addr);
+       }
 
        io_cqring_fill_event(req, ret);
        io_commit_cqring(ctx);
@@ -5766,6 +6024,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                return io_remove_buffers_prep(req, sqe);
        case IORING_OP_TEE:
                return io_tee_prep(req, sqe);
+       case IORING_OP_SHUTDOWN:
+               return io_shutdown_prep(req, sqe);
+       case IORING_OP_RENAMEAT:
+               return io_renameat_prep(req, sqe);
+       case IORING_OP_UNLINKAT:
+               return io_unlinkat_prep(req, sqe);
        }
 
        printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -5787,11 +6051,10 @@ static u32 io_get_sequence(struct io_kiocb *req)
 {
        struct io_kiocb *pos;
        struct io_ring_ctx *ctx = req->ctx;
-       u32 total_submitted, nr_reqs = 1;
+       u32 total_submitted, nr_reqs = 0;
 
-       if (req->flags & REQ_F_LINK_HEAD)
-               list_for_each_entry(pos, &req->link_list, link_list)
-                       nr_reqs++;
+       io_for_each_link(pos, req)
+               nr_reqs++;
 
        total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
        return total_submitted - nr_reqs;
@@ -5843,12 +6106,13 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static void io_req_drop_files(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
+       struct io_uring_task *tctx = req->task->io_uring;
        unsigned long flags;
 
        spin_lock_irqsave(&ctx->inflight_lock, flags);
        list_del(&req->inflight_entry);
-       if (waitqueue_active(&ctx->inflight_wait))
-               wake_up(&ctx->inflight_wait);
+       if (atomic_read(&tctx->in_idle))
+               wake_up(&tctx->wait);
        spin_unlock_irqrestore(&ctx->inflight_lock, flags);
        req->flags &= ~REQ_F_INFLIGHT;
        put_files_struct(req->work.identity->files);
@@ -5903,6 +6167,13 @@ static void __io_clean_op(struct io_kiocb *req)
                        if (req->open.filename)
                                putname(req->open.filename);
                        break;
+               case IORING_OP_RENAMEAT:
+                       putname(req->rename.oldpath);
+                       putname(req->rename.newpath);
+                       break;
+               case IORING_OP_UNLINKAT:
+                       putname(req->unlink.filename);
+                       break;
                }
                req->flags &= ~REQ_F_NEED_CLEANUP;
        }
@@ -6009,6 +6280,15 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
        case IORING_OP_TEE:
                ret = io_tee(req, force_nonblock);
                break;
+       case IORING_OP_SHUTDOWN:
+               ret = io_shutdown(req, force_nonblock);
+               break;
+       case IORING_OP_RENAMEAT:
+               ret = io_renameat(req, force_nonblock);
+               break;
+       case IORING_OP_UNLINKAT:
+               ret = io_unlinkat(req, force_nonblock);
+               break;
        default:
                ret = -EINVAL;
                break;
@@ -6025,7 +6305,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
                if (in_async)
                        mutex_lock(&ctx->uring_lock);
 
-               io_iopoll_req_issued(req);
+               io_iopoll_req_issued(req, in_async);
 
                if (in_async)
                        mutex_unlock(&ctx->uring_lock);
@@ -6065,8 +6345,19 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
        }
 
        if (ret) {
-               req_set_fail_links(req);
-               io_req_complete(req, ret);
+               /*
+                * io_iopoll_complete() does not hold completion_lock to complete
+                * polled io, so here for polled io, just mark it done and still let
+                * io_iopoll_complete() complete it.
+                */
+               if (req->ctx->flags & IORING_SETUP_IOPOLL) {
+                       struct kiocb *kiocb = &req->rw.kiocb;
+
+                       kiocb_done(kiocb, ret, NULL);
+               } else {
+                       req_set_fail_links(req);
+                       io_req_complete(req, ret);
+               }
        }
 
        return io_steal_work(req);
@@ -6092,10 +6383,7 @@ static struct file *io_file_get(struct io_submit_state *state,
                        return NULL;
                fd = array_index_nospec(fd, ctx->nr_user_files);
                file = io_file_from_index(ctx, fd);
-               if (file) {
-                       req->fixed_file_refs = &ctx->file_data->node->refs;
-                       percpu_ref_get(req->fixed_file_refs);
-               }
+               io_set_resource_node(req);
        } else {
                trace_io_uring_file_get(ctx, fd);
                file = __io_file_get(state, fd);
@@ -6104,45 +6392,26 @@ static struct file *io_file_get(struct io_submit_state *state,
        return file;
 }
 
-static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
-                          int fd)
-{
-       bool fixed;
-
-       fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
-       if (unlikely(!fixed && io_async_submit(req->ctx)))
-               return -EBADF;
-
-       req->file = io_file_get(state, req, fd, fixed);
-       if (req->file || io_op_defs[req->opcode].needs_file_no_error)
-               return 0;
-       return -EBADF;
-}
-
 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 {
        struct io_timeout_data *data = container_of(timer,
                                                struct io_timeout_data, timer);
-       struct io_kiocb *req = data->req;
+       struct io_kiocb *prev, *req = data->req;
        struct io_ring_ctx *ctx = req->ctx;
-       struct io_kiocb *prev = NULL;
        unsigned long flags;
 
        spin_lock_irqsave(&ctx->completion_lock, flags);
+       prev = req->timeout.head;
+       req->timeout.head = NULL;
 
        /*
         * We don't expect the list to be empty, that will only happen if we
         * race with the completion of the linked work.
         */
-       if (!list_empty(&req->link_list)) {
-               prev = list_entry(req->link_list.prev, struct io_kiocb,
-                                 link_list);
-               if (refcount_inc_not_zero(&prev->refs))
-                       list_del_init(&req->link_list);
-               else
-                       prev = NULL;
-       }
-
+       if (prev && refcount_inc_not_zero(&prev->refs))
+               io_remove_next_linked(prev);
+       else
+               prev = NULL;
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
        if (prev) {
@@ -6158,10 +6427,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 static void __io_queue_linked_timeout(struct io_kiocb *req)
 {
        /*
-        * If the list is now empty, then our linked request finished before
-        * we got a chance to setup the timer
+        * If the back reference is NULL, then our linked request finished
+        * before we got a chance to setup the timer
         */
-       if (!list_empty(&req->link_list)) {
+       if (req->timeout.head) {
                struct io_timeout_data *data = req->async_data;
 
                data->timer.function = io_link_timeout_fn;
@@ -6184,18 +6453,13 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
 
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 {
-       struct io_kiocb *nxt;
-
-       if (!(req->flags & REQ_F_LINK_HEAD))
-               return NULL;
-       if (req->flags & REQ_F_LINK_TIMEOUT)
-               return NULL;
+       struct io_kiocb *nxt = req->link;
 
-       nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
-                                       link_list);
-       if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
+       if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
+           nxt->opcode != IORING_OP_LINK_TIMEOUT)
                return NULL;
 
+       nxt->timeout.head = req;
        nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
        req->flags |= REQ_F_LINK_TIMEOUT;
        return nxt;
@@ -6301,8 +6565,13 @@ static inline void io_queue_link_head(struct io_kiocb *req,
                io_queue_sqe(req, NULL, cs);
 }
 
+struct io_submit_link {
+       struct io_kiocb *head;
+       struct io_kiocb *last;
+};
+
 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-                        struct io_kiocb **link, struct io_comp_state *cs)
+                        struct io_submit_link *link, struct io_comp_state *cs)
 {
        struct io_ring_ctx *ctx = req->ctx;
        int ret;
@@ -6314,8 +6583,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
         * submitted sync once the chain is complete. If none of those
         * conditions are true (normal request), then just queue it.
         */
-       if (*link) {
-               struct io_kiocb *head = *link;
+       if (link->head) {
+               struct io_kiocb *head = link->head;
 
                /*
                 * Taking sequential execution of a link, draining both sides
@@ -6335,12 +6604,13 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        return ret;
                }
                trace_io_uring_link(ctx, req, head);
-               list_add_tail(&req->link_list, &head->link_list);
+               link->last->link = req;
+               link->last = req;
 
                /* last request of a link, enqueue the link */
                if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
                        io_queue_link_head(head, cs);
-                       *link = NULL;
+                       link->head = NULL;
                }
        } else {
                if (unlikely(ctx->drain_next)) {
@@ -6348,13 +6618,11 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        ctx->drain_next = 0;
                }
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
-                       req->flags |= REQ_F_LINK_HEAD;
-                       INIT_LIST_HEAD(&req->link_list);
-
                        ret = io_req_defer_prep(req, sqe);
                        if (unlikely(ret))
                                req->flags |= REQ_F_FAIL_LINK;
-                       *link = req;
+                       link->head = req;
+                       link->last = req;
                } else {
                        io_queue_sqe(req, sqe, cs);
                }
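
/*
 * A small standalone sketch of the head/last bookkeeping that
 * struct io_submit_link performs above: append each new request to
 * last->link in O(1), and submit the whole chain once a request without the
 * LINK flag arrives.  Types and helpers here are illustrative only.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct chain_req {
	int id;
	bool has_link_flag;
	struct chain_req *link;		/* singly linked successor */
};

struct chain {
	struct chain_req *head;
	struct chain_req *last;
};

static void queue_chain(struct chain_req *head)
{
	for (; head; head = head->link)
		printf("submit req %d\n", head->id);
}

static void submit_one(struct chain *c, struct chain_req *req)
{
	if (c->head) {
		c->last->link = req;		/* "link->last->link = req" */
		c->last = req;
		if (!req->has_link_flag) {	/* last request of the link */
			queue_chain(c->head);
			c->head = NULL;
		}
	} else if (req->has_link_flag) {
		c->head = c->last = req;	/* start a new chain */
	} else {
		queue_chain(req);		/* unlinked request, queue now */
	}
}

int main(void)
{
	struct chain_req a = { 1, true, NULL }, b = { 2, true, NULL }, d = { 3, false, NULL };
	struct chain c = { NULL, NULL };

	submit_one(&c, &a);
	submit_one(&c, &b);
	submit_one(&c, &d);	/* no LINK flag: chain 1->2->3 is submitted */
	return 0;
}
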
@@ -6370,7 +6638,8 @@ static void io_submit_state_end(struct io_submit_state *state)
 {
        if (!list_empty(&state->comp.list))
                io_submit_flush_completions(&state->comp);
-       blk_finish_plug(&state->plug);
+       if (state->plug_started)
+               blk_finish_plug(&state->plug);
        io_state_file_put(state);
        if (state->free_reqs)
                kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
@@ -6382,12 +6651,12 @@ static void io_submit_state_end(struct io_submit_state *state)
 static void io_submit_state_start(struct io_submit_state *state,
                                  struct io_ring_ctx *ctx, unsigned int max_ios)
 {
-       blk_start_plug(&state->plug);
+       state->plug_started = false;
        state->comp.nr = 0;
        INIT_LIST_HEAD(&state->comp.list);
        state->comp.ctx = ctx;
        state->free_reqs = 0;
-       state->file = NULL;
+       state->file_refs = 0;
        state->ios_left = max_ios;
 }
 
@@ -6482,6 +6751,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
        req->file = NULL;
        req->ctx = ctx;
        req->flags = 0;
+       req->link = NULL;
+       req->fixed_file_refs = NULL;
        /* one is dropped after submission, the other at completion */
        refcount_set(&req->refs, 2);
        req->task = current;
@@ -6490,7 +6761,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
        if (unlikely(req->opcode >= IORING_OP_LAST))
                return -EINVAL;
 
-       if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
+       if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
                return -EFAULT;
 
        sqe_flags = READ_ONCE(sqe->flags);
@@ -6523,10 +6794,26 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
        /* same numerical values with corresponding REQ_F_*, safe to copy */
        req->flags |= sqe_flags;
 
-       if (!io_op_defs[req->opcode].needs_file)
-               return 0;
+       /*
+        * Plug now if we have more than 1 IO left after this, and the target
+        * is potentially a read/write to block based storage.
+        */
+       if (!state->plug_started && state->ios_left > 1 &&
+           io_op_defs[req->opcode].plug) {
+               blk_start_plug(&state->plug);
+               state->plug_started = true;
+       }
+
+       ret = 0;
+       if (io_op_defs[req->opcode].needs_file) {
+               bool fixed = req->flags & REQ_F_FIXED_FILE;
+
+               req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
+               if (unlikely(!req->file &&
+                   !io_op_defs[req->opcode].needs_file_no_error))
+                       ret = -EBADF;
+       }
 
-       ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
        state->ios_left--;
        return ret;
 }
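
/*
 * The plug is now started lazily, only when it can help.  A standalone
 * sketch of the same pattern, with start_batch()/end_batch() standing in for
 * blk_start_plug()/blk_finish_plug(); nothing here is an io_uring or
 * block-layer API.
 */
#include <stdbool.h>
#include <stdio.h>

struct submit_state {
	bool batch_started;
	unsigned int ios_left;
};

static void start_batch(void) { printf("batch started\n"); }
static void end_batch(void)   { printf("batch flushed\n"); }

static void init_one(struct submit_state *s, bool opcode_benefits_from_batching)
{
	/* Open the batch only once, and only when more IO is still coming. */
	if (!s->batch_started && s->ios_left > 1 && opcode_benefits_from_batching) {
		start_batch();
		s->batch_started = true;
	}
	s->ios_left--;
}

static void state_end(struct submit_state *s)
{
	if (s->batch_started)	/* mirrors io_submit_state_end() */
		end_batch();
}

int main(void)
{
	struct submit_state s = { .batch_started = false, .ios_left = 3 };

	init_one(&s, true);
	init_one(&s, false);
	init_one(&s, true);
	state_end(&s);
	return 0;
}
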
@@ -6534,7 +6821,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 {
        struct io_submit_state state;
-       struct io_kiocb *link = NULL;
+       struct io_submit_link link;
        int i, submitted = 0;
 
        /* if we have a backlog and couldn't flush it all, return BUSY */
@@ -6554,6 +6841,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
        refcount_add(nr, &current->usage);
 
        io_submit_state_start(&state, ctx, nr);
+       link.head = NULL;
 
        for (i = 0; i < nr; i++) {
                const struct io_uring_sqe *sqe;
@@ -6599,8 +6887,8 @@ fail_req:
                percpu_counter_sub(&tctx->inflight, unused);
                put_task_struct_many(current, unused);
        }
-       if (link)
-               io_queue_link_head(link, &state.comp);
+       if (link.head)
+               io_queue_link_head(link.head, &state.comp);
        io_submit_state_end(&state);
 
         /* Commit SQ ring head once we've consumed and submitted all SQEs */
@@ -6624,111 +6912,45 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
        spin_unlock_irq(&ctx->completion_lock);
 }
 
-static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
-                              int sync, void *key)
+static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 {
-       struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
-       int ret;
-
-       ret = autoremove_wake_function(wqe, mode, sync, key);
-       if (ret) {
-               unsigned long flags;
-
-               spin_lock_irqsave(&ctx->completion_lock, flags);
-               ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
-               spin_unlock_irqrestore(&ctx->completion_lock, flags);
-       }
-       return ret;
-}
-
-enum sq_ret {
-       SQT_IDLE        = 1,
-       SQT_SPIN        = 2,
-       SQT_DID_WORK    = 4,
-};
-
-static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
-                                 unsigned long start_jiffies, bool cap_entries)
-{
-       unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
-       struct io_sq_data *sqd = ctx->sq_data;
        unsigned int to_submit;
        int ret = 0;
 
-again:
-       if (!list_empty(&ctx->iopoll_list)) {
+       to_submit = io_sqring_entries(ctx);
+       /* if we're handling multiple rings, cap submit size for fairness */
+       if (cap_entries && to_submit > 8)
+               to_submit = 8;
+
+       if (!list_empty(&ctx->iopoll_list) || to_submit) {
                unsigned nr_events = 0;
 
                mutex_lock(&ctx->uring_lock);
-               if (!list_empty(&ctx->iopoll_list) && !need_resched())
+               if (!list_empty(&ctx->iopoll_list))
                        io_do_iopoll(ctx, &nr_events, 0);
+
+               if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
+                       ret = io_submit_sqes(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);
        }
 
-       to_submit = io_sqring_entries(ctx);
-
-       /*
-        * If submit got -EBUSY, flag us as needing the application
-        * to enter the kernel to reap and flush events.
-        */
-       if (!to_submit || ret == -EBUSY || need_resched()) {
-               /*
-                * Drop cur_mm before scheduling, we can't hold it for
-                * long periods (or over schedule()). Do this before
-                * adding ourselves to the waitqueue, as the unuse/drop
-                * may sleep.
-                */
-               io_sq_thread_drop_mm();
-
-               /*
-                * We're polling. If we're within the defined idle
-                * period, then let us spin without work before going
-                * to sleep. The exception is if we got EBUSY doing
-                * more IO, we should wait for the application to
-                * reap events and wake us up.
-                */
-               if (!list_empty(&ctx->iopoll_list) || need_resched() ||
-                   (!time_after(jiffies, timeout) && ret != -EBUSY &&
-                   !percpu_ref_is_dying(&ctx->refs)))
-                       return SQT_SPIN;
+       if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
+               wake_up(&ctx->sqo_sq_wait);
 
-               prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
-                                       TASK_INTERRUPTIBLE);
+       return ret;
+}
 
-               /*
-                * While doing polled IO, before going to sleep, we need
-                * to check if there are new reqs added to iopoll_list,
-                * it is because reqs may have been punted to io worker
-                * and will be added to iopoll_list later, hence check
-                * the iopoll_list again.
-                */
-               if ((ctx->flags & IORING_SETUP_IOPOLL) &&
-                   !list_empty_careful(&ctx->iopoll_list)) {
-                       finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
-                       goto again;
-               }
+static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
+{
+       struct io_ring_ctx *ctx;
+       unsigned sq_thread_idle = 0;
 
-               to_submit = io_sqring_entries(ctx);
-               if (!to_submit || ret == -EBUSY)
-                       return SQT_IDLE;
+       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+               if (sq_thread_idle < ctx->sq_thread_idle)
+                       sq_thread_idle = ctx->sq_thread_idle;
        }
 
-       finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
-       io_ring_clear_wakeup_flag(ctx);
-
-       /* if we're handling multiple rings, cap submit size for fairness */
-       if (cap_entries && to_submit > 8)
-               to_submit = 8;
-
-       mutex_lock(&ctx->uring_lock);
-       if (likely(!percpu_ref_is_dying(&ctx->refs)))
-               ret = io_submit_sqes(ctx, to_submit);
-       mutex_unlock(&ctx->uring_lock);
-
-       if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
-               wake_up(&ctx->sqo_sq_wait);
-
-       return SQT_DID_WORK;
+       sqd->sq_thread_idle = sq_thread_idle;
 }
 
 static void io_sqd_init_new(struct io_sq_data *sqd)
@@ -6737,39 +6959,56 @@ static void io_sqd_init_new(struct io_sq_data *sqd)
 
        while (!list_empty(&sqd->ctx_new_list)) {
                ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
-               init_wait(&ctx->sqo_wait_entry);
-               ctx->sqo_wait_entry.func = io_sq_wake_function;
                list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
                complete(&ctx->sq_thread_comp);
        }
+
+       io_sqd_update_thread_idle(sqd);
 }
 
 static int io_sq_thread(void *data)
 {
        struct cgroup_subsys_state *cur_css = NULL;
+       struct files_struct *old_files = current->files;
+       struct nsproxy *old_nsproxy = current->nsproxy;
        const struct cred *old_cred = NULL;
        struct io_sq_data *sqd = data;
        struct io_ring_ctx *ctx;
-       unsigned long start_jiffies;
+       unsigned long timeout = 0;
+       DEFINE_WAIT(wait);
+
+       task_lock(current);
+       current->files = NULL;
+       current->nsproxy = NULL;
+       task_unlock(current);
 
-       start_jiffies = jiffies;
        while (!kthread_should_stop()) {
-               enum sq_ret ret = 0;
-               bool cap_entries;
+               int ret;
+               bool cap_entries, sqt_spin, needs_sched;
 
                /*
                 * Any changes to the sqd lists are synchronized through the
                 * kthread parking. This synchronizes the thread vs users,
                 * the users are synchronized on the sqd->ctx_lock.
                 */
-               if (kthread_should_park())
+               if (kthread_should_park()) {
                        kthread_parkme();
+                       /*
+                        * When the SQ thread is unparked, the park may have come
+                        * from io_put_sq_data(), which means the thread is about
+                        * to be stopped, so check for that here.
+                        */
+                       if (kthread_should_stop())
+                               break;
+               }
 
-               if (unlikely(!list_empty(&sqd->ctx_new_list)))
+               if (unlikely(!list_empty(&sqd->ctx_new_list))) {
                        io_sqd_init_new(sqd);
+                       timeout = jiffies + sqd->sq_thread_idle;
+               }
 
+               sqt_spin = false;
                cap_entries = !list_is_singular(&sqd->ctx_list);
-
                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
                        if (current->cred != ctx->creds) {
                                if (old_cred)
@@ -6782,24 +7021,49 @@ static int io_sq_thread(void *data)
                        current->sessionid = ctx->sessionid;
 #endif
 
-                       ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
+                       ret = __io_sq_thread(ctx, cap_entries);
+                       if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
+                               sqt_spin = true;
 
-                       io_sq_thread_drop_mm();
+                       io_sq_thread_drop_mm_files();
                }
 
-               if (ret & SQT_SPIN) {
+               if (sqt_spin || !time_after(jiffies, timeout)) {
                        io_run_task_work();
                        cond_resched();
-               } else if (ret == SQT_IDLE) {
-                       if (kthread_should_park())
-                               continue;
+                       if (sqt_spin)
+                               timeout = jiffies + sqd->sq_thread_idle;
+                       continue;
+               }
+
+               if (kthread_should_park())
+                       continue;
+
+               needs_sched = true;
+               prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
+               list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+                       if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+                           !list_empty_careful(&ctx->iopoll_list)) {
+                               needs_sched = false;
+                               break;
+                       }
+                       if (io_sqring_entries(ctx)) {
+                               needs_sched = false;
+                               break;
+                       }
+               }
+
+               if (needs_sched) {
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_set_wakeup_flag(ctx);
+
                        schedule();
-                       start_jiffies = jiffies;
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_ring_clear_wakeup_flag(ctx);
                }
+
+               finish_wait(&sqd->wait, &wait);
+               timeout = jiffies + sqd->sq_thread_idle;
        }
 
        io_run_task_work();
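
/*
 * A compact userspace sketch of the reworked SQPOLL loop shape: keep
 * spinning while any ring produced work or the shared idle window has not
 * expired, and only go to sleep after rechecking every ring for pending
 * submissions.  The "clock" is a simple tick counter and
 * do_ring_work()/ring_has_work() are hypothetical stand-ins, so this mirrors
 * only the control flow (timeout, sqt_spin, needs_sched), not the kernel
 * primitives such as prepare_to_wait() and schedule().
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_RINGS	2
#define IDLE_TICKS	3	/* stand-in for sqd->sq_thread_idle */

static unsigned long now;	/* one tick per loop iteration */

static bool do_ring_work(int ring)  { return now < 2 && ring == 0; }
static bool ring_has_work(int ring) { (void)ring; return false; }

int main(void)
{
	unsigned long timeout = IDLE_TICKS;

	for (now = 0; now < 10; now++) {
		bool sqt_spin = false, needs_sched = true;
		int i;

		for (i = 0; i < NR_RINGS; i++)		/* like __io_sq_thread() */
			sqt_spin |= do_ring_work(i);

		if (sqt_spin || now <= timeout) {
			if (sqt_spin)
				timeout = now + IDLE_TICKS;
			printf("tick %lu: keep polling\n", now);
			continue;
		}

		for (i = 0; i < NR_RINGS; i++) {	/* recheck before sleeping */
			if (ring_has_work(i)) {
				needs_sched = false;
				break;
			}
		}

		if (needs_sched)
			printf("tick %lu: would sleep until woken\n", now);
		timeout = now + IDLE_TICKS;
	}
	return 0;
}
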
@@ -6809,6 +7073,11 @@ static int io_sq_thread(void *data)
        if (old_cred)
                revert_creds(old_cred);
 
+       task_lock(current);
+       current->files = old_files;
+       current->nsproxy = old_nsproxy;
+       task_unlock(current);
+
        kthread_parkme();
 
        return 0;
@@ -6863,7 +7132,8 @@ static int io_run_task_work_sig(void)
  * application must reap them itself, as they reside on the shared cq ring.
  */
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
-                         const sigset_t __user *sig, size_t sigsz)
+                         const sigset_t __user *sig, size_t sigsz,
+                         struct __kernel_timespec __user *uts)
 {
        struct io_wait_queue iowq = {
                .wq = {
@@ -6875,6 +7145,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                .to_wait        = min_events,
        };
        struct io_rings *rings = ctx->rings;
+       struct timespec64 ts;
+       signed long timeout = 0;
        int ret = 0;
 
        do {
@@ -6897,6 +7169,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                        return ret;
        }
 
+       if (uts) {
+               if (get_timespec64(&ts, uts))
+                       return -EFAULT;
+               timeout = timespec64_to_jiffies(&ts);
+       }
+
        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
        trace_io_uring_cqring_wait(ctx, min_events);
        do {
@@ -6910,7 +7188,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                        break;
                if (io_should_wake(&iowq, false))
                        break;
-               schedule();
+               if (uts) {
+                       timeout = schedule_timeout(timeout);
+                       if (timeout == 0) {
+                               ret = -ETIME;
+                               break;
+                       }
+               } else {
+                       schedule();
+               }
        } while (1);
        finish_wait(&ctx->wait, &iowq.wq);
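
/*
 * schedule_timeout() above returns the unused part of the budget, so the
 * remaining jiffies are carried across wakeups and the wait fails with
 * -ETIME once they run out.  A standalone sketch of that shrinking-budget
 * loop; wait_step() is a hypothetical stand-in for one sleep/wake cycle.
 */
#include <errno.h>
#include <stdio.h>

/* Pretend each wakeup consumed 10 ticks and the condition is still false. */
static long wait_step(long budget, int *done)
{
	*done = 0;
	return budget > 10 ? budget - 10 : 0;
}

int main(void)
{
	long budget = 25;	/* like timespec64_to_jiffies(&ts) */
	int ret = 0, done = 0;

	do {
		budget = wait_step(budget, &done);
		if (done)
			break;
		if (budget == 0) {	/* budget exhausted: give up */
			ret = -ETIME;
			break;
		}
	} while (1);

	printf("ret=%d\n", ret);
	return 0;
}
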
 
@@ -6959,9 +7245,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
        if (!data)
                return -ENXIO;
 
-       spin_lock(&data->lock);
+       spin_lock_bh(&data->lock);
        ref_node = data->node;
-       spin_unlock(&data->lock);
+       spin_unlock_bh(&data->lock);
        if (ref_node)
                percpu_ref_kill(&ref_node->refs);
 
@@ -7084,12 +7370,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 
                mutex_lock(&sqd->ctx_lock);
                list_del(&ctx->sqd_list);
+               io_sqd_update_thread_idle(sqd);
                mutex_unlock(&sqd->ctx_lock);
 
-               if (sqd->thread) {
-                       finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
+               if (sqd->thread)
                        io_sq_thread_unpark(sqd);
-               }
 
                io_put_sq_data(sqd);
                ctx->sq_data = NULL;
@@ -7344,7 +7629,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
        data = ref_node->file_data;
        ctx = data->ctx;
 
-       spin_lock(&data->lock);
+       spin_lock_bh(&data->lock);
        ref_node->done = true;
 
        while (!list_empty(&data->ref_list)) {
@@ -7356,7 +7641,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
                list_del(&ref_node->node);
                first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
        }
-       spin_unlock(&data->lock);
+       spin_unlock_bh(&data->lock);
 
        if (percpu_ref_is_dying(&data->refs))
                delay = 0;
@@ -7479,9 +7764,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
        }
 
        file_data->node = ref_node;
-       spin_lock(&file_data->lock);
+       spin_lock_bh(&file_data->lock);
        list_add_tail(&ref_node->node, &file_data->ref_list);
-       spin_unlock(&file_data->lock);
+       spin_unlock_bh(&file_data->lock);
        percpu_ref_get(&file_data->refs);
        return ret;
 out_fput:
@@ -7638,10 +7923,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 
        if (needs_switch) {
                percpu_ref_kill(&data->node->refs);
-               spin_lock(&data->lock);
+               spin_lock_bh(&data->lock);
                list_add_tail(&ref_node->node, &data->ref_list);
                data->node = ref_node;
-               spin_unlock(&data->lock);
+               spin_unlock_bh(&data->lock);
                percpu_ref_get(&ctx->file_data->refs);
        } else
                destroy_fixed_file_ref_node(ref_node);
@@ -7769,7 +8054,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                struct io_sq_data *sqd;
 
                ret = -EPERM;
-               if (!capable(CAP_SYS_ADMIN))
+               if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
                        goto err;
 
                sqd = io_get_sq_data(p);
@@ -8355,8 +8640,6 @@ static void io_ring_exit_work(struct work_struct *work)
         * as nobody else will be looking for them.
         */
        do {
-               if (ctx->rings)
-                       io_cqring_overflow_flush(ctx, true, NULL, NULL);
                io_iopoll_try_reap_events(ctx);
        } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
        io_ring_ctx_free(ctx);
@@ -8366,17 +8649,17 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
+       if (ctx->rings)
+               io_cqring_overflow_flush(ctx, true, NULL, NULL);
        mutex_unlock(&ctx->uring_lock);
 
-       io_kill_timeouts(ctx, NULL);
-       io_poll_remove_all(ctx, NULL);
+       io_kill_timeouts(ctx, NULL, NULL);
+       io_poll_remove_all(ctx, NULL, NULL);
 
        if (ctx->io_wq)
                io_wq_cancel_all(ctx->io_wq);
 
        /* if we failed setting up the ctx, we might not have any rings */
-       if (ctx->rings)
-               io_cqring_overflow_flush(ctx, true, NULL, NULL);
        io_iopoll_try_reap_events(ctx);
        idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
 
@@ -8407,120 +8690,31 @@ static int io_uring_release(struct inode *inode, struct file *file)
        return 0;
 }
 
-static bool io_wq_files_match(struct io_wq_work *work, void *data)
-{
-       struct files_struct *files = data;
-
-       return !files || ((work->flags & IO_WQ_WORK_FILES) &&
-                               work->identity->files == files);
-}
-
-/*
- * Returns true if 'preq' is the link parent of 'req'
- */
-static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
-{
-       struct io_kiocb *link;
-
-       if (!(preq->flags & REQ_F_LINK_HEAD))
-               return false;
-
-       list_for_each_entry(link, &preq->link_list, link_list) {
-               if (link == req)
-                       return true;
-       }
-
-       return false;
-}
-
-/*
- * We're looking to cancel 'req' because it's holding on to our files, but
- * 'req' could be a link to another request. See if it is, and cancel that
- * parent request if so.
- */
-static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
-{
-       struct hlist_node *tmp;
-       struct io_kiocb *preq;
-       bool found = false;
-       int i;
-
-       spin_lock_irq(&ctx->completion_lock);
-       for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
-               struct hlist_head *list;
-
-               list = &ctx->cancel_hash[i];
-               hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
-                       found = io_match_link(preq, req);
-                       if (found) {
-                               io_poll_remove_one(preq);
-                               break;
-                       }
-               }
-       }
-       spin_unlock_irq(&ctx->completion_lock);
-       return found;
-}
-
-static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
-                                  struct io_kiocb *req)
-{
-       struct io_kiocb *preq;
-       bool found = false;
-
-       spin_lock_irq(&ctx->completion_lock);
-       list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
-               found = io_match_link(preq, req);
-               if (found) {
-                       __io_timeout_cancel(preq);
-                       break;
-               }
-       }
-       spin_unlock_irq(&ctx->completion_lock);
-       return found;
-}
+struct io_task_cancel {
+       struct task_struct *task;
+       struct files_struct *files;
+};
 
-static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
+static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
 {
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+       struct io_task_cancel *cancel = data;
        bool ret;
 
-       if (req->flags & REQ_F_LINK_TIMEOUT) {
+       if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
                unsigned long flags;
                struct io_ring_ctx *ctx = req->ctx;
 
                /* protect against races with linked timeouts */
                spin_lock_irqsave(&ctx->completion_lock, flags);
-               ret = io_match_link(req, data);
+               ret = io_match_task(req, cancel->task, cancel->files);
                spin_unlock_irqrestore(&ctx->completion_lock, flags);
        } else {
-               ret = io_match_link(req, data);
+               ret = io_match_task(req, cancel->task, cancel->files);
        }
        return ret;
 }
 
-static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
-{
-       enum io_wq_cancel cret;
-
-       /* cancel this particular work, if it's running */
-       cret = io_wq_cancel_work(ctx->io_wq, &req->work);
-       if (cret != IO_WQ_CANCEL_NOTFOUND)
-               return;
-
-       /* find links that hold this pending, cancel those */
-       cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
-       if (cret != IO_WQ_CANCEL_NOTFOUND)
-               return;
-
-       /* if we have a poll link holding this pending, cancel that */
-       if (io_poll_remove_link(ctx, req))
-               return;
-
-       /* final option, timeout link is holding this req pending */
-       io_timeout_remove_link(ctx, req);
-}
-
 static void io_cancel_defer_files(struct io_ring_ctx *ctx,
                                  struct task_struct *task,
                                  struct files_struct *files)
@@ -8530,8 +8724,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
 
        spin_lock_irq(&ctx->completion_lock);
        list_for_each_entry_reverse(de, &ctx->defer_list, list) {
-               if (io_task_match(de->req, task) &&
-                   io_match_files(de->req, files)) {
+               if (io_match_task(de->req, task, files)) {
                        list_cut_position(&list, &ctx->defer_list, &de->list);
                        break;
                }
@@ -8548,72 +8741,52 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
        }
 }
 
-/*
- * Returns true if we found and killed one or more files pinning requests
- */
-static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
+static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+                                 struct task_struct *task,
                                  struct files_struct *files)
 {
-       if (list_empty_careful(&ctx->inflight_list))
-               return false;
-
-       /* cancel all at once, should be faster than doing it one by one*/
-       io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
-
        while (!list_empty_careful(&ctx->inflight_list)) {
-               struct io_kiocb *cancel_req = NULL, *req;
+               struct io_task_cancel cancel = { .task = task, .files = files };
+               struct io_kiocb *req;
                DEFINE_WAIT(wait);
+               bool found = false;
 
                spin_lock_irq(&ctx->inflight_lock);
                list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
-                       if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
+                       if (req->task != task ||
                            req->work.identity->files != files)
                                continue;
-                       /* req is being completed, ignore */
-                       if (!refcount_inc_not_zero(&req->refs))
-                               continue;
-                       cancel_req = req;
+                       found = true;
                        break;
                }
-               if (cancel_req)
-                       prepare_to_wait(&ctx->inflight_wait, &wait,
-                                               TASK_UNINTERRUPTIBLE);
+               if (found)
+                       prepare_to_wait(&task->io_uring->wait, &wait,
+                                       TASK_UNINTERRUPTIBLE);
                spin_unlock_irq(&ctx->inflight_lock);
 
                /* We need to keep going until we don't find a matching req */
-               if (!cancel_req)
+               if (!found)
                        break;
-               /* cancel this request, or head link requests */
-               io_attempt_cancel(ctx, cancel_req);
-               io_put_req(cancel_req);
+
+               io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
+               io_poll_remove_all(ctx, task, files);
+               io_kill_timeouts(ctx, task, files);
                /* cancellations _may_ trigger task work */
                io_run_task_work();
                schedule();
-               finish_wait(&ctx->inflight_wait, &wait);
+               finish_wait(&task->io_uring->wait, &wait);
        }
-
-       return true;
-}
-
-static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
-{
-       struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-       struct task_struct *task = data;
-
-       return io_task_match(req, task);
 }
 
-static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
-                                           struct task_struct *task,
-                                           struct files_struct *files)
+static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
+                                           struct task_struct *task)
 {
-       bool ret;
-
-       ret = io_uring_cancel_files(ctx, files);
-       if (!files) {
+       while (1) {
+               struct io_task_cancel cancel = { .task = task, .files = NULL, };
                enum io_wq_cancel cret;
+               bool ret = false;
 
-               cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
+               cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
                if (cret != IO_WQ_CANCEL_NOTFOUND)
                        ret = true;
 
@@ -8625,11 +8798,13 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
                        }
                }
 
-               ret |= io_poll_remove_all(ctx, task);
-               ret |= io_kill_timeouts(ctx, task);
+               ret |= io_poll_remove_all(ctx, task, NULL);
+               ret |= io_kill_timeouts(ctx, task, NULL);
+               if (!ret)
+                       break;
+               io_run_task_work();
+               cond_resched();
        }
-
-       return ret;
 }
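
/*
 * Task-wide cancellation above now retries whole passes until one pass finds
 * nothing left to cancel.  A standalone sketch of that fixpoint loop;
 * cancel_pass() is a hypothetical stand-in that reports whether a pass made
 * progress.
 */
#include <stdbool.h>
#include <stdio.h>

static int pending = 3;

static bool cancel_pass(void)
{
	if (!pending)
		return false;	/* nothing matched: the caller may stop */
	pending--;		/* cancelled one batch of matching requests */
	return true;
}

int main(void)
{
	while (cancel_pass())
		;		/* in the kernel: run task work, cond_resched() */
	printf("all requests cancelled\n");
	return 0;
}
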
 
 /*
@@ -8648,17 +8823,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
                io_sq_thread_park(ctx->sq_data);
        }
 
-       if (files)
-               io_cancel_defer_files(ctx, NULL, files);
-       else
-               io_cancel_defer_files(ctx, task, NULL);
-
+       io_cancel_defer_files(ctx, task, files);
+       io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
        io_cqring_overflow_flush(ctx, true, task, files);
+       io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
 
-       while (__io_uring_cancel_task_requests(ctx, task, files)) {
-               io_run_task_work();
-               cond_resched();
-       }
+       if (!files)
+               __io_uring_cancel_task_requests(ctx, task);
+       else
+               io_uring_cancel_files(ctx, task, files);
 
        if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
                atomic_dec(&task->io_uring->in_idle);
@@ -8916,9 +9089,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
        finish_wait(&ctx->sqo_sq_wait, &wait);
 }
 
+static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
+                         struct __kernel_timespec __user **ts,
+                         const sigset_t __user **sig)
+{
+       struct io_uring_getevents_arg arg;
+
+       /*
+        * If EXT_ARG isn't set, then we have no timespec and the argp pointer
+        * is just a pointer to the sigset_t.
+        */
+       if (!(flags & IORING_ENTER_EXT_ARG)) {
+               *sig = (const sigset_t __user *) argp;
+               *ts = NULL;
+               return 0;
+       }
+
+       /*
+        * EXT_ARG is set - ensure we agree on the size of it and copy in our
+        * timespec and sigset_t pointers if good.
+        */
+       if (*argsz != sizeof(arg))
+               return -EINVAL;
+       if (copy_from_user(&arg, argp, sizeof(arg)))
+               return -EFAULT;
+       *sig = u64_to_user_ptr(arg.sigmask);
+       *argsz = arg.sigmask_sz;
+       *ts = u64_to_user_ptr(arg.ts);
+       return 0;
+}
+
 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
-               u32, min_complete, u32, flags, const sigset_t __user *, sig,
-               size_t, sigsz)
+               u32, min_complete, u32, flags, const void __user *, argp,
+               size_t, argsz)
 {
        struct io_ring_ctx *ctx;
        long ret = -EBADF;
@@ -8928,7 +9131,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
        io_run_task_work();
 
        if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
-                       IORING_ENTER_SQ_WAIT))
+                       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
                return -EINVAL;
 
        f = fdget(fd);
@@ -8955,8 +9158,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
+               io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
                if (!list_empty_careful(&ctx->cq_overflow_list))
                        io_cqring_overflow_flush(ctx, false, NULL, NULL);
+               io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL));
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT)
@@ -8974,6 +9179,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                        goto out;
        }
        if (flags & IORING_ENTER_GETEVENTS) {
+               const sigset_t __user *sig;
+               struct __kernel_timespec __user *ts;
+
+               ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
+               if (unlikely(ret))
+                       goto out;
+
                min_complete = min(min_complete, ctx->cq_entries);
 
                /*
@@ -8986,7 +9198,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                    !(ctx->flags & IORING_SETUP_SQPOLL)) {
                        ret = io_iopoll_check(ctx, min_complete);
                } else {
-                       ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
+                       ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
                }
        }
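
/*
 * The userspace side of IORING_ENTER_EXT_ARG, sketched with raw syscalls.
 * This assumes kernel headers new enough to define IORING_ENTER_EXT_ARG,
 * IORING_FEAT_EXT_ARG and struct io_uring_getevents_arg, and a libc that
 * provides __NR_io_uring_setup/__NR_io_uring_enter; liburing exposes the
 * same mechanism through its wait helpers once the kernel advertises it.
 */
#include <errno.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p;
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_getevents_arg arg;
	int fd, ret;

	memset(&p, 0, sizeof(p));
	fd = syscall(__NR_io_uring_setup, 4, &p);
	if (fd < 0 || !(p.features & IORING_FEAT_EXT_ARG)) {
		fprintf(stderr, "IORING_FEAT_EXT_ARG not available\n");
		return 1;
	}

	/* No signal mask here; only the wait timeout is passed through. */
	memset(&arg, 0, sizeof(arg));
	arg.ts = (uint64_t)(uintptr_t)&ts;

	ret = syscall(__NR_io_uring_enter, fd, 0, 1,
		      IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		      &arg, sizeof(arg));
	/* Nothing was submitted, so this should fail with errno == ETIME. */
	printf("enter: ret=%d errno=%d\n", ret, ret < 0 ? errno : 0);

	close(fd);
	return 0;
}
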
 
@@ -9354,7 +9566,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
-                       IORING_FEAT_POLL_32BITS;
+                       IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
+                       IORING_FEAT_EXT_ARG;
 
        if (copy_to_user(params, p, sizeof(*p))) {
                ret = -EFAULT;