Merge tag 'pm-5.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
[linux-2.6-microblaze.git] / fs / io_uring.c
index 3affd96..358f97b 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/errno.h>
 #include <linux/syscalls.h>
 #include <linux/compat.h>
+#include <net/compat.h>
 #include <linux/refcount.h>
 #include <linux/uio.h>
 #include <linux/bits.h>
@@ -76,6 +77,8 @@
 #include <linux/fadvise.h>
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
+#include <linux/splice.h>
+#include <linux/task_work.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -193,6 +196,13 @@ struct fixed_file_data {
        struct completion               done;
 };
 
+struct io_buffer {
+       struct list_head list;
+       __u64 addr;
+       __s32 len;
+       __u16 bid;
+};
+
 struct io_ring_ctx {
        struct {
                struct percpu_ref       refs;
@@ -270,6 +280,8 @@ struct io_ring_ctx {
        struct socket           *ring_sock;
 #endif
 
+       struct idr              io_buffer_idr;
+
        struct idr              personality_idr;
 
        struct {
@@ -290,7 +302,6 @@ struct io_ring_ctx {
 
        struct {
                spinlock_t              completion_lock;
-               struct llist_head       poll_llist;
 
                /*
                 * ->poll_list is protected by the ctx->uring_lock for
@@ -386,7 +397,9 @@ struct io_sr_msg {
                void __user             *buf;
        };
        int                             msg_flags;
+       int                             bgid;
        size_t                          len;
+       struct io_buffer                *kbuf;
 };
 
 struct io_open {
@@ -430,6 +443,24 @@ struct io_epoll {
        struct epoll_event              event;
 };
 
+struct io_splice {
+       struct file                     *file_out;
+       struct file                     *file_in;
+       loff_t                          off_out;
+       loff_t                          off_in;
+       u64                             len;
+       unsigned int                    flags;
+};
+
+struct io_provide_buf {
+       struct file                     *file;
+       __u64                           addr;
+       __s32                           len;
+       __u32                           bgid;
+       __u16                           nbufs;
+       __u16                           bid;
+};
+
 struct io_async_connect {
        struct sockaddr_storage         address;
 };
@@ -464,6 +495,7 @@ enum {
        REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
        REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
        REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
+       REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 
        REQ_F_LINK_NEXT_BIT,
        REQ_F_FAIL_LINK_BIT,
@@ -479,6 +511,11 @@ enum {
        REQ_F_COMP_LOCKED_BIT,
        REQ_F_NEED_CLEANUP_BIT,
        REQ_F_OVERFLOW_BIT,
+       REQ_F_POLLED_BIT,
+       REQ_F_BUFFER_SELECTED_BIT,
+
+       /* not a real bit, just to check we're not overflowing the space */
+       __REQ_F_LAST_BIT,
 };
 
 enum {
@@ -492,6 +529,8 @@ enum {
        REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
        /* IOSQE_ASYNC */
        REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
+       /* IOSQE_BUFFER_SELECT */
+       REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 
        /* already grabbed next link */
        REQ_F_LINK_NEXT         = BIT(REQ_F_LINK_NEXT_BIT),
@@ -521,6 +560,15 @@ enum {
        REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
        /* in overflow list */
        REQ_F_OVERFLOW          = BIT(REQ_F_OVERFLOW_BIT),
+       /* already went through poll handler */
+       REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
+       /* buffer already selected */
+       REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
+};
+
+struct async_poll {
+       struct io_poll_iocb     poll;
+       struct io_wq_work       work;
 };
 
 /*
@@ -546,32 +594,45 @@ struct io_kiocb {
                struct io_fadvise       fadvise;
                struct io_madvise       madvise;
                struct io_epoll         epoll;
+               struct io_splice        splice;
+               struct io_provide_buf   pbuf;
        };
 
        struct io_async_ctx             *io;
-       /*
-        * llist_node is only used for poll deferred completions
-        */
-       struct llist_node               llist_node;
-       bool                            in_async;
        bool                            needs_fixed_file;
        u8                              opcode;
 
        struct io_ring_ctx      *ctx;
-       union {
-               struct list_head        list;
-               struct hlist_node       hash_node;
-       };
-       struct list_head        link_list;
+       struct list_head        list;
        unsigned int            flags;
        refcount_t              refs;
+       union {
+               struct task_struct      *task;
+               unsigned long           fsize;
+       };
        u64                     user_data;
        u32                     result;
        u32                     sequence;
 
+       struct list_head        link_list;
+
        struct list_head        inflight_entry;
 
-       struct io_wq_work       work;
+       union {
+               /*
+                * Only commands that never go async can use the below fields,
+                * obviously. Right now only IORING_OP_POLL_ADD uses them, and
+                * async armed poll handlers for regular commands. The latter
+                * restore the work, if needed.
+                */
+               struct {
+                       struct callback_head    task_work;
+                       struct hlist_node       hash_node;
+                       struct async_poll       *apoll;
+                       int                     cflags;
+               };
+               struct io_wq_work       work;
+       };
 };
 
 #define IO_PLUG_THRESHOLD              2
@@ -615,6 +676,11 @@ struct io_op_def {
        unsigned                file_table : 1;
        /* needs ->fs */
        unsigned                needs_fs : 1;
+       /* set if opcode supports polled "wait" */
+       unsigned                pollin : 1;
+       unsigned                pollout : 1;
+       /* op supports buffer selection */
+       unsigned                buffer_select : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
@@ -624,6 +690,8 @@ static const struct io_op_def io_op_defs[] = {
                .needs_mm               = 1,
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
+               .pollin                 = 1,
+               .buffer_select          = 1,
        },
        [IORING_OP_WRITEV] = {
                .async_ctx              = 1,
@@ -631,6 +699,7 @@ static const struct io_op_def io_op_defs[] = {
                .needs_file             = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
+               .pollout                = 1,
        },
        [IORING_OP_FSYNC] = {
                .needs_file             = 1,
@@ -638,11 +707,13 @@ static const struct io_op_def io_op_defs[] = {
        [IORING_OP_READ_FIXED] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
+               .pollin                 = 1,
        },
        [IORING_OP_WRITE_FIXED] = {
                .needs_file             = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
+               .pollout                = 1,
        },
        [IORING_OP_POLL_ADD] = {
                .needs_file             = 1,
@@ -658,6 +729,7 @@ static const struct io_op_def io_op_defs[] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .needs_fs               = 1,
+               .pollout                = 1,
        },
        [IORING_OP_RECVMSG] = {
                .async_ctx              = 1,
@@ -665,6 +737,8 @@ static const struct io_op_def io_op_defs[] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .needs_fs               = 1,
+               .pollin                 = 1,
+               .buffer_select          = 1,
        },
        [IORING_OP_TIMEOUT] = {
                .async_ctx              = 1,
@@ -676,6 +750,7 @@ static const struct io_op_def io_op_defs[] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .file_table             = 1,
+               .pollin                 = 1,
        },
        [IORING_OP_ASYNC_CANCEL] = {},
        [IORING_OP_LINK_TIMEOUT] = {
@@ -687,6 +762,7 @@ static const struct io_op_def io_op_defs[] = {
                .needs_mm               = 1,
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
+               .pollout                = 1,
        },
        [IORING_OP_FALLOCATE] = {
                .needs_file             = 1,
@@ -715,11 +791,14 @@ static const struct io_op_def io_op_defs[] = {
                .needs_mm               = 1,
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
+               .pollin                 = 1,
+               .buffer_select          = 1,
        },
        [IORING_OP_WRITE] = {
                .needs_mm               = 1,
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
+               .pollout                = 1,
        },
        [IORING_OP_FADVISE] = {
                .needs_file             = 1,
@@ -731,11 +810,14 @@ static const struct io_op_def io_op_defs[] = {
                .needs_mm               = 1,
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
+               .pollout                = 1,
        },
        [IORING_OP_RECV] = {
                .needs_mm               = 1,
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
+               .pollin                 = 1,
+               .buffer_select          = 1,
        },
        [IORING_OP_OPENAT2] = {
                .needs_file             = 1,
@@ -747,6 +829,13 @@ static const struct io_op_def io_op_defs[] = {
                .unbound_nonreg_file    = 1,
                .file_table             = 1,
        },
+       [IORING_OP_SPLICE] = {
+               .needs_file             = 1,
+               .hash_reg_file          = 1,
+               .unbound_nonreg_file    = 1,
+       },
+       [IORING_OP_PROVIDE_BUFFERS] = {},
+       [IORING_OP_REMOVE_BUFFERS] = {},
 };
 
 static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -761,6 +850,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 static int io_grab_files(struct io_kiocb *req);
 static void io_ring_file_ref_flush(struct fixed_file_data *data);
 static void io_cleanup_req(struct io_kiocb *req);
+static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
+                      int fd, struct file **out_file, bool fixed);
+static void __io_queue_sqe(struct io_kiocb *req,
+                          const struct io_uring_sqe *sqe);
 
 static struct kmem_cache *req_cachep;
 
@@ -827,11 +920,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
        INIT_LIST_HEAD(&ctx->cq_overflow_list);
        init_completion(&ctx->completions[0]);
        init_completion(&ctx->completions[1]);
+       idr_init(&ctx->io_buffer_idr);
        idr_init(&ctx->personality_idr);
        mutex_init(&ctx->uring_lock);
        init_waitqueue_head(&ctx->wait);
        spin_lock_init(&ctx->completion_lock);
-       init_llist_head(&ctx->poll_llist);
        INIT_LIST_HEAD(&ctx->poll_list);
        INIT_LIST_HEAD(&ctx->defer_list);
        INIT_LIST_HEAD(&ctx->timeout_list);
@@ -952,15 +1045,14 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
        }
 }
 
-static inline bool io_prep_async_work(struct io_kiocb *req,
+static inline void io_prep_async_work(struct io_kiocb *req,
                                      struct io_kiocb **link)
 {
        const struct io_op_def *def = &io_op_defs[req->opcode];
-       bool do_hashed = false;
 
        if (req->flags & REQ_F_ISREG) {
                if (def->hash_reg_file)
-                       do_hashed = true;
+                       io_wq_hash_work(&req->work, file_inode(req->file));
        } else {
                if (def->unbound_nonreg_file)
                        req->work.flags |= IO_WQ_WORK_UNBOUND;
@@ -969,25 +1061,18 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
        io_req_work_grab_env(req, def);
 
        *link = io_prep_linked_timeout(req);
-       return do_hashed;
 }
 
 static inline void io_queue_async_work(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link;
-       bool do_hashed;
 
-       do_hashed = io_prep_async_work(req, &link);
+       io_prep_async_work(req, &link);
 
-       trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
-                                       req->flags);
-       if (!do_hashed) {
-               io_wq_enqueue(ctx->io_wq, &req->work);
-       } else {
-               io_wq_enqueue_hashed(ctx->io_wq, &req->work,
-                                       file_inode(req->file));
-       }
+       trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
+                                       &req->work, req->flags);
+       io_wq_enqueue(ctx->io_wq, &req->work);
 
        if (link)
                io_queue_linked_timeout(link);
@@ -1054,24 +1139,19 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
                return false;
        if (!ctx->eventfd_async)
                return true;
-       return io_wq_current_is_worker() || in_interrupt();
+       return io_wq_current_is_worker();
 }
 
-static void __io_cqring_ev_posted(struct io_ring_ctx *ctx, bool trigger_ev)
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);
        if (waitqueue_active(&ctx->sqo_wait))
                wake_up(&ctx->sqo_wait);
-       if (trigger_ev)
+       if (io_should_trigger_evfd(ctx))
                eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
-       __io_cqring_ev_posted(ctx, io_should_trigger_evfd(ctx));
-}
-
 /* Returns true if there are no backlogged entries after the flush */
 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
@@ -1108,7 +1188,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
                if (cqe) {
                        WRITE_ONCE(cqe->user_data, req->user_data);
                        WRITE_ONCE(cqe->res, req->result);
-                       WRITE_ONCE(cqe->flags, 0);
+                       WRITE_ONCE(cqe->flags, req->cflags);
                } else {
                        WRITE_ONCE(ctx->rings->cq_overflow,
                                atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1132,7 +1212,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
        return cqe != NULL;
 }
 
-static void io_cqring_fill_event(struct io_kiocb *req, long res)
+static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 {
        struct io_ring_ctx *ctx = req->ctx;
        struct io_uring_cqe *cqe;
@@ -1148,7 +1228,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
        if (likely(cqe)) {
                WRITE_ONCE(cqe->user_data, req->user_data);
                WRITE_ONCE(cqe->res, res);
-               WRITE_ONCE(cqe->flags, 0);
+               WRITE_ONCE(cqe->flags, cflags);
        } else if (ctx->cq_overflow_flushed) {
                WRITE_ONCE(ctx->rings->cq_overflow,
                                atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1160,23 +1240,34 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
                req->flags |= REQ_F_OVERFLOW;
                refcount_inc(&req->refs);
                req->result = res;
+               req->cflags = cflags;
                list_add_tail(&req->list, &ctx->cq_overflow_list);
        }
 }
 
-static void io_cqring_add_event(struct io_kiocb *req, long res)
+static void io_cqring_fill_event(struct io_kiocb *req, long res)
+{
+       __io_cqring_fill_event(req, res, 0);
+}
+
+static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
 {
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;
 
        spin_lock_irqsave(&ctx->completion_lock, flags);
-       io_cqring_fill_event(req, res);
+       __io_cqring_fill_event(req, res, cflags);
        io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
        io_cqring_ev_posted(ctx);
 }
 
+static void io_cqring_add_event(struct io_kiocb *req, long res)
+{
+       __io_cqring_add_event(req, res, 0);
+}
+
 static inline bool io_is_fallback_req(struct io_kiocb *req)
 {
        return req == (struct io_kiocb *)
@@ -1246,6 +1337,15 @@ fallback:
        return NULL;
 }
 
+static inline void io_put_file(struct io_kiocb *req, struct file *file,
+                         bool fixed)
+{
+       if (fixed)
+               percpu_ref_put(&req->ctx->file_data->refs);
+       else
+               fput(file);
+}
+
 static void __io_req_do_free(struct io_kiocb *req)
 {
        if (likely(!io_is_fallback_req(req)))
@@ -1256,18 +1356,12 @@ static void __io_req_do_free(struct io_kiocb *req)
 
 static void __io_req_aux_free(struct io_kiocb *req)
 {
-       struct io_ring_ctx *ctx = req->ctx;
-
        if (req->flags & REQ_F_NEED_CLEANUP)
                io_cleanup_req(req);
 
        kfree(req->io);
-       if (req->file) {
-               if (req->flags & REQ_F_FIXED_FILE)
-                       percpu_ref_put(&ctx->file_data->refs);
-               else
-                       fput(req->file);
-       }
+       if (req->file)
+               io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 
        io_req_work_drop_env(req);
 }
@@ -1474,6 +1568,30 @@ static void io_free_req(struct io_kiocb *req)
                io_queue_async_work(nxt);
 }
 
+static void io_link_work_cb(struct io_wq_work **workptr)
+{
+       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+       struct io_kiocb *link;
+
+       link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+       io_queue_linked_timeout(link);
+       io_wq_submit_work(workptr);
+}
+
+static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
+{
+       struct io_kiocb *link;
+       const struct io_op_def *def = &io_op_defs[nxt->opcode];
+
+       if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
+               io_wq_hash_work(&nxt->work, file_inode(nxt->file));
+
+       *workptr = &nxt->work;
+       link = io_prep_linked_timeout(nxt);
+       if (link)
+               nxt->work.func = io_link_work_cb;
+}
+
 /*
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.
@@ -1493,6 +1611,26 @@ static void io_put_req(struct io_kiocb *req)
                io_free_req(req);
 }
 
+static void io_steal_work(struct io_kiocb *req,
+                         struct io_wq_work **workptr)
+{
+       /*
+        * It's in an io-wq worker, so there always should be at least
+        * one reference, which will be dropped in io_put_work() just
+        * after the current handler returns.
+        *
+        * It also means, that if the counter dropped to 1, then there is
+        * no asynchronous users left, so it's safe to steal the next work.
+        */
+       if (refcount_read(&req->refs) == 1) {
+               struct io_kiocb *nxt = NULL;
+
+               io_req_find_next(req, &nxt);
+               if (nxt)
+                       io_wq_assign_next(workptr, nxt);
+       }
+}
+
 /*
  * Must only be used if we don't need to care about links, usually from
  * within the completion handling itself.
@@ -1554,6 +1692,19 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
        return true;
 }
 
+static int io_put_kbuf(struct io_kiocb *req)
+{
+       struct io_buffer *kbuf;
+       int cflags;
+
+       kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+       cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
+       cflags |= IORING_CQE_F_BUFFER;
+       req->rw.addr = 0;
+       kfree(kbuf);
+       return cflags;
+}
+
 /*
  * Find and free completed poll iocbs
  */
@@ -1565,10 +1716,15 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 
        rb.to_free = rb.need_iter = 0;
        while (!list_empty(done)) {
+               int cflags = 0;
+
                req = list_first_entry(done, struct io_kiocb, list);
                list_del(&req->list);
 
-               io_cqring_fill_event(req, req->result);
+               if (req->flags & REQ_F_BUFFER_SELECTED)
+                       cflags = io_put_kbuf(req);
+
+               __io_cqring_fill_event(req, req->result, cflags);
                (*nr_events)++;
 
                if (refcount_dec_and_test(&req->refs) &&
@@ -1577,6 +1733,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
        }
 
        io_commit_cqring(ctx);
+       if (ctx->flags & IORING_SETUP_SQPOLL)
+               io_cqring_ev_posted(ctx);
        io_free_req_many(ctx, &rb);
 }
 
@@ -1743,13 +1901,16 @@ static inline void req_set_fail_links(struct io_kiocb *req)
 static void io_complete_rw_common(struct kiocb *kiocb, long res)
 {
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+       int cflags = 0;
 
        if (kiocb->ki_flags & IOCB_WRITE)
                kiocb_end_write(req);
 
        if (res != req->result)
                req_set_fail_links(req);
-       io_cqring_add_event(req, res);
+       if (req->flags & REQ_F_BUFFER_SELECTED)
+               cflags = io_put_kbuf(req);
+       __io_cqring_add_event(req, res, cflags);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -1760,17 +1921,6 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
        io_put_req(req);
 }
 
-static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
-{
-       struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
-       struct io_kiocb *nxt = NULL;
-
-       io_complete_rw_common(kiocb, res);
-       io_put_req_find_next(req, &nxt);
-
-       return nxt;
-}
-
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 {
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
@@ -1841,7 +1991,7 @@ static void io_file_put(struct io_submit_state *state)
  * assuming most submissions are for one file, or at least that each file
  * has more than one submission.
  */
-static struct file *io_file_get(struct io_submit_state *state, int fd)
+static struct file *__io_file_get(struct io_submit_state *state, int fd)
 {
        if (!state)
                return fget(fd);
@@ -1938,7 +2088,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
        req->rw.addr = READ_ONCE(sqe->addr);
        req->rw.len = READ_ONCE(sqe->len);
-       /* we own ->private, reuse it for the buffer index */
+       /* we own ->private, reuse it for the buffer index  / buffer ID */
        req->rw.kiocb.private = (void *) (unsigned long)
                                        READ_ONCE(sqe->buf_index);
        return 0;
@@ -1965,15 +2115,14 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
        }
 }
 
-static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
-                      bool in_async)
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
 {
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
        if (req->flags & REQ_F_CUR_POS)
                req->file->f_pos = kiocb->ki_pos;
-       if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
-               *nxt = __io_complete_rw(kiocb, ret);
+       if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
+               io_complete_rw(kiocb, ret, 0);
        else
                io_rw_done(kiocb, ret);
 }
@@ -2052,11 +2201,147 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
        return len;
 }
 
+static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
+{
+       if (needs_lock)
+               mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
+{
+       /*
+        * "Normal" inline submissions always hold the uring_lock, since we
+        * grab it from the system call. Same is true for the SQPOLL offload.
+        * The only exception is when we've detached the request and issue it
+        * from an async worker thread, grab the lock for that case.
+        */
+       if (needs_lock)
+               mutex_lock(&ctx->uring_lock);
+}
+
+static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
+                                         int bgid, struct io_buffer *kbuf,
+                                         bool needs_lock)
+{
+       struct io_buffer *head;
+
+       if (req->flags & REQ_F_BUFFER_SELECTED)
+               return kbuf;
+
+       io_ring_submit_lock(req->ctx, needs_lock);
+
+       lockdep_assert_held(&req->ctx->uring_lock);
+
+       head = idr_find(&req->ctx->io_buffer_idr, bgid);
+       if (head) {
+               if (!list_empty(&head->list)) {
+                       kbuf = list_last_entry(&head->list, struct io_buffer,
+                                                       list);
+                       list_del(&kbuf->list);
+               } else {
+                       kbuf = head;
+                       idr_remove(&req->ctx->io_buffer_idr, bgid);
+               }
+               if (*len > kbuf->len)
+                       *len = kbuf->len;
+       } else {
+               kbuf = ERR_PTR(-ENOBUFS);
+       }
+
+       io_ring_submit_unlock(req->ctx, needs_lock);
+
+       return kbuf;
+}
+
+static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
+                                       bool needs_lock)
+{
+       struct io_buffer *kbuf;
+       int bgid;
+
+       kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+       bgid = (int) (unsigned long) req->rw.kiocb.private;
+       kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
+       if (IS_ERR(kbuf))
+               return kbuf;
+       req->rw.addr = (u64) (unsigned long) kbuf;
+       req->flags |= REQ_F_BUFFER_SELECTED;
+       return u64_to_user_ptr(kbuf->addr);
+}
+
+#ifdef CONFIG_COMPAT
+static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
+                               bool needs_lock)
+{
+       struct compat_iovec __user *uiov;
+       compat_ssize_t clen;
+       void __user *buf;
+       ssize_t len;
+
+       uiov = u64_to_user_ptr(req->rw.addr);
+       if (!access_ok(uiov, sizeof(*uiov)))
+               return -EFAULT;
+       if (__get_user(clen, &uiov->iov_len))
+               return -EFAULT;
+       if (clen < 0)
+               return -EINVAL;
+
+       len = clen;
+       buf = io_rw_buffer_select(req, &len, needs_lock);
+       if (IS_ERR(buf))
+               return PTR_ERR(buf);
+       iov[0].iov_base = buf;
+       iov[0].iov_len = (compat_size_t) len;
+       return 0;
+}
+#endif
+
+static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
+                                     bool needs_lock)
+{
+       struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
+       void __user *buf;
+       ssize_t len;
+
+       if (copy_from_user(iov, uiov, sizeof(*uiov)))
+               return -EFAULT;
+
+       len = iov[0].iov_len;
+       if (len < 0)
+               return -EINVAL;
+       buf = io_rw_buffer_select(req, &len, needs_lock);
+       if (IS_ERR(buf))
+               return PTR_ERR(buf);
+       iov[0].iov_base = buf;
+       iov[0].iov_len = len;
+       return 0;
+}
+
+static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
+                                   bool needs_lock)
+{
+       if (req->flags & REQ_F_BUFFER_SELECTED)
+               return 0;
+       if (!req->rw.len)
+               return 0;
+       else if (req->rw.len > 1)
+               return -EINVAL;
+
+#ifdef CONFIG_COMPAT
+       if (req->ctx->compat)
+               return io_compat_import(req, iov, needs_lock);
+#endif
+
+       return __io_iov_buffer_select(req, iov, needs_lock);
+}
+
 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-                              struct iovec **iovec, struct iov_iter *iter)
+                              struct iovec **iovec, struct iov_iter *iter,
+                              bool needs_lock)
 {
        void __user *buf = u64_to_user_ptr(req->rw.addr);
        size_t sqe_len = req->rw.len;
+       ssize_t ret;
        u8 opcode;
 
        opcode = req->opcode;
@@ -2065,12 +2350,20 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
                return io_import_fixed(req, rw, iter);
        }
 
-       /* buffer index only valid with fixed read/write */
-       if (req->rw.kiocb.private)
+       /* buffer index only valid with fixed read/write, or buffer select  */
+       if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT))
                return -EINVAL;
 
        if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
-               ssize_t ret;
+               if (req->flags & REQ_F_BUFFER_SELECT) {
+                       buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
+                       if (IS_ERR(buf)) {
+                               *iovec = NULL;
+                               return PTR_ERR(buf);
+                       }
+                       req->rw.len = sqe_len;
+               }
+
                ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
                *iovec = NULL;
                return ret < 0 ? ret : sqe_len;
@@ -2086,6 +2379,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
                return iorw->size;
        }
 
+       if (req->flags & REQ_F_BUFFER_SELECT) {
+               ret = io_iov_buffer_select(req, *iovec, needs_lock);
+               if (!ret) {
+                       ret = (*iovec)->iov_len;
+                       iov_iter_init(iter, rw, *iovec, 1, ret);
+               }
+               *iovec = NULL;
+               return ret;
+       }
+
 #ifdef CONFIG_COMPAT
        if (req->ctx->compat)
                return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
@@ -2169,12 +2472,18 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
        }
 }
 
+static inline int __io_alloc_async_ctx(struct io_kiocb *req)
+{
+       req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
+       return req->io == NULL;
+}
+
 static int io_alloc_async_ctx(struct io_kiocb *req)
 {
        if (!io_op_defs[req->opcode].async_ctx)
                return 0;
-       req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
-       return req->io == NULL;
+
+       return  __io_alloc_async_ctx(req);
 }
 
 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
@@ -2184,7 +2493,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
        if (!io_op_defs[req->opcode].async_ctx)
                return 0;
        if (!req->io) {
-               if (io_alloc_async_ctx(req))
+               if (__io_alloc_async_ctx(req))
                        return -ENOMEM;
 
                io_req_map_rw(req, io_size, iovec, fast_iov, iter);
@@ -2213,7 +2522,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        io = req->io;
        io->rw.iov = io->rw.fast_iov;
        req->io = NULL;
-       ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
+       ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
        req->io = io;
        if (ret < 0)
                return ret;
@@ -2222,8 +2531,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        return 0;
 }
 
-static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
-                  bool force_nonblock)
+static int io_read(struct io_kiocb *req, bool force_nonblock)
 {
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
@@ -2231,13 +2539,13 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
        size_t iov_count;
        ssize_t io_size, ret;
 
-       ret = io_import_iovec(READ, req, &iovec, &iter);
+       ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
        if (ret < 0)
                return ret;
 
        /* Ensure we clear previously set non-block flag */
        if (!force_nonblock)
-               req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
+               kiocb->ki_flags &= ~IOCB_NOWAIT;
 
        req->result = 0;
        io_size = ret;
@@ -2248,10 +2556,8 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
         * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
         * we know to async punt it even if it was opened O_NONBLOCK
         */
-       if (force_nonblock && !io_file_supports_async(req->file)) {
-               req->flags |= REQ_F_MUST_PUNT;
+       if (force_nonblock && !io_file_supports_async(req->file))
                goto copy_iov;
-       }
 
        iov_count = iov_iter_count(&iter);
        ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
@@ -2265,13 +2571,16 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
 
                /* Catch -EAGAIN return for forced non-blocking submission */
                if (!force_nonblock || ret2 != -EAGAIN) {
-                       kiocb_done(kiocb, ret2, nxt, req->in_async);
+                       kiocb_done(kiocb, ret2);
                } else {
 copy_iov:
                        ret = io_setup_async_rw(req, io_size, iovec,
                                                inline_vecs, &iter);
                        if (ret)
                                goto out_free;
+                       /* any defer here is final, must blocking retry */
+                       if (!(req->flags & REQ_F_NOWAIT))
+                               req->flags |= REQ_F_MUST_PUNT;
                        return -EAGAIN;
                }
        }
@@ -2295,6 +2604,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
                return -EBADF;
 
+       req->fsize = rlimit(RLIMIT_FSIZE);
+
        /* either don't need iovec imported or already have it */
        if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
                return 0;
@@ -2302,7 +2613,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        io = req->io;
        io->rw.iov = io->rw.fast_iov;
        req->io = NULL;
-       ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
+       ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
        req->io = io;
        if (ret < 0)
                return ret;
@@ -2311,8 +2622,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        return 0;
 }
 
-static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
-                   bool force_nonblock)
+static int io_write(struct io_kiocb *req, bool force_nonblock)
 {
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
@@ -2320,7 +2630,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
        size_t iov_count;
        ssize_t ret, io_size;
 
-       ret = io_import_iovec(WRITE, req, &iovec, &iter);
+       ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
        if (ret < 0)
                return ret;
 
@@ -2337,10 +2647,8 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
         * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
         * we know to async punt it even if it was opened O_NONBLOCK
         */
-       if (force_nonblock && !io_file_supports_async(req->file)) {
-               req->flags |= REQ_F_MUST_PUNT;
+       if (force_nonblock && !io_file_supports_async(req->file))
                goto copy_iov;
-       }
 
        /* file path doesn't support NOWAIT for non-direct_IO */
        if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
@@ -2367,24 +2675,33 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
                }
                kiocb->ki_flags |= IOCB_WRITE;
 
+               if (!force_nonblock)
+                       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+
                if (req->file->f_op->write_iter)
                        ret2 = call_write_iter(req->file, kiocb, &iter);
                else
                        ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+
+               if (!force_nonblock)
+                       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+
                /*
-                * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just
+                * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
                 * retry them without IOCB_NOWAIT.
                 */
                if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
                        ret2 = -EAGAIN;
                if (!force_nonblock || ret2 != -EAGAIN) {
-                       kiocb_done(kiocb, ret2, nxt, req->in_async);
+                       kiocb_done(kiocb, ret2);
                } else {
 copy_iov:
                        ret = io_setup_async_rw(req, io_size, iovec,
                                                inline_vecs, &iter);
                        if (ret)
                                goto out_free;
+                       /* any defer here is final, must blocking retry */
+                       req->flags |= REQ_F_MUST_PUNT;
                        return -EAGAIN;
                }
        }
@@ -2394,6 +2711,76 @@ out_free:
        return ret;
 }
 
+static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+       struct io_splice* sp = &req->splice;
+       unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
+       int ret;
+
+       if (req->flags & REQ_F_NEED_CLEANUP)
+               return 0;
+
+       sp->file_in = NULL;
+       sp->off_in = READ_ONCE(sqe->splice_off_in);
+       sp->off_out = READ_ONCE(sqe->off);
+       sp->len = READ_ONCE(sqe->len);
+       sp->flags = READ_ONCE(sqe->splice_flags);
+
+       if (unlikely(sp->flags & ~valid_flags))
+               return -EINVAL;
+
+       ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
+                         (sp->flags & SPLICE_F_FD_IN_FIXED));
+       if (ret)
+               return ret;
+       req->flags |= REQ_F_NEED_CLEANUP;
+
+       if (!S_ISREG(file_inode(sp->file_in)->i_mode))
+               req->work.flags |= IO_WQ_WORK_UNBOUND;
+
+       return 0;
+}
+
+static bool io_splice_punt(struct file *file)
+{
+       if (get_pipe_info(file))
+               return false;
+       if (!io_file_supports_async(file))
+               return true;
+       return !(file->f_mode & O_NONBLOCK);
+}
+
+static int io_splice(struct io_kiocb *req, bool force_nonblock)
+{
+       struct io_splice *sp = &req->splice;
+       struct file *in = sp->file_in;
+       struct file *out = sp->file_out;
+       unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
+       loff_t *poff_in, *poff_out;
+       long ret;
+
+       if (force_nonblock) {
+               if (io_splice_punt(in) || io_splice_punt(out))
+                       return -EAGAIN;
+               flags |= SPLICE_F_NONBLOCK;
+       }
+
+       poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
+       poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
+       ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
+       if (force_nonblock && ret == -EAGAIN)
+               return -EAGAIN;
+
+       io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+       req->flags &= ~REQ_F_NEED_CLEANUP;
+
+       io_cqring_add_event(req, ret);
+       if (ret != sp->len)
+               req_set_fail_links(req);
+       io_put_req(req);
+       return 0;
+}
+
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
@@ -2442,85 +2829,63 @@ static bool io_req_cancelled(struct io_kiocb *req)
        return false;
 }
 
-static void io_link_work_cb(struct io_wq_work **workptr)
-{
-       struct io_wq_work *work = *workptr;
-       struct io_kiocb *link = work->data;
-
-       io_queue_linked_timeout(link);
-       work->func = io_wq_submit_work;
-}
-
-static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
-{
-       struct io_kiocb *link;
-
-       io_prep_async_work(nxt, &link);
-       *workptr = &nxt->work;
-       if (link) {
-               nxt->work.flags |= IO_WQ_WORK_CB;
-               nxt->work.func = io_link_work_cb;
-               nxt->work.data = link;
-       }
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
+static void __io_fsync(struct io_kiocb *req)
 {
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
        loff_t end = req->sync.off + req->sync.len;
-       struct io_kiocb *nxt = NULL;
        int ret;
 
-       if (io_req_cancelled(req))
-               return;
-
        ret = vfs_fsync_range(req->file, req->sync.off,
                                end > 0 ? end : LLONG_MAX,
                                req->sync.flags & IORING_FSYNC_DATASYNC);
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, &nxt);
-       if (nxt)
-               io_wq_assign_next(workptr, nxt);
+       io_put_req(req);
 }
 
-static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
-                   bool force_nonblock)
+static void io_fsync_finish(struct io_wq_work **workptr)
 {
-       struct io_wq_work *work, *old_work;
+       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+
+       if (io_req_cancelled(req))
+               return;
+       __io_fsync(req);
+       io_steal_work(req, workptr);
+}
 
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
+{
        /* fsync always requires a blocking context */
        if (force_nonblock) {
-               io_put_req(req);
                req->work.func = io_fsync_finish;
                return -EAGAIN;
        }
-
-       work = old_work = &req->work;
-       io_fsync_finish(&work);
-       if (work && work != old_work)
-               *nxt = container_of(work, struct io_kiocb, work);
+       __io_fsync(req);
        return 0;
 }
 
-static void io_fallocate_finish(struct io_wq_work **workptr)
+static void __io_fallocate(struct io_kiocb *req)
 {
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-       struct io_kiocb *nxt = NULL;
        int ret;
 
-       if (io_req_cancelled(req))
-               return;
-
+       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
        ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
                                req->sync.len);
+       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, &nxt);
-       if (nxt)
-               io_wq_assign_next(workptr, nxt);
+       io_put_req(req);
+}
+
+static void io_fallocate_finish(struct io_wq_work **workptr)
+{
+       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+
+       if (io_req_cancelled(req))
+               return;
+       __io_fallocate(req);
+       io_steal_work(req, workptr);
 }
 
 static int io_fallocate_prep(struct io_kiocb *req,
@@ -2532,26 +2897,19 @@ static int io_fallocate_prep(struct io_kiocb *req,
        req->sync.off = READ_ONCE(sqe->off);
        req->sync.len = READ_ONCE(sqe->addr);
        req->sync.mode = READ_ONCE(sqe->len);
+       req->fsize = rlimit(RLIMIT_FSIZE);
        return 0;
 }
 
-static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
-                       bool force_nonblock)
+static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
 {
-       struct io_wq_work *work, *old_work;
-
        /* fallocate always requiring blocking context */
        if (force_nonblock) {
-               io_put_req(req);
                req->work.func = io_fallocate_finish;
                return -EAGAIN;
        }
 
-       work = old_work = &req->work;
-       io_fallocate_finish(&work);
-       if (work && work != old_work)
-               *nxt = container_of(work, struct io_kiocb, work);
-
+       __io_fallocate(req);
        return 0;
 }
 
@@ -2626,8 +2984,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        return 0;
 }
 
-static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
-                     bool force_nonblock)
+static int io_openat2(struct io_kiocb *req, bool force_nonblock)
 {
        struct open_flags op;
        struct file *file;
@@ -2658,15 +3015,171 @@ err:
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
 }
 
-static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
-                    bool force_nonblock)
+static int io_openat(struct io_kiocb *req, bool force_nonblock)
 {
        req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
-       return io_openat2(req, nxt, force_nonblock);
+       return io_openat2(req, force_nonblock);
+}
+
+static int io_remove_buffers_prep(struct io_kiocb *req,
+                                 const struct io_uring_sqe *sqe)
+{
+       struct io_provide_buf *p = &req->pbuf;
+       u64 tmp;
+
+       if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
+               return -EINVAL;
+
+       tmp = READ_ONCE(sqe->fd);
+       if (!tmp || tmp > USHRT_MAX)
+               return -EINVAL;
+
+       memset(p, 0, sizeof(*p));
+       p->nbufs = tmp;
+       p->bgid = READ_ONCE(sqe->buf_group);
+       return 0;
+}
+
+static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
+                              int bgid, unsigned nbufs)
+{
+       unsigned i = 0;
+
+       /* shouldn't happen */
+       if (!nbufs)
+               return 0;
+
+       /* the head kbuf is the list itself */
+       while (!list_empty(&buf->list)) {
+               struct io_buffer *nxt;
+
+               nxt = list_first_entry(&buf->list, struct io_buffer, list);
+               list_del(&nxt->list);
+               kfree(nxt);
+               if (++i == nbufs)
+                       return i;
+       }
+       i++;
+       kfree(buf);
+       idr_remove(&ctx->io_buffer_idr, bgid);
+
+       return i;
+}
+
+static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
+{
+       struct io_provide_buf *p = &req->pbuf;
+       struct io_ring_ctx *ctx = req->ctx;
+       struct io_buffer *head;
+       int ret = 0;
+
+       io_ring_submit_lock(ctx, !force_nonblock);
+
+       lockdep_assert_held(&ctx->uring_lock);
+
+       ret = -ENOENT;
+       head = idr_find(&ctx->io_buffer_idr, p->bgid);
+       if (head)
+               ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
+
+       io_ring_submit_lock(ctx, !force_nonblock);
+       if (ret < 0)
+               req_set_fail_links(req);
+       io_cqring_add_event(req, ret);
+       io_put_req(req);
+       return 0;
+}
+
+static int io_provide_buffers_prep(struct io_kiocb *req,
+                                  const struct io_uring_sqe *sqe)
+{
+       struct io_provide_buf *p = &req->pbuf;
+       u64 tmp;
+
+       if (sqe->ioprio || sqe->rw_flags)
+               return -EINVAL;
+
+       tmp = READ_ONCE(sqe->fd);
+       if (!tmp || tmp > USHRT_MAX)
+               return -E2BIG;
+       p->nbufs = tmp;
+       p->addr = READ_ONCE(sqe->addr);
+       p->len = READ_ONCE(sqe->len);
+
+       if (!access_ok(u64_to_user_ptr(p->addr), p->len))
+               return -EFAULT;
+
+       p->bgid = READ_ONCE(sqe->buf_group);
+       tmp = READ_ONCE(sqe->off);
+       if (tmp > USHRT_MAX)
+               return -E2BIG;
+       p->bid = tmp;
+       return 0;
+}
+
+static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
+{
+       struct io_buffer *buf;
+       u64 addr = pbuf->addr;
+       int i, bid = pbuf->bid;
+
+       for (i = 0; i < pbuf->nbufs; i++) {
+               buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+               if (!buf)
+                       break;
+
+               buf->addr = addr;
+               buf->len = pbuf->len;
+               buf->bid = bid;
+               addr += pbuf->len;
+               bid++;
+               if (!*head) {
+                       INIT_LIST_HEAD(&buf->list);
+                       *head = buf;
+               } else {
+                       list_add_tail(&buf->list, &(*head)->list);
+               }
+       }
+
+       return i ? i : -ENOMEM;
+}
+
+static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
+{
+       struct io_provide_buf *p = &req->pbuf;
+       struct io_ring_ctx *ctx = req->ctx;
+       struct io_buffer *head, *list;
+       int ret = 0;
+
+       io_ring_submit_lock(ctx, !force_nonblock);
+
+       lockdep_assert_held(&ctx->uring_lock);
+
+       list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
+
+       ret = io_add_buffers(p, &head);
+       if (ret < 0)
+               goto out;
+
+       if (!list) {
+               ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
+                                       GFP_KERNEL);
+               if (ret < 0) {
+                       __io_remove_buffers(ctx, head, p->bgid, -1U);
+                       goto out;
+               }
+       }
+out:
+       io_ring_submit_unlock(ctx, !force_nonblock);
+       if (ret < 0)
+               req_set_fail_links(req);
+       io_cqring_add_event(req, ret);
+       io_put_req(req);
+       return 0;
 }
 
 static int io_epoll_ctl_prep(struct io_kiocb *req,
@@ -2694,8 +3207,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
 #endif
 }
 
-static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
-                       bool force_nonblock)
+static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
 {
 #if defined(CONFIG_EPOLL)
        struct io_epoll *ie = &req->epoll;
@@ -2708,7 +3220,7 @@ static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
 #else
        return -EOPNOTSUPP;
@@ -2730,8 +3242,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 #endif
 }
 
-static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
-                     bool force_nonblock)
+static int io_madvise(struct io_kiocb *req, bool force_nonblock)
 {
 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
        struct io_madvise *ma = &req->madvise;
@@ -2744,7 +3255,7 @@ static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
 #else
        return -EOPNOTSUPP;
@@ -2762,8 +3273,7 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        return 0;
 }
 
-static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
-                     bool force_nonblock)
+static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
 {
        struct io_fadvise *fa = &req->fadvise;
        int ret;
@@ -2783,7 +3293,7 @@ static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
 }
 
@@ -2820,8 +3330,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        return 0;
 }
 
-static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
-                   bool force_nonblock)
+static int io_statx(struct io_kiocb *req, bool force_nonblock)
 {
        struct io_open *ctx = &req->open;
        unsigned lookup_flags;
@@ -2858,7 +3367,7 @@ err:
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
 }
 
@@ -2885,7 +3394,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 }
 
 /* only called when __close_fd_get_file() is done */
-static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt)
+static void __io_close_finish(struct io_kiocb *req)
 {
        int ret;
 
@@ -2894,22 +3403,19 @@ static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
        fput(req->close.put_file);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
 }
 
 static void io_close_finish(struct io_wq_work **workptr)
 {
        struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-       struct io_kiocb *nxt = NULL;
 
        /* not cancellable, don't do io_req_cancelled() */
-       __io_close_finish(req, &nxt);
-       if (nxt)
-               io_wq_assign_next(workptr, nxt);
+       __io_close_finish(req);
+       io_steal_work(req, workptr);
 }
 
-static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
-                   bool force_nonblock)
+static int io_close(struct io_kiocb *req, bool force_nonblock)
 {
        int ret;
 
@@ -2919,23 +3425,25 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
                return ret;
 
        /* if the file has a flush method, be safe and punt to async */
-       if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
-               goto eagain;
+       if (req->close.put_file->f_op->flush && force_nonblock) {
+               /* submission ref will be dropped, take it for async */
+               refcount_inc(&req->refs);
+
+               req->work.func = io_close_finish;
+               /*
+                * Do manual async queue here to avoid grabbing files - we don't
+                * need the files, and it'll cause io_close_finish() to close
+                * the file again and cause a double CQE entry for this request
+                */
+               io_queue_async_work(req);
+               return 0;
+       }
 
        /*
         * No ->flush(), safely close from here and just punt the
         * fput() to async context.
         */
-       __io_close_finish(req, nxt);
-       return 0;
-eagain:
-       req->work.func = io_close_finish;
-       /*
-        * Do manual async queue here to avoid grabbing files - we don't
-        * need the files, and it'll cause io_close_finish() to close
-        * the file again and cause a double CQE entry for this request
-        */
-       io_queue_async_work(req);
+       __io_close_finish(req);
        return 0;
 }
 
@@ -2957,47 +3465,62 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        return 0;
 }
 
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
+static void __io_sync_file_range(struct io_kiocb *req)
 {
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-       struct io_kiocb *nxt = NULL;
        int ret;
 
-       if (io_req_cancelled(req))
-               return;
-
        ret = sync_file_range(req->file, req->sync.off, req->sync.len,
                                req->sync.flags);
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, &nxt);
+       io_put_req(req);
+}
+
+
+static void io_sync_file_range_finish(struct io_wq_work **workptr)
+{
+       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+       struct io_kiocb *nxt = NULL;
+
+       if (io_req_cancelled(req))
+               return;
+       __io_sync_file_range(req);
+       io_put_req(req); /* put submission ref */
        if (nxt)
                io_wq_assign_next(workptr, nxt);
 }
 
-static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
-                             bool force_nonblock)
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
 {
-       struct io_wq_work *work, *old_work;
-
        /* sync_file_range always requires a blocking context */
        if (force_nonblock) {
-               io_put_req(req);
                req->work.func = io_sync_file_range_finish;
                return -EAGAIN;
        }
 
-       work = old_work = &req->work;
-       io_sync_file_range_finish(&work);
-       if (work && work != old_work)
-               *nxt = container_of(work, struct io_kiocb, work);
+       __io_sync_file_range(req);
        return 0;
 }
 
+#if defined(CONFIG_NET)
+static int io_setup_async_msg(struct io_kiocb *req,
+                             struct io_async_msghdr *kmsg)
+{
+       if (req->io)
+               return -EAGAIN;
+       if (io_alloc_async_ctx(req)) {
+               if (kmsg->iov != kmsg->fast_iov)
+                       kfree(kmsg->iov);
+               return -ENOMEM;
+       }
+       req->flags |= REQ_F_NEED_CLEANUP;
+       memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
+       return -EAGAIN;
+}
+
 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-#if defined(CONFIG_NET)
        struct io_sr_msg *sr = &req->sr_msg;
        struct io_async_ctx *io = req->io;
        int ret;
@@ -3023,15 +3546,10 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        if (!ret)
                req->flags |= REQ_F_NEED_CLEANUP;
        return ret;
-#else
-       return -EOPNOTSUPP;
-#endif
 }
 
-static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
-                     bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
 {
-#if defined(CONFIG_NET)
        struct io_async_msghdr *kmsg = NULL;
        struct socket *sock;
        int ret;
@@ -3071,18 +3589,8 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
                        flags |= MSG_DONTWAIT;
 
                ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
-               if (force_nonblock && ret == -EAGAIN) {
-                       if (req->io)
-                               return -EAGAIN;
-                       if (io_alloc_async_ctx(req)) {
-                               if (kmsg->iov != kmsg->fast_iov)
-                                       kfree(kmsg->iov);
-                               return -ENOMEM;
-                       }
-                       req->flags |= REQ_F_NEED_CLEANUP;
-                       memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
-                       return -EAGAIN;
-               }
+               if (force_nonblock && ret == -EAGAIN)
+                       return io_setup_async_msg(req, kmsg);
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
        }
@@ -3093,17 +3601,12 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
        io_cqring_add_event(req, ret);
        if (ret < 0)
                req_set_fail_links(req);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
-#else
-       return -EOPNOTSUPP;
-#endif
 }
 
-static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
-                  bool force_nonblock)
+static int io_send(struct io_kiocb *req, bool force_nonblock)
 {
-#if defined(CONFIG_NET)
        struct socket *sock;
        int ret;
 
@@ -3144,17 +3647,120 @@ static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
        io_cqring_add_event(req, ret);
        if (ret < 0)
                req_set_fail_links(req);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
-#else
-       return -EOPNOTSUPP;
+}
+
+static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+{
+       struct io_sr_msg *sr = &req->sr_msg;
+       struct iovec __user *uiov;
+       size_t iov_len;
+       int ret;
+
+       ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
+                                       &uiov, &iov_len);
+       if (ret)
+               return ret;
+
+       if (req->flags & REQ_F_BUFFER_SELECT) {
+               if (iov_len > 1)
+                       return -EINVAL;
+               if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
+                       return -EFAULT;
+               sr->len = io->msg.iov[0].iov_len;
+               iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
+                               sr->len);
+               io->msg.iov = NULL;
+       } else {
+               ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
+                                       &io->msg.iov, &io->msg.msg.msg_iter);
+               if (ret > 0)
+                       ret = 0;
+       }
+
+       return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
+                                       struct io_async_ctx *io)
+{
+       struct compat_msghdr __user *msg_compat;
+       struct io_sr_msg *sr = &req->sr_msg;
+       struct compat_iovec __user *uiov;
+       compat_uptr_t ptr;
+       compat_size_t len;
+       int ret;
+
+       msg_compat = (struct compat_msghdr __user *) sr->msg;
+       ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
+                                       &ptr, &len);
+       if (ret)
+               return ret;
+
+       uiov = compat_ptr(ptr);
+       if (req->flags & REQ_F_BUFFER_SELECT) {
+               compat_ssize_t clen;
+
+               if (len > 1)
+                       return -EINVAL;
+               if (!access_ok(uiov, sizeof(*uiov)))
+                       return -EFAULT;
+               if (__get_user(clen, &uiov->iov_len))
+                       return -EFAULT;
+               if (clen < 0)
+                       return -EINVAL;
+               sr->len = io->msg.iov[0].iov_len;
+               io->msg.iov = NULL;
+       } else {
+               ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
+                                               &io->msg.iov,
+                                               &io->msg.msg.msg_iter);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return 0;
+}
+#endif
+
+static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+{
+       io->msg.iov = io->msg.fast_iov;
+
+#ifdef CONFIG_COMPAT
+       if (req->ctx->compat)
+               return __io_compat_recvmsg_copy_hdr(req, io);
 #endif
+
+       return __io_recvmsg_copy_hdr(req, io);
+}
+
+static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
+                                              int *cflags, bool needs_lock)
+{
+       struct io_sr_msg *sr = &req->sr_msg;
+       struct io_buffer *kbuf;
+
+       if (!(req->flags & REQ_F_BUFFER_SELECT))
+               return NULL;
+
+       kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
+       if (IS_ERR(kbuf))
+               return kbuf;
+
+       sr->kbuf = kbuf;
+       req->flags |= REQ_F_BUFFER_SELECTED;
+
+       *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
+       *cflags |= IORING_CQE_F_BUFFER;
+       return kbuf;
 }
 
 static int io_recvmsg_prep(struct io_kiocb *req,
                           const struct io_uring_sqe *sqe)
 {
-#if defined(CONFIG_NET)
        struct io_sr_msg *sr = &req->sr_msg;
        struct io_async_ctx *io = req->io;
        int ret;
@@ -3162,6 +3768,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
        sr->msg_flags = READ_ONCE(sqe->msg_flags);
        sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
        sr->len = READ_ONCE(sqe->len);
+       sr->bgid = READ_ONCE(sqe->buf_group);
 
 #ifdef CONFIG_COMPAT
        if (req->ctx->compat)
@@ -3174,30 +3781,24 @@ static int io_recvmsg_prep(struct io_kiocb *req,
        if (req->flags & REQ_F_NEED_CLEANUP)
                return 0;
 
-       io->msg.iov = io->msg.fast_iov;
-       ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
-                                       &io->msg.uaddr, &io->msg.iov);
+       ret = io_recvmsg_copy_hdr(req, io);
        if (!ret)
                req->flags |= REQ_F_NEED_CLEANUP;
        return ret;
-#else
-       return -EOPNOTSUPP;
-#endif
 }
 
-static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
-                     bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
 {
-#if defined(CONFIG_NET)
        struct io_async_msghdr *kmsg = NULL;
        struct socket *sock;
-       int ret;
+       int ret, cflags = 0;
 
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
 
        sock = sock_from_file(req->file, &ret);
        if (sock) {
+               struct io_buffer *kbuf;
                struct io_async_ctx io;
                unsigned flags;
 
@@ -3209,19 +3810,23 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
                                kmsg->iov = kmsg->fast_iov;
                        kmsg->msg.msg_iter.iov = kmsg->iov;
                } else {
-                       struct io_sr_msg *sr = &req->sr_msg;
-
                        kmsg = &io.msg;
                        kmsg->msg.msg_name = &io.msg.addr;
 
-                       io.msg.iov = io.msg.fast_iov;
-                       ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
-                                       sr->msg_flags, &io.msg.uaddr,
-                                       &io.msg.iov);
+                       ret = io_recvmsg_copy_hdr(req, &io);
                        if (ret)
                                return ret;
                }
 
+               kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+               if (IS_ERR(kbuf)) {
+                       return PTR_ERR(kbuf);
+               } else if (kbuf) {
+                       kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+                       iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
+                                       1, req->sr_msg.len);
+               }
+
                flags = req->sr_msg.msg_flags;
                if (flags & MSG_DONTWAIT)
                        req->flags |= REQ_F_NOWAIT;
@@ -3230,18 +3835,8 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
 
                ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
                                                kmsg->uaddr, flags);
-               if (force_nonblock && ret == -EAGAIN) {
-                       if (req->io)
-                               return -EAGAIN;
-                       if (io_alloc_async_ctx(req)) {
-                               if (kmsg->iov != kmsg->fast_iov)
-                                       kfree(kmsg->iov);
-                               return -ENOMEM;
-                       }
-                       memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
-                       req->flags |= REQ_F_NEED_CLEANUP;
-                       return -EAGAIN;
-               }
+               if (force_nonblock && ret == -EAGAIN)
+                       return io_setup_async_msg(req, kmsg);
                if (ret == -ERESTARTSYS)
                        ret = -EINTR;
        }
@@ -3249,22 +3844,18 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
        if (kmsg && kmsg->iov != kmsg->fast_iov)
                kfree(kmsg->iov);
        req->flags &= ~REQ_F_NEED_CLEANUP;
-       io_cqring_add_event(req, ret);
+       __io_cqring_add_event(req, ret, cflags);
        if (ret < 0)
                req_set_fail_links(req);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
-#else
-       return -EOPNOTSUPP;
-#endif
 }
 
-static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
-                  bool force_nonblock)
+static int io_recv(struct io_kiocb *req, bool force_nonblock)
 {
-#if defined(CONFIG_NET)
+       struct io_buffer *kbuf = NULL;
        struct socket *sock;
-       int ret;
+       int ret, cflags = 0;
 
        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                return -EINVAL;
@@ -3272,15 +3863,25 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
        sock = sock_from_file(req->file, &ret);
        if (sock) {
                struct io_sr_msg *sr = &req->sr_msg;
+               void __user *buf = sr->buf;
                struct msghdr msg;
                struct iovec iov;
                unsigned flags;
 
-               ret = import_single_range(READ, sr->buf, sr->len, &iov,
+               kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+               if (IS_ERR(kbuf))
+                       return PTR_ERR(kbuf);
+               else if (kbuf)
+                       buf = u64_to_user_ptr(kbuf->addr);
+
+               ret = import_single_range(READ, buf, sr->len, &iov,
                                                &msg.msg_iter);
-               if (ret)
+               if (ret) {
+                       kfree(kbuf);
                        return ret;
+               }
 
+               req->flags |= REQ_F_NEED_CLEANUP;
                msg.msg_name = NULL;
                msg.msg_control = NULL;
                msg.msg_controllen = 0;
@@ -3301,20 +3902,17 @@ static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
                        ret = -EINTR;
        }
 
-       io_cqring_add_event(req, ret);
+       kfree(kbuf);
+       req->flags &= ~REQ_F_NEED_CLEANUP;
+       __io_cqring_add_event(req, ret, cflags);
        if (ret < 0)
                req_set_fail_links(req);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
-#else
-       return -EOPNOTSUPP;
-#endif
 }
 
-
 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-#if defined(CONFIG_NET)
        struct io_accept *accept = &req->accept;
 
        if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
@@ -3327,14 +3925,9 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        accept->flags = READ_ONCE(sqe->accept_flags);
        accept->nofile = rlimit(RLIMIT_NOFILE);
        return 0;
-#else
-       return -EOPNOTSUPP;
-#endif
 }
 
-#if defined(CONFIG_NET)
-static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
-                      bool force_nonblock)
+static int __io_accept(struct io_kiocb *req, bool force_nonblock)
 {
        struct io_accept *accept = &req->accept;
        unsigned file_flags;
@@ -3351,44 +3944,34 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
 }
 
 static void io_accept_finish(struct io_wq_work **workptr)
 {
        struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-       struct io_kiocb *nxt = NULL;
 
        if (io_req_cancelled(req))
                return;
-       __io_accept(req, &nxt, false);
-       if (nxt)
-               io_wq_assign_next(workptr, nxt);
+       __io_accept(req, false);
+       io_steal_work(req, workptr);
 }
-#endif
 
-static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
-                    bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
 {
-#if defined(CONFIG_NET)
        int ret;
 
-       ret = __io_accept(req, nxt, force_nonblock);
+       ret = __io_accept(req, force_nonblock);
        if (ret == -EAGAIN && force_nonblock) {
                req->work.func = io_accept_finish;
-               io_put_req(req);
                return -EAGAIN;
        }
        return 0;
-#else
-       return -EOPNOTSUPP;
-#endif
 }
 
 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-#if defined(CONFIG_NET)
        struct io_connect *conn = &req->connect;
        struct io_async_ctx *io = req->io;
 
@@ -3405,15 +3988,10 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
        return move_addr_to_kernel(conn->addr, conn->addr_len,
                                        &io->connect.address);
-#else
-       return -EOPNOTSUPP;
-#endif
 }
 
-static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
-                     bool force_nonblock)
+static int io_connect(struct io_kiocb *req, bool force_nonblock)
 {
-#if defined(CONFIG_NET)
        struct io_async_ctx __io, *io;
        unsigned file_flags;
        int ret;
@@ -3449,25 +4027,301 @@ out:
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
        return 0;
-#else
+}
+#else /* !CONFIG_NET */
+static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
        return -EOPNOTSUPP;
-#endif
 }
 
-static void io_poll_remove_one(struct io_kiocb *req)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
+{
+       return -EOPNOTSUPP;
+}
+
+static int io_send(struct io_kiocb *req, bool force_nonblock)
+{
+       return -EOPNOTSUPP;
+}
+
+static int io_recvmsg_prep(struct io_kiocb *req,
+                          const struct io_uring_sqe *sqe)
+{
+       return -EOPNOTSUPP;
+}
+
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
+{
+       return -EOPNOTSUPP;
+}
+
+static int io_recv(struct io_kiocb *req, bool force_nonblock)
+{
+       return -EOPNOTSUPP;
+}
+
+static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+       return -EOPNOTSUPP;
+}
+
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
+{
+       return -EOPNOTSUPP;
+}
+
+static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+       return -EOPNOTSUPP;
+}
+
+static int io_connect(struct io_kiocb *req, bool force_nonblock)
+{
+       return -EOPNOTSUPP;
+}
+#endif /* CONFIG_NET */
+
+struct io_poll_table {
+       struct poll_table_struct pt;
+       struct io_kiocb *req;
+       int error;
+};
+
+static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
+                           struct wait_queue_head *head)
+{
+       if (unlikely(poll->head)) {
+               pt->error = -EINVAL;
+               return;
+       }
+
+       pt->error = 0;
+       poll->head = head;
+       add_wait_queue(head, &poll->wait);
+}
+
+static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
+                              struct poll_table_struct *p)
+{
+       struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
+
+       __io_queue_proc(&pt->req->apoll->poll, pt, head);
+}
+
+static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
+                          __poll_t mask, task_work_func_t func)
+{
+       struct task_struct *tsk;
+
+       /* for instances that support it check for an event match first: */
+       if (mask && !(mask & poll->events))
+               return 0;
+
+       trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
+
+       list_del_init(&poll->wait.entry);
+
+       tsk = req->task;
+       req->result = mask;
+       init_task_work(&req->task_work, func);
+       /*
+        * If this fails, then the task is exiting. If that is the case, then
+        * the exit check will ultimately cancel these work items. Hence we
+        * don't need to check here and handle it specifically.
+        */
+       task_work_add(tsk, &req->task_work, true);
+       wake_up_process(tsk);
+       return 1;
+}
+
+static void io_async_task_func(struct callback_head *cb)
+{
+       struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+       struct async_poll *apoll = req->apoll;
+       struct io_ring_ctx *ctx = req->ctx;
+
+       trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
+
+       WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+
+       if (hash_hashed(&req->hash_node)) {
+               spin_lock_irq(&ctx->completion_lock);
+               hash_del(&req->hash_node);
+               spin_unlock_irq(&ctx->completion_lock);
+       }
+
+       /* restore ->work in case we need to retry again */
+       memcpy(&req->work, &apoll->work, sizeof(req->work));
+
+       __set_current_state(TASK_RUNNING);
+       mutex_lock(&ctx->uring_lock);
+       __io_queue_sqe(req, NULL);
+       mutex_unlock(&ctx->uring_lock);
+
+       kfree(apoll);
+}
+
+static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+                       void *key)
+{
+       struct io_kiocb *req = wait->private;
+       struct io_poll_iocb *poll = &req->apoll->poll;
+
+       trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
+                                       key_to_poll(key));
+
+       return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
+}
+
+static void io_poll_req_insert(struct io_kiocb *req)
+{
+       struct io_ring_ctx *ctx = req->ctx;
+       struct hlist_head *list;
+
+       list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
+       hlist_add_head(&req->hash_node, list);
+}
+
+static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
+                                     struct io_poll_iocb *poll,
+                                     struct io_poll_table *ipt, __poll_t mask,
+                                     wait_queue_func_t wake_func)
+       __acquires(&ctx->completion_lock)
+{
+       struct io_ring_ctx *ctx = req->ctx;
+       bool cancel = false;
+
+       poll->file = req->file;
+       poll->head = NULL;
+       poll->done = poll->canceled = false;
+       poll->events = mask;
+
+       ipt->pt._key = mask;
+       ipt->req = req;
+       ipt->error = -EINVAL;
+
+       INIT_LIST_HEAD(&poll->wait.entry);
+       init_waitqueue_func_entry(&poll->wait, wake_func);
+       poll->wait.private = req;
+
+       mask = vfs_poll(req->file, &ipt->pt) & poll->events;
+
+       spin_lock_irq(&ctx->completion_lock);
+       if (likely(poll->head)) {
+               spin_lock(&poll->head->lock);
+               if (unlikely(list_empty(&poll->wait.entry))) {
+                       if (ipt->error)
+                               cancel = true;
+                       ipt->error = 0;
+                       mask = 0;
+               }
+               if (mask || ipt->error)
+                       list_del_init(&poll->wait.entry);
+               else if (cancel)
+                       WRITE_ONCE(poll->canceled, true);
+               else if (!poll->done) /* actually waiting for an event */
+                       io_poll_req_insert(req);
+               spin_unlock(&poll->head->lock);
+       }
+
+       return mask;
+}
+
+static bool io_arm_poll_handler(struct io_kiocb *req)
+{
+       const struct io_op_def *def = &io_op_defs[req->opcode];
+       struct io_ring_ctx *ctx = req->ctx;
+       struct async_poll *apoll;
+       struct io_poll_table ipt;
+       __poll_t mask, ret;
+
+       if (!req->file || !file_can_poll(req->file))
+               return false;
+       if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
+               return false;
+       if (!def->pollin && !def->pollout)
+               return false;
+
+       apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+       if (unlikely(!apoll))
+               return false;
+
+       req->flags |= REQ_F_POLLED;
+       memcpy(&apoll->work, &req->work, sizeof(req->work));
+
+       /*
+        * Don't need a reference here, as we're adding it to the task
+        * task_works list. If the task exits, the list is pruned.
+        */
+       req->task = current;
+       req->apoll = apoll;
+       INIT_HLIST_NODE(&req->hash_node);
+
+       mask = 0;
+       if (def->pollin)
+               mask |= POLLIN | POLLRDNORM;
+       if (def->pollout)
+               mask |= POLLOUT | POLLWRNORM;
+       mask |= POLLERR | POLLPRI;
+
+       ipt.pt._qproc = io_async_queue_proc;
+
+       ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
+                                       io_async_wake);
+       if (ret) {
+               ipt.error = 0;
+               apoll->poll.done = true;
+               spin_unlock_irq(&ctx->completion_lock);
+               memcpy(&req->work, &apoll->work, sizeof(req->work));
+               kfree(apoll);
+               return false;
+       }
+       spin_unlock_irq(&ctx->completion_lock);
+       trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
+                                       apoll->poll.events);
+       return true;
+}
+
+static bool __io_poll_remove_one(struct io_kiocb *req,
+                                struct io_poll_iocb *poll)
 {
-       struct io_poll_iocb *poll = &req->poll;
+       bool do_complete = false;
 
        spin_lock(&poll->head->lock);
        WRITE_ONCE(poll->canceled, true);
        if (!list_empty(&poll->wait.entry)) {
                list_del_init(&poll->wait.entry);
-               io_queue_async_work(req);
+               do_complete = true;
        }
        spin_unlock(&poll->head->lock);
+       return do_complete;
+}
+
+static bool io_poll_remove_one(struct io_kiocb *req)
+{
+       bool do_complete;
+
+       if (req->opcode == IORING_OP_POLL_ADD) {
+               do_complete = __io_poll_remove_one(req, &req->poll);
+       } else {
+               /* non-poll requests have submit ref still */
+               do_complete = __io_poll_remove_one(req, &req->apoll->poll);
+               if (do_complete)
+                       io_put_req(req);
+       }
+
        hash_del(&req->hash_node);
+
+       if (do_complete) {
+               io_cqring_fill_event(req, -ECANCELED);
+               io_commit_cqring(req->ctx);
+               req->flags |= REQ_F_COMP_LOCKED;
+               io_put_req(req);
+       }
+
+       return do_complete;
 }
 
 static void io_poll_remove_all(struct io_ring_ctx *ctx)
@@ -3485,6 +4339,8 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
                        io_poll_remove_one(req);
        }
        spin_unlock_irq(&ctx->completion_lock);
+
+       io_cqring_ev_posted(ctx);
 }
 
 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -3494,10 +4350,11 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
 
        list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
        hlist_for_each_entry(req, list, hash_node) {
-               if (sqe_addr == req->user_data) {
-                       io_poll_remove_one(req);
+               if (sqe_addr != req->user_data)
+                       continue;
+               if (io_poll_remove_one(req))
                        return 0;
-               }
+               return -EALREADY;
        }
 
        return -ENOENT;
@@ -3543,186 +4400,54 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
        struct io_ring_ctx *ctx = req->ctx;
 
        req->poll.done = true;
-       if (error)
-               io_cqring_fill_event(req, error);
-       else
-               io_cqring_fill_event(req, mangle_poll(mask));
+       io_cqring_fill_event(req, error ? error : mangle_poll(mask));
        io_commit_cqring(ctx);
 }
 
-static void io_poll_complete_work(struct io_wq_work **workptr)
+static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
 {
-       struct io_wq_work *work = *workptr;
-       struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-       struct io_poll_iocb *poll = &req->poll;
-       struct poll_table_struct pt = { ._key = poll->events };
        struct io_ring_ctx *ctx = req->ctx;
-       struct io_kiocb *nxt = NULL;
-       __poll_t mask = 0;
-       int ret = 0;
-
-       if (work->flags & IO_WQ_WORK_CANCEL) {
-               WRITE_ONCE(poll->canceled, true);
-               ret = -ECANCELED;
-       } else if (READ_ONCE(poll->canceled)) {
-               ret = -ECANCELED;
-       }
-
-       if (ret != -ECANCELED)
-               mask = vfs_poll(poll->file, &pt) & poll->events;
 
-       /*
-        * Note that ->ki_cancel callers also delete iocb from active_reqs after
-        * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
-        * synchronize with them.  In the cancellation case the list_del_init
-        * itself is not actually needed, but harmless so we keep it in to
-        * avoid further branches in the fast path.
-        */
        spin_lock_irq(&ctx->completion_lock);
-       if (!mask && ret != -ECANCELED) {
-               add_wait_queue(poll->head, &poll->wait);
-               spin_unlock_irq(&ctx->completion_lock);
-               return;
-       }
        hash_del(&req->hash_node);
-       io_poll_complete(req, mask, ret);
+       io_poll_complete(req, req->result, 0);
+       req->flags |= REQ_F_COMP_LOCKED;
+       io_put_req_find_next(req, nxt);
        spin_unlock_irq(&ctx->completion_lock);
 
        io_cqring_ev_posted(ctx);
-
-       if (ret < 0)
-               req_set_fail_links(req);
-       io_put_req_find_next(req, &nxt);
-       if (nxt)
-               io_wq_assign_next(workptr, nxt);
 }
 
-static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
+static void io_poll_task_func(struct callback_head *cb)
 {
-       struct io_kiocb *req, *tmp;
-       struct req_batch rb;
+       struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+       struct io_kiocb *nxt = NULL;
 
-       rb.to_free = rb.need_iter = 0;
-       spin_lock_irq(&ctx->completion_lock);
-       llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
-               hash_del(&req->hash_node);
-               io_poll_complete(req, req->result, 0);
+       io_poll_task_handler(req, &nxt);
+       if (nxt) {
+               struct io_ring_ctx *ctx = nxt->ctx;
 
-               if (refcount_dec_and_test(&req->refs) &&
-                   !io_req_multi_free(&rb, req)) {
-                       req->flags |= REQ_F_COMP_LOCKED;
-                       io_free_req(req);
-               }
+               mutex_lock(&ctx->uring_lock);
+               __io_queue_sqe(nxt, NULL);
+               mutex_unlock(&ctx->uring_lock);
        }
-       spin_unlock_irq(&ctx->completion_lock);
-
-       io_cqring_ev_posted(ctx);
-       io_free_req_many(ctx, &rb);
-}
-
-static void io_poll_flush(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-       struct llist_node *nodes;
-
-       nodes = llist_del_all(&req->ctx->poll_llist);
-       if (nodes)
-               __io_poll_flush(req->ctx, nodes);
-}
-
-static void io_poll_trigger_evfd(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-       eventfd_signal(req->ctx->cq_ev_fd, 1);
-       io_put_req(req);
 }
 
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
                        void *key)
 {
-       struct io_poll_iocb *poll = wait->private;
-       struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
-       struct io_ring_ctx *ctx = req->ctx;
-       __poll_t mask = key_to_poll(key);
-
-       /* for instances that support it check for an event match first: */
-       if (mask && !(mask & poll->events))
-               return 0;
-
-       list_del_init(&poll->wait.entry);
-
-       /*
-        * Run completion inline if we can. We're using trylock here because
-        * we are violating the completion_lock -> poll wq lock ordering.
-        * If we have a link timeout we're going to need the completion_lock
-        * for finalizing the request, mark us as having grabbed that already.
-        */
-       if (mask) {
-               unsigned long flags;
-
-               if (llist_empty(&ctx->poll_llist) &&
-                   spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-                       bool trigger_ev;
-
-                       hash_del(&req->hash_node);
-                       io_poll_complete(req, mask, 0);
-
-                       trigger_ev = io_should_trigger_evfd(ctx);
-                       if (trigger_ev && eventfd_signal_count()) {
-                               trigger_ev = false;
-                               req->work.func = io_poll_trigger_evfd;
-                       } else {
-                               req->flags |= REQ_F_COMP_LOCKED;
-                               io_put_req(req);
-                               req = NULL;
-                       }
-                       spin_unlock_irqrestore(&ctx->completion_lock, flags);
-                       __io_cqring_ev_posted(ctx, trigger_ev);
-               } else {
-                       req->result = mask;
-                       req->llist_node.next = NULL;
-                       /* if the list wasn't empty, we're done */
-                       if (!llist_add(&req->llist_node, &ctx->poll_llist))
-                               req = NULL;
-                       else
-                               req->work.func = io_poll_flush;
-               }
-       }
-       if (req)
-               io_queue_async_work(req);
+       struct io_kiocb *req = wait->private;
+       struct io_poll_iocb *poll = &req->poll;
 
-       return 1;
+       return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
 }
 
-struct io_poll_table {
-       struct poll_table_struct pt;
-       struct io_kiocb *req;
-       int error;
-};
-
 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
                               struct poll_table_struct *p)
 {
        struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
 
-       if (unlikely(pt->req->poll.head)) {
-               pt->error = -EINVAL;
-               return;
-       }
-
-       pt->error = 0;
-       pt->req->poll.head = head;
-       add_wait_queue(head, &pt->req->poll.wait);
-}
-
-static void io_poll_req_insert(struct io_kiocb *req)
-{
-       struct io_ring_ctx *ctx = req->ctx;
-       struct hlist_head *list;
-
-       list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
-       hlist_add_head(&req->hash_node, list);
+       __io_queue_proc(&pt->req->poll, pt, head);
 }
 
 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -3739,55 +4464,29 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 
        events = READ_ONCE(sqe->poll_events);
        poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+
+       /*
+        * Don't need a reference here, as we're adding it to the task
+        * task_works list. If the task exits, the list is pruned.
+        */
+       req->task = current;
        return 0;
 }
 
-static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
+static int io_poll_add(struct io_kiocb *req)
 {
        struct io_poll_iocb *poll = &req->poll;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_poll_table ipt;
-       bool cancel = false;
        __poll_t mask;
 
-       INIT_IO_WORK(&req->work, io_poll_complete_work);
        INIT_HLIST_NODE(&req->hash_node);
-
-       poll->head = NULL;
-       poll->done = false;
-       poll->canceled = false;
-
-       ipt.pt._qproc = io_poll_queue_proc;
-       ipt.pt._key = poll->events;
-       ipt.req = req;
-       ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
-
-       /* initialized the list so that we can do list_empty checks */
-       INIT_LIST_HEAD(&poll->wait.entry);
-       init_waitqueue_func_entry(&poll->wait, io_poll_wake);
-       poll->wait.private = poll;
-
        INIT_LIST_HEAD(&req->list);
+       ipt.pt._qproc = io_poll_queue_proc;
 
-       mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
+       mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
+                                       io_poll_wake);
 
-       spin_lock_irq(&ctx->completion_lock);
-       if (likely(poll->head)) {
-               spin_lock(&poll->head->lock);
-               if (unlikely(list_empty(&poll->wait.entry))) {
-                       if (ipt.error)
-                               cancel = true;
-                       ipt.error = 0;
-                       mask = 0;
-               }
-               if (mask || ipt.error)
-                       list_del_init(&poll->wait.entry);
-               else if (cancel)
-                       WRITE_ONCE(poll->canceled, true);
-               else if (!poll->done) /* actually waiting for an event */
-                       io_poll_req_insert(req);
-               spin_unlock(&poll->head->lock);
-       }
        if (mask) { /* no async, we'd stolen it */
                ipt.error = 0;
                io_poll_complete(req, mask, 0);
@@ -3796,7 +4495,7 @@ static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
 
        if (mask) {
                io_cqring_ev_posted(ctx);
-               io_put_req_find_next(req, nxt);
+               io_put_req(req);
        }
        return ipt.error;
 }
@@ -4045,7 +4744,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
 
 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
                                     struct io_kiocb *req, __u64 sqe_addr,
-                                    struct io_kiocb **nxt, int success_ret)
+                                    int success_ret)
 {
        unsigned long flags;
        int ret;
@@ -4071,7 +4770,7 @@ done:
 
        if (ret < 0)
                req_set_fail_links(req);
-       io_put_req_find_next(req, nxt);
+       io_put_req(req);
 }
 
 static int io_async_cancel_prep(struct io_kiocb *req,
@@ -4087,11 +4786,11 @@ static int io_async_cancel_prep(struct io_kiocb *req,
        return 0;
 }
 
-static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
+static int io_async_cancel(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
-       io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
+       io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
        return 0;
 }
 
@@ -4226,6 +4925,15 @@ static int io_req_defer_prep(struct io_kiocb *req,
        case IORING_OP_EPOLL_CTL:
                ret = io_epoll_ctl_prep(req, sqe);
                break;
+       case IORING_OP_SPLICE:
+               ret = io_splice_prep(req, sqe);
+               break;
+       case IORING_OP_PROVIDE_BUFFERS:
+               ret = io_provide_buffers_prep(req, sqe);
+               break;
+       case IORING_OP_REMOVE_BUFFERS:
+               ret = io_remove_buffers_prep(req, sqe);
+               break;
        default:
                printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
                                req->opcode);
@@ -4272,29 +4980,43 @@ static void io_cleanup_req(struct io_kiocb *req)
        case IORING_OP_READV:
        case IORING_OP_READ_FIXED:
        case IORING_OP_READ:
+               if (req->flags & REQ_F_BUFFER_SELECTED)
+                       kfree((void *)(unsigned long)req->rw.addr);
+               /* fallthrough */
        case IORING_OP_WRITEV:
        case IORING_OP_WRITE_FIXED:
        case IORING_OP_WRITE:
                if (io->rw.iov != io->rw.fast_iov)
                        kfree(io->rw.iov);
                break;
-       case IORING_OP_SENDMSG:
        case IORING_OP_RECVMSG:
+               if (req->flags & REQ_F_BUFFER_SELECTED)
+                       kfree(req->sr_msg.kbuf);
+               /* fallthrough */
+       case IORING_OP_SENDMSG:
                if (io->msg.iov != io->msg.fast_iov)
                        kfree(io->msg.iov);
                break;
+       case IORING_OP_RECV:
+               if (req->flags & REQ_F_BUFFER_SELECTED)
+                       kfree(req->sr_msg.kbuf);
+               break;
        case IORING_OP_OPENAT:
        case IORING_OP_OPENAT2:
        case IORING_OP_STATX:
                putname(req->open.filename);
                break;
+       case IORING_OP_SPLICE:
+               io_put_file(req, req->splice.file_in,
+                           (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+               break;
        }
 
        req->flags &= ~REQ_F_NEED_CLEANUP;
 }
 
 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-                       struct io_kiocb **nxt, bool force_nonblock)
+                       bool force_nonblock)
 {
        struct io_ring_ctx *ctx = req->ctx;
        int ret;
@@ -4311,7 +5033,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret < 0)
                                break;
                }
-               ret = io_read(req, nxt, force_nonblock);
+               ret = io_read(req, force_nonblock);
                break;
        case IORING_OP_WRITEV:
        case IORING_OP_WRITE_FIXED:
@@ -4321,7 +5043,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret < 0)
                                break;
                }
-               ret = io_write(req, nxt, force_nonblock);
+               ret = io_write(req, force_nonblock);
                break;
        case IORING_OP_FSYNC:
                if (sqe) {
@@ -4329,7 +5051,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret < 0)
                                break;
                }
-               ret = io_fsync(req, nxt, force_nonblock);
+               ret = io_fsync(req, force_nonblock);
                break;
        case IORING_OP_POLL_ADD:
                if (sqe) {
@@ -4337,7 +5059,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_poll_add(req, nxt);
+               ret = io_poll_add(req);
                break;
        case IORING_OP_POLL_REMOVE:
                if (sqe) {
@@ -4353,7 +5075,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret < 0)
                                break;
                }
-               ret = io_sync_file_range(req, nxt, force_nonblock);
+               ret = io_sync_file_range(req, force_nonblock);
                break;
        case IORING_OP_SENDMSG:
        case IORING_OP_SEND:
@@ -4363,9 +5085,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                                break;
                }
                if (req->opcode == IORING_OP_SENDMSG)
-                       ret = io_sendmsg(req, nxt, force_nonblock);
+                       ret = io_sendmsg(req, force_nonblock);
                else
-                       ret = io_send(req, nxt, force_nonblock);
+                       ret = io_send(req, force_nonblock);
                break;
        case IORING_OP_RECVMSG:
        case IORING_OP_RECV:
@@ -4375,9 +5097,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                                break;
                }
                if (req->opcode == IORING_OP_RECVMSG)
-                       ret = io_recvmsg(req, nxt, force_nonblock);
+                       ret = io_recvmsg(req, force_nonblock);
                else
-                       ret = io_recv(req, nxt, force_nonblock);
+                       ret = io_recv(req, force_nonblock);
                break;
        case IORING_OP_TIMEOUT:
                if (sqe) {
@@ -4401,7 +5123,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_accept(req, nxt, force_nonblock);
+               ret = io_accept(req, force_nonblock);
                break;
        case IORING_OP_CONNECT:
                if (sqe) {
@@ -4409,7 +5131,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_connect(req, nxt, force_nonblock);
+               ret = io_connect(req, force_nonblock);
                break;
        case IORING_OP_ASYNC_CANCEL:
                if (sqe) {
@@ -4417,7 +5139,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_async_cancel(req, nxt);
+               ret = io_async_cancel(req);
                break;
        case IORING_OP_FALLOCATE:
                if (sqe) {
@@ -4425,7 +5147,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_fallocate(req, nxt, force_nonblock);
+               ret = io_fallocate(req, force_nonblock);
                break;
        case IORING_OP_OPENAT:
                if (sqe) {
@@ -4433,7 +5155,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_openat(req, nxt, force_nonblock);
+               ret = io_openat(req, force_nonblock);
                break;
        case IORING_OP_CLOSE:
                if (sqe) {
@@ -4441,7 +5163,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_close(req, nxt, force_nonblock);
+               ret = io_close(req, force_nonblock);
                break;
        case IORING_OP_FILES_UPDATE:
                if (sqe) {
@@ -4457,7 +5179,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_statx(req, nxt, force_nonblock);
+               ret = io_statx(req, force_nonblock);
                break;
        case IORING_OP_FADVISE:
                if (sqe) {
@@ -4465,7 +5187,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_fadvise(req, nxt, force_nonblock);
+               ret = io_fadvise(req, force_nonblock);
                break;
        case IORING_OP_MADVISE:
                if (sqe) {
@@ -4473,7 +5195,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_madvise(req, nxt, force_nonblock);
+               ret = io_madvise(req, force_nonblock);
                break;
        case IORING_OP_OPENAT2:
                if (sqe) {
@@ -4481,7 +5203,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_openat2(req, nxt, force_nonblock);
+               ret = io_openat2(req, force_nonblock);
                break;
        case IORING_OP_EPOLL_CTL:
                if (sqe) {
@@ -4489,7 +5211,31 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                        if (ret)
                                break;
                }
-               ret = io_epoll_ctl(req, nxt, force_nonblock);
+               ret = io_epoll_ctl(req, force_nonblock);
+               break;
+       case IORING_OP_SPLICE:
+               if (sqe) {
+                       ret = io_splice_prep(req, sqe);
+                       if (ret < 0)
+                               break;
+               }
+               ret = io_splice(req, force_nonblock);
+               break;
+       case IORING_OP_PROVIDE_BUFFERS:
+               if (sqe) {
+                       ret = io_provide_buffers_prep(req, sqe);
+                       if (ret)
+                               break;
+               }
+               ret = io_provide_buffers(req, force_nonblock);
+               break;
+       case IORING_OP_REMOVE_BUFFERS:
+               if (sqe) {
+                       ret = io_remove_buffers_prep(req, sqe);
+                       if (ret)
+                               break;
+               }
+               ret = io_remove_buffers(req, force_nonblock);
                break;
        default:
                ret = -EINVAL;
@@ -4522,7 +5268,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 {
        struct io_wq_work *work = *workptr;
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-       struct io_kiocb *nxt = NULL;
        int ret = 0;
 
        /* if NO_CANCEL is set, we must still run the work */
@@ -4532,9 +5277,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
        }
 
        if (!ret) {
-               req->in_async = true;
                do {
-                       ret = io_issue_sqe(req, NULL, &nxt, false);
+                       ret = io_issue_sqe(req, NULL, false);
                        /*
                         * We can get EAGAIN for polled IO even though we're
                         * forcing a sync submission from here, since we can't
@@ -4546,18 +5290,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
                } while (1);
        }
 
-       /* drop submission reference */
-       io_put_req(req);
-
        if (ret) {
                req_set_fail_links(req);
                io_cqring_add_event(req, ret);
                io_put_req(req);
        }
 
-       /* if a dependent link is ready, pass it back */
-       if (!ret && nxt)
-               io_wq_assign_next(workptr, nxt);
+       io_steal_work(req, workptr);
 }
 
 static int io_req_needs_file(struct io_kiocb *req, int fd)
@@ -4578,41 +5317,52 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
        return table->files[index & IORING_FILE_TABLE_MASK];;
 }
 
-static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
-                          const struct io_uring_sqe *sqe)
+static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
+                       int fd, struct file **out_file, bool fixed)
 {
        struct io_ring_ctx *ctx = req->ctx;
-       unsigned flags;
-       int fd;
-
-       flags = READ_ONCE(sqe->flags);
-       fd = READ_ONCE(sqe->fd);
-
-       if (!io_req_needs_file(req, fd))
-               return 0;
+       struct file *file;
 
-       if (flags & IOSQE_FIXED_FILE) {
+       if (fixed) {
                if (unlikely(!ctx->file_data ||
                    (unsigned) fd >= ctx->nr_user_files))
                        return -EBADF;
                fd = array_index_nospec(fd, ctx->nr_user_files);
-               req->file = io_file_from_index(ctx, fd);
-               if (!req->file)
+               file = io_file_from_index(ctx, fd);
+               if (!file)
                        return -EBADF;
-               req->flags |= REQ_F_FIXED_FILE;
                percpu_ref_get(&ctx->file_data->refs);
        } else {
-               if (req->needs_fixed_file)
-                       return -EBADF;
                trace_io_uring_file_get(ctx, fd);
-               req->file = io_file_get(state, fd);
-               if (unlikely(!req->file))
+               file = __io_file_get(state, fd);
+               if (unlikely(!file))
                        return -EBADF;
        }
 
+       *out_file = file;
        return 0;
 }
 
+static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
+                          const struct io_uring_sqe *sqe)
+{
+       unsigned flags;
+       int fd;
+       bool fixed;
+
+       flags = READ_ONCE(sqe->flags);
+       fd = READ_ONCE(sqe->fd);
+
+       if (!io_req_needs_file(req, fd))
+               return 0;
+
+       fixed = (flags & IOSQE_FIXED_FILE);
+       if (unlikely(!fixed && req->needs_fixed_file))
+               return -EBADF;
+
+       return io_file_get(state, req, fd, &req->file, fixed);
+}
+
 static int io_grab_files(struct io_kiocb *req)
 {
        int ret = -EBADF;
@@ -4672,8 +5422,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 
        if (prev) {
                req_set_fail_links(prev);
-               io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
-                                               -ETIME);
+               io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
                io_put_req(prev);
        } else {
                io_cqring_add_event(req, -ETIME);
@@ -4710,6 +5459,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 
        if (!(req->flags & REQ_F_LINK))
                return NULL;
+       /* for polled retry, if flag is set, we already went through here */
+       if (req->flags & REQ_F_POLLED)
+               return NULL;
 
        nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
                                        link_list);
@@ -4723,7 +5475,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct io_kiocb *linked_timeout;
-       struct io_kiocb *nxt = NULL;
+       struct io_kiocb *nxt;
        const struct cred *old_creds = NULL;
        int ret;
 
@@ -4739,7 +5491,7 @@ again:
                        old_creds = override_creds(req->work.creds);
        }
 
-       ret = io_issue_sqe(req, sqe, &nxt, true);
+       ret = io_issue_sqe(req, sqe, true);
 
        /*
         * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -4747,6 +5499,11 @@ again:
         */
        if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
            (req->flags & REQ_F_MUST_PUNT))) {
+               if (io_arm_poll_handler(req)) {
+                       if (linked_timeout)
+                               io_queue_linked_timeout(linked_timeout);
+                       goto exit;
+               }
 punt:
                if (io_op_defs[req->opcode].file_table) {
                        ret = io_grab_files(req);
@@ -4759,10 +5516,11 @@ punt:
                 * submit reference when the iocb is actually submitted.
                 */
                io_queue_async_work(req);
-               goto done_req;
+               goto exit;
        }
 
 err:
+       nxt = NULL;
        /* drop submission reference */
        io_put_req_find_next(req, &nxt);
 
@@ -4779,15 +5537,14 @@ err:
                req_set_fail_links(req);
                io_put_req(req);
        }
-done_req:
        if (nxt) {
                req = nxt;
-               nxt = NULL;
 
                if (req->flags & REQ_F_FORCE_ASYNC)
                        goto punt;
                goto again;
        }
+exit:
        if (old_creds)
                revert_creds(old_creds);
 }
@@ -4829,7 +5586,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 }
 
 #define SQE_VALID_FLAGS        (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
-                               IOSQE_IO_HARDLINK | IOSQE_ASYNC)
+                               IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+                               IOSQE_BUFFER_SELECT)
 
 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                          struct io_submit_state *state, struct io_kiocb **link)
@@ -4846,6 +5604,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                goto err_req;
        }
 
+       if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+           !io_op_defs[req->opcode].buffer_select) {
+               ret = -EOPNOTSUPP;
+               goto err_req;
+       }
+
        id = READ_ONCE(sqe->personality);
        if (id) {
                req->work.creds = idr_find(&ctx->personality_idr, id);
@@ -4857,8 +5621,9 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        }
 
        /* same numerical values with corresponding REQ_F_*, safe to copy */
-       req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
-                                       IOSQE_ASYNC);
+       req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
+                                       IOSQE_ASYNC | IOSQE_FIXED_FILE |
+                                       IOSQE_BUFFER_SELECT);
 
        ret = io_req_set_file(state, req, sqe);
        if (unlikely(ret)) {
@@ -5079,7 +5844,6 @@ fail_req:
                        *mm = ctx->sqo_mm;
                }
 
-               req->in_async = async;
                req->needs_fixed_file = async;
                trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
                                                true, async);
@@ -5163,6 +5927,8 @@ static int io_sq_thread(void *data)
                        if (!list_empty(&ctx->poll_list) ||
                            (!time_after(jiffies, timeout) && ret != -EBUSY &&
                            !percpu_ref_is_dying(&ctx->refs))) {
+                               if (current->task_works)
+                                       task_work_run();
                                cond_resched();
                                continue;
                        }
@@ -5194,6 +5960,10 @@ static int io_sq_thread(void *data)
                                        finish_wait(&ctx->sqo_wait, &wait);
                                        break;
                                }
+                               if (current->task_works) {
+                                       task_work_run();
+                                       continue;
+                               }
                                if (signal_pending(current))
                                        flush_signals(current);
                                schedule();
@@ -5213,6 +5983,9 @@ static int io_sq_thread(void *data)
                timeout = jiffies + ctx->sq_thread_idle;
        }
 
+       if (current->task_works)
+               task_work_run();
+
        set_fs(old_fs);
        if (cur_mm) {
                unuse_mm(cur_mm);
@@ -5277,8 +6050,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
        struct io_rings *rings = ctx->rings;
        int ret = 0;
 
-       if (io_cqring_events(ctx, false) >= min_events)
-               return 0;
+       do {
+               if (io_cqring_events(ctx, false) >= min_events)
+                       return 0;
+               if (!current->task_works)
+                       break;
+               task_work_run();
+       } while (1);
 
        if (sig) {
 #ifdef CONFIG_COMPAT
@@ -5298,6 +6076,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
        do {
                prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
                                                TASK_INTERRUPTIBLE);
+               if (current->task_works)
+                       task_work_run();
                if (io_should_wake(&iowq, false))
                        break;
                schedule();
@@ -5607,7 +6387,6 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
 struct io_file_put {
        struct llist_node llist;
        struct file *file;
-       struct completion *done;
 };
 
 static void io_ring_file_ref_flush(struct fixed_file_data *data)
@@ -5618,10 +6397,7 @@ static void io_ring_file_ref_flush(struct fixed_file_data *data)
        while ((node = llist_del_all(&data->put_llist)) != NULL) {
                llist_for_each_entry_safe(pfile, tmp, node, llist) {
                        io_ring_file_put(data->ctx, pfile->file);
-                       if (pfile->done)
-                               complete(pfile->done);
-                       else
-                               kfree(pfile);
+                       kfree(pfile);
                }
        }
 }
@@ -5816,33 +6592,18 @@ static void io_atomic_switch(struct percpu_ref *ref)
        percpu_ref_get(&data->refs);
 }
 
-static bool io_queue_file_removal(struct fixed_file_data *data,
+static int io_queue_file_removal(struct fixed_file_data *data,
                                  struct file *file)
 {
-       struct io_file_put *pfile, pfile_stack;
-       DECLARE_COMPLETION_ONSTACK(done);
+       struct io_file_put *pfile;
 
-       /*
-        * If we fail allocating the struct we need for doing async reomval
-        * of this file, just punt to sync and wait for it.
-        */
        pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
-       if (!pfile) {
-               pfile = &pfile_stack;
-               pfile->done = &done;
-       }
+       if (!pfile)
+               return -ENOMEM;
 
        pfile->file = file;
        llist_add(&pfile->llist, &data->put_llist);
-
-       if (pfile == &pfile_stack) {
-               percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
-               wait_for_completion(&done);
-               flush_work(&data->ref_work);
-               return false;
-       }
-
-       return true;
+       return 0;
 }
 
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -5877,9 +6638,11 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                index = i & IORING_FILE_TABLE_MASK;
                if (table->files[index]) {
                        file = io_file_from_index(ctx, index);
+                       err = io_queue_file_removal(data, file);
+                       if (err)
+                               break;
                        table->files[index] = NULL;
-                       if (io_queue_file_removal(data, file))
-                               ref_switch = true;
+                       ref_switch = true;
                }
                if (fd != -1) {
                        file = fget(fd);
@@ -5932,20 +6695,14 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
        return __io_sqe_files_update(ctx, &up, nr_args);
 }
 
-static void io_put_work(struct io_wq_work *work)
+static void io_free_work(struct io_wq_work *work)
 {
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 
+       /* Consider that io_steal_work() relies on this ref */
        io_put_req(req);
 }
 
-static void io_get_work(struct io_wq_work *work)
-{
-       struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-
-       refcount_inc(&req->refs);
-}
-
 static int io_init_wq_offload(struct io_ring_ctx *ctx,
                              struct io_uring_params *p)
 {
@@ -5956,8 +6713,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
        int ret = 0;
 
        data.user = ctx->user;
-       data.get_work = io_get_work;
-       data.put_work = io_put_work;
+       data.free_work = io_free_work;
 
        if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
                /* Do QD, or 4 * CPUS, whatever is smallest */
@@ -6359,6 +7115,21 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
        return -ENXIO;
 }
 
+static int __io_destroy_buffers(int id, void *p, void *data)
+{
+       struct io_ring_ctx *ctx = data;
+       struct io_buffer *buf = p;
+
+       __io_remove_buffers(ctx, buf, id, -1U);
+       return 0;
+}
+
+static void io_destroy_buffers(struct io_ring_ctx *ctx)
+{
+       idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
+       idr_destroy(&ctx->io_buffer_idr);
+}
+
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
        io_finish_async(ctx);
@@ -6369,6 +7140,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
        io_sqe_buffer_unregister(ctx);
        io_sqe_files_unregister(ctx);
        io_eventfd_unregister(ctx);
+       io_destroy_buffers(ctx);
        idr_destroy(&ctx->personality_idr);
 
 #if defined(CONFIG_UNIX)
@@ -6623,6 +7395,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
        int submitted = 0;
        struct fd f;
 
+       if (current->task_works)
+               task_work_run();
+
        if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
                return -EINVAL;
 
@@ -6669,7 +7444,14 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 
                min_complete = min(min_complete, ctx->cq_entries);
 
-               if (ctx->flags & IORING_SETUP_IOPOLL) {
+               /*
+                * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
+                * space applications don't need to do io completion events
+                * polling again, they can rely on io_sq_thread to do polling
+                * work, which can reduce cpu usage and uring_lock contention.
+                */
+               if (ctx->flags & IORING_SETUP_IOPOLL &&
+                   !(ctx->flags & IORING_SETUP_SQPOLL)) {
                        ret = io_iopoll_check(ctx, &nr_events, min_complete);
                } else {
                        ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
@@ -6745,6 +7527,17 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
                seq_printf(m, "Personalities:\n");
                idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
        }
+       seq_printf(m, "PollList:\n");
+       spin_lock_irq(&ctx->completion_lock);
+       for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
+               struct hlist_head *list = &ctx->cancel_hash[i];
+               struct io_kiocb *req;
+
+               hlist_for_each_entry(req, list, hash_node)
+                       seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
+                                       req->task->task_works != NULL);
+       }
+       spin_unlock_irq(&ctx->completion_lock);
        mutex_unlock(&ctx->uring_lock);
 }
 
@@ -6961,7 +7754,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 
        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
-                       IORING_FEAT_CUR_PERSONALITY;
+                       IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
        return ret;
 err:
@@ -7239,6 +8032,7 @@ static int __init io_uring_init(void)
        BUILD_BUG_SQE_ELEM(8,  __u64,  off);
        BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
        BUILD_BUG_SQE_ELEM(16, __u64,  addr);
+       BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
        BUILD_BUG_SQE_ELEM(24, __u32,  len);
        BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
        BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
@@ -7253,11 +8047,14 @@ static int __init io_uring_init(void)
        BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
        BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
+       BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
        BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
        BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
        BUILD_BUG_SQE_ELEM(42, __u16,  personality);
+       BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 
        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
+       BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
        return 0;
 };