diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9fb0dc6..155f3d8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -55,7 +55,6 @@
 #include <linux/fdtable.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
-#include <linux/mmu_context.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
@@ -529,7 +528,6 @@ enum {
        REQ_F_INFLIGHT_BIT,
        REQ_F_CUR_POS_BIT,
        REQ_F_NOWAIT_BIT,
-       REQ_F_IOPOLL_COMPLETED_BIT,
        REQ_F_LINK_TIMEOUT_BIT,
        REQ_F_TIMEOUT_BIT,
        REQ_F_ISREG_BIT,
@@ -541,6 +539,8 @@ enum {
        REQ_F_POLLED_BIT,
        REQ_F_BUFFER_SELECTED_BIT,
        REQ_F_NO_FILE_TABLE_BIT,
+       REQ_F_QUEUE_TIMEOUT_BIT,
+       REQ_F_WORK_INITIALIZED_BIT,
 
        /* not a real bit, just to check we're not overflowing the space */
        __REQ_F_LAST_BIT,
@@ -572,8 +572,6 @@ enum {
        REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
        /* must not punt to workers */
        REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
-       /* polled IO has completed */
-       REQ_F_IOPOLL_COMPLETED  = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
        /* has linked timeout */
        REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
        /* timeout request */
@@ -596,6 +594,10 @@ enum {
        REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
        /* doesn't need file table for this request */
        REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
+       /* needs to queue linked timeout */
+       REQ_F_QUEUE_TIMEOUT     = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
+       /* io_wq_work is initialized */
+       REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
 };
 
 struct async_poll {
@@ -634,6 +636,8 @@ struct io_kiocb {
        struct io_async_ctx             *io;
        int                             cflags;
        u8                              opcode;
+       /* polled IO has completed */
+       u8                              iopoll_completed;
 
        u16                             buf_index;
 
@@ -698,6 +702,8 @@ struct io_op_def {
        unsigned                needs_mm : 1;
        /* needs req->file assigned */
        unsigned                needs_file : 1;
+       /* don't fail if file grab fails */
+       unsigned                needs_file_no_error : 1;
        /* hash wq insertion if file is a regular file */
        unsigned                hash_reg_file : 1;
        /* unbound wq insertion if file is a non-regular file */
@@ -804,6 +810,8 @@ static const struct io_op_def io_op_defs[] = {
                .needs_fs               = 1,
        },
        [IORING_OP_CLOSE] = {
+               .needs_file             = 1,
+               .needs_file_no_error    = 1,
                .file_table             = 1,
        },
        [IORING_OP_FILES_UPDATE] = {
@@ -904,6 +912,19 @@ EXPORT_SYMBOL(io_uring_get_socket);
 
 static void io_file_put_work(struct work_struct *work);
 
+/*
+ * Note: io_req_init_async() must be called before the first time
+ * any member of io_wq_work is touched.
+ */
+static inline void io_req_init_async(struct io_kiocb *req)
+{
+       if (req->flags & REQ_F_WORK_INITIALIZED)
+               return;
+
+       memset(&req->work, 0, sizeof(req->work));
+       req->flags |= REQ_F_WORK_INITIALIZED;
+}
+
 static inline bool io_async_submit(struct io_ring_ctx *ctx)
 {
        return ctx->flags & IORING_SETUP_SQPOLL;
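
The io_req_init_async() helper added above is a lazy, run-once initialisation
guarded by REQ_F_WORK_INITIALIZED, so requests that complete inline never pay
for zeroing io_wq_work. Below is a minimal user-space sketch of that pattern;
the type, field and flag names are stand-ins, not the io_uring definitions.

    #include <stdio.h>
    #include <string.h>

    #define REQ_F_WORK_INITIALIZED (1u << 0)

    struct work { int flags; };
    struct request { unsigned int flags; struct work work; };

    /* zero ->work only on the first touch, remember that it was done */
    static void req_init_work_once(struct request *req)
    {
        if (req->flags & REQ_F_WORK_INITIALIZED)
            return;
        memset(&req->work, 0, sizeof(req->work));
        req->flags |= REQ_F_WORK_INITIALIZED;
    }

    int main(void)
    {
        struct request req = { .flags = 0 };

        req_init_work_once(&req);          /* first touch: zeroes work */
        req.work.flags = 42;
        req_init_work_once(&req);          /* no-op: state is preserved */
        printf("%d\n", req.work.flags);    /* prints 42 */
        return 0;
    }
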
@@ -1030,6 +1051,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
 
 static inline void io_req_work_drop_env(struct io_kiocb *req)
 {
+       if (!(req->flags & REQ_F_WORK_INITIALIZED))
+               return;
+
        if (req->work.mm) {
                mmdrop(req->work.mm);
                req->work.mm = NULL;
@@ -1576,16 +1600,6 @@ static void io_free_req(struct io_kiocb *req)
                io_queue_async_work(nxt);
 }
 
-static void io_link_work_cb(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-       struct io_kiocb *link;
-
-       link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
-       io_queue_linked_timeout(link);
-       io_wq_submit_work(workptr);
-}
-
 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
 {
        struct io_kiocb *link;
@@ -1597,7 +1611,7 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
        *workptr = &nxt->work;
        link = io_prep_linked_timeout(nxt);
        if (link)
-               nxt->work.func = io_link_work_cb;
+               nxt->flags |= REQ_F_QUEUE_TIMEOUT;
 }
 
 /*
@@ -1782,7 +1796,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                 * If we find a request that requires polling, break out
                 * and complete those lists first, if we have entries there.
                 */
-               if (req->flags & REQ_F_IOPOLL_COMPLETED) {
+               if (READ_ONCE(req->iopoll_completed)) {
                        list_move_tail(&req->list, &done);
                        continue;
                }
@@ -1963,7 +1977,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
                req_set_fail_links(req);
        req->result = res;
        if (res != -EAGAIN)
-               req->flags |= REQ_F_IOPOLL_COMPLETED;
+               WRITE_ONCE(req->iopoll_completed, 1);
 }
 
 /*
@@ -1996,7 +2010,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
         * For fast devices, IO may have already completed. If it has, add
         * it to the front so we find it first.
         */
-       if (req->flags & REQ_F_IOPOLL_COMPLETED)
+       if (READ_ONCE(req->iopoll_completed))
                list_add(&req->list, &ctx->poll_list);
        else
                list_add_tail(&req->list, &ctx->poll_list);
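
The three hunks above replace the old REQ_F_IOPOLL_COMPLETED bit, which lived
in the shared req->flags word, with a dedicated req->iopoll_completed byte
accessed through READ_ONCE()/WRITE_ONCE(), so the completion side does a plain
store instead of a read-modify-write on flags. A rough user-space analogue of
that accessor split, using C11 relaxed atomics in place of the kernel macros
(names are stand-ins, not io_uring's):

    #include <errno.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct request {
        unsigned int flags;             /* shared word, needs RMW to update */
        atomic_uchar iopoll_completed;  /* dedicated completion marker */
    };

    static void complete_rw_iopoll(struct request *req, long res)
    {
        /* plain store, no read-modify-write on the shared flags word */
        if (res != -EAGAIN)
            atomic_store_explicit(&req->iopoll_completed, 1,
                                  memory_order_relaxed);
    }

    static bool iopoll_done(struct request *req)
    {
        return atomic_load_explicit(&req->iopoll_completed,
                                    memory_order_relaxed);
    }

    int main(void)
    {
        struct request req = { .flags = 0 };

        atomic_init(&req.iopoll_completed, 0);
        complete_rw_iopoll(&req, 4096);
        printf("completed=%d\n", iopoll_done(&req));    /* completed=1 */
        return 0;
    }
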
@@ -2064,6 +2078,10 @@ static bool io_file_supports_async(struct file *file, int rw)
        if (S_ISREG(mode) && file->f_op != &io_uring_fops)
                return true;
 
+       /* any ->read/write should understand O_NONBLOCK */
+       if (file->f_flags & O_NONBLOCK)
+               return true;
+
        if (!(file->f_mode & FMODE_NOWAIT))
                return false;
 
@@ -2106,8 +2124,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                kiocb->ki_ioprio = get_current_ioprio();
 
        /* don't allow async punt if RWF_NOWAIT was requested */
-       if ((kiocb->ki_flags & IOCB_NOWAIT) ||
-           (req->file->f_flags & O_NONBLOCK))
+       if (kiocb->ki_flags & IOCB_NOWAIT)
                req->flags |= REQ_F_NOWAIT;
 
        if (force_nonblock)
@@ -2121,6 +2138,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                kiocb->ki_flags |= IOCB_HIPRI;
                kiocb->ki_complete = io_complete_rw_iopoll;
                req->result = 0;
+               req->iopoll_completed = 0;
        } else {
                if (kiocb->ki_flags & IOCB_HIPRI)
                        return -EINVAL;
@@ -2359,8 +2377,14 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
                                    bool needs_lock)
 {
-       if (req->flags & REQ_F_BUFFER_SELECTED)
+       if (req->flags & REQ_F_BUFFER_SELECTED) {
+               struct io_buffer *kbuf;
+
+               kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+               iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+               iov[0].iov_len = kbuf->len;
                return 0;
+       }
        if (!req->rw.len)
                return 0;
        else if (req->rw.len > 1)
@@ -2742,7 +2766,8 @@ copy_iov:
                        if (ret)
                                goto out_free;
                        /* any defer here is final, must do a blocking retry */
-                       if (!file_can_poll(req->file))
+                       if (!(req->flags & REQ_F_NOWAIT) &&
+                           !file_can_poll(req->file))
                                req->flags |= REQ_F_MUST_PUNT;
                        return -EAGAIN;
                }
@@ -2762,6 +2787,8 @@ static int __io_splice_prep(struct io_kiocb *req,
 
        if (req->flags & REQ_F_NEED_CLEANUP)
                return 0;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
 
        sp->file_in = NULL;
        sp->len = READ_ONCE(sqe->len);
@@ -2776,8 +2803,14 @@ static int __io_splice_prep(struct io_kiocb *req,
                return ret;
        req->flags |= REQ_F_NEED_CLEANUP;
 
-       if (!S_ISREG(file_inode(sp->file_in)->i_mode))
+       if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
+               /*
+                * This splice will be punted to async context, and we need to
+                * modify io_wq_work.flags here, so initialize io_wq_work first.
+                */
+               io_req_init_async(req);
                req->work.flags |= IO_WQ_WORK_UNBOUND;
+       }
 
        return 0;
 }
@@ -2886,23 +2919,15 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        return 0;
 }
 
-static bool io_req_cancelled(struct io_kiocb *req)
-{
-       if (req->work.flags & IO_WQ_WORK_CANCEL) {
-               req_set_fail_links(req);
-               io_cqring_add_event(req, -ECANCELED);
-               io_put_req(req);
-               return true;
-       }
-
-       return false;
-}
-
-static void __io_fsync(struct io_kiocb *req)
+static int io_fsync(struct io_kiocb *req, bool force_nonblock)
 {
        loff_t end = req->sync.off + req->sync.len;
        int ret;
 
+       /* fsync always requires a blocking context */
+       if (force_nonblock)
+               return -EAGAIN;
+
        ret = vfs_fsync_range(req->file, req->sync.off,
                                end > 0 ? end : LLONG_MAX,
                                req->sync.flags & IORING_FSYNC_DATASYNC);
@@ -2910,58 +2935,16 @@ static void __io_fsync(struct io_kiocb *req)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
        io_put_req(req);
-}
-
-static void io_fsync_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-       if (io_req_cancelled(req))
-               return;
-       __io_fsync(req);
-       io_steal_work(req, workptr);
-}
-
-static int io_fsync(struct io_kiocb *req, bool force_nonblock)
-{
-       /* fsync always requires a blocking context */
-       if (force_nonblock) {
-               req->work.func = io_fsync_finish;
-               return -EAGAIN;
-       }
-       __io_fsync(req);
        return 0;
 }
 
-static void __io_fallocate(struct io_kiocb *req)
-{
-       int ret;
-
-       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
-       ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
-                               req->sync.len);
-       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-       if (ret < 0)
-               req_set_fail_links(req);
-       io_cqring_add_event(req, ret);
-       io_put_req(req);
-}
-
-static void io_fallocate_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-       if (io_req_cancelled(req))
-               return;
-       __io_fallocate(req);
-       io_steal_work(req, workptr);
-}
-
 static int io_fallocate_prep(struct io_kiocb *req,
                             const struct io_uring_sqe *sqe)
 {
        if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
                return -EINVAL;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
 
        req->sync.off = READ_ONCE(sqe->off);
        req->sync.len = READ_ONCE(sqe->addr);
@@ -2972,66 +2955,74 @@ static int io_fallocate_prep(struct io_kiocb *req,
 
 static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
 {
+       int ret;
+
        /* fallocate always requires a blocking context */
-       if (force_nonblock) {
-               req->work.func = io_fallocate_finish;
+       if (force_nonblock)
                return -EAGAIN;
-       }
 
-       __io_fallocate(req);
+       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
+       ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
+                               req->sync.len);
+       current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+       if (ret < 0)
+               req_set_fail_links(req);
+       io_cqring_add_event(req, ret);
+       io_put_req(req);
        return 0;
 }
 
-static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        const char __user *fname;
        int ret;
 
-       if (sqe->ioprio || sqe->buf_index)
+       if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
                return -EINVAL;
-       if (req->flags & REQ_F_FIXED_FILE)
+       if (unlikely(sqe->ioprio || sqe->buf_index))
+               return -EINVAL;
+       if (unlikely(req->flags & REQ_F_FIXED_FILE))
                return -EBADF;
-       if (req->flags & REQ_F_NEED_CLEANUP)
-               return 0;
 
-       req->open.dfd = READ_ONCE(sqe->fd);
-       req->open.how.mode = READ_ONCE(sqe->len);
-       fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
-       req->open.how.flags = READ_ONCE(sqe->open_flags);
-       if (force_o_largefile())
+       /* open.how should already be initialised */
+       if (!(req->open.how.flags & O_PATH) && force_o_largefile())
                req->open.how.flags |= O_LARGEFILE;
 
+       req->open.dfd = READ_ONCE(sqe->fd);
+       fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
        req->open.filename = getname(fname);
        if (IS_ERR(req->open.filename)) {
                ret = PTR_ERR(req->open.filename);
                req->open.filename = NULL;
                return ret;
        }
-
        req->open.nofile = rlimit(RLIMIT_NOFILE);
        req->flags |= REQ_F_NEED_CLEANUP;
        return 0;
 }
 
+static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+       u64 flags, mode;
+
+       if (req->flags & REQ_F_NEED_CLEANUP)
+               return 0;
+       mode = READ_ONCE(sqe->len);
+       flags = READ_ONCE(sqe->open_flags);
+       req->open.how = build_open_how(flags, mode);
+       return __io_openat_prep(req, sqe);
+}
+
 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct open_how __user *how;
-       const char __user *fname;
        size_t len;
        int ret;
 
-       if (sqe->ioprio || sqe->buf_index)
-               return -EINVAL;
-       if (req->flags & REQ_F_FIXED_FILE)
-               return -EBADF;
        if (req->flags & REQ_F_NEED_CLEANUP)
                return 0;
-
-       req->open.dfd = READ_ONCE(sqe->fd);
-       fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
        how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
        len = READ_ONCE(sqe->len);
-
        if (len < OPEN_HOW_SIZE_VER0)
                return -EINVAL;
 
@@ -3040,19 +3031,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        if (ret)
                return ret;
 
-       if (!(req->open.how.flags & O_PATH) && force_o_largefile())
-               req->open.how.flags |= O_LARGEFILE;
-
-       req->open.filename = getname(fname);
-       if (IS_ERR(req->open.filename)) {
-               ret = PTR_ERR(req->open.filename);
-               req->open.filename = NULL;
-               return ret;
-       }
-
-       req->open.nofile = rlimit(RLIMIT_NOFILE);
-       req->flags |= REQ_F_NEED_CLEANUP;
-       return 0;
+       return __io_openat_prep(req, sqe);
 }
 
 static int io_openat2(struct io_kiocb *req, bool force_nonblock)
@@ -3092,7 +3071,6 @@ err:
 
 static int io_openat(struct io_kiocb *req, bool force_nonblock)
 {
-       req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
        return io_openat2(req, force_nonblock);
 }
 
@@ -3181,7 +3159,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
        p->addr = READ_ONCE(sqe->addr);
        p->len = READ_ONCE(sqe->len);
 
-       if (!access_ok(u64_to_user_ptr(p->addr), p->len))
+       if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
                return -EFAULT;
 
        p->bgid = READ_ONCE(sqe->buf_group);
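
The access_ok() change above widens the check from the first buffer's len bytes
to the whole len * nbufs range being provided. A user-space sketch of that idea;
range_ok() is a made-up stand-in for access_ok(), and the explicit
multiplication-overflow check is a defensive extra of this sketch, not something
the hunk itself adds:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* stand-in for access_ok(): keep [addr, addr + size) below a fake limit */
    static bool range_ok(uint64_t addr, uint64_t size)
    {
        const uint64_t user_limit = 0x0000800000000000ull;

        return size <= user_limit && addr <= user_limit - size;
    }

    static int validate_provided_buffers(uint64_t addr, uint32_t len,
                                         uint32_t nbufs)
    {
        uint64_t total;

        if (!nbufs)
            return -1;
        /* defensive: reject len * nbufs overflow (GCC/Clang builtin) */
        if (__builtin_mul_overflow((uint64_t)len, nbufs, &total))
            return -1;
        /* the hunk's point: validate the full range, not just one buffer */
        return range_ok(addr, total) ? 0 : -1;
    }

    int main(void)
    {
        printf("%d\n", validate_provided_buffers(0x1000, 4096, 8));  /* 0 */
        printf("%d\n", validate_provided_buffers(~0ull, 4096, 8));   /* -1 */
        return 0;
    }
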
@@ -3259,6 +3237,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
 #if defined(CONFIG_EPOLL)
        if (sqe->ioprio || sqe->buf_index)
                return -EINVAL;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
 
        req->epoll.epfd = READ_ONCE(sqe->fd);
        req->epoll.op = READ_ONCE(sqe->len);
@@ -3303,6 +3283,8 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
        if (sqe->ioprio || sqe->buf_index || sqe->off)
                return -EINVAL;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
 
        req->madvise.addr = READ_ONCE(sqe->addr);
        req->madvise.len = READ_ONCE(sqe->len);
@@ -3337,6 +3319,8 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        if (sqe->ioprio || sqe->buf_index || sqe->addr)
                return -EINVAL;
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
 
        req->fadvise.offset = READ_ONCE(sqe->off);
        req->fadvise.len = READ_ONCE(sqe->len);
@@ -3370,6 +3354,8 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
 
 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
        if (sqe->ioprio || sqe->buf_index)
                return -EINVAL;
        if (req->flags & REQ_F_FIXED_FILE)
@@ -3410,10 +3396,14 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        /*
         * If we queue this for async, it must not be cancellable. That would
-        * leave the 'file' in an undeterminate state.
+        * leave the 'file' in an indeterminate state. We also need to modify
+        * io_wq_work.flags here, so initialize io_wq_work first.
         */
+       io_req_init_async(req);
        req->work.flags |= IO_WQ_WORK_NO_CANCEL;
 
+       if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+               return -EINVAL;
        if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
            sqe->rw_flags || sqe->buf_index)
                return -EINVAL;
@@ -3421,53 +3411,41 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                return -EBADF;
 
        req->close.fd = READ_ONCE(sqe->fd);
-       return 0;
-}
-
-/* only called when __close_fd_get_file() is done */
-static void __io_close_finish(struct io_kiocb *req)
-{
-       int ret;
-
-       ret = filp_close(req->close.put_file, req->work.files);
-       if (ret < 0)
-               req_set_fail_links(req);
-       io_cqring_add_event(req, ret);
-       fput(req->close.put_file);
-       io_put_req(req);
-}
-
-static void io_close_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
+       if ((req->file && req->file->f_op == &io_uring_fops) ||
+           req->close.fd == req->ctx->ring_fd)
+               return -EBADF;
 
-       /* not cancellable, don't do io_req_cancelled() */
-       __io_close_finish(req);
-       io_steal_work(req, workptr);
+       req->close.put_file = NULL;
+       return 0;
 }
 
 static int io_close(struct io_kiocb *req, bool force_nonblock)
 {
+       struct io_close *close = &req->close;
        int ret;
 
-       req->close.put_file = NULL;
-       ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
-       if (ret < 0)
-               return (ret == -ENOENT) ? -EBADF : ret;
+       /* might be already done during nonblock submission */
+       /* might already be done during nonblock submission */
+               ret = __close_fd_get_file(close->fd, &close->put_file);
+               if (ret < 0)
+                       return (ret == -ENOENT) ? -EBADF : ret;
+       }
 
        /* if the file has a flush method, be safe and punt to async */
-       if (req->close.put_file->f_op->flush && force_nonblock) {
+       if (close->put_file->f_op->flush && force_nonblock) {
                /* avoid grabbing files - we don't need the files */
                req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
-               req->work.func = io_close_finish;
                return -EAGAIN;
        }
 
-       /*
-        * No ->flush(), safely close from here and just punt the
-        * fput() to async context.
-        */
-       __io_close_finish(req);
+       /* No ->flush() or already async, safely close from here */
+       ret = filp_close(close->put_file, req->work.files);
+       if (ret < 0)
+               req_set_fail_links(req);
+       io_cqring_add_event(req, ret);
+       fput(close->put_file);
+       close->put_file = NULL;
+       io_put_req(req);
        return 0;
 }
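
The reworked io_close() above is two-phase: __close_fd_get_file() runs at most
once and its result is remembered in close->put_file, so when the ->flush()
case punts with -EAGAIN the async retry skips straight to filp_close(). A tiny
user-space sketch of that retry-with-remembered-state shape, with invented
names:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct close_state {
        bool have_file;         /* mirrors close->put_file != NULL */
    };

    static int sketch_close(struct close_state *st, bool force_nonblock)
    {
        /* phase 1: done at most once, remembered across retries */
        if (!st->have_file)
            st->have_file = true;

        /* phase 2 may need a blocking context: punt but keep the state */
        if (force_nonblock)
            return -EAGAIN;

        st->have_file = false;  /* phase 2: the blocking close itself */
        return 0;
    }

    int main(void)
    {
        struct close_state st = { .have_file = false };

        if (sketch_close(&st, true) == -EAGAIN)         /* inline attempt */
            printf("retried: %d\n", sketch_close(&st, false));
        return 0;
    }
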
 
@@ -3489,38 +3467,20 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        return 0;
 }
 
-static void __io_sync_file_range(struct io_kiocb *req)
+static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
 {
        int ret;
 
+       /* sync_file_range always requires a blocking context */
+       if (force_nonblock)
+               return -EAGAIN;
+
        ret = sync_file_range(req->file, req->sync.off, req->sync.len,
                                req->sync.flags);
        if (ret < 0)
                req_set_fail_links(req);
        io_cqring_add_event(req, ret);
        io_put_req(req);
-}
-
-
-static void io_sync_file_range_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-       if (io_req_cancelled(req))
-               return;
-       __io_sync_file_range(req);
-       io_steal_work(req, workptr);
-}
-
-static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
-{
-       /* sync_file_range always requires a blocking context */
-       if (force_nonblock) {
-               req->work.func = io_sync_file_range_finish;
-               return -EAGAIN;
-       }
-
-       __io_sync_file_range(req);
        return 0;
 }
 
@@ -3546,6 +3506,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        struct io_async_ctx *io = req->io;
        int ret;
 
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
+
        sr->msg_flags = READ_ONCE(sqe->msg_flags);
        sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
        sr->len = READ_ONCE(sqe->len);
@@ -3575,9 +3538,6 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
        struct socket *sock;
        int ret;
 
-       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-               return -EINVAL;
-
        sock = sock_from_file(req->file, &ret);
        if (sock) {
                struct io_async_ctx io;
@@ -3631,9 +3591,6 @@ static int io_send(struct io_kiocb *req, bool force_nonblock)
        struct socket *sock;
        int ret;
 
-       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-               return -EINVAL;
-
        sock = sock_from_file(req->file, &ret);
        if (sock) {
                struct io_sr_msg *sr = &req->sr_msg;
@@ -3786,6 +3743,9 @@ static int io_recvmsg_prep(struct io_kiocb *req,
        struct io_async_ctx *io = req->io;
        int ret;
 
+       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+               return -EINVAL;
+
        sr->msg_flags = READ_ONCE(sqe->msg_flags);
        sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
        sr->len = READ_ONCE(sqe->len);
@@ -3814,9 +3774,6 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
        struct socket *sock;
        int ret, cflags = 0;
 
-       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-               return -EINVAL;
-
        sock = sock_from_file(req->file, &ret);
        if (sock) {
                struct io_buffer *kbuf;
@@ -3878,9 +3835,6 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock)
        struct socket *sock;
        int ret, cflags = 0;
 
-       if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
-               return -EINVAL;
-
        sock = sock_from_file(req->file, &ret);
        if (sock) {
                struct io_sr_msg *sr = &req->sr_msg;
@@ -3948,49 +3902,30 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        return 0;
 }
 
-static int __io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock)
 {
        struct io_accept *accept = &req->accept;
-       unsigned file_flags;
+       unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
        int ret;
 
-       file_flags = force_nonblock ? O_NONBLOCK : 0;
+       if (req->file->f_flags & O_NONBLOCK)
+               req->flags |= REQ_F_NOWAIT;
+
        ret = __sys_accept4_file(req->file, file_flags, accept->addr,
                                        accept->addr_len, accept->flags,
                                        accept->nofile);
        if (ret == -EAGAIN && force_nonblock)
                return -EAGAIN;
-       if (ret == -ERESTARTSYS)
-               ret = -EINTR;
-       if (ret < 0)
+       if (ret < 0) {
+               if (ret == -ERESTARTSYS)
+                       ret = -EINTR;
                req_set_fail_links(req);
+       }
        io_cqring_add_event(req, ret);
        io_put_req(req);
        return 0;
 }
 
-static void io_accept_finish(struct io_wq_work **workptr)
-{
-       struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
-
-       if (io_req_cancelled(req))
-               return;
-       __io_accept(req, false);
-       io_steal_work(req, workptr);
-}
-
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
-{
-       int ret;
-
-       ret = __io_accept(req, force_nonblock);
-       if (ret == -EAGAIN && force_nonblock) {
-               req->work.func = io_accept_finish;
-               return -EAGAIN;
-       }
-       return 0;
-}
-
 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct io_connect *conn = &req->connect;
@@ -4329,7 +4264,8 @@ static void io_async_task_func(struct callback_head *cb)
        spin_unlock_irq(&ctx->completion_lock);
 
        /* restore ->work in case we need to retry again */
-       memcpy(&req->work, &apoll->work, sizeof(req->work));
+       if (req->flags & REQ_F_WORK_INITIALIZED)
+               memcpy(&req->work, &apoll->work, sizeof(req->work));
        kfree(apoll);
 
        if (!canceled) {
@@ -4426,7 +4362,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
                return false;
 
        req->flags |= REQ_F_POLLED;
-       memcpy(&apoll->work, &req->work, sizeof(req->work));
+       if (req->flags & REQ_F_WORK_INITIALIZED)
+               memcpy(&apoll->work, &req->work, sizeof(req->work));
        had_io = req->io != NULL;
 
        get_task_struct(current);
@@ -4451,7 +4388,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
                if (!had_io)
                        io_poll_remove_double(req);
                spin_unlock_irq(&ctx->completion_lock);
-               memcpy(&req->work, &apoll->work, sizeof(req->work));
+               if (req->flags & REQ_F_WORK_INITIALIZED)
+                       memcpy(&req->work, &apoll->work, sizeof(req->work));
                kfree(apoll);
                return false;
        }
@@ -4496,7 +4434,9 @@ static bool io_poll_remove_one(struct io_kiocb *req)
                         * io_req_work_drop_env below when dropping the
                         * final reference.
                         */
-                       memcpy(&req->work, &apoll->work, sizeof(req->work));
+                       if (req->flags & REQ_F_WORK_INITIALIZED)
+                               memcpy(&req->work, &apoll->work,
+                                      sizeof(req->work));
                        kfree(apoll);
                }
        }
@@ -4945,6 +4885,8 @@ static int io_req_defer_prep(struct io_kiocb *req,
        if (!sqe)
                return 0;
 
+       io_req_init_async(req);
+
        if (io_op_defs[req->opcode].file_table) {
                ret = io_grab_files(req);
                if (unlikely(ret))
@@ -5382,12 +5324,26 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
        return 0;
 }
 
+static void io_arm_async_linked_timeout(struct io_kiocb *req)
+{
+       struct io_kiocb *link;
+
+       /* link head's timeout is queued in io_queue_async_work() */
+       if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
+               return;
+
+       link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+       io_queue_linked_timeout(link);
+}
+
 static void io_wq_submit_work(struct io_wq_work **workptr)
 {
        struct io_wq_work *work = *workptr;
        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
        int ret = 0;
 
+       io_arm_async_linked_timeout(req);
+
        /* if NO_CANCEL is set, we must still run the work */
        if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
                                IO_WQ_WORK_CANCEL) {
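
Rather than swapping req->work.func to a dedicated io_link_work_cb, the hunk
above records the pending linked timeout in REQ_F_QUEUE_TIMEOUT and lets the
single work handler, io_wq_submit_work(), arm it before running the work. A
small user-space sketch of that flag-consumed-by-one-dispatcher shape, with
invented names:

    #include <stdio.h>

    #define REQ_F_QUEUE_TIMEOUT (1u << 0)

    struct request {
        unsigned int flags;
    };

    static void queue_linked_timeout(struct request *req)
    {
        printf("armed linked timeout for request %p\n", (void *)req);
    }

    /* one dispatcher consumes the hint left by the submission path */
    static void wq_submit_work(struct request *req)
    {
        if (req->flags & REQ_F_QUEUE_TIMEOUT)
            queue_linked_timeout(req);
        printf("running work for request %p\n", (void *)req);
    }

    int main(void)
    {
        struct request req = { .flags = REQ_F_QUEUE_TIMEOUT };

        wq_submit_work(&req);   /* arms the timeout, then runs the work */
        return 0;
    }
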
@@ -5438,19 +5394,20 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
                        return -EBADF;
                fd = array_index_nospec(fd, ctx->nr_user_files);
                file = io_file_from_index(ctx, fd);
-               if (!file)
-                       return -EBADF;
-               req->fixed_file_refs = ctx->file_data->cur_refs;
-               percpu_ref_get(req->fixed_file_refs);
+               if (file) {
+                       req->fixed_file_refs = ctx->file_data->cur_refs;
+                       percpu_ref_get(req->fixed_file_refs);
+               }
        } else {
                trace_io_uring_file_get(ctx, fd);
                file = __io_file_get(state, fd);
-               if (unlikely(!file))
-                       return -EBADF;
        }
 
-       *out_file = file;
-       return 0;
+       if (file || io_op_defs[req->opcode].needs_file_no_error) {
+               *out_file = file;
+               return 0;
+       }
+       return -EBADF;
 }
 
 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
@@ -5584,7 +5541,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 again:
        linked_timeout = io_prep_linked_timeout(req);
 
-       if (req->work.creds && req->work.creds != current_cred()) {
+       if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
+           req->work.creds != current_cred()) {
                if (old_creds)
                        revert_creds(old_creds);
                if (old_creds == req->work.creds)
@@ -5607,6 +5565,8 @@ again:
                        goto exit;
                }
 punt:
+               io_req_init_async(req);
+
                if (io_op_defs[req->opcode].file_table) {
                        ret = io_grab_files(req);
                        if (ret)
@@ -5859,7 +5819,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
        refcount_set(&req->refs, 2);
        req->task = NULL;
        req->result = 0;
-       INIT_IO_WORK(&req->work, io_wq_submit_work);
 
        if (unlikely(req->opcode >= IORING_OP_LAST))
                return -EINVAL;
@@ -5867,7 +5826,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
        if (io_op_defs[req->opcode].needs_mm && !current->mm) {
                if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
                        return -EFAULT;
-               use_mm(ctx->sqo_mm);
+               kthread_use_mm(ctx->sqo_mm);
        }
 
        sqe_flags = READ_ONCE(sqe->flags);
@@ -5881,6 +5840,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
        id = READ_ONCE(sqe->personality);
        if (id) {
+               io_req_init_async(req);
                req->work.creds = idr_find(&ctx->personality_idr, id);
                if (unlikely(!req->work.creds))
                        return -EINVAL;
@@ -5981,7 +5941,7 @@ static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
        struct mm_struct *mm = current->mm;
 
        if (mm) {
-               unuse_mm(mm);
+               kthread_unuse_mm(mm);
                mmput(mm);
        }
 }
@@ -5990,15 +5950,12 @@ static int io_sq_thread(void *data)
 {
        struct io_ring_ctx *ctx = data;
        const struct cred *old_cred;
-       mm_segment_t old_fs;
        DEFINE_WAIT(wait);
        unsigned long timeout;
        int ret = 0;
 
        complete(&ctx->sq_thread_comp);
 
-       old_fs = get_fs();
-       set_fs(USER_DS);
        old_cred = override_creds(ctx->creds);
 
        timeout = jiffies + ctx->sq_thread_idle;
@@ -6103,7 +6060,6 @@ static int io_sq_thread(void *data)
        if (current->task_works)
                task_work_run();
 
-       set_fs(old_fs);
        io_sq_thread_drop_mm(ctx);
        revert_creds(old_cred);
 
@@ -6879,6 +6835,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx,
 
        data.user = ctx->user;
        data.free_work = io_free_work;
+       data.do_work = io_wq_submit_work;
 
        if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
                /* Do QD, or 4 * CPUS, whatever is smallest */
@@ -7160,8 +7117,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 
                ret = 0;
                if (!pages || nr_pages > got_pages) {
-                       kfree(vmas);
-                       kfree(pages);
+                       kvfree(vmas);
+                       kvfree(pages);
                        pages = kvmalloc_array(nr_pages, sizeof(struct page *),
                                                GFP_KERNEL);
                        vmas = kvmalloc_array(nr_pages,