X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=fs%2Fio_uring.c;h=99582cf5106b4e29bb9bf4171d8c0278d1e9deb3;hb=6d816e088c359866f9867057e04f244c608c42fe;hp=32b0064f806ef88466051d5ca01b3a7bce85f91c;hpb=21391520cbb597823050ac1bc343a0df3222ac90;p=linux-2.6-microblaze.git

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 32b0064f806e..99582cf5106b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,6 +78,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define CREATE_TRACE_POINTS
 #include 
@@ -226,7 +227,7 @@ struct io_ring_ctx {
 	struct {
 		unsigned int		flags;
 		unsigned int		compat: 1;
-		unsigned int		account_mem: 1;
+		unsigned int		limit_mem: 1;
 		unsigned int		cq_overflow_flushed: 1;
 		unsigned int		drain_next: 1;
 		unsigned int		eventfd_async: 1;
@@ -319,12 +320,12 @@ struct io_ring_ctx {
 	spinlock_t		completion_lock;
 
 	/*
-	 * ->poll_list is protected by the ctx->uring_lock for
+	 * ->iopoll_list is protected by the ctx->uring_lock for
 	 * io_uring instances that don't use IORING_SETUP_SQPOLL.
 	 * For SQPOLL, only the single threaded io_sq_thread() will
 	 * manipulate the list, hence no extra locking is needed there.
 	 */
-	struct list_head	poll_list;
+	struct list_head	iopoll_list;
 	struct hlist_head	*cancel_hash;
 	unsigned		cancel_hash_bits;
 	bool			poll_multi_file;
@@ -395,6 +396,7 @@ struct io_timeout {
 	int				flags;
 	u32				off;
 	u32				target_seq;
+	struct list_head		list;
 };
 
 struct io_rw {
@@ -413,7 +415,7 @@ struct io_connect {
 struct io_sr_msg {
 	struct file			*file;
 	union {
-		struct user_msghdr __user *msg;
+		struct user_msghdr __user *umsg;
 		void __user		*buf;
 	};
 	int				msg_flags;
@@ -486,6 +488,12 @@ struct io_statx {
 	struct statx __user		*buffer;
 };
 
+struct io_completion {
+	struct file			*file;
+	struct list_head		list;
+	int				cflags;
+};
+
 struct io_async_connect {
 	struct sockaddr_storage		address;
 };
@@ -503,6 +511,7 @@ struct io_async_rw {
 	struct iovec			*iov;
 	ssize_t				nr_segs;
 	ssize_t				size;
+	struct wait_page_queue		wpq;
 };
 
 struct io_async_ctx {
@@ -523,23 +532,18 @@ enum {
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
 
 	REQ_F_LINK_HEAD_BIT,
-	REQ_F_LINK_NEXT_BIT,
 	REQ_F_FAIL_LINK_BIT,
 	REQ_F_INFLIGHT_BIT,
 	REQ_F_CUR_POS_BIT,
 	REQ_F_NOWAIT_BIT,
 	REQ_F_LINK_TIMEOUT_BIT,
-	REQ_F_TIMEOUT_BIT,
 	REQ_F_ISREG_BIT,
-	REQ_F_MUST_PUNT_BIT,
-	REQ_F_TIMEOUT_NOSEQ_BIT,
 	REQ_F_COMP_LOCKED_BIT,
 	REQ_F_NEED_CLEANUP_BIT,
 	REQ_F_OVERFLOW_BIT,
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
-	REQ_F_QUEUE_TIMEOUT_BIT,
 	REQ_F_WORK_INITIALIZED_BIT,
 	REQ_F_TASK_PINNED_BIT,
 
@@ -563,8 +567,6 @@ enum {
 
 	/* head of a link */
 	REQ_F_LINK_HEAD		= BIT(REQ_F_LINK_HEAD_BIT),
-	/* already grabbed next link */
-	REQ_F_LINK_NEXT		= BIT(REQ_F_LINK_NEXT_BIT),
 	/* fail rest of links */
 	REQ_F_FAIL_LINK		= BIT(REQ_F_FAIL_LINK_BIT),
 	/* on inflight list */
@@ -575,14 +577,8 @@ enum {
 	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
 	/* has linked timeout */
 	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
-	/* timeout request */
-	REQ_F_TIMEOUT		= BIT(REQ_F_TIMEOUT_BIT),
 	/* regular file */
 	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
-	/* must be punted even for NONBLOCK */
-	REQ_F_MUST_PUNT		= BIT(REQ_F_MUST_PUNT_BIT),
-	/* no timeout sequence */
-	REQ_F_TIMEOUT_NOSEQ	= BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
 	/* completion under lock */
 	REQ_F_COMP_LOCKED	= BIT(REQ_F_COMP_LOCKED_BIT),
 	/* needs cleanup */
@@ -595,8 +591,6 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
-	/* needs to queue linked timeout */
-	REQ_F_QUEUE_TIMEOUT	= BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 	/* io_wq_work is initialized */
 	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
 	/* req->task is refcounted */
@@ -606,7 +600,6 @@ enum {
 struct async_poll {
 	struct io_poll_iocb	poll;
 	struct io_poll_iocb	*double_poll;
-	struct io_wq_work	work;
 };
 
 /*
@@ -635,51 +628,54 @@ struct io_kiocb {
 		struct io_splice	splice;
 		struct io_provide_buf	pbuf;
 		struct io_statx		statx;
+		/* use only after cleaning per-op data, see io_clean_op() */
+		struct io_completion	compl;
 	};
 
 	struct io_async_ctx		*io;
-	int				cflags;
 	u8				opcode;
 	/* polled IO has completed */
 	u8				iopoll_completed;
 
 	u16				buf_index;
+	u32				result;
 
-	struct io_ring_ctx	*ctx;
-	struct list_head	list;
-	unsigned int		flags;
-	refcount_t		refs;
-	struct task_struct	*task;
-	unsigned long		fsize;
-	u64			user_data;
-	u32			result;
-	u32			sequence;
-
-	struct list_head	link_list;
+	struct io_ring_ctx		*ctx;
+	unsigned int			flags;
+	refcount_t			refs;
+	struct task_struct		*task;
+	u64				user_data;
 
-	struct list_head	inflight_entry;
+	struct list_head		link_list;
 
-	struct percpu_ref	*fixed_file_refs;
+	/*
+	 * 1. used with ctx->iopoll_list with reads/writes
+	 * 2. to track reqs with ->files (see io_op_def::file_table)
+	 */
+	struct list_head		inflight_entry;
+
+	struct percpu_ref		*fixed_file_refs;
+	struct callback_head		task_work;
+	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+	struct hlist_node		hash_node;
+	struct async_poll		*apoll;
+	struct io_wq_work		work;
+};
 
-	union {
-		/*
-		 * Only commands that never go async can use the below fields,
-		 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
-		 * async armed poll handlers for regular commands. The latter
-		 * restore the work, if needed.
-		 */
-		struct {
-			struct callback_head	task_work;
-			struct hlist_node	hash_node;
-			struct async_poll	*apoll;
-		};
-		struct io_wq_work	work;
-	};
+struct io_defer_entry {
+	struct list_head	list;
+	struct io_kiocb		*req;
+	u32			seq;
 };
 
-#define IO_PLUG_THRESHOLD		2
 #define IO_IOPOLL_BATCH			8
 
+struct io_comp_state {
+	unsigned int		nr;
+	struct list_head	list;
+	struct io_ring_ctx	*ctx;
+};
+
 struct io_submit_state {
 	struct blk_plug		plug;
 
@@ -689,13 +685,17 @@ struct io_submit_state {
 	void			*reqs[IO_IOPOLL_BATCH];
 	unsigned int		free_reqs;
 
+	/*
+	 * Batch completion logic
+	 */
+	struct io_comp_state	comp;
+
 	/*
 	 * File reference cache
 	 */
 	struct file		*file;
 	unsigned int		fd;
 	unsigned int		has_refs;
-	unsigned int		used_refs;
 	unsigned int		ios_left;
 };
 
@@ -723,6 +723,7 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	unsigned		needs_fsize : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
@@ -742,6 +743,7 @@ static const struct io_op_def io_op_defs[] = {
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.needs_fsize		= 1,
 	},
 	[IORING_OP_FSYNC] = {
 		.needs_file		= 1,
@@ -756,6 +758,7 @@ static const struct io_op_def io_op_defs[] = {
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.needs_fsize		= 1,
 	},
 	[IORING_OP_POLL_ADD] = {
 		.needs_file		= 1,
@@ -808,6 +811,7 @@ static const struct io_op_def io_op_defs[] = {
 	},
 	[IORING_OP_FALLOCATE] = {
 		.needs_file		= 1,
+		.needs_fsize		= 1,
 	},
 	[IORING_OP_OPENAT] = {
 		.file_table		= 1,
@@ -839,6 +843,7 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.needs_fsize		= 1,
 	},
 	[IORING_OP_FADVISE] = {
 		.needs_file		= 1,
@@ -881,22 +886,38 @@ static const struct io_op_def io_op_defs[] = {
 	},
 };
 
-static void io_wq_submit_work(struct io_wq_work **workptr);
+enum io_mem_account {
+	ACCT_LOCKED,
+	ACCT_PINNED,
+};
+
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+			     struct io_comp_state *cs);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
+static void io_double_put_req(struct io_kiocb *req);
 static void __io_double_put_req(struct io_kiocb *req);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
+static void __io_queue_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_files_update *ip,
 				 unsigned nr_args);
-static int io_grab_files(struct io_kiocb *req);
-static void io_complete_rw_common(struct kiocb *kiocb, long res);
-static void io_cleanup_req(struct io_kiocb *req);
+static int io_prep_work_files(struct io_kiocb *req);
+static void __io_clean_op(struct io_kiocb *req);
 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 		       int fd, struct file **out_file, bool fixed);
 static void __io_queue_sqe(struct io_kiocb *req,
-			   const struct io_uring_sqe *sqe);
+			   const struct io_uring_sqe *sqe,
+			   struct io_comp_state *cs);
+static void io_file_put_work(struct work_struct *work);
+
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+			       struct iovec **iovec, struct iov_iter *iter,
+			       bool needs_lock);
+static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
+			     struct iovec *iovec, struct iovec *fast_iov,
+			     struct iov_iter *iter);
 
 static struct kmem_cache *req_cachep;
@@ -923,6 +944,12 @@ static void io_get_req_task(struct io_kiocb *req)
 	req->flags |= REQ_F_TASK_PINNED;
 }
 
+static inline void io_clean_op(struct io_kiocb *req)
+{
+	if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
+		__io_clean_op(req);
+}
+
 /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
 static void __io_put_req_task(struct io_kiocb *req)
 {
@@ -930,7 +957,41 @@ static void __io_put_req_task(struct io_kiocb *req)
 		put_task_struct(req->task);
 }
 
-static void io_file_put_work(struct work_struct *work);
+static void io_sq_thread_drop_mm(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (mm) {
+		kthread_unuse_mm(mm);
+		mmput(mm);
+	}
+}
+
+static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
+{
+	if (!current->mm) {
+		if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
+			     !mmget_not_zero(ctx->sqo_mm)))
+			return -EFAULT;
+		kthread_use_mm(ctx->sqo_mm);
+	}
+
+	return 0;
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+				   struct io_kiocb *req)
+{
+	if (!io_op_defs[req->opcode].needs_mm)
+		return 0;
+	return __io_sq_thread_acquire_mm(ctx);
+}
+
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+		req->flags |= REQ_F_FAIL_LINK;
+}
 
 /*
  * Note: must call io_req_init_async() for the first time you
@@ -957,6 +1018,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 	complete(&ctx->ref_comp);
 }
 
+static inline bool io_is_timeout_noseq(struct io_kiocb *req)
+{
+	return !req->timeout.off;
+}
+
 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
 	struct io_ring_ctx *ctx;
@@ -1000,7 +1066,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
-	INIT_LIST_HEAD(&ctx->poll_list);
+	INIT_LIST_HEAD(&ctx->iopoll_list);
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	init_waitqueue_head(&ctx->inflight_wait);
@@ -1017,18 +1083,14 @@ err:
 	return NULL;
 }
 
-static inline bool __req_need_defer(struct io_kiocb *req)
+static bool req_need_defer(struct io_kiocb *req, u32 seq)
 {
-	struct io_ring_ctx *ctx = req->ctx;
+	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
+		struct io_ring_ctx *ctx = req->ctx;
 
-	return req->sequence != ctx->cached_cq_tail
+		return seq != ctx->cached_cq_tail
 				+ atomic_read(&ctx->cached_cq_overflow);
-}
-
-static inline bool req_need_defer(struct io_kiocb *req)
-{
-	if (unlikely(req->flags & REQ_F_IO_DRAIN))
-		return __req_need_defer(req);
+	}
 
 	return false;
 }
@@ -1046,31 +1108,16 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 	}
 }
 
-static inline void io_req_work_grab_env(struct io_kiocb *req,
-					const struct io_op_def *def)
-{
-	if (!req->work.mm && def->needs_mm) {
-		mmgrab(current->mm);
-		req->work.mm = current->mm;
-	}
-	if (!req->work.creds)
-		req->work.creds = get_current_cred();
-	if (!req->work.fs && def->needs_fs) {
-		spin_lock(&current->fs->lock);
-		if (!current->fs->in_exec) {
-			req->work.fs = current->fs;
-			req->work.fs->users++;
-		} else {
-			req->work.flags |= IO_WQ_WORK_CANCEL;
-		}
-		spin_unlock(&current->fs->lock);
-	}
-}
-
-static inline void io_req_work_drop_env(struct io_kiocb *req)
+/*
+ * Returns true if we need to defer file table putting. This can only happen
+ * from the error path with REQ_F_COMP_LOCKED set.
+ */
+static bool io_req_clean_work(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_WORK_INITIALIZED))
-		return;
+		return false;
+
+	req->flags &= ~REQ_F_WORK_INITIALIZED;
 
 	if (req->work.mm) {
 		mmdrop(req->work.mm);
@@ -1083,17 +1130,22 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
 	if (req->work.fs) {
 		struct fs_struct *fs = req->work.fs;
 
+		if (req->flags & REQ_F_COMP_LOCKED)
+			return true;
+
 		spin_lock(&req->work.fs->lock);
 		if (--fs->users)
 			fs = NULL;
 		spin_unlock(&req->work.fs->lock);
 		if (fs)
 			free_fs_struct(fs);
+		req->work.fs = NULL;
 	}
+
+	return false;
 }
 
-static inline void io_prep_async_work(struct io_kiocb *req,
-				      struct io_kiocb **link)
+static void io_prep_async_work(struct io_kiocb *req)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
 
@@ -1106,22 +1158,56 @@ static inline void io_prep_async_work(struct io_kiocb *req,
 		if (def->unbound_nonreg_file)
 			req->work.flags |= IO_WQ_WORK_UNBOUND;
 	}
+	if (!req->work.mm && def->needs_mm) {
+		mmgrab(current->mm);
+		req->work.mm = current->mm;
+	}
+	if (!req->work.creds)
+		req->work.creds = get_current_cred();
+	if (!req->work.fs && def->needs_fs) {
+		spin_lock(&current->fs->lock);
+		if (!current->fs->in_exec) {
+			req->work.fs = current->fs;
+			req->work.fs->users++;
+		} else {
+			req->work.flags |= IO_WQ_WORK_CANCEL;
+		}
+		spin_unlock(&current->fs->lock);
+	}
+	if (def->needs_fsize)
+		req->work.fsize = rlimit(RLIMIT_FSIZE);
+	else
+		req->work.fsize = RLIM_INFINITY;
+}
 
-	io_req_work_grab_env(req, def);
+static void io_prep_async_link(struct io_kiocb *req)
+{
+	struct io_kiocb *cur;
 
-	*link = io_prep_linked_timeout(req);
+	io_prep_async_work(req);
+	if (req->flags & REQ_F_LINK_HEAD)
+		list_for_each_entry(cur, &req->link_list, link_list)
+			io_prep_async_work(cur);
 }
 
-static inline void io_queue_async_work(struct io_kiocb *req)
+static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_kiocb *link;
-
-	io_prep_async_work(req, &link);
+	struct io_kiocb *link = io_prep_linked_timeout(req);
 
 	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
 					&req->work, req->flags);
 	io_wq_enqueue(ctx->io_wq, &req->work);
+	return link;
+}
+
+static void io_queue_async_work(struct io_kiocb *req)
+{
+	struct io_kiocb *link;
+
+	/* init ->work of the whole link before punting */
+	io_prep_async_link(req);
+	link = __io_queue_async_work(req);
 
 	if (link)
 		io_queue_linked_timeout(link);
@@ -1133,8 +1219,9 @@ static void io_kill_timeout(struct io_kiocb *req)
 
 	ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
 	if (ret != -1) {
-		atomic_inc(&req->ctx->cq_timeouts);
-		list_del_init(&req->list);
+		atomic_set(&req->ctx->cq_timeouts,
+			atomic_read(&req->ctx->cq_timeouts) + 1);
+		list_del_init(&req->timeout.list);
 		req->flags |= REQ_F_COMP_LOCKED;
 		io_cqring_fill_event(req, 0);
 		io_put_req(req);
@@ -1146,7 +1233,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
 	struct io_kiocb *req, *tmp;
 
 	spin_lock_irq(&ctx->completion_lock);
-	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list)
 		io_kill_timeout(req);
 	spin_unlock_irq(&ctx->completion_lock);
 }
@@ -1154,13 +1241,22 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
 static void __io_queue_deferred(struct io_ring_ctx *ctx)
 {
 	do {
-		struct io_kiocb *req = list_first_entry(&ctx->defer_list,
-							struct io_kiocb, list);
+		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
+						struct io_defer_entry, list);
+		struct io_kiocb *link;
 
-		if (req_need_defer(req))
+		if (req_need_defer(de->req, de->seq))
 			break;
-		list_del_init(&req->list);
-		io_queue_async_work(req);
+		list_del_init(&de->list);
+		/* punt-init is done before queueing for defer */
+		link = __io_queue_async_work(de->req);
+		if (link) {
+			__io_queue_linked_timeout(link);
+			/* drop submission reference */
+			link->flags |= REQ_F_COMP_LOCKED;
+			io_put_req(link);
+		}
+		kfree(de);
 	} while (!list_empty(&ctx->defer_list));
 }
@@ -1168,15 +1264,15 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
 {
 	while (!list_empty(&ctx->timeout_list)) {
 		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
-							struct io_kiocb, list);
+						struct io_kiocb, timeout.list);
 
-		if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+		if (io_is_timeout_noseq(req))
 			break;
 		if (req->timeout.target_seq != ctx->cached_cq_tail
 					- atomic_read(&ctx->cq_timeouts))
 			break;
 
-		list_del_init(&req->list);
+		list_del_init(&req->timeout.list);
 		io_kill_timeout(req);
 	}
 }
@@ -1229,6 +1325,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 		eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
+static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
+{
+	if (list_empty(&ctx->cq_overflow_list)) {
+		clear_bit(0, &ctx->sq_check_overflow);
+		clear_bit(0, &ctx->cq_check_overflow);
+		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+	}
+}
+
 /* Returns true if there are no backlogged entries after the flush */
 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
@@ -1259,13 +1364,13 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 			break;
 
 		req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
-						list);
-		list_move(&req->list, &list);
+						compl.list);
+		list_move(&req->compl.list, &list);
 		req->flags &= ~REQ_F_OVERFLOW;
 		if (cqe) {
 			WRITE_ONCE(cqe->user_data, req->user_data);
 			WRITE_ONCE(cqe->res, req->result);
-			WRITE_ONCE(cqe->flags, req->cflags);
+			WRITE_ONCE(cqe->flags, req->compl.cflags);
 		} else {
 			WRITE_ONCE(ctx->rings->cq_overflow,
 				   atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1273,17 +1378,14 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	}
 
 	io_commit_cqring(ctx);
-	if (cqe) {
-		clear_bit(0, &ctx->sq_check_overflow);
-		clear_bit(0, &ctx->cq_check_overflow);
-		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
-	}
+	io_cqring_mark_overflow(ctx);
+
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	io_cqring_ev_posted(ctx);
 
 	while (!list_empty(&list)) {
-		req = list_first_entry(&list, struct io_kiocb, list);
-		list_del(&req->list);
+		req = list_first_entry(&list, struct io_kiocb, compl.list);
+		list_del(&req->compl.list);
 		io_put_req(req);
 	}
 
@@ -1316,11 +1418,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 			set_bit(0, &ctx->cq_check_overflow);
 			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
 		}
+		io_clean_op(req);
 		req->flags |= REQ_F_OVERFLOW;
-		refcount_inc(&req->refs);
 		req->result = res;
-		req->cflags = cflags;
-		list_add_tail(&req->list, &ctx->cq_overflow_list);
+		req->compl.cflags = cflags;
+		refcount_inc(&req->refs);
+		list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
 	}
 }
@@ -1329,7 +1432,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
 	__io_cqring_fill_event(req, res, 0);
 }
 
-static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
+static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
@@ -1342,9 +1445,52 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
 	io_cqring_ev_posted(ctx);
 }
 
-static void io_cqring_add_event(struct io_kiocb *req, long res)
+static void io_submit_flush_completions(struct io_comp_state *cs)
+{
+	struct io_ring_ctx *ctx = cs->ctx;
+
+	spin_lock_irq(&ctx->completion_lock);
+	while (!list_empty(&cs->list)) {
+		struct io_kiocb *req;
+
+		req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
+		list_del(&req->compl.list);
+		__io_cqring_fill_event(req, req->result, req->compl.cflags);
+		if (!(req->flags & REQ_F_LINK_HEAD)) {
+			req->flags |= REQ_F_COMP_LOCKED;
+			io_put_req(req);
+		} else {
+			spin_unlock_irq(&ctx->completion_lock);
+			io_put_req(req);
+			spin_lock_irq(&ctx->completion_lock);
+		}
+	}
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	cs->nr = 0;
+}
+
+static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
+			      struct io_comp_state *cs)
+{
+	if (!cs) {
+		io_cqring_add_event(req, res, cflags);
+		io_put_req(req);
+	} else {
+		io_clean_op(req);
+		req->result = res;
+		req->compl.cflags = cflags;
+		list_add_tail(&req->compl.list, &cs->list);
+		if (++cs->nr >= 32)
+			io_submit_flush_completions(cs);
+	}
+}
+
+static void io_req_complete(struct io_kiocb *req, long res)
 {
-	__io_cqring_add_event(req, res, 0);
+	__io_req_complete(req, res, 0, NULL);
 }
 
 static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -1370,11 +1516,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct io_kiocb *req;
 
-	if (!state) {
-		req = kmem_cache_alloc(req_cachep, gfp);
-		if (unlikely(!req))
-			goto fallback;
-	} else if (!state->free_reqs) {
+	if (!state->free_reqs) {
 		size_t sz;
 		int ret;
@@ -1412,21 +1554,14 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
 		fput(file);
 }
 
-static void __io_req_aux_free(struct io_kiocb *req)
+static bool io_dismantle_req(struct io_kiocb *req)
 {
-	if (req->flags & REQ_F_NEED_CLEANUP)
-		io_cleanup_req(req);
+	io_clean_op(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-	__io_put_req_task(req);
-	io_req_work_drop_env(req);
-}
-
-static void __io_free_req(struct io_kiocb *req)
-{
-	__io_req_aux_free(req);
 
 	if (req->flags & REQ_F_INFLIGHT) {
 		struct io_ring_ctx *ctx = req->ctx;
@@ -1439,56 +1574,52 @@ static void __io_free_req(struct io_kiocb *req)
 		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 	}
 
-	percpu_ref_put(&req->ctx->refs);
+	return io_req_clean_work(req);
+}
+
+static void __io_free_req_finish(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	__io_put_req_task(req);
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
 	else
-		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
+		clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
+	percpu_ref_put(&ctx->refs);
 }
 
-struct req_batch {
-	void *reqs[IO_IOPOLL_BATCH];
-	int to_free;
-	int need_iter;
-};
-
-static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
+static void io_req_task_file_table_put(struct callback_head *cb)
 {
-	if (!rb->to_free)
-		return;
-	if (rb->need_iter) {
-		int i, inflight = 0;
-		unsigned long flags;
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+	struct fs_struct *fs = req->work.fs;
 
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
+	spin_lock(&req->work.fs->lock);
+	if (--fs->users)
+		fs = NULL;
+	spin_unlock(&req->work.fs->lock);
+	if (fs)
+		free_fs_struct(fs);
+	req->work.fs = NULL;
+	__io_free_req_finish(req);
+}
 
-			if (req->flags & REQ_F_INFLIGHT)
-				inflight++;
-			__io_req_aux_free(req);
-		}
-		if (!inflight)
-			goto do_free;
+static void __io_free_req(struct io_kiocb *req)
+{
+	if (!io_dismantle_req(req)) {
+		__io_free_req_finish(req);
+	} else {
+		int ret;
 
-		spin_lock_irqsave(&ctx->inflight_lock, flags);
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
+		init_task_work(&req->task_work, io_req_task_file_table_put);
+		ret = task_work_add(req->task, &req->task_work, TWA_RESUME);
+		if (unlikely(ret)) {
+			struct task_struct *tsk;
 
-			if (req->flags & REQ_F_INFLIGHT) {
-				list_del(&req->inflight_entry);
-				if (!--inflight)
-					break;
-			}
+			tsk = io_wq_get_task(req->ctx->io_wq);
+			task_work_add(tsk, &req->task_work, 0);
 		}
-		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
-
-		if (waitqueue_active(&ctx->inflight_wait))
-			wake_up(&ctx->inflight_wait);
 	}
-do_free:
-	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
-	percpu_ref_put_many(&ctx->refs, rb->to_free);
-	rb->to_free = rb->need_iter = 0;
 }
 
 static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1508,53 +1639,68 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
 	return false;
 }
 
-static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static bool __io_kill_linked_timeout(struct io_kiocb *req)
+{
+	struct io_kiocb *link;
+	bool wake_ev;
+
+	if (list_empty(&req->link_list))
+		return false;
+	link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+	if (link->opcode != IORING_OP_LINK_TIMEOUT)
+		return false;
+
+	list_del_init(&link->link_list);
+	link->flags |= REQ_F_COMP_LOCKED;
+	wake_ev = io_link_cancel_timeout(link);
+	req->flags &= ~REQ_F_LINK_TIMEOUT;
+	return wake_ev;
+}
+
+static void io_kill_linked_timeout(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	bool wake_ev = false;
+	bool wake_ev;
 
-	/* Already got next link */
-	if (req->flags & REQ_F_LINK_NEXT)
-		return;
+	if (!(req->flags & REQ_F_COMP_LOCKED)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		wake_ev = __io_kill_linked_timeout(req);
flags); + } else { + wake_ev = __io_kill_linked_timeout(req); + } + + if (wake_ev) + io_cqring_ev_posted(ctx); +} + +static struct io_kiocb *io_req_link_next(struct io_kiocb *req) +{ + struct io_kiocb *nxt; /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the * safe side. */ - while (!list_empty(&req->link_list)) { - struct io_kiocb *nxt = list_first_entry(&req->link_list, - struct io_kiocb, link_list); - - if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT))) { - list_del_init(&nxt->link_list); - wake_ev |= io_link_cancel_timeout(nxt); - req->flags &= ~REQ_F_LINK_TIMEOUT; - continue; - } - - list_del_init(&req->link_list); - if (!list_empty(&nxt->link_list)) - nxt->flags |= REQ_F_LINK_HEAD; - *nxtptr = nxt; - break; - } + if (unlikely(list_empty(&req->link_list))) + return NULL; - req->flags |= REQ_F_LINK_NEXT; - if (wake_ev) - io_cqring_ev_posted(ctx); + nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK_HEAD; + return nxt; } /* * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void io_fail_links(struct io_kiocb *req) +static void __io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, @@ -1563,88 +1709,229 @@ static void io_fail_links(struct io_kiocb *req) list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->opcode == IORING_OP_LINK_TIMEOUT) { - io_link_cancel_timeout(link); - } else { - io_cqring_fill_event(link, -ECANCELED); - __io_double_put_req(link); - } + io_cqring_fill_event(link, -ECANCELED); + link->flags |= REQ_F_COMP_LOCKED; + __io_double_put_req(link); req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); } -static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +static void io_fail_links(struct io_kiocb *req) { - if (likely(!(req->flags & REQ_F_LINK_HEAD))) - return; + struct io_ring_ctx *ctx = req->ctx; - /* - * If LINK is set, we have dependent requests in this chain. If we + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + __io_fail_links(req); + } + + io_cqring_ev_posted(ctx); +} + +static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) +{ + req->flags &= ~REQ_F_LINK_HEAD; + if (req->flags & REQ_F_LINK_TIMEOUT) + io_kill_linked_timeout(req); + + /* + * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & REQ_F_FAIL_LINK) { - io_fail_links(req); - } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) == - REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; + if (likely(!(req->flags & REQ_F_FAIL_LINK))) + return io_req_link_next(req); + io_fail_links(req); + return NULL; +} - /* - * If this is a timeout link, we could be racing with the - * timeout timer. 
-		 * timeout timer. Grab the completion lock for this case to
-		 * protect against that.
-		 */
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		io_req_link_next(req, nxt);
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+{
+	if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+		return NULL;
+	return __io_req_find_next(req);
+}
+
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+{
+	struct task_struct *tsk = req->task;
+	struct io_ring_ctx *ctx = req->ctx;
+	int ret, notify;
+
+	/*
+	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
+	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
+	 * processing task_work. There's no reliable way to tell if TWA_RESUME
+	 * will do the job.
+	 */
+	notify = 0;
+	if (!(ctx->flags & IORING_SETUP_SQPOLL))
+		notify = TWA_SIGNAL;
+
+	ret = task_work_add(tsk, cb, notify);
+	if (!ret)
+		wake_up_process(tsk);
+
+	return ret;
+}
+
+static void __io_req_task_cancel(struct io_kiocb *req, int error)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	spin_lock_irq(&ctx->completion_lock);
+	io_cqring_fill_event(req, error);
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	req_set_fail_links(req);
+	io_double_put_req(req);
+}
+
+static void io_req_task_cancel(struct callback_head *cb)
+{
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+	__io_req_task_cancel(req, -ECANCELED);
+}
+
+static void __io_req_task_submit(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!__io_sq_thread_acquire_mm(ctx)) {
+		mutex_lock(&ctx->uring_lock);
+		__io_queue_sqe(req, NULL, NULL);
+		mutex_unlock(&ctx->uring_lock);
 	} else {
-		io_req_link_next(req, nxt);
+		__io_req_task_cancel(req, -EFAULT);
 	}
 }
 
-static void io_free_req(struct io_kiocb *req)
+static void io_req_task_submit(struct callback_head *cb)
 {
-	struct io_kiocb *nxt = NULL;
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+	struct io_ring_ctx *ctx = req->ctx;
 
-	io_req_find_next(req, &nxt);
-	__io_free_req(req);
+	__io_req_task_submit(req);
+	percpu_ref_put(&ctx->refs);
+}
+
+static void io_req_task_queue(struct io_kiocb *req)
+{
+	int ret;
+
+	init_task_work(&req->task_work, io_req_task_submit);
+	percpu_ref_get(&req->ctx->refs);
+
+	ret = io_req_task_work_add(req, &req->task_work);
+	if (unlikely(ret)) {
+		struct task_struct *tsk;
+
+		init_task_work(&req->task_work, io_req_task_cancel);
+		tsk = io_wq_get_task(req->ctx->io_wq);
+		task_work_add(tsk, &req->task_work, 0);
+		wake_up_process(tsk);
+	}
+}
+
+static void io_queue_next(struct io_kiocb *req)
+{
+	struct io_kiocb *nxt = io_req_find_next(req);
 
 	if (nxt)
-		io_queue_async_work(nxt);
+		io_req_task_queue(nxt);
 }
 
-static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
+static void io_free_req(struct io_kiocb *req)
 {
-	struct io_kiocb *link;
-	const struct io_op_def *def = &io_op_defs[nxt->opcode];
+	io_queue_next(req);
+	__io_free_req(req);
+}
+
+struct req_batch {
+	void *reqs[IO_IOPOLL_BATCH];
+	int to_free;
 
-	if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
-		io_wq_hash_work(&nxt->work, file_inode(nxt->file));
+	struct task_struct	*task;
+	int			task_refs;
+};
 
-	*workptr = &nxt->work;
-	link = io_prep_linked_timeout(nxt);
-	if (link)
-		nxt->flags |= REQ_F_QUEUE_TIMEOUT;
+static inline void io_init_req_batch(struct req_batch *rb)
+{
+	rb->to_free = 0;
+	rb->task_refs = 0;
+	rb->task = NULL;
+}
+
+static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
+				      struct req_batch *rb)
+{
+	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
+	percpu_ref_put_many(&ctx->refs, rb->to_free);
+	rb->to_free = 0;
+}
+
+static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
+				     struct req_batch *rb)
+{
+	if (rb->to_free)
+		__io_req_free_batch_flush(ctx, rb);
+	if (rb->task) {
+		put_task_struct_many(rb->task, rb->task_refs);
+		rb->task = NULL;
+	}
+}
+
+static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
+{
+	if (unlikely(io_is_fallback_req(req))) {
+		io_free_req(req);
+		return;
+	}
+	if (req->flags & REQ_F_LINK_HEAD)
+		io_queue_next(req);
+
+	if (req->flags & REQ_F_TASK_PINNED) {
+		if (req->task != rb->task) {
+			if (rb->task)
+				put_task_struct_many(rb->task, rb->task_refs);
+			rb->task = req->task;
+			rb->task_refs = 0;
+		}
+		rb->task_refs++;
+		req->flags &= ~REQ_F_TASK_PINNED;
+	}
+
+	WARN_ON_ONCE(io_dismantle_req(req));
+	rb->reqs[rb->to_free++] = req;
+	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
+		__io_req_free_batch_flush(req->ctx, rb);
 }
 
 /*
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.
  */
-__attribute__((nonnull))
-static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
 {
+	struct io_kiocb *nxt = NULL;
+
 	if (refcount_dec_and_test(&req->refs)) {
-		io_req_find_next(req, nxtptr);
+		nxt = io_req_find_next(req);
 		__io_free_req(req);
 	}
+	return nxt;
 }
 
 static void io_put_req(struct io_kiocb *req)
@@ -1653,24 +1940,20 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static void io_steal_work(struct io_kiocb *req,
-			  struct io_wq_work **workptr)
+static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 {
+	struct io_kiocb *nxt;
+
 	/*
-	 * It's in an io-wq worker, so there always should be at least
-	 * one reference, which will be dropped in io_put_work() just
-	 * after the current handler returns.
-	 *
-	 * It also means, that if the counter dropped to 1, then there is
-	 * no asynchronous users left, so it's safe to steal the next work.
+	 * A ref is owned by io-wq in which context we're. So, if that's the
+	 * last one, it's safe to steal next work. False negatives are Ok,
+	 * it just will be re-punted async in io_put_work()
	 */
-	if (refcount_read(&req->refs) == 1) {
-		struct io_kiocb *nxt = NULL;
+	if (refcount_read(&req->refs) != 1)
+		return NULL;
 
-		io_req_find_next(req, &nxt);
-		if (nxt)
-			io_wq_assign_next(workptr, nxt);
-	}
+	nxt = io_req_find_next(req);
+	return nxt ? &nxt->work : NULL;
 }
 
 /*
@@ -1720,31 +2003,34 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 }
 
-static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
+static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
 {
-	if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
-		return false;
-
-	if (req->file || req->io)
-		rb->need_iter++;
+	unsigned int cflags;
 
-	rb->reqs[rb->to_free++] = req;
-	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
-		io_free_req_many(req->ctx, rb);
-	return true;
+	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
+	cflags |= IORING_CQE_F_BUFFER;
+	req->flags &= ~REQ_F_BUFFER_SELECTED;
+	kfree(kbuf);
+	return cflags;
 }
 
-static int io_put_kbuf(struct io_kiocb *req)
+static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
 {
 	struct io_buffer *kbuf;
-	int cflags;
 
 	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
-	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
-	cflags |= IORING_CQE_F_BUFFER;
-	req->rw.addr = 0;
-	kfree(kbuf);
-	return cflags;
+	return io_put_kbuf(req, kbuf);
+}
+
+static inline bool io_run_task_work(void)
+{
+	if (current->task_works) {
+		__set_current_state(TASK_RUNNING);
+		task_work_run();
+		return true;
+	}
+
+	return false;
 }
 
 static void io_iopoll_queue(struct list_head *again)
@@ -1752,18 +2038,9 @@ static void io_iopoll_queue(struct list_head *again)
 	struct io_kiocb *req;
 
 	do {
-		req = list_first_entry(again, struct io_kiocb, list);
-		list_del(&req->list);
-
-		/* shouldn't happen unless io_uring is dying, cancel reqs */
-		if (unlikely(!current->mm)) {
-			io_complete_rw_common(&req->rw.kiocb, -EAGAIN);
-			io_put_req(req);
-			continue;
-		}
-
-		refcount_inc(&req->refs);
-		io_queue_async_work(req);
+		req = list_first_entry(again, struct io_kiocb, inflight_entry);
+		list_del(&req->inflight_entry);
+		__io_complete_rw(req, -EAGAIN, 0, NULL);
 	} while (!list_empty(again));
 }
@@ -1780,33 +2057,32 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	/* order with ->result store in io_complete_rw_iopoll() */
 	smp_rmb();
 
-	rb.to_free = rb.need_iter = 0;
+	io_init_req_batch(&rb);
 	while (!list_empty(done)) {
 		int cflags = 0;
 
-		req = list_first_entry(done, struct io_kiocb, list);
+		req = list_first_entry(done, struct io_kiocb, inflight_entry);
 		if (READ_ONCE(req->result) == -EAGAIN) {
 			req->iopoll_completed = 0;
-			list_move_tail(&req->list, &again);
+			list_move_tail(&req->inflight_entry, &again);
 			continue;
 		}
-		list_del(&req->list);
+		list_del(&req->inflight_entry);
 
 		if (req->flags & REQ_F_BUFFER_SELECTED)
-			cflags = io_put_kbuf(req);
+			cflags = io_put_rw_kbuf(req);
 
 		__io_cqring_fill_event(req, req->result, cflags);
 		(*nr_events)++;
 
-		if (refcount_dec_and_test(&req->refs) &&
-		    !io_req_multi_free(&rb, req))
-			io_free_req(req);
+		if (refcount_dec_and_test(&req->refs))
+			io_req_free_batch(&rb, req);
 	}
 
 	io_commit_cqring(ctx);
 	if (ctx->flags & IORING_SETUP_SQPOLL)
 		io_cqring_ev_posted(ctx);
-	io_free_req_many(ctx, &rb);
+	io_req_free_batch_finish(ctx, &rb);
 
 	if (!list_empty(&again))
 		io_iopoll_queue(&again);
@@ -1827,7 +2103,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	spin = !ctx->poll_multi_file && *nr_events < min;
 
 	ret = 0;
-	list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
 		struct kiocb *kiocb = &req->rw.kiocb;
 
 		/*
@@ -1836,7 +2112,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		 * and complete those lists first, if we have entries there.
 		 */
 		if (READ_ONCE(req->iopoll_completed)) {
-			list_move_tail(&req->list, &done);
+			list_move_tail(&req->inflight_entry, &done);
 			continue;
 		}
 		if (!list_empty(&done))
@@ -1846,6 +2122,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		if (ret < 0)
 			break;
 
+		/* iopoll may have completed current req */
+		if (READ_ONCE(req->iopoll_completed))
+			list_move_tail(&req->inflight_entry, &done);
+
 		if (ret && spin)
 			spin = false;
 		ret = 0;
@@ -1865,13 +2145,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 				long min)
 {
-	while (!list_empty(&ctx->poll_list) && !need_resched()) {
+	while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
 		int ret;
 
 		ret = io_do_iopoll(ctx, nr_events, min);
 		if (ret < 0)
 			return ret;
-		if (!min || *nr_events >= min)
+		if (*nr_events >= min)
 			return 0;
 	}
 
@@ -1882,29 +2162,37 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 */
-static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 {
 	if (!(ctx->flags & IORING_SETUP_IOPOLL))
 		return;
 
 	mutex_lock(&ctx->uring_lock);
-	while (!list_empty(&ctx->poll_list)) {
+	while (!list_empty(&ctx->iopoll_list)) {
 		unsigned int nr_events = 0;
 
-		io_iopoll_getevents(ctx, &nr_events, 1);
+		io_do_iopoll(ctx, &nr_events, 0);
 
+		/* let it sleep and repeat later if can't complete a request */
+		if (nr_events == 0)
+			break;
 		/*
 		 * Ensure we allow local-to-the-cpu processing to take place,
 		 * in this case we need to ensure that we reap all events.
+		 * Also let task_work, etc. to progress by releasing the mutex
 		 */
-		cond_resched();
+		if (need_resched()) {
+			mutex_unlock(&ctx->uring_lock);
+			cond_resched();
+			mutex_lock(&ctx->uring_lock);
+		}
 	}
 	mutex_unlock(&ctx->uring_lock);
 }
 
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
-			   long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 {
+	unsigned int nr_events = 0;
 	int iters = 0, ret = 0;
 
 	/*
@@ -1914,8 +2202,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 	 */
 	mutex_lock(&ctx->uring_lock);
 	do {
-		int tmin = 0;
-
 		/*
 		 * Don't enter poll loop if we already have events pending.
 		 * If we do, we can potentially be spinning for commands that
@@ -1936,17 +2222,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		 */
 		if (!(++iters & 7)) {
 			mutex_unlock(&ctx->uring_lock);
+			io_run_task_work();
 			mutex_lock(&ctx->uring_lock);
 		}
 
-		if (*nr_events < min)
-			tmin = min - *nr_events;
-
-		ret = io_iopoll_getevents(ctx, nr_events, tmin);
+		ret = io_iopoll_getevents(ctx, &nr_events, min);
 		if (ret <= 0)
 			break;
 		ret = 0;
-	} while (min && !*nr_events && !need_resched());
+	} while (min && !nr_events && !need_resched());
 
 	mutex_unlock(&ctx->uring_lock);
 	return ret;
@@ -1966,13 +2250,8 @@ static void kiocb_end_write(struct io_kiocb *req)
 	file_end_write(req->file);
 }
 
-static inline void req_set_fail_links(struct io_kiocb *req)
-{
-	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
-		req->flags |= REQ_F_FAIL_LINK;
-}
-
-static void io_complete_rw_common(struct kiocb *kiocb, long res)
+static void io_complete_rw_common(struct kiocb *kiocb, long res,
+				  struct io_comp_state *cs)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 	int cflags = 0;
@@ -1983,16 +2262,100 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
 	if (res != req->result)
 		req_set_fail_links(req);
 	if (req->flags & REQ_F_BUFFER_SELECTED)
-		cflags = io_put_kbuf(req);
-	__io_cqring_add_event(req, res, cflags);
+		cflags = io_put_rw_kbuf(req);
+	__io_req_complete(req, res, cflags, cs);
+}
+
+#ifdef CONFIG_BLOCK
+static bool io_resubmit_prep(struct io_kiocb *req, int error)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	ssize_t ret = -ECANCELED;
+	struct iov_iter iter;
+	int rw;
+
+	if (error) {
+		ret = error;
+		goto end_req;
+	}
+
+	switch (req->opcode) {
+	case IORING_OP_READV:
+	case IORING_OP_READ_FIXED:
+	case IORING_OP_READ:
+		rw = READ;
+		break;
+	case IORING_OP_WRITEV:
+	case IORING_OP_WRITE_FIXED:
+	case IORING_OP_WRITE:
+		rw = WRITE;
+		break;
+	default:
+		printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
+				req->opcode);
+		goto end_req;
+	}
+
+	ret = io_import_iovec(rw, req, &iovec, &iter, false);
+	if (ret < 0)
+		goto end_req;
+	ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
+	if (!ret)
+		return true;
+	kfree(iovec);
+end_req:
+	req_set_fail_links(req);
+	io_req_complete(req, ret);
+	return false;
+}
+
+static void io_rw_resubmit(struct callback_head *cb)
+{
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+	struct io_ring_ctx *ctx = req->ctx;
+	int err;
+
+	err = io_sq_thread_acquire_mm(ctx, req);
+
+	if (io_resubmit_prep(req, err)) {
+		refcount_inc(&req->refs);
+		io_queue_async_work(req);
+	}
+
+	percpu_ref_put(&ctx->refs);
+}
+#endif
+
+static bool io_rw_reissue(struct io_kiocb *req, long res)
+{
+#ifdef CONFIG_BLOCK
+	int ret;
+
+	if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
+		return false;
+
+	init_task_work(&req->task_work, io_rw_resubmit);
+	percpu_ref_get(&req->ctx->refs);
+
+	ret = io_req_task_work_add(req, &req->task_work);
+	if (!ret)
+		return true;
+#endif
+	return false;
+}
+
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+			     struct io_comp_state *cs)
+{
+	if (!io_rw_reissue(req, res))
+		io_complete_rw_common(&req->rw.kiocb, res, cs);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
-	io_complete_rw_common(kiocb, res);
-	io_put_req(req);
+	__io_complete_rw(req, res, res2, NULL);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -2026,13 +2389,13 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 	 * how we do polling eventually, not spinning if we're on potentially
 	 * different devices.
 	 */
-	if (list_empty(&ctx->poll_list)) {
+	if (list_empty(&ctx->iopoll_list)) {
 		ctx->poll_multi_file = false;
 	} else if (!ctx->poll_multi_file) {
 		struct io_kiocb *list_req;
 
-		list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
-						list);
+		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
+						inflight_entry);
 		if (list_req->file != req->file)
 			ctx->poll_multi_file = true;
 	}
@@ -2042,9 +2405,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 	 * it to the front so we find it first.
 	 */
 	if (READ_ONCE(req->iopoll_completed))
-		list_add(&req->list, &ctx->poll_list);
+		list_add(&req->inflight_entry, &ctx->iopoll_list);
 	else
-		list_add_tail(&req->list, &ctx->poll_list);
+		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
 
 	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
 	    wq_has_sleeper(&ctx->sqo_wait))
@@ -2053,10 +2416,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 
 static void __io_state_file_put(struct io_submit_state *state)
 {
-	int diff = state->has_refs - state->used_refs;
-
-	if (diff)
-		fput_many(state->file, diff);
+	if (state->has_refs)
+		fput_many(state->file, state->has_refs);
 	state->file = NULL;
 }
@@ -2078,7 +2439,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
 
 	if (state->file) {
 		if (state->fd == fd) {
-			state->used_refs++;
+			state->has_refs--;
 			state->ios_left--;
 			return state->file;
 		}
@@ -2089,12 +2450,20 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
 		return NULL;
 
 	state->fd = fd;
-	state->has_refs = state->ios_left;
-	state->used_refs = 1;
 	state->ios_left--;
+	state->has_refs = state->ios_left;
 	return state->file;
 }
 
+static bool io_bdev_nowait(struct block_device *bdev)
+{
+#ifdef CONFIG_BLOCK
+	return !bdev || queue_is_mq(bdev_get_queue(bdev));
+#else
+	return true;
+#endif
+}
+
 /*
  * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
@@ -2104,10 +2473,19 @@ static bool io_file_supports_async(struct file *file, int rw)
 {
 	umode_t mode = file_inode(file)->i_mode;
 
-	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
-		return true;
-	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+	if (S_ISBLK(mode)) {
+		if (io_bdev_nowait(file->f_inode->i_bdev))
+			return true;
+		return false;
+	}
+	if (S_ISCHR(mode) || S_ISSOCK(mode))
 		return true;
+	if (S_ISREG(mode)) {
+		if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
+		    file->f_op != &io_uring_fops)
+			return true;
+		return false;
+	}
 
 	/* any ->read/write should understand O_NONBLOCK */
 	if (file->f_flags & O_NONBLOCK)
@@ -2158,6 +2536,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (kiocb->ki_flags & IOCB_NOWAIT)
 		req->flags |= REQ_F_NOWAIT;
 
+	if (kiocb->ki_flags & IOCB_DIRECT)
+		io_get_req_task(req);
+
 	if (force_nonblock)
 		kiocb->ki_flags |= IOCB_NOWAIT;
@@ -2168,8 +2549,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			return -EINVAL;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
-		req->result = 0;
 		req->iopoll_completed = 0;
+		io_get_req_task(req);
 	} else {
 		if (kiocb->ki_flags & IOCB_HIPRI)
 			return -EINVAL;
@@ -2203,14 +2584,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 	}
 }
 
-static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
+		       struct io_comp_state *cs)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
 	if (req->flags & REQ_F_CUR_POS)
 		req->file->f_pos = kiocb->ki_pos;
 	if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
-		io_complete_rw(kiocb, ret, 0);
+		__io_complete_rw(req, ret, 0, cs);
 	else
 		io_rw_done(kiocb, ret);
 }
@@ -2466,10 +2848,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	if (req->io) {
 		struct io_async_rw *iorw = &req->io->rw;
 
-		*iovec = iorw->iov;
-		iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
-		if (iorw->iov == iorw->fast_iov)
-			*iovec = NULL;
+		iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size);
+		*iovec = NULL;
 		return iorw->size;
 	}
 
@@ -2554,15 +2934,17 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
 			  struct iovec *iovec, struct iovec *fast_iov,
 			  struct iov_iter *iter)
 {
-	req->io->rw.nr_segs = iter->nr_segs;
-	req->io->rw.size = io_size;
-	req->io->rw.iov = iovec;
-	if (!req->io->rw.iov) {
-		req->io->rw.iov = req->io->rw.fast_iov;
-		if (req->io->rw.iov != fast_iov)
-			memcpy(req->io->rw.iov, fast_iov,
+	struct io_async_rw *rw = &req->io->rw;
+
+	rw->nr_segs = iter->nr_segs;
+	rw->size = io_size;
+	if (!iovec) {
+		rw->iov = rw->fast_iov;
+		if (rw->iov != fast_iov)
+			memcpy(rw->iov, fast_iov,
 				sizeof(struct iovec) * iter->nr_segs);
 	} else {
+		rw->iov = iovec;
 		req->flags |= REQ_F_NEED_CLEANUP;
 	}
 }
@@ -2581,107 +2963,239 @@ static int io_alloc_async_ctx(struct io_kiocb *req)
 	return  __io_alloc_async_ctx(req);
 }
 
-static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
-			     struct iovec *iovec, struct iovec *fast_iov,
-			     struct iov_iter *iter)
+static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
+			     struct iovec *iovec, struct iovec *fast_iov,
+			     struct iov_iter *iter)
+{
+	if (!io_op_defs[req->opcode].async_ctx)
+		return 0;
+	if (!req->io) {
+		if (__io_alloc_async_ctx(req))
+			return -ENOMEM;
+
+		io_req_map_rw(req, io_size, iovec, fast_iov, iter);
+	}
+	return 0;
+}
+
+static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
+				   bool force_nonblock)
+{
+	struct io_async_ctx *io = req->io;
+	struct iov_iter iter;
+	ssize_t ret;
+
+	io->rw.iov = io->rw.fast_iov;
+	req->io = NULL;
+	ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock);
+	req->io = io;
+	if (unlikely(ret < 0))
+		return ret;
+
+	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+	return 0;
+}
+
+static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			bool force_nonblock)
+{
+	ssize_t ret;
+
+	ret = io_prep_rw(req, sqe, force_nonblock);
+	if (ret)
+		return ret;
+
+	if (unlikely(!(req->file->f_mode & FMODE_READ)))
+		return -EBADF;
+
+	/* either don't need iovec imported or already have it */
+	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
+		return 0;
+	return io_rw_prep_async(req, READ, force_nonblock);
+}
+
+/*
+ * This is our waitqueue callback handler, registered through lock_page_async()
+ * when we initially tried to do the IO with the iocb armed our waitqueue.
+ * This gets called when the page is unlocked, and we generally expect that to
+ * happen when the page IO is completed and the page is now uptodate. This will
+ * queue a task_work based retry of the operation, attempting to copy the data
+ * again. If the latter fails because the page was NOT uptodate, then we will
+ * do a thread based blocking retry of the operation. That's the unexpected
+ * slow path.
+ */
+static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
+			     int sync, void *arg)
+{
+	struct wait_page_queue *wpq;
+	struct io_kiocb *req = wait->private;
+	struct wait_page_key *key = arg;
+	int ret;
+
+	wpq = container_of(wait, struct wait_page_queue, wait);
+
+	if (!wake_page_match(wpq, key))
+		return 0;
+
+	list_del_init(&wait->entry);
+
+	init_task_work(&req->task_work, io_req_task_submit);
+	percpu_ref_get(&req->ctx->refs);
+
+	/* submit ref gets dropped, acquire a new one */
+	refcount_inc(&req->refs);
+	ret = io_req_task_work_add(req, &req->task_work);
+	if (unlikely(ret)) {
+		struct task_struct *tsk;
+
+		/* queue just for cancelation */
+		init_task_work(&req->task_work, io_req_task_cancel);
+		tsk = io_wq_get_task(req->ctx->io_wq);
+		task_work_add(tsk, &req->task_work, 0);
+		wake_up_process(tsk);
+	}
+	return 1;
+}
+
+static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
+					     struct wait_page_queue *wait,
+					     wait_queue_func_t func,
+					     void *data)
 {
-	if (!io_op_defs[req->opcode].async_ctx)
+	/* Can't support async wakeup with polled IO */
+	if (kiocb->ki_flags & IOCB_HIPRI)
+		return -EINVAL;
+	if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) {
+		wait->wait.func = func;
+		wait->wait.private = data;
+		wait->wait.flags = 0;
+		INIT_LIST_HEAD(&wait->wait.entry);
+		kiocb->ki_flags |= IOCB_WAITQ;
+		kiocb->ki_waitq = wait;
 		return 0;
-	if (!req->io) {
-		if (__io_alloc_async_ctx(req))
-			return -ENOMEM;
-
-		io_req_map_rw(req, io_size, iovec, fast_iov, iter);
 	}
-	return 0;
+
+	return -EOPNOTSUPP;
 }
 
-static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-			bool force_nonblock)
+/*
+ * This controls whether a given IO request should be armed for async page
+ * based retry. If we return false here, the request is handed to the async
+ * worker threads for retry. If we're doing buffered reads on a regular file,
+ * we prepare a private wait_page_queue entry and retry the operation. This
+ * will either succeed because the page is now uptodate and unlocked, or it
+ * will register a callback when the page is unlocked at IO completion. Through
+ * that callback, io_uring uses task_work to setup a retry of the operation.
+ * That retry will attempt the buffered read again. The retry will generally
+ * succeed, or in rare cases where it fails, we then fall back to using the
+ * async worker threads for a blocking retry.
+ */
+static bool io_rw_should_retry(struct io_kiocb *req)
 {
-	struct io_async_ctx *io;
-	struct iov_iter iter;
-	ssize_t ret;
+	struct kiocb *kiocb = &req->rw.kiocb;
+	int ret;
 
-	ret = io_prep_rw(req, sqe, force_nonblock);
-	if (ret)
-		return ret;
+	/* never retry for NOWAIT, we just complete with -EAGAIN */
+	if (req->flags & REQ_F_NOWAIT)
+		return false;
 
-	if (unlikely(!(req->file->f_mode & FMODE_READ)))
-		return -EBADF;
+	/* already tried, or we're doing O_DIRECT */
+	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
+		return false;
+	/*
+	 * just use poll if we can, and don't attempt if the fs doesn't
	 * support callback based unlocks
+	 */
+	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
+		return false;
 
-	/* either don't need iovec imported or already have it */
-	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
-		return 0;
+	/*
+	 * If request type doesn't require req->io to defer in general,
+	 * we need to allocate it here
+	 */
+	if (!req->io && __io_alloc_async_ctx(req))
+		return false;
 
-	io = req->io;
-	io->rw.iov = io->rw.fast_iov;
-	req->io = NULL;
-	ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
-	req->io = io;
-	if (ret < 0)
-		return ret;
+	ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
+						io_async_buf_func, req);
+	if (!ret) {
+		io_get_req_task(req);
+		return true;
+	}
 
-	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
-	return 0;
+	return false;
+}
+
+static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
+{
+	if (req->file->f_op->read_iter)
+		return call_read_iter(req->file, &req->rw.kiocb, iter);
+	else if (req->file->f_op->read)
+		return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
+	else
+		return -EINVAL;
 }
 
-static int io_read(struct io_kiocb *req, bool force_nonblock)
+static int io_read(struct io_kiocb *req, bool force_nonblock,
+		   struct io_comp_state *cs)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
 	struct iov_iter iter;
 	size_t iov_count;
-	ssize_t io_size, ret;
+	ssize_t io_size, ret, ret2;
+	unsigned long nr_segs;
 
 	ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
+	io_size = ret;
+	req->result = io_size;
 
 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)
 		kiocb->ki_flags &= ~IOCB_NOWAIT;
 
-	req->result = 0;
-	io_size = ret;
-	if (req->flags & REQ_F_LINK_HEAD)
-		req->result = io_size;
-
-	/*
-	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
-	 * we know to async punt it even if it was opened O_NONBLOCK
-	 */
+	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, READ))
 		goto copy_iov;
 
 	iov_count = iov_iter_count(&iter);
+	nr_segs = iter.nr_segs;
 	ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
-	if (!ret) {
-		ssize_t ret2;
+	if (unlikely(ret))
+		goto out_free;
 
-		if (req->file->f_op->read_iter)
-			ret2 = call_read_iter(req->file, kiocb, &iter);
-		else
-			ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
+	ret2 = io_iter_do_read(req, &iter);
 
-		/* Catch -EAGAIN return for forced non-blocking submission */
-		if (!force_nonblock || ret2 != -EAGAIN) {
-			kiocb_done(kiocb, ret2);
-		} else {
+	/* Catch -EAGAIN return for forced non-blocking submission */
+	if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
+		kiocb_done(kiocb, ret2, cs);
+	} else {
+		iter.count = iov_count;
+		iter.nr_segs = nr_segs;
 copy_iov:
-			ret = io_setup_async_rw(req, io_size, iovec,
-						inline_vecs, &iter);
-			if (ret)
+		ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
+					&iter);
+		if (ret)
+			goto out_free;
+		/* it's copied and will be cleaned with ->io */
+		iovec = NULL;
+		/* if we can retry, do so with the callbacks armed */
+		if (io_rw_should_retry(req)) {
+			ret2 = io_iter_do_read(req, &iter);
+			if (ret2 == -EIOCBQUEUED) {
 				goto out_free;
-			/* any defer here is final, must blocking retry */
-			if (!(req->flags & REQ_F_NOWAIT) &&
-			    !file_can_poll(req->file))
-				req->flags |= REQ_F_MUST_PUNT;
-			return -EAGAIN;
+			} else if (ret2 != -EAGAIN) {
+				kiocb_done(kiocb, ret2, cs);
+				goto out_free;
+			}
 		}
+		kiocb->ki_flags &= ~IOCB_WAITQ;
+		return -EAGAIN;
 	}
 out_free:
-	if (!(req->flags & REQ_F_NEED_CLEANUP))
+	if (iovec)
 		kfree(iovec);
 	return ret;
 }
@@ -2689,8 +3203,6 @@ out_free:
 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			 bool force_nonblock)
 {
-	struct io_async_ctx *io;
-	struct iov_iter iter;
 	ssize_t ret;
 
 	ret = io_prep_rw(req, sqe, force_nonblock);
@@ -2700,49 +3212,33 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
-	req->fsize = rlimit(RLIMIT_FSIZE);
-
 	/* either don't need iovec imported or already have it */
 	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
-
-	io = req->io;
-	io->rw.iov = io->rw.fast_iov;
-	req->io = NULL;
-	ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
-	req->io = io;
-	if (ret < 0)
-		return ret;
-
-	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
-	return 0;
+	return io_rw_prep_async(req, WRITE, force_nonblock);
}
 
-static int io_write(struct io_kiocb *req, bool force_nonblock)
+static int io_write(struct io_kiocb *req, bool force_nonblock,
+		    struct io_comp_state *cs)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
 	struct iov_iter iter;
 	size_t iov_count;
-	ssize_t ret, io_size;
+	ssize_t ret, ret2, io_size;
+	unsigned long nr_segs;
 
 	ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
+	io_size = ret;
+	req->result = io_size;
 
 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)
 		req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
 
-	req->result = 0;
-	io_size = ret;
-	if (req->flags & REQ_F_LINK_HEAD)
-		req->result = io_size;
-
-	/*
-	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
-	 * we know to async punt it even if it was opened O_NONBLOCK
-	 */
+	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, WRITE))
 		goto copy_iov;
 
@@ -2752,59 +3248,55 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 		goto copy_iov;
 
 	iov_count = iov_iter_count(&iter);
+	nr_segs = iter.nr_segs;
 	ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
-	if (!ret) {
-		ssize_t ret2;
-
-		/*
-		 * Open-code file_start_write here to grab freeze protection,
-		 * which will be released by another thread in
-		 * io_complete_rw(). Fool lockdep by telling it the lock got
-		 * released so that it doesn't complain about the held lock when
-		 * we return to userspace.
- */ - if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; - - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; + if (unlikely(ret)) + goto out_free; - if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, &iter); - else - ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (req->flags & REQ_F_ISREG) { + __sb_start_write(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; - if (!force_nonblock) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, &iter); + else if (req->file->f_op->write) + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + else + ret2 = -EINVAL; - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. - */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - if (!force_nonblock || ret2 != -EAGAIN) { - kiocb_done(kiocb, ret2); - } else { + /* + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. + */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; + if (!force_nonblock || ret2 != -EAGAIN) { + kiocb_done(kiocb, ret2, cs); + } else { + iter.count = iov_count; + iter.nr_segs = nr_segs; copy_iov: - ret = io_setup_async_rw(req, io_size, iovec, - inline_vecs, &iter); - if (ret) - goto out_free; - /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT) && - !file_can_poll(req->file)) - req->flags |= REQ_F_MUST_PUNT; - return -EAGAIN; - } + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, + &iter); + if (ret) + goto out_free; + /* it's copied and will be cleaned with ->io */ + iovec = NULL; + return -EAGAIN; } out_free: - if (!(req->flags & REQ_F_NEED_CLEANUP)) + if (iovec) kfree(iovec); return ret; } @@ -2870,10 +3362,9 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -2907,25 +3398,23 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret != sp->len) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } /* * IORING_OP_NOP just posts a completion event, nothing else. 
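That comment is literal: NOP's only effect is posting a CQE, which with this series flows through __io_req_complete() and the batched io_comp_state shown just below. A hedged userspace sketch that drives it (assumes liburing, not part of this patch):

    #include <liburing.h>
    #include <stdio.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(8, &ring, 0) < 0)
            return 1;
        io_uring_prep_nop(io_uring_get_sqe(&ring));
        io_uring_submit(&ring);
        if (!io_uring_wait_cqe(&ring, &cqe)) {
            printf("nop completed, res=%d\n", cqe->res);  /* always 0 */
            io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return 0;
    }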
*/ -static int io_nop(struct io_kiocb *req) +static int io_nop(struct io_kiocb *req, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - io_cqring_add_event(req, 0); - io_put_req(req); + __io_req_complete(req, 0, 0, cs); return 0; } @@ -2964,8 +3453,7 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock) req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -2980,7 +3468,6 @@ static int io_fallocate_prep(struct io_kiocb *req, req->sync.off = READ_ONCE(sqe->off); req->sync.len = READ_ONCE(sqe->addr); req->sync.mode = READ_ONCE(sqe->len); - req->fsize = rlimit(RLIMIT_FSIZE); return 0; } @@ -2991,15 +3478,11 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock) /* fallocate always requiring blocking context */ if (force_nonblock) return -EAGAIN; - - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3095,8 +3578,7 @@ err: req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3150,7 +3632,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, return i; } -static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3169,8 +3652,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock) io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3228,7 +3710,8 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) return i ? 
i : -ENOMEM; } -static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock) +static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_provide_buf *p = &req->pbuf; struct io_ring_ctx *ctx = req->ctx; @@ -3257,8 +3740,7 @@ out: io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3289,7 +3771,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, #endif } -static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) +static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { #if defined(CONFIG_EPOLL) struct io_epoll *ie = &req->epoll; @@ -3301,8 +3784,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; #else return -EOPNOTSUPP; @@ -3338,8 +3820,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock) ret = do_madvise(ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; #else return -EOPNOTSUPP; @@ -3378,8 +3859,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3418,8 +3898,7 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3450,7 +3929,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_close(struct io_kiocb *req, bool force_nonblock) +static int io_close(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_close *close = &req->close; int ret; @@ -3464,8 +3944,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) /* if the file has a flush method, be safe and punt to async */ if (close->put_file->f_op->flush && force_nonblock) { + /* was never set, but play safe */ + req->flags &= ~REQ_F_NOWAIT; /* avoid grabbing files - we don't need the files */ - req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT; + req->flags |= REQ_F_NO_FILE_TABLE; return -EAGAIN; } @@ -3473,10 +3955,9 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) ret = filp_close(close->put_file, req->work.files); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); fput(close->put_file); close->put_file = NULL; - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3510,8 +3991,7 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) req->sync.flags); if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -3531,6 +4011,15 @@ static int io_setup_async_msg(struct io_kiocb *req, return -EAGAIN; } +static int io_sendmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + iomsg->iov = iomsg->fast_iov; + iomsg->msg.msg_name = &iomsg->addr; + return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, + req->sr_msg.msg_flags, &iomsg->iov); +} + static int io_sendmsg_prep(struct io_kiocb 
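io_add_buffers()/io_provide_buffers() fill the pool that buffer-select requests later draw from under the uring lock. The matching userspace sequence, as a sketch that assumes liburing's io_uring_prep_provide_buffers() helper (group id 7 and the sizes are arbitrary choices for illustration):

    #include <liburing.h>

    static char pool[4 * 1024];   /* 4 buffers of 1024 bytes each */

    /* Hand the kernel 4 provided buffers in group 7, then arm a recv
     * that picks one of them only when data actually arrives. */
    static int arm_buffer_select_recv(struct io_uring *ring, int sockfd)
    {
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        /* addr, per-buffer len, nr buffers, buffer group, first bid */
        io_uring_prep_provide_buffers(sqe, pool, 1024, 4, 7, 0);

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_recv(sqe, sockfd, NULL, 1024, 0);
        sqe->flags |= IOSQE_BUFFER_SELECT;
        sqe->buf_group = 7;

        return io_uring_submit(ring);
    }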
*req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = &req->sr_msg; @@ -3541,7 +4030,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); #ifdef CONFIG_COMPAT @@ -3555,136 +4044,126 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_NEED_CLEANUP) return 0; - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov; - ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, - &io->msg.iov); + ret = io_sendmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_async_ctx io; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - struct io_sr_msg *sr = &req->sr_msg; - - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; + if (unlikely(!sock)) + return ret; - io.msg.iov = io.msg.fast_iov; - ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, - sr->msg_flags, &io.msg.iov); - if (ret) - return ret; - } + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - if (force_nonblock && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; struct socket *sock; + unsigned flags; int ret; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - struct msghdr msg; - struct iovec iov; - unsigned flags; + if (unlikely(!sock)) + return ret; - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, - &msg.msg_iter); - if (ret) - return ret; + ret = import_single_range(WRITE, sr->buf, sr->len, 
&iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int __io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = &req->sr_msg; struct iovec __user *uiov; size_t iov_len; int ret; - ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr, - &uiov, &iov_len); + ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, + &iomsg->uaddr, &uiov, &iov_len); if (ret) return ret; if (req->flags & REQ_F_BUFFER_SELECT) { if (iov_len > 1) return -EINVAL; - if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov))) + if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov))) return -EFAULT; - sr->len = io->msg.iov[0].iov_len; - iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1, + sr->len = iomsg->iov[0].iov_len; + iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1, sr->len); - io->msg.iov = NULL; + iomsg->iov = NULL; } else { ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &io->msg.iov, &io->msg.msg.msg_iter); + &iomsg->iov, &iomsg->msg.msg_iter); if (ret > 0) ret = 0; } @@ -3694,7 +4173,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) #ifdef CONFIG_COMPAT static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_ctx *io) + struct io_async_msghdr *iomsg) { struct compat_msghdr __user *msg_compat; struct io_sr_msg *sr = &req->sr_msg; @@ -3703,8 +4182,8 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, compat_size_t len; int ret; - msg_compat = (struct compat_msghdr __user *) sr->msg; - ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr, + msg_compat = (struct compat_msghdr __user *) sr->umsg; + ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr, &ptr, &len); if (ret) return ret; @@ -3721,12 +4200,12 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, return -EFAULT; if (clen < 0) return -EINVAL; - sr->len = io->msg.iov[0].iov_len; - io->msg.iov = NULL; + sr->len = iomsg->iov[0].iov_len; + iomsg->iov = NULL; } else { ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV, - &io->msg.iov, - &io->msg.msg.msg_iter); + &iomsg->iov, + &iomsg->msg.msg_iter); if (ret < 0) return ret; } @@ -3735,40 +4214,40 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, } #endif -static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) { - io->msg.msg.msg_name = &io->msg.addr; - io->msg.iov = io->msg.fast_iov;
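Both io_sendmsg() and io_send() above now set MSG_DONTWAIT themselves under force_nonblock and punt only on -EAGAIN; the submitter's view is unchanged. For reference, a sketch assuming liburing (the msghdr must stay valid until completion, hence the statics):

    #include <liburing.h>
    #include <string.h>
    #include <sys/socket.h>

    /* Queue one sendmsg on a connected socket. */
    static int queue_sendmsg(struct io_uring *ring, int sockfd,
                             void *data, size_t len)
    {
        static struct iovec iov;
        static struct msghdr msg;
        struct io_uring_sqe *sqe;

        iov.iov_base = data;
        iov.iov_len = len;
        memset(&msg, 0, sizeof(msg));
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;

        sqe = io_uring_get_sqe(ring);
        if (!sqe)
            return -1;
        io_uring_prep_sendmsg(sqe, sockfd, &msg, 0);
        return io_uring_submit(ring);
    }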
+ iomsg->msg.msg_name = &iomsg->addr; + iomsg->iov = iomsg->fast_iov; #ifdef CONFIG_COMPAT if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, io); + return __io_compat_recvmsg_copy_hdr(req, iomsg); #endif - return __io_recvmsg_copy_hdr(req, io); + return __io_recvmsg_copy_hdr(req, iomsg); } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - int *cflags, bool needs_lock) + bool needs_lock) { struct io_sr_msg *sr = &req->sr_msg; struct io_buffer *kbuf; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return NULL; - kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); if (IS_ERR(kbuf)) return kbuf; sr->kbuf = kbuf; req->flags |= REQ_F_BUFFER_SELECTED; - - *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; - *cflags |= IORING_CQE_F_BUFFER; return kbuf; } +static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) +{ + return io_put_kbuf(req, req->sr_msg.kbuf); +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3780,7 +4259,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); - sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->bgid = READ_ONCE(sqe->buf_group); @@ -3795,133 +4274,123 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->flags & REQ_F_NEED_CLEANUP) return 0; - ret = io_recvmsg_copy_hdr(req, io); + ret = io_recvmsg_copy_hdr(req, &io->msg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { - struct io_async_msghdr *kmsg = NULL; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; + struct io_buffer *kbuf; + unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_buffer *kbuf; - struct io_async_ctx io; - unsigned flags; - - if (req->io) { - kmsg = &req->io->msg; - kmsg->msg.msg_name = &req->io->msg.addr; - /* if iov is set, it's allocated already */ - if (!kmsg->iov) - kmsg->iov = kmsg->fast_iov; - kmsg->msg.msg_iter.iov = kmsg->iov; - } else { - kmsg = &io.msg; - kmsg->msg.msg_name = &io.msg.addr; + if (unlikely(!sock)) + return ret; - ret = io_recvmsg_copy_hdr(req, &io); - if (ret) - return ret; - } + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &req->io->msg.addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + ret = io_recvmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); - if (IS_ERR(kbuf)) { + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, !force_nonblock); + if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - } else if (kbuf) { - kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, - 1, req->sr_msg.len); - } + kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov, + 1, req->sr_msg.len); + } - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; - ret = 
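With io_recv_buffer_select() no longer returning cflags, io_put_recv_kbuf() is what encodes the consumed buffer id into the CQE. Userspace decodes it from cqe->flags; a sketch (assumes liburing headers for the IORING_* constants):

    #include <liburing.h>
    #include <stdio.h>

    /* Report which provided buffer a buffer-select recv landed in. */
    static void handle_recv_cqe(struct io_uring *ring,
                                struct io_uring_cqe *cqe)
    {
        if (cqe->res >= 0 && (cqe->flags & IORING_CQE_F_BUFFER)) {
            unsigned int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

            printf("%d bytes in buffer id %u\n", cqe->res, bid);
        }
        io_uring_cqe_seen(ring, cqe);
    }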
__sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, - kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) { - ret = io_setup_async_msg(req, kmsg); - if (ret != -EAGAIN) - kfree(kbuf); - return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (kbuf) - kfree(kbuf); - } + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, + kmsg->uaddr, flags); + if (force_nonblock && ret == -EAGAIN) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags, cs); return 0; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { - struct io_buffer *kbuf = NULL; + struct io_buffer *kbuf; + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + void __user *buf = sr->buf; struct socket *sock; + struct iovec iov; + unsigned flags; int ret, cflags = 0; sock = sock_from_file(req->file, &ret); - if (sock) { - struct io_sr_msg *sr = &req->sr_msg; - void __user *buf = sr->buf; - struct msghdr msg; - struct iovec iov; - unsigned flags; + if (unlikely(!sock)) + return ret; - kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock); + if (req->flags & REQ_F_BUFFER_SELECT) { + kbuf = io_recv_buffer_select(req, !force_nonblock); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - else if (kbuf) - buf = u64_to_user_ptr(kbuf->addr); + buf = u64_to_user_ptr(kbuf->addr); + } - ret = import_single_range(READ, buf, sr->len, &iov, - &msg.msg_iter); - if (ret) { - kfree(kbuf); - return ret; - } + ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + goto out_free; - req->flags |= REQ_F_NEED_CLEANUP; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_iocb = NULL; - msg.msg_flags = 0; - - flags = req->sr_msg.msg_flags; - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - ret = sock_recvmsg(sock, &msg, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - } + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; - kfree(kbuf); - req->flags &= ~REQ_F_NEED_CLEANUP; - __io_cqring_add_event(req, ret, cflags); + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = sock_recvmsg(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; +out_free: + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + __io_req_complete(req, ret, cflags, cs); return 0; } @@ -3941,7 +4410,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_accept *accept = &req->accept; unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; @@ -3960,8 +4430,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock) ret = -EINTR; req_set_fail_links(req); } - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -3985,7 +4454,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) &io->connect.address); } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_async_ctx __io, *io; unsigned file_flags; @@ -4021,8 +4491,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock) out: if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } #else /* !CONFIG_NET */ @@ -4031,12 +4500,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_sendmsg(struct io_kiocb *req, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_send(struct io_kiocb *req, bool force_nonblock) +static int io_send(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4047,12 +4518,14 @@ static int io_recvmsg_prep(struct io_kiocb *req, return -EOPNOTSUPP; } -static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } -static int io_recv(struct io_kiocb *req, bool force_nonblock) +static int io_recv(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4062,7 +4535,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } @@ -4072,45 +4546,22 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EOPNOTSUPP; } -static int io_connect(struct io_kiocb *req, bool force_nonblock) +static int io_connect(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { return -EOPNOTSUPP; } #endif /* CONFIG_NET */ -struct io_poll_table { - struct poll_table_struct pt; - struct io_kiocb *req; - int error; -}; - -static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) -{ - struct task_struct *tsk = req->task; - struct io_ring_ctx *ctx = req->ctx; - int ret, notify = TWA_RESUME; - - /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. - * If we're not using an eventfd, then TWA_RESUME is always fine, - * as we won't have dependencies between request completions for - * other kernel wait conditions. 
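io_accept() above tries the operation with O_NONBLOCK in file_flags first and returns -EAGAIN (and thus punts) only when the listen queue is empty. The corresponding submission, sketched with liburing (an assumption, as above):

    #include <liburing.h>
    #include <sys/socket.h>

    /* Queue an async accept on a listening socket. */
    static int queue_accept(struct io_uring *ring, int listen_fd,
                            struct sockaddr_storage *ss, socklen_t *len)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
            return -1;
        *len = sizeof(*ss);
        io_uring_prep_accept(sqe, listen_fd, (struct sockaddr *)ss,
                             len, 0);
        return io_uring_submit(ring);
    }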
- */ - if (ctx->flags & IORING_SETUP_SQPOLL) - notify = 0; - else if (ctx->cq_ev_fd) - notify = TWA_SIGNAL; - - ret = task_work_add(tsk, cb, notify); - if (!ret) - wake_up_process(tsk); - return ret; -} +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int error; +}; static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { - struct task_struct *tsk; int ret; /* for instances that support it check for an event match first: */ @@ -4121,9 +4572,10 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, list_del_init(&poll->wait.entry); - tsk = req->task; req->result = mask; init_task_work(&req->task_work, func); + percpu_ref_get(&req->ctx->refs); + /* * If this fails, then the task is exiting. When a task exits, the * work gets canceled, so just cancel this request as well instead @@ -4132,6 +4584,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, */ ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { + struct task_struct *tsk; + WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); task_work_add(tsk, &req->task_work, 0); @@ -4200,7 +4654,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); req->flags |= REQ_F_COMP_LOCKED; - io_put_req_find_next(req, nxt); + *nxt = io_put_req_find_next(req); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -4209,16 +4663,13 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) static void io_poll_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); - if (nxt) { - struct io_ring_ctx *ctx = nxt->ctx; - - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(nxt, NULL); - mutex_unlock(&ctx->uring_lock); - } + if (nxt) + __io_req_task_submit(nxt); + percpu_ref_put(&ctx->refs); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, @@ -4288,7 +4739,11 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, pt->error = 0; poll->head = head; - add_wait_queue(head, &poll->wait); + + if (poll->events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(head, &poll->wait); + else + add_wait_queue(head, &poll->wait); } static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, @@ -4300,77 +4755,35 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } - - return 0; -} - static void io_async_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; - bool canceled = false; trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); if (io_poll_rewait(req, &apoll->poll)) { spin_unlock_irq(&ctx->completion_lock); + 
percpu_ref_put(&ctx->refs); return; } /* If req is still hashed, it cannot have been canceled. Don't check. */ - if (hash_hashed(&req->hash_node)) { + if (hash_hashed(&req->hash_node)) hash_del(&req->hash_node); - } else { - canceled = READ_ONCE(apoll->poll.canceled); - if (canceled) { - io_cqring_fill_event(req, -ECANCELED); - io_commit_cqring(ctx); - } - } io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - /* restore ->work in case we need to retry again */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); + if (!READ_ONCE(apoll->poll.canceled)) + __io_req_task_submit(req); + else + __io_req_task_cancel(req, -ECANCELED); + + percpu_ref_put(&ctx->refs); kfree(apoll->double_poll); kfree(apoll); - - if (!canceled) { - __set_current_state(TASK_RUNNING); - if (io_sq_thread_acquire_mm(ctx, req)) { - io_cqring_add_event(req, -EFAULT); - goto end_req; - } - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL); - mutex_unlock(&ctx->uring_lock); - } else { - io_cqring_ev_posted(ctx); -end_req: - req_set_fail_links(req); - io_double_put_req(req); - } } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -4403,8 +4816,8 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, struct io_ring_ctx *ctx = req->ctx; bool cancel = false; - poll->file = req->file; io_init_poll_iocb(poll, mask, wake_func); + poll->file = req->file; poll->wait.private = req; ipt->pt._key = mask; @@ -4444,7 +4857,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (!req->file || !file_can_poll(req->file)) return false; - if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED)) + if (req->flags & REQ_F_POLLED) return false; if (!def->pollin && !def->pollout) return false; @@ -4455,9 +4868,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&apoll->work, &req->work, sizeof(req->work)); - io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4476,8 +4886,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (ret) { io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); return false; @@ -4520,14 +4928,6 @@ static bool io_poll_remove_one(struct io_kiocb *req) do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { io_put_req(req); - /* - * restore ->work because we will call - * io_req_work_drop_env below when dropping the - * final reference. 
- */ - if (req->flags & REQ_F_WORK_INITIALIZED) - memcpy(&req->work, &apoll->work, - sizeof(req->work)); kfree(apoll->double_poll); kfree(apoll); } @@ -4608,10 +5008,9 @@ static int io_poll_remove(struct io_kiocb *req) ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); - io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); - io_put_req(req); + io_req_complete(req, ret); return 0; } @@ -4635,7 +5034,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - u16 events; + u32 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4644,8 +5043,12 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (!poll->file) return -EBADF; - events = READ_ONCE(sqe->poll_events); - poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP | + (events & EPOLLEXCLUSIVE); io_get_req_task(req); return 0; @@ -4659,7 +5062,6 @@ static int io_poll_add(struct io_kiocb *req) __poll_t mask; INIT_HLIST_NODE(&req->hash_node); - INIT_LIST_HEAD(&req->list); ipt.pt._qproc = io_poll_queue_proc; mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events, @@ -4686,15 +5088,16 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - atomic_inc(&ctx->cq_timeouts); - spin_lock_irqsave(&ctx->completion_lock, flags); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + /* * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it. */ - if (!list_empty(&req->list)) - list_del_init(&req->list); + if (!list_empty(&req->timeout.list)) + list_del_init(&req->timeout.list); io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); @@ -4711,9 +5114,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) struct io_kiocb *req; int ret = -ENOENT; - list_for_each_entry(req, &ctx->timeout_list, list) { + list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (user_data == req->user_data) { - list_del_init(&req->list); + list_del_init(&req->timeout.list); ret = 0; break; } @@ -4727,6 +5130,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return -EALREADY; req_set_fail_links(req); + req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; @@ -4795,7 +5199,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = &req->io->timeout; data->req = req; - req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; @@ -4823,8 +5226,7 @@ static int io_timeout(struct io_kiocb *req) * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ - if (!off) { - req->flags |= REQ_F_TIMEOUT_NOSEQ; + if (io_is_timeout_noseq(req)) { entry = ctx->timeout_list.prev; goto add; } @@ -4837,16 +5239,17 @@ static int io_timeout(struct io_kiocb *req) * the one we need first. 
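io_poll_add_prep() above now reads the full 32-bit sqe->poll32_events (halfword-swapped on big-endian so the old 16-bit layout still lines up) and preserves EPOLLEXCLUSIVE, which __io_queue_proc() maps to add_wait_queue_exclusive(). From userspace this looks like the sketch below (assumes liburing; the exclusive semantics require a kernel with this change):

    #include <liburing.h>
    #include <sys/epoll.h>

    /* Exclusive-wakeup poll: with many waiters polling the same fd,
     * only one is woken per event instead of all of them. */
    static int queue_exclusive_poll(struct io_uring *ring, int fd)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
            return -1;
        io_uring_prep_poll_add(sqe, fd, EPOLLIN | EPOLLEXCLUSIVE);
        return io_uring_submit(ring);
    }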
*/ list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, + timeout.list); - if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) + if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ if (off >= nxt->timeout.target_seq - tail) break; } add: - list_add(&req->list, entry); + list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); @@ -4950,7 +5353,8 @@ static int io_files_update_prep(struct io_kiocb *req, return 0; } -static int io_files_update(struct io_kiocb *req, bool force_nonblock) +static int io_files_update(struct io_kiocb *req, bool force_nonblock, + struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_files_update up; @@ -4968,8 +5372,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock) if (ret < 0) req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + __io_req_complete(req, ret, 0, cs); return 0; } @@ -4981,15 +5384,11 @@ static int io_req_defer_prep(struct io_kiocb *req, if (!sqe) return 0; - io_req_init_async(req); - - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (unlikely(ret)) - return ret; - } - - io_req_work_grab_env(req, &io_op_defs[req->opcode]); + if (io_alloc_async_ctx(req)) + return -EAGAIN; + ret = io_prep_work_files(req); + if (unlikely(ret)) + return ret; switch (req->opcode) { case IORING_OP_NOP: @@ -5091,86 +5490,117 @@ static int io_req_defer_prep(struct io_kiocb *req, return ret; } +static u32 io_get_sequence(struct io_kiocb *req) +{ + struct io_kiocb *pos; + struct io_ring_ctx *ctx = req->ctx; + u32 total_submitted, nr_reqs = 1; + + if (req->flags & REQ_F_LINK_HEAD) + list_for_each_entry(pos, &req->link_list, link_list) + nr_reqs++; + + total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; + return total_submitted - nr_reqs; +} + static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; + struct io_defer_entry *de; int ret; + u32 seq; /* Still need defer if there is pending req in defer list. 
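io_timeout() inserts the request by completion sequence, walking timeout_list from the back; a pure timer (off == 0, io_is_timeout_noseq()) skips the ordering entirely. Both variants are driven by the same SQE; a sketch assuming liburing:

    #include <liburing.h>

    /* Complete with -ETIME after 2s, or earlier once 8 CQEs have been
     * posted (count == 0 would make this a pure timer instead). */
    static int queue_timeout(struct io_uring *ring)
    {
        static struct __kernel_timespec ts = { .tv_sec = 2 };
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
            return -1;
        io_uring_prep_timeout(sqe, &ts, 8, 0);
        return io_uring_submit(ring);
    }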
*/ - if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list)) + if (likely(list_empty_careful(&ctx->defer_list) && + !(req->flags & REQ_F_IO_DRAIN))) + return 0; + + seq = io_get_sequence(req); + /* Still a chance to pass the sequence check */ + if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; if (!req->io) { - if (io_alloc_async_ctx(req)) - return -EAGAIN; ret = io_req_defer_prep(req, sqe); - if (ret < 0) + if (ret) return ret; } + io_prep_async_link(req); + de = kmalloc(sizeof(*de), GFP_KERNEL); + if (!de) + return -ENOMEM; spin_lock_irq(&ctx->completion_lock); - if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { + if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); - return 0; + kfree(de); + io_queue_async_work(req); + return -EIOCBQUEUED; } trace_io_uring_defer(ctx, req, req->user_data); - list_add_tail(&req->list, &ctx->defer_list); + de->req = req; + de->seq = seq; + list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; } -static void io_cleanup_req(struct io_kiocb *req) +static void __io_clean_op(struct io_kiocb *req) { struct io_async_ctx *io = req->io; - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - if (req->flags & REQ_F_BUFFER_SELECTED) + if (req->flags & REQ_F_BUFFER_SELECTED) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: kfree((void *)(unsigned long)req->rw.addr); - /* fallthrough */ - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - if (io->rw.iov != io->rw.fast_iov) - kfree(io->rw.iov); - break; - case IORING_OP_RECVMSG: - if (req->flags & REQ_F_BUFFER_SELECTED) - kfree(req->sr_msg.kbuf); - /* fallthrough */ - case IORING_OP_SENDMSG: - if (io->msg.iov != io->msg.fast_iov) - kfree(io->msg.iov); - break; - case IORING_OP_RECV: - if (req->flags & REQ_F_BUFFER_SELECTED) + break; + case IORING_OP_RECVMSG: + case IORING_OP_RECV: kfree(req->sr_msg.kbuf); - break; - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - break; - case IORING_OP_SPLICE: - case IORING_OP_TEE: - io_put_file(req, req->splice.file_in, - (req->splice.flags & SPLICE_F_FD_IN_FIXED)); - break; + break; + } + req->flags &= ~REQ_F_BUFFER_SELECTED; + } + + if (req->flags & REQ_F_NEED_CLEANUP) { + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + if (io->rw.iov != io->rw.fast_iov) + kfree(io->rw.iov); + break; + case IORING_OP_RECVMSG: + case IORING_OP_SENDMSG: + if (io->msg.iov != io->msg.fast_iov) + kfree(io->msg.iov); + break; + case IORING_OP_SPLICE: + case IORING_OP_TEE: + io_put_file(req, req->splice.file_in, + (req->splice.flags & SPLICE_F_FD_IN_FIXED)); + break; + } + req->flags &= ~REQ_F_NEED_CLEANUP; } - - req->flags &= ~REQ_F_NEED_CLEANUP; } static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - bool force_nonblock) + bool force_nonblock, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; switch (req->opcode) { case IORING_OP_NOP: - ret = io_nop(req); + ret = io_nop(req, cs); break; case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -5180,7 +5610,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_read(req, force_nonblock); + ret = io_read(req, force_nonblock, cs); break; case 
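io_get_sequence() above treats an entire link as one unit of submitted work, so a drained request waits for everything queued before it without over-counting its own chain. Userspace requests that barrier per SQE with IOSQE_IO_DRAIN (liburing sketch, as above an assumption):

    #include <liburing.h>

    /* fsync that runs only after all previously submitted requests
     * have completed, and holds back later ones until it finishes. */
    static int queue_drained_fsync(struct io_uring *ring, int fd)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
            return -1;
        io_uring_prep_fsync(sqe, fd, 0);
        sqe->flags |= IOSQE_IO_DRAIN;
        return io_uring_submit(ring);
    }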
IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: @@ -5190,7 +5620,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret < 0) break; } - ret = io_write(req, force_nonblock); + ret = io_write(req, force_nonblock, cs); break; case IORING_OP_FSYNC: if (sqe) { @@ -5232,9 +5662,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_SENDMSG) - ret = io_sendmsg(req, force_nonblock); + ret = io_sendmsg(req, force_nonblock, cs); else - ret = io_send(req, force_nonblock); + ret = io_send(req, force_nonblock, cs); break; case IORING_OP_RECVMSG: case IORING_OP_RECV: @@ -5244,9 +5674,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; } if (req->opcode == IORING_OP_RECVMSG) - ret = io_recvmsg(req, force_nonblock); + ret = io_recvmsg(req, force_nonblock, cs); else - ret = io_recv(req, force_nonblock); + ret = io_recv(req, force_nonblock, cs); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -5270,7 +5700,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_accept(req, force_nonblock); + ret = io_accept(req, force_nonblock, cs); break; case IORING_OP_CONNECT: if (sqe) { @@ -5278,7 +5708,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_connect(req, force_nonblock); + ret = io_connect(req, force_nonblock, cs); break; case IORING_OP_ASYNC_CANCEL: if (sqe) { @@ -5310,7 +5740,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_close(req, force_nonblock); + ret = io_close(req, force_nonblock, cs); break; case IORING_OP_FILES_UPDATE: if (sqe) { @@ -5318,7 +5748,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_files_update(req, force_nonblock); + ret = io_files_update(req, force_nonblock, cs); break; case IORING_OP_STATX: if (sqe) { @@ -5358,7 +5788,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_epoll_ctl(req, force_nonblock); + ret = io_epoll_ctl(req, force_nonblock, cs); break; case IORING_OP_SPLICE: if (sqe) { @@ -5374,7 +5804,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_provide_buffers(req, force_nonblock); + ret = io_provide_buffers(req, force_nonblock, cs); break; case IORING_OP_REMOVE_BUFFERS: if (sqe) { @@ -5382,7 +5812,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) break; } - ret = io_remove_buffers(req, force_nonblock); + ret = io_remove_buffers(req, force_nonblock, cs); break; case IORING_OP_TEE: if (sqe) { @@ -5417,25 +5847,15 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } -static void io_arm_async_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link; - - /* link head's timeout is queued in io_queue_async_work() */ - if (!(req->flags & REQ_F_QUEUE_TIMEOUT)) - return; - - link = list_first_entry(&req->link_list, struct io_kiocb, link_list); - io_queue_linked_timeout(link); -} - -static void io_wq_submit_work(struct io_wq_work **workptr) +static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) { - struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *timeout; int ret = 0; - io_arm_async_linked_timeout(req); + timeout = 
io_prep_linked_timeout(req); + if (timeout) + io_queue_linked_timeout(timeout); /* if NO_CANCEL is set, we must still run the work */ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == @@ -5445,7 +5865,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (!ret) { do { - ret = io_issue_sqe(req, NULL, false); + ret = io_issue_sqe(req, NULL, false, NULL); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -5459,11 +5879,10 @@ static void io_wq_submit_work(struct io_wq_work **workptr) if (ret) { req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); + io_req_complete(req, ret); } - io_steal_work(req, workptr); + return io_steal_work(req); } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -5520,6 +5939,8 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + io_req_init_async(req); + if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) return 0; if (!ctx->ring_file) @@ -5545,6 +5966,13 @@ static int io_grab_files(struct io_kiocb *req) return ret; } +static inline int io_prep_work_files(struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].file_table) + return 0; + return io_grab_files(req); +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5577,21 +6005,17 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); io_put_req(prev); } else { - io_cqring_add_event(req, -ETIME); - io_put_req(req); + io_req_complete(req, -ETIME); } return HRTIMER_NORESTART; } -static void io_queue_linked_timeout(struct io_kiocb *req) +static void __io_queue_linked_timeout(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - /* * If the list is now empty, then our linked request finished before * we got a chance to setup the timer */ - spin_lock_irq(&ctx->completion_lock); if (!list_empty(&req->link_list)) { struct io_timeout_data *data = &req->io->timeout; @@ -5599,6 +6023,14 @@ static void io_queue_linked_timeout(struct io_kiocb *req) hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); } +} + +static void io_queue_linked_timeout(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->completion_lock); + __io_queue_linked_timeout(req); spin_unlock_irq(&ctx->completion_lock); /* drop submission reference */ @@ -5611,8 +6043,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK_HEAD)) return NULL; - /* for polled retry, if flag is set, we already went through here */ - if (req->flags & REQ_F_POLLED) + if (req->flags & REQ_F_LINK_TIMEOUT) return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, @@ -5624,7 +6055,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return nxt; } -static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt; @@ -5644,54 +6076,45 @@ again: old_creds = override_creds(req->work.creds); } - ret = io_issue_sqe(req, sqe, true); + ret = io_issue_sqe(req, sqe, true, cs); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (ret == -EAGAIN && 
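io_wq_submit_work() above arms a pending linked timeout through io_prep_linked_timeout() before executing the work item, so the deadline also covers the blocking execution. The pair is built in userspace as an SQE link (sketch, assumes liburing):

    #include <liburing.h>

    /* Read with a 1s deadline: if the timer fires first, the read is
     * canceled and completes with -ECANCELED. */
    static int queue_read_with_deadline(struct io_uring *ring, int fd,
                                        struct iovec *iov)
    {
        static struct __kernel_timespec ts = { .tv_sec = 1 };
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_readv(sqe, fd, iov, 1, 0);
        sqe->flags |= IOSQE_IO_LINK;

        sqe = io_uring_get_sqe(ring);
        io_uring_prep_link_timeout(sqe, &ts, 0);

        return io_uring_submit(ring);
    }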
(!(req->flags & REQ_F_NOWAIT) || - (req->flags & REQ_F_MUST_PUNT))) { - if (io_arm_poll_handler(req)) { - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - goto exit; - } + if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + if (!io_arm_poll_handler(req)) { punt: - io_req_init_async(req); - - if (io_op_defs[req->opcode].file_table) { - ret = io_grab_files(req); - if (ret) + ret = io_prep_work_files(req); + if (unlikely(ret)) goto err; + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); } - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); goto exit; } + if (unlikely(ret)) { err: - nxt = NULL; - /* drop submission reference */ - io_put_req_find_next(req, &nxt); - - if (linked_timeout) { - if (!ret) - io_queue_linked_timeout(linked_timeout); - else - io_put_req(linked_timeout); - } - - /* and drop final reference, if we failed */ - if (ret) { - io_cqring_add_event(req, ret); + /* un-prep timeout, so it'll be killed as any other linked */ + req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); io_put_req(req); + io_req_complete(req, ret); + goto exit; } + + /* drop submission reference */ + nxt = io_put_req_find_next(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); + if (nxt) { req = nxt; @@ -5704,7 +6127,8 @@ exit: revert_creds(old_creds); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_comp_state *cs) { int ret; @@ -5712,17 +6136,14 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) { if (ret != -EIOCBQUEUED) { fail_req: - io_cqring_add_event(req, ret); req_set_fail_links(req); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { if (!req->io) { - ret = -EAGAIN; - if (io_alloc_async_ctx(req)) - goto fail_req; ret = io_req_defer_prep(req, sqe); - if (unlikely(ret < 0)) + if (unlikely(ret)) goto fail_req; } @@ -5734,21 +6155,22 @@ fail_req: req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { - __io_queue_sqe(req, sqe); + __io_queue_sqe(req, sqe, cs); } } -static inline void io_queue_link_head(struct io_kiocb *req) +static inline void io_queue_link_head(struct io_kiocb *req, + struct io_comp_state *cs) { if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - io_cqring_add_event(req, -ECANCELED); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, -ECANCELED); } else - io_queue_sqe(req, NULL); + io_queue_sqe(req, NULL, cs); } static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **link) + struct io_kiocb **link, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5774,21 +6196,19 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) { + if (unlikely(ret)) { /* fail even hard links since we don't submit */ head->flags |= REQ_F_FAIL_LINK; return ret; } trace_io_uring_link(ctx, req, head); + io_get_req_task(req); list_add_tail(&req->link_list, &head->link_list); /* last request of a link, 
enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_link_head(head); + io_queue_link_head(head, cs); *link = NULL; } } else { @@ -5800,15 +6220,12 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_LINK_HEAD; INIT_LIST_HEAD(&req->link_list); - if (io_alloc_async_ctx(req)) - return -EAGAIN; - ret = io_req_defer_prep(req, sqe); - if (ret) + if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; *link = req; } else { - io_queue_sqe(req, sqe); + io_queue_sqe(req, sqe, cs); } } @@ -5820,6 +6237,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, */ static void io_submit_state_end(struct io_submit_state *state) { + if (!list_empty(&state->comp.list)) + io_submit_flush_completions(&state->comp); blk_finish_plug(&state->plug); io_state_file_put(state); if (state->free_reqs) @@ -5830,9 +6249,15 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, - unsigned int max_ios) + struct io_ring_ctx *ctx, unsigned int max_ios) { blk_start_plug(&state->plug); +#ifdef CONFIG_BLOCK + state->plug.nowait = true; +#endif + state->comp.nr = 0; + INIT_LIST_HEAD(&state->comp.list); + state->comp.ctx = ctx; state->free_reqs = 0; state->file = NULL; state->ios_left = max_ios; @@ -5897,12 +6322,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags; int id; - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped; req->opcode = READ_ONCE(sqe->opcode); req->user_data = READ_ONCE(sqe->user_data); req->io = NULL; @@ -5950,7 +6369,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, struct file *ring_file, int ring_fd) { - struct io_submit_state state, *statep = NULL; + struct io_submit_state state; struct io_kiocb *link = NULL; int i, submitted = 0; @@ -5967,10 +6386,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, nr); - statep = &state; - } + io_submit_state_start(&state, ctx, nr); ctx->ring_fd = ring_fd; ctx->ring_file = ring_file; @@ -5985,28 +6401,28 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, io_consume_sqe(ctx); break; } - req = io_alloc_req(ctx, statep); + req = io_alloc_req(ctx, &state); if (unlikely(!req)) { if (!submitted) submitted = -EAGAIN; break; } - err = io_init_req(ctx, req, sqe, statep); + err = io_init_req(ctx, req, sqe, &state); io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; if (unlikely(err)) { fail_req: - io_cqring_add_event(req, err); - io_double_put_req(req); + io_put_req(req); + io_req_complete(req, err); break; } trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, true, io_async_submit(ctx)); - err = io_submit_sqe(req, sqe, &link); + err = io_submit_sqe(req, sqe, &link, &state.comp); if (err) goto fail_req; } @@ -6017,9 +6433,8 @@ fail_req: percpu_ref_put_many(&ctx->refs, nr - ref_used); } if (link) - io_queue_link_head(link); - if (statep) - io_submit_state_end(&state); + io_queue_link_head(link, &state.comp); + io_submit_state_end(&state); /* Commit SQ ring head 
once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -6027,6 +6442,21 @@ fail_req: return submitted; } +static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) +{ + /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + +static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); +} + static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; @@ -6043,12 +6473,12 @@ static int io_sq_thread(void *data) while (!kthread_should_park()) { unsigned int to_submit; - if (!list_empty(&ctx->poll_list)) { + if (!list_empty(&ctx->iopoll_list)) { unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list)) - io_iopoll_getevents(ctx, &nr_events, 0); + if (!list_empty(&ctx->iopoll_list) && !need_resched()) + io_do_iopoll(ctx, &nr_events, 0); else timeout = jiffies + ctx->sq_thread_idle; mutex_unlock(&ctx->uring_lock); @@ -6067,7 +6497,7 @@ static int io_sq_thread(void *data) * adding ourselves to the waitqueue, as the unuse/drop * may sleep. */ - io_sq_thread_drop_mm(ctx); + io_sq_thread_drop_mm(); /* * We're polling. If we're within the defined idle @@ -6076,11 +6506,10 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (!list_empty(&ctx->poll_list) || need_resched() || + if (!list_empty(&ctx->iopoll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { - if (current->task_works) - task_work_run(); + io_run_task_work(); cond_resched(); continue; } @@ -6090,21 +6519,18 @@ static int io_sq_thread(void *data) /* * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to poll_list, it - * is because reqs may have been punted to io worker and - * will be added to poll_list later, hence check the - * poll_list again. + * to check if there are new reqs added to iopoll_list, + * it is because reqs may have been punted to io worker + * and will be added to iopoll_list later, hence check + * the iopoll_list again. 
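io_ring_set_wakeup_flag() and io_ring_clear_wakeup_flag() above centralize the IORING_SQ_NEED_WAKEUP handshake with userspace. The submitter's half of that contract, shown with the raw syscall (a sketch: liburing performs the same check inside io_uring_submit(), and sq_kflags here stands for the mmap'ed sq_flags pointer):

    #include <linux/io_uring.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* After writing SQEs to an SQPOLL ring: enter the kernel only if
     * the polling thread went to sleep and asked to be woken. */
    static void sq_wakeup_if_needed(int ring_fd, unsigned int *sq_kflags)
    {
        unsigned int flags = __atomic_load_n(sq_kflags, __ATOMIC_ACQUIRE);

        if (flags & IORING_SQ_NEED_WAKEUP)
            syscall(__NR_io_uring_enter, ring_fd, 0, 0,
                    IORING_ENTER_SQ_WAKEUP, NULL, 0);
    }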
@@ -6090,21 +6519,18 @@
 
 			/*
 			 * While doing polled IO, before going to sleep, we need
-			 * to check if there are new reqs added to poll_list, it
-			 * is because reqs may have been punted to io worker and
-			 * will be added to poll_list later, hence check the
-			 * poll_list again.
+			 * to check if there are new reqs added to iopoll_list,
+			 * it is because reqs may have been punted to io worker
+			 * and will be added to iopoll_list later, hence check
+			 * the iopoll_list again.
 			 */
 			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
-			    !list_empty_careful(&ctx->poll_list)) {
+			    !list_empty_careful(&ctx->iopoll_list)) {
 				finish_wait(&ctx->sqo_wait, &wait);
 				continue;
 			}
 
-			/* Tell userspace we may need a wakeup call */
-			spin_lock_irq(&ctx->completion_lock);
-			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
-			spin_unlock_irq(&ctx->completion_lock);
+			io_ring_set_wakeup_flag(ctx);
 
 			to_submit = io_sqring_entries(ctx);
 			if (!to_submit || ret == -EBUSY) {
@@ -6112,9 +6538,9 @@
 					finish_wait(&ctx->sqo_wait, &wait);
 					break;
 				}
-				if (current->task_works) {
-					task_work_run();
+				if (io_run_task_work()) {
 					finish_wait(&ctx->sqo_wait, &wait);
+					io_ring_clear_wakeup_flag(ctx);
 					continue;
 				}
 				if (signal_pending(current))
@@ -6122,17 +6548,13 @@
 				schedule();
 				finish_wait(&ctx->sqo_wait, &wait);
 
-				spin_lock_irq(&ctx->completion_lock);
-				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
-				spin_unlock_irq(&ctx->completion_lock);
+				io_ring_clear_wakeup_flag(ctx);
 				ret = 0;
 				continue;
 			}
 			finish_wait(&ctx->sqo_wait, &wait);
 
-			spin_lock_irq(&ctx->completion_lock);
-			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
-			spin_unlock_irq(&ctx->completion_lock);
+			io_ring_clear_wakeup_flag(ctx);
 		}
 
 		mutex_lock(&ctx->uring_lock);
@@ -6142,10 +6564,9 @@
 		timeout = jiffies + ctx->sq_thread_idle;
 	}
 
-	if (current->task_works)
-		task_work_run();
+	io_run_task_work();
 
-	io_sq_thread_drop_mm(ctx);
+	io_sq_thread_drop_mm();
 	revert_creds(old_cred);
 
 	kthread_parkme();
@@ -6208,9 +6629,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	do {
 		if (io_cqring_events(ctx, false) >= min_events)
 			return 0;
-		if (!current->task_works)
+		if (!io_run_task_work())
 			break;
-		task_work_run();
 	} while (1);
 
 	if (sig) {
@@ -6232,8 +6652,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
						TASK_INTERRUPTIBLE);
 		/* make sure we run task_work before checking for signals */
-		if (current->task_works)
-			task_work_run();
+		if (io_run_task_work())
+			continue;
 		if (signal_pending(current)) {
 			if (current->jobctl & JOBCTL_TASK_WORK) {
 				spin_lock_irq(&current->sighand->siglock);
@@ -7019,17 +7439,21 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 	return 0;
err:
 	io_finish_async(ctx);
-	mmdrop(ctx->sqo_mm);
-	ctx->sqo_mm = NULL;
+	if (ctx->sqo_mm) {
+		mmdrop(ctx->sqo_mm);
+		ctx->sqo_mm = NULL;
+	}
 	return ret;
 }
 
-static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+static inline void __io_unaccount_mem(struct user_struct *user,
+				      unsigned long nr_pages)
 {
 	atomic_long_sub(nr_pages, &user->locked_vm);
 }
 
-static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+static inline int __io_account_mem(struct user_struct *user,
+				   unsigned long nr_pages)
 {
 	unsigned long page_limit, cur_pages, new_pages;
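
Every "- if (current->task_works) task_work_run();" pair in this patch collapses into io_run_task_work(). The helper's definition is not visible in the hunks shown here; the following is a plausible reconstruction (kernel context, a sketch rather than quoted source) that also shows why it returns bool -- io_cqring_wait() above uses the return value to decide whether to loop:

/* sketch of the helper; assumes <linux/sched.h> and <linux/task_work.h> */
static inline bool io_run_task_work(void)
{
	if (current->task_works) {
		/* task_work may block; don't run it while marked asleep */
		__set_current_state(TASK_RUNNING);
		task_work_run();
		return true;
	}
	return false;
}
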
@@ -7047,6 +7471,41 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
 	return 0;
 }
 
+static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
+			     enum io_mem_account acct)
+{
+	if (ctx->limit_mem)
+		__io_unaccount_mem(ctx->user, nr_pages);
+
+	if (ctx->sqo_mm) {
+		if (acct == ACCT_LOCKED)
+			ctx->sqo_mm->locked_vm -= nr_pages;
+		else if (acct == ACCT_PINNED)
+			atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
+	}
+}
+
+static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
+			  enum io_mem_account acct)
+{
+	int ret;
+
+	if (ctx->limit_mem) {
+		ret = __io_account_mem(ctx->user, nr_pages);
+		if (ret)
+			return ret;
+	}
+
+	if (ctx->sqo_mm) {
+		if (acct == ACCT_LOCKED)
+			ctx->sqo_mm->locked_vm += nr_pages;
+		else if (acct == ACCT_PINNED)
+			atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
+	}
+
+	return 0;
+}
+
 static void io_mem_free(void *ptr)
 {
 	struct page *page;
@@ -7083,6 +7542,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 		return SIZE_MAX;
 #endif
 
+	if (sq_offset)
+		*sq_offset = off;
+
 	sq_array_size = array_size(sizeof(u32), sq_entries);
 	if (sq_array_size == SIZE_MAX)
 		return SIZE_MAX;
@@ -7090,9 +7552,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 	if (check_add_overflow(off, sq_array_size, &off))
 		return SIZE_MAX;
 
-	if (sq_offset)
-		*sq_offset = off;
-
 	return off;
 }
 
@@ -7121,8 +7580,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
 		for (j = 0; j < imu->nr_bvecs; j++)
 			unpin_user_page(imu->bvec[j].bv_page);
 
-		if (ctx->account_mem)
-			io_unaccount_mem(ctx->user, imu->nr_bvecs);
+		io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
 		kvfree(imu->bvec);
 		imu->nr_bvecs = 0;
 	}
@@ -7205,11 +7663,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 		start = ubuf >> PAGE_SHIFT;
 		nr_pages = end - start;
 
-		if (ctx->account_mem) {
-			ret = io_account_mem(ctx->user, nr_pages);
-			if (ret)
-				goto err;
-		}
+		ret = io_account_mem(ctx, nr_pages, ACCT_PINNED);
+		if (ret)
+			goto err;
 
 		ret = 0;
 		if (!pages || nr_pages > got_pages) {
@@ -7222,8 +7678,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 					GFP_KERNEL);
 			if (!pages || !vmas) {
 				ret = -ENOMEM;
-				if (ctx->account_mem)
-					io_unaccount_mem(ctx->user, nr_pages);
+				io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
 				goto err;
 			}
 			got_pages = nr_pages;
@@ -7233,8 +7688,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 						GFP_KERNEL);
 		ret = -ENOMEM;
 		if (!imu->bvec) {
-			if (ctx->account_mem)
-				io_unaccount_mem(ctx->user, nr_pages);
+			io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
 			goto err;
 		}
 
@@ -7265,8 +7719,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 			 */
 			if (pret > 0)
 				unpin_user_pages(pages, pret);
-			if (ctx->account_mem)
-				io_unaccount_mem(ctx->user, nr_pages);
+			io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
 			kvfree(imu->bvec);
 			goto err;
 		}
@@ -7350,11 +7803,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_finish_async(ctx);
-	if (ctx->sqo_mm)
+	io_sqe_buffer_unregister(ctx);
+	if (ctx->sqo_mm) {
 		mmdrop(ctx->sqo_mm);
+		ctx->sqo_mm = NULL;
+	}
 
-	io_iopoll_reap_events(ctx);
-	io_sqe_buffer_unregister(ctx);
 	io_sqe_files_unregister(ctx);
 	io_eventfd_unregister(ctx);
 	io_destroy_buffers(ctx);
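
With the new io_account_mem()/io_unaccount_mem() pair, ring memory is charged as ACCT_LOCKED against mm->locked_vm while registered-buffer pages are charged as ACCT_PINNED against mm->pinned_vm, on top of the per-user locked_vm rlimit check that applies only when limit_mem is set. For a feel of the quantities involved, a back-of-the-envelope userspace calculation in the spirit of ring_pages()/rings_size(); the structure sizes below are assumptions for illustration, not kernel ABI:

#include <stdio.h>

#define PAGE_SZ 4096UL

static unsigned long pages_for(unsigned long bytes)
{
	return (bytes + PAGE_SZ - 1) / PAGE_SZ;	/* round up to whole pages */
}

int main(void)
{
	unsigned long sq_entries = 128, cq_entries = 256;
	/* assumed sizes, purely illustrative */
	unsigned long rings_hdr = 64, cqe_sz = 16, sqe_sz = 64, u32_sz = 4;

	/* CQ ring header + CQEs + SQ index array share one allocation ... */
	unsigned long rings = rings_hdr + cq_entries * cqe_sz +
			      sq_entries * u32_sz;
	/* ... and the SQE array is a second one */
	unsigned long sqes = sq_entries * sqe_sz;

	printf("pages charged as ACCT_LOCKED: %lu\n",
	       pages_for(rings) + pages_for(sqes));
	return 0;
}
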
@@ -7418,11 +7872,8 @@ static int io_remove_personalities(int id, void *p, void *data)
 
 static void io_ring_exit_work(struct work_struct *work)
 {
-	struct io_ring_ctx *ctx;
-
-	ctx = container_of(work, struct io_ring_ctx, exit_work);
-	if (ctx->rings)
-		io_cqring_overflow_flush(ctx, true);
+	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
+					       exit_work);
 
 	/*
 	 * If we're doing polled IO and end up having requests being
@@ -7430,11 +7881,11 @@
 	 * we're waiting for refs to drop. We need to reap these manually,
 	 * as nobody else will be looking for them.
 	 */
-	while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
-		io_iopoll_reap_events(ctx);
+	do {
 		if (ctx->rings)
 			io_cqring_overflow_flush(ctx, true);
-	}
+		io_iopoll_try_reap_events(ctx);
+	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
 	io_ring_ctx_free(ctx);
 }
 
@@ -7450,10 +7901,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	if (ctx->io_wq)
 		io_wq_cancel_all(ctx->io_wq);
 
-	io_iopoll_reap_events(ctx);
 	/* if we failed setting up the ctx, we might not have any rings */
 	if (ctx->rings)
 		io_cqring_overflow_flush(ctx, true);
+	io_iopoll_try_reap_events(ctx);
 	idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
 
 	/*
@@ -7461,9 +7912,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	 * is closed but resources aren't reaped yet. This can cause
 	 * spurious failure in setting up a new ring.
 	 */
-	if (ctx->account_mem)
-		io_unaccount_mem(ctx->user,
-				ring_pages(ctx->sq_entries, ctx->cq_entries));
+	io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
+			 ACCT_LOCKED);
 
 	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
 	queue_work(system_wq, &ctx->exit_work);
@@ -7519,17 +7969,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 
 		if (cancel_req->flags & REQ_F_OVERFLOW) {
 			spin_lock_irq(&ctx->completion_lock);
-			list_del(&cancel_req->list);
+			list_del(&cancel_req->compl.list);
 			cancel_req->flags &= ~REQ_F_OVERFLOW;
-			if (list_empty(&ctx->cq_overflow_list)) {
-				clear_bit(0, &ctx->sq_check_overflow);
-				clear_bit(0, &ctx->cq_check_overflow);
-				ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
-			}
-			spin_unlock_irq(&ctx->completion_lock);
+			io_cqring_mark_overflow(ctx);
 			WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
+			io_commit_cqring(ctx);
+			spin_unlock_irq(&ctx->completion_lock);
 
 			/*
 			 * Put inflight ref and overflow ref. If that's
			 * all we had, then we're done with this request.
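
io_uring_cancel_files() now keeps the overflow bookkeeping and the CQ-ring commit inside a single completion_lock section. Userspace learns that overflowed CQEs are pending via IORING_SQ_CQ_OVERFLOW in the shared sq_flags word; a hedged sketch of the usual reaction, which is simply to enter the kernel so it can flush them back into the CQ ring ('sq_flags' points into the ring mmap, as in the earlier sketch):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static void flush_overflowed_cqes(int ring_fd, const unsigned *sq_flags)
{
	/* kernel buffered CQEs it could not post; give it a flush window */
	if (*(volatile const unsigned *)sq_flags & IORING_SQ_CQ_OVERFLOW)
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_GETEVENTS, NULL, 0);
}
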
@@ -7652,8 +8099,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	int submitted = 0;
 	struct fd f;
 
-	if (current->task_works)
-		task_work_run();
+	io_run_task_work();
 
 	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
 		return -EINVAL;
@@ -7692,8 +8138,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		goto out;
 	}
 	if (flags & IORING_ENTER_GETEVENTS) {
-		unsigned nr_events = 0;
-
 		min_complete = min(min_complete, ctx->cq_entries);
 
 		/*
@@ -7704,7 +8148,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		 */
 		if (ctx->flags & IORING_SETUP_IOPOLL &&
 		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
-			ret = io_iopoll_check(ctx, &nr_events, min_complete);
+			ret = io_iopoll_check(ctx, min_complete);
 		} else {
 			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
 		}
@@ -7825,6 +8269,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	struct io_rings *rings;
 	size_t size, sq_array_offset;
 
+	/* make sure these are sane, as we already accounted them */
+	ctx->sq_entries = p->sq_entries;
+	ctx->cq_entries = p->cq_entries;
+
 	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
@@ -7841,8 +8289,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	rings->cq_ring_entries = p->cq_entries;
 	ctx->sq_mask = rings->sq_ring_mask;
 	ctx->cq_mask = rings->cq_ring_mask;
-	ctx->sq_entries = rings->sq_ring_entries;
-	ctx->cq_entries = rings->cq_ring_entries;
 
 	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
 	if (size == SIZE_MAX) {
@@ -7909,7 +8355,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 {
 	struct user_struct *user = NULL;
 	struct io_ring_ctx *ctx;
-	bool account_mem;
+	bool limit_mem;
 	int ret;
 
 	if (!entries)
@@ -7948,10 +8394,10 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	}
 
 	user = get_uid(current_user());
-	account_mem = !capable(CAP_IPC_LOCK);
+	limit_mem = !capable(CAP_IPC_LOCK);
 
-	if (account_mem) {
-		ret = io_account_mem(user,
+	if (limit_mem) {
+		ret = __io_account_mem(user,
 				ring_pages(p->sq_entries, p->cq_entries));
 		if (ret) {
 			free_uid(user);
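
Note that the GETEVENTS path above clamps min_complete to cq_entries before choosing between io_iopoll_check() and io_cqring_wait(): more completions than CQ slots can never be pending at once. From userspace the call looks the same either way; a hedged raw-syscall wrapper:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* submit up to 'to_submit' SQEs, then block until 'min_complete' CQEs exist */
static long enter_and_wait(int ring_fd, unsigned to_submit,
			   unsigned min_complete)
{
	return syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete,
		       IORING_ENTER_GETEVENTS, NULL, 0);
}
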
@@ -7961,17 +8407,26 @@
 
 	ctx = io_ring_ctx_alloc(p);
 	if (!ctx) {
-		if (account_mem)
-			io_unaccount_mem(user, ring_pages(p->sq_entries,
+		if (limit_mem)
+			__io_unaccount_mem(user, ring_pages(p->sq_entries,
								p->cq_entries));
 		free_uid(user);
 		return -ENOMEM;
 	}
 	ctx->compat = in_compat_syscall();
-	ctx->account_mem = account_mem;
 	ctx->user = user;
 	ctx->creds = get_current_cred();
 
+	/*
+	 * Account memory _before_ installing the file descriptor. Once
+	 * the descriptor is installed, it can get closed at any time. Also
+	 * do this before hitting the general error path, as ring freeing
+	 * will un-account as well.
+	 */
+	io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
+		       ACCT_LOCKED);
+	ctx->limit_mem = limit_mem;
+
 	ret = io_allocate_scq_urings(ctx, p);
 	if (ret)
 		goto err;
@@ -8000,12 +8455,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
-			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
+			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
+			IORING_FEAT_POLL_32BITS;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
 		goto err;
 	}
+
 	/*
 	 * Install ring fd as the very last thing, so we don't risk someone
 	 * having closed it before we finish setup
 	 */
@@ -8289,7 +8746,8 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
 	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
-	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
+	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
+	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
 	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
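
The added BUILD_BUG_SQE_ELEM lines pin the new 32-bit poll32_events to byte offset 28, the slot the legacy native-endian 16-bit poll_events already occupies; that overlap is what lets IORING_FEAT_POLL_32BITS widen the field without moving it. The same compile-time check in plain C, on a trimmed stand-in layout (not the real struct io_uring_sqe):

#include <stddef.h>
#include <stdint.h>

struct sqe_like {
	uint8_t  opcode;	/* offset 0 */
	uint8_t  flags;		/* offset 1 */
	uint16_t ioprio;	/* offset 2 */
	int32_t  fd;		/* offset 4 */
	uint64_t off;		/* offset 8 */
	uint64_t addr;		/* offset 16 */
	uint32_t len;		/* offset 24 */
	union {			/* offset 28: per-opcode flags */
		uint16_t poll_events;	/* legacy 16-bit field */
		uint32_t poll32_events;	/* full 32 bits, new in this diff */
	};
};

_Static_assert(offsetof(struct sqe_like, poll_events) == 28,
	       "poll_events must stay at offset 28");
_Static_assert(offsetof(struct sqe_like, poll32_events) == 28,
	       "poll32_events shares the same slot");
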