struct io_mapped_ubuf {
u64 ubuf;
- size_t len;
+ u64 ubuf_end;
struct bio_vec *bvec;
unsigned int nr_bvecs;
unsigned long acct_pages;
struct list_head list;
};
+struct io_fixed_file {
+ /* file * with additional FFS_* flags */
+ unsigned long file_ptr;
+};
+
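io_fixed_file packs the FFS_* capability bits into the low bits of the file pointer, which are free because struct file allocations are at least 8-byte aligned. A minimal userspace model of the pack/unpack round-trip; the FFS_* values mirror the ones defined later in this file, the rest is illustrative only:

#include <assert.h>
#include <stdalign.h>
#include <stdio.h>

#define FFS_ASYNC_READ	0x1UL
#define FFS_ASYNC_WRITE	0x2UL
#define FFS_ISREG	0x4UL
#define FFS_MASK	~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)

struct file { int dummy; };	/* stand-in for the kernel's struct file */

int main(void)
{
	alignas(8) struct file f;	/* kernel files are at least 8-byte aligned */
	unsigned long file_ptr = (unsigned long)&f;

	file_ptr |= FFS_ASYNC_READ | FFS_ISREG;			/* pack capability bits */
	assert((struct file *)(file_ptr & FFS_MASK) == &f);	/* unpack the pointer */
	printf("supports async read: %lu\n", file_ptr & FFS_ASYNC_READ);
	return 0;
}
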
struct io_rsrc_put {
struct list_head list;
union {
};
};
-struct fixed_rsrc_table {
- struct file **files;
+struct io_file_table {
+ /* two level table */
+ struct io_fixed_file **files;
};
-struct fixed_rsrc_ref_node {
+struct io_rsrc_node {
struct percpu_ref refs;
struct list_head node;
struct list_head rsrc_list;
- struct fixed_rsrc_data *rsrc_data;
- void (*rsrc_put)(struct io_ring_ctx *ctx,
- struct io_rsrc_put *prsrc);
+ struct io_rsrc_data *rsrc_data;
struct llist_node llist;
bool done;
};
-struct fixed_rsrc_data {
- struct fixed_rsrc_table *table;
+typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
+
+struct io_rsrc_data {
struct io_ring_ctx *ctx;
- struct fixed_rsrc_ref_node *node;
- struct percpu_ref refs;
+ rsrc_put_fn *do_put;
+ atomic_t refs;
struct completion done;
bool quiesce;
};
struct {
unsigned int flags;
unsigned int compat: 1;
- unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
- struct fixed_rsrc_data *file_data;
+ struct io_rsrc_data *file_data;
+ struct io_file_table file_table;
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_file;
-
- spinlock_t inflight_lock;
- struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
struct delayed_work rsrc_put_work;
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
- struct fixed_rsrc_ref_node *rsrc_backup_node;
+ struct io_rsrc_node *rsrc_node;
+ struct io_rsrc_node *rsrc_backup_node;
struct io_restriction restrictions;
/* exit task_work */
struct callback_head *exit_task_work;
- struct wait_queue_head hash_wait;
-
/* Keep this last, we don't need it for the fast path */
struct work_struct exit_work;
struct list_head tctx_list;
const struct io_ring_ctx *last;
struct io_wq *io_wq;
struct percpu_counter inflight;
+ atomic_t inflight_tracked;
atomic_t in_idle;
spinlock_t task_lock;
struct wait_queue_entry wait;
};
-struct io_poll_remove {
+struct io_poll_update {
struct file *file;
- u64 addr;
+ u64 old_user_data;
+ u64 new_user_data;
+ __poll_t events;
+ bool update_events;
+ bool update_user_data;
};
struct io_close {
struct io_sr_msg {
struct file *file;
union {
- struct user_msghdr __user *umsg;
- void __user *buf;
+ struct compat_msghdr __user *umsg_compat;
+ struct user_msghdr __user *umsg;
+ void __user *buf;
};
int msg_flags;
int bgid;
struct file *file;
struct io_rw rw;
struct io_poll_iocb poll;
- struct io_poll_remove poll_remove;
+ struct io_poll_update poll_update;
struct io_accept accept;
struct io_sync sync;
struct io_cancel cancel;
struct io_kiocb *link;
struct percpu_ref *fixed_rsrc_refs;
- /*
- * 1. used with ctx->iopoll_list with reads/writes
- * 2. to track reqs with ->files (see io_op_def::file_table)
- */
+ /* used with ctx->iopoll_list with reads/writes */
struct list_head inflight_entry;
union {
struct io_task_work io_task_work;
struct task_struct *task,
struct files_struct *files);
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
-static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
-static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
- struct io_ring_ctx *ctx);
-static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
-static void io_cqring_fill_event(struct io_kiocb *req, long res);
+static bool io_cqring_fill_event(struct io_kiocb *req, long res, unsigned cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_dismantle_req(struct io_kiocb *req);
static void io_put_task(struct task_struct *task, int nr);
-static void io_queue_next(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_comp_state *cs,
struct io_ring_ctx *ctx);
+static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
-static inline void io_set_resource_node(struct io_kiocb *req)
+static inline void io_req_set_rsrc_node(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
if (!req->fixed_rsrc_refs) {
- req->fixed_rsrc_refs = &ctx->file_data->node->refs;
+ req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
percpu_ref_get(req->fixed_rsrc_refs);
}
}
+static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+{
+ bool got = percpu_ref_tryget(ref);
+
+ /* already at zero, wait for ->release() */
+ if (!got)
+ wait_for_completion(compl);
+ percpu_ref_resurrect(ref);
+ if (got)
+ percpu_ref_put(ref);
+}
+
static bool io_match_task(struct io_kiocb *head,
struct task_struct *task,
struct files_struct *files)
static inline void req_set_fail_links(struct io_kiocb *req)
{
- if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+ if (req->flags & REQ_F_LINK)
req->flags |= REQ_F_FAIL_LINK;
}
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
- spin_lock_init(&ctx->inflight_lock);
- INIT_LIST_HEAD(&ctx->inflight_list);
spin_lock_init(&ctx->rsrc_ref_lock);
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
static void io_req_track_inflight(struct io_kiocb *req)
{
- struct io_ring_ctx *ctx = req->ctx;
-
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
-
- spin_lock_irq(&ctx->inflight_lock);
- list_add(&req->inflight_entry, &ctx->inflight_list);
- spin_unlock_irq(&ctx->inflight_lock);
+		atomic_inc(&current->io_uring->inflight_tracked);
}
}
switch (req->opcode) {
case IORING_OP_SPLICE:
case IORING_OP_TEE:
- /*
- * Splice operation will be punted aync, and here need to
- * modify io_wq_work.flags, so initialize io_wq_work firstly.
- */
if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
req->work.flags |= IO_WQ_WORK_UNBOUND;
break;
}
static void io_kill_timeout(struct io_kiocb *req, int status)
+ __must_hold(&req->ctx->completion_lock)
{
struct io_timeout_data *io = req->async_data;
- int ret;
- ret = hrtimer_try_to_cancel(&io->timer);
- if (ret != -1) {
+ if (hrtimer_try_to_cancel(&io->timer) != -1) {
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
- io_cqring_fill_event(req, status);
+ io_cqring_fill_event(req, status, 0);
io_put_req_deferred(req, 1);
}
}
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}
-static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
+static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
unsigned tail;
static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
- if (!ctx->cq_ev_fd)
+ if (likely(!ctx->cq_ev_fd))
return false;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return false;
- if (!ctx->eventfd_async)
- return true;
- return io_wq_current_is_worker();
+ return !ctx->eventfd_async || io_wq_current_is_worker();
}
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
atomic_inc(&req->refs);
}
-static void __io_cqring_fill_event(struct io_kiocb *req, long res,
- unsigned int cflags)
+static bool io_cqring_event_overflow(struct io_kiocb *req, long res,
+ unsigned int cflags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_overflow_cqe *ocqe;
+
+ ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+ if (!ocqe) {
+ /*
+		 * If we cannot allocate an overflow entry, then we have to
+		 * drop the CQE on the floor: all we can do is bump the
+		 * overflow count that userspace sees in the CQ ring.
+ */
+ WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
+ return false;
+ }
+ if (list_empty(&ctx->cq_overflow_list)) {
+ set_bit(0, &ctx->sq_check_overflow);
+ set_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+ }
+ ocqe->cqe.user_data = req->user_data;
+ ocqe->cqe.res = res;
+ ocqe->cqe.flags = cflags;
+ list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
+ return true;
+}
+
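Userspace observes this path through the IORING_SQ_CQ_OVERFLOW bit set above. A hedged consumer-side sketch (liburing field names; an initialized ring is assumed), showing how a submitter can notice the condition and let the kernel flush the stashed CQEs by entering it:

#include <liburing.h>

/* Sketch: if the kernel flagged CQ overflow, enter it so the stashed
 * overflow CQEs get flushed back into the CQ ring. */
static void flush_overflowed_cqes(struct io_uring *ring)
{
	unsigned flags = *(volatile unsigned *)ring->sq.kflags;

	if (flags & IORING_SQ_CQ_OVERFLOW)
		io_uring_enter(ring->ring_fd, 0, 0, IORING_ENTER_GETEVENTS, NULL);
}
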
+static inline bool __io_cqring_fill_event(struct io_kiocb *req, long res,
+ unsigned int cflags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
- trace_io_uring_complete(ctx, req->user_data, res);
+ trace_io_uring_complete(ctx, req->user_data, res, cflags);
/*
* If we can't get a cq entry, userspace overflowed the
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- return;
- }
- if (!ctx->cq_overflow_flushed &&
- !atomic_read(&req->task->io_uring->in_idle)) {
- struct io_overflow_cqe *ocqe;
-
- ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
- if (!ocqe)
- goto overflow;
- if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(0, &ctx->sq_check_overflow);
- set_bit(0, &ctx->cq_check_overflow);
- ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
- }
- ocqe->cqe.user_data = req->user_data;
- ocqe->cqe.res = res;
- ocqe->cqe.flags = cflags;
- list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
- return;
+ return true;
}
-overflow:
- /*
- * If we're in ring overflow flush mode, or in task cancel mode,
- * or cannot allocate an overflow entry, then we need to drop it
- * on the floor.
- */
- WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
+ return io_cqring_event_overflow(req, res, cflags);
}
-static void io_cqring_fill_event(struct io_kiocb *req, long res)
+/* not hot enough to justify bloating callers with inlining */
+static noinline bool io_cqring_fill_event(struct io_kiocb *req, long res,
+ unsigned int cflags)
{
- __io_cqring_fill_event(req, res, 0);
+ return __io_cqring_fill_event(req, res, cflags);
}
static void io_req_complete_post(struct io_kiocb *req, long res,
io_clean_op(req);
if (req->flags & REQ_F_INFLIGHT) {
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
+ struct io_uring_task *tctx = req->task->io_uring;
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- list_del(&req->inflight_entry);
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ atomic_dec(&tctx->inflight_tracked);
req->flags &= ~REQ_F_INFLIGHT;
}
}
*/
if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
struct io_timeout_data *io = link->async_data;
- int ret;
io_remove_next_linked(req);
link->timeout.head = NULL;
- ret = hrtimer_try_to_cancel(&io->timer);
- if (ret != -1) {
- io_cqring_fill_event(link, -ECANCELED);
+ if (hrtimer_try_to_cancel(&io->timer) != -1) {
+ io_cqring_fill_event(link, -ECANCELED, 0);
io_put_req_deferred(link, 1);
return true;
}
link->link = NULL;
trace_io_uring_fail_link(req, link);
- io_cqring_fill_event(link, -ECANCELED);
+ io_cqring_fill_event(link, -ECANCELED, 0);
io_put_req_deferred(link, 2);
link = nxt;
}
if (likely(req->flags & REQ_F_LINK_TIMEOUT))
posted = io_kill_linked_timeout(req);
- if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+ if (unlikely((req->flags & REQ_F_FAIL_LINK) &&
+ !(req->flags & REQ_F_HARDLINK))) {
posted |= (req->link != NULL);
io_fail_links(req);
}
return ret;
}
-/*
- * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
- * non-spinning poll check - we'll still enter the driver poll loop, but only
- * as a non-spinning completion check.
- */
-static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
- long min)
-{
- while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
- int ret;
-
- ret = io_do_iopoll(ctx, nr_events, min);
- if (ret < 0)
- return ret;
- if (*nr_events >= min)
- return 0;
- }
-
- return 1;
-}
-
/*
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
unsigned int nr_events = 0;
- int iters = 0, ret = 0;
+ int ret = 0;
/*
* We disallow the app entering submit/complete with polling, but we
* that got punted to a workqueue.
*/
mutex_lock(&ctx->uring_lock);
+ /*
+ * Don't enter poll loop if we already have events pending.
+ * If we do, we can potentially be spinning for commands that
+ * already triggered a CQE (eg in error).
+ */
+ if (test_bit(0, &ctx->cq_check_overflow))
+ __io_cqring_overflow_flush(ctx, false);
+ if (io_cqring_events(ctx))
+ goto out;
do {
- /*
- * Don't enter poll loop if we already have events pending.
- * If we do, we can potentially be spinning for commands that
- * already triggered a CQE (eg in error).
- */
- if (test_bit(0, &ctx->cq_check_overflow))
- __io_cqring_overflow_flush(ctx, false);
- if (io_cqring_events(ctx))
- break;
-
/*
* If a submit got punted to a workqueue, we can have the
* application entering polling for a command before it gets
* forever, while the workqueue is stuck trying to acquire the
* very same mutex.
*/
- if (!(++iters & 7)) {
+ if (list_empty(&ctx->iopoll_list)) {
mutex_unlock(&ctx->uring_lock);
io_run_task_work();
mutex_lock(&ctx->uring_lock);
- }
-
- ret = io_iopoll_getevents(ctx, &nr_events, min);
- if (ret <= 0)
- break;
- ret = 0;
- } while (min && !nr_events && !need_resched());
+ if (list_empty(&ctx->iopoll_list))
+ break;
+ }
+ ret = io_do_iopoll(ctx, &nr_events, min);
+ } while (!ret && nr_events < min && !need_resched());
+out:
mutex_unlock(&ctx->uring_lock);
return ret;
}
return true;
}
#else
+static bool io_resubmit_prep(struct io_kiocb *req)
+{
+ return false;
+}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
return false;
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
if (unlikely(res != req->result)) {
- bool fail = true;
-
-#ifdef CONFIG_BLOCK
- if (res == -EAGAIN && io_rw_should_reissue(req) &&
- io_resubmit_prep(req))
- fail = false;
-#endif
- if (fail) {
+ if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
+ io_resubmit_prep(req))) {
req_set_fail_links(req);
req->flags |= REQ_F_DONT_REISSUE;
}
/*
* After the iocb has been issued, it's safe to be found on the poll list.
* Adding the kiocb to the list AFTER submission ensures that we don't
- * find it from a io_iopoll_getevents() thread before the issuer is done
+ * find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
size_t len = req->rw.len;
struct io_mapped_ubuf *imu;
u16 index, buf_index = req->buf_index;
+ u64 buf_end, buf_addr = req->rw.addr;
size_t offset;
- u64 buf_addr;
if (unlikely(buf_index >= ctx->nr_user_bufs))
return -EFAULT;
imu = &ctx->user_bufs[index];
-	buf_addr = req->rw.addr;
- /* overflow */
- if (buf_addr + len < buf_addr)
+ if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
return -EFAULT;
/* not inside the mapped region */
- if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
+ if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
return -EFAULT;
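check_add_overflow() maps onto the compiler's overflow builtins and replaces the open-coded `buf_addr + len < buf_addr` wrap test, which is equivalent for unsigned types but easier to get wrong. A standalone userspace illustration of the same check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t buf_addr = UINT64_MAX - 10, len = 32, buf_end;

	/* true when the unsigned sum wrapped, mirroring check_add_overflow() */
	if (__builtin_add_overflow(buf_addr, len, &buf_end))
		printf("wrapped: would be rejected with -EFAULT\n");
	else
		printf("buf_end = %llu\n", (unsigned long long)buf_end);
	return 0;
}
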
/*
req->flags &= ~REQ_F_NEED_CLEANUP;
if (ret < 0)
req_set_fail_links(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
req_set_fail_links(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, issue_flags, ret, 0);
return 0;
}
struct files_struct *files = current->files;
struct io_close *close = &req->close;
struct fdtable *fdt;
- struct file *file;
- int ret;
+ struct file *file = NULL;
+ int ret = -EBADF;
- file = NULL;
- ret = -EBADF;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (close->fd >= fdt->max_fds) {
goto err;
}
file = fdt->fd[close->fd];
- if (!file) {
- spin_unlock(&files->file_lock);
- goto err;
- }
-
- if (file->f_op == &io_uring_fops) {
+ if (!file || file->f_op == &io_uring_fops) {
spin_unlock(&files->file_lock);
file = NULL;
goto err;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+ sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ if (sr->msg_flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
kmsg = &iomsg;
}
- flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (issue_flags & IO_URING_F_NONBLOCK)
+ flags = req->sr_msg.msg_flags;
+ if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
-
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
msg.msg_controllen = 0;
msg.msg_namelen = 0;
- flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (issue_flags & IO_URING_F_NONBLOCK)
+ flags = req->sr_msg.msg_flags;
+ if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
-
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&msg.msg_iter);
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
struct io_async_msghdr *iomsg)
{
- struct compat_msghdr __user *msg_compat;
struct io_sr_msg *sr = &req->sr_msg;
struct compat_iovec __user *uiov;
compat_uptr_t ptr;
compat_size_t len;
int ret;
- msg_compat = (struct compat_msghdr __user *) sr->umsg;
- ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
- &ptr, &len);
+ ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
+ &ptr, &len);
if (ret)
return ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->bgid = READ_ONCE(sqe->buf_group);
+ sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+ if (sr->msg_flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
1, req->sr_msg.len);
}
- flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
+ flags = req->sr_msg.msg_flags;
+ if (force_nonblock)
flags |= MSG_DONTWAIT;
-
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
msg.msg_iocb = NULL;
msg.msg_flags = 0;
- flags = req->sr_msg.msg_flags | MSG_NOSIGNAL;
- if (flags & MSG_DONTWAIT)
- req->flags |= REQ_F_NOWAIT;
- else if (force_nonblock)
+ flags = req->sr_msg.msg_flags;
+ if (force_nonblock)
flags |= MSG_DONTWAIT;
-
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&msg.msg_iter);
}
static void io_poll_remove_double(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
{
struct io_poll_iocb *poll = io_poll_get_double(req);
}
}
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
+static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+ __must_hold(&req->ctx->completion_lock)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned flags = IORING_CQE_F_MORE;
+ int error;
- if (!error && req->poll.canceled)
+ if (READ_ONCE(req->poll.canceled)) {
error = -ECANCELED;
-
- io_poll_remove_double(req);
- req->poll.done = true;
- io_cqring_fill_event(req, error ? error : mangle_poll(mask));
+ req->poll.events |= EPOLLONESHOT;
+ } else {
+ error = mangle_poll(mask);
+ }
+ if (req->poll.events & EPOLLONESHOT)
+ flags = 0;
+ if (!io_cqring_fill_event(req, error, flags)) {
+ io_poll_remove_waitqs(req);
+ req->poll.done = true;
+ flags = 0;
+ }
io_commit_cqring(ctx);
+ return !(flags & IORING_CQE_F_MORE);
}
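The `return !(flags & IORING_CQE_F_MORE)` is the userspace contract: while a CQE carries IORING_CQE_F_MORE, the multishot poll stays armed and more completions will follow. A hedged consumer loop (liburing API; ring setup and the queued multishot poll are assumed):

#include <liburing.h>
#include <stdio.h>

static void reap_poll_events(struct io_uring *ring)
{
	struct io_uring_cqe *cqe;

	while (io_uring_wait_cqe(ring, &cqe) == 0) {
		int more = cqe->flags & IORING_CQE_F_MORE;

		if (cqe->res < 0)
			fprintf(stderr, "poll error: %d\n", cqe->res);
		else
			printf("revents: 0x%x\n", cqe->res);
		io_uring_cqe_seen(ring, cqe);
		if (!more)
			break;	/* request is gone; re-arm if still needed */
	}
}
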
static void io_poll_task_func(struct callback_head *cb)
if (io_poll_rewait(req, &req->poll)) {
spin_unlock_irq(&ctx->completion_lock);
} else {
- hash_del(&req->hash_node);
- io_poll_complete(req, req->result, 0);
- spin_unlock_irq(&ctx->completion_lock);
+ bool done;
- nxt = io_put_req_find_next(req);
+ done = io_poll_complete(req, req->result);
+ if (done) {
+ hash_del(&req->hash_node);
+ } else {
+ req->result = 0;
+ add_wait_queue(req->poll.head, &req->poll.wait);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- if (nxt)
- __io_req_task_submit(nxt);
+
+ if (done) {
+ nxt = io_put_req_find_next(req);
+ if (nxt)
+ __io_req_task_submit(nxt);
+ }
}
}
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
return 0;
+ if (!(poll->events & EPOLLONESHOT))
+ return poll->wait.func(&poll->wait, mode, sync, key);
list_del_init(&wait->entry);
pt->error = -EINVAL;
return;
}
+ /*
+	 * Can't handle multishot for the double-wait case yet, so
+	 * turn it into one-shot mode.
+ */
+ if (!(req->poll.events & EPOLLONESHOT))
+ req->poll.events |= EPOLLONESHOT;
/* double add on the same waitqueue head, ignore */
if (poll->head == head)
return;
return;
}
- /* If req is still hashed, it cannot have been canceled. Don't check. */
- if (hash_hashed(&req->hash_node))
- hash_del(&req->hash_node);
-
+ hash_del(&req->hash_node);
io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
ipt->error = 0;
mask = 0;
}
- if (mask || ipt->error)
+ if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
list_del_init(&poll->wait.entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
req->flags |= REQ_F_POLLED;
req->apoll = apoll;
- mask = 0;
+ mask = EPOLLONESHOT;
if (def->pollin)
mask |= POLLIN | POLLRDNORM;
if (def->pollout)
}
static bool __io_poll_remove_one(struct io_kiocb *req,
- struct io_poll_iocb *poll)
+ struct io_poll_iocb *poll, bool do_cancel)
+ __must_hold(&req->ctx->completion_lock)
{
bool do_complete = false;
+ if (!poll->head)
+ return false;
spin_lock(&poll->head->lock);
- WRITE_ONCE(poll->canceled, true);
+ if (do_cancel)
+ WRITE_ONCE(poll->canceled, true);
if (!list_empty(&poll->wait.entry)) {
list_del_init(&poll->wait.entry);
do_complete = true;
return do_complete;
}
-static bool io_poll_remove_one(struct io_kiocb *req)
+static bool io_poll_remove_waitqs(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
{
bool do_complete;
io_poll_remove_double(req);
+ do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
- if (req->opcode == IORING_OP_POLL_ADD) {
- do_complete = __io_poll_remove_one(req, &req->poll);
- } else {
+ if (req->opcode != IORING_OP_POLL_ADD && do_complete) {
struct async_poll *apoll = req->apoll;
/* non-poll requests have submit ref still */
- do_complete = __io_poll_remove_one(req, &apoll->poll);
- if (do_complete) {
- io_put_req(req);
- kfree(apoll->double_poll);
- kfree(apoll);
- }
+ req_ref_put(req);
+ kfree(apoll->double_poll);
+ kfree(apoll);
}
+ return do_complete;
+}
+static bool io_poll_remove_one(struct io_kiocb *req)
+ __must_hold(&req->ctx->completion_lock)
+{
+ bool do_complete;
+
+ do_complete = io_poll_remove_waitqs(req);
if (do_complete) {
- io_cqring_fill_event(req, -ECANCELED);
+ io_cqring_fill_event(req, -ECANCELED, 0);
io_commit_cqring(req->ctx);
req_set_fail_links(req);
io_put_req_deferred(req, 1);
return posted != 0;
}
-static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
+ bool poll_only)
+ __must_hold(&ctx->completion_lock)
{
struct hlist_head *list;
struct io_kiocb *req;
hlist_for_each_entry(req, list, hash_node) {
if (sqe_addr != req->user_data)
continue;
- if (io_poll_remove_one(req))
- return 0;
- return -EALREADY;
+ if (poll_only && req->opcode != IORING_OP_POLL_ADD)
+ continue;
+ return req;
}
+ return NULL;
+}
- return -ENOENT;
+static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
+ bool poll_only)
+ __must_hold(&ctx->completion_lock)
+{
+ struct io_kiocb *req;
+
+ req = io_poll_find(ctx, sqe_addr, poll_only);
+ if (!req)
+ return -ENOENT;
+ if (io_poll_remove_one(req))
+ return 0;
+
+ return -EALREADY;
}
-static int io_poll_remove_prep(struct io_kiocb *req,
+static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
+ unsigned int flags)
+{
+ u32 events;
+
+ events = READ_ONCE(sqe->poll32_events);
+#ifdef __BIG_ENDIAN
+ events = swahw32(events);
+#endif
+ if (!(flags & IORING_POLL_ADD_MULTI))
+ events |= EPOLLONESHOT;
+ return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
+}
+
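On the submission side the flags travel in sqe->len (read back as `flags` in the prep helpers) and the poll mask in sqe->poll32_events. A raw-SQE sketch for arming a multishot poll; field usage follows the prep code above, and little-endian is assumed since the byte swap is done kernel-side:

#include <liburing.h>
#include <poll.h>
#include <string.h>

static int queue_multishot_poll(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_ADD;
	sqe->fd = fd;
	sqe->poll32_events = POLLIN;		/* demangled by the kernel */
	sqe->len = IORING_POLL_ADD_MULTI;	/* stay armed after each event */
	sqe->user_data = 0x1234;
	return io_uring_submit(ring);
}
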
+static int io_poll_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ struct io_poll_update *upd = &req->poll_update;
+ u32 flags;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
- sqe->poll_events)
+ if (sqe->ioprio || sqe->buf_index)
+ return -EINVAL;
+ flags = READ_ONCE(sqe->len);
+ if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
+ IORING_POLL_ADD_MULTI))
+ return -EINVAL;
+ /* meaningless without update */
+ if (flags == IORING_POLL_ADD_MULTI)
return -EINVAL;
- req->poll_remove.addr = READ_ONCE(sqe->addr);
- return 0;
-}
-
-/*
- * Find a running poll command that matches one specified in sqe->addr,
- * and remove it if found.
- */
-static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
+ upd->old_user_data = READ_ONCE(sqe->addr);
+ upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
+ upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
- spin_lock_irq(&ctx->completion_lock);
- ret = io_poll_cancel(ctx, req->poll_remove.addr);
- spin_unlock_irq(&ctx->completion_lock);
+ upd->new_user_data = READ_ONCE(sqe->off);
+ if (!upd->update_user_data && upd->new_user_data)
+ return -EINVAL;
+ if (upd->update_events)
+ upd->events = io_poll_parse_events(sqe, flags);
+ else if (sqe->poll32_events)
+ return -EINVAL;
- if (ret < 0)
- req_set_fail_links(req);
- io_req_complete(req, ret);
return 0;
}
static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_poll_iocb *poll = &req->poll;
- u32 events;
+ u32 flags;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
+ return -EINVAL;
+ flags = READ_ONCE(sqe->len);
+ if (flags & ~IORING_POLL_ADD_MULTI)
return -EINVAL;
- events = READ_ONCE(sqe->poll32_events);
-#ifdef __BIG_ENDIAN
- events = swahw32(events);
-#endif
- poll->events = demangle_poll(events) | (events & EPOLLEXCLUSIVE);
+ poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- io_poll_complete(req, mask, 0);
+ io_poll_complete(req, mask);
}
spin_unlock_irq(&ctx->completion_lock);
if (mask) {
io_cqring_ev_posted(ctx);
- io_put_req(req);
+ if (poll->events & EPOLLONESHOT)
+ io_put_req(req);
}
return ipt.error;
}
+static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *preq;
+ bool completing;
+ int ret;
+
+ spin_lock_irq(&ctx->completion_lock);
+ preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
+ if (!preq) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
+ completing = true;
+ ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
+ goto err;
+ }
+
+ /*
+ * Don't allow racy completion with singleshot, as we cannot safely
+ * update those. For multishot, if we're racing with completion, just
+ * let completion re-add it.
+ */
+ completing = !__io_poll_remove_one(preq, &preq->poll, false);
+ if (completing && (preq->poll.events & EPOLLONESHOT)) {
+ ret = -EALREADY;
+ goto err;
+ }
+ /* we now have a detached poll request. reissue. */
+ ret = 0;
+err:
+ if (ret < 0) {
+ spin_unlock_irq(&ctx->completion_lock);
+ req_set_fail_links(req);
+ io_req_complete(req, ret);
+ return 0;
+ }
+	/* only replace the event mask, keep the behavior flags */
+ if (req->poll_update.update_events) {
+ preq->poll.events &= ~0xffff;
+ preq->poll.events |= req->poll_update.events & 0xffff;
+ preq->poll.events |= IO_POLL_UNMASK;
+ }
+ if (req->poll_update.update_user_data)
+ preq->user_data = req->poll_update.new_user_data;
+ spin_unlock_irq(&ctx->completion_lock);
+
+ /* complete update request, we're done with it */
+ io_req_complete(req, ret);
+
+ if (!completing) {
+ ret = io_poll_add(preq, issue_flags);
+ if (ret < 0) {
+ req_set_fail_links(preq);
+ io_req_complete(preq, ret);
+ }
+ }
+ return 0;
+}
+
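Matching the prep code above, an update request names its target via sqe->addr (the old user_data), carries the replacement user_data in sqe->off, the IORING_POLL_UPDATE_* flags in sqe->len, and the new mask in sqe->poll32_events. A hedged raw-SQE sketch:

#include <liburing.h>
#include <poll.h>
#include <string.h>

static int queue_poll_update(struct io_uring *ring, __u64 old_ud, __u64 new_ud)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_REMOVE;	/* now also handles updates */
	sqe->addr = old_ud;			/* which armed poll to find */
	sqe->off = new_ud;			/* replacement user_data */
	sqe->len = IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA;
	sqe->poll32_events = POLLOUT;		/* new event mask */
	return io_uring_submit(ring);
}
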
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
struct io_timeout_data *data = container_of(timer,
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
- io_cqring_fill_event(req, -ETIME);
+ io_cqring_fill_event(req, -ETIME, 0);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
__u64 user_data)
+ __must_hold(&ctx->completion_lock)
{
struct io_timeout_data *io;
struct io_kiocb *req;
- int ret = -ENOENT;
+ bool found = false;
list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
- if (user_data == req->user_data) {
- ret = 0;
+ found = user_data == req->user_data;
+ if (found)
break;
- }
}
-
- if (ret == -ENOENT)
- return ERR_PTR(ret);
+ if (!found)
+ return ERR_PTR(-ENOENT);
io = req->async_data;
- ret = hrtimer_try_to_cancel(&io->timer);
- if (ret == -1)
+ if (hrtimer_try_to_cancel(&io->timer) == -1)
return ERR_PTR(-EALREADY);
list_del_init(&req->timeout.list);
return req;
}
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+ __must_hold(&ctx->completion_lock)
{
struct io_kiocb *req = io_timeout_extract(ctx, user_data);
return PTR_ERR(req);
req_set_fail_links(req);
- io_cqring_fill_event(req, -ECANCELED);
+ io_cqring_fill_event(req, -ECANCELED, 0);
io_put_req_deferred(req, 1);
return 0;
}
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
+ __must_hold(&ctx->completion_lock)
{
struct io_kiocb *req = io_timeout_extract(ctx, user_data);
struct io_timeout_data *data;
ret = io_timeout_update(ctx, tr->addr, &tr->ts,
io_translate_timeout_mode(tr->flags));
- io_cqring_fill_event(req, ret);
+ io_cqring_fill_event(req, ret, 0);
io_commit_cqring(ctx);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
int ret;
ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- if (ret != -ENOENT) {
- spin_lock_irqsave(&ctx->completion_lock, flags);
- goto done;
- }
-
spin_lock_irqsave(&ctx->completion_lock, flags);
+ if (ret != -ENOENT)
+ goto done;
ret = io_timeout_cancel(ctx, sqe_addr);
if (ret != -ENOENT)
goto done;
- ret = io_poll_cancel(ctx, sqe_addr);
+ ret = io_poll_cancel(ctx, sqe_addr, false);
done:
if (!ret)
ret = success_ret;
- io_cqring_fill_event(req, ret);
+ io_cqring_fill_event(req, ret, 0);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
if (ret < 0)
req_set_fail_links(req);
- io_put_req(req);
}
static int io_async_cancel_prep(struct io_kiocb *req,
ret = io_timeout_cancel(ctx, sqe_addr);
if (ret != -ENOENT)
goto done;
- ret = io_poll_cancel(ctx, sqe_addr);
+ ret = io_poll_cancel(ctx, sqe_addr, false);
if (ret != -ENOENT)
goto done;
spin_unlock_irq(&ctx->completion_lock);
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
- if (!tctx || !tctx->io_wq)
- continue;
ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
if (ret != -ENOENT)
break;
spin_lock_irq(&ctx->completion_lock);
done:
- io_cqring_fill_event(req, ret);
+ io_cqring_fill_event(req, ret, 0);
io_commit_cqring(ctx);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
case IORING_OP_POLL_ADD:
return io_poll_add_prep(req, sqe);
case IORING_OP_POLL_REMOVE:
- return io_poll_remove_prep(req, sqe);
+ return io_poll_update_prep(req, sqe);
case IORING_OP_FSYNC:
return io_fsync_prep(req, sqe);
case IORING_OP_SYNC_FILE_RANGE:
ret = io_poll_add(req, issue_flags);
break;
case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove(req, issue_flags);
+ ret = io_poll_update(req, issue_flags);
break;
case IORING_OP_SYNC_FILE_RANGE:
ret = io_sync_file_range(req, issue_flags);
#endif
#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
-static inline struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
- unsigned i)
+static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
+ unsigned i)
{
- struct fixed_rsrc_table *table;
+ struct io_fixed_file *table_l2;
- table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
- return &table->files[i & IORING_FILE_TABLE_MASK];
+ table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT];
+ return &table_l2[i & IORING_FILE_TABLE_MASK];
}
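The split is plain power-of-two arithmetic: the high bits of the index select the second-level table, the low bits the slot within it. A standalone model of the lookup math, assuming the upstream IORING_FILE_TABLE_SHIFT of 9 (512 slots per table):

#include <stdio.h>

#define FILE_TABLE_SHIFT	9		/* assumed, per mainline */
#define FILE_TABLE_SIZE		(1U << FILE_TABLE_SHIFT)
#define FILE_TABLE_MASK		(FILE_TABLE_SIZE - 1)

int main(void)
{
	unsigned int i = 1337;

	printf("index %u -> table %u, slot %u\n",
	       i, i >> FILE_TABLE_SHIFT, i & FILE_TABLE_MASK);
	/* prints: index 1337 -> table 2, slot 313 */
	return 0;
}
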
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
int index)
{
- struct file **file_slot = io_fixed_file_slot(ctx->file_data, index);
+ struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
- return (struct file *) ((unsigned long) *file_slot & FFS_MASK);
+ return (struct file *) (slot->file_ptr & FFS_MASK);
+}
+
+static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
+{
+ unsigned long file_ptr = (unsigned long) file;
+
+ if (__io_file_supports_async(file, READ))
+ file_ptr |= FFS_ASYNC_READ;
+ if (__io_file_supports_async(file, WRITE))
+ file_ptr |= FFS_ASYNC_WRITE;
+ if (S_ISREG(file_inode(file)->i_mode))
+ file_ptr |= FFS_ISREG;
+ file_slot->file_ptr = file_ptr;
}
static struct file *io_file_get(struct io_submit_state *state,
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
return NULL;
fd = array_index_nospec(fd, ctx->nr_user_files);
- file_ptr = (unsigned long) *io_fixed_file_slot(ctx->file_data, fd);
+ file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
- io_set_resource_node(req);
+ io_req_set_rsrc_node(req);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
io_put_req_deferred(prev, 1);
} else {
io_req_complete_post(req, -ETIME, 0);
- io_put_req_deferred(req, 1);
}
+ io_put_req_deferred(req, 1);
return HRTIMER_NORESTART;
}
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
+static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+ unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
+
+ for (i = 0; i < nr_tables; i++)
+ kfree(table->files[i]);
+ kfree(table->files);
+ table->files = NULL;
+}
+
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
#if defined(CONFIG_UNIX)
fput(file);
}
#endif
-}
-
-static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
-{
- struct fixed_rsrc_data *data;
-
- data = container_of(ref, struct fixed_rsrc_data, refs);
- complete(&data->done);
+ io_free_file_tables(&ctx->file_table, ctx->nr_user_files);
+ kfree(ctx->file_data);
+ ctx->file_data = NULL;
+ ctx->nr_user_files = 0;
}
static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
spin_unlock_bh(&ctx->rsrc_ref_lock);
}
-static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
- struct fixed_rsrc_data *rsrc_data,
- struct fixed_rsrc_ref_node *ref_node)
+static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
- io_rsrc_ref_lock(ctx);
- rsrc_data->node = ref_node;
- list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
- io_rsrc_ref_unlock(ctx);
- percpu_ref_get(&rsrc_data->refs);
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
}
-static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
+static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
+ struct io_rsrc_data *data_to_kill)
{
- struct fixed_rsrc_ref_node *ref_node = NULL;
+ WARN_ON_ONCE(!ctx->rsrc_backup_node);
+ WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
- io_rsrc_ref_lock(ctx);
- ref_node = data->node;
- data->node = NULL;
- io_rsrc_ref_unlock(ctx);
- if (ref_node)
- percpu_ref_kill(&ref_node->refs);
+ if (data_to_kill) {
+ struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
+
+ rsrc_node->rsrc_data = data_to_kill;
+ io_rsrc_ref_lock(ctx);
+ list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
+ io_rsrc_ref_unlock(ctx);
+
+ atomic_inc(&data_to_kill->refs);
+ percpu_ref_kill(&rsrc_node->refs);
+ ctx->rsrc_node = NULL;
+ }
+
+ if (!ctx->rsrc_node) {
+ ctx->rsrc_node = ctx->rsrc_backup_node;
+ ctx->rsrc_backup_node = NULL;
+ }
}
-static int io_rsrc_refnode_prealloc(struct io_ring_ctx *ctx)
+static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
if (ctx->rsrc_backup_node)
return 0;
- ctx->rsrc_backup_node = alloc_fixed_rsrc_ref_node(ctx);
+ ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
-static struct fixed_rsrc_ref_node *
-io_rsrc_refnode_get(struct io_ring_ctx *ctx,
- struct fixed_rsrc_data *rsrc_data,
- void (*rsrc_put)(struct io_ring_ctx *ctx,
- struct io_rsrc_put *prsrc))
-{
- struct fixed_rsrc_ref_node *node = ctx->rsrc_backup_node;
-
- WARN_ON_ONCE(!node);
-
- ctx->rsrc_backup_node = NULL;
- node->rsrc_data = rsrc_data;
- node->rsrc_put = rsrc_put;
- return node;
-}
-
-static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
- struct io_ring_ctx *ctx,
- void (*rsrc_put)(struct io_ring_ctx *ctx,
- struct io_rsrc_put *prsrc))
+static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
{
- struct fixed_rsrc_ref_node *node;
int ret;
+	/* As we may drop ->uring_lock, another task may have started quiesce */
if (data->quiesce)
return -ENXIO;
data->quiesce = true;
do {
- ret = io_rsrc_refnode_prealloc(ctx);
+ ret = io_rsrc_node_switch_start(ctx);
if (ret)
break;
- io_sqe_rsrc_kill_node(ctx, data);
- percpu_ref_kill(&data->refs);
- flush_delayed_work(&ctx->rsrc_put_work);
+ io_rsrc_node_switch(ctx, data);
+ /* kill initial ref, already quiesced if zero */
+ if (atomic_dec_and_test(&data->refs))
+ break;
+ flush_delayed_work(&ctx->rsrc_put_work);
ret = wait_for_completion_interruptible(&data->done);
if (!ret)
break;
- percpu_ref_resurrect(&data->refs);
- node = io_rsrc_refnode_get(ctx, data, rsrc_put);
- io_sqe_rsrc_set_node(ctx, data, node);
+ atomic_inc(&data->refs);
+		/* wait for any work that may be completing data->done */
+ flush_delayed_work(&ctx->rsrc_put_work);
reinit_completion(&data->done);
mutex_unlock(&ctx->uring_lock);
return ret;
}
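The quiesce handshake hangs off two counters: data->refs starts at 1 (the "initial" ref), every node handed to io_rsrc_node_switch() takes one more, and __io_rsrc_put_work() completes data->done when the last ref drops. A toy single-threaded model of just the accounting, using C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

struct rsrc_data {
	atomic_int refs;
	int done;			/* stands in for struct completion */
};

static void rsrc_data_put(struct rsrc_data *d)
{
	if (atomic_fetch_sub(&d->refs, 1) == 1)
		d->done = 1;		/* complete(&data->done) */
}

int main(void)
{
	struct rsrc_data d = { .refs = 1 };	/* io_rsrc_data_alloc() */

	atomic_fetch_add(&d.refs, 1);	/* io_rsrc_node_switch() pins data */
	rsrc_data_put(&d);		/* quiesce kills the initial ref */
	rsrc_data_put(&d);		/* last node's __io_rsrc_put_work() */
	printf("quiesced: %s\n", d.done ? "yes" : "no");
	return 0;
}
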
-static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
+static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
+ rsrc_put_fn *do_put)
{
- struct fixed_rsrc_data *data;
+ struct io_rsrc_data *data;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return NULL;
- if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
- kfree(data);
- return NULL;
- }
+ atomic_set(&data->refs, 1);
data->ctx = ctx;
+ data->do_put = do_put;
init_completion(&data->done);
return data;
}
-static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
-{
- percpu_ref_exit(&data->refs);
- kfree(data->table);
- kfree(data);
-}
-
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
- struct fixed_rsrc_data *data = ctx->file_data;
- unsigned nr_tables, i;
int ret;
- /*
- * percpu_ref_is_dying() is to stop parallel files unregister
- * Since we possibly drop uring lock later in this function to
- * run task work.
- */
- if (!data || percpu_ref_is_dying(&data->refs))
+ if (!ctx->file_data)
return -ENXIO;
- ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
- if (ret)
- return ret;
-
- __io_sqe_files_unregister(ctx);
- nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
- for (i = 0; i < nr_tables; i++)
- kfree(data->table[i].files);
- free_fixed_rsrc_data(data);
- ctx->file_data = NULL;
- ctx->nr_user_files = 0;
- return 0;
+ ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
+ if (!ret)
+ __io_sqe_files_unregister(ctx);
+ return ret;
}
static void io_sq_thread_unpark(struct io_sq_data *sqd)
static void io_sq_thread_stop(struct io_sq_data *sqd)
{
WARN_ON_ONCE(sqd->thread == current);
+ WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
- mutex_lock(&sqd->lock);
set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ mutex_lock(&sqd->lock);
if (sqd->thread)
wake_up_process(sqd->thread);
mutex_unlock(&sqd->lock);
}
#endif
-static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
- unsigned nr_tables, unsigned nr_files)
+static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
- int i;
+ unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
+
+ table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
+ if (!table->files)
+ return false;
for (i = 0; i < nr_tables; i++) {
- struct fixed_rsrc_table *table = &file_data->table[i];
- unsigned this_files;
+ unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
- this_files = min(nr_files, IORING_MAX_FILES_TABLE);
- table->files = kcalloc(this_files, sizeof(struct file *),
+ table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
GFP_KERNEL);
- if (!table->files)
+ if (!table->files[i])
break;
nr_files -= this_files;
}
if (i == nr_tables)
- return 0;
+ return true;
- for (i = 0; i < nr_tables; i++) {
- struct fixed_rsrc_table *table = &file_data->table[i];
- kfree(table->files);
- }
- return 1;
+ io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
+ return false;
}
-static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
#endif
}
-static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
+static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
- struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
+ struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
struct io_ring_ctx *ctx = rsrc_data->ctx;
struct io_rsrc_put *prsrc, *tmp;
list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
list_del(&prsrc->list);
- ref_node->rsrc_put(ctx, prsrc);
+ rsrc_data->do_put(ctx, prsrc);
kfree(prsrc);
}
- percpu_ref_exit(&ref_node->refs);
- kfree(ref_node);
- percpu_ref_put(&rsrc_data->refs);
+ io_rsrc_node_destroy(ref_node);
+ if (atomic_dec_and_test(&rsrc_data->refs))
+ complete(&rsrc_data->done);
}
static void io_rsrc_put_work(struct work_struct *work)
node = llist_del_all(&ctx->rsrc_put_llist);
while (node) {
- struct fixed_rsrc_ref_node *ref_node;
+ struct io_rsrc_node *ref_node;
struct llist_node *next = node->next;
- ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
+ ref_node = llist_entry(node, struct io_rsrc_node, llist);
__io_rsrc_put_work(ref_node);
node = next;
}
static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
- struct fixed_rsrc_ref_node *ref_node;
- struct fixed_rsrc_data *data;
- struct io_ring_ctx *ctx;
+ struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
+ struct io_ring_ctx *ctx = node->rsrc_data->ctx;
bool first_add = false;
- int delay = HZ;
-
- ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
- data = ref_node->rsrc_data;
- ctx = data->ctx;
io_rsrc_ref_lock(ctx);
- ref_node->done = true;
+ node->done = true;
while (!list_empty(&ctx->rsrc_ref_list)) {
- ref_node = list_first_entry(&ctx->rsrc_ref_list,
- struct fixed_rsrc_ref_node, node);
+ node = list_first_entry(&ctx->rsrc_ref_list,
+ struct io_rsrc_node, node);
/* recycle ref nodes in order */
- if (!ref_node->done)
+ if (!node->done)
break;
- list_del(&ref_node->node);
- first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
+ list_del(&node->node);
+ first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
}
io_rsrc_ref_unlock(ctx);
- if (percpu_ref_is_dying(&data->refs))
- delay = 0;
-
- if (!delay)
- mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
- else if (first_add)
- queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
+ if (first_add)
+ mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
}
-static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
- struct io_ring_ctx *ctx)
+static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
- struct fixed_rsrc_ref_node *ref_node;
+ struct io_rsrc_node *ref_node;
ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
if (!ref_node)
return ref_node;
}
-static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
- struct fixed_rsrc_ref_node *ref_node)
-{
- ref_node->rsrc_data = ctx->file_data;
- ref_node->rsrc_put = io_ring_file_put;
-}
-
-static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
-{
- percpu_ref_exit(&ref_node->refs);
- kfree(ref_node);
-}
-
-
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
__s32 __user *fds = (__s32 __user *) arg;
- unsigned nr_tables, i;
struct file *file;
- int fd, ret = -ENOMEM;
- struct fixed_rsrc_ref_node *ref_node;
- struct fixed_rsrc_data *file_data;
+ int fd, ret;
+ unsigned i;
+ struct io_rsrc_data *file_data;
if (ctx->file_data)
return -EBUSY;
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
+ ret = io_rsrc_node_switch_start(ctx);
+ if (ret)
+ return ret;
- file_data = alloc_fixed_rsrc_data(ctx);
+ file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put);
if (!file_data)
return -ENOMEM;
ctx->file_data = file_data;
-
- nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
- file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
- GFP_KERNEL);
- if (!file_data->table)
- goto out_free;
-
- if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
+ ret = -ENOMEM;
+ if (!io_alloc_file_tables(&ctx->file_table, nr_args))
goto out_free;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- unsigned long file_ptr;
-
if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
goto out_fput;
fput(file);
goto out_fput;
}
- file_ptr = (unsigned long) file;
- if (__io_file_supports_async(file, READ))
- file_ptr |= FFS_ASYNC_READ;
- if (__io_file_supports_async(file, WRITE))
- file_ptr |= FFS_ASYNC_WRITE;
- if (S_ISREG(file_inode(file)->i_mode))
- file_ptr |= FFS_ISREG;
- *io_fixed_file_slot(file_data, i) = (struct file *) file_ptr;
+ io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
}
ret = io_sqe_files_scm(ctx);
if (ret) {
- io_sqe_files_unregister(ctx);
+ __io_sqe_files_unregister(ctx);
return ret;
}
- ref_node = alloc_fixed_rsrc_ref_node(ctx);
- if (!ref_node) {
- io_sqe_files_unregister(ctx);
- return -ENOMEM;
- }
- init_fixed_file_ref_node(ctx, ref_node);
-
- io_sqe_rsrc_set_node(ctx, file_data, ref_node);
+ io_rsrc_node_switch(ctx, NULL);
return ret;
out_fput:
for (i = 0; i < ctx->nr_user_files; i++) {
if (file)
fput(file);
}
- for (i = 0; i < nr_tables; i++)
- kfree(file_data->table[i].files);
+ io_free_file_tables(&ctx->file_table, nr_args);
ctx->nr_user_files = 0;
out_free:
- free_fixed_rsrc_data(ctx->file_data);
+ kfree(ctx->file_data);
ctx->file_data = NULL;
return ret;
}
#endif
}
-static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
+static int io_queue_rsrc_removal(struct io_rsrc_data *data,
+ struct io_rsrc_node *node, void *rsrc)
{
struct io_rsrc_put *prsrc;
- struct fixed_rsrc_ref_node *ref_node = data->node;
prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
if (!prsrc)
return -ENOMEM;
prsrc->rsrc = rsrc;
- list_add(&prsrc->list, &ref_node->rsrc_list);
-
+ list_add(&prsrc->list, &node->rsrc_list);
return 0;
}
-static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
- struct file *file)
-{
- return io_queue_rsrc_removal(data, (void *)file);
-}
-
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update *up,
unsigned nr_args)
{
- struct fixed_rsrc_data *data = ctx->file_data;
- struct fixed_rsrc_ref_node *ref_node;
- struct file *file, **file_slot;
+ struct io_rsrc_data *data = ctx->file_data;
+ struct io_fixed_file *file_slot;
+ struct file *file;
__s32 __user *fds;
int fd, i, err;
__u32 done;
return -EOVERFLOW;
if (done > ctx->nr_user_files)
return -EINVAL;
- err = io_rsrc_refnode_prealloc(ctx);
+ err = io_rsrc_node_switch_start(ctx);
if (err)
return err;
continue;
i = array_index_nospec(up->offset + done, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(ctx->file_data, i);
+ file_slot = io_fixed_file_slot(&ctx->file_table, i);
- if (*file_slot) {
- file = (struct file *) ((unsigned long) *file_slot & FFS_MASK);
- err = io_queue_file_removal(data, file);
+ if (file_slot->file_ptr) {
+ file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ err = io_queue_rsrc_removal(data, ctx->rsrc_node, file);
if (err)
break;
- *file_slot = NULL;
+ file_slot->file_ptr = 0;
needs_switch = true;
}
if (fd != -1) {
err = -EBADF;
break;
}
- *file_slot = file;
+ io_fixed_file_set(file_slot, file);
err = io_sqe_file_register(ctx, file, i);
if (err) {
- *file_slot = NULL;
+ file_slot->file_ptr = 0;
fput(file);
break;
}
}
}
- if (needs_switch) {
- percpu_ref_kill(&data->node->refs);
- ref_node = io_rsrc_refnode_get(ctx, data, io_ring_file_put);
- io_sqe_rsrc_set_node(ctx, data, ref_node);
- }
+ if (needs_switch)
+ io_rsrc_node_switch(ctx, data);
return done ? done : err;
}
return req ? &req->work : NULL;
}
-static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
+static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
+ struct task_struct *task)
{
struct io_wq_hash *hash;
struct io_wq_data data;
}
data.hash = hash;
+ data.task = task;
data.free_work = io_free_work;
data.do_work = io_wq_submit_work;
return ret;
}
- tctx->io_wq = io_init_wq_offload(ctx);
+ tctx->io_wq = io_init_wq_offload(ctx, task);
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
init_waitqueue_head(&tctx->wait);
tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
+ atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
struct io_sq_data *sqd;
bool attached;
- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
- goto err;
-
sqd = io_get_sq_data(p, &attached);
if (IS_ERR(sqd)) {
ret = PTR_ERR(sqd);
return off;
}
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
+{
+ unsigned int i;
+
+ for (i = 0; i < imu->nr_bvecs; i++)
+ unpin_user_page(imu->bvec[i].bv_page);
+ if (imu->acct_pages)
+ io_unaccount_mem(ctx, imu->acct_pages);
+ kvfree(imu->bvec);
+ imu->nr_bvecs = 0;
+}
+
static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
- int i, j;
+ unsigned int i;
if (!ctx->user_bufs)
return -ENXIO;
- for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
-
- for (j = 0; j < imu->nr_bvecs; j++)
- unpin_user_page(imu->bvec[j].bv_page);
-
- if (imu->acct_pages)
- io_unaccount_mem(ctx, imu->acct_pages);
- kvfree(imu->bvec);
- imu->nr_bvecs = 0;
- }
-
+ for (i = 0; i < ctx->nr_user_bufs; i++)
+ io_buffer_unmap(ctx, &ctx->user_bufs[i]);
kfree(ctx->user_bufs);
ctx->user_bufs = NULL;
ctx->nr_user_bufs = 0;
}
/* store original address for later verification */
imu->ubuf = ubuf;
- imu->len = iov->iov_len;
+ imu->ubuf_end = ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
ret = 0;
done:
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
{
- if (ctx->user_bufs)
- return -EBUSY;
- if (!nr_args || nr_args > UIO_MAXIOV)
- return -EINVAL;
-
- ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
- GFP_KERNEL);
- if (!ctx->user_bufs)
- return -ENOMEM;
-
- return 0;
+ ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
+ return ctx->user_bufs ? 0 : -ENOMEM;
}
static int io_buffer_validate(struct iovec *iov)
{
+ unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
+
/*
* Don't impose further limits on the size and buffer
* constraints here, we'll -EINVAL later when IO is
if (iov->iov_len > SZ_1G)
return -EFAULT;
+ if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
+ return -EOVERFLOW;
+
return 0;
}
struct iovec iov;
struct page *last_hpage = NULL;
+ if (ctx->user_bufs)
+ return -EBUSY;
+ if (!nr_args || nr_args > UIO_MAXIOV)
+ return -EINVAL;
ret = io_buffers_map_alloc(ctx, nr_args);
if (ret)
return ret;
- for (i = 0; i < nr_args; i++) {
+ for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
break;
-
ret = io_buffer_validate(&iov);
if (ret)
break;
-
ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
if (ret)
break;
-
- ctx->nr_user_bufs++;
}
if (ret)
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
- /*
- * Some may use context even when all refs and requests have been put,
- * and they are free to do so while still holding uring_lock or
- * completion_lock, see __io_req_task_submit(). Wait for them to finish.
- */
- mutex_lock(&ctx->uring_lock);
- mutex_unlock(&ctx->uring_lock);
- spin_lock_irq(&ctx->completion_lock);
- spin_unlock_irq(&ctx->completion_lock);
-
io_sq_thread_finish(ctx);
io_sqe_buffers_unregister(ctx);
}
mutex_lock(&ctx->uring_lock);
- io_sqe_files_unregister(ctx);
+ if (ctx->file_data) {
+ if (!atomic_dec_and_test(&ctx->file_data->refs))
+ wait_for_completion(&ctx->file_data->done);
+ __io_sqe_files_unregister(ctx);
+ }
+ if (ctx->rings)
+ __io_cqring_overflow_flush(ctx, true);
mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
io_destroy_buffers(ctx);
+	/* there are no registered resources left, nobody can be using the nodes */
+ if (ctx->rsrc_node)
+ io_rsrc_node_destroy(ctx->rsrc_node);
if (ctx->rsrc_backup_node)
- destroy_fixed_rsrc_ref_node(ctx->rsrc_backup_node);
+ io_rsrc_node_destroy(ctx->rsrc_backup_node);
+ flush_delayed_work(&ctx->rsrc_put_work);
+
+ WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
+ WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
WARN_ON_ONCE(time_after(jiffies, timeout));
} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
+ init_completion(&exit.completion);
+ init_task_work(&exit.task_work, io_tctx_exit_cb);
+ exit.ctx = ctx;
+ /*
+ * Some may use context even when all refs and requests have been put,
+ * and they are free to do so while still holding uring_lock or
+ * completion_lock, see __io_req_task_submit(). Apart from other work,
+	 * this lock/unlock section also waits for them to finish.
+ */
mutex_lock(&ctx->uring_lock);
while (!list_empty(&ctx->tctx_list)) {
WARN_ON_ONCE(time_after(jiffies, timeout));
node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
ctx_node);
- exit.ctx = ctx;
- init_completion(&exit.completion);
- init_task_work(&exit.task_work, io_tctx_exit_cb);
+ /* don't spin on a single task if cancellation failed */
+ list_rotate_left(&ctx->tctx_list);
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
if (WARN_ON_ONCE(ret))
continue;
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&exit.completion);
- cond_resched();
mutex_lock(&ctx->uring_lock);
}
mutex_unlock(&ctx->uring_lock);
+ spin_lock_irq(&ctx->completion_lock);
+ spin_unlock_irq(&ctx->completion_lock);
io_ring_ctx_free(ctx);
}
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
- /* if force is set, the ring is going away. always drop after that */
- ctx->cq_overflow_flushed = 1;
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
xa_for_each(&ctx->personalities, index, creds)
}
}
-static int io_uring_count_inflight(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
-{
- struct io_kiocb *req;
- int cnt = 0;
-
- spin_lock_irq(&ctx->inflight_lock);
- list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
- cnt += io_match_task(req, task, files);
- spin_unlock_irq(&ctx->inflight_lock);
- return cnt;
-}
-
-static void io_uring_cancel_files(struct io_ring_ctx *ctx,
- struct task_struct *task,
- struct files_struct *files)
-{
- while (!list_empty_careful(&ctx->inflight_list)) {
- DEFINE_WAIT(wait);
- int inflight;
-
- inflight = io_uring_count_inflight(ctx, task, files);
- if (!inflight)
- break;
-
- io_uring_try_cancel_requests(ctx, task, files);
-
- prepare_to_wait(&task->io_uring->wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (inflight == io_uring_count_inflight(ctx, task, files))
- schedule();
- finish_wait(&task->io_uring->wait, &wait);
- }
-}
-
static int __io_uring_add_task_file(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
}
}
-static s64 tctx_inflight(struct io_uring_task *tctx)
+static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
+ if (tracked)
+ return atomic_read(&tctx->inflight_tracked);
return percpu_counter_sum(&tctx->inflight);
}
wait_for_completion(&work.completion);
}
-void __io_uring_files_cancel(struct files_struct *files)
+static void io_uring_try_cancel(struct files_struct *files)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_node *node;
unsigned long index;
- /* make sure overflow events are dropped */
- atomic_inc(&tctx->in_idle);
xa_for_each(&tctx->xa, index, node) {
struct io_ring_ctx *ctx = node->ctx;
io_sqpoll_cancel_sync(ctx);
continue;
}
- io_uring_cancel_files(ctx, current, files);
- if (!files)
- io_uring_try_cancel_requests(ctx, current, NULL);
+ io_uring_try_cancel_requests(ctx, current, files);
}
- atomic_dec(&tctx->in_idle);
-
- if (files)
- io_uring_clean_tctx(tctx);
}
/* should only be called by SQPOLL task */
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
- inflight = tctx_inflight(tctx);
+ inflight = tctx_inflight(tctx, false);
if (!inflight)
break;
io_uring_try_cancel_requests(ctx, current, NULL);
* avoids a race where a completion comes in before we did
* prepare_to_wait().
*/
- if (inflight == tctx_inflight(tctx))
+ if (inflight == tctx_inflight(tctx, false))
schedule();
finish_wait(&tctx->wait, &wait);
} while (1);
* Find any io_uring fd that this task has registered or done IO on, and cancel
* requests.
*/
-void __io_uring_task_cancel(void)
+void __io_uring_cancel(struct files_struct *files)
{
struct io_uring_task *tctx = current->io_uring;
DEFINE_WAIT(wait);
/* make sure overflow events are dropped */
atomic_inc(&tctx->in_idle);
- __io_uring_files_cancel(NULL);
+ io_uring_try_cancel(files);
do {
/* read completions before cancelations */
- inflight = tctx_inflight(tctx);
+ inflight = tctx_inflight(tctx, !!files);
if (!inflight)
break;
- __io_uring_files_cancel(NULL);
-
+ io_uring_try_cancel(files);
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
/*
* avoids a race where a completion comes in before we did
* prepare_to_wait().
*/
- if (inflight == tctx_inflight(tctx))
+ if (inflight == tctx_inflight(tctx, !!files))
schedule();
finish_wait(&tctx->wait, &wait);
} while (1);
-
atomic_dec(&tctx->in_idle);
io_uring_clean_tctx(tctx);
- /* all current's requests should be gone, we can kill tctx */
- __io_uring_free(current);
+ if (!files) {
+		/* for exec, all of current's requests should be gone; kill tctx */
+ __io_uring_free(current);
+ }
}
static void *io_uring_validate_mmap_request(struct file *file,
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
+ unsigned int len = buf->ubuf_end - buf->ubuf;
- seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
- (unsigned int) buf->len);
+ seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
}
if (has_lock && !xa_empty(&ctx->personalities)) {
unsigned long index;
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
+ case IORING_REGISTER_FILES:
case IORING_UNREGISTER_FILES:
case IORING_REGISTER_FILES_UPDATE:
case IORING_REGISTER_PROBE:
if (ret < 0)
break;
} while (1);
-
mutex_lock(&ctx->uring_lock);
if (ret) {
- percpu_ref_resurrect(&ctx->refs);
- goto out_quiesce;
+ io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
+ return ret;
}
}
if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
-out_quiesce:
reinit_completion(&ctx->ref_comp);
}
return ret;