#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
+#define IO_RSRC_TAG_TABLE_SHIFT 9
+#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
+#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
+
#define IORING_MAX_REG_BUFFERS (1U << 14)
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
IOSQE_BUFFER_SELECT)
+#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
+ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
+
+#define IO_TCTX_REFS_CACHE_NR (1U << 10)
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
struct io_rsrc_data {
struct io_ring_ctx *ctx;
- u64 *tags;
+ u64 **tags;
+ unsigned int nr;
rsrc_put_fn *do_put;
atomic_t refs;
struct completion done;
};
struct io_ring_ctx {
+ /* const or read-mostly hot data */
struct {
struct percpu_ref refs;
- } ____cacheline_aligned_in_smp;
- struct {
+ struct io_rings *rings;
unsigned int flags;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
+ unsigned int off_timeout_used: 1;
+ unsigned int drain_active: 1;
+ } ____cacheline_aligned_in_smp;
+
+ /* submission data */
+ struct {
+ struct mutex uring_lock;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
* array.
*/
u32 *sq_array;
+ struct io_uring_sqe *sq_sqes;
unsigned cached_sq_head;
unsigned sq_entries;
- unsigned sq_thread_idle;
- unsigned cached_sq_dropped;
- unsigned long sq_check_overflow;
-
struct list_head defer_list;
- struct list_head timeout_list;
- struct list_head cq_overflow_list;
- struct io_uring_sqe *sq_sqes;
- } ____cacheline_aligned_in_smp;
+ /*
+ * Fixed resources fast path, should be accessed only under
+ * uring_lock, and updated through io_uring_register(2)
+ */
+ struct io_rsrc_node *rsrc_node;
+ struct io_file_table file_table;
+ unsigned nr_user_files;
+ unsigned nr_user_bufs;
+ struct io_mapped_ubuf **user_bufs;
- struct {
- struct mutex uring_lock;
- wait_queue_head_t wait;
+ struct io_submit_state submit_state;
+ struct list_head timeout_list;
+ struct list_head cq_overflow_list;
+ struct xarray io_buffers;
+ struct xarray personalities;
+ u32 pers_next;
+ unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp;
- struct io_submit_state submit_state;
/* IRQ completion list, under ->completion_lock */
struct list_head locked_free_list;
unsigned int locked_free_nr;
- struct io_rings *rings;
-
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
- /*
- * Fixed resources fast path, should be accessed only under uring_lock,
- * and updated through io_uring_register(2)
- */
- struct io_rsrc_node *rsrc_node;
-
- struct io_file_table file_table;
- unsigned nr_user_files;
- unsigned nr_user_bufs;
- struct io_mapped_ubuf **user_bufs;
-
- struct xarray io_buffers;
- struct xarray personalities;
- u32 pers_next;
+ unsigned long check_cq_overflow;
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- atomic_t cq_timeouts;
- unsigned cq_last_tm_flush;
- unsigned cq_extra;
- unsigned long cq_check_overflow;
+ struct eventfd_ctx *cq_ev_fd;
+ struct wait_queue_head poll_wait;
struct wait_queue_head cq_wait;
+ unsigned cq_extra;
+ atomic_t cq_timeouts;
struct fasync_struct *cq_fasync;
- struct eventfd_ctx *cq_ev_fd;
+ unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp;
struct {
struct io_uring_task {
/* submission side */
+ int cached_refs;
struct xarray xa;
struct wait_queue_head wait;
const struct io_ring_ctx *last;
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_DONT_REISSUE_BIT,
+ REQ_F_CREDS_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_ASYNC_READ_BIT,
REQ_F_ASYNC_WRITE_BIT,
REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
+ /* has creds assigned */
+ REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
};
struct async_poll {
struct hlist_node hash_node;
struct async_poll *apoll;
struct io_wq_work work;
+ const struct cred *creds;
+
/* store used ubuf, so we can prevent reloading */
struct io_mapped_ubuf *imu;
};
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_comp_state *cs,
- struct io_ring_ctx *ctx);
+static void io_submit_flush_completions(struct io_ring_ctx *ctx);
static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
- init_waitqueue_head(&ctx->cq_wait);
+ init_waitqueue_head(&ctx->poll_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
- init_waitqueue_head(&ctx->wait);
+ init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
- if (!req->work.creds)
- req->work.creds = get_current_cred();
+ if (!(req->flags & REQ_F_CREDS)) {
+ req->flags |= REQ_F_CREDS;
+ req->creds = get_current_cred();
+ }
req->work.list.next = NULL;
req->work.flags = 0;
}
}
-static void __io_queue_deferred(struct io_ring_ctx *ctx)
+static void io_queue_deferred(struct io_ring_ctx *ctx)
{
- do {
+ while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
list_del_init(&de->list);
io_req_task_queue(de->req);
kfree(de);
- } while (!list_empty(&ctx->defer_list));
+ }
}
static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
- u32 seq;
-
- if (list_empty(&ctx->timeout_list))
- return;
+ u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
- seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
-
- do {
+ while (!list_empty(&ctx->timeout_list)) {
u32 events_needed, events_got;
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
struct io_kiocb, timeout.list);
list_del_init(&req->timeout.list);
io_kill_timeout(req, 0);
- } while (!list_empty(&ctx->timeout_list));
-
+ }
ctx->cq_last_tm_flush = seq;
}
-static void io_commit_cqring(struct io_ring_ctx *ctx)
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- io_flush_timeouts(ctx);
+ if (ctx->off_timeout_used)
+ io_flush_timeouts(ctx);
+ if (ctx->drain_active)
+ io_queue_deferred(ctx);
+}
+static inline void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active))
+ __io_commit_cqring_flush(ctx);
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
-
- if (unlikely(!list_empty(&ctx->defer_list)))
- __io_queue_deferred(ctx);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
/* see waitqueue_active() comment */
smp_mb();
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->cq_wait))
+ wake_up(&ctx->cq_wait);
if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->cq_wait)) {
- wake_up_interruptible(&ctx->cq_wait);
+ if (waitqueue_active(&ctx->poll_wait)) {
+ wake_up_interruptible(&ctx->poll_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
}
smp_mb();
if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->cq_wait))
+ wake_up(&ctx->cq_wait);
}
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->cq_wait)) {
- wake_up_interruptible(&ctx->cq_wait);
+ if (waitqueue_active(&ctx->poll_wait)) {
+ wake_up_interruptible(&ctx->poll_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
}
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
+ clear_bit(0, &ctx->check_cq_overflow);
ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
{
bool ret = true;
- if (test_bit(0, &ctx->cq_check_overflow)) {
+ if (test_bit(0, &ctx->check_cq_overflow)) {
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
return false;
}
if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(0, &ctx->sq_check_overflow);
- set_bit(0, &ctx->cq_check_overflow);
+ set_bit(0, &ctx->check_cq_overflow);
ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
ocqe->cqe.user_data = user_data;
static inline bool io_req_needs_clean(struct io_kiocb *req)
{
- return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP |
- REQ_F_POLLED | REQ_F_INFLIGHT);
+ return req->flags & IO_REQ_CLEAN_FLAGS;
}
static void io_req_complete_state(struct io_kiocb *req, long res,
percpu_ref_put(req->fixed_rsrc_refs);
if (req->async_data)
kfree(req->async_data);
- if (req->work.creds) {
- put_cred(req->work.creds);
- req->work.creds = NULL;
- }
}
/* must to be called somewhat shortly after putting a request */
return;
if (ctx->submit_state.comp.nr) {
mutex_lock(&ctx->uring_lock);
- io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
percpu_ref_put(&ctx->refs);
}
-static bool __tctx_task_work(struct io_uring_task *tctx)
+static void tctx_task_work(struct callback_head *cb)
{
struct io_ring_ctx *ctx = NULL;
- struct io_wq_work_list list;
- struct io_wq_work_node *node;
+ struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
+ task_work);
- if (wq_list_empty(&tctx->task_list))
- return false;
+ clear_bit(0, &tctx->task_state);
- spin_lock_irq(&tctx->task_lock);
- list = tctx->task_list;
- INIT_WQ_LIST(&tctx->task_list);
- spin_unlock_irq(&tctx->task_lock);
+ while (!wq_list_empty(&tctx->task_list)) {
+ struct io_wq_work_node *node;
- node = list.first;
- while (node) {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req;
+ spin_lock_irq(&tctx->task_lock);
+ node = tctx->task_list.first;
+ INIT_WQ_LIST(&tctx->task_list);
+ spin_unlock_irq(&tctx->task_lock);
- req = container_of(node, struct io_kiocb, io_task_work.node);
- if (req->ctx != ctx) {
- ctx_flush_and_put(ctx);
- ctx = req->ctx;
- percpu_ref_get(&ctx->refs);
- }
+ while (node) {
+ struct io_wq_work_node *next = node->next;
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ io_task_work.node);
- req->task_work.func(&req->task_work);
- node = next;
+ if (req->ctx != ctx) {
+ ctx_flush_and_put(ctx);
+ ctx = req->ctx;
+ percpu_ref_get(&ctx->refs);
+ }
+ req->task_work.func(&req->task_work);
+ node = next;
+ }
+ cond_resched();
}
ctx_flush_and_put(ctx);
- return list.first != NULL;
-}
-
-static void tctx_task_work(struct callback_head *cb)
-{
- struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
-
- clear_bit(0, &tctx->task_state);
-
- while (__tctx_task_work(tctx))
- cond_resched();
}
static int io_req_task_work_add(struct io_kiocb *req)
list_add(&req->compl.list, &state->comp.free_list);
}
-static void io_submit_flush_completions(struct io_comp_state *cs,
- struct io_ring_ctx *ctx)
+static void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
+ struct io_comp_state *cs = &ctx->submit_state.comp;
int i, nr = cs->nr;
struct io_kiocb *req;
struct req_batch rb;
* If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error).
*/
- if (test_bit(0, &ctx->cq_check_overflow))
+ if (test_bit(0, &ctx->check_cq_overflow))
__io_cqring_overflow_flush(ctx, false);
if (io_cqring_events(ctx))
goto out;
return true;
}
-static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
+static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
if (req->file->f_op->read_iter)
return call_read_iter(req->file, &req->rw.kiocb, iter);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
- trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
+ trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
if (io_poll_rewait(req, &apoll->poll)) {
spin_unlock_irq(&ctx->completion_lock);
return false;
}
spin_unlock_irq(&ctx->completion_lock);
- trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
- apoll->poll.events);
+ trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
+ mask, apoll->poll.events);
return true;
}
return -EINVAL;
req->timeout.off = off;
+ if (unlikely(off && !req->ctx->off_timeout_used))
+ req->ctx->off_timeout_used = true;
if (!req->async_data && io_alloc_async_data(req))
return -ENOMEM;
static u32 io_get_sequence(struct io_kiocb *req)
{
- struct io_kiocb *pos;
- struct io_ring_ctx *ctx = req->ctx;
- u32 total_submitted, nr_reqs = 0;
-
- io_for_each_link(pos, req)
- nr_reqs++;
+ u32 seq = req->ctx->cached_sq_head;
- total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
- return total_submitted - nr_reqs;
+ /* need original cached_sq_head, but it was increased for each req */
+ io_for_each_link(req, req)
+ seq--;
+ return seq;
}
-static int io_req_defer(struct io_kiocb *req)
+static bool io_drain_req(struct io_kiocb *req)
{
+ struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
u32 seq;
+ /*
+ * If we need to drain a request in the middle of a link, drain the
+ * head request and the next request/link after the current link.
+ * Considering sequential execution of links, IOSQE_IO_DRAIN will be
+ * maintained for every request of our link.
+ */
+ if (ctx->drain_next) {
+ req->flags |= REQ_F_IO_DRAIN;
+ ctx->drain_next = false;
+ }
+ /* not interested in head, start from the first linked */
+ io_for_each_link(pos, req->link) {
+ if (pos->flags & REQ_F_IO_DRAIN) {
+ ctx->drain_next = true;
+ req->flags |= REQ_F_IO_DRAIN;
+ break;
+ }
+ }
+
/* Still need defer if there is pending req in defer list. */
if (likely(list_empty_careful(&ctx->defer_list) &&
- !(req->flags & REQ_F_IO_DRAIN)))
- return 0;
+ !(req->flags & REQ_F_IO_DRAIN))) {
+ ctx->drain_active = false;
+ return false;
+ }
seq = io_get_sequence(req);
/* Still a chance to pass the sequence check */
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
- return 0;
+ return false;
ret = io_req_prep_async(req);
if (ret)
return ret;
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
- if (!de)
- return -ENOMEM;
+ if (!de) {
+ io_req_complete_failed(req, ret);
+ return true;
+ }
spin_lock_irq(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock_irq(&ctx->completion_lock);
kfree(de);
io_queue_async_work(req);
- return -EIOCBQUEUED;
+ return true;
}
trace_io_uring_defer(ctx, req, req->user_data);
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
- return -EIOCBQUEUED;
+ return true;
}
static void io_clean_op(struct io_kiocb *req)
kfree(req->sr_msg.kbuf);
break;
}
- req->flags &= ~REQ_F_BUFFER_SELECTED;
}
if (req->flags & REQ_F_NEED_CLEANUP) {
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE: {
struct io_async_rw *io = req->async_data;
- if (io->free_iovec)
- kfree(io->free_iovec);
+
+ kfree(io->free_iovec);
break;
}
case IORING_OP_RECVMSG:
putname(req->unlink.filename);
break;
}
- req->flags &= ~REQ_F_NEED_CLEANUP;
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
kfree(req->apoll->double_poll);
struct io_uring_task *tctx = req->task->io_uring;
atomic_dec(&tctx->inflight_tracked);
- req->flags &= ~REQ_F_INFLIGHT;
}
+ if (req->flags & REQ_F_CREDS)
+ put_cred(req->creds);
+
+ req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
const struct cred *creds = NULL;
int ret;
- if (req->work.creds && req->work.creds != current_cred())
- creds = override_creds(req->work.creds);
+ if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ creds = override_creds(req->creds);
switch (req->opcode) {
case IORING_OP_NOP:
cs->reqs[cs->nr++] = req;
if (cs->nr == ARRAY_SIZE(cs->reqs))
- io_submit_flush_completions(cs, ctx);
+ io_submit_flush_completions(ctx);
} else {
io_put_req(req);
}
io_queue_linked_timeout(linked_timeout);
}
-static void io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req)
{
- int ret;
+ if (unlikely(req->ctx->drain_active) && io_drain_req(req))
+ return;
- ret = io_req_defer(req);
- if (ret) {
- if (ret != -EIOCBQUEUED) {
-fail_req:
- io_req_complete_failed(req, ret);
- }
- } else if (req->flags & REQ_F_FORCE_ASYNC) {
- ret = io_req_prep_async(req);
- if (unlikely(ret))
- goto fail_req;
- io_queue_async_work(req);
- } else {
+ if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) {
__io_queue_sqe(req);
+ } else {
+ int ret = io_req_prep_async(req);
+
+ if (unlikely(ret))
+ io_req_complete_failed(req, ret);
+ else
+ io_queue_async_work(req);
}
}
atomic_set(&req->refs, 2);
req->task = current;
req->result = 0;
- req->work.creds = NULL;
/* enforce forwards compatibility on users */
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
!io_op_defs[req->opcode].buffer_select)
return -EOPNOTSUPP;
+ if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
+ ctx->drain_active = true;
personality = READ_ONCE(sqe->personality);
if (personality) {
- req->work.creds = xa_load(&ctx->personalities, personality);
- if (!req->work.creds)
+ req->creds = xa_load(&ctx->personalities, personality);
+ if (!req->creds)
return -EINVAL;
- get_cred(req->work.creds);
+ get_cred(req->creds);
+ req->flags |= REQ_F_CREDS;
}
state = &ctx->submit_state;
io_req_complete_failed(req, ret);
return ret;
}
+
ret = io_req_prep(req, sqe);
if (unlikely(ret))
goto fail_req;
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
- true, ctx->flags & IORING_SETUP_SQPOLL);
+ trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
+ req->flags, true,
+ ctx->flags & IORING_SETUP_SQPOLL);
/*
* If we already have a head request, queue this one for async
if (link->head) {
struct io_kiocb *head = link->head;
- /*
- * Taking sequential execution of a link, draining both sides
- * of the link also fullfils IOSQE_IO_DRAIN semantics for all
- * requests in the link. So, it drains the head and the
- * next after the link request. The last one is done via
- * drain_next flag to persist the effect across calls.
- */
- if (req->flags & REQ_F_IO_DRAIN) {
- head->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = 1;
- }
ret = io_req_prep_async(req);
if (unlikely(ret))
goto fail_req;
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- io_queue_sqe(head);
link->head = NULL;
+ io_queue_sqe(head);
}
} else {
- if (unlikely(ctx->drain_next)) {
- req->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = 0;
- }
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
link->head = req;
link->last = req;
if (state->link.head)
io_queue_sqe(state->link.head);
if (state->comp.nr)
- io_submit_flush_completions(&state->comp, ctx);
+ io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
io_state_file_put(state);
}
/*
- * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
+ * Fetch an sqe, if one is available. Note this returns a pointer to memory
* that is mapped by userspace. This means that care needs to be taken to
* ensure that reads are stable, as we cannot rely on userspace always
* being a good citizen. If members of the sqe are validated and then later
*/
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
- u32 *sq_array = ctx->sq_array;
unsigned head, mask = ctx->sq_entries - 1;
+ unsigned sq_idx = ctx->cached_sq_head++ & mask;
/*
* The cached sq head (or cq tail) serves two purposes:
* 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it.
*/
- head = READ_ONCE(sq_array[ctx->cached_sq_head++ & mask]);
+ head = READ_ONCE(ctx->sq_array[sq_idx]);
if (likely(head < ctx->sq_entries))
return &ctx->sq_sqes[head];
/* drop invalid entries */
- ctx->cached_sq_dropped++;
- WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
+ ctx->cq_extra--;
+ WRITE_ONCE(ctx->rings->sq_dropped,
+ READ_ONCE(ctx->rings->sq_dropped) + 1);
return NULL;
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
{
+ struct io_uring_task *tctx;
int submitted = 0;
/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
-
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
- percpu_counter_add(¤t->io_uring->inflight, nr);
- refcount_add(nr, ¤t->usage);
+ tctx = current->io_uring;
+ tctx->cached_refs -= nr;
+ if (unlikely(tctx->cached_refs < 0)) {
+ unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
+
+ percpu_counter_add(&tctx->inflight, refill);
+ refcount_add(refill, ¤t->usage);
+ tctx->cached_refs += refill;
+ }
io_submit_state_start(&ctx->submit_state, nr);
while (submitted < nr) {
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
- struct io_uring_task *tctx = current->io_uring;
int unused = nr - ref_used;
+ current->io_uring->cached_refs += unused;
percpu_ref_put_many(&ctx->refs, unused);
- percpu_counter_sub(&tctx->inflight, unused);
- put_task_struct_many(current, unused);
}
io_submit_state_end(&ctx->submit_state, ctx);
timeout = jiffies + sqd->sq_thread_idle;
}
- io_uring_cancel_sqpoll(sqd);
+ io_uring_cancel_generic(true, sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_set_wakeup_flag(ctx);
* Cannot safely flush overflowed CQEs from here, ensure we wake up
* the task, and the next invocation will do it.
*/
- if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
+ if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
return autoremove_wake_function(curr, mode, wake_flags, key);
return -1;
}
if (ret || io_should_wake(iowq))
return ret;
/* let the caller flush overflows, retry */
- if (test_bit(0, &ctx->cq_check_overflow))
+ if (test_bit(0, &ctx->check_cq_overflow))
return 1;
*timeout = schedule_timeout(*timeout);
ret = -EBUSY;
break;
}
- prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
+ prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
- finish_wait(&ctx->wait, &iowq.wq);
+ finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+static void io_free_page_table(void **table, size_t size)
{
- unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
+ unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
for (i = 0; i < nr_tables; i++)
- kfree(table->files[i]);
- kfree(table->files);
- table->files = NULL;
+ kfree(table[i]);
+ kfree(table);
+}
+
+static void **io_alloc_page_table(size_t size)
+{
+ unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
+ size_t init_size = size;
+ void **table;
+
+ table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ for (i = 0; i < nr_tables; i++) {
+ unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
+
+ table[i] = kzalloc(this_size, GFP_KERNEL);
+ if (!table[i]) {
+ io_free_page_table(table, init_size);
+ return NULL;
+ }
+ size -= this_size;
+ }
+ return table;
}
static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
return ret;
}
+static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
+{
+ unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
+ unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
+
+ return &data->tags[table_idx][off];
+}
+
static void io_rsrc_data_free(struct io_rsrc_data *data)
{
- kvfree(data->tags);
+ size_t size = data->nr * sizeof(data->tags[0][0]);
+
+ if (data->tags)
+ io_free_page_table((void **)data->tags, size);
kfree(data);
}
-static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
- rsrc_put_fn *do_put,
- unsigned nr)
+static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
+ u64 __user *utags, unsigned nr,
+ struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
+ int ret = -ENOMEM;
+ unsigned i;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
- return NULL;
-
- data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
+ return -ENOMEM;
+ data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
if (!data->tags) {
kfree(data);
- return NULL;
+ return -ENOMEM;
}
- atomic_set(&data->refs, 1);
+ data->nr = nr;
data->ctx = ctx;
data->do_put = do_put;
+ if (utags) {
+ ret = -EFAULT;
+ for (i = 0; i < nr; i++) {
+ u64 *tag_slot = io_get_tag_slot(data, i);
+
+ if (copy_from_user(tag_slot, &utags[i],
+ sizeof(*tag_slot)))
+ goto fail;
+ }
+ }
+
+ atomic_set(&data->refs, 1);
init_completion(&data->done);
- return data;
+ *pdata = data;
+ return 0;
+fail:
+ io_rsrc_data_free(data);
+ return ret;
+}
+
+static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+ size_t size = nr_files * sizeof(struct io_fixed_file);
+
+ table->files = (struct io_fixed_file **)io_alloc_page_table(size);
+ return !!table->files;
+}
+
+static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
+{
+ size_t size = nr_files * sizeof(struct io_fixed_file);
+
+ io_free_page_table((void **)table->files, size);
+ table->files = NULL;
}
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
}
#endif
-static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
-{
- unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
-
- table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
- if (!table->files)
- return false;
-
- for (i = 0; i < nr_tables; i++) {
- unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
-
- table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
- GFP_KERNEL);
- if (!table->files[i])
- break;
- nr_files -= this_files;
- }
-
- if (i == nr_tables)
- return true;
-
- io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
- return false;
-}
-
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
struct file *file = prsrc->file;
if (prsrc->tag) {
bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
- unsigned long flags;
io_ring_submit_lock(ctx, lock_ring);
- spin_lock_irqsave(&ctx->completion_lock, flags);
+ spin_lock_irq(&ctx->completion_lock);
io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
ctx->cq_extra++;
io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
io_ring_submit_unlock(ctx, lock_ring);
}
struct file *file;
int fd, ret;
unsigned i;
- struct io_rsrc_data *file_data;
if (ctx->file_data)
return -EBUSY;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
+ ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
+ &ctx->file_data);
+ if (ret)
+ return ret;
- file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
- if (!file_data)
- return -ENOMEM;
- ctx->file_data = file_data;
ret = -ENOMEM;
if (!io_alloc_file_tables(&ctx->file_table, nr_args))
goto out_free;
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- u64 tag = 0;
-
- if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
- copy_from_user(&fd, &fds[i], sizeof(fd))) {
+ if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
goto out_fput;
}
/* allow sparse sets */
if (fd == -1) {
ret = -EINVAL;
- if (unlikely(tag))
+ if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
goto out_fput;
continue;
}
fput(file);
goto out_fput;
}
- ctx->file_data->tags[i] = tag;
io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
}
if (!prsrc)
return -ENOMEM;
- prsrc->tag = data->tags[idx];
+ prsrc->tag = *io_get_tag_slot(data, idx);
prsrc->rsrc = rsrc;
list_add(&prsrc->list, &node->rsrc_list);
return 0;
err = -EBADF;
break;
}
- data->tags[up->offset + done] = tag;
+ *io_get_tag_slot(data, up->offset + done) = tag;
io_fixed_file_set(file_slot, file);
err = io_sqe_file_register(ctx, file, i);
if (err) {
struct io_uring_task *tctx;
int ret;
- tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
+ tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
if (unlikely(!tctx))
return -ENOMEM;
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
- tctx->last = NULL;
atomic_set(&tctx->in_idle, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
- tctx->task_state = 0;
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
WARN_ON_ONCE(!xa_empty(&tctx->xa));
WARN_ON_ONCE(tctx->io_wq);
+ WARN_ON_ONCE(tctx->cached_refs);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
- data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
- if (!data)
- return -ENOMEM;
+ ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
+ if (ret)
+ return ret;
ret = io_buffers_map_alloc(ctx, nr_args);
if (ret) {
io_rsrc_data_free(data);
}
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
- u64 tag = 0;
-
- if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
- ret = -EFAULT;
- break;
- }
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
break;
ret = io_buffer_validate(&iov);
if (ret)
break;
- if (!iov.iov_base && tag) {
+ if (!iov.iov_base && *io_get_tag_slot(data, i)) {
ret = -EINVAL;
break;
}
&last_hpage);
if (ret)
break;
- data->tags[i] = tag;
}
WARN_ON_ONCE(ctx->buf_data);
}
ctx->user_bufs[i] = imu;
- ctx->buf_data->tags[offset] = tag;
+ *io_get_tag_slot(ctx->buf_data, offset) = tag;
}
if (needs_switch)
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->cq_wait, wait);
+ poll_wait(file, &ctx->poll_wait, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
* Users may get EPOLLIN meanwhile seeing nothing in cqring, this
* pushs them to do the flush.
*/
- if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
+ if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
return percpu_counter_sum(&tctx->inflight);
}
-static void io_uring_try_cancel(bool cancel_all)
+static void io_uring_drop_tctx_refs(struct task_struct *task)
{
- struct io_uring_task *tctx = current->io_uring;
- struct io_tctx_node *node;
- unsigned long index;
-
- xa_for_each(&tctx->xa, index, node) {
- struct io_ring_ctx *ctx = node->ctx;
+ struct io_uring_task *tctx = task->io_uring;
+ unsigned int refs = tctx->cached_refs;
- /* sqpoll task will cancel all its requests */
- if (!ctx->sq_data)
- io_uring_try_cancel_requests(ctx, current, cancel_all);
- }
+ tctx->cached_refs = 0;
+ percpu_counter_sub(&tctx->inflight, refs);
+ put_task_struct_many(task, refs);
}
-/* should only be called by SQPOLL task */
-static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
+/*
+ * Find any io_uring ctx that this task has registered or done IO on, and cancel
+ * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
+ */
+static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
s64 inflight;
DEFINE_WAIT(wait);
+ WARN_ON_ONCE(sqd && sqd->thread != current);
+
if (!current->io_uring)
return;
if (tctx->io_wq)
io_wq_exit_start(tctx->io_wq);
- WARN_ON_ONCE(!sqd || sqd->thread != current);
-
+ io_uring_drop_tctx_refs(current);
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */
- inflight = tctx_inflight(tctx, false);
+ inflight = tctx_inflight(tctx, !cancel_all);
if (!inflight)
break;
- list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_uring_try_cancel_requests(ctx, current, true);
- prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
- /*
- * If we've seen completions, retry without waiting. This
- * avoids a race where a completion comes in before we did
- * prepare_to_wait().
- */
- if (inflight == tctx_inflight(tctx, false))
- schedule();
- finish_wait(&tctx->wait, &wait);
- } while (1);
- atomic_dec(&tctx->in_idle);
-}
+ if (!sqd) {
+ struct io_tctx_node *node;
+ unsigned long index;
-/*
- * Find any io_uring fd that this task has registered or done IO on, and cancel
- * requests.
- */
-void __io_uring_cancel(struct files_struct *files)
-{
- struct io_uring_task *tctx = current->io_uring;
- DEFINE_WAIT(wait);
- s64 inflight;
- bool cancel_all = !files;
-
- if (tctx->io_wq)
- io_wq_exit_start(tctx->io_wq);
+ xa_for_each(&tctx->xa, index, node) {
+ /* sqpoll task will cancel all its requests */
+ if (node->ctx->sq_data)
+ continue;
+ io_uring_try_cancel_requests(node->ctx, current,
+ cancel_all);
+ }
+ } else {
+ list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+ io_uring_try_cancel_requests(ctx, current,
+ cancel_all);
+ }
- /* make sure overflow events are dropped */
- atomic_inc(&tctx->in_idle);
- do {
- /* read completions before cancelations */
- inflight = tctx_inflight(tctx, !cancel_all);
- if (!inflight)
- break;
- io_uring_try_cancel(cancel_all);
prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-
/*
* If we've seen completions, retry without waiting. This
* avoids a race where a completion comes in before we did
}
}
+void __io_uring_cancel(struct files_struct *files)
+{
+ io_uring_cancel_generic(!files, NULL);
+}
+
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
return -EINVAL;
}
+static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned len)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ cpumask_var_t new_mask;
+ int ret;
+
+ if (!tctx || !tctx->io_wq)
+ return -EINVAL;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_clear(new_mask);
+ if (len > cpumask_size())
+ len = cpumask_size();
+
+ if (copy_from_user(new_mask, arg, len)) {
+ free_cpumask_var(new_mask);
+ return -EFAULT;
+ }
+
+ ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
+ free_cpumask_var(new_mask);
+ return ret;
+}
+
+static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx || !tctx->io_wq)
+ return -EINVAL;
+
+ return io_wq_cpu_affinity(tctx->io_wq, NULL);
+}
+
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
case IORING_REGISTER_FILES_UPDATE2:
case IORING_REGISTER_BUFFERS2:
case IORING_REGISTER_BUFFERS_UPDATE:
+ case IORING_REGISTER_IOWQ_AFF:
+ case IORING_UNREGISTER_IOWQ_AFF:
return false;
default:
return true;
ret = io_register_rsrc_update(ctx, arg, nr_args,
IORING_RSRC_BUFFER);
break;
+ case IORING_REGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (!arg || !nr_args)
+ break;
+ ret = io_register_iowq_aff(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_unregister_iowq_aff(ctx);
+ break;
default:
ret = -EINVAL;
break;