#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
IOSQE_BUFFER_SELECT)
+#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
+ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
};
struct io_ring_ctx {
+ /* const or read-mostly hot data */
struct {
struct percpu_ref refs;
- } ____cacheline_aligned_in_smp;
- struct {
+ struct io_rings *rings;
unsigned int flags;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
+ unsigned int off_timeout_used: 1;
+ unsigned int drain_active: 1;
+ } ____cacheline_aligned_in_smp;
+
+ /* submission data */
+ struct {
+ struct mutex uring_lock;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
* array.
*/
u32 *sq_array;
+ struct io_uring_sqe *sq_sqes;
unsigned cached_sq_head;
unsigned sq_entries;
- unsigned sq_thread_idle;
- unsigned cached_sq_dropped;
- unsigned long sq_check_overflow;
-
struct list_head defer_list;
- struct list_head timeout_list;
- struct list_head cq_overflow_list;
- struct io_uring_sqe *sq_sqes;
- } ____cacheline_aligned_in_smp;
+ /*
+ * Fixed resources fast path, should be accessed only under
+ * uring_lock, and updated through io_uring_register(2)
+ */
+ struct io_rsrc_node *rsrc_node;
+ struct io_file_table file_table;
+ unsigned nr_user_files;
+ unsigned nr_user_bufs;
+ struct io_mapped_ubuf **user_bufs;
- struct {
- struct mutex uring_lock;
- wait_queue_head_t wait;
+ struct io_submit_state submit_state;
+ struct list_head timeout_list;
+ struct list_head cq_overflow_list;
+ struct xarray io_buffers;
+ struct xarray personalities;
+ u32 pers_next;
+ unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp;
- struct io_submit_state submit_state;
/* IRQ completion list, under ->completion_lock */
struct list_head locked_free_list;
unsigned int locked_free_nr;
- struct io_rings *rings;
-
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
- /*
- * Fixed resources fast path, should be accessed only under uring_lock,
- * and updated through io_uring_register(2)
- */
- struct io_rsrc_node *rsrc_node;
-
- struct io_file_table file_table;
- unsigned nr_user_files;
- unsigned nr_user_bufs;
- struct io_mapped_ubuf **user_bufs;
-
- struct xarray io_buffers;
- struct xarray personalities;
- u32 pers_next;
+ unsigned long check_cq_overflow;
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- atomic_t cq_timeouts;
- unsigned cq_last_tm_flush;
- unsigned cq_extra;
- unsigned long cq_check_overflow;
+ struct eventfd_ctx *cq_ev_fd;
+ struct wait_queue_head poll_wait;
struct wait_queue_head cq_wait;
+ unsigned cq_extra;
+ atomic_t cq_timeouts;
struct fasync_struct *cq_fasync;
- struct eventfd_ctx *cq_ev_fd;
+ unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp;
struct {
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_DONT_REISSUE_BIT,
+ REQ_F_CREDS_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_ASYNC_READ_BIT,
REQ_F_ASYNC_WRITE_BIT,
REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
+ /* has creds assigned */
+ REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
};
struct async_poll {
struct hlist_node hash_node;
struct async_poll *apoll;
struct io_wq_work work;
+ const struct cred *creds;
+
/* store used ubuf, so we can prevent reloading */
struct io_mapped_ubuf *imu;
};
static void io_rsrc_put_work(struct work_struct *work);
static void io_req_task_queue(struct io_kiocb *req);
-static void io_submit_flush_completions(struct io_comp_state *cs,
- struct io_ring_ctx *ctx);
+static void io_submit_flush_completions(struct io_ring_ctx *ctx);
static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
- init_waitqueue_head(&ctx->cq_wait);
+ init_waitqueue_head(&ctx->poll_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
- init_waitqueue_head(&ctx->wait);
+ init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
- if (!req->work.creds)
- req->work.creds = get_current_cred();
+ if (!(req->flags & REQ_F_CREDS)) {
+ req->flags |= REQ_F_CREDS;
+ req->creds = get_current_cred();
+ }
req->work.list.next = NULL;
req->work.flags = 0;
}
}
-static void __io_queue_deferred(struct io_ring_ctx *ctx)
+static void io_queue_deferred(struct io_ring_ctx *ctx)
{
- do {
+ while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
list_del_init(&de->list);
io_req_task_queue(de->req);
kfree(de);
- } while (!list_empty(&ctx->defer_list));
+ }
}
static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
- u32 seq;
-
- if (list_empty(&ctx->timeout_list))
- return;
-
- seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
+ u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
- do {
+ while (!list_empty(&ctx->timeout_list)) {
u32 events_needed, events_got;
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
struct io_kiocb, timeout.list);
list_del_init(&req->timeout.list);
io_kill_timeout(req, 0);
- } while (!list_empty(&ctx->timeout_list));
-
+ }
ctx->cq_last_tm_flush = seq;
}
-static void io_commit_cqring(struct io_ring_ctx *ctx)
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- io_flush_timeouts(ctx);
+ if (ctx->off_timeout_used)
+ io_flush_timeouts(ctx);
+ if (ctx->drain_active)
+ io_queue_deferred(ctx);
+}
+static inline void io_commit_cqring(struct io_ring_ctx *ctx)
+{
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active))
+ __io_commit_cqring_flush(ctx);
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
-
- if (unlikely(!list_empty(&ctx->defer_list)))
- __io_queue_deferred(ctx);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
/* see waitqueue_active() comment */
smp_mb();
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->cq_wait))
+ wake_up(&ctx->cq_wait);
if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->cq_wait)) {
- wake_up_interruptible(&ctx->cq_wait);
+ if (waitqueue_active(&ctx->poll_wait)) {
+ wake_up_interruptible(&ctx->poll_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
}
smp_mb();
if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->cq_wait))
+ wake_up(&ctx->cq_wait);
}
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
- if (waitqueue_active(&ctx->cq_wait)) {
- wake_up_interruptible(&ctx->cq_wait);
+ if (waitqueue_active(&ctx->poll_wait)) {
+ wake_up_interruptible(&ctx->poll_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
}
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
- clear_bit(0, &ctx->sq_check_overflow);
- clear_bit(0, &ctx->cq_check_overflow);
+ clear_bit(0, &ctx->check_cq_overflow);
ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
{
bool ret = true;
- if (test_bit(0, &ctx->cq_check_overflow)) {
+ if (test_bit(0, &ctx->check_cq_overflow)) {
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
return false;
}
if (list_empty(&ctx->cq_overflow_list)) {
- set_bit(0, &ctx->sq_check_overflow);
- set_bit(0, &ctx->cq_check_overflow);
+ set_bit(0, &ctx->check_cq_overflow);
ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
ocqe->cqe.user_data = user_data;
static inline bool io_req_needs_clean(struct io_kiocb *req)
{
- return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP |
- REQ_F_POLLED | REQ_F_INFLIGHT);
+ return req->flags & IO_REQ_CLEAN_FLAGS;
}
static void io_req_complete_state(struct io_kiocb *req, long res,
percpu_ref_put(req->fixed_rsrc_refs);
if (req->async_data)
kfree(req->async_data);
- if (req->work.creds) {
- put_cred(req->work.creds);
- req->work.creds = NULL;
- }
}
/* must to be called somewhat shortly after putting a request */
return;
if (ctx->submit_state.comp.nr) {
mutex_lock(&ctx->uring_lock);
- io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+ io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
percpu_ref_put(&ctx->refs);
}
-static bool __tctx_task_work(struct io_uring_task *tctx)
+static void tctx_task_work(struct callback_head *cb)
{
struct io_ring_ctx *ctx = NULL;
- struct io_wq_work_list list;
- struct io_wq_work_node *node;
+ struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
+ task_work);
- if (wq_list_empty(&tctx->task_list))
- return false;
+ clear_bit(0, &tctx->task_state);
- spin_lock_irq(&tctx->task_lock);
- list = tctx->task_list;
- INIT_WQ_LIST(&tctx->task_list);
- spin_unlock_irq(&tctx->task_lock);
+ while (!wq_list_empty(&tctx->task_list)) {
+ struct io_wq_work_node *node;
- node = list.first;
- while (node) {
- struct io_wq_work_node *next = node->next;
- struct io_kiocb *req;
+ spin_lock_irq(&tctx->task_lock);
+ node = tctx->task_list.first;
+ INIT_WQ_LIST(&tctx->task_list);
+ spin_unlock_irq(&tctx->task_lock);
- req = container_of(node, struct io_kiocb, io_task_work.node);
- if (req->ctx != ctx) {
- ctx_flush_and_put(ctx);
- ctx = req->ctx;
- percpu_ref_get(&ctx->refs);
- }
+ while (node) {
+ struct io_wq_work_node *next = node->next;
+ struct io_kiocb *req = container_of(node, struct io_kiocb,
+ io_task_work.node);
- req->task_work.func(&req->task_work);
- node = next;
+ if (req->ctx != ctx) {
+ ctx_flush_and_put(ctx);
+ ctx = req->ctx;
+ percpu_ref_get(&ctx->refs);
+ }
+ req->task_work.func(&req->task_work);
+ node = next;
+ }
+ cond_resched();
}
ctx_flush_and_put(ctx);
- return list.first != NULL;
-}
-
-static void tctx_task_work(struct callback_head *cb)
-{
- struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
-
- clear_bit(0, &tctx->task_state);
-
- while (__tctx_task_work(tctx))
- cond_resched();
}
static int io_req_task_work_add(struct io_kiocb *req)
list_add(&req->compl.list, &state->comp.free_list);
}
-static void io_submit_flush_completions(struct io_comp_state *cs,
- struct io_ring_ctx *ctx)
+static void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
+ struct io_comp_state *cs = &ctx->submit_state.comp;
int i, nr = cs->nr;
struct io_kiocb *req;
struct req_batch rb;
* If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error).
*/
- if (test_bit(0, &ctx->cq_check_overflow))
+ if (test_bit(0, &ctx->check_cq_overflow))
__io_cqring_overflow_flush(ctx, false);
if (io_cqring_events(ctx))
goto out;
return true;
}
-static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
+static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
if (req->file->f_op->read_iter)
return call_read_iter(req->file, &req->rw.kiocb, iter);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
- trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
+ trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
if (io_poll_rewait(req, &apoll->poll)) {
spin_unlock_irq(&ctx->completion_lock);
return false;
}
spin_unlock_irq(&ctx->completion_lock);
- trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
- apoll->poll.events);
+ trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
+ mask, apoll->poll.events);
return true;
}
return -EINVAL;
req->timeout.off = off;
+ if (unlikely(off && !req->ctx->off_timeout_used))
+ req->ctx->off_timeout_used = true;
if (!req->async_data && io_alloc_async_data(req))
return -ENOMEM;
static u32 io_get_sequence(struct io_kiocb *req)
{
- struct io_kiocb *pos;
- struct io_ring_ctx *ctx = req->ctx;
- u32 total_submitted, nr_reqs = 0;
+ u32 seq = req->ctx->cached_sq_head;
- io_for_each_link(pos, req)
- nr_reqs++;
-
- total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
- return total_submitted - nr_reqs;
+ /* need original cached_sq_head, but it was increased for each req */
+ io_for_each_link(req, req)
+ seq--;
+ return seq;
}
-static int io_req_defer(struct io_kiocb *req)
+static bool io_drain_req(struct io_kiocb *req)
{
+ struct io_kiocb *pos;
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
u32 seq;
+ /*
+ * If we need to drain a request in the middle of a link, drain the
+ * head request and the next request/link after the current link.
+ * Considering sequential execution of links, IOSQE_IO_DRAIN will be
+ * maintained for every request of our link.
+ */
+ if (ctx->drain_next) {
+ req->flags |= REQ_F_IO_DRAIN;
+ ctx->drain_next = false;
+ }
+ /* not interested in head, start from the first linked */
+ io_for_each_link(pos, req->link) {
+ if (pos->flags & REQ_F_IO_DRAIN) {
+ ctx->drain_next = true;
+ req->flags |= REQ_F_IO_DRAIN;
+ break;
+ }
+ }
+
/* Still need defer if there is pending req in defer list. */
if (likely(list_empty_careful(&ctx->defer_list) &&
- !(req->flags & REQ_F_IO_DRAIN)))
- return 0;
+ !(req->flags & REQ_F_IO_DRAIN))) {
+ ctx->drain_active = false;
+ return false;
+ }
seq = io_get_sequence(req);
/* Still a chance to pass the sequence check */
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
- return 0;
+ return false;
ret = io_req_prep_async(req);
if (ret)
return ret;
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
- if (!de)
- return -ENOMEM;
+ if (!de) {
+ io_req_complete_failed(req, ret);
+ return true;
+ }
spin_lock_irq(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock_irq(&ctx->completion_lock);
kfree(de);
io_queue_async_work(req);
- return -EIOCBQUEUED;
+ return true;
}
trace_io_uring_defer(ctx, req, req->user_data);
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
- return -EIOCBQUEUED;
+ return true;
}
static void io_clean_op(struct io_kiocb *req)
kfree(req->sr_msg.kbuf);
break;
}
- req->flags &= ~REQ_F_BUFFER_SELECTED;
}
if (req->flags & REQ_F_NEED_CLEANUP) {
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE: {
struct io_async_rw *io = req->async_data;
- if (io->free_iovec)
- kfree(io->free_iovec);
+
+ kfree(io->free_iovec);
break;
}
case IORING_OP_RECVMSG:
putname(req->unlink.filename);
break;
}
- req->flags &= ~REQ_F_NEED_CLEANUP;
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
kfree(req->apoll->double_poll);
struct io_uring_task *tctx = req->task->io_uring;
atomic_dec(&tctx->inflight_tracked);
- req->flags &= ~REQ_F_INFLIGHT;
}
+ if (req->flags & REQ_F_CREDS)
+ put_cred(req->creds);
+
+ req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
const struct cred *creds = NULL;
int ret;
- if (req->work.creds && req->work.creds != current_cred())
- creds = override_creds(req->work.creds);
+ if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
+ creds = override_creds(req->creds);
switch (req->opcode) {
case IORING_OP_NOP:
cs->reqs[cs->nr++] = req;
if (cs->nr == ARRAY_SIZE(cs->reqs))
- io_submit_flush_completions(cs, ctx);
+ io_submit_flush_completions(ctx);
} else {
io_put_req(req);
}
io_queue_linked_timeout(linked_timeout);
}
-static void io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req)
{
- int ret;
+ if (unlikely(req->ctx->drain_active) && io_drain_req(req))
+ return;
- ret = io_req_defer(req);
- if (ret) {
- if (ret != -EIOCBQUEUED) {
-fail_req:
- io_req_complete_failed(req, ret);
- }
- } else if (req->flags & REQ_F_FORCE_ASYNC) {
- ret = io_req_prep_async(req);
- if (unlikely(ret))
- goto fail_req;
- io_queue_async_work(req);
- } else {
+ if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) {
__io_queue_sqe(req);
+ } else {
+ int ret = io_req_prep_async(req);
+
+ if (unlikely(ret))
+ io_req_complete_failed(req, ret);
+ else
+ io_queue_async_work(req);
}
}
atomic_set(&req->refs, 2);
req->task = current;
req->result = 0;
- req->work.creds = NULL;
/* enforce forwards compatibility on users */
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
!io_op_defs[req->opcode].buffer_select)
return -EOPNOTSUPP;
+ if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
+ ctx->drain_active = true;
personality = READ_ONCE(sqe->personality);
if (personality) {
- req->work.creds = xa_load(&ctx->personalities, personality);
- if (!req->work.creds)
+ req->creds = xa_load(&ctx->personalities, personality);
+ if (!req->creds)
return -EINVAL;
- get_cred(req->work.creds);
+ get_cred(req->creds);
+ req->flags |= REQ_F_CREDS;
}
state = &ctx->submit_state;
io_req_complete_failed(req, ret);
return ret;
}
+
ret = io_req_prep(req, sqe);
if (unlikely(ret))
goto fail_req;
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
- true, ctx->flags & IORING_SETUP_SQPOLL);
+ trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
+ req->flags, true,
+ ctx->flags & IORING_SETUP_SQPOLL);
/*
* If we already have a head request, queue this one for async
if (link->head) {
struct io_kiocb *head = link->head;
- /*
- * Taking sequential execution of a link, draining both sides
- * of the link also fullfils IOSQE_IO_DRAIN semantics for all
- * requests in the link. So, it drains the head and the
- * next after the link request. The last one is done via
- * drain_next flag to persist the effect across calls.
- */
- if (req->flags & REQ_F_IO_DRAIN) {
- head->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = 1;
- }
ret = io_req_prep_async(req);
if (unlikely(ret))
goto fail_req;
/* last request of a link, enqueue the link */
if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
- io_queue_sqe(head);
link->head = NULL;
+ io_queue_sqe(head);
}
} else {
- if (unlikely(ctx->drain_next)) {
- req->flags |= REQ_F_IO_DRAIN;
- ctx->drain_next = 0;
- }
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
link->head = req;
link->last = req;
if (state->link.head)
io_queue_sqe(state->link.head);
if (state->comp.nr)
- io_submit_flush_completions(&state->comp, ctx);
+ io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
io_state_file_put(state);
}
/*
- * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
+ * Fetch an sqe, if one is available. Note this returns a pointer to memory
* that is mapped by userspace. This means that care needs to be taken to
* ensure that reads are stable, as we cannot rely on userspace always
* being a good citizen. If members of the sqe are validated and then later
*/
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
- u32 *sq_array = ctx->sq_array;
unsigned head, mask = ctx->sq_entries - 1;
+ unsigned sq_idx = ctx->cached_sq_head++ & mask;
/*
* The cached sq head (or cq tail) serves two purposes:
* 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it.
*/
- head = READ_ONCE(sq_array[ctx->cached_sq_head++ & mask]);
+ head = READ_ONCE(ctx->sq_array[sq_idx]);
if (likely(head < ctx->sq_entries))
return &ctx->sq_sqes[head];
/* drop invalid entries */
- ctx->cached_sq_dropped++;
- WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
+ ctx->cq_extra--;
+ WRITE_ONCE(ctx->rings->sq_dropped,
+ READ_ONCE(ctx->rings->sq_dropped) + 1);
return NULL;
}
* Cannot safely flush overflowed CQEs from here, ensure we wake up
* the task, and the next invocation will do it.
*/
- if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
+ if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
return autoremove_wake_function(curr, mode, wake_flags, key);
return -1;
}
if (ret || io_should_wake(iowq))
return ret;
/* let the caller flush overflows, retry */
- if (test_bit(0, &ctx->cq_check_overflow))
+ if (test_bit(0, &ctx->check_cq_overflow))
return 1;
*timeout = schedule_timeout(*timeout);
ret = -EBUSY;
break;
}
- prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
+ prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
- finish_wait(&ctx->wait, &iowq.wq);
+ finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
return NULL;
for (i = 0; i < nr_tables; i++) {
- unsigned int this_size = min(size, PAGE_SIZE);
+ unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
table[i] = kzalloc(this_size, GFP_KERNEL);
if (!table[i]) {
if (utags) {
ret = -EFAULT;
for (i = 0; i < nr; i++) {
- if (copy_from_user(io_get_tag_slot(data, i), &utags[i],
- sizeof(data->tags[i])))
+ u64 *tag_slot = io_get_tag_slot(data, i);
+
+ if (copy_from_user(tag_slot, &utags[i],
+ sizeof(*tag_slot)))
goto fail;
}
}
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
- poll_wait(file, &ctx->cq_wait, wait);
+ poll_wait(file, &ctx->poll_wait, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
* Users may get EPOLLIN meanwhile seeing nothing in cqring, this
* pushs them to do the flush.
*/
- if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
+ if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
return -EINVAL;
}
+static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned len)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ cpumask_var_t new_mask;
+ int ret;
+
+ if (!tctx || !tctx->io_wq)
+ return -EINVAL;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_clear(new_mask);
+ if (len > cpumask_size())
+ len = cpumask_size();
+
+ if (copy_from_user(new_mask, arg, len)) {
+ free_cpumask_var(new_mask);
+ return -EFAULT;
+ }
+
+ ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
+ free_cpumask_var(new_mask);
+ return ret;
+}
+
+static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
+{
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx || !tctx->io_wq)
+ return -EINVAL;
+
+ return io_wq_cpu_affinity(tctx->io_wq, NULL);
+}
+
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
case IORING_REGISTER_FILES_UPDATE2:
case IORING_REGISTER_BUFFERS2:
case IORING_REGISTER_BUFFERS_UPDATE:
+ case IORING_REGISTER_IOWQ_AFF:
+ case IORING_UNREGISTER_IOWQ_AFF:
return false;
default:
return true;
ret = io_register_rsrc_update(ctx, arg, nr_args,
IORING_RSRC_BUFFER);
break;
+ case IORING_REGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (!arg || !nr_args)
+ break;
+ ret = io_register_iowq_aff(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_IOWQ_AFF:
+ ret = -EINVAL;
+ if (arg || nr_args)
+ break;
+ ret = io_unregister_iowq_aff(ctx);
+ break;
default:
ret = -EINVAL;
break;