unsigned sq_mask;
unsigned sq_thread_idle;
unsigned cached_sq_dropped;
- atomic_t cached_cq_overflow;
+ unsigned cached_cq_overflow;
unsigned long sq_check_overflow;
struct list_head defer_list;
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_WORK_INITIALIZED_BIT,
+ REQ_F_LTIMEOUT_ACTIVE_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
- /* has linked timeout */
+ /* has or had linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
+ /* linked timeout is active, i.e. prepared by link's head */
+ REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
};
struct async_poll {
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
- /* needs rlimit(RLIMIT_FSIZE) assigned */
- unsigned needs_fsize : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
/* size of async data needed, if any */
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_fsize = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FSIZE,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_fsize = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
+ IO_WQ_WORK_MM,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
- .needs_fsize = 1,
- .work_flags = IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
},
[IORING_OP_OPENAT] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .needs_fsize = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FSIZE,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
refcount_set(&id->count, 1);
}
+static inline void __io_req_init_async(struct io_kiocb *req)
+{
+ memset(&req->work, 0, sizeof(req->work));
+ req->flags |= REQ_F_WORK_INITIALIZED;
+}
+
/*
* Note: must call io_req_init_async() for the first time you
* touch any members of io_wq_work.
if (req->flags & REQ_F_WORK_INITIALIZED)
return;
- memset(&req->work, 0, sizeof(req->work));
- req->flags |= REQ_F_WORK_INITIALIZED;
+ __io_req_init_async(req);
/* Grab a ref if this isn't our static identity */
req->work.identity = tctx->identity;
struct io_ring_ctx *ctx = req->ctx;
return seq != ctx->cached_cq_tail
- + atomic_read(&ctx->cached_cq_overflow);
+ + READ_ONCE(ctx->cached_cq_overflow);
}
return false;
struct io_identity *id = req->work.identity;
struct io_ring_ctx *ctx = req->ctx;
- if (def->needs_fsize && id->fsize != rlimit(RLIMIT_FSIZE))
- return false;
+ if (def->work_flags & IO_WQ_WORK_FSIZE) {
+ if (id->fsize != rlimit(RLIMIT_FSIZE))
+ return false;
+ req->work.flags |= IO_WQ_WORK_FSIZE;
+ }
if (!(req->work.flags & IO_WQ_WORK_FILES) &&
(def->work_flags & IO_WQ_WORK_FILES) &&
io_req_init_async(req);
id = req->work.identity;
+ if (req->flags & REQ_F_FORCE_ASYNC)
+ req->work.flags |= IO_WQ_WORK_CONCURRENT;
+
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
WRITE_ONCE(cqe->res, req->result);
WRITE_ONCE(cqe->flags, req->compl.cflags);
} else {
+ ctx->cached_cq_overflow++;
WRITE_ONCE(ctx->rings->cq_overflow,
- atomic_inc_return(&ctx->cached_cq_overflow));
+ ctx->cached_cq_overflow);
}
}
* then we cannot store the request for later flushing, we need
* to drop it on the floor.
*/
- WRITE_ONCE(ctx->rings->cq_overflow,
- atomic_inc_return(&ctx->cached_cq_overflow));
+ ctx->cached_cq_overflow++;
+ WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
} else {
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow);
percpu_ref_put(&ctx->refs);
}
-static bool io_link_cancel_timeout(struct io_kiocb *req)
-{
- struct io_timeout_data *io = req->async_data;
- struct io_ring_ctx *ctx = req->ctx;
- int ret;
-
- ret = hrtimer_try_to_cancel(&io->timer);
- if (ret != -1) {
- io_cqring_fill_event(req, -ECANCELED);
- io_commit_cqring(ctx);
- req->flags &= ~REQ_F_LINK_HEAD;
- io_put_req_deferred(req, 1);
- return true;
- }
-
- return false;
-}
-
-static bool __io_kill_linked_timeout(struct io_kiocb *req)
-{
- struct io_kiocb *link;
- bool wake_ev;
-
- if (list_empty(&req->link_list))
- return false;
- link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
- if (link->opcode != IORING_OP_LINK_TIMEOUT)
- return false;
-
- list_del_init(&link->link_list);
- wake_ev = io_link_cancel_timeout(link);
- req->flags &= ~REQ_F_LINK_TIMEOUT;
- return wake_ev;
-}
-
static void io_kill_linked_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *link;
+ bool cancelled = false;
unsigned long flags;
- bool wake_ev;
spin_lock_irqsave(&ctx->completion_lock, flags);
- wake_ev = __io_kill_linked_timeout(req);
+ link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
+ link_list);
+ /*
+ * Can happen if a linked timeout fired and link had been like
+ * req -> link t-out -> link t-out [-> ...]
+ */
+ if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
+ struct io_timeout_data *io = link->async_data;
+ int ret;
+
+ list_del_init(&link->link_list);
+ ret = hrtimer_try_to_cancel(&io->timer);
+ if (ret != -1) {
+ io_cqring_fill_event(link, -ECANCELED);
+ io_commit_cqring(ctx);
+ cancelled = true;
+ }
+ }
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
- if (wake_ev)
+ if (cancelled) {
io_cqring_ev_posted(ctx);
+ io_put_req(link);
+ }
}
static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
/*
* Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
-static void __io_fail_links(struct io_kiocb *req)
+static void io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+ spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) {
struct io_kiocb *link = list_first_entry(&req->link_list,
struct io_kiocb, link_list);
}
io_commit_cqring(ctx);
-}
-
-static void io_fail_links(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
- __io_fail_links(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
* For files that don't have ->read_iter() and ->write_iter(), handle them
* by looping over ->read() or ->write() manually.
*/
-static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
- struct iov_iter *iter)
+static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
{
+ struct kiocb *kiocb = &req->rw.kiocb;
+ struct file *file = req->file;
ssize_t ret = 0;
/*
if (!iov_iter_is_bvec(iter)) {
iovec = iov_iter_iovec(iter);
} else {
- /* fixed buffers import bvec */
- iovec.iov_base = kmap(iter->bvec->bv_page)
- + iter->iov_offset;
- iovec.iov_len = min(iter->count,
- iter->bvec->bv_len - iter->iov_offset);
+ iovec.iov_base = u64_to_user_ptr(req->rw.addr);
+ iovec.iov_len = req->rw.len;
}
if (rw == READ) {
iovec.iov_len, io_kiocb_ppos(kiocb));
}
- if (iov_iter_is_bvec(iter))
- kunmap(iter->bvec->bv_page);
-
if (nr < 0) {
if (!ret)
ret = nr;
ret += nr;
if (nr != iovec.iov_len)
break;
+ req->rw.len -= nr;
+ req->rw.addr += nr;
iov_iter_advance(iter, nr);
}
if (req->file->f_op->read_iter)
return call_read_iter(req->file, &req->rw.kiocb, iter);
else if (req->file->f_op->read)
- return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
+ return loop_rw_iter(READ, req, iter);
else
return -EINVAL;
}
if (req->file->f_op->write_iter)
ret2 = call_write_iter(req->file, kiocb, iter);
else if (req->file->f_op->write)
- ret2 = loop_rw_iter(WRITE, req->file, kiocb, iter);
+ ret2 = loop_rw_iter(WRITE, req, iter);
else
ret2 = -EINVAL;
io_commit_cqring(ctx);
}
-static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_poll_task_func(struct callback_head *cb)
{
+ struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *nxt;
if (io_poll_rewait(req, &req->poll)) {
spin_unlock_irq(&ctx->completion_lock);
- return;
- }
-
- hash_del(&req->hash_node);
- io_poll_complete(req, req->result, 0);
- spin_unlock_irq(&ctx->completion_lock);
-
- *nxt = io_put_req_find_next(req);
- io_cqring_ev_posted(ctx);
-}
+ } else {
+ hash_del(&req->hash_node);
+ io_poll_complete(req, req->result, 0);
+ spin_unlock_irq(&ctx->completion_lock);
-static void io_poll_task_func(struct callback_head *cb)
-{
- struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
- struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *nxt = NULL;
+ nxt = io_put_req_find_next(req);
+ io_cqring_ev_posted(ctx);
+ if (nxt)
+ __io_req_task_submit(nxt);
+ }
- io_poll_task_handler(req, &nxt);
- if (nxt)
- __io_req_task_submit(nxt);
percpu_ref_put(&ctx->refs);
}
/* make sure double remove sees this as being gone */
wait->private = NULL;
spin_unlock(&poll->head->lock);
- if (!done)
- __io_async_wake(req, poll, mask, io_poll_task_func);
+ if (!done) {
+ /* use wait func handler, so it matches the rq type */
+ poll->wait.func(&poll->wait, mode, sync, key);
+ }
}
refcount_dec(&req->refs);
return 1;
struct io_ring_ctx *ctx = req->ctx;
bool cancel = false;
+ INIT_HLIST_NODE(&req->hash_node);
io_init_poll_iocb(poll, mask, wake_func);
poll->file = req->file;
poll->wait.private = req;
req->flags |= REQ_F_POLLED;
req->apoll = apoll;
- INIT_HLIST_NODE(&req->hash_node);
mask = 0;
if (def->pollin)
return -EINVAL;
if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
return -EINVAL;
- if (!poll->file)
- return -EBADF;
events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
struct io_poll_table ipt;
__poll_t mask;
- INIT_HLIST_NODE(&req->hash_node);
ipt.pt._qproc = io_poll_queue_proc;
mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
if (!list_empty(&req->link_list)) {
prev = list_entry(req->link_list.prev, struct io_kiocb,
link_list);
- if (refcount_inc_not_zero(&prev->refs)) {
+ if (refcount_inc_not_zero(&prev->refs))
list_del_init(&req->link_list);
- prev->flags &= ~REQ_F_LINK_TIMEOUT;
- } else
+ else
prev = NULL;
}
if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
return NULL;
+ nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
req->flags |= REQ_F_LINK_TIMEOUT;
return nxt;
}
static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
{
struct io_kiocb *linked_timeout;
- struct io_kiocb *nxt;
const struct cred *old_creds = NULL;
int ret;
again:
linked_timeout = io_prep_linked_timeout(req);
- if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.identity->creds &&
+ if ((req->flags & REQ_F_WORK_INITIALIZED) &&
+ (req->work.flags & IO_WQ_WORK_CREDS) &&
req->work.identity->creds != current_cred()) {
if (old_creds)
revert_creds(old_creds);
old_creds = NULL; /* restored original creds */
else
old_creds = override_creds(req->work.identity->creds);
- req->work.flags |= IO_WQ_WORK_CREDS;
}
ret = io_issue_sqe(req, true, cs);
*/
if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
if (!io_arm_poll_handler(req)) {
-punt:
/*
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
- goto exit;
- }
+ } else if (likely(!ret)) {
+ /* drop submission reference */
+ req = io_put_req_find_next(req);
+ if (linked_timeout)
+ io_queue_linked_timeout(linked_timeout);
- if (unlikely(ret)) {
+ if (req) {
+ if (!(req->flags & REQ_F_FORCE_ASYNC))
+ goto again;
+ io_queue_async_work(req);
+ }
+ } else {
/* un-prep timeout, so it'll be killed as any other linked */
req->flags &= ~REQ_F_LINK_TIMEOUT;
req_set_fail_links(req);
io_put_req(req);
io_req_complete(req, ret);
- goto exit;
}
- /* drop submission reference */
- nxt = io_put_req_find_next(req);
- if (linked_timeout)
- io_queue_linked_timeout(linked_timeout);
-
- if (nxt) {
- req = nxt;
-
- if (req->flags & REQ_F_FORCE_ASYNC)
- goto punt;
- goto again;
- }
-exit:
if (old_creds)
revert_creds(old_creds);
}
if (unlikely(ret))
goto fail_req;
}
-
- /*
- * Never try inline submit of IOSQE_ASYNC is set, go straight
- * to async execution.
- */
- io_req_init_async(req);
- req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
if (sqe) {
if (id) {
struct io_identity *iod;
- io_req_init_async(req);
iod = idr_find(&ctx->personality_idr, id);
if (unlikely(!iod))
return -EINVAL;
refcount_inc(&iod->count);
- io_put_identity(current->io_uring, req);
+
+ __io_req_init_async(req);
get_cred(iod->creds);
req->work.identity = iod;
req->work.flags |= IO_WQ_WORK_CREDS;
fput(file);
}
-static void __io_uring_attempt_task_drop(struct file *file)
-{
- struct file *old = xa_load(¤t->io_uring->xa, (unsigned long)file);
-
- if (old == file)
- io_uring_del_task_file(file);
-}
-
/*
* Drop task note for this file if we're the only ones that hold it after
* pending fput()
*/
-static void io_uring_attempt_task_drop(struct file *file, bool exiting)
+static void io_uring_attempt_task_drop(struct file *file)
{
if (!current->io_uring)
return;
* fput() is pending, will be 2 if the only other ref is our potential
* task file note. If the task is exiting, drop regardless of count.
*/
- if (!exiting && atomic_long_read(&file->f_count) != 2)
- return;
-
- __io_uring_attempt_task_drop(file);
+ if (fatal_signal_pending(current) || (current->flags & PF_EXITING) ||
+ atomic_long_read(&file->f_count) == 2)
+ io_uring_del_task_file(file);
}
void __io_uring_files_cancel(struct files_struct *files)
static int io_uring_flush(struct file *file, void *data)
{
- struct io_ring_ctx *ctx = file->private_data;
-
- /*
- * If the task is going away, cancel work it may have pending
- */
- if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
- data = NULL;
-
- io_uring_cancel_task_requests(ctx, data);
- io_uring_attempt_task_drop(file, !data);
+ io_uring_attempt_task_drop(file);
return 0;
}