#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
-#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
-#include <linux/blk-cgroup.h>
-#include <linux/audit.h>
+#include <linux/freezer.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
unsigned long state;
struct completion startup;
- struct completion completion;
+ struct completion parked;
struct completion exited;
};
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
- /* on inflight list */
+ /* on inflight list, should be cancelled and waited on exit reliably */
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
return true;
io_for_each_link(req, head) {
- if (req->file && req->file->f_op == &io_uring_fops)
+ if (req->flags & REQ_F_INFLIGHT)
return true;
if (req->task->files == files)
return true;
return false;
}
-static void io_req_clean_work(struct io_kiocb *req)
-{
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_ring_ctx *ctx = req->ctx;
- struct io_uring_task *tctx = req->task->io_uring;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->inflight_lock, flags);
- list_del(&req->inflight_entry);
- spin_unlock_irqrestore(&ctx->inflight_lock, flags);
- req->flags &= ~REQ_F_INFLIGHT;
- if (atomic_read(&tctx->in_idle))
- wake_up(&tctx->wait);
- }
-}
-
static void io_req_track_inflight(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
io_prep_async_work(cur);
}
-static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
+static void io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
- io_wq_enqueue(tctx->io_wq, &req->work);
- return link;
-}
-
-static void io_queue_async_work(struct io_kiocb *req)
-{
- struct io_kiocb *link;
-
/* init ->work of the whole link before punting */
io_prep_async_link(req);
- link = __io_queue_async_work(req);
-
+ io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
}
return all_flushed;
}
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
struct files_struct *files)
{
+ bool ret = true;
+
if (test_bit(0, &ctx->cq_check_overflow)) {
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
- __io_cqring_overflow_flush(ctx, force, tsk, files);
+ ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&ctx->uring_lock);
}
+
+ return ret;
}
static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
if (req->fixed_rsrc_refs)
percpu_ref_put(req->fixed_rsrc_refs);
- io_req_clean_work(req);
+
+ if (req->flags & REQ_F_INFLIGHT) {
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->inflight_lock, flags);
+ list_del(&req->inflight_entry);
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ req->flags &= ~REQ_F_INFLIGHT;
+ }
}
+/* must to be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
struct io_uring_task *tctx = task->io_uring;
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
+ kiocb->ki_flags &= ~IOCB_WAITQ;
} while (ret > 0 && ret < io_size);
done:
kiocb_done(kiocb, ret, issue_flags);
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
- return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK);
+ return io_openat2(req, issue_flags);
}
static int io_remove_buffers_prep(struct io_kiocb *req,
data->mode = io_translate_timeout_mode(flags);
hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+ io_req_track_inflight(req);
return 0;
}
* wait_task_inactive().
*/
preempt_disable();
- complete(&sqd->completion);
+ complete(&sqd->parked);
schedule_preempt_disabled();
preempt_enable();
}
sprintf(buf, "iou-sqp-%d", sqd->task_pid);
set_task_comm(current, buf);
- sqd->thread = current;
current->pf_io_worker = NULL;
if (sqd->sq_cpu != -1)
set_cpus_allowed_ptr(current, cpu_online_mask);
current->flags |= PF_NO_SETAFFINITY;
- complete(&sqd->completion);
-
wait_for_completion(&sqd->startup);
while (!io_sq_thread_should_stop(sqd)) {
io_ring_set_wakeup_flag(ctx);
schedule();
+ try_to_freeze();
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
io_ring_clear_wakeup_flag(ctx);
}
io_run_task_work();
- if (io_sq_thread_should_park(sqd))
- io_sq_thread_parkme(sqd);
-
/*
- * Clear thread under lock so that concurrent parks work correctly
+ * Ensure that we park properly if racing with someone trying to park
+ * while we're exiting. If we fail to grab the lock, check park and
+ * park if necessary. The ordering with the park bit and the lock
+ * ensures that we catch this reliably.
*/
- complete(&sqd->completion);
- mutex_lock(&sqd->lock);
+ if (!mutex_trylock(&sqd->lock)) {
+ if (io_sq_thread_should_park(sqd))
+ io_sq_thread_parkme(sqd);
+ mutex_lock(&sqd->lock);
+ }
+
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
ctx->sqo_exec = 1;
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
trace_io_uring_cqring_wait(ctx, min_events);
do {
- io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ /* if we can't even flush overflow, don't wait for more */
+ if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
+ ret = -EBUSY;
+ break;
+ }
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
finish_wait(&ctx->wait, &iowq.wq);
+ cond_resched();
} while (ret > 0);
restore_saved_sigmask_unless(ret == -EINTR);
static void io_sq_thread_unpark(struct io_sq_data *sqd)
__releases(&sqd->lock)
{
- if (!sqd->thread)
- return;
if (sqd->thread == current)
return;
clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- wake_up_state(sqd->thread, TASK_PARKED);
+ if (sqd->thread)
+ wake_up_state(sqd->thread, TASK_PARKED);
mutex_unlock(&sqd->lock);
}
-static bool io_sq_thread_park(struct io_sq_data *sqd)
+static void io_sq_thread_park(struct io_sq_data *sqd)
__acquires(&sqd->lock)
{
if (sqd->thread == current)
- return true;
+ return;
+ set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
mutex_lock(&sqd->lock);
- if (!sqd->thread) {
- mutex_unlock(&sqd->lock);
- return false;
+ if (sqd->thread) {
+ wake_up_process(sqd->thread);
+ wait_for_completion(&sqd->parked);
}
- set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
- wake_up_process(sqd->thread);
- wait_for_completion(&sqd->completion);
- return true;
}
static void io_sq_thread_stop(struct io_sq_data *sqd)
mutex_init(&sqd->lock);
init_waitqueue_head(&sqd->wait);
init_completion(&sqd->startup);
- init_completion(&sqd->completion);
+ init_completion(&sqd->parked);
init_completion(&sqd->exited);
return sqd;
}
static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
{
+ struct task_struct *tsk;
int ret;
clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
- reinit_completion(&sqd->completion);
+ reinit_completion(&sqd->parked);
ctx->sqo_exec = 0;
sqd->task_pid = current->pid;
- current->flags |= PF_IO_WORKER;
- ret = io_wq_fork_thread(io_sq_thread, sqd);
- current->flags &= ~PF_IO_WORKER;
- if (ret < 0) {
- sqd->thread = NULL;
- return ret;
- }
- wait_for_completion(&sqd->completion);
- return io_uring_alloc_task_context(sqd->thread, ctx);
+ tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
+ if (IS_ERR(tsk))
+ return PTR_ERR(tsk);
+ ret = io_uring_alloc_task_context(tsk, ctx);
+ if (ret)
+ set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ sqd->thread = tsk;
+ wake_up_new_task(tsk);
+ return ret;
}
static int io_sq_offload_create(struct io_ring_ctx *ctx,
fdput(f);
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct task_struct *tsk;
struct io_sq_data *sqd;
ret = -EPERM;
}
sqd->task_pid = current->pid;
- current->flags |= PF_IO_WORKER;
- ret = io_wq_fork_thread(io_sq_thread, sqd);
- current->flags &= ~PF_IO_WORKER;
- if (ret < 0) {
- sqd->thread = NULL;
+ tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
+ if (IS_ERR(tsk)) {
+ ret = PTR_ERR(tsk);
goto err;
}
- wait_for_completion(&sqd->completion);
- ret = io_uring_alloc_task_context(sqd->thread, ctx);
+ ret = io_uring_alloc_task_context(tsk, ctx);
+ if (ret)
+ set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+ sqd->thread = tsk;
+ wake_up_new_task(tsk);
if (ret)
goto err;
} else if (p->flags & IORING_SETUP_SQ_AFF) {
struct files_struct *files)
{
struct task_struct *task = current;
- bool did_park = false;
if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
/* never started, nothing to cancel */
io_sq_offload_start(ctx);
return;
}
- did_park = io_sq_thread_park(ctx->sq_data);
- if (did_park) {
- task = ctx->sq_data->thread;
+ io_sq_thread_park(ctx->sq_data);
+ task = ctx->sq_data->thread;
+ if (task)
atomic_inc(&task->io_uring->in_idle);
- }
}
io_cancel_defer_files(ctx, task, files);
if (!files)
io_uring_try_cancel_requests(ctx, task, NULL);
- if (did_park) {
+ if (task)
atomic_dec(&task->io_uring->in_idle);
+ if (ctx->sq_data)
io_sq_thread_unpark(ctx->sq_data);
- }
}
/*
if (!sqd)
return;
- if (!io_sq_thread_park(sqd))
- return;
- tctx = ctx->sq_data->thread->io_uring;
- /* can happen on fork/alloc failure, just ignore that state */
- if (!tctx) {
+ io_sq_thread_park(sqd);
+ if (!sqd->thread || !sqd->thread->io_uring) {
io_sq_thread_unpark(sqd);
return;
}
-
+ tctx = ctx->sq_data->thread->io_uring;
atomic_inc(&tctx->in_idle);
do {
/* read completions before cancelations */