Merge tag 'io_uring-worker.v3-2021-02-25' of git://git.kernel.dk/linux-block
[linux-2.6-microblaze.git] fs/io_uring.c
index 0a435a6..4a08858 100644
@@ -339,6 +339,7 @@ struct io_ring_ctx {
                unsigned int            eventfd_async: 1;
                unsigned int            restricted: 1;
                unsigned int            sqo_dead: 1;
+               unsigned int            sqo_exec: 1;
 
                /*
                 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -360,6 +361,9 @@ struct io_ring_ctx {
                unsigned                cached_cq_overflow;
                unsigned long           sq_check_overflow;
 
+               /* hashed buffered write serialization */
+               struct io_wq_hash       *hash_map;
+
                struct list_head        defer_list;
                struct list_head        timeout_list;
                struct list_head        cq_overflow_list;
@@ -454,6 +458,8 @@ struct io_ring_ctx {
        /* exit task_work */
        struct callback_head            *exit_task_work;
 
+       struct wait_queue_head          hash_wait;
+
        /* Keep this last, we don't need it for the fast path */
        struct work_struct              exit_work;
 };
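
The two fields added here back the move of hashed (buffered write) work serialization from one global io-wq map to a per-ring one: ctx->hash_map caches the hash object this ring will share with its io-wq, and hash_wait is a waitqueue added alongside it (its users fall outside this excerpt). The hash object itself lives on the io-wq side rather than in this file; judging from the allocation in io_init_wq_offload() below (refcount_set(), init_waitqueue_head()) and the io_wq_put_hash() call in io_ring_ctx_free(), it is roughly the following sketch (field names come from the companion io-wq patches, not from this diff):

/*
 * Sketch of the shared hash object provided by io-wq (io-wq.h in the
 * companion patches). The refcount lets the ring ctx and the io-wq
 * instance share one map and drop it independently.
 */
struct io_wq_hash {
	refcount_t		refs;
	unsigned long		map;	/* one bit per hash bucket currently owned */
	struct wait_queue_head	wait;	/* workers sleep here until a bucket frees up */
};

static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
	if (refcount_dec_and_test(&hash->refs))
		kfree(hash);
}
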
@@ -1058,21 +1064,6 @@ static inline void io_set_resource_node(struct io_kiocb *req)
        }
 }
 
-static bool io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
-{
-       if (!percpu_ref_tryget(ref)) {
-               /* already at zero, wait for ->release() */
-               if (!try_wait_for_completion(compl))
-                       synchronize_rcu();
-               return false;
-       }
-
-       percpu_ref_resurrect(ref);
-       reinit_completion(compl);
-       percpu_ref_put(ref);
-       return true;
-}
-
 static bool io_match_task(struct io_kiocb *head,
                          struct task_struct *task,
                          struct files_struct *files)
@@ -2019,7 +2010,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
 
        /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
        mutex_lock(&ctx->uring_lock);
-       if (!ctx->sqo_dead && !(current->flags & PF_EXITING))
+       if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve)
                __io_queue_sqe(req);
        else
                __io_req_task_cancel(req, -EFAULT);
@@ -2492,6 +2483,13 @@ static bool io_rw_reissue(struct io_kiocb *req)
                return false;
        if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
                return false;
+       /*
+        * If ref is dying, we might be running poll reap from the exit work.
+        * Don't attempt to reissue from that path, just let it fail with
+        * -EAGAIN.
+        */
+       if (percpu_ref_is_dying(&req->ctx->refs))
+               return false;
 
        lockdep_assert_held(&req->ctx->uring_lock);
 
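
The new bail-out leans on percpu_ref_is_dying() being nothing more than a flag test, so it is safe to call from the exit-work poll-reap path without taking any extra locks; in kernels of this era the helper is essentially the following (paraphrased from include/linux/percpu-refcount.h):

/*
 * Paraphrased from include/linux/percpu-refcount.h: a dying ref has been
 * switched to atomic mode and marked __PERCPU_REF_DEAD by percpu_ref_kill(),
 * so detecting it is a single unlocked flag test.
 */
static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
{
	return ref->percpu_count_ptr & __PERCPU_REF_DEAD;
}
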
@@ -6791,6 +6789,10 @@ static int io_sq_thread(void *data)
        complete_all(&sqd->completion);
        mutex_lock(&sqd->lock);
        sqd->thread = NULL;
+       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+               ctx->sqo_exec = 1;
+               io_ring_set_wakeup_flag(ctx);
+       }
        mutex_unlock(&sqd->lock);
 
        complete(&sqd->exited);
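
When the SQPOLL thread exits on its own (typically because the owning task exec'ed), every attached ctx is flagged sqo_exec and has IORING_SQ_NEED_WAKEUP raised via io_ring_set_wakeup_flag(). That flag is what pushes userspace, which otherwise only writes SQEs and checks the SQ ring flags, back into io_uring_enter(), where the sqo_exec hunk further below re-forks the thread. A rough sketch of the userspace side of that handshake (per io_uring_enter(2); ring_fd and sq_flags are illustrative names for the ring fd and the mmap'ed SQ flags word):

#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Illustrative SQPOLL submission path in an application. While the SQ
 * thread is alive and awake, no syscall is needed; once the kernel sets
 * IORING_SQ_NEED_WAKEUP (as the exiting thread does above), the app
 * enters the kernel, giving io_uring_enter() a chance to re-fork it.
 */
static void sqpoll_submit(int ring_fd, const unsigned *sq_flags,
			  unsigned to_submit)
{
	unsigned flags = 0;

	if (__atomic_load_n(sq_flags, __ATOMIC_ACQUIRE) & IORING_SQ_NEED_WAKEUP)
		flags |= IORING_ENTER_SQ_WAKEUP;

	if (flags)
		syscall(__NR_io_uring_enter, ring_fd, to_submit, 0, flags,
			NULL, 0);
}
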
@@ -7016,11 +7018,13 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
                flush_delayed_work(&ctx->rsrc_put_work);
 
                ret = wait_for_completion_interruptible(&data->done);
-               if (!ret || !io_refs_resurrect(&data->refs, &data->done))
+               if (!ret)
                        break;
 
+               percpu_ref_resurrect(&data->refs);
                io_sqe_rsrc_set_node(ctx, data, backup_node);
                backup_node = NULL;
+               reinit_completion(&data->done);
                mutex_unlock(&ctx->uring_lock);
                ret = io_run_task_work_sig();
                mutex_lock(&ctx->uring_lock);
@@ -7136,6 +7140,7 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
        struct io_sq_data *sqd = ctx->sq_data;
 
        if (sqd) {
+               complete(&sqd->startup);
                if (sqd->thread) {
                        wait_for_completion(&ctx->sq_thread_comp);
                        io_sq_thread_park(sqd);
@@ -7763,9 +7768,21 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work)
 
 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
 {
+       struct io_wq_hash *hash;
        struct io_wq_data data;
        unsigned int concurrency;
 
+       hash = ctx->hash_map;
+       if (!hash) {
+               hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+               if (!hash)
+                       return ERR_PTR(-ENOMEM);
+               refcount_set(&hash->refs, 1);
+               init_waitqueue_head(&hash->wait);
+               ctx->hash_map = hash;
+       }
+
+       data.hash = hash;
        data.free_work = io_free_work;
        data.do_work = io_wq_submit_work;
 
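
The hash is allocated lazily the first time the ring needs an io-wq and cached in ctx->hash_map; the ctx keeps that initial reference (dropped through io_wq_put_hash() in io_ring_ctx_free() below), and the io-wq built from this data is expected to take its own so ctx and wq can be torn down in either order. On the io-wq side that presumably amounts to something like the sketch below (io_wq_create() is the real entry point; the body shown is an assumption inferred from this diff):

/*
 * Presumed counterpart in io-wq.c (not part of this diff): the wq pins
 * the shared hash for its own lifetime and the final teardown drops it.
 */
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
	struct io_wq *wq;

	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
	if (!wq)
		return ERR_PTR(-ENOMEM);

	refcount_inc(&data->hash->refs);	/* wq's own reference */
	wq->hash = data->hash;
	/* ... worker/queue setup elided ... */
	return wq;
}
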
@@ -7822,6 +7839,25 @@ void __io_uring_free(struct task_struct *tsk)
        tsk->io_uring = NULL;
 }
 
+static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
+{
+       int ret;
+
+       clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+       reinit_completion(&sqd->completion);
+       ctx->sqo_dead = ctx->sqo_exec = 0;
+       sqd->task_pid = current->pid;
+       current->flags |= PF_IO_WORKER;
+       ret = io_wq_fork_thread(io_sq_thread, sqd);
+       current->flags &= ~PF_IO_WORKER;
+       if (ret < 0) {
+               sqd->thread = NULL;
+               return ret;
+       }
+       wait_for_completion(&sqd->completion);
+       return io_uring_alloc_task_context(sqd->thread, ctx);
+}
+
 static int io_sq_offload_create(struct io_ring_ctx *ctx,
                                struct io_uring_params *p)
 {
@@ -7910,7 +7946,7 @@ static void io_sq_offload_start(struct io_ring_ctx *ctx)
 {
        struct io_sq_data *sqd = ctx->sq_data;
 
-       if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
+       if (ctx->flags & IORING_SETUP_SQPOLL)
                complete(&sqd->startup);
 }
 
@@ -8351,19 +8387,23 @@ static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
 static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
 {
        struct io_submit_state *submit_state = &ctx->submit_state;
+       struct io_comp_state *cs = &ctx->submit_state.comp;
 
        mutex_lock(&ctx->uring_lock);
 
-       if (submit_state->free_reqs)
+       if (submit_state->free_reqs) {
                kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
                                     submit_state->reqs);
-
-       io_req_cache_free(&submit_state->comp.free_list, NULL);
+               submit_state->free_reqs = 0;
+       }
 
        spin_lock_irq(&ctx->completion_lock);
-       io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
+       list_splice_init(&cs->locked_free_list, &cs->free_list);
+       cs->locked_free_nr = 0;
        spin_unlock_irq(&ctx->completion_lock);
 
+       io_req_cache_free(&cs->free_list, NULL);
+
        mutex_unlock(&ctx->uring_lock);
 }
 
@@ -8405,6 +8445,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
        percpu_ref_exit(&ctx->refs);
        free_uid(ctx->user);
        io_req_caches_free(ctx, NULL);
+       if (ctx->hash_map)
+               io_wq_put_hash(ctx->hash_map);
        kfree(ctx->cancel_hash);
        kfree(ctx);
 }
@@ -8801,7 +8843,7 @@ void __io_uring_files_cancel(struct files_struct *files)
        if (files) {
                io_uring_remove_task_files(tctx);
                if (tctx->io_wq) {
-                       io_wq_destroy(tctx->io_wq);
+                       io_wq_put(tctx->io_wq);
                        tctx->io_wq = NULL;
                }
        }
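
tctx->io_wq is now released with io_wq_put() rather than destroyed outright: the io-wq has grown a reference count, so an exiting task can drop its reference while anything still draining work keeps the wq alive. The put side presumably follows the usual pattern (an assumption; only the rename is visible in this diff):

/*
 * Presumed io-wq.c counterpart (not part of this diff): only the final
 * reference performs the old unconditional destroy.
 */
void io_wq_put(struct io_wq *wq)
{
	if (refcount_dec_and_test(&wq->refs))
		io_wq_destroy(wq);
}
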
@@ -9108,6 +9150,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                io_cqring_overflow_flush(ctx, false, NULL, NULL);
 
+               if (unlikely(ctx->sqo_exec)) {
+                       ret = io_sq_thread_fork(ctx->sq_data, ctx);
+                       if (ret)
+                               goto out;
+                       ctx->sqo_exec = 0;
+               }
                ret = -EOWNERDEAD;
                if (unlikely(ctx->sqo_dead))
                        goto out;
@@ -9209,8 +9257,11 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
         */
        has_lock = mutex_trylock(&ctx->uring_lock);
 
-       if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
+       if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
                sq = ctx->sq_data;
+               if (!sq->thread)
+                       sq = NULL;
+       }
 
        seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
        seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
@@ -9716,8 +9767,10 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 
                mutex_lock(&ctx->uring_lock);
 
-               if (ret && io_refs_resurrect(&ctx->refs, &ctx->ref_comp))
-                       return ret;
+               if (ret) {
+                       percpu_ref_resurrect(&ctx->refs);
+                       goto out_quiesce;
+               }
        }
 
        if (ctx->restricted) {
@@ -9809,6 +9862,7 @@ out:
        if (io_register_op_must_quiesce(opcode)) {
                /* bring the ctx back to life */
                percpu_ref_reinit(&ctx->refs);
+out_quiesce:
                reinit_completion(&ctx->ref_comp);
        }
        return ret;
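
For context, the goto introduced above targets this label: a signal received while waiting for the ctx refs to drain now resurrects the ref and jumps straight here, so only ref_comp is re-armed, percpu_ref_reinit() is skipped (resurrect has already revived the ref), and the register opcode itself is never executed. A condensed sketch of the quiesce prologue earlier in __io_uring_register() (not shown in these hunks; reconstructed from kernels of this era):

/*
 * Rough shape of the quiesce wait that precedes the hunk above; on a
 * signal it resurrects the ref and bails out through out_quiesce.
 */
if (io_register_op_must_quiesce(opcode)) {
	percpu_ref_kill(&ctx->refs);
	mutex_unlock(&ctx->uring_lock);
	do {
		ret = wait_for_completion_interruptible(&ctx->ref_comp);
		if (!ret)
			break;
		ret = io_run_task_work_sig();
		if (ret < 0)
			break;
	} while (1);
	mutex_lock(&ctx->uring_lock);

	if (ret) {
		percpu_ref_resurrect(&ctx->refs);
		goto out_quiesce;
	}
}
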