io_uring: fix -EAGAIN retry with IOPOLL

[linux-2.6-microblaze.git] / fs / io_uring.c
diff --git a/fs/io_uring.c b/fs/io_uring.c

index 1dd30a1..361befa 100644 (file)
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -338,7 +338,6 @@ struct io_ring_ctx {
                 unsigned int            drain_next: 1;
                 unsigned int            eventfd_async: 1;
                 unsigned int            restricted: 1;
-               unsigned int            sqo_dead: 1;
                 unsigned int            sqo_exec: 1;
  
                 /*
@@ -380,11 +379,6 @@ struct io_ring_ctx {
  
         struct io_rings *rings;
  
-       /*
-        * For SQPOLL usage
-        */
-       struct task_struct      *sqo_task;
-
         /* Only used for accounting purposes */
         struct mm_struct        *mm_account;
  
@@ -688,7 +682,6 @@ enum {
         REQ_F_POLLED_BIT,
         REQ_F_BUFFER_SELECTED_BIT,
         REQ_F_NO_FILE_TABLE_BIT,
-       REQ_F_WORK_INITIALIZED_BIT,
         REQ_F_LTIMEOUT_ACTIVE_BIT,
         REQ_F_COMPLETE_INLINE_BIT,
  
@@ -730,8 +723,6 @@ enum {
         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
         /* doesn't need file table for this request */
         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
-       /* io_wq_work is initialized */
-       REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
         /* linked timeout is active, i.e. prepared by link's head */
         REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
         /* completion is deferred through io_comp_state */
@@ -1094,24 +1085,6 @@ static inline void req_set_fail_links(struct io_kiocb *req)
                 req->flags |= REQ_F_FAIL_LINK;
  }
  
-static inline void __io_req_init_async(struct io_kiocb *req)
-{
-       memset(&req->work, 0, sizeof(req->work));
-       req->flags |= REQ_F_WORK_INITIALIZED;
-}
-
-/*
- * Note: must call io_req_init_async() for the first time you
- * touch any members of io_wq_work.
- */
-static inline void io_req_init_async(struct io_kiocb *req)
-{
-       if (req->flags & REQ_F_WORK_INITIALIZED)
-               return;
-
-       __io_req_init_async(req);
-}
-
  static void io_ring_ctx_ref_free(struct percpu_ref *ref)
  {
         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -1196,13 +1169,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
  
  static void io_req_clean_work(struct io_kiocb *req)
  {
-       if (!(req->flags & REQ_F_WORK_INITIALIZED))
-               return;
-
-       if (req->work.creds) {
-               put_cred(req->work.creds);
-               req->work.creds = NULL;
-       }
         if (req->flags & REQ_F_INFLIGHT) {
                 struct io_ring_ctx *ctx = req->ctx;
                 struct io_uring_task *tctx = req->task->io_uring;
@@ -1215,8 +1181,6 @@ static void io_req_clean_work(struct io_kiocb *req)
                 if (atomic_read(&tctx->in_idle))
                         wake_up(&tctx->wait);
         }
-
-       req->flags &= ~REQ_F_WORK_INITIALIZED;
  }
  
  static void io_req_track_inflight(struct io_kiocb *req)
@@ -1224,7 +1188,6 @@ static void io_req_track_inflight(struct io_kiocb *req)
         struct io_ring_ctx *ctx = req->ctx;
  
         if (!(req->flags & REQ_F_INFLIGHT)) {
-               io_req_init_async(req);
                 req->flags |= REQ_F_INFLIGHT;
  
                 spin_lock_irq(&ctx->inflight_lock);
@@ -1238,8 +1201,6 @@ static void io_prep_async_work(struct io_kiocb *req)
         const struct io_op_def *def = &io_op_defs[req->opcode];
         struct io_ring_ctx *ctx = req->ctx;
  
-       io_req_init_async(req);
-
         if (req->flags & REQ_F_FORCE_ASYNC)
                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
  
@@ -1250,8 +1211,6 @@ static void io_prep_async_work(struct io_kiocb *req)
                 if (def->unbound_nonreg_file)
                         req->work.flags |= IO_WQ_WORK_UNBOUND;
         }
-       if (!req->work.creds)
-               req->work.creds = get_current_cred();
  }
  
  static void io_prep_async_link(struct io_kiocb *req)
@@ -1835,6 +1794,18 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
         return __io_req_find_next(req);
  }
  
+static void ctx_flush_and_put(struct io_ring_ctx *ctx)
+{
+       if (!ctx)
+               return;
+       if (ctx->submit_state.comp.nr) {
+               mutex_lock(&ctx->uring_lock);
+               io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+               mutex_unlock(&ctx->uring_lock);
+       }
+       percpu_ref_put(&ctx->refs);
+}
+
  static bool __tctx_task_work(struct io_uring_task *tctx)
  {
         struct io_ring_ctx *ctx = NULL;
@@ -1852,30 +1823,20 @@ static bool __tctx_task_work(struct io_uring_task *tctx)
         node = list.first;
         while (node) {
                 struct io_wq_work_node *next = node->next;
-               struct io_ring_ctx *this_ctx;
                 struct io_kiocb *req;
  
                 req = container_of(node, struct io_kiocb, io_task_work.node);
-               this_ctx = req->ctx;
-               req->task_work.func(&req->task_work);
-               node = next;
-
-               if (!ctx) {
-                       ctx = this_ctx;
-               } else if (ctx != this_ctx) {
-                       mutex_lock(&ctx->uring_lock);
-                       io_submit_flush_completions(&ctx->submit_state.comp, ctx);
-                       mutex_unlock(&ctx->uring_lock);
-                       ctx = this_ctx;
+               if (req->ctx != ctx) {
+                       ctx_flush_and_put(ctx);
+                       ctx = req->ctx;
+                       percpu_ref_get(&ctx->refs);
                 }
-       }
  
-       if (ctx && ctx->submit_state.comp.nr) {
-               mutex_lock(&ctx->uring_lock);
-               io_submit_flush_completions(&ctx->submit_state.comp, ctx);
-               mutex_unlock(&ctx->uring_lock);
+               req->task_work.func(&req->task_work);
+               node = next;
         }
  
+       ctx_flush_and_put(ctx);
         return list.first != NULL;
  }
  
@@ -2000,7 +1961,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
  
         /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
         mutex_lock(&ctx->uring_lock);
-       if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve)
+       if (!(current->flags & PF_EXITING) && !current->in_execve)
                 __io_queue_sqe(req);
         else
                 __io_req_task_cancel(req, -EFAULT);
@@ -2462,23 +2423,32 @@ static bool io_resubmit_prep(struct io_kiocb *req)
                 return false;
         return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
  }
-#endif
  
-static bool io_rw_reissue(struct io_kiocb *req)
+static bool io_rw_should_reissue(struct io_kiocb *req)
  {
-#ifdef CONFIG_BLOCK
         umode_t mode = file_inode(req->file)->i_mode;
+       struct io_ring_ctx *ctx = req->ctx;
  
         if (!S_ISBLK(mode) && !S_ISREG(mode))
                 return false;
-       if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
+       if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
+           !(ctx->flags & IORING_SETUP_IOPOLL)))
                 return false;
         /*
          * If ref is dying, we might be running poll reap from the exit work.
          * Don't attempt to reissue from that path, just let it fail with
          * -EAGAIN.
          */
-       if (percpu_ref_is_dying(&req->ctx->refs))
+       if (percpu_ref_is_dying(&ctx->refs))
+               return false;
+       return true;
+}
+#endif
+
+static bool io_rw_reissue(struct io_kiocb *req)
+{
+#ifdef CONFIG_BLOCK
+       if (!io_rw_should_reissue(req))
                 return false;
  
         lockdep_assert_held(&req->ctx->uring_lock);
@@ -2521,6 +2491,19 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  {
         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
  
+#ifdef CONFIG_BLOCK
+       /* Rewind iter, if we have one. iopoll path resubmits as usual */
+       if (res == -EAGAIN && io_rw_should_reissue(req)) {
+               struct io_async_rw *rw = req->async_data;
+
+               if (rw)
+                       iov_iter_revert(&rw->iter,
+                                       req->result - iov_iter_count(&rw->iter));
+               else if (!io_resubmit_prep(req))
+                       res = -EIO;
+       }
+#endif
+
         if (kiocb->ki_flags & IOCB_WRITE)
                 kiocb_end_write(req);
  
@@ -3269,6 +3252,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
         ret = io_iter_do_read(req, iter);
  
         if (ret == -EIOCBQUEUED) {
+               if (req->async_data)
+                       iov_iter_revert(iter, io_size - iov_iter_count(iter));
                 goto out_free;
         } else if (ret == -EAGAIN) {
                 /* IOPOLL retry should happen for io-wq threads */
@@ -3400,6 +3385,8 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
         /* no retry on NONBLOCK nor RWF_NOWAIT */
         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
                 goto done;
+       if (ret2 == -EIOCBQUEUED && req->async_data)
+               iov_iter_revert(iter, io_size - iov_iter_count(iter));
         if (!force_nonblock || ret2 != -EAGAIN) {
                 /* IOPOLL retry should happen for io-wq threads */
                 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
@@ -3578,7 +3565,6 @@ static int __io_splice_prep(struct io_kiocb *req,
                  * Splice operation will be punted aync, and here need to
                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
                  */
-               io_req_init_async(req);
                 req->work.flags |= IO_WQ_WORK_UNBOUND;
         }
  
@@ -4993,6 +4979,9 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                         pt->error = -EINVAL;
                         return;
                 }
+               /* double add on the same waitqueue head, ignore */
+               if (poll->head == head)
+                       return;
                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
                 if (!poll) {
                         pt->error = -ENOMEM;
@@ -5935,8 +5924,22 @@ static void __io_clean_op(struct io_kiocb *req)
  static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
  {
         struct io_ring_ctx *ctx = req->ctx;
+       const struct cred *creds = NULL;
         int ret;
  
+       if (req->work.personality) {
+               const struct cred *new_creds;
+
+               if (!(issue_flags & IO_URING_F_NONBLOCK))
+                       mutex_lock(&ctx->uring_lock);
+               new_creds = idr_find(&ctx->personality_idr, req->work.personality);
+               if (!(issue_flags & IO_URING_F_NONBLOCK))
+                       mutex_unlock(&ctx->uring_lock);
+               if (!new_creds)
+                       return -EINVAL;
+               creds = override_creds(new_creds);
+       }
+
         switch (req->opcode) {
         case IORING_OP_NOP:
                 ret = io_nop(req, issue_flags);
@@ -6043,6 +6046,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
                 break;
         }
  
+       if (creds)
+               revert_creds(creds);
+
         if (ret)
                 return ret;
  
@@ -6206,18 +6212,10 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
  static void __io_queue_sqe(struct io_kiocb *req)
  {
         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
-       const struct cred *old_creds = NULL;
         int ret;
  
-       if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
-           req->work.creds != current_cred())
-               old_creds = override_creds(req->work.creds);
-
         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
  
-       if (old_creds)
-               revert_creds(old_creds);
-
         /*
          * We async punt it if the file wasn't marked NOWAIT, or if the file
          * doesn't support non-blocking read/write attempts
@@ -6304,7 +6302,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
  {
         struct io_submit_state *state;
         unsigned int sqe_flags;
-       int id, ret = 0;
+       int ret = 0;
  
         req->opcode = READ_ONCE(sqe->opcode);
         /* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -6336,15 +6334,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
             !io_op_defs[req->opcode].buffer_select)
                 return -EOPNOTSUPP;
  
-       id = READ_ONCE(sqe->personality);
-       if (id) {
-               __io_req_init_async(req);
-               req->work.creds = idr_find(&ctx->personality_idr, id);
-               if (unlikely(!req->work.creds))
-                       return -EINVAL;
-               get_cred(req->work.creds);
-       }
-
+       req->work.list.next = NULL;
+       req->work.flags = 0;
+       req->work.personality = READ_ONCE(sqe->personality);
         state = &ctx->submit_state;
  
         /*
@@ -6606,8 +6598,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
                 if (!list_empty(&ctx->iopoll_list))
                         io_do_iopoll(ctx, &nr_events, 0);
  
-               if (to_submit && !ctx->sqo_dead &&
-                   likely(!percpu_ref_is_dying(&ctx->refs)))
+               if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
                         ret = io_submit_sqes(ctx, to_submit);
                 mutex_unlock(&ctx->uring_lock);
         }
@@ -7846,7 +7837,7 @@ static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
  
         clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
         reinit_completion(&sqd->completion);
-       ctx->sqo_dead = ctx->sqo_exec = 0;
+       ctx->sqo_exec = 0;
         sqd->task_pid = current->pid;
         current->flags |= PF_IO_WORKER;
         ret = io_wq_fork_thread(io_sq_thread, sqd);
@@ -7947,6 +7938,7 @@ static void io_sq_offload_start(struct io_ring_ctx *ctx)
  {
         struct io_sq_data *sqd = ctx->sq_data;
  
+       ctx->flags &= ~IORING_SETUP_R_DISABLED;
         if (ctx->flags & IORING_SETUP_SQPOLL)
                 complete(&sqd->startup);
  }
@@ -8515,15 +8507,11 @@ static int io_remove_personalities(int id, void *p, void *data)
  
  static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
  {
-       struct callback_head *work, *head, *next;
+       struct callback_head *work, *next;
         bool executed = false;
  
         do {
-               do {
-                       head = NULL;
-                       work = READ_ONCE(ctx->exit_task_work);
-               } while (cmpxchg(&ctx->exit_task_work, work, head) != work);
-
+               work = xchg(&ctx->exit_task_work, NULL);
                 if (!work)
                         break;
  
@@ -8552,7 +8540,6 @@ static void io_ring_exit_work(struct work_struct *work)
          */
         do {
                 io_uring_try_cancel_requests(ctx, NULL, NULL);
-               io_run_ctx_fallback(ctx);
         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
         io_ring_ctx_free(ctx);
  }
@@ -8561,10 +8548,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
  {
         mutex_lock(&ctx->uring_lock);
         percpu_ref_kill(&ctx->refs);
-
-       if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead))
-               ctx->sqo_dead = 1;
-
         /* if force is set, the ring is going away. always drop after that */
         ctx->cq_overflow_flushed = 1;
         if (ctx->rings)
@@ -8724,17 +8707,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
         }
  }
  
-static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
-{
-       mutex_lock(&ctx->uring_lock);
-       ctx->sqo_dead = 1;
-       mutex_unlock(&ctx->uring_lock);
-
-       /* make sure callers enter the ring to get error */
-       if (ctx->rings)
-               io_ring_set_wakeup_flag(ctx);
-}
-
  /*
   * We need to iteratively cancel requests, in case a request has dependent
   * hard links. These persist even for failure of cancelations, hence keep
@@ -8747,7 +8719,11 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
         bool did_park = false;
  
         if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
-               io_disable_sqo_submit(ctx);
+               /* never started, nothing to cancel */
+               if (ctx->flags & IORING_SETUP_R_DISABLED) {
+                       io_sq_offload_start(ctx);
+                       return;
+               }
                 did_park = io_sq_thread_park(ctx->sq_data);
                 if (did_park) {
                         task = ctx->sq_data->thread;
@@ -8792,10 +8768,6 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
                                 fput(file);
                                 return ret;
                         }
-
-                       /* one and only SQPOLL file note, held by sqo_task */
-                       WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) &&
-                                    current != ctx->sqo_task);
                 }
                 tctx->last = file;
         }
@@ -8868,7 +8840,6 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
  
         if (!sqd)
                 return;
-       io_disable_sqo_submit(ctx);
         if (!io_sq_thread_park(sqd))
                 return;
         tctx = ctx->sq_data->thread->io_uring;
@@ -8913,7 +8884,6 @@ void __io_uring_task_cancel(void)
         /* make sure overflow events are dropped */
         atomic_inc(&tctx->in_idle);
  
-       /* trigger io_disable_sqo_submit() */
         if (tctx->sqpoll) {
                 struct file *file;
                 unsigned long index;
@@ -8948,52 +8918,6 @@ void __io_uring_task_cancel(void)
         __io_uring_free(current);
  }
  
-static int io_uring_flush(struct file *file, void *data)
-{
-       struct io_uring_task *tctx = current->io_uring;
-       struct io_ring_ctx *ctx = file->private_data;
-
-       /* Ignore helper thread files exit */
-       if (current->flags & PF_IO_WORKER)
-               return 0;
-
-       if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
-               io_uring_cancel_task_requests(ctx, NULL);
-               io_req_caches_free(ctx);
-       }
-
-       io_run_ctx_fallback(ctx);
-
-       if (!tctx)
-               return 0;
-
-       /* we should have cancelled and erased it before PF_EXITING */
-       WARN_ON_ONCE((current->flags & PF_EXITING) &&
-                    xa_load(&tctx->xa, (unsigned long)file));
-
-       /*
-        * fput() is pending, will be 2 if the only other ref is our potential
-        * task file note. If the task is exiting, drop regardless of count.
-        */
-       if (atomic_long_read(&file->f_count) != 2)
-               return 0;
-
-       if (ctx->flags & IORING_SETUP_SQPOLL) {
-               /* there is only one file note, which is owned by sqo_task */
-               WARN_ON_ONCE(ctx->sqo_task != current &&
-                            xa_load(&tctx->xa, (unsigned long)file));
-               /* sqo_dead check is for when this happens after cancellation */
-               WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead &&
-                            !xa_load(&tctx->xa, (unsigned long)file));
-
-               io_disable_sqo_submit(ctx);
-       }
-
-       if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current)
-               io_uring_del_task_file(file);
-       return 0;
-}
-
  static void *io_uring_validate_mmap_request(struct file *file,
                                             loff_t pgoff, size_t sz)
  {
@@ -9072,22 +8996,14 @@ static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
         do {
                 if (!io_sqring_full(ctx))
                         break;
-
                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
  
-               if (unlikely(ctx->sqo_dead)) {
-                       ret = -EOWNERDEAD;
-                       goto out;
-               }
-
                 if (!io_sqring_full(ctx))
                         break;
-
                 schedule();
         } while (!signal_pending(current));
  
         finish_wait(&ctx->sqo_sq_wait, &wait);
-out:
         return ret;
  }
  
@@ -9169,8 +9085,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                         ctx->sqo_exec = 0;
                 }
                 ret = -EOWNERDEAD;
-               if (unlikely(ctx->sqo_dead))
-                       goto out;
                 if (flags & IORING_ENTER_SQ_WAKEUP)
                         wake_up(&ctx->sq_data->wait);
                 if (flags & IORING_ENTER_SQ_WAIT) {
@@ -9325,7 +9239,6 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
  
  static const struct file_operations io_uring_fops = {
         .release        = io_uring_release,
-       .flush          = io_uring_flush,
         .mmap           = io_uring_mmap,
  #ifndef CONFIG_MMU
         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
@@ -9480,7 +9393,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
         ctx->compat = in_compat_syscall();
         if (!capable(CAP_IPC_LOCK))
                 ctx->user = get_uid(current_user());
-       ctx->sqo_task = current;
  
         /*
          * This is just grabbed for accounting purposes. When a process exits,
@@ -9543,7 +9455,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
          */
         ret = io_uring_install_fd(ctx, file);
         if (ret < 0) {
-               io_disable_sqo_submit(ctx);
                 /* fput will clean it up */
                 fput(file);
                 return ret;
@@ -9552,7 +9463,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
         return ret;
  err:
-       io_disable_sqo_submit(ctx);
         io_ring_ctx_wait_and_kill(ctx);
         return ret;
  }
@@ -9720,10 +9630,7 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
         if (ctx->restrictions.registered)
                 ctx->restricted = 1;
  
-       ctx->flags &= ~IORING_SETUP_R_DISABLED;
-
         io_sq_offload_start(ctx);
-
         return 0;
  }