io_uring: Fix uninitialized variable up.resv

[linux-2.6-microblaze.git] / fs / io_uring.c
diff --git a/fs/io_uring.c b/fs/io_uring.c

index aa29918..5775204 100644 (file)
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -195,9 +195,9 @@ enum io_uring_cmd_flags {
  struct io_mapped_ubuf {
         u64             ubuf;
         u64             ubuf_end;
-       struct          bio_vec *bvec;
         unsigned int    nr_bvecs;
         unsigned long   acct_pages;
+       struct bio_vec  bvec[];
  };
  
  struct io_ring_ctx;
@@ -214,14 +214,17 @@ struct io_fixed_file {
  
  struct io_rsrc_put {
         struct list_head list;
+       u64 tag;
         union {
                 void *rsrc;
                 struct file *file;
+               struct io_mapped_ubuf *buf;
         };
  };
  
-struct fixed_rsrc_table {
-       struct io_fixed_file *files;
+struct io_file_table {
+       /* two level table */
+       struct io_fixed_file **files;
  };
  
  struct io_rsrc_node {
@@ -236,11 +239,11 @@ struct io_rsrc_node {
  typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
  
  struct io_rsrc_data {
-       struct fixed_rsrc_table         *table;
         struct io_ring_ctx              *ctx;
  
+       u64                             *tags;
         rsrc_put_fn                     *do_put;
-       struct percpu_ref               refs;
+       atomic_t                        refs;
         struct completion               done;
         bool                            quiesce;
  };
@@ -398,11 +401,13 @@ struct io_ring_ctx {
          * used. Only updated through io_uring_register(2).
          */
         struct io_rsrc_data     *file_data;
+       struct io_file_table    file_table;
         unsigned                nr_user_files;
  
         /* if used, fixed mapped user buffers */
+       struct io_rsrc_data     *buf_data;
         unsigned                nr_user_bufs;
-       struct io_mapped_ubuf   *user_bufs;
+       struct io_mapped_ubuf   **user_bufs;
  
         struct user_struct      *user;
  
@@ -487,20 +492,16 @@ struct io_poll_iocb {
         __poll_t                        events;
         bool                            done;
         bool                            canceled;
-       bool                            update_events;
-       bool                            update_user_data;
-       union {
-               struct wait_queue_entry wait;
-               struct {
-                       u64             old_user_data;
-                       u64             new_user_data;
-               };
-       };
+       struct wait_queue_entry         wait;
  };
  
-struct io_poll_remove {
+struct io_poll_update {
         struct file                     *file;
-       u64                             addr;
+       u64                             old_user_data;
+       u64                             new_user_data;
+       __poll_t                        events;
+       bool                            update_events;
+       bool                            update_user_data;
  };
  
  struct io_close {
@@ -629,7 +630,7 @@ struct io_splice {
  struct io_provide_buf {
         struct file                     *file;
         __u64                           addr;
-       __s32                           len;
+       __u32                           len;
         __u32                           bgid;
         __u16                           nbufs;
         __u16                           bid;
@@ -788,7 +789,7 @@ struct io_kiocb {
                 struct file             *file;
                 struct io_rw            rw;
                 struct io_poll_iocb     poll;
-               struct io_poll_remove   poll_remove;
+               struct io_poll_update   poll_update;
                 struct io_accept        accept;
                 struct io_sync          sync;
                 struct io_cancel        cancel;
@@ -840,6 +841,8 @@ struct io_kiocb {
         struct hlist_node               hash_node;
         struct async_poll               *apoll;
         struct io_wq_work               work;
+       /* store used ubuf, so we can prevent reloading */
+       struct io_mapped_ubuf           *imu;
  };
  
  struct io_tctx_node {
@@ -1025,19 +1028,20 @@ static void io_uring_del_task_file(unsigned long index);
  static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                          struct task_struct *task,
                                          struct files_struct *files);
-static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
+static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
  static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
  
-static bool io_cqring_fill_event(struct io_kiocb *req, long res, unsigned cflags);
+static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
+                                long res, unsigned int cflags);
  static void io_put_req(struct io_kiocb *req);
  static void io_put_req_deferred(struct io_kiocb *req, int nr);
  static void io_dismantle_req(struct io_kiocb *req);
  static void io_put_task(struct task_struct *task, int nr);
  static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
  static void io_queue_linked_timeout(struct io_kiocb *req);
-static int __io_sqe_files_update(struct io_ring_ctx *ctx,
-                                struct io_uring_rsrc_update *ip,
-                                unsigned nr_args);
+static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
+                                    struct io_uring_rsrc_update2 *up,
+                                    unsigned nr_args);
  static void io_clean_op(struct io_kiocb *req);
  static struct file *io_file_get(struct io_submit_state *state,
                                 struct io_kiocb *req, int fd, bool fixed);
@@ -1080,6 +1084,18 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req)
         }
  }
  
+static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+{
+       bool got = percpu_ref_tryget(ref);
+
+       /* tryget failed: ref is already at zero, wait for ->release() */
+       if (!got)
+               wait_for_completion(compl);
+       percpu_ref_resurrect(ref);
+       if (got)
+               percpu_ref_put(ref);
+}
+
  static bool io_match_task(struct io_kiocb *head,
                           struct task_struct *task,
                           struct files_struct *files)
@@ -1100,7 +1116,7 @@ static bool io_match_task(struct io_kiocb *head,
  
  static inline void req_set_fail_links(struct io_kiocb *req)
  {
-       if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+       if (req->flags & REQ_F_LINK)
                 req->flags |= REQ_F_FAIL_LINK;
  }
  
@@ -1249,16 +1265,15 @@ static void io_queue_async_work(struct io_kiocb *req)
  }
  
  static void io_kill_timeout(struct io_kiocb *req, int status)
+       __must_hold(&req->ctx->completion_lock)
  {
         struct io_timeout_data *io = req->async_data;
-       int ret;
  
-       ret = hrtimer_try_to_cancel(&io->timer);
-       if (ret != -1) {
+       if (hrtimer_try_to_cancel(&io->timer) != -1) {
                 atomic_set(&req->ctx->cq_timeouts,
                         atomic_read(&req->ctx->cq_timeouts) + 1);
                 list_del_init(&req->timeout.list);
-               io_cqring_fill_event(req, status, 0);
+               io_cqring_fill_event(req->ctx, req->user_data, status, 0);
                 io_put_req_deferred(req, 1);
         }
  }
@@ -1492,45 +1507,39 @@ static inline void req_ref_get(struct io_kiocb *req)
         atomic_inc(&req->refs);
  }
  
-static bool io_cqring_event_overflow(struct io_kiocb *req, long res,
-                                    unsigned int cflags)
+static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
+                                    long res, unsigned int cflags)
  {
-       struct io_ring_ctx *ctx = req->ctx;
+       struct io_overflow_cqe *ocqe;
  
-       if (!atomic_read(&req->task->io_uring->in_idle)) {
-               struct io_overflow_cqe *ocqe;
-
-               ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
-               if (!ocqe)
-                       goto overflow;
-               if (list_empty(&ctx->cq_overflow_list)) {
-                       set_bit(0, &ctx->sq_check_overflow);
-                       set_bit(0, &ctx->cq_check_overflow);
-                       ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
-               }
-               ocqe->cqe.user_data = req->user_data;
-               ocqe->cqe.res = res;
-               ocqe->cqe.flags = cflags;
-               list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
-               return true;
+       ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+       if (!ocqe) {
+               /*
+                * Could not allocate an overflow entry, so this completion
+                * cannot be queued on cq_overflow_list. Drop it on the floor
+                * and account for the loss in the cq_overflow counter.
+                */
+               WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
+               return false;
         }
-overflow:
-       /*
-        * If we're in ring overflow flush mode, or in task cancel mode,
-        * or cannot allocate an overflow entry, then we need to drop it
-        * on the floor.
-        */
-       WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow);
-       return false;
+       if (list_empty(&ctx->cq_overflow_list)) {
+               set_bit(0, &ctx->sq_check_overflow);
+               set_bit(0, &ctx->cq_check_overflow);
+               ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
+       }
+       ocqe->cqe.user_data = user_data;
+       ocqe->cqe.res = res;
+       ocqe->cqe.flags = cflags;
+       list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
+       return true;
  }
  
-static inline bool __io_cqring_fill_event(struct io_kiocb *req, long res,
-                                            unsigned int cflags)
+static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
+                                         long res, unsigned int cflags)
  {
-       struct io_ring_ctx *ctx = req->ctx;
         struct io_uring_cqe *cqe;
  
-       trace_io_uring_complete(ctx, req->user_data, res, cflags);
+       trace_io_uring_complete(ctx, user_data, res, cflags);
  
         /*
          * If we can't get a cq entry, userspace overflowed the
@@ -1539,19 +1548,19 @@ static inline bool __io_cqring_fill_event(struct io_kiocb *req, long res,
          */
         cqe = io_get_cqring(ctx);
         if (likely(cqe)) {
-               WRITE_ONCE(cqe->user_data, req->user_data);
+               WRITE_ONCE(cqe->user_data, user_data);
                 WRITE_ONCE(cqe->res, res);
                 WRITE_ONCE(cqe->flags, cflags);
                 return true;
         }
-       return io_cqring_event_overflow(req, res, cflags);
+       return io_cqring_event_overflow(ctx, user_data, res, cflags);
  }
  
  /* not as hot to bloat with inlining */
-static noinline bool io_cqring_fill_event(struct io_kiocb *req, long res,
-                                         unsigned int cflags)
+static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
+                                         long res, unsigned int cflags)
  {
-       return __io_cqring_fill_event(req, res, cflags);
+       return __io_cqring_fill_event(ctx, user_data, res, cflags);
  }
  
  static void io_req_complete_post(struct io_kiocb *req, long res,
@@ -1561,7 +1570,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
         unsigned long flags;
  
         spin_lock_irqsave(&ctx->completion_lock, flags);
-       __io_cqring_fill_event(req, res, cflags);
+       __io_cqring_fill_event(ctx, req->user_data, res, cflags);
         /*
          * If we're the last reference to this request, add to our locked
          * free_list cache.
@@ -1594,10 +1603,16 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
         }
  }
  
+static inline bool io_req_needs_clean(struct io_kiocb *req)
+{
+       return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP |
+                               REQ_F_POLLED | REQ_F_INFLIGHT);
+}
+
  static void io_req_complete_state(struct io_kiocb *req, long res,
                                   unsigned int cflags)
  {
-       if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
+       if (io_req_needs_clean(req))
                 io_clean_op(req);
         req->result = res;
         req->compl.cflags = cflags;
@@ -1707,19 +1722,10 @@ static void io_dismantle_req(struct io_kiocb *req)
  {
         unsigned int flags = req->flags;
  
+       if (io_req_needs_clean(req))
+               io_clean_op(req);
         if (!(flags & REQ_F_FIXED_FILE))
                 io_put_file(req->file);
-       if (flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
-                    REQ_F_INFLIGHT)) {
-               io_clean_op(req);
-
-               if (req->flags & REQ_F_INFLIGHT) {
-                       struct io_uring_task *tctx = req->task->io_uring;
-
-                       atomic_dec(&tctx->inflight_tracked);
-                       req->flags &= ~REQ_F_INFLIGHT;
-               }
-       }
         if (req->fixed_rsrc_refs)
                 percpu_ref_put(req->fixed_rsrc_refs);
         if (req->async_data)
@@ -1771,13 +1777,12 @@ static bool io_kill_linked_timeout(struct io_kiocb *req)
          */
         if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
                 struct io_timeout_data *io = link->async_data;
-               int ret;
  
                 io_remove_next_linked(req);
                 link->timeout.head = NULL;
-               ret = hrtimer_try_to_cancel(&io->timer);
-               if (ret != -1) {
-                       io_cqring_fill_event(link, -ECANCELED, 0);
+               if (hrtimer_try_to_cancel(&io->timer) != -1) {
+                       io_cqring_fill_event(link->ctx, link->user_data,
+                                            -ECANCELED, 0);
                         io_put_req_deferred(link, 1);
                         return true;
                 }
@@ -1796,7 +1801,7 @@ static void io_fail_links(struct io_kiocb *req)
                 link->link = NULL;
  
                 trace_io_uring_fail_link(req, link);
-               io_cqring_fill_event(link, -ECANCELED, 0);
+               io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
                 io_put_req_deferred(link, 2);
                 link = nxt;
         }
@@ -1809,7 +1814,8 @@ static bool io_disarm_next(struct io_kiocb *req)
  
         if (likely(req->flags & REQ_F_LINK_TIMEOUT))
                 posted = io_kill_linked_timeout(req);
-       if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+       if (unlikely((req->flags & REQ_F_FAIL_LINK) &&
+                    !(req->flags & REQ_F_HARDLINK))) {
                 posted |= (req->link != NULL);
                 io_fail_links(req);
         }
@@ -2116,7 +2122,8 @@ static void io_submit_flush_completions(struct io_comp_state *cs,
         spin_lock_irq(&ctx->completion_lock);
         for (i = 0; i < nr; i++) {
                 req = cs->reqs[i];
-               __io_cqring_fill_event(req, req->result, req->compl.cflags);
+               __io_cqring_fill_event(ctx, req->user_data, req->result,
+                                       req->compl.cflags);
         }
         io_commit_cqring(ctx);
         spin_unlock_irq(&ctx->completion_lock);
@@ -2256,7 +2263,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
                 if (req->flags & REQ_F_BUFFER_SELECTED)
                         cflags = io_put_rw_kbuf(req);
  
-               __io_cqring_fill_event(req, req->result, cflags);
+               __io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
                 (*nr_events)++;
  
                 if (req_ref_put_and_test(req))
@@ -2317,27 +2324,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
         return ret;
  }
  
-/*
- * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
- * non-spinning poll check - we'll still enter the driver poll loop, but only
- * as a non-spinning completion check.
- */
-static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
-                               long min)
-{
-       while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
-               int ret;
-
-               ret = io_do_iopoll(ctx, nr_events, min);
-               if (ret < 0)
-                       return ret;
-               if (*nr_events >= min)
-                       return 0;
-       }
-
-       return 1;
-}
-
  /*
   * We can't just wait for polled events to come to us, we have to actively
   * find and complete them.
@@ -2373,7 +2359,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
  static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
  {
         unsigned int nr_events = 0;
-       int iters = 0, ret = 0;
+       int ret = 0;
  
         /*
          * We disallow the app entering submit/complete with polling, but we
@@ -2381,17 +2367,16 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
          * that got punted to a workqueue.
          */
         mutex_lock(&ctx->uring_lock);
+       /*
+        * Don't enter poll loop if we already have events pending.
+        * If we do, we can potentially be spinning for commands that
+        * already triggered a CQE (eg in error).
+        */
+       if (test_bit(0, &ctx->cq_check_overflow))
+               __io_cqring_overflow_flush(ctx, false);
+       if (io_cqring_events(ctx))
+               goto out;
         do {
-               /*
-                * Don't enter poll loop if we already have events pending.
-                * If we do, we can potentially be spinning for commands that
-                * already triggered a CQE (eg in error).
-                */
-               if (test_bit(0, &ctx->cq_check_overflow))
-                       __io_cqring_overflow_flush(ctx, false);
-               if (io_cqring_events(ctx))
-                       break;
-
                 /*
                  * If a submit got punted to a workqueue, we can have the
                  * application entering polling for a command before it gets
@@ -2402,18 +2387,17 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
                  * forever, while the workqueue is stuck trying to acquire the
                  * very same mutex.
                  */
-               if (!(++iters & 7)) {
+               if (list_empty(&ctx->iopoll_list)) {
                         mutex_unlock(&ctx->uring_lock);
                         io_run_task_work();
                         mutex_lock(&ctx->uring_lock);
-               }
-
-               ret = io_iopoll_getevents(ctx, &nr_events, min);
-               if (ret <= 0)
-                       break;
-               ret = 0;
-       } while (min && !nr_events && !need_resched());
  
+                       if (list_empty(&ctx->iopoll_list))
+                               break;
+               }
+               ret = io_do_iopoll(ctx, &nr_events, min);
+       } while (!ret && nr_events < min && !need_resched());
+out:
         mutex_unlock(&ctx->uring_lock);
         return ret;
  }
@@ -2524,7 +2508,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
  /*
   * After the iocb has been issued, it's safe to be found on the poll list.
   * Adding the kiocb to the list AFTER submission ensures that we don't
- * find it from a io_iopoll_getevents() thread before the issuer is done
+ * find it from an io_do_iopoll() thread before the issuer is done
   * accessing the kiocb cookie.
   */
  static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
@@ -2703,6 +2687,12 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                 kiocb->ki_complete = io_complete_rw;
         }
  
+       if (req->opcode == IORING_OP_READ_FIXED ||
+           req->opcode == IORING_OP_WRITE_FIXED) {
+               req->imu = NULL;
+               io_req_set_rsrc_node(req);
+       }
+
         req->rw.addr = READ_ONCE(sqe->addr);
         req->rw.len = READ_ONCE(sqe->len);
         req->buf_index = READ_ONCE(sqe->buf_index);
@@ -2754,7 +2744,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
  
         if (check_reissue && req->flags & REQ_F_REISSUE) {
                 req->flags &= ~REQ_F_REISSUE;
-               if (!io_resubmit_prep(req)) {
+               if (io_resubmit_prep(req)) {
                         req_ref_get(req);
                         io_queue_async_work(req);
                 } else {
@@ -2768,21 +2758,13 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
         }
  }
  
-static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
+                            struct io_mapped_ubuf *imu)
  {
-       struct io_ring_ctx *ctx = req->ctx;
         size_t len = req->rw.len;
-       struct io_mapped_ubuf *imu;
-       u16 index, buf_index = req->buf_index;
         u64 buf_end, buf_addr = req->rw.addr;
         size_t offset;
  
-       if (unlikely(buf_index >= ctx->nr_user_bufs))
-               return -EFAULT;
-       index = array_index_nospec(buf_index, ctx->nr_user_bufs);
-       imu = &ctx->user_bufs[index];
-       buf_addr = req->rw.addr;
-
         if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
                 return -EFAULT;
         /* not inside the mapped region */
@@ -2834,6 +2816,22 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
         return 0;
  }
  
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
+{
+       struct io_ring_ctx *ctx = req->ctx;
+       struct io_mapped_ubuf *imu = req->imu;
+       u16 index, buf_index = req->buf_index;
+
+       if (likely(!imu)) {
+               if (unlikely(buf_index >= ctx->nr_user_bufs))
+                       return -EFAULT;
+               index = array_index_nospec(buf_index, ctx->nr_user_bufs);
+               imu = READ_ONCE(ctx->user_bufs[index]);
+               req->imu = imu;
+       }
+       return __io_import_fixed(req, rw, iter, imu);
+}
+
  static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
  {
         if (needs_lock)
@@ -3934,7 +3932,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
  static int io_provide_buffers_prep(struct io_kiocb *req,
                                    const struct io_uring_sqe *sqe)
  {
-       unsigned long size;
+       unsigned long size, tmp_check;
         struct io_provide_buf *p = &req->pbuf;
         u64 tmp;
  
@@ -3948,6 +3946,12 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
         p->addr = READ_ONCE(sqe->addr);
         p->len = READ_ONCE(sqe->len);
  
+       if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
+                               &size))
+               return -EOVERFLOW;
+       if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
+               return -EOVERFLOW;
+
         size = (unsigned long)p->len * p->nbufs;
         if (!access_ok(u64_to_user_ptr(p->addr), size))
                 return -EFAULT;
@@ -4892,7 +4896,7 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
         }
         if (req->poll.events & EPOLLONESHOT)
                 flags = 0;
-       if (!io_cqring_fill_event(req, error, flags)) {
+       if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
                 io_poll_remove_waitqs(req);
                 req->poll.done = true;
                 flags = 0;
@@ -4970,7 +4974,6 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
         poll->head = NULL;
         poll->done = false;
         poll->canceled = false;
-       poll->update_events = poll->update_user_data = false;
  #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
         /* mask in events that we always want/need */
         poll->events = events | IO_POLL_UNMASK;
@@ -4997,6 +5000,12 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                         pt->error = -EINVAL;
                         return;
                 }
+               /*
+                * Can't handle multishot for double wait for now, turn it
+                * into one-shot mode.
+                */
+               if (!(req->poll.events & EPOLLONESHOT))
+                       req->poll.events |= EPOLLONESHOT;
                 /* double add on the same waitqueue head, ignore */
                 if (poll->head == head)
                         return;
@@ -5050,9 +5059,6 @@ static void io_async_task_func(struct callback_head *cb)
                 __io_req_task_submit(req);
         else
                 io_req_complete_failed(req, -ECANCELED);
-
-       kfree(apoll->double_poll);
-       kfree(apoll);
  }
  
  static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -5168,8 +5174,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
         if (ret || ipt.error) {
                 io_poll_remove_double(req);
                 spin_unlock_irq(&ctx->completion_lock);
-               kfree(apoll->double_poll);
-               kfree(apoll);
                 return false;
         }
         spin_unlock_irq(&ctx->completion_lock);
@@ -5204,21 +5208,12 @@ static bool io_poll_remove_waitqs(struct io_kiocb *req)
         bool do_complete;
  
         io_poll_remove_double(req);
+       do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
  
-       if (req->opcode == IORING_OP_POLL_ADD) {
-               do_complete = __io_poll_remove_one(req, &req->poll, true);
-       } else {
-               struct async_poll *apoll = req->apoll;
-
+       if (req->opcode != IORING_OP_POLL_ADD && do_complete) {
                 /* non-poll requests have submit ref still */
-               do_complete = __io_poll_remove_one(req, &apoll->poll, true);
-               if (do_complete) {
-                       req_ref_put(req);
-                       kfree(apoll->double_poll);
-                       kfree(apoll);
-               }
+               req_ref_put(req);
         }
-
         return do_complete;
  }
  
@@ -5229,7 +5224,7 @@ static bool io_poll_remove_one(struct io_kiocb *req)
  
         do_complete = io_poll_remove_waitqs(req);
         if (do_complete) {
-               io_cqring_fill_event(req, -ECANCELED, 0);
+               io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
                 io_commit_cqring(req->ctx);
                 req_set_fail_links(req);
                 io_put_req_deferred(req, 1);
@@ -5266,7 +5261,8 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
         return posted != 0;
  }
  
-static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr)
+static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
+                                    bool poll_only)
         __must_hold(&ctx->completion_lock)
  {
         struct hlist_head *list;
@@ -5276,18 +5272,20 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr)
         hlist_for_each_entry(req, list, hash_node) {
                 if (sqe_addr != req->user_data)
                         continue;
+               if (poll_only && req->opcode != IORING_OP_POLL_ADD)
+                       continue;
                 return req;
         }
-
         return NULL;
  }
  
-static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
+                         bool poll_only)
         __must_hold(&ctx->completion_lock)
  {
         struct io_kiocb *req;
  
-       req = io_poll_find(ctx, sqe_addr);
+       req = io_poll_find(ctx, sqe_addr, poll_only);
         if (!req)
                 return -ENOENT;
         if (io_poll_remove_one(req))
@@ -5296,35 +5294,50 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
         return -EALREADY;
  }
  
-static int io_poll_remove_prep(struct io_kiocb *req,
+static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
+                                    unsigned int flags)
+{
+       u32 events;
+
+       events = READ_ONCE(sqe->poll32_events);
+#ifdef __BIG_ENDIAN
+       events = swahw32(events);
+#endif
+       if (!(flags & IORING_POLL_ADD_MULTI))
+               events |= EPOLLONESHOT;
+       return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
+}
+
+static int io_poll_update_prep(struct io_kiocb *req,
                                const struct io_uring_sqe *sqe)
  {
+       struct io_poll_update *upd = &req->poll_update;
+       u32 flags;
+
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
-       if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
-           sqe->poll_events)
+       if (sqe->ioprio || sqe->buf_index)
+               return -EINVAL;
+       flags = READ_ONCE(sqe->len);
+       if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
+                     IORING_POLL_ADD_MULTI))
+               return -EINVAL;
+       /* meaningless without update */
+       if (flags == IORING_POLL_ADD_MULTI)
                 return -EINVAL;
  
-       req->poll_remove.addr = READ_ONCE(sqe->addr);
-       return 0;
-}
-
-/*
- * Find a running poll command that matches one specified in sqe->addr,
- * and remove it if found.
- */
-static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
-{
-       struct io_ring_ctx *ctx = req->ctx;
-       int ret;
+       upd->old_user_data = READ_ONCE(sqe->addr);
+       upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
+       upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
  
-       spin_lock_irq(&ctx->completion_lock);
-       ret = io_poll_cancel(ctx, req->poll_remove.addr);
-       spin_unlock_irq(&ctx->completion_lock);
+       upd->new_user_data = READ_ONCE(sqe->off);
+       if (!upd->update_user_data && upd->new_user_data)
+               return -EINVAL;
+       if (upd->update_events)
+               upd->events = io_poll_parse_events(sqe, flags);
+       else if (sqe->poll32_events)
+               return -EINVAL;
  
-       if (ret < 0)
-               req_set_fail_links(req);
-       __io_req_complete(req, issue_flags, ret, 0);
         return 0;
  }
  
@@ -5348,40 +5361,21 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
  static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  {
         struct io_poll_iocb *poll = &req->poll;
-       u32 events, flags;
+       u32 flags;
  
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
-       if (sqe->ioprio || sqe->buf_index)
+       if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
                 return -EINVAL;
         flags = READ_ONCE(sqe->len);
-       if (flags & ~(IORING_POLL_ADD_MULTI | IORING_POLL_UPDATE_EVENTS |
-                       IORING_POLL_UPDATE_USER_DATA))
-               return -EINVAL;
-       events = READ_ONCE(sqe->poll32_events);
-#ifdef __BIG_ENDIAN
-       events = swahw32(events);
-#endif
-       if (!(flags & IORING_POLL_ADD_MULTI))
-               events |= EPOLLONESHOT;
-       poll->update_events = poll->update_user_data = false;
-       if (flags & IORING_POLL_UPDATE_EVENTS) {
-               poll->update_events = true;
-               poll->old_user_data = READ_ONCE(sqe->addr);
-       }
-       if (flags & IORING_POLL_UPDATE_USER_DATA) {
-               poll->update_user_data = true;
-               poll->new_user_data = READ_ONCE(sqe->off);
-       }
-       if (!(poll->update_events || poll->update_user_data) &&
-            (sqe->off || sqe->addr))
+       if (flags & ~IORING_POLL_ADD_MULTI)
                 return -EINVAL;
-       poll->events = demangle_poll(events) |
-                               (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
+
+       poll->events = io_poll_parse_events(sqe, flags);
         return 0;
  }
  
-static int __io_poll_add(struct io_kiocb *req)
+static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
  {
         struct io_poll_iocb *poll = &req->poll;
         struct io_ring_ctx *ctx = req->ctx;
@@ -5407,7 +5401,7 @@ static int __io_poll_add(struct io_kiocb *req)
         return ipt.error;
  }
  
-static int io_poll_update(struct io_kiocb *req)
+static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
  {
         struct io_ring_ctx *ctx = req->ctx;
         struct io_kiocb *preq;
@@ -5415,13 +5409,15 @@ static int io_poll_update(struct io_kiocb *req)
         int ret;
  
         spin_lock_irq(&ctx->completion_lock);
-       preq = io_poll_find(ctx, req->poll.old_user_data);
+       preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
         if (!preq) {
                 ret = -ENOENT;
                 goto err;
-       } else if (preq->opcode != IORING_OP_POLL_ADD) {
-               /* don't allow internal poll updates */
-               ret = -EACCES;
+       }
+
+       if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
+               completing = true;
+               ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
                 goto err;
         }
  
@@ -5445,21 +5441,20 @@ err:
                 return 0;
         }
         /* only mask one event flags, keep behavior flags */
-       if (req->poll.update_events) {
+       if (req->poll_update.update_events) {
                 preq->poll.events &= ~0xffff;
-               preq->poll.events |= req->poll.events & 0xffff;
+               preq->poll.events |= req->poll_update.events & 0xffff;
                 preq->poll.events |= IO_POLL_UNMASK;
         }
-       if (req->poll.update_user_data)
-               preq->user_data = req->poll.new_user_data;
-
+       if (req->poll_update.update_user_data)
+               preq->user_data = req->poll_update.new_user_data;
         spin_unlock_irq(&ctx->completion_lock);
  
         /* complete update request, we're done with it */
         io_req_complete(req, ret);
  
         if (!completing) {
-               ret = __io_poll_add(preq);
+               ret = io_poll_add(preq, issue_flags);
                 if (ret < 0) {
                         req_set_fail_links(preq);
                         io_req_complete(preq, ret);
@@ -5468,13 +5463,6 @@ err:
         return 0;
  }
  
-static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
-{
-       if (!req->poll.update_events && !req->poll.update_user_data)
-               return __io_poll_add(req);
-       return io_poll_update(req);
-}
-
  static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
  {
         struct io_timeout_data *data = container_of(timer,
@@ -5488,7 +5476,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
         atomic_set(&req->ctx->cq_timeouts,
                 atomic_read(&req->ctx->cq_timeouts) + 1);
  
-       io_cqring_fill_event(req, -ETIME, 0);
+       io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
         io_commit_cqring(ctx);
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
  
@@ -5504,21 +5492,18 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
  {
         struct io_timeout_data *io;
         struct io_kiocb *req;
-       int ret = -ENOENT;
+       bool found = false;
  
         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
-               if (user_data == req->user_data) {
-                       ret = 0;
+               found = user_data == req->user_data;
+               if (found)
                         break;
-               }
         }
-
-       if (ret == -ENOENT)
-               return ERR_PTR(ret);
+       if (!found)
+               return ERR_PTR(-ENOENT);
  
         io = req->async_data;
-       ret = hrtimer_try_to_cancel(&io->timer);
-       if (ret == -1)
+       if (hrtimer_try_to_cancel(&io->timer) == -1)
                 return ERR_PTR(-EALREADY);
         list_del_init(&req->timeout.list);
         return req;
@@ -5533,7 +5518,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
                 return PTR_ERR(req);
  
         req_set_fail_links(req);
-       io_cqring_fill_event(req, -ECANCELED, 0);
+       io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
         io_put_req_deferred(req, 1);
         return 0;
  }
@@ -5606,7 +5591,7 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
                 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
                                         io_translate_timeout_mode(tr->flags));
  
-       io_cqring_fill_event(req, ret, 0);
+       io_cqring_fill_event(ctx, req->user_data, ret, 0);
         io_commit_cqring(ctx);
         spin_unlock_irq(&ctx->completion_lock);
         io_cqring_ev_posted(ctx);
@@ -5754,11 +5739,11 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
         ret = io_timeout_cancel(ctx, sqe_addr);
         if (ret != -ENOENT)
                 goto done;
-       ret = io_poll_cancel(ctx, sqe_addr);
+       ret = io_poll_cancel(ctx, sqe_addr, false);
  done:
         if (!ret)
                 ret = success_ret;
-       io_cqring_fill_event(req, ret, 0);
+       io_cqring_fill_event(ctx, req->user_data, ret, 0);
         io_commit_cqring(ctx);
         spin_unlock_irqrestore(&ctx->completion_lock, flags);
         io_cqring_ev_posted(ctx);
@@ -5796,7 +5781,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
         ret = io_timeout_cancel(ctx, sqe_addr);
         if (ret != -ENOENT)
                 goto done;
-       ret = io_poll_cancel(ctx, sqe_addr);
+       ret = io_poll_cancel(ctx, sqe_addr, false);
         if (ret != -ENOENT)
                 goto done;
         spin_unlock_irq(&ctx->completion_lock);
@@ -5815,7 +5800,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
  
         spin_lock_irq(&ctx->completion_lock);
  done:
-       io_cqring_fill_event(req, ret, 0);
+       io_cqring_fill_event(ctx, req->user_data, ret, 0);
         io_commit_cqring(ctx);
         spin_unlock_irq(&ctx->completion_lock);
         io_cqring_ev_posted(ctx);
@@ -5847,7 +5832,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
  static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
  {
         struct io_ring_ctx *ctx = req->ctx;
-       struct io_uring_rsrc_update up;
+       struct io_uring_rsrc_update2 up;
         int ret;
  
         if (issue_flags & IO_URING_F_NONBLOCK)
@@ -5855,9 +5840,13 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
  
         up.offset = req->rsrc_update.offset;
         up.data = req->rsrc_update.arg;
+       up.nr = 0;
+       up.tags = 0;
+       up.resv = 0;
  
         mutex_lock(&ctx->uring_lock);
-       ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
+       ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
+                                       &up, req->rsrc_update.nr_args);
         mutex_unlock(&ctx->uring_lock);
  
         if (ret < 0)
@@ -5882,7 +5871,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         case IORING_OP_POLL_ADD:
                 return io_poll_add_prep(req, sqe);
         case IORING_OP_POLL_REMOVE:
-               return io_poll_remove_prep(req, sqe);
+               return io_poll_update_prep(req, sqe);
         case IORING_OP_FSYNC:
                 return io_fsync_prep(req, sqe);
         case IORING_OP_SYNC_FILE_RANGE:
@@ -5941,7 +5930,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
  
         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
                         req->opcode);
-       return-EINVAL;
+       return -EINVAL;
  }
  
  static int io_req_prep_async(struct io_kiocb *req)
@@ -6081,6 +6070,17 @@ static void io_clean_op(struct io_kiocb *req)
                 }
                 req->flags &= ~REQ_F_NEED_CLEANUP;
         }
+       if ((req->flags & REQ_F_POLLED) && req->apoll) {
+               kfree(req->apoll->double_poll);
+               kfree(req->apoll);
+               req->apoll = NULL;
+       }
+       if (req->flags & REQ_F_INFLIGHT) {
+               struct io_uring_task *tctx = req->task->io_uring;
+
+               atomic_dec(&tctx->inflight_tracked);
+               req->flags &= ~REQ_F_INFLIGHT;
+       }
  }
  
  static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
@@ -6113,7 +6113,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
                 ret = io_poll_add(req, issue_flags);
                 break;
         case IORING_OP_POLL_REMOVE:
-               ret = io_poll_remove(req, issue_flags);
+               ret = io_poll_update(req, issue_flags);
                 break;
         case IORING_OP_SYNC_FILE_RANGE:
                 ret = io_sync_file_range(req, issue_flags);
@@ -6265,19 +6265,19 @@ static void io_wq_submit_work(struct io_wq_work *work)
  #endif
  #define FFS_MASK               ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
  
-static inline struct io_fixed_file *io_fixed_file_slot(struct io_rsrc_data *file_data,
+static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
                                                       unsigned i)
  {
-       struct fixed_rsrc_table *table;
+       struct io_fixed_file *table_l2;
  
-       table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-       return &table->files[i & IORING_FILE_TABLE_MASK];
+       table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT];
+       return &table_l2[i & IORING_FILE_TABLE_MASK];
  }
  
  static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
                                               int index)
  {
-       struct io_fixed_file *slot = io_fixed_file_slot(ctx->file_data, index);
+       struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
  
         return (struct file *) (slot->file_ptr & FFS_MASK);
  }
@@ -6307,7 +6307,7 @@ static struct file *io_file_get(struct io_submit_state *state,
                 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
                         return NULL;
                 fd = array_index_nospec(fd, ctx->nr_user_files);
-               file_ptr = io_fixed_file_slot(ctx->file_data, fd)->file_ptr;
+               file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
                 file = (struct file *) (file_ptr & FFS_MASK);
                 file_ptr &= ~FFS_MASK;
                 /* mask in overlapping REQ_F and FFS bits */
@@ -6694,12 +6694,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
  {
         int submitted = 0;
  
-       /* if we have a backlog and couldn't flush it all, return BUSY */
-       if (test_bit(0, &ctx->sq_check_overflow)) {
-               if (!__io_cqring_overflow_flush(ctx, false))
-                       return -EBUSY;
-       }
-
         /* make sure SQ entry isn't read before tail */
         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
  
@@ -6780,6 +6774,10 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
                 if (!list_empty(&ctx->iopoll_list))
                         io_do_iopoll(ctx, &nr_events, 0);
  
+               /*
+                * Don't submit if refs are dying, good for io_uring_register(),
+                * but also it is relied upon by io_ring_exit_work()
+                */
                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
                     !(ctx->flags & IORING_SETUP_R_DISABLED))
                         ret = io_submit_sqes(ctx, to_submit);
@@ -6812,7 +6810,6 @@ static int io_sq_thread(void *data)
  
         snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
         set_task_comm(current, buf);
-       current->pf_io_worker = NULL;
  
         if (sqd->sq_cpu != -1)
                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
@@ -6866,27 +6863,29 @@ static int io_sq_thread(void *data)
                         continue;
                 }
  
-               needs_sched = true;
                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
-               list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-                       if ((ctx->flags & IORING_SETUP_IOPOLL) &&
-                           !list_empty_careful(&ctx->iopoll_list)) {
-                               needs_sched = false;
-                               break;
-                       }
-                       if (io_sqring_entries(ctx)) {
-                               needs_sched = false;
-                               break;
-                       }
-               }
-
-               if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
+               if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                 io_ring_set_wakeup_flag(ctx);
  
-                       mutex_unlock(&sqd->lock);
-                       schedule();
-                       mutex_lock(&sqd->lock);
+                       needs_sched = true;
+                       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+                               if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+                                   !list_empty_careful(&ctx->iopoll_list)) {
+                                       needs_sched = false;
+                                       break;
+                               }
+                               if (io_sqring_entries(ctx)) {
+                                       needs_sched = false;
+                                       break;
+                               }
+                       }
+
+                       if (needs_sched) {
+                               mutex_unlock(&sqd->lock);
+                               schedule();
+                               mutex_lock(&sqd->lock);
+                       }
                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                 io_ring_clear_wakeup_flag(ctx);
                 }
@@ -6896,15 +6895,14 @@ static int io_sq_thread(void *data)
                 timeout = jiffies + sqd->sq_thread_idle;
         }
  
-       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-               io_uring_cancel_sqpoll(ctx);
+       io_uring_cancel_sqpoll(sqd);
         sqd->thread = NULL;
         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                 io_ring_set_wakeup_flag(ctx);
-       mutex_unlock(&sqd->lock);
-
         io_run_task_work();
         io_run_task_work_head(&sqd->park_task_work);
+       mutex_unlock(&sqd->lock);
+
         complete(&sqd->exited);
         do_exit(0);
  }
@@ -7044,44 +7042,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
  }
  
-static void io_free_file_tables(struct io_rsrc_data *data, unsigned nr_files)
+static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
  {
         unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
  
         for (i = 0; i < nr_tables; i++)
-               kfree(data->table[i].files);
-       kfree(data->table);
-       data->table = NULL;
-}
-
-static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
-#if defined(CONFIG_UNIX)
-       if (ctx->ring_sock) {
-               struct sock *sock = ctx->ring_sock->sk;
-               struct sk_buff *skb;
-
-               while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
-                       kfree_skb(skb);
-       }
-#else
-       int i;
-
-       for (i = 0; i < ctx->nr_user_files; i++) {
-               struct file *file;
-
-               file = io_file_from_index(ctx, i);
-               if (file)
-                       fput(file);
-       }
-#endif
-}
-
-static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
-{
-       struct io_rsrc_data *data = container_of(ref, struct io_rsrc_data, refs);
-
-       complete(&data->done);
+               kfree(table->files[i]);
+       kfree(table->files);
+       table->files = NULL;
  }
  
  static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
@@ -7114,7 +7082,7 @@ static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
                 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
                 io_rsrc_ref_unlock(ctx);
  
-               percpu_ref_get(&data_to_kill->refs);
+               atomic_inc(&data_to_kill->refs);
                 percpu_ref_kill(&rsrc_node->refs);
                 ctx->rsrc_node = NULL;
         }
@@ -7148,14 +7116,17 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
                         break;
                 io_rsrc_node_switch(ctx, data);
  
-               percpu_ref_kill(&data->refs);
+               /* kill initial ref, already quiesced if zero */
+               if (atomic_dec_and_test(&data->refs))
+                       break;
                 flush_delayed_work(&ctx->rsrc_put_work);
-
                 ret = wait_for_completion_interruptible(&data->done);
                 if (!ret)
                         break;
  
-               percpu_ref_resurrect(&data->refs);
+               atomic_inc(&data->refs);
+               /* wait for all works potentially completing data->done */
+               flush_delayed_work(&ctx->rsrc_put_work);
                 reinit_completion(&data->done);
  
                 mutex_unlock(&ctx->uring_lock);
@@ -7167,8 +7138,15 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct
         return ret;
  }
  
+static void io_rsrc_data_free(struct io_rsrc_data *data)
+{
+       kvfree(data->tags);
+       kfree(data);
+}
+
  static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
-                                              rsrc_put_fn *do_put)
+                                              rsrc_put_fn *do_put,
+                                              unsigned nr)
  {
         struct io_rsrc_data *data;
  
@@ -7176,40 +7154,56 @@ static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx,
         if (!data)
                 return NULL;
  
-       if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
-                           PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+       data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL);
+       if (!data->tags) {
                 kfree(data);
                 return NULL;
         }
+
+       atomic_set(&data->refs, 1);
         data->ctx = ctx;
         data->do_put = do_put;
         init_completion(&data->done);
         return data;
  }
  
-static void io_rsrc_data_free(struct io_rsrc_data *data)
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
  {
-       percpu_ref_exit(&data->refs);
-       kfree(data);
+#if defined(CONFIG_UNIX)
+       if (ctx->ring_sock) {
+               struct sock *sock = ctx->ring_sock->sk;
+               struct sk_buff *skb;
+
+               while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+                       kfree_skb(skb);
+       }
+#else
+       int i;
+
+       for (i = 0; i < ctx->nr_user_files; i++) {
+               struct file *file;
+
+               file = io_file_from_index(ctx, i);
+               if (file)
+                       fput(file);
+       }
+#endif
+       io_free_file_tables(&ctx->file_table, ctx->nr_user_files);
+       io_rsrc_data_free(ctx->file_data);
+       ctx->file_data = NULL;
+       ctx->nr_user_files = 0;
  }
  
  static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
  {
-       struct io_rsrc_data *data = ctx->file_data;
         int ret;
  
-       if (!data)
+       if (!ctx->file_data)
                 return -ENXIO;
-       ret = io_rsrc_ref_quiesce(data, ctx);
-       if (ret)
-               return ret;
-
-       __io_sqe_files_unregister(ctx);
-       io_free_file_tables(data, ctx->nr_user_files);
-       io_rsrc_data_free(data);
-       ctx->file_data = NULL;
-       ctx->nr_user_files = 0;
-       return 0;
+       ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
+       if (!ret)
+               __io_sqe_files_unregister(ctx);
+       return ret;
  }
  
  static void io_sq_thread_unpark(struct io_sq_data *sqd)
@@ -7242,9 +7236,10 @@ static void io_sq_thread_park(struct io_sq_data *sqd)
  static void io_sq_thread_stop(struct io_sq_data *sqd)
  {
         WARN_ON_ONCE(sqd->thread == current);
+       WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
  
-       mutex_lock(&sqd->lock);
         set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+       mutex_lock(&sqd->lock);
         if (sqd->thread)
                 wake_up_process(sqd->thread);
         mutex_unlock(&sqd->lock);
@@ -7273,8 +7268,6 @@ static void io_sq_thread_finish(struct io_ring_ctx *ctx)
  
                 io_put_sq_data(sqd);
                 ctx->sq_data = NULL;
-               if (ctx->sq_creds)
-                       put_cred(ctx->sq_creds);
         }
  }
  
@@ -7435,23 +7428,20 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
  }
  #endif
  
-static bool io_alloc_file_tables(struct io_rsrc_data *file_data,
-                                unsigned nr_files)
+static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
  {
         unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE);
  
-       file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
-                                  GFP_KERNEL);
-       if (!file_data->table)
+       table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL);
+       if (!table->files)
                 return false;
  
         for (i = 0; i < nr_tables; i++) {
-               struct fixed_rsrc_table *table = &file_data->table[i];
                 unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE);
  
-               table->files = kcalloc(this_files, sizeof(struct file *),
+               table->files[i] = kcalloc(this_files, sizeof(*table->files[i]),
                                         GFP_KERNEL);
-               if (!table->files)
+               if (!table->files[i])
                         break;
                 nr_files -= this_files;
         }
@@ -7459,7 +7449,7 @@ static bool io_alloc_file_tables(struct io_rsrc_data *file_data,
         if (i == nr_tables)
                 return true;
  
-       io_free_file_tables(file_data, nr_tables * IORING_MAX_FILES_TABLE);
+       io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE);
         return false;
  }
  
@@ -7534,12 +7524,27 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
  
         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
                 list_del(&prsrc->list);
+
+               if (prsrc->tag) {
+                       bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
+                       unsigned long flags;
+
+                       io_ring_submit_lock(ctx, lock_ring);
+                       spin_lock_irqsave(&ctx->completion_lock, flags);
+                       io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
+                       io_commit_cqring(ctx);
+                       spin_unlock_irqrestore(&ctx->completion_lock, flags);
+                       io_cqring_ev_posted(ctx);
+                       io_ring_submit_unlock(ctx, lock_ring);
+               }
+
                 rsrc_data->do_put(ctx, prsrc);
                 kfree(prsrc);
         }
  
         io_rsrc_node_destroy(ref_node);
-       percpu_ref_put(&rsrc_data->refs);
+       if (atomic_dec_and_test(&rsrc_data->refs))
+               complete(&rsrc_data->done);
  }
  
  static void io_rsrc_put_work(struct work_struct *work)
@@ -7563,10 +7568,8 @@ static void io_rsrc_put_work(struct work_struct *work)
  static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
  {
         struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
-       struct io_rsrc_data *data = node->rsrc_data;
-       struct io_ring_ctx *ctx = data->ctx;
+       struct io_ring_ctx *ctx = node->rsrc_data->ctx;
         bool first_add = false;
-       int delay;
  
         io_rsrc_ref_lock(ctx);
         node->done = true;
@@ -7582,9 +7585,8 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
         }
         io_rsrc_ref_unlock(ctx);
  
-       delay = percpu_ref_is_dying(&data->refs) ? 0 : HZ;
-       if (first_add || !delay)
-               mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
+       if (first_add)
+               mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
  }
  
  static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
@@ -7607,7 +7609,7 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
  }
  
  static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
-                                unsigned nr_args)
+                                unsigned nr_args, u64 __user *tags)
  {
         __s32 __user *fds = (__s32 __user *) arg;
         struct file *file;
@@ -7625,27 +7627,33 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
         if (ret)
                 return ret;
  
-       file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put);
+       file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args);
         if (!file_data)
                 return -ENOMEM;
         ctx->file_data = file_data;
-
         ret = -ENOMEM;
-       if (!io_alloc_file_tables(file_data, nr_args))
+       if (!io_alloc_file_tables(&ctx->file_table, nr_args))
                 goto out_free;
  
         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-               if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
+               u64 tag = 0;
+
+               if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) ||
+                   copy_from_user(&fd, &fds[i], sizeof(fd))) {
                         ret = -EFAULT;
                         goto out_fput;
                 }
                 /* allow sparse sets */
-               if (fd == -1)
+               if (fd == -1) {
+                       ret = -EINVAL;
+                       if (unlikely(tag))
+                               goto out_fput;
                         continue;
+               }
  
                 file = fget(fd);
                 ret = -EBADF;
-               if (!file)
+               if (unlikely(!file))
                         goto out_fput;
  
                 /*
@@ -7659,12 +7667,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                         fput(file);
                         goto out_fput;
                 }
-               io_fixed_file_set(io_fixed_file_slot(file_data, i), file);
+               ctx->file_data->tags[i] = tag;
+               io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
         }
  
         ret = io_sqe_files_scm(ctx);
         if (ret) {
-               io_sqe_files_unregister(ctx);
+               __io_sqe_files_unregister(ctx);
                 return ret;
         }
  
@@ -7676,7 +7685,7 @@ out_fput:
                 if (file)
                         fput(file);
         }
-       io_free_file_tables(file_data, nr_args);
+       io_free_file_tables(&ctx->file_table, nr_args);
         ctx->nr_user_files = 0;
  out_free:
         io_rsrc_data_free(ctx->file_data);
@@ -7727,7 +7736,7 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
  #endif
  }
  
-static int io_queue_rsrc_removal(struct io_rsrc_data *data,
+static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
                                  struct io_rsrc_node *node, void *rsrc)
  {
         struct io_rsrc_put *prsrc;
@@ -7736,47 +7745,52 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data,
         if (!prsrc)
                 return -ENOMEM;
  
+       prsrc->tag = data->tags[idx];
         prsrc->rsrc = rsrc;
         list_add(&prsrc->list, &node->rsrc_list);
         return 0;
  }
  
  static int __io_sqe_files_update(struct io_ring_ctx *ctx,
-                                struct io_uring_rsrc_update *up,
+                                struct io_uring_rsrc_update2 *up,
                                  unsigned nr_args)
  {
+       u64 __user *tags = u64_to_user_ptr(up->tags);
+       __s32 __user *fds = u64_to_user_ptr(up->data);
         struct io_rsrc_data *data = ctx->file_data;
         struct io_fixed_file *file_slot;
         struct file *file;
-       __s32 __user *fds;
-       int fd, i, err;
-       __u32 done;
+       int fd, i, err = 0;
+       unsigned int done;
         bool needs_switch = false;
  
-       if (check_add_overflow(up->offset, nr_args, &done))
-               return -EOVERFLOW;
-       if (done > ctx->nr_user_files)
+       if (!ctx->file_data)
+               return -ENXIO;
+       if (up->offset + nr_args > ctx->nr_user_files)
                 return -EINVAL;
-       err = io_rsrc_node_switch_start(ctx);
-       if (err)
-               return err;
  
-       fds = u64_to_user_ptr(up->data);
         for (done = 0; done < nr_args; done++) {
-               err = 0;
-               if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+               u64 tag = 0;
+
+               if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
+                   copy_from_user(&fd, &fds[done], sizeof(fd))) {
                         err = -EFAULT;
                         break;
                 }
+               if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
+                       err = -EINVAL;
+                       break;
+               }
                 if (fd == IORING_REGISTER_FILES_SKIP)
                         continue;
  
                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-               file_slot = io_fixed_file_slot(ctx->file_data, i);
+               file_slot = io_fixed_file_slot(&ctx->file_table, i);
  
                 if (file_slot->file_ptr) {
                         file = (struct file *)(file_slot->file_ptr & FFS_MASK);
-                       err = io_queue_rsrc_removal(data, ctx->rsrc_node, file);
+                       err = io_queue_rsrc_removal(data, up->offset + done,
+                                                   ctx->rsrc_node, file);
                         if (err)
                                 break;
                         file_slot->file_ptr = 0;
@@ -7801,6 +7815,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                 err = -EBADF;
                                 break;
                         }
+                       data->tags[up->offset + done] = tag;
                         io_fixed_file_set(file_slot, file);
                         err = io_sqe_file_register(ctx, file, i);
                         if (err) {
@@ -7816,23 +7831,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
         return done ? done : err;
  }
  
-static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
-                              unsigned nr_args)
-{
-       struct io_uring_rsrc_update up;
-
-       if (!ctx->file_data)
-               return -ENXIO;
-       if (!nr_args)
-               return -EINVAL;
-       if (copy_from_user(&up, arg, sizeof(up)))
-               return -EFAULT;
-       if (up.resv)
-               return -EINVAL;
-
-       return __io_sqe_files_update(ctx, &up, nr_args);
-}
-
  static struct io_wq_work *io_free_work(struct io_wq_work *work)
  {
         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
@@ -7931,11 +7929,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                 f = fdget(p->wq_fd);
                 if (!f.file)
                         return -ENXIO;
-               if (f.file->f_op != &io_uring_fops) {
-                       fdput(f);
-                       return -EINVAL;
-               }
                 fdput(f);
+               if (f.file->f_op != &io_uring_fops)
+                       return -EINVAL;
         }
         if (ctx->flags & IORING_SETUP_SQPOLL) {
                 struct task_struct *tsk;
@@ -7954,13 +7950,11 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                 if (!ctx->sq_thread_idle)
                         ctx->sq_thread_idle = HZ;
  
-               ret = 0;
                 io_sq_thread_park(sqd);
                 list_add(&ctx->sqd_list, &sqd->ctx_list);
                 io_sqd_update_thread_idle(sqd);
                 /* don't attach to a dying SQPOLL thread, would be racy */
-               if (attached && !sqd->thread)
-                       ret = -ENXIO;
+               ret = (attached && !sqd->thread) ? -ENXIO : 0;
                 io_sq_thread_unpark(sqd);
  
                 if (ret < 0)
@@ -7972,11 +7966,8 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
                         int cpu = p->sq_thread_cpu;
  
                         ret = -EINVAL;
-                       if (cpu >= nr_cpu_ids)
-                               goto err_sqpoll;
-                       if (!cpu_online(cpu))
+                       if (cpu >= nr_cpu_ids || !cpu_online(cpu))
                                 goto err_sqpoll;
-
                         sqd->sq_cpu = cpu;
                 } else {
                         sqd->sq_cpu = -1;
@@ -8002,12 +7993,11 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
         }
  
         return 0;
+err_sqpoll:
+       complete(&ctx->sq_data->exited);
  err:
         io_sq_thread_finish(ctx);
         return ret;
-err_sqpoll:
-       complete(&ctx->sq_data->exited);
-       goto err;
  }
  
  static inline void __io_unaccount_mem(struct user_struct *user,
@@ -8109,29 +8099,49 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
         return off;
  }
  
-static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
  {
-       int i, j;
-
-       if (!ctx->user_bufs)
-               return -ENXIO;
+       struct io_mapped_ubuf *imu = *slot;
+       unsigned int i;
  
-       for (i = 0; i < ctx->nr_user_bufs; i++) {
-               struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+       for (i = 0; i < imu->nr_bvecs; i++)
+               unpin_user_page(imu->bvec[i].bv_page);
+       if (imu->acct_pages)
+               io_unaccount_mem(ctx, imu->acct_pages);
+       kvfree(imu);
+       *slot = NULL;
+}
  
-               for (j = 0; j < imu->nr_bvecs; j++)
-                       unpin_user_page(imu->bvec[j].bv_page);
+static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
+{
+       io_buffer_unmap(ctx, &prsrc->buf);
+       prsrc->buf = NULL;
+}
  
-               if (imu->acct_pages)
-                       io_unaccount_mem(ctx, imu->acct_pages);
-               kvfree(imu->bvec);
-               imu->nr_bvecs = 0;
-       }
+static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+       unsigned int i;
  
+       for (i = 0; i < ctx->nr_user_bufs; i++)
+               io_buffer_unmap(ctx, &ctx->user_bufs[i]);
         kfree(ctx->user_bufs);
+       kfree(ctx->buf_data);
         ctx->user_bufs = NULL;
+       ctx->buf_data = NULL;
         ctx->nr_user_bufs = 0;
-       return 0;
+}
+
+static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+{
+       int ret;
+
+       if (!ctx->buf_data)
+               return -ENXIO;
+
+       ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
+       if (!ret)
+               __io_sqe_buffers_unregister(ctx);
+       return ret;
  }
  
  static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
@@ -8183,7 +8193,7 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
  
         /* check previously registered pages */
         for (i = 0; i < ctx->nr_user_bufs; i++) {
-               struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+               struct io_mapped_ubuf *imu = ctx->user_bufs[i];
  
                 for (j = 0; j < imu->nr_bvecs; j++) {
                         if (!PageCompound(imu->bvec[j].bv_page))
@@ -8228,9 +8238,10 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
  }
  
  static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-                                 struct io_mapped_ubuf *imu,
+                                 struct io_mapped_ubuf **pimu,
                                   struct page **last_hpage)
  {
+       struct io_mapped_ubuf *imu = NULL;
         struct vm_area_struct **vmas = NULL;
         struct page **pages = NULL;
         unsigned long off, start, end, ubuf;
@@ -8242,6 +8253,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
         start = ubuf >> PAGE_SHIFT;
         nr_pages = end - start;
  
+       *pimu = NULL;
         ret = -ENOMEM;
  
         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
@@ -8253,9 +8265,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
         if (!vmas)
                 goto done;
  
-       imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
-                                  GFP_KERNEL);
-       if (!imu->bvec)
+       imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+       if (!imu)
                 goto done;
  
         ret = 0;
@@ -8284,14 +8295,12 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
                  */
                 if (pret > 0)
                         unpin_user_pages(pages, pret);
-               kvfree(imu->bvec);
                 goto done;
         }
  
         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
         if (ret) {
                 unpin_user_pages(pages, pret);
-               kvfree(imu->bvec);
                 goto done;
         }
  
@@ -8311,8 +8320,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
         imu->ubuf = ubuf;
         imu->ubuf_end = ubuf + iov->iov_len;
         imu->nr_bvecs = nr_pages;
+       *pimu = imu;
         ret = 0;
  done:
+       if (ret)
+               kvfree(imu);
         kvfree(pages);
         kvfree(vmas);
         return ret;
@@ -8320,17 +8332,8 @@ done:
  
  static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
  {
-       if (ctx->user_bufs)
-               return -EBUSY;
-       if (!nr_args || nr_args > UIO_MAXIOV)
-               return -EINVAL;
-
-       ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
-                                       GFP_KERNEL);
-       if (!ctx->user_bufs)
-               return -ENOMEM;
-
-       return 0;
+       ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
+       return ctx->user_bufs ? 0 : -ENOMEM;
  }
  
  static int io_buffer_validate(struct iovec *iov)
@@ -8356,40 +8359,116 @@ static int io_buffer_validate(struct iovec *iov)
  }
  
  static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
-                                  unsigned int nr_args)
+                                  unsigned int nr_args, u64 __user *tags)
  {
+       struct page *last_hpage = NULL;
+       struct io_rsrc_data *data;
         int i, ret;
         struct iovec iov;
-       struct page *last_hpage = NULL;
  
-       ret = io_buffers_map_alloc(ctx, nr_args);
+       if (ctx->user_bufs)
+               return -EBUSY;
+       if (!nr_args || nr_args > UIO_MAXIOV)
+               return -EINVAL;
+       ret = io_rsrc_node_switch_start(ctx);
         if (ret)
                 return ret;
+       data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args);
+       if (!data)
+               return -ENOMEM;
+       ret = io_buffers_map_alloc(ctx, nr_args);
+       if (ret) {
+               kfree(data);
+               return ret;
+       }
  
-       for (i = 0; i < nr_args; i++) {
-               struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
+       for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+               u64 tag = 0;
  
+               if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) {
+                       ret = -EFAULT;
+                       break;
+               }
                 ret = io_copy_iov(ctx, &iov, arg, i);
                 if (ret)
                         break;
-
                 ret = io_buffer_validate(&iov);
                 if (ret)
                         break;
  
-               ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
+               ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
+                                            &last_hpage);
                 if (ret)
                         break;
-
-               ctx->nr_user_bufs++;
+               data->tags[i] = tag;
         }
  
-       if (ret)
-               io_sqe_buffers_unregister(ctx);
+       WARN_ON_ONCE(ctx->buf_data);
  
+       ctx->buf_data = data;
+       if (ret)
+               __io_sqe_buffers_unregister(ctx);
+       else
+               io_rsrc_node_switch(ctx, NULL);
         return ret;
  }
  
+static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
+                                  struct io_uring_rsrc_update2 *up,
+                                  unsigned int nr_args)
+{
+       u64 __user *tags = u64_to_user_ptr(up->tags);
+       struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
+       struct io_mapped_ubuf *imu;
+       struct page *last_hpage = NULL;
+       bool needs_switch = false;
+       __u32 done;
+       int i, err;
+
+       if (!ctx->buf_data)
+               return -ENXIO;
+       if (up->offset + nr_args > ctx->nr_user_bufs)
+               return -EINVAL;
+
+       for (done = 0; done < nr_args; done++) {
+               u64 tag = 0;
+
+               err = io_copy_iov(ctx, &iov, iovs, done);
+               if (err)
+                       break;
+               if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
+                       err = -EFAULT;
+                       break;
+               }
+
+               i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
+               imu = ctx->user_bufs[i];
+               if (imu) {
+                       err = io_queue_rsrc_removal(ctx->buf_data, up->offset + done,
+                                                   ctx->rsrc_node, imu);
+                       if (err)
+                               break;
+                       ctx->user_bufs[i] = NULL;
+                       needs_switch = true;
+               }
+
+               if (iov.iov_base || iov.iov_len) {
+                       err = io_buffer_validate(&iov);
+                       if (err)
+                               break;
+                       err = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
+                                                    &last_hpage);
+                       if (err)
+                               break;
+                       ctx->buf_data->tags[up->offset + done] = tag;
+               }
+       }
+
+       if (needs_switch)
+               io_rsrc_node_switch(ctx, ctx->buf_data);
+       return done ? done : err;
+}
+
  static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
  {
         __s32 __user *fds = arg;
@@ -8461,10 +8540,18 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
         mutex_unlock(&ctx->uring_lock);
  }
  
+static bool io_wait_rsrc_data(struct io_rsrc_data *data)
+{
+       if (!data)
+               return false;
+       if (!atomic_dec_and_test(&data->refs))
+               wait_for_completion(&data->done);
+       return true;
+}
+
  static void io_ring_ctx_free(struct io_ring_ctx *ctx)
  {
         io_sq_thread_finish(ctx);
-       io_sqe_buffers_unregister(ctx);
  
         if (ctx->mm_account) {
                 mmdrop(ctx->mm_account);
@@ -8472,12 +8559,17 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
         }
  
         mutex_lock(&ctx->uring_lock);
-       io_sqe_files_unregister(ctx);
+       if (io_wait_rsrc_data(ctx->buf_data))
+               __io_sqe_buffers_unregister(ctx);
+       if (io_wait_rsrc_data(ctx->file_data))
+               __io_sqe_files_unregister(ctx);
         if (ctx->rings)
                 __io_cqring_overflow_flush(ctx, true);
         mutex_unlock(&ctx->uring_lock);
         io_eventfd_unregister(ctx);
         io_destroy_buffers(ctx);
+       if (ctx->sq_creds)
+               put_cred(ctx->sq_creds);
  
         /* there are no registered resources left, nobody uses it */
         if (ctx->rsrc_node)
@@ -8595,14 +8687,6 @@ static void io_ring_exit_work(struct work_struct *work)
         struct io_tctx_node *node;
         int ret;
  
-       /* prevent SQPOLL from submitting new requests */
-       if (ctx->sq_data) {
-               io_sq_thread_park(ctx->sq_data);
-               list_del_init(&ctx->sqd_list);
-               io_sqd_update_thread_idle(ctx->sq_data);
-               io_sq_thread_unpark(ctx->sq_data);
-       }
-
         /*
          * If we're doing polled IO and end up having requests being
          * submitted async (out-of-line), then completions can come in while
@@ -8615,6 +8699,9 @@ static void io_ring_exit_work(struct work_struct *work)
                 WARN_ON_ONCE(time_after(jiffies, timeout));
         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
  
+       init_completion(&exit.completion);
+       init_task_work(&exit.task_work, io_tctx_exit_cb);
+       exit.ctx = ctx;
         /*
          * Some may use context even when all refs and requests have been put,
          * and they are free to do so while still holding uring_lock or
@@ -8627,9 +8714,8 @@ static void io_ring_exit_work(struct work_struct *work)
  
                 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
                                         ctx_node);
-               exit.ctx = ctx;
-               init_completion(&exit.completion);
-               init_task_work(&exit.task_work, io_tctx_exit_cb);
+               /* don't spin on a single task if cancellation failed */
+               list_rotate_left(&ctx->tctx_list);
                 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
                 if (WARN_ON_ONCE(ret))
                         continue;
@@ -8637,7 +8723,6 @@ static void io_ring_exit_work(struct work_struct *work)
  
                 mutex_unlock(&ctx->uring_lock);
                 wait_for_completion(&exit.completion);
-               cond_resched();
                 mutex_lock(&ctx->uring_lock);
         }
         mutex_unlock(&ctx->uring_lock);
@@ -8928,11 +9013,12 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
  static void io_sqpoll_cancel_cb(struct callback_head *cb)
  {
         struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
-       struct io_ring_ctx *ctx = work->ctx;
-       struct io_sq_data *sqd = ctx->sq_data;
+       struct io_sq_data *sqd = work->ctx->sq_data;
  
         if (sqd->thread)
-               io_uring_cancel_sqpoll(ctx);
+               io_uring_cancel_sqpoll(sqd);
+       list_del_init(&work->ctx->sqd_list);
+       io_sqd_update_thread_idle(sqd);
         complete(&work->completion);
  }
  
@@ -8943,14 +9029,15 @@ static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
         struct task_struct *task;
  
         io_sq_thread_park(sqd);
-       list_del_init(&ctx->sqd_list);
-       io_sqd_update_thread_idle(sqd);
         task = sqd->thread;
         if (task) {
                 init_completion(&work.completion);
                 init_task_work(&work.task_work, io_sqpoll_cancel_cb);
                 io_task_work_add_head(&sqd->park_task_work, &work.task_work);
                 wake_up_process(task);
+       } else {
+               list_del_init(&ctx->sqd_list);
+               io_sqd_update_thread_idle(sqd);
         }
         io_sq_thread_unpark(sqd);
  
@@ -8976,14 +9063,14 @@ static void io_uring_try_cancel(struct files_struct *files)
  }
  
  /* should only be called by SQPOLL task */
-static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
+static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
  {
-       struct io_sq_data *sqd = ctx->sq_data;
         struct io_uring_task *tctx = current->io_uring;
+       struct io_ring_ctx *ctx;
         s64 inflight;
         DEFINE_WAIT(wait);
  
-       WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current);
+       WARN_ON_ONCE(!sqd || sqd->thread != current);
  
         atomic_inc(&tctx->in_idle);
         do {
@@ -8991,7 +9078,8 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
                 inflight = tctx_inflight(tctx, false);
                 if (!inflight)
                         break;
-               io_uring_try_cancel_requests(ctx, current, NULL);
+               list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+                       io_uring_try_cancel_requests(ctx, current, NULL);
  
                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
                 /*
@@ -9325,7 +9413,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
         }
         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
-               struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
+               struct io_mapped_ubuf *buf = ctx->user_bufs[i];
                 unsigned int len = buf->ubuf_end - buf->ubuf;
  
                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
@@ -9537,6 +9625,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
         ret = io_sq_offload_create(ctx, p);
         if (ret)
                 goto err;
+       /* always set a rsrc node */
+       io_rsrc_node_switch_start(ctx);
+       io_rsrc_node_switch(ctx, NULL);
  
         memset(&p->sq_off, 0, sizeof(p->sq_off));
         p->sq_off.head = offsetof(struct io_rings, sq.head);
@@ -9762,15 +9853,96 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
         return 0;
  }
  
+static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
+                                    struct io_uring_rsrc_update2 *up,
+                                    unsigned nr_args)
+{
+       __u32 tmp;
+       int err;
+
+       if (up->resv)
+               return -EINVAL;
+       if (check_add_overflow(up->offset, nr_args, &tmp))
+               return -EOVERFLOW;
+       err = io_rsrc_node_switch_start(ctx);
+       if (err)
+               return err;
+
+       switch (type) {
+       case IORING_RSRC_FILE:
+               return __io_sqe_files_update(ctx, up, nr_args);
+       case IORING_RSRC_BUFFER:
+               return __io_sqe_buffers_update(ctx, up, nr_args);
+       }
+       return -EINVAL;
+}
+
+static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
+                                   unsigned nr_args)
+{
+       struct io_uring_rsrc_update2 up;
+
+       if (!nr_args)
+               return -EINVAL;
+       memset(&up, 0, sizeof(up));
+       if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
+               return -EFAULT;
+       return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
+}
+
+static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
+                                  unsigned size)
+{
+       struct io_uring_rsrc_update2 up;
+
+       if (size != sizeof(up))
+               return -EINVAL;
+       if (copy_from_user(&up, arg, sizeof(up)))
+               return -EFAULT;
+       if (!up.nr)
+               return -EINVAL;
+       return __io_register_rsrc_update(ctx, up.type, &up, up.nr);
+}
+
+static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
+                           unsigned int size)
+{
+       struct io_uring_rsrc_register rr;
+
+       /* keep it extendible */
+       if (size != sizeof(rr))
+               return -EINVAL;
+
+       memset(&rr, 0, sizeof(rr));
+       if (copy_from_user(&rr, arg, size))
+               return -EFAULT;
+       if (!rr.nr)
+               return -EINVAL;
+
+       switch (rr.type) {
+       case IORING_RSRC_FILE:
+               return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
+                                            rr.nr, u64_to_user_ptr(rr.tags));
+       case IORING_RSRC_BUFFER:
+               return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
+                                              rr.nr, u64_to_user_ptr(rr.tags));
+       }
+       return -EINVAL;
+}
+
  static bool io_register_op_must_quiesce(int op)
  {
         switch (op) {
+       case IORING_REGISTER_BUFFERS:
+       case IORING_UNREGISTER_BUFFERS:
         case IORING_REGISTER_FILES:
         case IORING_UNREGISTER_FILES:
         case IORING_REGISTER_FILES_UPDATE:
         case IORING_REGISTER_PROBE:
         case IORING_REGISTER_PERSONALITY:
         case IORING_UNREGISTER_PERSONALITY:
+       case IORING_REGISTER_RSRC:
+       case IORING_REGISTER_RSRC_UPDATE:
                 return false;
         default:
                 return true;
@@ -9792,6 +9964,14 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
         if (percpu_ref_is_dying(&ctx->refs))
                 return -ENXIO;
  
+       if (ctx->restricted) {
+               if (opcode >= IORING_REGISTER_LAST)
+                       return -EINVAL;
+               opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
+               if (!test_bit(opcode, ctx->restrictions.register_op))
+                       return -EACCES;
+       }
+
         if (io_register_op_must_quiesce(opcode)) {
                 percpu_ref_kill(&ctx->refs);
  
@@ -9812,30 +9992,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                         if (ret < 0)
                                 break;
                 } while (1);
-
                 mutex_lock(&ctx->uring_lock);
  
                 if (ret) {
-                       percpu_ref_resurrect(&ctx->refs);
-                       goto out_quiesce;
-               }
-       }
-
-       if (ctx->restricted) {
-               if (opcode >= IORING_REGISTER_LAST) {
-                       ret = -EINVAL;
-                       goto out;
-               }
-
-               if (!test_bit(opcode, ctx->restrictions.register_op)) {
-                       ret = -EACCES;
-                       goto out;
+                       io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
+                       return ret;
                 }
         }
  
         switch (opcode) {
         case IORING_REGISTER_BUFFERS:
-               ret = io_sqe_buffers_register(ctx, arg, nr_args);
+               ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
                 break;
         case IORING_UNREGISTER_BUFFERS:
                 ret = -EINVAL;
@@ -9844,7 +10011,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                 ret = io_sqe_buffers_unregister(ctx);
                 break;
         case IORING_REGISTER_FILES:
-               ret = io_sqe_files_register(ctx, arg, nr_args);
+               ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
                 break;
         case IORING_UNREGISTER_FILES:
                 ret = -EINVAL;
@@ -9853,7 +10020,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                 ret = io_sqe_files_unregister(ctx);
                 break;
         case IORING_REGISTER_FILES_UPDATE:
-               ret = io_sqe_files_update(ctx, arg, nr_args);
+               ret = io_register_files_update(ctx, arg, nr_args);
                 break;
         case IORING_REGISTER_EVENTFD:
         case IORING_REGISTER_EVENTFD_ASYNC:
@@ -9901,16 +10068,20 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
         case IORING_REGISTER_RESTRICTIONS:
                 ret = io_register_restrictions(ctx, arg, nr_args);
                 break;
+       case IORING_REGISTER_RSRC:
+               ret = io_register_rsrc(ctx, arg, nr_args);
+               break;
+       case IORING_REGISTER_RSRC_UPDATE:
+               ret = io_register_rsrc_update(ctx, arg, nr_args);
+               break;
         default:
                 ret = -EINVAL;
                 break;
         }
  
-out:
         if (io_register_op_must_quiesce(opcode)) {
                 /* bring the ctx back to life */
                 percpu_ref_reinit(&ctx->refs);
-out_quiesce:
                 reinit_completion(&ctx->ref_comp);
         }
         return ret;