io_uring/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqe (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <net/compat.h>
  47 #include <linux/refcount.h>
  48 #include <linux/uio.h>
  49 #include <linux/bits.h>
  50
  51 #include <linux/sched/signal.h>
  52 #include <linux/fs.h>
  53 #include <linux/file.h>
  54 #include <linux/fdtable.h>
  55 #include <linux/mm.h>
  56 #include <linux/mman.h>
  57 #include <linux/percpu.h>
  58 #include <linux/slab.h>
  59 #include <linux/bvec.h>
  60 #include <linux/net.h>
  61 #include <net/sock.h>
  62 #include <net/af_unix.h>
  63 #include <net/scm.h>
  64 #include <linux/anon_inodes.h>
  65 #include <linux/sched/mm.h>
  66 #include <linux/uaccess.h>
  67 #include <linux/nospec.h>
  68 #include <linux/highmem.h>
  69 #include <linux/fsnotify.h>
  70 #include <linux/fadvise.h>
  71 #include <linux/task_work.h>
  72 #include <linux/io_uring.h>
  73 #include <linux/audit.h>
  74 #include <linux/security.h>
  75
  76 #define CREATE_TRACE_POINTS
  77 #include <trace/events/io_uring.h>
  78
  79 #include <uapi/linux/io_uring.h>
  80
  81 #include "io-wq.h"
  82
  83 #include "io_uring_types.h"
  84 #include "io_uring.h"
  85 #include "opdef.h"
  86 #include "refs.h"
  87 #include "tctx.h"
  88 #include "sqpoll.h"
  89 #include "fdinfo.h"
  90 #include "kbuf.h"
  91 #include "rsrc.h"
  92 #include "cancel.h"
  93
  94 #include "timeout.h"
  95 #include "poll.h"
  96
  97 #define IORING_MAX_ENTRIES      32768
  98 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  99
 100 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
 101                                  IORING_REGISTER_LAST + IORING_OP_LAST)
 102
 103 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
 104                           IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 105
 106 #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
 107                         IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
 108
 109 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 110                                 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
 111                                 REQ_F_ASYNC_DATA)
 112
 113 #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
 114                                  IO_REQ_CLEAN_FLAGS)
 115
 116 #define IO_TCTX_REFS_CACHE_NR   (1U << 10)
 117
 118 #define IO_COMPL_BATCH                  32
 119 #define IO_REQ_ALLOC_BATCH              8
 120
 121 enum {
 122         IO_CHECK_CQ_OVERFLOW_BIT,
 123         IO_CHECK_CQ_DROPPED_BIT,
 124 };
 125
 126 struct io_defer_entry {
 127         struct list_head        list;
 128         struct io_kiocb         *req;
 129         u32                     seq;
 130 };
 131
 132 /* requests with any of those set should undergo io_disarm_next() */
 133 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
 134 #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
 135
 136 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 137                                          struct task_struct *task,
 138                                          bool cancel_all);
 139
 140 static void io_dismantle_req(struct io_kiocb *req);
 141 static void io_clean_op(struct io_kiocb *req);
 142 static void io_queue_sqe(struct io_kiocb *req);
 143
 144 static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 145
 146 static void io_eventfd_signal(struct io_ring_ctx *ctx);
 147
 148 static struct kmem_cache *req_cachep;
 149
 150 struct sock *io_uring_get_socket(struct file *file)
 151 {
 152 #if defined(CONFIG_UNIX)
 153         if (io_is_uring_fops(file)) {
 154                 struct io_ring_ctx *ctx = file->private_data;
 155
 156                 return ctx->ring_sock->sk;
 157         }
 158 #endif
 159         return NULL;
 160 }
 161 EXPORT_SYMBOL(io_uring_get_socket);
 162
 163 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 164 {
 165         if (!wq_list_empty(&ctx->submit_state.compl_reqs))
 166                 __io_submit_flush_completions(ctx);
 167 }
 168
 169 static bool io_match_linked(struct io_kiocb *head)
 170 {
 171         struct io_kiocb *req;
 172
 173         io_for_each_link(req, head) {
 174                 if (req->flags & REQ_F_INFLIGHT)
 175                         return true;
 176         }
 177         return false;
 178 }
 179
 180 /*
 181  * As io_match_task() but protected against racing with linked timeouts.
 182  * User must not hold timeout_lock.
 183  */
 184 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 185                         bool cancel_all)
 186 {
 187         bool matched;
 188
 189         if (task && head->task != task)
 190                 return false;
 191         if (cancel_all)
 192                 return true;
 193
 194         if (head->flags & REQ_F_LINK_TIMEOUT) {
 195                 struct io_ring_ctx *ctx = head->ctx;
 196
 197                 /* protect against races with linked timeouts */
 198                 spin_lock_irq(&ctx->timeout_lock);
 199                 matched = io_match_linked(head);
 200                 spin_unlock_irq(&ctx->timeout_lock);
 201         } else {
 202                 matched = io_match_linked(head);
 203         }
 204         return matched;
 205 }
 206
 207 static inline void req_fail_link_node(struct io_kiocb *req, int res)
 208 {
 209         req_set_fail(req);
 210         io_req_set_res(req, res, 0);
 211 }
 212
 213 static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
 214 {
 215         wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
 216 }
 217
 218 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
 219 {
 220         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 221
 222         complete(&ctx->ref_comp);
 223 }
 224
 225 static __cold void io_fallback_req_func(struct work_struct *work)
 226 {
 227         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
 228                                                 fallback_work.work);
 229         struct llist_node *node = llist_del_all(&ctx->fallback_llist);
 230         struct io_kiocb *req, *tmp;
 231         bool locked = false;
 232
 233         percpu_ref_get(&ctx->refs);
 234         llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
 235                 req->io_task_work.func(req, &locked);
 236
 237         if (locked) {
 238                 io_submit_flush_completions(ctx);
 239                 mutex_unlock(&ctx->uring_lock);
 240         }
 241         percpu_ref_put(&ctx->refs);
 242 }
 243
 244 static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
 245 {
 246         unsigned hash_buckets = 1U << bits;
 247         size_t hash_size = hash_buckets * sizeof(table->hbs[0]);
 248
 249         table->hbs = kmalloc(hash_size, GFP_KERNEL);
 250         if (!table->hbs)
 251                 return -ENOMEM;
 252
 253         table->hash_bits = bits;
 254         init_hash_table(table, hash_buckets);
 255         return 0;
 256 }
 257
 258 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 259 {
 260         struct io_ring_ctx *ctx;
 261         int hash_bits;
 262
 263         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 264         if (!ctx)
 265                 return NULL;
 266
 267         xa_init(&ctx->io_bl_xa);
 268
 269         /*
 270          * Use 5 bits less than the max cq entries, that should give us around
 271          * 32 entries per hash list if totally full and uniformly spread, but
 272          * don't keep too many buckets to not overconsume memory.
 273          */
 274         hash_bits = ilog2(p->cq_entries) - 5;
 275         hash_bits = clamp(hash_bits, 1, 8);
 276         if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
 277                 goto err;
 278         if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
 279                 goto err;
 280
 281         ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
 282         if (!ctx->dummy_ubuf)
 283                 goto err;
 284         /* set invalid range, so io_import_fixed() fails meeting it */
 285         ctx->dummy_ubuf->ubuf = -1UL;
 286
 287         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 288                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 289                 goto err;
 290
 291         ctx->flags = p->flags;
 292         init_waitqueue_head(&ctx->sqo_sq_wait);
 293         INIT_LIST_HEAD(&ctx->sqd_list);
 294         INIT_LIST_HEAD(&ctx->cq_overflow_list);
 295         INIT_LIST_HEAD(&ctx->io_buffers_cache);
 296         INIT_LIST_HEAD(&ctx->apoll_cache);
 297         init_completion(&ctx->ref_comp);
 298         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 299         mutex_init(&ctx->uring_lock);
 300         init_waitqueue_head(&ctx->cq_wait);
 301         spin_lock_init(&ctx->completion_lock);
 302         spin_lock_init(&ctx->timeout_lock);
 303         INIT_WQ_LIST(&ctx->iopoll_list);
 304         INIT_LIST_HEAD(&ctx->io_buffers_pages);
 305         INIT_LIST_HEAD(&ctx->io_buffers_comp);
 306         INIT_LIST_HEAD(&ctx->defer_list);
 307         INIT_LIST_HEAD(&ctx->timeout_list);
 308         INIT_LIST_HEAD(&ctx->ltimeout_list);
 309         spin_lock_init(&ctx->rsrc_ref_lock);
 310         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 311         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 312         init_llist_head(&ctx->rsrc_put_llist);
 313         INIT_LIST_HEAD(&ctx->tctx_list);
 314         ctx->submit_state.free_list.next = NULL;
 315         INIT_WQ_LIST(&ctx->locked_free_list);
 316         INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
 317         INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 318         return ctx;
 319 err:
 320         kfree(ctx->dummy_ubuf);
 321         kfree(ctx->cancel_table.hbs);
 322         kfree(ctx->cancel_table_locked.hbs);
 323         kfree(ctx->io_bl);
 324         xa_destroy(&ctx->io_bl_xa);
 325         kfree(ctx);
 326         return NULL;
 327 }
 328
 329 static void io_account_cq_overflow(struct io_ring_ctx *ctx)
 330 {
 331         struct io_rings *r = ctx->rings;
 332
 333         WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
 334         ctx->cq_extra--;
 335 }
 336
 337 static bool req_need_defer(struct io_kiocb *req, u32 seq)
 338 {
 339         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
 340                 struct io_ring_ctx *ctx = req->ctx;
 341
 342                 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
 343         }
 344
 345         return false;
 346 }
 347
 348 static inline void io_req_track_inflight(struct io_kiocb *req)
 349 {
 350         if (!(req->flags & REQ_F_INFLIGHT)) {
 351                 req->flags |= REQ_F_INFLIGHT;
 352                 atomic_inc(&req->task->io_uring->inflight_tracked);
 353         }
 354 }
 355
 356 static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
 357 {
 358         if (WARN_ON_ONCE(!req->link))
 359                 return NULL;
 360
 361         req->flags &= ~REQ_F_ARM_LTIMEOUT;
 362         req->flags |= REQ_F_LINK_TIMEOUT;
 363
 364         /* linked timeouts should have two refs once prep'ed */
 365         io_req_set_refcount(req);
 366         __io_req_set_refcount(req->link, 2);
 367         return req->link;
 368 }
 369
 370 static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 371 {
 372         if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
 373                 return NULL;
 374         return __io_prep_linked_timeout(req);
 375 }
 376
 377 static noinline void __io_arm_ltimeout(struct io_kiocb *req)
 378 {
 379         io_queue_linked_timeout(__io_prep_linked_timeout(req));
 380 }
 381
 382 static inline void io_arm_ltimeout(struct io_kiocb *req)
 383 {
 384         if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
 385                 __io_arm_ltimeout(req);
 386 }
 387
 388 static void io_prep_async_work(struct io_kiocb *req)
 389 {
 390         const struct io_op_def *def = &io_op_defs[req->opcode];
 391         struct io_ring_ctx *ctx = req->ctx;
 392
 393         if (!(req->flags & REQ_F_CREDS)) {
 394                 req->flags |= REQ_F_CREDS;
 395                 req->creds = get_current_cred();
 396         }
 397
 398         req->work.list.next = NULL;
 399         req->work.flags = 0;
 400         req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
 401         if (req->flags & REQ_F_FORCE_ASYNC)
 402                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
 403
 404         if (req->flags & REQ_F_ISREG) {
 405                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
 406                         io_wq_hash_work(&req->work, file_inode(req->file));
 407         } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
 408                 if (def->unbound_nonreg_file)
 409                         req->work.flags |= IO_WQ_WORK_UNBOUND;
 410         }
 411 }
 412
 413 static void io_prep_async_link(struct io_kiocb *req)
 414 {
 415         struct io_kiocb *cur;
 416
 417         if (req->flags & REQ_F_LINK_TIMEOUT) {
 418                 struct io_ring_ctx *ctx = req->ctx;
 419
 420                 spin_lock_irq(&ctx->timeout_lock);
 421                 io_for_each_link(cur, req)
 422                         io_prep_async_work(cur);
 423                 spin_unlock_irq(&ctx->timeout_lock);
 424         } else {
 425                 io_for_each_link(cur, req)
 426                         io_prep_async_work(cur);
 427         }
 428 }
 429
 430 void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
 431 {
 432         struct io_kiocb *link = io_prep_linked_timeout(req);
 433         struct io_uring_task *tctx = req->task->io_uring;
 434
 435         BUG_ON(!tctx);
 436         BUG_ON(!tctx->io_wq);
 437
 438         /* init ->work of the whole link before punting */
 439         io_prep_async_link(req);
 440
 441         /*
 442          * Not expected to happen, but if we do have a bug where this _can_
 443          * happen, catch it here and ensure the request is marked as
 444          * canceled. That will make io-wq go through the usual work cancel
 445          * procedure rather than attempt to run this request (or create a new
 446          * worker for it).
 447          */
 448         if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
 449                 req->work.flags |= IO_WQ_WORK_CANCEL;
 450
 451         trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data,
 452                                         req->opcode, req->flags, &req->work,
 453                                         io_wq_is_hashed(&req->work));
 454         io_wq_enqueue(tctx->io_wq, &req->work);
 455         if (link)
 456                 io_queue_linked_timeout(link);
 457 }
 458
 459 static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 460 {
 461         while (!list_empty(&ctx->defer_list)) {
 462                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
 463                                                 struct io_defer_entry, list);
 464
 465                 if (req_need_defer(de->req, de->seq))
 466                         break;
 467                 list_del_init(&de->list);
 468                 io_req_task_queue(de->req);
 469                 kfree(de);
 470         }
 471 }
 472
 473 void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 474 {
 475         if (ctx->off_timeout_used || ctx->drain_active) {
 476                 spin_lock(&ctx->completion_lock);
 477                 if (ctx->off_timeout_used)
 478                         io_flush_timeouts(ctx);
 479                 if (ctx->drain_active)
 480                         io_queue_deferred(ctx);
 481                 io_commit_cqring(ctx);
 482                 spin_unlock(&ctx->completion_lock);
 483         }
 484         if (ctx->has_evfd)
 485                 io_eventfd_signal(ctx);
 486 }
 487
 488 static void io_eventfd_signal(struct io_ring_ctx *ctx)
 489 {
 490         struct io_ev_fd *ev_fd;
 491
 492         rcu_read_lock();
 493         /*
 494          * rcu_dereference ctx->io_ev_fd once and use it for both for checking
 495          * and eventfd_signal
 496          */
 497         ev_fd = rcu_dereference(ctx->io_ev_fd);
 498
 499         /*
 500          * Check again if ev_fd exists incase an io_eventfd_unregister call
 501          * completed between the NULL check of ctx->io_ev_fd at the start of
 502          * the function and rcu_read_lock.
 503          */
 504         if (unlikely(!ev_fd))
 505                 goto out;
 506         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 507                 goto out;
 508
 509         if (!ev_fd->eventfd_async || io_wq_current_is_worker())
 510                 eventfd_signal(ev_fd->cq_ev_fd, 1);
 511 out:
 512         rcu_read_unlock();
 513 }
 514
 515 /*
 516  * This should only get called when at least one event has been posted.
 517  * Some applications rely on the eventfd notification count only changing
 518  * IFF a new CQE has been added to the CQ ring. There's no depedency on
 519  * 1:1 relationship between how many times this function is called (and
 520  * hence the eventfd count) and number of CQEs posted to the CQ ring.
 521  */
 522 void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 523 {
 524         if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
 525                      ctx->has_evfd))
 526                 __io_commit_cqring_flush(ctx);
 527
 528         io_cqring_wake(ctx);
 529 }
 530
 531 /* Returns true if there are no backlogged entries after the flush */
 532 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 533 {
 534         bool all_flushed, posted;
 535         size_t cqe_size = sizeof(struct io_uring_cqe);
 536
 537         if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
 538                 return false;
 539
 540         if (ctx->flags & IORING_SETUP_CQE32)
 541                 cqe_size <<= 1;
 542
 543         posted = false;
 544         spin_lock(&ctx->completion_lock);
 545         while (!list_empty(&ctx->cq_overflow_list)) {
 546                 struct io_uring_cqe *cqe = io_get_cqe(ctx);
 547                 struct io_overflow_cqe *ocqe;
 548
 549                 if (!cqe && !force)
 550                         break;
 551                 ocqe = list_first_entry(&ctx->cq_overflow_list,
 552                                         struct io_overflow_cqe, list);
 553                 if (cqe)
 554                         memcpy(cqe, &ocqe->cqe, cqe_size);
 555                 else
 556                         io_account_cq_overflow(ctx);
 557
 558                 posted = true;
 559                 list_del(&ocqe->list);
 560                 kfree(ocqe);
 561         }
 562
 563         all_flushed = list_empty(&ctx->cq_overflow_list);
 564         if (all_flushed) {
 565                 clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
 566                 atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
 567         }
 568
 569         io_commit_cqring(ctx);
 570         spin_unlock(&ctx->completion_lock);
 571         if (posted)
 572                 io_cqring_ev_posted(ctx);
 573         return all_flushed;
 574 }
 575
 576 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 577 {
 578         bool ret = true;
 579
 580         if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
 581                 /* iopoll syncs against uring_lock, not completion_lock */
 582                 if (ctx->flags & IORING_SETUP_IOPOLL)
 583                         mutex_lock(&ctx->uring_lock);
 584                 ret = __io_cqring_overflow_flush(ctx, false);
 585                 if (ctx->flags & IORING_SETUP_IOPOLL)
 586                         mutex_unlock(&ctx->uring_lock);
 587         }
 588
 589         return ret;
 590 }
 591
 592 static void __io_put_task(struct task_struct *task, int nr)
 593 {
 594         struct io_uring_task *tctx = task->io_uring;
 595
 596         percpu_counter_sub(&tctx->inflight, nr);
 597         if (unlikely(atomic_read(&tctx->in_idle)))
 598                 wake_up(&tctx->wait);
 599         put_task_struct_many(task, nr);
 600 }
 601
 602 /* must to be called somewhat shortly after putting a request */
 603 static inline void io_put_task(struct task_struct *task, int nr)
 604 {
 605         if (likely(task == current))
 606                 task->io_uring->cached_refs += nr;
 607         else
 608                 __io_put_task(task, nr);
 609 }
 610
 611 static void io_task_refs_refill(struct io_uring_task *tctx)
 612 {
 613         unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
 614
 615         percpu_counter_add(&tctx->inflight, refill);
 616         refcount_add(refill, &current->usage);
 617         tctx->cached_refs += refill;
 618 }
 619
 620 static inline void io_get_task_refs(int nr)
 621 {
 622         struct io_uring_task *tctx = current->io_uring;
 623
 624         tctx->cached_refs -= nr;
 625         if (unlikely(tctx->cached_refs < 0))
 626                 io_task_refs_refill(tctx);
 627 }
 628
 629 static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 630 {
 631         struct io_uring_task *tctx = task->io_uring;
 632         unsigned int refs = tctx->cached_refs;
 633
 634         if (refs) {
 635                 tctx->cached_refs = 0;
 636                 percpu_counter_sub(&tctx->inflight, refs);
 637                 put_task_struct_many(task, refs);
 638         }
 639 }
 640
 641 bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 642                               u32 cflags, u64 extra1, u64 extra2)
 643 {
 644         struct io_overflow_cqe *ocqe;
 645         size_t ocq_size = sizeof(struct io_overflow_cqe);
 646         bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
 647
 648         if (is_cqe32)
 649                 ocq_size += sizeof(struct io_uring_cqe);
 650
 651         ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
 652         trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
 653         if (!ocqe) {
 654                 /*
 655                  * If we're in ring overflow flush mode, or in task cancel mode,
 656                  * or cannot allocate an overflow entry, then we need to drop it
 657                  * on the floor.
 658                  */
 659                 io_account_cq_overflow(ctx);
 660                 set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
 661                 return false;
 662         }
 663         if (list_empty(&ctx->cq_overflow_list)) {
 664                 set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
 665                 atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
 666
 667         }
 668         ocqe->cqe.user_data = user_data;
 669         ocqe->cqe.res = res;
 670         ocqe->cqe.flags = cflags;
 671         if (is_cqe32) {
 672                 ocqe->cqe.big_cqe[0] = extra1;
 673                 ocqe->cqe.big_cqe[1] = extra2;
 674         }
 675         list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
 676         return true;
 677 }
 678
 679 bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 680                      u32 cflags)
 681 {
 682         struct io_uring_cqe *cqe;
 683
 684         ctx->cq_extra++;
 685         trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
 686
 687         /*
 688          * If we can't get a cq entry, userspace overflowed the
 689          * submission (by quite a lot). Increment the overflow count in
 690          * the ring.
 691          */
 692         cqe = io_get_cqe(ctx);
 693         if (likely(cqe)) {
 694                 WRITE_ONCE(cqe->user_data, user_data);
 695                 WRITE_ONCE(cqe->res, res);
 696                 WRITE_ONCE(cqe->flags, cflags);
 697
 698                 if (ctx->flags & IORING_SETUP_CQE32) {
 699                         WRITE_ONCE(cqe->big_cqe[0], 0);
 700                         WRITE_ONCE(cqe->big_cqe[1], 0);
 701                 }
 702                 return true;
 703         }
 704         return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
 705 }
 706
 707 static void __io_req_complete_put(struct io_kiocb *req)
 708 {
 709         /*
 710          * If we're the last reference to this request, add to our locked
 711          * free_list cache.
 712          */
 713         if (req_ref_put_and_test(req)) {
 714                 struct io_ring_ctx *ctx = req->ctx;
 715
 716                 if (req->flags & IO_REQ_LINK_FLAGS) {
 717                         if (req->flags & IO_DISARM_MASK)
 718                                 io_disarm_next(req);
 719                         if (req->link) {
 720                                 io_req_task_queue(req->link);
 721                                 req->link = NULL;
 722                         }
 723                 }
 724                 io_req_put_rsrc(req);
 725                 /*
 726                  * Selected buffer deallocation in io_clean_op() assumes that
 727                  * we don't hold ->completion_lock. Clean them here to avoid
 728                  * deadlocks.
 729                  */
 730                 io_put_kbuf_comp(req);
 731                 io_dismantle_req(req);
 732                 io_put_task(req->task, 1);
 733                 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
 734                 ctx->locked_free_nr++;
 735         }
 736 }
 737
 738 void __io_req_complete_post(struct io_kiocb *req)
 739 {
 740         if (!(req->flags & REQ_F_CQE_SKIP))
 741                 __io_fill_cqe_req(req->ctx, req);
 742         __io_req_complete_put(req);
 743 }
 744
 745 void io_req_complete_post(struct io_kiocb *req)
 746 {
 747         struct io_ring_ctx *ctx = req->ctx;
 748
 749         spin_lock(&ctx->completion_lock);
 750         __io_req_complete_post(req);
 751         io_commit_cqring(ctx);
 752         spin_unlock(&ctx->completion_lock);
 753         io_cqring_ev_posted(ctx);
 754 }
 755
 756 inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags)
 757 {
 758         io_req_complete_post(req);
 759 }
 760
 761 void io_req_complete_failed(struct io_kiocb *req, s32 res)
 762 {
 763         req_set_fail(req);
 764         io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
 765         io_req_complete_post(req);
 766 }
 767
 768 /*
 769  * Don't initialise the fields below on every allocation, but do that in
 770  * advance and keep them valid across allocations.
 771  */
 772 static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 773 {
 774         req->ctx = ctx;
 775         req->link = NULL;
 776         req->async_data = NULL;
 777         /* not necessary, but safer to zero */
 778         req->cqe.res = 0;
 779 }
 780
 781 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
 782                                         struct io_submit_state *state)
 783 {
 784         spin_lock(&ctx->completion_lock);
 785         wq_list_splice(&ctx->locked_free_list, &state->free_list);
 786         ctx->locked_free_nr = 0;
 787         spin_unlock(&ctx->completion_lock);
 788 }
 789
 790 static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
 791 {
 792         return !ctx->submit_state.free_list.next;
 793 }
 794
 795 /*
 796  * A request might get retired back into the request caches even before opcode
 797  * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 798  * Because of that, io_alloc_req() should be called only under ->uring_lock
 799  * and with extra caution to not get a request that is still worked on.
 800  */
 801 static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 802         __must_hold(&ctx->uring_lock)
 803 {
 804         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 805         void *reqs[IO_REQ_ALLOC_BATCH];
 806         int ret, i;
 807
 808         /*
 809          * If we have more than a batch's worth of requests in our IRQ side
 810          * locked cache, grab the lock and move them over to our submission
 811          * side cache.
 812          */
 813         if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
 814                 io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
 815                 if (!io_req_cache_empty(ctx))
 816                         return true;
 817         }
 818
 819         ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
 820
 821         /*
 822          * Bulk alloc is all-or-nothing. If we fail to get a batch,
 823          * retry single alloc to be on the safe side.
 824          */
 825         if (unlikely(ret <= 0)) {
 826                 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 827                 if (!reqs[0])
 828                         return false;
 829                 ret = 1;
 830         }
 831
 832         percpu_ref_get_many(&ctx->refs, ret);
 833         for (i = 0; i < ret; i++) {
 834                 struct io_kiocb *req = reqs[i];
 835
 836                 io_preinit_req(req, ctx);
 837                 io_req_add_to_cache(req, ctx);
 838         }
 839         return true;
 840 }
 841
 842 static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
 843 {
 844         if (unlikely(io_req_cache_empty(ctx)))
 845                 return __io_alloc_req_refill(ctx);
 846         return true;
 847 }
 848
     /*
      * Take one request off ctx->submit_state.free_list and return it.
      * NOTE(review): nothing here checks for an empty list; callers are
      * presumably expected to have run io_alloc_req_refill() first -- confirm
      * against the call sites.
      */
 849 static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 850 {
 851         struct io_wq_work_node *node;
 852
 853         node = wq_stack_extract(&ctx->submit_state.free_list);
             /* cached requests are linked through io_kiocb::comp_list */
 854         return container_of(node, struct io_kiocb, comp_list);
 855 }
 856
     /*
      * Release what a request holds before it is recycled: run io_clean_op()
      * if any IO_REQ_CLEAN_FLAGS bit is set, then drop the file reference
      * unless the request uses a fixed file.
      */
 857 static inline void io_dismantle_req(struct io_kiocb *req)
 858 {
             /* snapshot: io_clean_op() below modifies req->flags */
 859         unsigned int flags = req->flags;
 860
 861         if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
 862                 io_clean_op(req);
 863         if (!(flags & REQ_F_FIXED_FILE))
 864                 io_put_file(req->file);
 865 }
 866
     /*
      * Tear down a request (rsrc node, per-op state, file, one task
      * reference) and park it on ctx->locked_free_list for later reuse.
      * The list and its counter are protected by ->completion_lock;
      * __io_alloc_req_refill() moves the parked requests back to the
      * submission side cache.
      */
 867 __cold void io_free_req(struct io_kiocb *req)
 868 {
 869         struct io_ring_ctx *ctx = req->ctx;
 870
 871         io_req_put_rsrc(req);
 872         io_dismantle_req(req);
 873         io_put_task(req->task, 1);
 874
 875         spin_lock(&ctx->completion_lock);
 876         wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
 877         ctx->locked_free_nr++;
 878         spin_unlock(&ctx->completion_lock);
 879 }
 880
     /*
      * Slow path of io_req_find_next(): call io_disarm_next() and commit the
      * CQ ring under ->completion_lock. If io_disarm_next() reports that
      * something was posted, signal that once the lock has been dropped.
      */
 881 static void __io_req_find_next_prep(struct io_kiocb *req)
 882 {
 883         struct io_ring_ctx *ctx = req->ctx;
 884         bool posted;
 885
 886         spin_lock(&ctx->completion_lock);
 887         posted = io_disarm_next(req);
 888         io_commit_cqring(ctx);
 889         spin_unlock(&ctx->completion_lock);
             /* notify outside of the spinlock */
 890         if (posted)
 891                 io_cqring_ev_posted(ctx);
 892 }
 893
     /*
      * Detach and return the request linked after @req, or NULL if there is
      * none. req->link is cleared, so the caller takes over the returned
      * request.
      */
 894 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 895 {
 896         struct io_kiocb *nxt;
 897
 898         /*
 899          * If LINK is set, we have dependent requests in this chain. If we
 900          * didn't fail this request, queue the first one up, moving any other
 901          * dependencies to the next request. In case of failure, fail the rest
 902          * of the chain.
 903          */
             /* any IO_DISARM_MASK bit set: take the locked slow path first */
 904         if (unlikely(req->flags & IO_DISARM_MASK))
 905                 __io_req_find_next_prep(req);
 906         nxt = req->link;
 907         req->link = NULL;
 908         return nxt;
 909 }
 910
     /*
      * Finish task_work processing for @ctx: clear the TASKRUN hint in the
      * SQ flags (when that feature is enabled), flush batched completions
      * and release ->uring_lock if *locked, then drop the ctx reference the
      * tw list handlers took. A NULL @ctx is a no-op; the handlers start out
      * with no ctx selected.
      */
 911 static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 912 {
 913         if (!ctx)
 914                 return;
             /* undo the atomic_or() done in __io_req_task_work_add() */
 915         if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 916                 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
 917         if (*locked) {
 918                 io_submit_flush_completions(ctx);
 919                 mutex_unlock(&ctx->uring_lock);
 920                 *locked = false;
 921         }
 922         percpu_ref_put(&ctx->refs);
 923 }
 924
     /*
      * Commit the CQ ring, release ->completion_lock (which the caller must
      * hold) and then signal that events were posted.
      */
 925 static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
 926 {
 927         io_commit_cqring(ctx);
 928         spin_unlock(&ctx->completion_lock);
 929         io_cqring_ev_posted(ctx);
 930 }
 931
     /*
      * Run a batch of task_work nodes; tctx_task_work() passes the priority
      * list here. *ctx / *uring_locked carry the currently selected ring and
      * whether its ->uring_lock is held, across calls. While the mutex is
      * held each request's callback is invoked; otherwise the request is
      * posted directly under ->completion_lock. @node must be non-NULL (the
      * loop dereferences it before testing).
      */
 932 static void handle_prev_tw_list(struct io_wq_work_node *node,
 933                                 struct io_ring_ctx **ctx, bool *uring_locked)
 934 {
             /* continuing with a ctx from a previous batch, without the mutex */
 935         if (*ctx && !*uring_locked)
 936                 spin_lock(&(*ctx)->completion_lock);
 937
 938         do {
                     /* fetch the successor before the request is processed */
 939                 struct io_wq_work_node *next = node->next;
 940                 struct io_kiocb *req = container_of(node, struct io_kiocb,
 941                                                     io_task_work.node);
 942
 943                 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
 944
 945                 if (req->ctx != *ctx) {
                             /* switching rings: release the old one first */
 946                         if (unlikely(!*uring_locked && *ctx))
 947                                 ctx_commit_and_unlock(*ctx);
 948
 949                         ctx_flush_and_put(*ctx, uring_locked);
 950                         *ctx = req->ctx;
 951                         /* if not contended, grab and improve batching */
 952                         *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
 953                         percpu_ref_get(&(*ctx)->refs);
 954                         if (unlikely(!*uring_locked))
 955                                 spin_lock(&(*ctx)->completion_lock);
 956                 }
 957                 if (likely(*uring_locked)) {
 958                         req->io_task_work.func(req, uring_locked);
 959                 } else {
                             /* no mutex: complete right here under completion_lock */
 960                         req->cqe.flags = io_put_kbuf_comp(req);
 961                         __io_req_complete_post(req);
 962                 }
 963                 node = next;
 964         } while (node);
 965
             /* drop the completion_lock taken for the last ctx */
 966         if (unlikely(!*uring_locked))
 967                 ctx_commit_and_unlock(*ctx);
 968 }
 969
     /*
      * Run a batch of task_work nodes by invoking each request's callback.
      * Whenever the ring changes, the previous one is flushed and released
      * via ctx_flush_and_put() and ->uring_lock of the new one is trylocked;
      * *locked tells the callback whether that succeeded. @node must be
      * non-NULL (the loop dereferences it before testing).
      */
 970 static void handle_tw_list(struct io_wq_work_node *node,
 971                            struct io_ring_ctx **ctx, bool *locked)
 972 {
 973         do {
                     /* fetch the successor before the callback runs */
 974                 struct io_wq_work_node *next = node->next;
 975                 struct io_kiocb *req = container_of(node, struct io_kiocb,
 976                                                     io_task_work.node);
 977
 978                 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
 979
 980                 if (req->ctx != *ctx) {
 981                         ctx_flush_and_put(*ctx, locked);
 982                         *ctx = req->ctx;
 983                         /* if not contended, grab and improve batching */
 984                         *locked = mutex_trylock(&(*ctx)->uring_lock);
                             /* reference is dropped again in ctx_flush_and_put() */
 985                         percpu_ref_get(&(*ctx)->refs);
 986                 }
 987                 req->io_task_work.func(req, locked);
 988                 node = next;
 989         } while (node);
 990 }
 991
     /*
      * task_work callback: drain the per-task io_uring work lists.
      * @cb is the ->task_work member embedded in struct io_uring_task. Both
      * lists are repeatedly grabbed under ->task_lock and processed until a
      * pass finds them empty, at which point ->task_running is cleared.
      */
 992 void tctx_task_work(struct callback_head *cb)
 993 {
 994         bool uring_locked = false;
 995         struct io_ring_ctx *ctx = NULL;
 996         struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 997                                                   task_work);
 998
 999         while (1) {
1000                 struct io_wq_work_node *node1, *node2;
1001
                     /* steal both lists in one go and leave them empty */
1002                 spin_lock_irq(&tctx->task_lock);
1003                 node1 = tctx->prio_task_list.first;
1004                 node2 = tctx->task_list.first;
1005                 INIT_WQ_LIST(&tctx->task_list);
1006                 INIT_WQ_LIST(&tctx->prio_task_list);
                     /*
                      * Cleared under the same lock __io_req_task_work_add()
                      * uses to test and set it.
                      */
1007                 if (!node2 && !node1)
1008                         tctx->task_running = false;
1009                 spin_unlock_irq(&tctx->task_lock);
1010                 if (!node2 && !node1)
1011                         break;
1012
                     /* priority list goes first */
1013                 if (node1)
1014                         handle_prev_tw_list(node1, &ctx, &uring_locked);
1015                 if (node2)
1016                         handle_tw_list(node2, &ctx, &uring_locked);
1017                 cond_resched();
1018
                     /*
                      * Unlocked peek: if nothing new has been queued and we
                      * hold ->uring_lock, flush batched completions now.
                      */
1019                 if (data_race(!tctx->task_list.first) &&
1020                     data_race(!tctx->prio_task_list.first) && uring_locked)
1021                         io_submit_flush_completions(ctx);
1022         }
1023
             /* release whatever ctx the handlers left selected (may be NULL) */
1024         ctx_flush_and_put(ctx, &uring_locked);
1025
1026         /* relaxed read is enough as only the task itself sets ->in_idle */
1027         if (unlikely(atomic_read(&tctx->in_idle)))
1028                 io_uring_drop_tctx_refs(current);
1029 }
1030
     /*
      * Append @req to @list (one of @tctx's task-work lists) and make sure
      * tctx_task_work() is going to run. If task_work_add() fails, every
      * request queued on either list is moved to its ctx's fallback llist
      * and the fallback work item is scheduled instead.
      */
1031 static void __io_req_task_work_add(struct io_kiocb *req,
1032                                    struct io_uring_task *tctx,
1033                                    struct io_wq_work_list *list)
1034 {
1035         struct io_ring_ctx *ctx = req->ctx;
1036         struct io_wq_work_node *node;
1037         unsigned long flags;
1038         bool running;
1039
1040         spin_lock_irqsave(&tctx->task_lock, flags);
1041         wq_list_add_tail(&req->io_task_work.node, list);
             /* test-and-set under the lock: only one caller arms the task_work */
1042         running = tctx->task_running;
1043         if (!running)
1044                 tctx->task_running = true;
1045         spin_unlock_irqrestore(&tctx->task_lock, flags);
1046
1047         /* task_work already pending, we're done */
1048         if (running)
1049                 return;
1050
             /* cleared again in ctx_flush_and_put() */
1051         if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
1052                 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
1053
1054         if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
1055                 return;
1056
             /* task_work_add() failed: take back everything that is queued */
1057         spin_lock_irqsave(&tctx->task_lock, flags);
1058         tctx->task_running = false;
1059         node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
1060         spin_unlock_irqrestore(&tctx->task_lock, flags);
1061
1062         while (node) {
1063                 req = container_of(node, struct io_kiocb, io_task_work.node);
                     /* advance before the node is handed to another list */
1064                 node = node->next;
                     /* llist_add() is true when the list was empty: kick the worker */
1065                 if (llist_add(&req->io_task_work.fallback_node,
1066                               &req->ctx->fallback_llist))
1067                         schedule_delayed_work(&req->ctx->fallback_work, 1);
1068         }
1069 }
1070
     /*
      * Queue @req on the regular task-work list of the task that owns it.
      * req->io_task_work.func is what the list handlers will call, so it
      * has to be set before this.
      */
1071 void io_req_task_work_add(struct io_kiocb *req)
1072 {
1073         struct io_uring_task *tctx = req->task->io_uring;
1074
1075         __io_req_task_work_add(req, tctx, &tctx->task_list);
1076 }
1077
     /*
      * Like io_req_task_work_add(), but rings set up with IORING_SETUP_SQPOLL
      * get the request on the priority list, which tctx_task_work() handles
      * before the regular one.
      */
1078 void io_req_task_prio_work_add(struct io_kiocb *req)
1079 {
1080         struct io_uring_task *tctx = req->task->io_uring;
1081
1082         if (req->ctx->flags & IORING_SETUP_SQPOLL)
1083                 __io_req_task_work_add(req, tctx, &tctx->prio_task_list);
1084         else
1085                 __io_req_task_work_add(req, tctx, &tctx->task_list);
1086 }
1087
     /*
      * Task-work callback installed by io_req_tw_post_queue(): post the
      * request's completion. @locked is not used.
      */
1088 static void io_req_tw_post(struct io_kiocb *req, bool *locked)
1089 {
1090         io_req_complete_post(req);
1091 }
1092
     /*
      * Record @res / @cflags as the request's result and defer posting of
      * the completion to task work.
      */
1093 void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
1094 {
1095         io_req_set_res(req, res, cflags);
1096         req->io_task_work.func = io_req_tw_post;
1097         io_req_task_work_add(req);
1098 }
1099
     /*
      * Task-work callback installed by io_req_task_queue_fail(): fail the
      * request with the result stored in req->cqe.res.
      */
1100 static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
1101 {
1102         /* not needed for normal modes, but SQPOLL depends on it */
1103         io_tw_lock(req->ctx, locked);
1104         io_req_complete_failed(req, req->cqe.res);
1105 }
1106
     /*
      * Task-work callback installed by io_req_task_queue(): queue the
      * request for submission, or fail it with -EFAULT if the owning task
      * is exiting.
      */
1107 void io_req_task_submit(struct io_kiocb *req, bool *locked)
1108 {
1109         io_tw_lock(req->ctx, locked);
1110         /* req->task == current here, checking PF_EXITING is safe */
1111         if (likely(!(req->task->flags & PF_EXITING)))
1112                 io_queue_sqe(req);
1113         else
1114                 io_req_complete_failed(req, -EFAULT);
1115 }
1116
     /*
      * Store @ret as the request's result and fail the request from task
      * work, see io_req_task_cancel().
      */
1117 void io_req_task_queue_fail(struct io_kiocb *req, int ret)
1118 {
1119         io_req_set_res(req, ret, 0);
1120         req->io_task_work.func = io_req_task_cancel;
1121         io_req_task_work_add(req);
1122 }
1123
     /* Submit @req from task work, see io_req_task_submit(). */
1124 void io_req_task_queue(struct io_kiocb *req)
1125 {
1126         req->io_task_work.func = io_req_task_submit;
1127         io_req_task_work_add(req);
1128 }
1129
     /*
      * Detach the request linked after @req, if any, and queue it for
      * submission from task work.
      */
1130 void io_queue_next(struct io_kiocb *req)
1131 {
1132         struct io_kiocb *nxt = io_req_find_next(req);
1133
1134         if (nxt)
1135                 io_req_task_queue(nxt);
1136 }
1137
     /*
      * Release a list of completed requests, linked through ->comp_list and
      * starting at @node, back into the request cache. Task references are
      * accumulated and dropped once per run of requests sharing the same
      * task. @node must be non-NULL (the loop dereferences it before
      * testing).
      */
1138 void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
1139         __must_hold(&ctx->uring_lock)
1140 {
1141         struct task_struct *task = NULL;
1142         int task_refs = 0;
1143
1144         do {
1145                 struct io_kiocb *req = container_of(node, struct io_kiocb,
1146                                                     comp_list);
1147
1148                 if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
1149                         if (req->flags & REQ_F_REFCOUNT) {
                                     /*
                                      * Pick up the successor first: "continue"
                                      * goes straight to the loop condition.
                                      */
1150                                 node = req->comp_list.next;
                                     /* not the last reference: leave the request alone */
1151                                 if (!req_ref_put_and_test(req))
1152                                         continue;
1153                         }
1154                         if ((req->flags & REQ_F_POLLED) && req->apoll) {
1155                                 struct async_poll *apoll = req->apoll;
1156
1157                                 if (apoll->double_poll)
1158                                         kfree(apoll->double_poll);
                                     /* keep the apoll entry on ctx->apoll_cache */
1159                                 list_add(&apoll->poll.wait.entry,
1160                                                 &ctx->apoll_cache);
                                     /* so that io_clean_op() does not kfree() it */
1161                                 req->flags &= ~REQ_F_POLLED;
1162                         }
1163                         if (req->flags & IO_REQ_LINK_FLAGS)
1164                                 io_queue_next(req);
1165                         if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
1166                                 io_clean_op(req);
1167                 }
1168                 if (!(req->flags & REQ_F_FIXED_FILE))
1169                         io_put_file(req->file);
1170
1171                 io_req_put_rsrc_locked(req, ctx);
1172
                     /* task changed: flush the references counted so far */
1173                 if (req->task != task) {
1174                         if (task)
1175                                 io_put_task(task, task_refs);
1176                         task = req->task;
1177                         task_refs = 0;
1178                 }
1179                 task_refs++;
                     /* read the successor before the request goes back to the cache */
1180                 node = req->comp_list.next;
1181                 io_req_add_to_cache(req, ctx);
1182         } while (node);
1183
             /* references of the final run */
1184         if (task)
1185                 io_put_task(task, task_refs);
1186 }
1187
     /*
      * Flush the batched completions in ctx->submit_state.compl_reqs: if
      * ->flush_cqes is set, fill a CQE for every request not marked
      * REQ_F_CQE_SKIP and commit the ring, then release all requests and
      * reset the list.
      * NOTE(review): io_free_batch_list() dereferences its node
      * unconditionally, so compl_reqs is presumably never empty here --
      * confirm that callers check.
      */
1188 static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
1189         __must_hold(&ctx->uring_lock)
1190 {
1191         struct io_wq_work_node *node, *prev;
1192         struct io_submit_state *state = &ctx->submit_state;
1193
1194         if (state->flush_cqes) {
                     /* all CQEs are filled under one completion_lock section */
1195                 spin_lock(&ctx->completion_lock);
1196                 wq_list_for_each(node, prev, &state->compl_reqs) {
1197                         struct io_kiocb *req = container_of(node, struct io_kiocb,
1198                                                     comp_list);
1199
1200                         if (!(req->flags & REQ_F_CQE_SKIP))
1201                                 __io_fill_cqe_req(ctx, req);
1202                 }
1203
1204                 io_commit_cqring(ctx);
1205                 spin_unlock(&ctx->completion_lock);
1206                 io_cqring_ev_posted(ctx);
1207                 state->flush_cqes = false;
1208         }
1209
1210         io_free_batch_list(ctx, state->compl_reqs.first);
1211         INIT_WQ_LIST(&state->compl_reqs);
1212 }
1213
1214 /*
1215  * Drop reference to request, return next in chain (if there is one) if this
1216  * was the last reference to this request.
      *
      * On the final put the request itself is freed with io_free_req(). NULL
      * is returned when references remain or nothing is linked.
1217  */
1218 static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
1219 {
1220         struct io_kiocb *nxt = NULL;
1221
1222         if (req_ref_put_and_test(req)) {
                     /* detach the link before the request is freed */
1223                 if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
1224                         nxt = io_req_find_next(req);
1225                 io_free_req(req);
1226         }
1227         return nxt;
1228 }
1229
     /*
      * Return __io_cqring_events(ctx), issuing a read barrier first.
      */
1230 static unsigned io_cqring_events(struct io_ring_ctx *ctx)
1231 {
1232         /* See comment at the top of this file */
1233         smp_rmb();
1234         return __io_cqring_events(ctx);
1235 }
1236
1237 /*
1238  * We can't just wait for polled events to come to us, we have to actively
1239  * find and complete them.
      *
      * No-op unless the ring was set up with IORING_SETUP_IOPOLL. Takes
      * ->uring_lock itself and polls until the iopoll list is empty or a
      * poll pass returns 0.
1240  */
1241 static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
1242 {
1243         if (!(ctx->flags & IORING_SETUP_IOPOLL))
1244                 return;
1245
1246         mutex_lock(&ctx->uring_lock);
1247         while (!wq_list_empty(&ctx->iopoll_list)) {
1248                 /* let it sleep and repeat later if can't complete a request */
1249                 if (io_do_iopoll(ctx, true) == 0)
1250                         break;
1251                 /*
1252                  * Ensure we allow local-to-the-cpu processing to take place,
1253                  * in this case we need to ensure that we reap all events.
1254                  * Also let task_work, etc. make progress by releasing the mutex.
1255                  */
1256                 if (need_resched()) {
1257                         mutex_unlock(&ctx->uring_lock);
1258                         cond_resched();
1259                         mutex_lock(&ctx->uring_lock);
1260                 }
1261         }
1262         mutex_unlock(&ctx->uring_lock);
1263 }
1264
     /*
      * Poll for completions until at least @min events were reaped, a
      * reschedule is due, or polling fails. Returns 0, a negative error from
      * io_do_iopoll(), or -EBADR if a dropped CQE has not been reported yet.
      * The loop unlocks and re-takes ->uring_lock, so the caller must hold
      * it on entry.
      */
1265 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
1266 {
1267         unsigned int nr_events = 0;
1268         int ret = 0;
1269         unsigned long check_cq;
1270
1271         check_cq = READ_ONCE(ctx->check_cq);
1272         if (unlikely(check_cq)) {
1273                 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
1274                         __io_cqring_overflow_flush(ctx, false);
1275                 /*
1276                  * Similarly do not spin if we have not informed the user of any
1277                  * dropped CQE.
1278                  */
1279                 if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
1280                         return -EBADR;
1281         }
1282         /*
1283          * Don't enter poll loop if we already have events pending.
1284          * If we do, we can potentially be spinning for commands that
1285          * already triggered a CQE (eg in error).
1286          */
1287         if (io_cqring_events(ctx))
1288                 return 0;
1289
1290         do {
1291                 /*
1292                  * If a submit got punted to a workqueue, we can have the
1293                  * application entering polling for a command before it gets
1294                  * issued. That app will hold the uring_lock for the duration
1295                  * of the poll right here, so we need to take a breather every
1296                  * now and then to ensure that the issue has a chance to add
1297                  * the poll to the issued list. Otherwise we can spin here
1298                  * forever, while the workqueue is stuck trying to acquire the
1299                  * very same mutex.
1300                  */
1301                 if (wq_list_empty(&ctx->iopoll_list)) {
                             /* remember the CQ tail to detect completions below */
1302                         u32 tail = ctx->cached_cq_tail;
1303
1304                         mutex_unlock(&ctx->uring_lock);
1305                         io_run_task_work();
1306                         mutex_lock(&ctx->uring_lock);
1307
1308                         /* some requests don't go through iopoll_list */
1309                         if (tail != ctx->cached_cq_tail ||
1310                             wq_list_empty(&ctx->iopoll_list))
1311                                 break;
1312                 }
                     /*
                      * The second argument is true only when no minimum was
                      * requested. NOTE(review): its meaning is defined by
                      * io_do_iopoll(), which is not visible here.
                      */
1313                 ret = io_do_iopoll(ctx, !min);
1314                 if (ret < 0)
1315                         break;
                     /* a non-negative return is the number of events reaped */
1316                 nr_events += ret;
1317                 ret = 0;
1318         } while (nr_events < min && !need_resched());
1319
1320         return ret;
1321 }
1322
     /*
      * Task-work style completion: release a selected/ring provided buffer
      * (merging the resulting bits into the CQE flags) and complete the
      * request. With *locked set the request is added to the batched
      * completion list, otherwise it is posted right away.
      */
1323 void io_req_task_complete(struct io_kiocb *req, bool *locked)
1324 {
1325         if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
                     /* tell io_put_kbuf() whether ->uring_lock is held */
1326                 unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
1327
1328                 req->cqe.flags |= io_put_kbuf(req, issue_flags);
1329         }
1330
1331         if (*locked)
1332                 io_req_add_compl_list(req);
1333         else
1334                 io_req_complete_post(req);
1335 }
1336
1337 /*
1338  * After the iocb has been issued, it's safe to be found on the poll list.
1339  * Adding the kiocb to the list AFTER submission ensures that we don't
1340  * find it from a io_do_iopoll() thread before the issuer is done
1341  * accessing the kiocb cookie.
      *
      * ->uring_lock protects the list; it is taken here when @issue_flags
      * contains IO_URING_F_UNLOCKED.
1342  */
1343 static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
1344 {
1345         struct io_ring_ctx *ctx = req->ctx;
1346         const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
1347
1348         /* workqueue context doesn't hold uring_lock, grab it now */
1349         if (unlikely(needs_lock))
1350                 mutex_lock(&ctx->uring_lock);
1351
1352         /*
1353          * Track whether we have multiple files in our lists. This will impact
1354          * how we do polling eventually, not spinning if we're on potentially
1355          * different devices.
1356          */
1357         if (wq_list_empty(&ctx->iopoll_list)) {
1358                 ctx->poll_multi_queue = false;
1359         } else if (!ctx->poll_multi_queue) {
1360                 struct io_kiocb *list_req;
1361
                     /* comparing against the head of the list is sufficient here */
1362                 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
1363                                         comp_list);
1364                 if (list_req->file != req->file)
1365                         ctx->poll_multi_queue = true;
1366         }
1367
1368         /*
1369          * For fast devices, IO may have already completed. If it has, add
1370          * it to the front so we find it first.
1371          */
1372         if (READ_ONCE(req->iopoll_completed))
1373                 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
1374         else
1375                 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
1376
1377         if (unlikely(needs_lock)) {
1378                 /*
1379                  * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
1380                  * in sq thread task context or in io worker task context. If
1381                  * the current task context is the sq thread, we don't need to
1382                  * check whether we should wake up the sq thread.
1383                  */
1384                 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
1385                     wq_has_sleeper(&ctx->sq_data->wait))
1386                         wake_up(&ctx->sq_data->wait);
1387
1388                 mutex_unlock(&ctx->uring_lock);
1389         }
1390 }
1391
     /*
      * True if @bdev is NULL or its request queue has the nowait capability
      * as reported by blk_queue_nowait().
      */
1392 static bool io_bdev_nowait(struct block_device *bdev)
1393 {
1394         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
1395 }
1396
1397 /*
1398  * If we tracked the file through the SCM inflight mechanism, we could support
1399  * any file. For now, just ensure that anything potentially problematic is done
1400  * inline.
      *
      * Decide from the inode @mode whether @file supports nowait:
      *  - block device: CONFIG_BLOCK and a nowait capable bdev,
      *  - socket: always,
      *  - regular file: CONFIG_BLOCK, a nowait capable (or no) backing bdev
      *    and not an io_uring file,
      *  - anything else: O_NONBLOCK is set or the file has FMODE_NOWAIT.
1401  */
1402 static bool __io_file_supports_nowait(struct file *file, umode_t mode)
1403 {
1404         if (S_ISBLK(mode)) {
1405                 if (IS_ENABLED(CONFIG_BLOCK) &&
1406                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
1407                         return true;
1408                 return false;
1409         }
1410         if (S_ISSOCK(mode))
1411                 return true;
1412         if (S_ISREG(mode)) {
1413                 if (IS_ENABLED(CONFIG_BLOCK) &&
1414                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
1415                     !io_is_uring_fops(file))
1416                         return true;
1417                 return false;
1418         }
1419
1420         /* any ->read/write should understand O_NONBLOCK */
1421         if (file->f_flags & O_NONBLOCK)
1422                 return true;
1423         return file->f_mode & FMODE_NOWAIT;
1424 }
1425
1426 /*
1427  * Compute the FFS_* flag bits for @file: FFS_ISREG for a regular file,
1428  * FFS_NOWAIT if __io_file_supports_nowait() says so, and FFS_SCM if
1429  * io_file_need_scm() says so. Returns the OR of the applicable bits.
1430  */
1431 unsigned int io_file_get_flags(struct file *file)
1432 {
1433         umode_t mode = file_inode(file)->i_mode;
1434         unsigned int res = 0;
1435
1436         if (S_ISREG(mode))
1437                 res |= FFS_ISREG;
1438         if (__io_file_supports_nowait(file, mode))
1439                 res |= FFS_NOWAIT;
1440         if (io_file_need_scm(file))
1441                 res |= FFS_SCM;
1442         return res;
1443 }
1444
     /*
      * Allocate req->async_data, sized by the opcode's async_size, and set
      * REQ_F_ASYNC_DATA.
      *
      * Note the inverted convention: returns true on allocation FAILURE and
      * false on success.
      */
1445 bool io_alloc_async_data(struct io_kiocb *req)
1446 {
             /* an opcode without async_size should never get here */
1447         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
1448         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
1449         if (req->async_data) {
1450                 req->flags |= REQ_F_ASYNC_DATA;
1451                 return false;
1452         }
1453         return true;
1454 }
1455
     /*
      * Prepare @req for deferred/async execution: resolve a non-fixed file
      * now, and if the opcode has a ->prep_async handler, allocate the async
      * data and call it.
      *
      * Returns 0 if there is nothing to prepare, -EFAULT if async data is
      * unexpectedly already present, -EAGAIN if the allocation failed, or
      * the handler's return value.
      */
1456 int io_req_prep_async(struct io_kiocb *req)
1457 {
1458         const struct io_op_def *def = &io_op_defs[req->opcode];
1459
1460         /* assign early for deferred execution for non-fixed file */
1461         if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
1462                 req->file = io_file_get_normal(req, req->cqe.fd);
1463         if (!def->prep_async)
1464                 return 0;
1465         if (WARN_ON_ONCE(req_has_async_data(req)))
1466                 return -EFAULT;
             /* io_alloc_async_data() returns true on failure */
1467         if (io_alloc_async_data(req))
1468                 return -EAGAIN;
1469
1470         return def->prep_async(req);
1471 }
1472
     /*
      * Compute the sequence number for @req: the ring's cached_sq_head minus
      * one for every request in @req's link chain (as iterated by
      * io_for_each_link()).
      */
1473 static u32 io_get_sequence(struct io_kiocb *req)
1474 {
1475         u32 seq = req->ctx->cached_sq_head;
1476         struct io_kiocb *cur;
1477
1478         /* need original cached_sq_head, but it was increased for each req */
1479         io_for_each_link(cur, req)
1480                 seq--;
1481         return seq;
1482 }
1483
     /*
      * Handle a request subject to draining: if it does not need to be
      * deferred and the defer list is empty, clear ->drain_active and queue
      * it via task work. Otherwise prepare it for async execution and park
      * it on ctx->defer_list together with its sequence number. Preparation
      * or allocation failures complete the request with an error.
      */
1484 static __cold void io_drain_req(struct io_kiocb *req)
1485 {
1486         struct io_ring_ctx *ctx = req->ctx;
1487         struct io_defer_entry *de;
1488         int ret;
1489         u32 seq = io_get_sequence(req);
1490
1491         /* Still need defer if there is pending req in defer list. */
1492         spin_lock(&ctx->completion_lock);
1493         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
1494                 spin_unlock(&ctx->completion_lock);
     /* also reached from the re-check below, with the lock already dropped */
1495 queue:
1496                 ctx->drain_active = false;
1497                 io_req_task_queue(req);
1498                 return;
1499         }
1500         spin_unlock(&ctx->completion_lock);
1501
             /* these may sleep (GFP_KERNEL), hence done outside the spinlock */
1502         ret = io_req_prep_async(req);
1503         if (ret) {
1504 fail:
1505                 io_req_complete_failed(req, ret);
1506                 return;
1507         }
1508         io_prep_async_link(req);
1509         de = kmalloc(sizeof(*de), GFP_KERNEL);
1510         if (!de) {
1511                 ret = -ENOMEM;
1512                 goto fail;
1513         }
1514
             /* the lock was dropped in between: check the condition again */
1515         spin_lock(&ctx->completion_lock);
1516         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
1517                 spin_unlock(&ctx->completion_lock);
1518                 kfree(de);
1519                 goto queue;
1520         }
1521
1522         trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode);
1523         de->req = req;
1524         de->seq = seq;
1525         list_add_tail(&de->list, &ctx->defer_list);
1526         spin_unlock(&ctx->completion_lock);
1527 }
1528
     /*
      * Release the per-request state indicated by req->flags: selected
      * buffer, opcode specific cleanup, async poll data, inflight tracking,
      * credentials and async data. All IO_REQ_CLEAN_FLAGS bits are cleared
      * at the end.
      */
1529 static void io_clean_op(struct io_kiocb *req)
1530 {
1531         if (req->flags & REQ_F_BUFFER_SELECTED) {
                     /* the _comp variant is called with ->completion_lock held */
1532                 spin_lock(&req->ctx->completion_lock);
1533                 io_put_kbuf_comp(req);
1534                 spin_unlock(&req->ctx->completion_lock);
1535         }
1536
1537         if (req->flags & REQ_F_NEED_CLEANUP) {
1538                 const struct io_op_def *def = &io_op_defs[req->opcode];
1539
1540                 if (def->cleanup)
1541                         def->cleanup(req);
1542         }
1543         if ((req->flags & REQ_F_POLLED) && req->apoll) {
1544                 kfree(req->apoll->double_poll);
1545                 kfree(req->apoll);
1546                 req->apoll = NULL;
1547         }
1548         if (req->flags & REQ_F_INFLIGHT) {
1549                 struct io_uring_task *tctx = req->task->io_uring;
1550
1551                 atomic_dec(&tctx->inflight_tracked);
1552         }
1553         if (req->flags & REQ_F_CREDS)
1554                 put_cred(req->creds);
1555         if (req->flags & REQ_F_ASYNC_DATA) {
1556                 kfree(req->async_data);
1557                 req->async_data = NULL;
1558         }
1559         req->flags &= ~IO_REQ_CLEAN_FLAGS;
1560 }
1561
     /*
      * Make sure req->file is set if the opcode needs a file. The fd comes
      * from req->cqe.fd and is looked up in the fixed file table or as a
      * normal fd depending on REQ_F_FIXED_FILE. Returns false only if the
      * lookup produced no file.
      */
1562 static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
1563 {
             /* already assigned, or the opcode works without a file */
1564         if (req->file || !io_op_defs[req->opcode].needs_file)
1565                 return true;
1566
1567         if (req->flags & REQ_F_FIXED_FILE)
1568                 req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
1569         else
1570                 req->file = io_file_get_normal(req, req->cqe.fd);
1571
1572         return !!req->file;
1573 }
1574
     /*
      * Issue @req through its opcode's ->issue() handler.
      *
      * Returns -EBADF if no file could be assigned, 0 if the handler
      * returned IOU_OK (the request is completed here) or
      * IOU_ISSUE_SKIP_COMPLETE (completion is left to someone else), and
      * the handler's own return value in all other cases.
      */
1575 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
1576 {
1577         const struct io_op_def *def = &io_op_defs[req->opcode];
1578         const struct cred *creds = NULL;
1579         int ret;
1580
1581         if (unlikely(!io_assign_file(req, issue_flags)))
1582                 return -EBADF;
1583
             /* run with the request's credentials if they differ from ours */
1584         if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
1585                 creds = override_creds(req->creds);
1586
1587         if (!def->audit_skip)
1588                 audit_uring_entry(req->opcode);
1589
1590         ret = def->issue(req, issue_flags);
1591
1592         if (!def->audit_skip)
1593                 audit_uring_exit(!ret, ret);
1594
1595         if (creds)
1596                 revert_creds(creds);
1597
1598         if (ret == IOU_OK) {
                     /* batch the completion if the caller allows deferring it */
1599                 if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1600                         io_req_add_compl_list(req);
1601                 else
1602                         io_req_complete_post(req);
1603         } else if (ret != IOU_ISSUE_SKIP_COMPLETE)
1604                 return ret;
1605
1606         /* If the op doesn't have a file, we're not polling for it */
1607         if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
1608                 io_iopoll_req_issued(req, issue_flags);
1609
1610         return 0;
1611 }
1612
     /*
      * Issue @req in non-blocking mode from task work context. Returns
      * -EFAULT without issuing if the owning task is exiting, otherwise the
      * result of io_issue_sqe().
      */
1613 int io_poll_issue(struct io_kiocb *req, bool *locked)
1614 {
1615         io_tw_lock(req->ctx, locked);
1616         if (unlikely(req->task->flags & PF_EXITING))
1617                 return -EFAULT;
1618         return io_issue_sqe(req, IO_URING_F_NONBLOCK);
1619 }
1620
     /*
      * io-wq callback: drop a reference on the request embedding @work. If
      * that was the last reference and another request is linked, return
      * that request's work item; otherwise return NULL.
      */
1621 struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
1622 {
1623         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1624
1625         req = io_put_req_find_next(req);
1626         return req ? &req->work : NULL;
1627 }
1628
     /*
      * io-wq callback: issue the request embedding @work from worker
      * context. ->uring_lock is not held here (IO_URING_F_UNLOCKED).
      * Failures are not completed inline but handed to task work through
      * io_req_task_queue_fail().
      */
1629 void io_wq_submit_work(struct io_wq_work *work)
1630 {
1631         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1632         const struct io_op_def *def = &io_op_defs[req->opcode];
1633         unsigned int issue_flags = IO_URING_F_UNLOCKED;
1634         bool needs_poll = false;
1635         int ret = 0, err = -ECANCELED;
1636
1637         /* one will be dropped by ->io_free_work() after returning to io-wq */
1638         if (!(req->flags & REQ_F_REFCOUNT))
1639                 __io_req_set_refcount(req, 2);
1640         else
1641                 req_ref_get(req);
1642
1643         io_arm_ltimeout(req);
1644
1645         /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
1646         if (work->flags & IO_WQ_WORK_CANCEL) {
     /* shared failure exit; err is -ECANCELED unless overridden below */
1647 fail:
1648                 io_req_task_queue_fail(req, err);
1649                 return;
1650         }
1651         if (!io_assign_file(req, issue_flags)) {
1652                 err = -EBADF;
1653                 work->flags |= IO_WQ_WORK_CANCEL;
1654                 goto fail;
1655         }
1656
             /*
              * Forced-async requests whose opcode is pollable on a pollable
              * file are first tried non-blocking, so that poll can be armed
              * on -EAGAIN.
              */
1657         if (req->flags & REQ_F_FORCE_ASYNC) {
1658                 bool opcode_poll = def->pollin || def->pollout;
1659
1660                 if (opcode_poll && file_can_poll(req->file)) {
1661                         needs_poll = true;
1662                         issue_flags |= IO_URING_F_NONBLOCK;
1663                 }
1664         }
1665
1666         do {
1667                 ret = io_issue_sqe(req, issue_flags);
1668                 if (ret != -EAGAIN)
1669                         break;
1670                 /*
1671                  * We can get EAGAIN for iopolled IO even though we're
1672                  * forcing a sync submission from here, since we can't
1673                  * wait for request slots on the block side.
1674                  */
1675                 if (!needs_poll) {
1676                         if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
1677                                 break;
1678                         cond_resched();
1679                         continue;
1680                 }
1681
                     /* poll armed: the poll machinery owns the request from here */
1682                 if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
1683                         return;
1684                 /* aborted or ready, in either case retry blocking */
1685                 needs_poll = false;
1686                 issue_flags &= ~IO_URING_F_NONBLOCK;
1687         } while (1);
1688
1689         /* avoid locking problems by failing it from a clean context */
1690         if (ret < 0)
1691                 io_req_task_queue_fail(req, ret);
1692 }
1693
/*
 * Look up the file stored at index @fd of ctx->file_table.
 *
 * The lookup runs between io_ring_submit_lock() and io_ring_submit_unlock(),
 * both called with @issue_flags. The slot value is split in two: the part
 * inside FFS_MASK is the file pointer that gets returned, the remaining bits
 * are shifted by REQ_F_SUPPORT_NOWAIT_BIT and OR-ed into req->flags.
 *
 * Returns the file pointer held in the slot, or NULL when @fd is not below
 * ctx->nr_user_files.
 */
1694 inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
1695                                       unsigned int issue_flags)
1696 {
1697         struct io_ring_ctx *ctx = req->ctx;
1698         struct file *file = NULL;
1699         unsigned long file_ptr;
1700
1701         io_ring_submit_lock(ctx, issue_flags);
1702
             /* the cast turns a negative fd into a large unsigned value */
1703         if (unlikely((unsigned int)fd >= ctx->nr_user_files))
1704                 goto out;
1705         fd = array_index_nospec(fd, ctx->nr_user_files);
1706         file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
1707         file = (struct file *) (file_ptr & FFS_MASK);
1708         file_ptr &= ~FFS_MASK;
1709         /* mask in overlapping REQ_F and FFS bits */
1710         req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
1711         io_req_set_rsrc_node(req, ctx, 0);
             /* a non-NULL file is expected to have its bit set in the table bitmap */
1712         WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
1713 out:
1714         io_ring_submit_unlock(ctx, issue_flags);
1715         return file;
1716 }
1717
1718 struct file *io_file_get_normal(struct io_kiocb *req, int fd)
1719 {
1720         struct file *file = fget(fd);
1721
1722         trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd);
1723
1724         /* we don't allow fixed io_uring files */
1725         if (file && io_is_uring_fops(file))
1726                 io_req_track_inflight(req);
1727         return file;
1728 }
1729
/*
 * Deal with a non-zero result @ret of the inline issue attempt made by
 * io_queue_sqe().
 *
 * Any result other than -EAGAIN, or -EAGAIN for a REQ_F_NOWAIT request,
 * completes the request as failed with @ret. Otherwise a linked timeout is
 * prepared and the request goes through io_arm_poll_handler():
 *   IO_APOLL_READY   - queued via io_req_task_queue()
 *   IO_APOLL_ABORTED - buffer recycled, then queued via io_queue_iowq()
 *   IO_APOLL_OK      - nothing further is done here
 * A prepared linked timeout is queued last.
 */
1730 static void io_queue_async(struct io_kiocb *req, int ret)
1731         __must_hold(&req->ctx->uring_lock)
1732 {
1733         struct io_kiocb *linked_timeout;
1734
1735         if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
1736                 io_req_complete_failed(req, ret);
1737                 return;
1738         }
1739
1740         linked_timeout = io_prep_linked_timeout(req);
1741
1742         switch (io_arm_poll_handler(req, 0)) {
1743         case IO_APOLL_READY:
1744                 io_req_task_queue(req);
1745                 break;
1746         case IO_APOLL_ABORTED:
1747                 /*
1748                  * Queued up for async execution, worker will release
1749                  * submit reference when the iocb is actually submitted.
1750                  */
1751                 io_kbuf_recycle(req, 0);
1752                 io_queue_iowq(req, NULL);
1753                 break;
1754         case IO_APOLL_OK:
1755                 break;
1756         }
1757
             /* queue the timeout only after the request itself has been handed off */
1758         if (linked_timeout)
1759                 io_queue_linked_timeout(linked_timeout);
1760 }
1761
1762 static inline void io_queue_sqe(struct io_kiocb *req)
1763         __must_hold(&req->ctx->uring_lock)
1764 {
1765         int ret;
1766
1767         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
1768
1769         /*
1770          * We async punt it if the file wasn't marked NOWAIT, or if the file
1771          * doesn't support non-blocking read/write attempts
1772          */
1773         if (likely(!ret))
1774                 io_arm_ltimeout(req);
1775         else
1776                 io_queue_async(req, ret);
1777 }
1778
1779 static void io_queue_sqe_fallback(struct io_kiocb *req)
1780         __must_hold(&req->ctx->uring_lock)
1781 {
1782         if (unlikely(req->flags & REQ_F_FAIL)) {
1783                 /*
1784                  * We don't submit, fail them all, for that replace hardlinks
1785                  * with normal links. Extra REQ_F_LINK is tolerated.
1786                  */
1787                 req->flags &= ~REQ_F_HARDLINK;
1788                 req->flags |= REQ_F_LINK;
1789                 io_req_complete_failed(req, req->cqe.res);
1790         } else if (unlikely(req->ctx->drain_active)) {
1791                 io_drain_req(req);
1792         } else {
1793                 int ret = io_req_prep_async(req);
1794
1795                 if (unlikely(ret))
1796                         io_req_complete_failed(req, ret);
1797                 else
1798                         io_queue_iowq(req, NULL);
1799         }
1800 }
1801
1802 /*
1803  * Check SQE restrictions (opcode and flags).
1804  *
1805  * Returns 'true' if SQE is allowed, 'false' otherwise.
1806  */
1807 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
1808                                         struct io_kiocb *req,
1809                                         unsigned int sqe_flags)
1810 {
1811         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
1812                 return false;
1813
1814         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
1815             ctx->restrictions.sqe_flags_required)
1816                 return false;
1817
1818         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
1819                           ctx->restrictions.sqe_flags_required))
1820                 return false;
1821
1822         return true;
1823 }
1824
1825 static void io_init_req_drain(struct io_kiocb *req)
1826 {
1827         struct io_ring_ctx *ctx = req->ctx;
1828         struct io_kiocb *head = ctx->submit_state.link.head;
1829
1830         ctx->drain_active = true;
1831         if (head) {
1832                 /*
1833                  * If we need to drain a request in the middle of a link, drain
1834                  * the head request and the next request/link after the current
1835                  * link. Considering sequential execution of links,
1836                  * REQ_F_IO_DRAIN will be maintained for every request of our
1837                  * link.
1838                  */
1839                 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
1840                 ctx->drain_next = true;
1841         }
1842 }
1843
/*
 * Initialise @req from the submission queue entry @sqe.
 *
 * Copies opcode, flags and user_data out of the SQE, validates opcode and
 * flags, applies drain and restriction handling, starts block plugging when
 * the submit state asks for it, resolves an optional personality into
 * req->creds and finally calls the opcode's ->prep() handler.
 *
 * Returns the ->prep() result when all checks pass. Otherwise returns
 * -EINVAL, -EOPNOTSUPP, -EACCES or the security_uring_override_creds()
 * return value, depending on which check failed.
 */
1844 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
1845                        const struct io_uring_sqe *sqe)
1846         __must_hold(&ctx->uring_lock)
1847 {
1848         const struct io_op_def *def;
1849         unsigned int sqe_flags;
1850         int personality;
1851         u8 opcode;
1852
1853         /* req is partially pre-initialised, see io_preinit_req() */
1854         req->opcode = opcode = READ_ONCE(sqe->opcode);
1855         /* same numerical values with corresponding REQ_F_*, safe to copy */
1856         req->flags = sqe_flags = READ_ONCE(sqe->flags);
1857         req->cqe.user_data = READ_ONCE(sqe->user_data);
1858         req->file = NULL;
1859         req->rsrc_node = NULL;
1860         req->task = current;
1861
1862         if (unlikely(opcode >= IORING_OP_LAST)) {
                     /*
                      * NOTE(review): presumably reset so that later code indexing
                      * io_op_defs[] by req->opcode stays in range - confirm.
                      */
1863                 req->opcode = 0;
1864                 return -EINVAL;
1865         }
1866         def = &io_op_defs[opcode];
             /* slow path: only entered when a flag outside SQE_COMMON_FLAGS is set */
1867         if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
1868                 /* enforce forwards compatibility on users */
1869                 if (sqe_flags & ~SQE_VALID_FLAGS)
1870                         return -EINVAL;
1871                 if (sqe_flags & IOSQE_BUFFER_SELECT) {
1872                         if (!def->buffer_select)
1873                                 return -EOPNOTSUPP;
1874                         req->buf_index = READ_ONCE(sqe->buf_group);
1875                 }
                     /* once set here, drain_disabled rejects later IOSQE_IO_DRAIN requests */
1876                 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
1877                         ctx->drain_disabled = true;
1878                 if (sqe_flags & IOSQE_IO_DRAIN) {
1879                         if (ctx->drain_disabled)
1880                                 return -EOPNOTSUPP;
1881                         io_init_req_drain(req);
1882                 }
1883         }
1884         if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
1885                 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
1886                         return -EACCES;
1887                 /* knock it to the slow queue path, will be drained there */
1888                 if (ctx->drain_active)
1889                         req->flags |= REQ_F_FORCE_ASYNC;
1890                 /* if there is no link, we're at "next" request and need to drain */
1891                 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
1892                         ctx->drain_next = false;
1893                         ctx->drain_active = true;
1894                         req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
1895                 }
1896         }
1897
             /*
              * NOTE(review): sqe->ioprio is read without READ_ONCE() here,
              * unlike the other SQE fields read in this function.
              */
1898         if (!def->ioprio && sqe->ioprio)
1899                 return -EINVAL;
             /* opcodes without iopoll support are rejected on IOPOLL rings */
1900         if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
1901                 return -EINVAL;
1902
1903         if (def->needs_file) {
1904                 struct io_submit_state *state = &ctx->submit_state;
1905
1906                 req->cqe.fd = READ_ONCE(sqe->fd);
1907
1908                 /*
1909                  * Plug now if we have more than 2 IO left after this, and the
1910                  * target is potentially a read/write to block based storage.
1911                  */
1912                 if (state->need_plug && def->plug) {
1913                         state->plug_started = true;
1914                         state->need_plug = false;
1915                         blk_start_plug_nr_ios(&state->plug, state->submit_nr);
1916                 }
1917         }
1918
1919         personality = READ_ONCE(sqe->personality);
1920         if (personality) {
1921                 int ret;
1922
1923                 req->creds = xa_load(&ctx->personalities, personality);
1924                 if (!req->creds)
1925                         return -EINVAL;
1926                 get_cred(req->creds);
1927                 ret = security_uring_override_creds(req->creds);
1928                 if (ret) {
                             /* drop the reference taken just above; REQ_F_CREDS is not set */
1929                         put_cred(req->creds);
1930                         return ret;
1931                 }
1932                 req->flags |= REQ_F_CREDS;
1933         }
1934
1935         return def->prep(req, sqe);
1936 }
1937
/*
 * Handle a request whose initialisation or async prep failed with @ret
 * during submission.
 *
 * @req is marked failed with @ret. If a link is being assembled and its head
 * is not failed yet, the head is marked failed with -ECANCELED.
 *
 * If @req carries none of IO_REQ_LINK_FLAGS it ends the current link (if
 * any): it is appended, the link is detached from the submit state, and the
 * head (or @req alone when there is no link) is passed to
 * io_queue_sqe_fallback(). @ret is returned in that case.
 *
 * Otherwise @req is appended to the link, or becomes the head of a new one,
 * and 0 is returned.
 */
1938 static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
1939                                       struct io_kiocb *req, int ret)
1940 {
1941         struct io_ring_ctx *ctx = req->ctx;
1942         struct io_submit_link *link = &ctx->submit_state.link;
1943         struct io_kiocb *head = link->head;
1944
1945         trace_io_uring_req_failed(sqe, ctx, req, ret);
1946
1947         /*
1948          * Avoid breaking links in the middle as it renders links with SQPOLL
1949          * unusable. Instead of failing eagerly, continue assembling the link if
1950          * applicable and mark the head with REQ_F_FAIL. The link flushing code
1951          * should find the flag and handle the rest.
1952          */
1953         req_fail_link_node(req, ret);
1954         if (head && !(head->flags & REQ_F_FAIL))
1955                 req_fail_link_node(head, -ECANCELED);
1956
             /* @req does not continue a link: flush what has been assembled so far */
1957         if (!(req->flags & IO_REQ_LINK_FLAGS)) {
1958                 if (head) {
1959                         link->last->link = req;
1960                         link->head = NULL;
1961                         req = head;
1962                 }
1963                 io_queue_sqe_fallback(req);
1964                 return ret;
1965         }
1966
             /* @req continues (or starts) a link: keep assembling */
1967         if (head)
1968                 link->last->link = req;
1969         else
1970                 link->head = req;
1971         link->last = req;
1972         return 0;
1973 }
1974
/*
 * Initialise one request from @sqe and either issue it, hand it to the
 * fallback path, or attach it to the link being assembled in
 * ctx->submit_state.link.
 *
 * Returns 0 unless io_init_req() or io_req_prep_async() fails; in that case
 * the return value is whatever io_submit_fail_init() returns.
 */
1975 static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1976                          const struct io_uring_sqe *sqe)
1977         __must_hold(&ctx->uring_lock)
1978 {
1979         struct io_submit_link *link = &ctx->submit_state.link;
1980         int ret;
1981
1982         ret = io_init_req(ctx, req, sqe);
1983         if (unlikely(ret))
1984                 return io_submit_fail_init(sqe, req, ret);
1985
1986         /* don't need @sqe from now on */
1987         trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode,
1988                                   req->flags, true,
1989                                   ctx->flags & IORING_SETUP_SQPOLL);
1990
1991         /*
1992          * If we already have a head request, queue this one for async
1993          * submittal once the head completes. If we don't have a head but
1994          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
1995          * submitted sync once the chain is complete. If none of those
1996          * conditions are true (normal request), then just queue it.
1997          */
1998         if (unlikely(link->head)) {
1999                 ret = io_req_prep_async(req);
2000                 if (unlikely(ret))
2001                         return io_submit_fail_init(sqe, req, ret);
2002
2003                 trace_io_uring_link(ctx, req, link->head);
2004                 link->last->link = req;
2005                 link->last = req;
2006
2007                 if (req->flags & IO_REQ_LINK_FLAGS)
2008                         return 0;
2009                 /* last request of the link, flush it */
2010                 req = link->head;
2011                 link->head = NULL;
                     /* from here on req is the link head, not the request just added */
2012                 if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
2013                         goto fallback;
2014
2015         } else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
2016                                           REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
2017                 if (req->flags & IO_REQ_LINK_FLAGS) {
                             /* first request of a new link */
2018                         link->head = req;
2019                         link->last = req;
2020                 } else {
                     /* also entered via goto from the link-flush branch above */
2021 fallback:
2022                         io_queue_sqe_fallback(req);
2023                 }
2024                 return 0;
2025         }
2026
2027         io_queue_sqe(req);
2028         return 0;
2029 }
2030
2031 /*
2032  * Batched submission is done, ensure local IO is flushed out.
2033  */
2034 static void io_submit_state_end(struct io_ring_ctx *ctx)
2035 {
2036         struct io_submit_state *state = &ctx->submit_state;
2037
2038         if (unlikely(state->link.head))
2039                 io_queue_sqe_fallback(state->link.head);
2040         /* flush only after queuing links as they can generate completions */
2041         io_submit_flush_completions(ctx);
2042         if (state->plug_started)
2043                 blk_finish_plug(&state->plug);
2044 }
2045
2046 /*
2047  * Start submission side cache.
2048  */
2049 static void io_submit_state_start(struct io_submit_state *state,
2050                                   unsigned int max_ios)
2051 {
2052         state->plug_started = false;
2053         state->need_plug = max_ios > 2;
2054         state->submit_nr = max_ios;
2055         /* set only head, no need to init link_last in advance */
2056         state->link.head = NULL;
2057 }
2058
2059 static void io_commit_sqring(struct io_ring_ctx *ctx)
2060 {
2061         struct io_rings *rings = ctx->rings;
2062
2063         /*
2064          * Ensure any loads from the SQEs are done at this point,
2065          * since once we write the new head, the application could
2066          * write new data to them.
2067          */
2068         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2069 }
2070
2071 /*
2072  * Fetch an sqe, if one is available. Note this returns a pointer to memory
2073  * that is mapped by userspace. This means that care needs to be taken to
2074  * ensure that reads are stable, as we cannot rely on userspace always
2075  * being a good citizen. If members of the sqe are validated and then later
2076  * used, it's important that those reads are done through READ_ONCE() to
2077  * prevent a re-load down the line.
2078  */
2079 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
2080 {
2081         unsigned head, mask = ctx->sq_entries - 1;
2082         unsigned sq_idx = ctx->cached_sq_head++ & mask;
2083
2084         /*
2085          * The cached sq head (or cq tail) serves two purposes:
2086          *
2087          * 1) allows us to batch the cost of updating the user visible
2088          *    head updates.
2089          * 2) allows the kernel side to track the head on its own, even
2090          *    though the application is the one updating it.
2091          */
2092         head = READ_ONCE(ctx->sq_array[sq_idx]);
2093         if (likely(head < ctx->sq_entries)) {
2094                 /* double index for 128-byte SQEs, twice as long */
2095                 if (ctx->flags & IORING_SETUP_SQE128)
2096                         head <<= 1;
2097                 return &ctx->sq_sqes[head];
2098         }
2099
2100         /* drop invalid entries */
2101         ctx->cq_extra--;
2102         WRITE_ONCE(ctx->rings->sq_dropped,
2103                    READ_ONCE(ctx->rings->sq_dropped) + 1);
2104         return NULL;
2105 }
2106
/*
 * Submit up to @nr requests from the SQ ring.
 *
 * The batch size is the minimum of @nr, ctx->sq_entries and the value
 * returned by io_sqring_entries(). Task references for the whole batch are
 * taken up front; references for entries that were not processed are handed
 * back through current->io_uring->cached_refs.
 *
 * Returns 0 if io_sqring_entries() reports nothing, otherwise the batch size
 * minus the number of entries left unprocessed, or -EAGAIN when that
 * difference is zero and the request cache is empty.
 */
2107 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
2108         __must_hold(&ctx->uring_lock)
2109 {
2110         unsigned int entries = io_sqring_entries(ctx);
2111         unsigned int left;
2112         int ret;
2113
2114         if (unlikely(!entries))
2115                 return 0;
2116         /* make sure SQ entry isn't read before tail */
2117         ret = left = min3(nr, ctx->sq_entries, entries);
2118         io_get_task_refs(left);
2119         io_submit_state_start(&ctx->submit_state, left);
2120
2121         do {
2122                 const struct io_uring_sqe *sqe;
2123                 struct io_kiocb *req;
2124
2125                 if (unlikely(!io_alloc_req_refill(ctx)))
2126                         break;
2127                 req = io_alloc_req(ctx);
2128                 sqe = io_get_sqe(ctx);
2129                 if (unlikely(!sqe)) {
                             /* no usable SQE: give the request back, 'left' is not decremented */
2130                         io_req_add_to_cache(req, ctx);
2131                         break;
2132                 }
2133
2134                 /*
2135                  * Continue submitting even for sqe failure if the
2136                  * ring was setup with IORING_SETUP_SUBMIT_ALL
2137                  */
2138                 if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
2139                     !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
                             /* the failed SQE still counts as processed */
2140                         left--;
2141                         break;
2142                 }
2143         } while (--left);
2144
2145         if (unlikely(left)) {
2146                 ret -= left;
2147                 /* try again if it submitted nothing and can't allocate a req */
2148                 if (!ret && io_req_cache_empty(ctx))
2149                         ret = -EAGAIN;
2150                 current->io_uring->cached_refs += left;
2151         }
2152
2153         io_submit_state_end(ctx);
2154          /* Commit SQ ring head once we've consumed and submitted all SQEs */
2155         io_commit_sqring(ctx);
2156         return ret;
2157 }
2158
/*
 * Per-waiter state that io_cqring_wait() sets up before sleeping on
 * ctx->cq_wait.
 */
2159 struct io_wait_queue {
             /* wait queue entry; io_cqring_wait() installs io_wake_function on it */
2160         struct wait_queue_entry wq;
             /* ring being waited on */
2161         struct io_ring_ctx *ctx;
             /* set by io_cqring_wait() to the CQ head plus min_events */
2162         unsigned cq_tail;
             /* ctx->cq_timeouts as sampled when the wait was set up */
2163         unsigned nr_timeouts;
2164 };
2165
2166 static inline bool io_should_wake(struct io_wait_queue *iowq)
2167 {
2168         struct io_ring_ctx *ctx = iowq->ctx;
2169         int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
2170
2171         /*
2172          * Wake up if we have enough events, or if a timeout occurred since we
2173          * started waiting. For timeouts, we always want to return to userspace,
2174          * regardless of event count.
2175          */
2176         return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
2177 }
2178
2179 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
2180                             int wake_flags, void *key)
2181 {
2182         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
2183                                                         wq);
2184
2185         /*
2186          * Cannot safely flush overflowed CQEs from here, ensure we wake up
2187          * the task, and the next invocation will do it.
2188          */
2189         if (io_should_wake(iowq) ||
2190             test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &iowq->ctx->check_cq))
2191                 return autoremove_wake_function(curr, mode, wake_flags, key);
2192         return -1;
2193 }
2194
2195 int io_run_task_work_sig(void)
2196 {
2197         if (io_run_task_work())
2198                 return 1;
2199         if (test_thread_flag(TIF_NOTIFY_SIGNAL))
2200                 return -ERESTARTSYS;
2201         if (task_sigpending(current))
2202                 return -EINTR;
2203         return 0;
2204 }
2205
/*
 * One iteration of the CQ wait loop: run task work, re-check the wake
 * condition and the ring's check_cq bits, then sleep until @timeout.
 *
 * Return values, in the order they are decided:
 *  - the io_run_task_work_sig() result, if that is non-zero or the waiter
 *    should already wake (the result is 0 in the latter case);
 *  - 1 if the CQ overflow bit is set, so the caller can flush and retry;
 *  - -EBADR if the CQ dropped bit is set;
 *  - -ETIME if schedule_hrtimeout() returns 0;
 *  - 1 otherwise.
 */
2206 /* when returns >0, the caller should retry */
2207 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
2208                                           struct io_wait_queue *iowq,
2209                                           ktime_t timeout)
2210 {
2211         int ret;
2212         unsigned long check_cq;
2213
2214         /* make sure we run task_work before checking for signals */
2215         ret = io_run_task_work_sig();
2216         if (ret || io_should_wake(iowq))
2217                 return ret;
2218
             /* single snapshot of the bits, tested twice below */
2219         check_cq = READ_ONCE(ctx->check_cq);
2220         if (unlikely(check_cq)) {
2221                 /* let the caller flush overflows, retry */
2222                 if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
2223                         return 1;
2224                 if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
2225                         return -EBADR;
2226         }
             /* @timeout is passed as an absolute expiry (HRTIMER_MODE_ABS) */
2227         if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
2228                 return -ETIME;
2229         return 1;
2230 }
2231
2232 /*
2233  * Wait until events become available, if we don't already have some. The
2234  * application must reap them itself, as they reside on the shared cq ring.
 *
 * @ctx:        ring to wait on
 * @min_events: number of CQ events to wait for
 * @sig:        optional user signal mask to install for the wait, or NULL
 * @sigsz:      size of @sig as supplied by the caller
 * @uts:        optional user timespec added to the current time to form the
 *              wait deadline, or NULL for KTIME_MAX
 *
 * Returns 0 if enough events are already available before sleeping. Returns
 * the sigmask helper's error or -EFAULT if the user arguments cannot be
 * read. After waiting, returns 0 if the CQ ring is non-empty, otherwise the
 * status that ended the wait loop (0 or a negative errno such as -EBUSY).
2235  */
2236 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
2237                           const sigset_t __user *sig, size_t sigsz,
2238                           struct __kernel_timespec __user *uts)
2239 {
2240         struct io_wait_queue iowq;
2241         struct io_rings *rings = ctx->rings;
2242         ktime_t timeout = KTIME_MAX;
2243         int ret;
2244
             /*
              * Flush overflowed CQEs and run task work until either enough
              * events are available or io_run_task_work() reports nothing.
              */
2245         do {
2246                 io_cqring_overflow_flush(ctx);
2247                 if (io_cqring_events(ctx) >= min_events)
2248                         return 0;
2249                 if (!io_run_task_work())
2250                         break;
2251         } while (1);
2252
2253         if (sig) {
2254 #ifdef CONFIG_COMPAT
2255                 if (in_compat_syscall())
2256                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
2257                                                       sigsz);
2258                 else
2259 #endif
2260                         ret = set_user_sigmask(sig, sigsz);
2261
2262                 if (ret)
2263                         return ret;
2264         }
2265
2266         if (uts) {
2267                 struct timespec64 ts;
2268
2269                 if (get_timespec64(&ts, uts))
2270                         return -EFAULT;
                     /* the user value is added to the current ktime_get_ns() reading */
2271                 timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
2272         }
2273
2274         init_waitqueue_func_entry(&iowq.wq, io_wake_function);
2275         iowq.wq.private = current;
2276         INIT_LIST_HEAD(&iowq.wq.entry);
2277         iowq.ctx = ctx;
2278         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
             /* target tail that io_should_wake() compares cached_cq_tail against */
2279         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
2280
2281         trace_io_uring_cqring_wait(ctx, min_events);
2282         do {
2283                 /* if we can't even flush overflow, don't wait for more */
2284                 if (!io_cqring_overflow_flush(ctx)) {
2285                         ret = -EBUSY;
2286                         break;
2287                 }
2288                 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
2289                                                 TASK_INTERRUPTIBLE);
2290                 ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
2291                 cond_resched();
2292         } while (ret > 0);
2293
2294         finish_wait(&ctx->cq_wait, &iowq.wq);
2295         restore_saved_sigmask_unless(ret == -EINTR);
2296
             /* ret is reported only when the CQ ring is empty */
2297         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2298 }
2299
/*
 * Drop a reference on the compound page backing @ptr and free the page when
 * that was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
        if (ptr) {
                struct page *head = virt_to_head_page(ptr);

                if (put_page_testzero(head))
                        free_compound_page(head);
        }
}
2311
2312 static void *io_mem_alloc(size_t size)
2313 {
2314         gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
2315
2316         return (void *) __get_free_pages(gfp, get_order(size));
2317 }
2318
2319 static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
2320                                 unsigned int cq_entries, size_t *sq_offset)
2321 {
2322         struct io_rings *rings;
2323         size_t off, sq_array_size;
2324
2325         off = struct_size(rings, cqes, cq_entries);
2326         if (off == SIZE_MAX)
2327                 return SIZE_MAX;
2328         if (ctx->flags & IORING_SETUP_CQE32) {
2329                 if (check_shl_overflow(off, 1, &off))
2330                         return SIZE_MAX;
2331         }
2332
2333 #ifdef CONFIG_SMP
2334         off = ALIGN(off, SMP_CACHE_BYTES);
2335         if (off == 0)
2336                 return SIZE_MAX;
2337 #endif
2338
2339         if (sq_offset)
2340                 *sq_offset = off;
2341
2342         sq_array_size = array_size(sizeof(u32), sq_entries);
2343         if (sq_array_size == SIZE_MAX)
2344                 return SIZE_MAX;
2345
2346         if (check_add_overflow(off, sq_array_size, &off))
2347                 return SIZE_MAX;
2348
2349         return off;
2350 }
2351
/*
 * Register an eventfd with the ring.
 *
 * @arg points to a user-space __s32 holding the eventfd descriptor.
 * @eventfd_async is stored as-is in the new io_ev_fd.
 *
 * The current ctx->io_ev_fd is read with a lockdep expression naming
 * ctx->uring_lock, so the caller is expected to hold that lock.
 *
 * Returns 0 on success, -EBUSY if an eventfd is already registered, -EFAULT
 * if the descriptor cannot be copied from user space, -ENOMEM on allocation
 * failure, or the error encoded in the eventfd_ctx_fdget() result.
 */
2352 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
2353                                unsigned int eventfd_async)
2354 {
2355         struct io_ev_fd *ev_fd;
2356         __s32 __user *fds = arg;
2357         int fd;
2358
2359         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
2360                                         lockdep_is_held(&ctx->uring_lock));
2361         if (ev_fd)
2362                 return -EBUSY;
2363
2364         if (copy_from_user(&fd, fds, sizeof(*fds)))
2365                 return -EFAULT;
2366
2367         ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
2368         if (!ev_fd)
2369                 return -ENOMEM;
2370
2371         ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
2372         if (IS_ERR(ev_fd->cq_ev_fd)) {
2373                 int ret = PTR_ERR(ev_fd->cq_ev_fd);
2374                 kfree(ev_fd);
2375                 return ret;
2376         }
2377         ev_fd->eventfd_async = eventfd_async;
2378         ctx->has_evfd = true;
             /* publish only after the fields above have been filled in */
2379         rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
2380         return 0;
2381 }
2382
2383 static void io_eventfd_put(struct rcu_head *rcu)
2384 {
2385         struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
2386
2387         eventfd_ctx_put(ev_fd->cq_ev_fd);
2388         kfree(ev_fd);
2389 }
2390
2391 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
2392 {
2393         struct io_ev_fd *ev_fd;
2394
2395         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
2396                                         lockdep_is_held(&ctx->uring_lock));
2397         if (ev_fd) {
2398                 ctx->has_evfd = false;
2399                 rcu_assign_pointer(ctx->io_ev_fd, NULL);
2400                 call_rcu(&ev_fd->rcu, io_eventfd_put);
2401                 return 0;
2402         }
2403
2404         return -ENXIO;
2405 }
2406
2407 static void io_req_caches_free(struct io_ring_ctx *ctx)
2408 {
2409         struct io_submit_state *state = &ctx->submit_state;
2410         int nr = 0;
2411
2412         mutex_lock(&ctx->uring_lock);
2413         io_flush_cached_locked_reqs(ctx, state);
2414
2415         while (!io_req_cache_empty(ctx)) {
2416                 struct io_wq_work_node *node;
2417                 struct io_kiocb *req;
2418
2419                 node = wq_stack_extract(&state->free_list);
2420                 req = container_of(node, struct io_kiocb, comp_list);
2421                 kmem_cache_free(req_cachep, req);
2422                 nr++;
2423         }
2424         if (nr)
2425                 percpu_ref_put_many(&ctx->refs, nr);
2426         mutex_unlock(&ctx->uring_lock);
2427 }
2428
2429 static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
2430 {
2431         struct async_poll *apoll;
2432
2433         while (!list_empty(&ctx->apoll_cache)) {
2434                 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
2435                                                 poll.wait.entry);
2436                 list_del(&apoll->poll.wait.entry);
2437                 kfree(apoll);
2438         }
2439 }
2440
/*
 * Tear down and free a ring context.
 *
 * In order: finishes the SQ thread side, drops the mm accounting reference,
 * drops and waits for resource data, unregisters buffers, files and the
 * eventfd under uring_lock, releases credentials and the submitter task,
 * destroys the rsrc nodes, flushes the delayed works, frees the ring memory
 * and the request caches, and finally kfree()s @ctx itself.
 */
2441 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
2442 {
2443         io_sq_thread_finish(ctx);
2444
2445         if (ctx->mm_account) {
2446                 mmdrop(ctx->mm_account);
2447                 ctx->mm_account = NULL;
2448         }
2449
2450         io_rsrc_refs_drop(ctx);
2451         /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
2452         io_wait_rsrc_data(ctx->buf_data);
2453         io_wait_rsrc_data(ctx->file_data);
2454
2455         mutex_lock(&ctx->uring_lock);
2456         if (ctx->buf_data)
2457                 __io_sqe_buffers_unregister(ctx);
2458         if (ctx->file_data)
2459                 __io_sqe_files_unregister(ctx);
2460         if (ctx->rings)
2461                 __io_cqring_overflow_flush(ctx, true);
             /* return value ignored: -ENXIO just means no eventfd was registered */
2462         io_eventfd_unregister(ctx);
2463         io_flush_apoll_cache(ctx);
2464         mutex_unlock(&ctx->uring_lock);
2465         io_destroy_buffers(ctx);
2466         if (ctx->sq_creds)
2467                 put_cred(ctx->sq_creds);
2468         if (ctx->submitter_task)
2469                 put_task_struct(ctx->submitter_task);
2470
2471         /* there are no registered resources left, nobody uses it */
2472         if (ctx->rsrc_node)
2473                 io_rsrc_node_destroy(ctx->rsrc_node);
2474         if (ctx->rsrc_backup_node)
2475                 io_rsrc_node_destroy(ctx->rsrc_backup_node);
2476         flush_delayed_work(&ctx->rsrc_put_work);
2477         flush_delayed_work(&ctx->fallback_work);
2478
2479         WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
2480         WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
2481
2482 #if defined(CONFIG_UNIX)
2483         if (ctx->ring_sock) {
2484                 ctx->ring_sock->file = NULL; /* so that iput() is called */
2485                 sock_release(ctx->ring_sock);
2486         }
2487 #endif
2488         WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
2489
2490         io_mem_free(ctx->rings);
2491         io_mem_free(ctx->sq_sqes);
2492
2493         percpu_ref_exit(&ctx->refs);
2494         free_uid(ctx->user);
             /*
              * NOTE(review): io_req_caches_free() calls
              * percpu_ref_put_many(&ctx->refs, ...) if it still finds cached
              * requests, and it runs after percpu_ref_exit(&ctx->refs) above.
              * Confirm the cache is always empty by this point.
              */
2495         io_req_caches_free(ctx);
2496         if (ctx->hash_map)
2497                 io_wq_put_hash(ctx->hash_map);
2498         kfree(ctx->cancel_table.hbs);
2499         kfree(ctx->cancel_table_locked.hbs);
2500         kfree(ctx->dummy_ubuf);
2501         kfree(ctx->io_bl);
2502         xa_destroy(&ctx->io_bl_xa);
2503         kfree(ctx);
2504 }
2505
2506 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
2507 {
2508         struct io_ring_ctx *ctx = file->private_data;
2509         __poll_t mask = 0;
2510
2511         poll_wait(file, &ctx->cq_wait, wait);
2512         /*
2513          * synchronizes with barrier from wq_has_sleeper call in
2514          * io_commit_cqring
2515          */
2516         smp_rmb();
2517         if (!io_sqring_full(ctx))
2518                 mask |= EPOLLOUT | EPOLLWRNORM;
2519
2520         /*
2521          * Don't flush cqring overflow list here, just do a simple check.
2522          * Otherwise there could possible be ABBA deadlock:
2523          *      CPU0                    CPU1
2524          *      ----                    ----
2525          * lock(&ctx->uring_lock);
2526          *                              lock(&ep->mtx);
2527          *                              lock(&ctx->uring_lock);
2528          * lock(&ep->mtx);
2529          *
2530          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
2531          * pushs them to do the flush.
2532          */
2533         if (io_cqring_events(ctx) ||
2534             test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
2535                 mask |= EPOLLIN | EPOLLRDNORM;
2536
2537         return mask;
2538 }
2539
2540 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
2541 {
2542         const struct cred *creds;
2543
2544         creds = xa_erase(&ctx->personalities, id);
2545         if (creds) {
2546                 put_cred(creds);
2547                 return 0;
2548         }
2549
2550         return -EINVAL;
2551 }
2552
/*
 * Request handed from io_ring_exit_work() to a task via task_work, handled
 * by io_tctx_exit_cb() in that task's context.
 */
struct io_tctx_exit {
        /* task_work entry; io_tctx_exit_cb() recovers this struct from it */
        struct callback_head            task_work;
        /* completed by the callback, waited on by io_ring_exit_work() */
        struct completion               completion;
        /* ring whose tctx node the callback passes to io_uring_del_tctx_node() */
        struct io_ring_ctx              *ctx;
};
2558
2559 static __cold void io_tctx_exit_cb(struct callback_head *cb)
2560 {
2561         struct io_uring_task *tctx = current->io_uring;
2562         struct io_tctx_exit *work;
2563
2564         work = container_of(cb, struct io_tctx_exit, task_work);
2565         /*
2566          * When @in_idle, we're in cancellation and it's racy to remove the
2567          * node. It'll be removed by the end of cancellation, just ignore it.
2568          */
2569         if (!atomic_read(&tctx->in_idle))
2570                 io_uring_del_tctx_node((unsigned long)work->ctx);
2571         complete(&work->completion);
2572 }
2573
2574 static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
2575 {
2576         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2577
2578         return req->ctx == data;
2579 }
2580
/*
 * Work item that finishes tearing down a ring: keep cancelling whatever is
 * still in flight until the wait on ctx->ref_comp succeeds, have every task
 * on ctx->tctx_list run io_tctx_exit_cb(), then free the ctx.
 */
static __cold void io_ring_exit_work(struct work_struct *work)
{
        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
        /* five minutes from now; once passed we warn and retry less often */
        unsigned long timeout = jiffies + HZ * 60 * 5;
        /* length of each wait on ref_comp between cancellation passes */
        unsigned long interval = HZ / 20;
        struct io_tctx_exit exit;
        struct io_tctx_node *node;
        int ret;

        /*
         * If we're doing polled IO and end up having requests being
         * submitted async (out-of-line), then completions can come in while
         * we're waiting for refs to drop. We need to reap these manually,
         * as nobody else will be looking for them.
         */
        do {
                io_uring_try_cancel_requests(ctx, NULL, true);
                if (ctx->sq_data) {
                        struct io_sq_data *sqd = ctx->sq_data;
                        struct task_struct *tsk;

                        /* sqd->thread is only inspected between park and unpark */
                        io_sq_thread_park(sqd);
                        tsk = sqd->thread;
                        if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
                                io_wq_cancel_cb(tsk->io_uring->io_wq,
                                                io_cancel_ctx_cb, ctx, true);
                        io_sq_thread_unpark(sqd);
                }

                io_req_caches_free(ctx);

                if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
                        /* there is little hope left, don't run it too often */
                        interval = HZ * 60;
                }
        } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));

        /* one on-stack request is reused for every task visited below */
        init_completion(&exit.completion);
        init_task_work(&exit.task_work, io_tctx_exit_cb);
        exit.ctx = ctx;
        /*
         * Some may use context even when all refs and requests have been put,
         * and they are free to do so while still holding uring_lock or
         * completion_lock, see io_req_task_submit(). Apart from other work,
         * this lock/unlock section also waits them to finish.
         */
        mutex_lock(&ctx->uring_lock);
        while (!list_empty(&ctx->tctx_list)) {
                WARN_ON_ONCE(time_after(jiffies, timeout));

                node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
                                        ctx_node);
                /* don't spin on a single task if cancellation failed */
                list_rotate_left(&ctx->tctx_list);
                ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
                if (WARN_ON_ONCE(ret))
                        continue;

                /* the callback is waited for with uring_lock dropped */
                mutex_unlock(&ctx->uring_lock);
                wait_for_completion(&exit.completion);
                mutex_lock(&ctx->uring_lock);
        }
        mutex_unlock(&ctx->uring_lock);
        /* empty critical section: waits out any current completion_lock holder */
        spin_lock(&ctx->completion_lock);
        spin_unlock(&ctx->completion_lock);

        io_ring_ctx_free(ctx);
}
2649
2650 static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
2651 {
2652         unsigned long index;
2653         struct creds *creds;
2654
2655         mutex_lock(&ctx->uring_lock);
2656         percpu_ref_kill(&ctx->refs);
2657         if (ctx->rings)
2658                 __io_cqring_overflow_flush(ctx, true);
2659         xa_for_each(&ctx->personalities, index, creds)
2660                 io_unregister_personality(ctx, index);
2661         if (ctx->rings)
2662                 io_poll_remove_all(ctx, NULL, true);
2663         mutex_unlock(&ctx->uring_lock);
2664
2665         /* failed during ring init, it couldn't have issued any requests */
2666         if (ctx->rings) {
2667                 io_kill_timeouts(ctx, NULL, true);
2668                 /* if we failed setting up the ctx, we might not have any rings */
2669                 io_iopoll_try_reap_events(ctx);
2670         }
2671
2672         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
2673         /*
2674          * Use system_unbound_wq to avoid spawning tons of event kworkers
2675          * if we're exiting a ton of rings at the same time. It just adds
2676          * noise and overhead, there's no discernable change in runtime
2677          * over using system_wq.
2678          */
2679         queue_work(system_unbound_wq, &ctx->exit_work);
2680 }
2681
2682 static int io_uring_release(struct inode *inode, struct file *file)
2683 {
2684         struct io_ring_ctx *ctx = file->private_data;
2685
2686         file->private_data = NULL;
2687         io_ring_ctx_wait_and_kill(ctx);
2688         return 0;
2689 }
2690
/* Match criteria handed to io_cancel_task_cb() through its @data pointer. */
struct io_task_cancel {
        /* task passed on to io_match_task_safe() */
        struct task_struct *task;
        /* "cancel all" flag passed on to io_match_task_safe() */
        bool all;
};
2695
2696 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
2697 {
2698         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2699         struct io_task_cancel *cancel = data;
2700
2701         return io_match_task_safe(req, cancel->task, cancel->all);
2702 }
2703
2704 static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
2705                                          struct task_struct *task,
2706                                          bool cancel_all)
2707 {
2708         struct io_defer_entry *de;
2709         LIST_HEAD(list);
2710
2711         spin_lock(&ctx->completion_lock);
2712         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
2713                 if (io_match_task_safe(de->req, task, cancel_all)) {
2714                         list_cut_position(&list, &ctx->defer_list, &de->list);
2715                         break;
2716                 }
2717         }
2718         spin_unlock(&ctx->completion_lock);
2719         if (list_empty(&list))
2720                 return false;
2721
2722         while (!list_empty(&list)) {
2723                 de = list_first_entry(&list, struct io_defer_entry, list);
2724                 list_del_init(&de->list);
2725                 io_req_complete_failed(de->req, -ECANCELED);
2726                 kfree(de);
2727         }
2728         return true;
2729 }
2730
2731 static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
2732 {
2733         struct io_tctx_node *node;
2734         enum io_wq_cancel cret;
2735         bool ret = false;
2736
2737         mutex_lock(&ctx->uring_lock);
2738         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
2739                 struct io_uring_task *tctx = node->task->io_uring;
2740
2741                 /*
2742                  * io_wq will stay alive while we hold uring_lock, because it's
2743                  * killed after ctx nodes, which requires to take the lock.
2744                  */
2745                 if (!tctx || !tctx->io_wq)
2746                         continue;
2747                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
2748                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
2749         }
2750         mutex_unlock(&ctx->uring_lock);
2751
2752         return ret;
2753 }
2754
2755 static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
2756                                                 struct task_struct *task,
2757                                                 bool cancel_all)
2758 {
2759         struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
2760         struct io_uring_task *tctx = task ? task->io_uring : NULL;
2761
2762         /* failed during ring init, it couldn't have issued any requests */
2763         if (!ctx->rings)
2764                 return;
2765
2766         while (1) {
2767                 enum io_wq_cancel cret;
2768                 bool ret = false;
2769
2770                 if (!task) {
2771                         ret |= io_uring_try_cancel_iowq(ctx);
2772                 } else if (tctx && tctx->io_wq) {
2773                         /*
2774                          * Cancels requests of all rings, not only @ctx, but
2775                          * it's fine as the task is in exit/exec.
2776                          */
2777                         cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
2778                                                &cancel, true);
2779                         ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
2780                 }
2781
2782                 /* SQPOLL thread does its own polling */
2783                 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
2784                     (ctx->sq_data && ctx->sq_data->thread == current)) {
2785                         while (!wq_list_empty(&ctx->iopoll_list)) {
2786                                 io_iopoll_try_reap_events(ctx);
2787                                 ret = true;
2788                         }
2789                 }
2790
2791                 ret |= io_cancel_defer_files(ctx, task, cancel_all);
2792                 mutex_lock(&ctx->uring_lock);
2793                 ret |= io_poll_remove_all(ctx, task, cancel_all);
2794                 mutex_unlock(&ctx->uring_lock);
2795                 ret |= io_kill_timeouts(ctx, task, cancel_all);
2796                 if (task)
2797                         ret |= io_run_task_work();
2798                 if (!ret)
2799                         break;
2800                 cond_resched();
2801         }
2802 }
2803
2804 static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
2805 {
2806         if (tracked)
2807                 return atomic_read(&tctx->inflight_tracked);
2808         return percpu_counter_sum(&tctx->inflight);
2809 }
2810
/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
 *
 * Loops until tctx_inflight() reports zero for the current task. With
 * @cancel_all the untracked per-cpu counter is used and the task's io_uring
 * context is freed at the end; otherwise only tracked requests are waited for.
 */
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx;
        s64 inflight;
        DEFINE_WAIT(wait);

        WARN_ON_ONCE(sqd && sqd->thread != current);

        /* nothing to cancel if the task has no io_uring context */
        if (!current->io_uring)
                return;
        if (tctx->io_wq)
                io_wq_exit_start(tctx->io_wq);

        /* flag the task as cancelling; io_tctx_exit_cb() checks this */
        atomic_inc(&tctx->in_idle);
        do {
                io_uring_drop_tctx_refs(current);
                /* read completions before cancelations */
                inflight = tctx_inflight(tctx, !cancel_all);
                if (!inflight)
                        break;

                if (!sqd) {
                        struct io_tctx_node *node;
                        unsigned long index;

                        /* regular task: walk every ring this task is attached to */
                        xa_for_each(&tctx->xa, index, node) {
                                /* sqpoll task will cancel all its requests */
                                if (node->ctx->sq_data)
                                        continue;
                                io_uring_try_cancel_requests(node->ctx, current,
                                                             cancel_all);
                        }
                } else {
                        /* SQPOLL thread: walk the rings on sqd->ctx_list */
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_uring_try_cancel_requests(ctx, current,
                                                             cancel_all);
                }

                /* get on the waitqueue before re-checking the inflight count */
                prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
                io_run_task_work();
                io_uring_drop_tctx_refs(current);

                /*
                 * If we've seen completions, retry without waiting. This
                 * avoids a race where a completion comes in before we did
                 * prepare_to_wait().
                 */
                if (inflight == tctx_inflight(tctx, !cancel_all))
                        schedule();
                finish_wait(&tctx->wait, &wait);
        } while (1);

        io_uring_clean_tctx(tctx);
        if (cancel_all) {
                /*
                 * We shouldn't run task_works after cancel, so just leave
                 * ->in_idle set for normal exit.
                 */
                atomic_dec(&tctx->in_idle);
                /* for exec all current's requests should be gone, kill tctx */
                __io_uring_free(current);
        }
}
2879
2880 void __io_uring_cancel(bool cancel_all)
2881 {
2882         io_uring_cancel_generic(cancel_all, NULL);
2883 }
2884
2885 static void *io_uring_validate_mmap_request(struct file *file,
2886                                             loff_t pgoff, size_t sz)
2887 {
2888         struct io_ring_ctx *ctx = file->private_data;
2889         loff_t offset = pgoff << PAGE_SHIFT;
2890         struct page *page;
2891         void *ptr;
2892
2893         switch (offset) {
2894         case IORING_OFF_SQ_RING:
2895         case IORING_OFF_CQ_RING:
2896                 ptr = ctx->rings;
2897                 break;
2898         case IORING_OFF_SQES:
2899                 ptr = ctx->sq_sqes;
2900                 break;
2901         default:
2902                 return ERR_PTR(-EINVAL);
2903         }
2904
2905         page = virt_to_head_page(ptr);
2906         if (sz > page_size(page))
2907                 return ERR_PTR(-EINVAL);
2908
2909         return ptr;
2910 }
2911
2912 #ifdef CONFIG_MMU
2913
2914 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2915 {
2916         size_t sz = vma->vm_end - vma->vm_start;
2917         unsigned long pfn;
2918         void *ptr;
2919
2920         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
2921         if (IS_ERR(ptr))
2922                 return PTR_ERR(ptr);
2923
2924         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
2925         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
2926 }
2927
2928 #else /* !CONFIG_MMU */
2929
2930 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
2931 {
2932         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
2933 }
2934
2935 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
2936 {
2937         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
2938 }
2939
/*
 * ->get_unmapped_area() handler (!MMU): return the kernel address of the
 * requested region itself, or the error from validation.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
        unsigned long addr, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
        void *region = io_uring_validate_mmap_request(file, pgoff, len);

        return IS_ERR(region) ? PTR_ERR(region) : (unsigned long) region;
}
2952
2953 #endif /* !CONFIG_MMU */
2954
2955 static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
2956 {
2957         if (flags & IORING_ENTER_EXT_ARG) {
2958                 struct io_uring_getevents_arg arg;
2959
2960                 if (argsz != sizeof(arg))
2961                         return -EINVAL;
2962                 if (copy_from_user(&arg, argp, sizeof(arg)))
2963                         return -EFAULT;
2964         }
2965         return 0;
2966 }
2967
2968 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
2969                           struct __kernel_timespec __user **ts,
2970                           const sigset_t __user **sig)
2971 {
2972         struct io_uring_getevents_arg arg;
2973
2974         /*
2975          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
2976          * is just a pointer to the sigset_t.
2977          */
2978         if (!(flags & IORING_ENTER_EXT_ARG)) {
2979                 *sig = (const sigset_t __user *) argp;
2980                 *ts = NULL;
2981                 return 0;
2982         }
2983
2984         /*
2985          * EXT_ARG is set - ensure we agree on the size of it and copy in our
2986          * timespec and sigset_t pointers if good.
2987          */
2988         if (*argsz != sizeof(arg))
2989                 return -EINVAL;
2990         if (copy_from_user(&arg, argp, sizeof(arg)))
2991                 return -EFAULT;
2992         if (arg.pad)
2993                 return -EINVAL;
2994         *sig = u64_to_user_ptr(arg.sigmask);
2995         *argsz = arg.sigmask_sz;
2996         *ts = u64_to_user_ptr(arg.ts);
2997         return 0;
2998 }
2999
/*
 * io_uring_enter() system call: submit up to @to_submit SQEs on the ring
 * named by @fd and, with IORING_ENTER_GETEVENTS, wait for up to
 * @min_complete completions. @argp/@argsz carry a sigset_t or, with
 * IORING_ENTER_EXT_ARG, a struct io_uring_getevents_arg.
 *
 * Returns the submission result when it is non-zero; otherwise the result of
 * the wait (if one was requested), or a negative error.
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                u32, min_complete, u32, flags, const void __user *, argp,
                size_t, argsz)
{
        struct io_ring_ctx *ctx;
        struct fd f;
        long ret;

        io_run_task_work();

        /* reject any flag bit we don't know about */
        if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
                               IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
                               IORING_ENTER_REGISTERED_RING)))
                return -EINVAL;

        /*
         * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
         * need only dereference our task private array to find it.
         */
        if (flags & IORING_ENTER_REGISTERED_RING) {
                struct io_uring_task *tctx = current->io_uring;

                if (!tctx || fd >= IO_RINGFD_REG_MAX)
                        return -EINVAL;
                fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
                f.file = tctx->registered_rings[fd];
                /* NOTE(review): flags == 0 presumably makes fdput() below a no-op - confirm */
                f.flags = 0;
        } else {
                f = fdget(fd);
        }

        if (unlikely(!f.file))
                return -EBADF;

        ret = -EOPNOTSUPP;
        if (unlikely(!io_is_uring_fops(f.file)))
                goto out_fput;

        /* the ctx reference is held until the out: label */
        ret = -ENXIO;
        ctx = f.file->private_data;
        if (unlikely(!percpu_ref_tryget(&ctx->refs)))
                goto out_fput;

        ret = -EBADFD;
        if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
                goto out;

        /*
         * For SQ polling, the thread will do all submissions and completions.
         * Just return the requested submit count, and wake the thread if
         * we were asked to.
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                io_cqring_overflow_flush(ctx);

                if (unlikely(ctx->sq_data->thread == NULL)) {
                        ret = -EOWNERDEAD;
                        goto out;
                }
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT) {
                        ret = io_sqpoll_wait_sq(ctx);
                        if (ret)
                                goto out;
                }
                ret = to_submit;
        } else if (to_submit) {
                ret = io_uring_add_tctx_node(ctx);
                if (unlikely(ret))
                        goto out;

                mutex_lock(&ctx->uring_lock);
                ret = io_submit_sqes(ctx, to_submit);
                /* anything but a full submission skips the wait phase */
                if (ret != to_submit) {
                        mutex_unlock(&ctx->uring_lock);
                        goto out;
                }
                /* keep uring_lock held and jump straight into the iopoll wait */
                if ((flags & IORING_ENTER_GETEVENTS) && ctx->syscall_iopoll)
                        goto iopoll_locked;
                mutex_unlock(&ctx->uring_lock);
        }
        if (flags & IORING_ENTER_GETEVENTS) {
                int ret2;
                if (ctx->syscall_iopoll) {
                        /*
                         * We disallow the app entering submit/complete with
                         * polling, but we still need to lock the ring to
                         * prevent racing with polled issue that got punted to
                         * a workqueue.
                         */
                        mutex_lock(&ctx->uring_lock);
iopoll_locked:
                        ret2 = io_validate_ext_arg(flags, argp, argsz);
                        if (likely(!ret2)) {
                                min_complete = min(min_complete,
                                                   ctx->cq_entries);
                                ret2 = io_iopoll_check(ctx, min_complete);
                        }
                        mutex_unlock(&ctx->uring_lock);
                } else {
                        const sigset_t __user *sig;
                        struct __kernel_timespec __user *ts;

                        ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
                        if (likely(!ret2)) {
                                min_complete = min(min_complete,
                                                   ctx->cq_entries);
                                ret2 = io_cqring_wait(ctx, min_complete, sig,
                                                      argsz, ts);
                        }
                }

                /* a non-zero submission result takes precedence over the wait result */
                if (!ret) {
                        ret = ret2;

                        /*
                         * EBADR indicates that one or more CQE were dropped.
                         * Once the user has been informed we can clear the bit
                         * as they are obviously ok with those drops.
                         */
                        if (unlikely(ret2 == -EBADR))
                                clear_bit(IO_CHECK_CQ_DROPPED_BIT,
                                          &ctx->check_cq);
                }
        }

out:
        percpu_ref_put(&ctx->refs);
out_fput:
        fdput(f);
        return ret;
}
3134
/*
 * File operations of the ring file. io_is_uring_fops() identifies ring files
 * by comparing a file's f_op against this table.
 */
static const struct file_operations io_uring_fops = {
        .release        = io_uring_release,
        .mmap           = io_uring_mmap,
#ifndef CONFIG_MMU
        /* !MMU builds additionally provide placement and capability hooks */
        .get_unmapped_area = io_uring_nommu_get_unmapped_area,
        .mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
        .poll           = io_uring_poll,
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = io_uring_show_fdinfo,
#endif
};
3147
3148 bool io_is_uring_fops(struct file *file)
3149 {
3150         return file->f_op == &io_uring_fops;
3151 }
3152
3153 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
3154                                          struct io_uring_params *p)
3155 {
3156         struct io_rings *rings;
3157         size_t size, sq_array_offset;
3158
3159         /* make sure these are sane, as we already accounted them */
3160         ctx->sq_entries = p->sq_entries;
3161         ctx->cq_entries = p->cq_entries;
3162
3163         size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
3164         if (size == SIZE_MAX)
3165                 return -EOVERFLOW;
3166
3167         rings = io_mem_alloc(size);
3168         if (!rings)
3169                 return -ENOMEM;
3170
3171         ctx->rings = rings;
3172         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
3173         rings->sq_ring_mask = p->sq_entries - 1;
3174         rings->cq_ring_mask = p->cq_entries - 1;
3175         rings->sq_ring_entries = p->sq_entries;
3176         rings->cq_ring_entries = p->cq_entries;
3177
3178         if (p->flags & IORING_SETUP_SQE128)
3179                 size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
3180         else
3181                 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3182         if (size == SIZE_MAX) {
3183                 io_mem_free(ctx->rings);
3184                 ctx->rings = NULL;
3185                 return -EOVERFLOW;
3186         }
3187
3188         ctx->sq_sqes = io_mem_alloc(size);
3189         if (!ctx->sq_sqes) {
3190                 io_mem_free(ctx->rings);
3191                 ctx->rings = NULL;
3192                 return -ENOMEM;
3193         }
3194
3195         return 0;
3196 }
3197
3198 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
3199 {
3200         int ret, fd;
3201
3202         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
3203         if (fd < 0)
3204                 return fd;
3205
3206         ret = __io_uring_add_tctx_node(ctx, false);
3207         if (ret) {
3208                 put_unused_fd(fd);
3209                 return ret;
3210         }
3211         fd_install(fd, file);
3212         return fd;
3213 }
3214
3215 /*
3216  * Allocate an anonymous fd, this is what constitutes the application
3217  * visible backing of an io_uring instance. The application mmaps this
3218  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
3219  * we have to tie this fd to a socket for file garbage collection purposes.
3220  */
3221 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
3222 {
3223         struct file *file;
3224 #if defined(CONFIG_UNIX)
3225         int ret;
3226
3227         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
3228                                 &ctx->ring_sock);
3229         if (ret)
3230                 return ERR_PTR(ret);
3231 #endif
3232
3233         file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
3234                                          O_RDWR | O_CLOEXEC, NULL);
3235 #if defined(CONFIG_UNIX)
3236         if (IS_ERR(file)) {
3237                 sock_release(ctx->ring_sock);
3238                 ctx->ring_sock = NULL;
3239         } else {
3240                 ctx->ring_sock->file = file;
3241         }
3242 #endif
3243         return file;
3244 }
3245
3246 static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
3247                                   struct io_uring_params __user *params)
3248 {
3249         struct io_ring_ctx *ctx;
3250         struct file *file;
3251         int ret;
3252
3253         if (!entries)
3254                 return -EINVAL;
3255         if (entries > IORING_MAX_ENTRIES) {
3256                 if (!(p->flags & IORING_SETUP_CLAMP))
3257                         return -EINVAL;
3258                 entries = IORING_MAX_ENTRIES;
3259         }
3260
3261         /*
3262          * Use twice as many entries for the CQ ring. It's possible for the
3263          * application to drive a higher depth than the size of the SQ ring,
3264          * since the sqes are only used at submission time. This allows for
3265          * some flexibility in overcommitting a bit. If the application has
3266          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
3267          * of CQ ring entries manually.
3268          */
3269         p->sq_entries = roundup_pow_of_two(entries);
3270         if (p->flags & IORING_SETUP_CQSIZE) {
3271                 /*
3272                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
3273                  * to a power-of-two, if it isn't already. We do NOT impose
3274                  * any cq vs sq ring sizing.
3275                  */
3276                 if (!p->cq_entries)
3277                         return -EINVAL;
3278                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
3279                         if (!(p->flags & IORING_SETUP_CLAMP))
3280                                 return -EINVAL;
3281                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
3282                 }
3283                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
3284                 if (p->cq_entries < p->sq_entries)
3285                         return -EINVAL;
3286         } else {
3287                 p->cq_entries = 2 * p->sq_entries;
3288         }
3289
3290         ctx = io_ring_ctx_alloc(p);
3291         if (!ctx)
3292                 return -ENOMEM;
3293
3294         /*
3295          * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
3296          * space applications don't need to do io completion events
3297          * polling again, they can rely on io_sq_thread to do polling
3298          * work, which can reduce cpu usage and uring_lock contention.
3299          */
3300         if (ctx->flags & IORING_SETUP_IOPOLL &&
3301             !(ctx->flags & IORING_SETUP_SQPOLL))
3302                 ctx->syscall_iopoll = 1;
3303
3304         ctx->compat = in_compat_syscall();
3305         if (!capable(CAP_IPC_LOCK))
3306                 ctx->user = get_uid(current_user());
3307
3308         /*
3309          * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
3310          * COOP_TASKRUN is set, then IPIs are never needed by the app.
3311          */
3312         ret = -EINVAL;
3313         if (ctx->flags & IORING_SETUP_SQPOLL) {
3314                 /* IPI related flags don't make sense with SQPOLL */
3315                 if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
3316                                   IORING_SETUP_TASKRUN_FLAG))
3317                         goto err;
3318                 ctx->notify_method = TWA_SIGNAL_NO_IPI;
3319         } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
3320                 ctx->notify_method = TWA_SIGNAL_NO_IPI;
3321         } else {
3322                 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
3323                         goto err;
3324                 ctx->notify_method = TWA_SIGNAL;
3325         }
3326
3327         /*
3328          * This is just grabbed for accounting purposes. When a process exits,
3329          * the mm is exited and dropped before the files, hence we need to hang
3330          * on to this mm purely for the purposes of being able to unaccount
3331          * memory (locked/pinned vm). It's not used for anything else.
3332          */
3333         mmgrab(current->mm);
3334         ctx->mm_account = current->mm;
3335
3336         ret = io_allocate_scq_urings(ctx, p);
3337         if (ret)
3338                 goto err;
3339
3340         ret = io_sq_offload_create(ctx, p);
3341         if (ret)
3342                 goto err;
3343         /* always set a rsrc node */
3344         ret = io_rsrc_node_switch_start(ctx);
3345         if (ret)
3346                 goto err;
3347         io_rsrc_node_switch(ctx, NULL);
3348
3349         memset(&p->sq_off, 0, sizeof(p->sq_off));
3350         p->sq_off.head = offsetof(struct io_rings, sq.head);
3351         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
3352         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
3353         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
3354         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
3355         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
3356         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
3357
3358         memset(&p->cq_off, 0, sizeof(p->cq_off));
3359         p->cq_off.head = offsetof(struct io_rings, cq.head);
3360         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
3361         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
3362         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
3363         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
3364         p->cq_off.cqes = offsetof(struct io_rings, cqes);
3365         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
3366
3367         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
3368                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
3369                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
3370                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
3371                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
3372                         IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
3373                         IORING_FEAT_LINKED_FILE;
3374
3375         if (copy_to_user(params, p, sizeof(*p))) {
3376                 ret = -EFAULT;
3377                 goto err;
3378         }
3379
3380         file = io_uring_get_file(ctx);
3381         if (IS_ERR(file)) {
3382                 ret = PTR_ERR(file);
3383                 goto err;
3384         }
3385
3386         /*
3387          * Install ring fd as the very last thing, so we don't risk someone
3388          * having closed it before we finish setup
3389          */
3390         ret = io_uring_install_fd(ctx, file);
3391         if (ret < 0) {
3392                 /* fput will clean it up */
3393                 fput(file);
3394                 return ret;
3395         }
3396
3397         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
3398         return ret;
3399 err:
3400         io_ring_ctx_wait_and_kill(ctx);
3401         return ret;
3402 }
3403
3404 /*
3405  * Sets up an aio uring context, and returns the fd. Applications asks for a
3406  * ring size, we return the actual sq/cq ring sizes (among other things) in the
3407  * params structure passed in.
3408  */
3409 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
3410 {
3411         struct io_uring_params p;
3412         int i;
3413
3414         if (copy_from_user(&p, params, sizeof(p)))
3415                 return -EFAULT;
3416         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
3417                 if (p.resv[i])
3418                         return -EINVAL;
3419         }
3420
3421         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
3422                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
3423                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
3424                         IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
3425                         IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
3426                         IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
3427                         IORING_SETUP_SINGLE_ISSUER))
3428                 return -EINVAL;
3429
3430         return io_uring_create(entries, &p, params);
3431 }
3432
3433 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
3434                 struct io_uring_params __user *, params)
3435 {
3436         return io_uring_setup(entries, params);
3437 }
3438
3439 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
3440                            unsigned nr_args)
3441 {
3442         struct io_uring_probe *p;
3443         size_t size;
3444         int i, ret;
3445
3446         size = struct_size(p, ops, nr_args);
3447         if (size == SIZE_MAX)
3448                 return -EOVERFLOW;
3449         p = kzalloc(size, GFP_KERNEL);
3450         if (!p)
3451                 return -ENOMEM;
3452
3453         ret = -EFAULT;
3454         if (copy_from_user(p, arg, size))
3455                 goto out;
3456         ret = -EINVAL;
3457         if (memchr_inv(p, 0, size))
3458                 goto out;
3459
3460         p->last_op = IORING_OP_LAST - 1;
3461         if (nr_args > IORING_OP_LAST)
3462                 nr_args = IORING_OP_LAST;
3463
3464         for (i = 0; i < nr_args; i++) {
3465                 p->ops[i].op = i;
3466                 if (!io_op_defs[i].not_supported)
3467                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
3468         }
3469         p->ops_len = i;
3470
3471         ret = 0;
3472         if (copy_to_user(arg, p, size))
3473                 ret = -EFAULT;
3474 out:
3475         kfree(p);
3476         return ret;
3477 }
3478
3479 static int io_register_personality(struct io_ring_ctx *ctx)
3480 {
3481         const struct cred *creds;
3482         u32 id;
3483         int ret;
3484
3485         creds = get_current_cred();
3486
3487         ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
3488                         XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
3489         if (ret < 0) {
3490                 put_cred(creds);
3491                 return ret;
3492         }
3493         return id;
3494 }
3495
3496 static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
3497                                            void __user *arg, unsigned int nr_args)
3498 {
3499         struct io_uring_restriction *res;
3500         size_t size;
3501         int i, ret;
3502
3503         /* Restrictions allowed only if rings started disabled */
3504         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
3505                 return -EBADFD;
3506
3507         /* We allow only a single restrictions registration */
3508         if (ctx->restrictions.registered)
3509                 return -EBUSY;
3510
3511         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
3512                 return -EINVAL;
3513
3514         size = array_size(nr_args, sizeof(*res));
3515         if (size == SIZE_MAX)
3516                 return -EOVERFLOW;
3517
3518         res = memdup_user(arg, size);
3519         if (IS_ERR(res))
3520                 return PTR_ERR(res);
3521
3522         ret = 0;
3523
3524         for (i = 0; i < nr_args; i++) {
3525                 switch (res[i].opcode) {
3526                 case IORING_RESTRICTION_REGISTER_OP:
3527                         if (res[i].register_op >= IORING_REGISTER_LAST) {
3528                                 ret = -EINVAL;
3529                                 goto out;
3530                         }
3531
3532                         __set_bit(res[i].register_op,
3533                                   ctx->restrictions.register_op);
3534                         break;
3535                 case IORING_RESTRICTION_SQE_OP:
3536                         if (res[i].sqe_op >= IORING_OP_LAST) {
3537                                 ret = -EINVAL;
3538                                 goto out;
3539                         }
3540
3541                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
3542                         break;
3543                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
3544                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
3545                         break;
3546                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
3547                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
3548                         break;
3549                 default:
3550                         ret = -EINVAL;
3551                         goto out;
3552                 }
3553         }
3554
3555 out:
3556         /* Reset all restrictions if an error happened */
3557         if (ret != 0)
3558                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
3559         else
3560                 ctx->restrictions.registered = true;
3561
3562         kfree(res);
3563         return ret;
3564 }
3565
3566 static int io_register_enable_rings(struct io_ring_ctx *ctx)
3567 {
3568         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
3569                 return -EBADFD;
3570
3571         if (ctx->restrictions.registered)
3572                 ctx->restricted = 1;
3573
3574         ctx->flags &= ~IORING_SETUP_R_DISABLED;
3575         if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
3576                 wake_up(&ctx->sq_data->wait);
3577         return 0;
3578 }
3579
3580 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
3581                                        void __user *arg, unsigned len)
3582 {
3583         struct io_uring_task *tctx = current->io_uring;
3584         cpumask_var_t new_mask;
3585         int ret;
3586
3587         if (!tctx || !tctx->io_wq)
3588                 return -EINVAL;
3589
3590         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
3591                 return -ENOMEM;
3592
3593         cpumask_clear(new_mask);
3594         if (len > cpumask_size())
3595                 len = cpumask_size();
3596
3597         if (in_compat_syscall()) {
3598                 ret = compat_get_bitmap(cpumask_bits(new_mask),
3599                                         (const compat_ulong_t __user *)arg,
3600                                         len * 8 /* CHAR_BIT */);
3601         } else {
3602                 ret = copy_from_user(new_mask, arg, len);
3603         }
3604
3605         if (ret) {
3606                 free_cpumask_var(new_mask);
3607                 return -EFAULT;
3608         }
3609
3610         ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
3611         free_cpumask_var(new_mask);
3612         return ret;
3613 }
3614
3615 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
3616 {
3617         struct io_uring_task *tctx = current->io_uring;
3618
3619         if (!tctx || !tctx->io_wq)
3620                 return -EINVAL;
3621
3622         return io_wq_cpu_affinity(tctx->io_wq, NULL);
3623 }
3624
3625 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
3626                                                void __user *arg)
3627         __must_hold(&ctx->uring_lock)
3628 {
3629         struct io_tctx_node *node;
3630         struct io_uring_task *tctx = NULL;
3631         struct io_sq_data *sqd = NULL;
3632         __u32 new_count[2];
3633         int i, ret;
3634
3635         if (copy_from_user(new_count, arg, sizeof(new_count)))
3636                 return -EFAULT;
3637         for (i = 0; i < ARRAY_SIZE(new_count); i++)
3638                 if (new_count[i] > INT_MAX)
3639                         return -EINVAL;
3640
3641         if (ctx->flags & IORING_SETUP_SQPOLL) {
3642                 sqd = ctx->sq_data;
3643                 if (sqd) {
3644                         /*
3645                          * Observe the correct sqd->lock -> ctx->uring_lock
3646                          * ordering. Fine to drop uring_lock here, we hold
3647                          * a ref to the ctx.
3648                          */
3649                         refcount_inc(&sqd->refs);
3650                         mutex_unlock(&ctx->uring_lock);
3651                         mutex_lock(&sqd->lock);
3652                         mutex_lock(&ctx->uring_lock);
3653                         if (sqd->thread)
3654                                 tctx = sqd->thread->io_uring;
3655                 }
3656         } else {
3657                 tctx = current->io_uring;
3658         }
3659
3660         BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
3661
3662         for (i = 0; i < ARRAY_SIZE(new_count); i++)
3663                 if (new_count[i])
3664                         ctx->iowq_limits[i] = new_count[i];
3665         ctx->iowq_limits_set = true;
3666
3667         if (tctx && tctx->io_wq) {
3668                 ret = io_wq_max_workers(tctx->io_wq, new_count);
3669                 if (ret)
3670                         goto err;
3671         } else {
3672                 memset(new_count, 0, sizeof(new_count));
3673         }
3674
3675         if (sqd) {
3676                 mutex_unlock(&sqd->lock);
3677                 io_put_sq_data(sqd);
3678         }
3679
3680         if (copy_to_user(arg, new_count, sizeof(new_count)))
3681                 return -EFAULT;
3682
3683         /* that's it for SQPOLL, only the SQPOLL task creates requests */
3684         if (sqd)
3685                 return 0;
3686
3687         /* now propagate the restriction to all registered users */
3688         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
3689                 struct io_uring_task *tctx = node->task->io_uring;
3690
3691                 if (WARN_ON_ONCE(!tctx->io_wq))
3692                         continue;
3693
3694                 for (i = 0; i < ARRAY_SIZE(new_count); i++)
3695                         new_count[i] = ctx->iowq_limits[i];
3696                 /* ignore errors, it always returns zero anyway */
3697                 (void)io_wq_max_workers(tctx->io_wq, new_count);
3698         }
3699         return 0;
3700 err:
3701         if (sqd) {
3702                 mutex_unlock(&sqd->lock);
3703                 io_put_sq_data(sqd);
3704         }
3705         return ret;
3706 }
3707
3708 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
3709                                void __user *arg, unsigned nr_args)
3710         __releases(ctx->uring_lock)
3711         __acquires(ctx->uring_lock)
3712 {
3713         int ret;
3714
3715         /*
3716          * We're inside the ring mutex, if the ref is already dying, then
3717          * someone else killed the ctx or is already going through
3718          * io_uring_register().
3719          */
3720         if (percpu_ref_is_dying(&ctx->refs))
3721                 return -ENXIO;
3722
3723         if (ctx->restricted) {
3724                 if (opcode >= IORING_REGISTER_LAST)
3725                         return -EINVAL;
3726                 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
3727                 if (!test_bit(opcode, ctx->restrictions.register_op))
3728                         return -EACCES;
3729         }
3730
3731         switch (opcode) {
3732         case IORING_REGISTER_BUFFERS:
3733                 ret = -EFAULT;
3734                 if (!arg)
3735                         break;
3736                 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
3737                 break;
3738         case IORING_UNREGISTER_BUFFERS:
3739                 ret = -EINVAL;
3740                 if (arg || nr_args)
3741                         break;
3742                 ret = io_sqe_buffers_unregister(ctx);
3743                 break;
3744         case IORING_REGISTER_FILES:
3745                 ret = -EFAULT;
3746                 if (!arg)
3747                         break;
3748                 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
3749                 break;
3750         case IORING_UNREGISTER_FILES:
3751                 ret = -EINVAL;
3752                 if (arg || nr_args)
3753                         break;
3754                 ret = io_sqe_files_unregister(ctx);
3755                 break;
3756         case IORING_REGISTER_FILES_UPDATE:
3757                 ret = io_register_files_update(ctx, arg, nr_args);
3758                 break;
3759         case IORING_REGISTER_EVENTFD:
3760                 ret = -EINVAL;
3761                 if (nr_args != 1)
3762                         break;
3763                 ret = io_eventfd_register(ctx, arg, 0);
3764                 break;
3765         case IORING_REGISTER_EVENTFD_ASYNC:
3766                 ret = -EINVAL;
3767                 if (nr_args != 1)
3768                         break;
3769                 ret = io_eventfd_register(ctx, arg, 1);
3770                 break;
3771         case IORING_UNREGISTER_EVENTFD:
3772                 ret = -EINVAL;
3773                 if (arg || nr_args)
3774                         break;
3775                 ret = io_eventfd_unregister(ctx);
3776                 break;
3777         case IORING_REGISTER_PROBE:
3778                 ret = -EINVAL;
3779                 if (!arg || nr_args > 256)
3780                         break;
3781                 ret = io_probe(ctx, arg, nr_args);
3782                 break;
3783         case IORING_REGISTER_PERSONALITY:
3784                 ret = -EINVAL;
3785                 if (arg || nr_args)
3786                         break;
3787                 ret = io_register_personality(ctx);
3788                 break;
3789         case IORING_UNREGISTER_PERSONALITY:
3790                 ret = -EINVAL;
3791                 if (arg)
3792                         break;
3793                 ret = io_unregister_personality(ctx, nr_args);
3794                 break;
3795         case IORING_REGISTER_ENABLE_RINGS:
3796                 ret = -EINVAL;
3797                 if (arg || nr_args)
3798                         break;
3799                 ret = io_register_enable_rings(ctx);
3800                 break;
3801         case IORING_REGISTER_RESTRICTIONS:
3802                 ret = io_register_restrictions(ctx, arg, nr_args);
3803                 break;
3804         case IORING_REGISTER_FILES2:
3805                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
3806                 break;
3807         case IORING_REGISTER_FILES_UPDATE2:
3808                 ret = io_register_rsrc_update(ctx, arg, nr_args,
3809                                               IORING_RSRC_FILE);
3810                 break;
3811         case IORING_REGISTER_BUFFERS2:
3812                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
3813                 break;
3814         case IORING_REGISTER_BUFFERS_UPDATE:
3815                 ret = io_register_rsrc_update(ctx, arg, nr_args,
3816                                               IORING_RSRC_BUFFER);
3817                 break;
3818         case IORING_REGISTER_IOWQ_AFF:
3819                 ret = -EINVAL;
3820                 if (!arg || !nr_args)
3821                         break;
3822                 ret = io_register_iowq_aff(ctx, arg, nr_args);
3823                 break;
3824         case IORING_UNREGISTER_IOWQ_AFF:
3825                 ret = -EINVAL;
3826                 if (arg || nr_args)
3827                         break;
3828                 ret = io_unregister_iowq_aff(ctx);
3829                 break;
3830         case IORING_REGISTER_IOWQ_MAX_WORKERS:
3831                 ret = -EINVAL;
3832                 if (!arg || nr_args != 2)
3833                         break;
3834                 ret = io_register_iowq_max_workers(ctx, arg);
3835                 break;
3836         case IORING_REGISTER_RING_FDS:
3837                 ret = io_ringfd_register(ctx, arg, nr_args);
3838                 break;
3839         case IORING_UNREGISTER_RING_FDS:
3840                 ret = io_ringfd_unregister(ctx, arg, nr_args);
3841                 break;
3842         case IORING_REGISTER_PBUF_RING:
3843                 ret = -EINVAL;
3844                 if (!arg || nr_args != 1)
3845                         break;
3846                 ret = io_register_pbuf_ring(ctx, arg);
3847                 break;
3848         case IORING_UNREGISTER_PBUF_RING:
3849                 ret = -EINVAL;
3850                 if (!arg || nr_args != 1)
3851                         break;
3852                 ret = io_unregister_pbuf_ring(ctx, arg);
3853                 break;
3854         default:
3855                 ret = -EINVAL;
3856                 break;
3857         }
3858
3859         return ret;
3860 }
3861
3862 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
3863                 void __user *, arg, unsigned int, nr_args)
3864 {
3865         struct io_ring_ctx *ctx;
3866         long ret = -EBADF;
3867         struct fd f;
3868
3869         f = fdget(fd);
3870         if (!f.file)
3871                 return -EBADF;
3872
3873         ret = -EOPNOTSUPP;
3874         if (!io_is_uring_fops(f.file))
3875                 goto out_fput;
3876
3877         ctx = f.file->private_data;
3878
3879         io_run_task_work();
3880
3881         mutex_lock(&ctx->uring_lock);
3882         ret = __io_uring_register(ctx, opcode, arg, nr_args);
3883         mutex_unlock(&ctx->uring_lock);
3884         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
3885 out_fput:
3886         fdput(f);
3887         return ret;
3888 }
3889
3890 static int __init io_uring_init(void)
3891 {
3892 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
3893         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
3894         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
3895 } while (0)
3896
3897 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
3898         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
3899         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
3900         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
3901         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
3902         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
3903         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
3904         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
3905         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
3906         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
3907         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
3908         BUILD_BUG_SQE_ELEM(24, __u32,  len);
3909         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
3910         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
3911         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
3912         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
3913         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
3914         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
3915         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
3916         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
3917         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
3918         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
3919         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
3920         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
3921         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
3922         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
3923         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
3924         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
3925         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
3926         BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
3927         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
3928         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
3929         BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
3930         BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
3931
3932         BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
3933                      sizeof(struct io_uring_rsrc_update));
3934         BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
3935                      sizeof(struct io_uring_rsrc_update2));
3936
3937         /* ->buf_index is u16 */
3938         BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
3939         BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
3940                      offsetof(struct io_uring_buf_ring, tail));
3941
3942         /* should fit into one byte */
3943         BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
3944         BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
3945         BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
3946
3947         BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
3948
3949         BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
3950
3951         io_uring_optable_init();
3952
3953         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
3954                                 SLAB_ACCOUNT);
3955         return 0;
3956 };
3957 __initcall(io_uring_init);