fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <linux/refcount.h>
  48 #include <linux/uio.h>
  49
  50 #include <linux/sched/signal.h>
  51 #include <linux/fs.h>
  52 #include <linux/file.h>
  53 #include <linux/fdtable.h>
  54 #include <linux/mm.h>
  55 #include <linux/mman.h>
  56 #include <linux/mmu_context.h>
  57 #include <linux/percpu.h>
  58 #include <linux/slab.h>
  59 #include <linux/workqueue.h>
  60 #include <linux/kthread.h>
  61 #include <linux/blkdev.h>
  62 #include <linux/bvec.h>
  63 #include <linux/net.h>
  64 #include <net/sock.h>
  65 #include <net/af_unix.h>
  66 #include <net/scm.h>
  67 #include <linux/anon_inodes.h>
  68 #include <linux/sched/mm.h>
  69 #include <linux/uaccess.h>
  70 #include <linux/nospec.h>
  71 #include <linux/sizes.h>
  72 #include <linux/hugetlb.h>
  73
  74 #include <uapi/linux/io_uring.h>
  75
  76 #include "internal.h"
  77
  78 #define IORING_MAX_ENTRIES      4096
  79 #define IORING_MAX_FIXED_FILES  1024
  80
  81 struct io_uring {
  82         u32 head ____cacheline_aligned_in_smp;
  83         u32 tail ____cacheline_aligned_in_smp;
  84 };
  85
  86 /*
  87  * This data is shared with the application through the mmap at offset
  88  * IORING_OFF_SQ_RING.
  89  *
  90  * The offsets to the member fields are published through struct
  91  * io_sqring_offsets when calling io_uring_setup.
  92  */
  93 struct io_sq_ring {
  94         /*
  95          * Head and tail offsets into the ring; the offsets need to be
  96          * masked to get valid indices.
  97          *
  98          * The kernel controls head and the application controls tail.
  99          */
 100         struct io_uring         r;
 101         /*
 102          * Bitmask to apply to head and tail offsets (constant, equals
 103          * ring_entries - 1)
 104          */
 105         u32                     ring_mask;
 106         /* Ring size (constant, power of 2) */
 107         u32                     ring_entries;
 108         /*
 109          * Number of invalid entries dropped by the kernel due to
 110          * invalid index stored in array
 111          *
 112          * Written by the kernel, shouldn't be modified by the
 113          * application (i.e. get number of "new events" by comparing to
 114          * cached value).
 115          *
 116          * After a new SQ head value was read by the application this
 117          * counter includes all submissions that were dropped reaching
 118          * the new SQ head (and possibly more).
 119          */
 120         u32                     dropped;
 121         /*
 122          * Runtime flags
 123          *
 124          * Written by the kernel, shouldn't be modified by the
 125          * application.
 126          *
 127          * The application needs a full memory barrier before checking
 128          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 129          */
 130         u32                     flags;
 131         /*
 132          * Ring buffer of indices into array of io_uring_sqe, which is
 133          * mmapped by the application using the IORING_OFF_SQES offset.
 134          *
 135          * This indirection could e.g. be used to assign fixed
 136          * io_uring_sqe entries to operations and only submit them to
 137          * the queue when needed.
 138          *
 139          * The kernel modifies neither the indices array nor the entries
 140          * array.
 141          */
 142         u32                     array[];
 143 };
 144
 145 /*
 146  * This data is shared with the application through the mmap at offset
 147  * IORING_OFF_CQ_RING.
 148  *
 149  * The offsets to the member fields are published through struct
 150  * io_cqring_offsets when calling io_uring_setup.
 151  */
 152 struct io_cq_ring {
 153         /*
 154          * Head and tail offsets into the ring; the offsets need to be
 155          * masked to get valid indices.
 156          *
 157          * The application controls head and the kernel tail.
 158          */
 159         struct io_uring         r;
 160         /*
 161          * Bitmask to apply to head and tail offsets (constant, equals
 162          * ring_entries - 1)
 163          */
 164         u32                     ring_mask;
 165         /* Ring size (constant, power of 2) */
 166         u32                     ring_entries;
 167         /*
 168          * Number of completion events lost because the queue was full;
 169          * this should be avoided by the application by making sure
 170          * there are not more requests pending thatn there is space in
 171          * the completion queue.
 172          *
 173          * Written by the kernel, shouldn't be modified by the
 174          * application (i.e. get number of "new events" by comparing to
 175          * cached value).
 176          *
 177          * As completion events come in out of order this counter is not
 178          * ordered with any other data.
 179          */
 180         u32                     overflow;
 181         /*
 182          * Ring buffer of completion events.
 183          *
 184          * The kernel writes completion events fresh every time they are
 185          * produced, so the application is allowed to modify pending
 186          * entries.
 187          */
 188         struct io_uring_cqe     cqes[];
 189 };
 190
 191 struct io_mapped_ubuf {
 192         u64             ubuf;
 193         size_t          len;
 194         struct          bio_vec *bvec;
 195         unsigned int    nr_bvecs;
 196 };
 197
 198 struct async_list {
 199         spinlock_t              lock;
 200         atomic_t                cnt;
 201         struct list_head        list;
 202
 203         struct file             *file;
 204         off_t                   io_end;
 205         size_t                  io_pages;
 206 };
 207
 208 struct io_ring_ctx {
 209         struct {
 210                 struct percpu_ref       refs;
 211         } ____cacheline_aligned_in_smp;
 212
 213         struct {
 214                 unsigned int            flags;
 215                 bool                    compat;
 216                 bool                    account_mem;
 217
 218                 /* SQ ring */
 219                 struct io_sq_ring       *sq_ring;
 220                 unsigned                cached_sq_head;
 221                 unsigned                sq_entries;
 222                 unsigned                sq_mask;
 223                 unsigned                sq_thread_idle;
 224                 struct io_uring_sqe     *sq_sqes;
 225
 226                 struct list_head        defer_list;
 227         } ____cacheline_aligned_in_smp;
 228
 229         /* IO offload */
 230         struct workqueue_struct *sqo_wq;
 231         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 232         struct mm_struct        *sqo_mm;
 233         wait_queue_head_t       sqo_wait;
 234         struct completion       sqo_thread_started;
 235
 236         struct {
 237                 /* CQ ring */
 238                 struct io_cq_ring       *cq_ring;
 239                 unsigned                cached_cq_tail;
 240                 unsigned                cq_entries;
 241                 unsigned                cq_mask;
 242                 struct wait_queue_head  cq_wait;
 243                 struct fasync_struct    *cq_fasync;
 244                 struct eventfd_ctx      *cq_ev_fd;
 245         } ____cacheline_aligned_in_smp;
 246
 247         /*
 248          * If used, fixed file set. Writers must ensure that ->refs is dead,
 249          * readers must ensure that ->refs is alive as long as the file* is
 250          * used. Only updated through io_uring_register(2).
 251          */
 252         struct file             **user_files;
 253         unsigned                nr_user_files;
 254
 255         /* if used, fixed mapped user buffers */
 256         unsigned                nr_user_bufs;
 257         struct io_mapped_ubuf   *user_bufs;
 258
 259         struct user_struct      *user;
 260
 261         struct completion       ctx_done;
 262
 263         struct {
 264                 struct mutex            uring_lock;
 265                 wait_queue_head_t       wait;
 266         } ____cacheline_aligned_in_smp;
 267
 268         struct {
 269                 spinlock_t              completion_lock;
 270                 bool                    poll_multi_file;
 271                 /*
 272                  * ->poll_list is protected by the ctx->uring_lock for
 273                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 274                  * For SQPOLL, only the single threaded io_sq_thread() will
 275                  * manipulate the list, hence no extra locking is needed there.
 276                  */
 277                 struct list_head        poll_list;
 278                 struct list_head        cancel_list;
 279         } ____cacheline_aligned_in_smp;
 280
 281         struct async_list       pending_async[2];
 282
 283 #if defined(CONFIG_UNIX)
 284         struct socket           *ring_sock;
 285 #endif
 286 };
 287
 288 struct sqe_submit {
 289         const struct io_uring_sqe       *sqe;
 290         unsigned short                  index;
 291         bool                            has_user;
 292         bool                            needs_lock;
 293         bool                            needs_fixed_file;
 294 };
 295
 296 /*
 297  * First field must be the file pointer in all the
 298  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 299  */
 300 struct io_poll_iocb {
 301         struct file                     *file;
 302         struct wait_queue_head          *head;
 303         __poll_t                        events;
 304         bool                            done;
 305         bool                            canceled;
 306         struct wait_queue_entry         wait;
 307 };
 308
 309 /*
 310  * NOTE! Each of the iocb union members has the file pointer
 311  * as the first entry in their struct definition. So you can
 312  * access the file pointer through any of the sub-structs,
 313  * or directly as just 'ki_filp' in this struct.
 314  */
 315 struct io_kiocb {
 316         union {
 317                 struct file             *file;
 318                 struct kiocb            rw;
 319                 struct io_poll_iocb     poll;
 320         };
 321
 322         struct sqe_submit       submit;
 323
 324         struct io_ring_ctx      *ctx;
 325         struct list_head        list;
 326         struct list_head        link_list;
 327         unsigned int            flags;
 328         refcount_t              refs;
 329 #define REQ_F_NOWAIT            1       /* must not punt to workers */
 330 #define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
 331 #define REQ_F_FIXED_FILE        4       /* ctx owns file */
 332 #define REQ_F_SEQ_PREV          8       /* sequential with previous */
 333 #define REQ_F_IO_DRAIN          16      /* drain existing IO first */
 334 #define REQ_F_IO_DRAINED        32      /* drain done */
 335 #define REQ_F_LINK              64      /* linked sqes */
 336 #define REQ_F_FAIL_LINK         128     /* fail rest of links */
 337         u64                     user_data;
 338         u32                     result;
 339         u32                     sequence;
 340
 341         struct work_struct      work;
 342 };
 343
 344 #define IO_PLUG_THRESHOLD               2
 345 #define IO_IOPOLL_BATCH                 8
 346
 347 struct io_submit_state {
 348         struct blk_plug         plug;
 349
 350         /*
 351          * io_kiocb alloc cache
 352          */
 353         void                    *reqs[IO_IOPOLL_BATCH];
 354         unsigned                int free_reqs;
 355         unsigned                int cur_req;
 356
 357         /*
 358          * File reference cache
 359          */
 360         struct file             *file;
 361         unsigned int            fd;
 362         unsigned int            has_refs;
 363         unsigned int            used_refs;
 364         unsigned int            ios_left;
 365 };
 366
 367 static void io_sq_wq_submit_work(struct work_struct *work);
 368
 369 static struct kmem_cache *req_cachep;
 370
 371 static const struct file_operations io_uring_fops;
 372
 373 struct sock *io_uring_get_socket(struct file *file)
 374 {
 375 #if defined(CONFIG_UNIX)
 376         if (file->f_op == &io_uring_fops) {
 377                 struct io_ring_ctx *ctx = file->private_data;
 378
 379                 return ctx->ring_sock->sk;
 380         }
 381 #endif
 382         return NULL;
 383 }
 384 EXPORT_SYMBOL(io_uring_get_socket);
 385
 386 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 387 {
 388         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 389
 390         complete(&ctx->ctx_done);
 391 }
 392
 393 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 394 {
 395         struct io_ring_ctx *ctx;
 396         int i;
 397
 398         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 399         if (!ctx)
 400                 return NULL;
 401
 402         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
 403                 kfree(ctx);
 404                 return NULL;
 405         }
 406
 407         ctx->flags = p->flags;
 408         init_waitqueue_head(&ctx->cq_wait);
 409         init_completion(&ctx->ctx_done);
 410         init_completion(&ctx->sqo_thread_started);
 411         mutex_init(&ctx->uring_lock);
 412         init_waitqueue_head(&ctx->wait);
 413         for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
 414                 spin_lock_init(&ctx->pending_async[i].lock);
 415                 INIT_LIST_HEAD(&ctx->pending_async[i].list);
 416                 atomic_set(&ctx->pending_async[i].cnt, 0);
 417         }
 418         spin_lock_init(&ctx->completion_lock);
 419         INIT_LIST_HEAD(&ctx->poll_list);
 420         INIT_LIST_HEAD(&ctx->cancel_list);
 421         INIT_LIST_HEAD(&ctx->defer_list);
 422         return ctx;
 423 }
 424
 425 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 426                                      struct io_kiocb *req)
 427 {
 428         if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 429                 return false;
 430
 431         return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
 432 }
 433
 434 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 435 {
 436         struct io_kiocb *req;
 437
 438         if (list_empty(&ctx->defer_list))
 439                 return NULL;
 440
 441         req = list_first_entry(&ctx->defer_list, struct io_kiocb, list);
 442         if (!io_sequence_defer(ctx, req)) {
 443                 list_del_init(&req->list);
 444                 return req;
 445         }
 446
 447         return NULL;
 448 }
 449
 450 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 451 {
 452         struct io_cq_ring *ring = ctx->cq_ring;
 453
 454         if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) {
 455                 /* order cqe stores with ring update */
 456                 smp_store_release(&ring->r.tail, ctx->cached_cq_tail);
 457
 458                 if (wq_has_sleeper(&ctx->cq_wait)) {
 459                         wake_up_interruptible(&ctx->cq_wait);
 460                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 461                 }
 462         }
 463 }
 464
 465 static void io_commit_cqring(struct io_ring_ctx *ctx)
 466 {
 467         struct io_kiocb *req;
 468
 469         __io_commit_cqring(ctx);
 470
 471         while ((req = io_get_deferred_req(ctx)) != NULL) {
 472                 req->flags |= REQ_F_IO_DRAINED;
 473                 queue_work(ctx->sqo_wq, &req->work);
 474         }
 475 }
 476
 477 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 478 {
 479         struct io_cq_ring *ring = ctx->cq_ring;
 480         unsigned tail;
 481
 482         tail = ctx->cached_cq_tail;
 483         /*
 484          * writes to the cq entry need to come after reading head; the
 485          * control dependency is enough as we're using WRITE_ONCE to
 486          * fill the cq entry
 487          */
 488         if (tail - READ_ONCE(ring->r.head) == ring->ring_entries)
 489                 return NULL;
 490
 491         ctx->cached_cq_tail++;
 492         return &ring->cqes[tail & ctx->cq_mask];
 493 }
 494
 495 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 496                                  long res)
 497 {
 498         struct io_uring_cqe *cqe;
 499
 500         /*
 501          * If we can't get a cq entry, userspace overflowed the
 502          * submission (by quite a lot). Increment the overflow count in
 503          * the ring.
 504          */
 505         cqe = io_get_cqring(ctx);
 506         if (cqe) {
 507                 WRITE_ONCE(cqe->user_data, ki_user_data);
 508                 WRITE_ONCE(cqe->res, res);
 509                 WRITE_ONCE(cqe->flags, 0);
 510         } else {
 511                 unsigned overflow = READ_ONCE(ctx->cq_ring->overflow);
 512
 513                 WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1);
 514         }
 515 }
 516
 517 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 518 {
 519         if (waitqueue_active(&ctx->wait))
 520                 wake_up(&ctx->wait);
 521         if (waitqueue_active(&ctx->sqo_wait))
 522                 wake_up(&ctx->sqo_wait);
 523         if (ctx->cq_ev_fd)
 524                 eventfd_signal(ctx->cq_ev_fd, 1);
 525 }
 526
 527 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 528                                 long res)
 529 {
 530         unsigned long flags;
 531
 532         spin_lock_irqsave(&ctx->completion_lock, flags);
 533         io_cqring_fill_event(ctx, user_data, res);
 534         io_commit_cqring(ctx);
 535         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 536
 537         io_cqring_ev_posted(ctx);
 538 }
 539
 540 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs)
 541 {
 542         percpu_ref_put_many(&ctx->refs, refs);
 543
 544         if (waitqueue_active(&ctx->wait))
 545                 wake_up(&ctx->wait);
 546 }
 547
 548 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 549                                    struct io_submit_state *state)
 550 {
 551         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 552         struct io_kiocb *req;
 553
 554         if (!percpu_ref_tryget(&ctx->refs))
 555                 return NULL;
 556
 557         if (!state) {
 558                 req = kmem_cache_alloc(req_cachep, gfp);
 559                 if (unlikely(!req))
 560                         goto out;
 561         } else if (!state->free_reqs) {
 562                 size_t sz;
 563                 int ret;
 564
 565                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
 566                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
 567
 568                 /*
 569                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
 570                  * retry single alloc to be on the safe side.
 571                  */
 572                 if (unlikely(ret <= 0)) {
 573                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 574                         if (!state->reqs[0])
 575                                 goto out;
 576                         ret = 1;
 577                 }
 578                 state->free_reqs = ret - 1;
 579                 state->cur_req = 1;
 580                 req = state->reqs[0];
 581         } else {
 582                 req = state->reqs[state->cur_req];
 583                 state->free_reqs--;
 584                 state->cur_req++;
 585         }
 586
 587         req->file = NULL;
 588         req->ctx = ctx;
 589         req->flags = 0;
 590         /* one is dropped after submission, the other at completion */
 591         refcount_set(&req->refs, 2);
 592         req->result = 0;
 593         return req;
 594 out:
 595         io_ring_drop_ctx_refs(ctx, 1);
 596         return NULL;
 597 }
 598
 599 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 600 {
 601         if (*nr) {
 602                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
 603                 io_ring_drop_ctx_refs(ctx, *nr);
 604                 *nr = 0;
 605         }
 606 }
 607
 608 static void __io_free_req(struct io_kiocb *req)
 609 {
 610         if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 611                 fput(req->file);
 612         io_ring_drop_ctx_refs(req->ctx, 1);
 613         kmem_cache_free(req_cachep, req);
 614 }
 615
 616 static void io_req_link_next(struct io_kiocb *req)
 617 {
 618         struct io_kiocb *nxt;
 619
 620         /*
 621          * The list should never be empty when we are called here. But could
 622          * potentially happen if the chain is messed up, check to be on the
 623          * safe side.
 624          */
 625         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
 626         if (nxt) {
 627                 list_del(&nxt->list);
 628                 if (!list_empty(&req->link_list)) {
 629                         INIT_LIST_HEAD(&nxt->link_list);
 630                         list_splice(&req->link_list, &nxt->link_list);
 631                         nxt->flags |= REQ_F_LINK;
 632                 }
 633
 634                 INIT_WORK(&nxt->work, io_sq_wq_submit_work);
 635                 queue_work(req->ctx->sqo_wq, &nxt->work);
 636         }
 637 }
 638
 639 /*
 640  * Called if REQ_F_LINK is set, and we fail the head request
 641  */
 642 static void io_fail_links(struct io_kiocb *req)
 643 {
 644         struct io_kiocb *link;
 645
 646         while (!list_empty(&req->link_list)) {
 647                 link = list_first_entry(&req->link_list, struct io_kiocb, list);
 648                 list_del(&link->list);
 649
 650                 io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
 651                 __io_free_req(link);
 652         }
 653 }
 654
 655 static void io_free_req(struct io_kiocb *req)
 656 {
 657         /*
 658          * If LINK is set, we have dependent requests in this chain. If we
 659          * didn't fail this request, queue the first one up, moving any other
 660          * dependencies to the next request. In case of failure, fail the rest
 661          * of the chain.
 662          */
 663         if (req->flags & REQ_F_LINK) {
 664                 if (req->flags & REQ_F_FAIL_LINK)
 665                         io_fail_links(req);
 666                 else
 667                         io_req_link_next(req);
 668         }
 669
 670         __io_free_req(req);
 671 }
 672
 673 static void io_put_req(struct io_kiocb *req)
 674 {
 675         if (refcount_dec_and_test(&req->refs))
 676                 io_free_req(req);
 677 }
 678
 679 /*
 680  * Find and free completed poll iocbs
 681  */
 682 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 683                                struct list_head *done)
 684 {
 685         void *reqs[IO_IOPOLL_BATCH];
 686         struct io_kiocb *req;
 687         int to_free;
 688
 689         to_free = 0;
 690         while (!list_empty(done)) {
 691                 req = list_first_entry(done, struct io_kiocb, list);
 692                 list_del(&req->list);
 693
 694                 io_cqring_fill_event(ctx, req->user_data, req->result);
 695                 (*nr_events)++;
 696
 697                 if (refcount_dec_and_test(&req->refs)) {
 698                         /* If we're not using fixed files, we have to pair the
 699                          * completion part with the file put. Use regular
 700                          * completions for those, only batch free for fixed
 701                          * file and non-linked commands.
 702                          */
 703                         if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
 704                             REQ_F_FIXED_FILE) {
 705                                 reqs[to_free++] = req;
 706                                 if (to_free == ARRAY_SIZE(reqs))
 707                                         io_free_req_many(ctx, reqs, &to_free);
 708                         } else {
 709                                 io_free_req(req);
 710                         }
 711                 }
 712         }
 713
 714         io_commit_cqring(ctx);
 715         io_free_req_many(ctx, reqs, &to_free);
 716 }
 717
 718 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 719                         long min)
 720 {
 721         struct io_kiocb *req, *tmp;
 722         LIST_HEAD(done);
 723         bool spin;
 724         int ret;
 725
 726         /*
 727          * Only spin for completions if we don't have multiple devices hanging
 728          * off our complete list, and we're under the requested amount.
 729          */
 730         spin = !ctx->poll_multi_file && *nr_events < min;
 731
 732         ret = 0;
 733         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
 734                 struct kiocb *kiocb = &req->rw;
 735
 736                 /*
 737                  * Move completed entries to our local list. If we find a
 738                  * request that requires polling, break out and complete
 739                  * the done list first, if we have entries there.
 740                  */
 741                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
 742                         list_move_tail(&req->list, &done);
 743                         continue;
 744                 }
 745                 if (!list_empty(&done))
 746                         break;
 747
 748                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
 749                 if (ret < 0)
 750                         break;
 751
 752                 if (ret && spin)
 753                         spin = false;
 754                 ret = 0;
 755         }
 756
 757         if (!list_empty(&done))
 758                 io_iopoll_complete(ctx, nr_events, &done);
 759
 760         return ret;
 761 }
 762
 763 /*
 764  * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
 765  * non-spinning poll check - we'll still enter the driver poll loop, but only
 766  * as a non-spinning completion check.
 767  */
 768 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 769                                 long min)
 770 {
 771         while (!list_empty(&ctx->poll_list)) {
 772                 int ret;
 773
 774                 ret = io_do_iopoll(ctx, nr_events, min);
 775                 if (ret < 0)
 776                         return ret;
 777                 if (!min || *nr_events >= min)
 778                         return 0;
 779         }
 780
 781         return 1;
 782 }
 783
 784 /*
 785  * We can't just wait for polled events to come to us, we have to actively
 786  * find and complete them.
 787  */
 788 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 789 {
 790         if (!(ctx->flags & IORING_SETUP_IOPOLL))
 791                 return;
 792
 793         mutex_lock(&ctx->uring_lock);
 794         while (!list_empty(&ctx->poll_list)) {
 795                 unsigned int nr_events = 0;
 796
 797                 io_iopoll_getevents(ctx, &nr_events, 1);
 798         }
 799         mutex_unlock(&ctx->uring_lock);
 800 }
 801
 802 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 803                            long min)
 804 {
 805         int ret = 0;
 806
 807         do {
 808                 int tmin = 0;
 809
 810                 if (*nr_events < min)
 811                         tmin = min - *nr_events;
 812
 813                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
 814                 if (ret <= 0)
 815                         break;
 816                 ret = 0;
 817         } while (min && !*nr_events && !need_resched());
 818
 819         return ret;
 820 }
 821
 822 static void kiocb_end_write(struct kiocb *kiocb)
 823 {
 824         if (kiocb->ki_flags & IOCB_WRITE) {
 825                 struct inode *inode = file_inode(kiocb->ki_filp);
 826
 827                 /*
 828                  * Tell lockdep we inherited freeze protection from submission
 829                  * thread.
 830                  */
 831                 if (S_ISREG(inode->i_mode))
 832                         __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
 833                 file_end_write(kiocb->ki_filp);
 834         }
 835 }
 836
 837 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 838 {
 839         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 840
 841         kiocb_end_write(kiocb);
 842
 843         if ((req->flags & REQ_F_LINK) && res != req->result)
 844                 req->flags |= REQ_F_FAIL_LINK;
 845         io_cqring_add_event(req->ctx, req->user_data, res);
 846         io_put_req(req);
 847 }
 848
 849 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 850 {
 851         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 852
 853         kiocb_end_write(kiocb);
 854
 855         if ((req->flags & REQ_F_LINK) && res != req->result)
 856                 req->flags |= REQ_F_FAIL_LINK;
 857         req->result = res;
 858         if (res != -EAGAIN)
 859                 req->flags |= REQ_F_IOPOLL_COMPLETED;
 860 }
 861
 862 /*
 863  * After the iocb has been issued, it's safe to be found on the poll list.
 864  * Adding the kiocb to the list AFTER submission ensures that we don't
 865  * find it from a io_iopoll_getevents() thread before the issuer is done
 866  * accessing the kiocb cookie.
 867  */
 868 static void io_iopoll_req_issued(struct io_kiocb *req)
 869 {
 870         struct io_ring_ctx *ctx = req->ctx;
 871
 872         /*
 873          * Track whether we have multiple files in our lists. This will impact
 874          * how we do polling eventually, not spinning if we're on potentially
 875          * different devices.
 876          */
 877         if (list_empty(&ctx->poll_list)) {
 878                 ctx->poll_multi_file = false;
 879         } else if (!ctx->poll_multi_file) {
 880                 struct io_kiocb *list_req;
 881
 882                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
 883                                                 list);
 884                 if (list_req->rw.ki_filp != req->rw.ki_filp)
 885                         ctx->poll_multi_file = true;
 886         }
 887
 888         /*
 889          * For fast devices, IO may have already completed. If it has, add
 890          * it to the front so we find it first.
 891          */
 892         if (req->flags & REQ_F_IOPOLL_COMPLETED)
 893                 list_add(&req->list, &ctx->poll_list);
 894         else
 895                 list_add_tail(&req->list, &ctx->poll_list);
 896 }
 897
 898 static void io_file_put(struct io_submit_state *state)
 899 {
 900         if (state->file) {
 901                 int diff = state->has_refs - state->used_refs;
 902
 903                 if (diff)
 904                         fput_many(state->file, diff);
 905                 state->file = NULL;
 906         }
 907 }
 908
 909 /*
 910  * Get as many references to a file as we have IOs left in this submission,
 911  * assuming most submissions are for one file, or at least that each file
 912  * has more than one submission.
 913  */
 914 static struct file *io_file_get(struct io_submit_state *state, int fd)
 915 {
 916         if (!state)
 917                 return fget(fd);
 918
 919         if (state->file) {
 920                 if (state->fd == fd) {
 921                         state->used_refs++;
 922                         state->ios_left--;
 923                         return state->file;
 924                 }
 925                 io_file_put(state);
 926         }
 927         state->file = fget_many(fd, state->ios_left);
 928         if (!state->file)
 929                 return NULL;
 930
 931         state->fd = fd;
 932         state->has_refs = state->ios_left;
 933         state->used_refs = 1;
 934         state->ios_left--;
 935         return state->file;
 936 }
 937
 938 /*
 939  * If we tracked the file through the SCM inflight mechanism, we could support
 940  * any file. For now, just ensure that anything potentially problematic is done
 941  * inline.
 942  */
 943 static bool io_file_supports_async(struct file *file)
 944 {
 945         umode_t mode = file_inode(file)->i_mode;
 946
 947         if (S_ISBLK(mode) || S_ISCHR(mode))
 948                 return true;
 949         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
 950                 return true;
 951
 952         return false;
 953 }
 954
 955 static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 956                       bool force_nonblock)
 957 {
 958         const struct io_uring_sqe *sqe = s->sqe;
 959         struct io_ring_ctx *ctx = req->ctx;
 960         struct kiocb *kiocb = &req->rw;
 961         unsigned ioprio;
 962         int ret;
 963
 964         if (!req->file)
 965                 return -EBADF;
 966
 967         if (force_nonblock && !io_file_supports_async(req->file))
 968                 force_nonblock = false;
 969
 970         kiocb->ki_pos = READ_ONCE(sqe->off);
 971         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 972         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
 973
 974         ioprio = READ_ONCE(sqe->ioprio);
 975         if (ioprio) {
 976                 ret = ioprio_check_cap(ioprio);
 977                 if (ret)
 978                         return ret;
 979
 980                 kiocb->ki_ioprio = ioprio;
 981         } else
 982                 kiocb->ki_ioprio = get_current_ioprio();
 983
 984         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
 985         if (unlikely(ret))
 986                 return ret;
 987
 988         /* don't allow async punt if RWF_NOWAIT was requested */
 989         if (kiocb->ki_flags & IOCB_NOWAIT)
 990                 req->flags |= REQ_F_NOWAIT;
 991
 992         if (force_nonblock)
 993                 kiocb->ki_flags |= IOCB_NOWAIT;
 994
 995         if (ctx->flags & IORING_SETUP_IOPOLL) {
 996                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 997                     !kiocb->ki_filp->f_op->iopoll)
 998                         return -EOPNOTSUPP;
 999
1000                 kiocb->ki_flags |= IOCB_HIPRI;
1001                 kiocb->ki_complete = io_complete_rw_iopoll;
1002         } else {
1003                 if (kiocb->ki_flags & IOCB_HIPRI)
1004                         return -EINVAL;
1005                 kiocb->ki_complete = io_complete_rw;
1006         }
1007         return 0;
1008 }
1009
1010 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1011 {
1012         switch (ret) {
1013         case -EIOCBQUEUED:
1014                 break;
1015         case -ERESTARTSYS:
1016         case -ERESTARTNOINTR:
1017         case -ERESTARTNOHAND:
1018         case -ERESTART_RESTARTBLOCK:
1019                 /*
1020                  * We can't just restart the syscall, since previously
1021                  * submitted sqes may already be in progress. Just fail this
1022                  * IO with EINTR.
1023                  */
1024                 ret = -EINTR;
1025                 /* fall through */
1026         default:
1027                 kiocb->ki_complete(kiocb, ret, 0);
1028         }
1029 }
1030
1031 static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
1032                            const struct io_uring_sqe *sqe,
1033                            struct iov_iter *iter)
1034 {
1035         size_t len = READ_ONCE(sqe->len);
1036         struct io_mapped_ubuf *imu;
1037         unsigned index, buf_index;
1038         size_t offset;
1039         u64 buf_addr;
1040
1041         /* attempt to use fixed buffers without having provided iovecs */
1042         if (unlikely(!ctx->user_bufs))
1043                 return -EFAULT;
1044
1045         buf_index = READ_ONCE(sqe->buf_index);
1046         if (unlikely(buf_index >= ctx->nr_user_bufs))
1047                 return -EFAULT;
1048
1049         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1050         imu = &ctx->user_bufs[index];
1051         buf_addr = READ_ONCE(sqe->addr);
1052
1053         /* overflow */
1054         if (buf_addr + len < buf_addr)
1055                 return -EFAULT;
1056         /* not inside the mapped region */
1057         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1058                 return -EFAULT;
1059
1060         /*
1061          * May not be a start of buffer, set size appropriately
1062          * and advance us to the beginning.
1063          */
1064         offset = buf_addr - imu->ubuf;
1065         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
1066         if (offset)
1067                 iov_iter_advance(iter, offset);
1068         return 0;
1069 }
1070
1071 static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
1072                                const struct sqe_submit *s, struct iovec **iovec,
1073                                struct iov_iter *iter)
1074 {
1075         const struct io_uring_sqe *sqe = s->sqe;
1076         void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
1077         size_t sqe_len = READ_ONCE(sqe->len);
1078         u8 opcode;
1079
1080         /*
1081          * We're reading ->opcode for the second time, but the first read
1082          * doesn't care whether it's _FIXED or not, so it doesn't matter
1083          * whether ->opcode changes concurrently. The first read does care
1084          * about whether it is a READ or a WRITE, so we don't trust this read
1085          * for that purpose and instead let the caller pass in the read/write
1086          * flag.
1087          */
1088         opcode = READ_ONCE(sqe->opcode);
1089         if (opcode == IORING_OP_READ_FIXED ||
1090             opcode == IORING_OP_WRITE_FIXED) {
1091                 ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
1092                 *iovec = NULL;
1093                 return ret;
1094         }
1095
1096         if (!s->has_user)
1097                 return -EFAULT;
1098
1099 #ifdef CONFIG_COMPAT
1100         if (ctx->compat)
1101                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
1102                                                 iovec, iter);
1103 #endif
1104
1105         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
1106 }
1107
1108 /*
1109  * Make a note of the last file/offset/direction we punted to async
1110  * context. We'll use this information to see if we can piggy back a
1111  * sequential request onto the previous one, if it's still hasn't been
1112  * completed by the async worker.
1113  */
1114 static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
1115 {
1116         struct async_list *async_list = &req->ctx->pending_async[rw];
1117         struct kiocb *kiocb = &req->rw;
1118         struct file *filp = kiocb->ki_filp;
1119         off_t io_end = kiocb->ki_pos + len;
1120
1121         if (filp == async_list->file && kiocb->ki_pos == async_list->io_end) {
1122                 unsigned long max_pages;
1123
1124                 /* Use 8x RA size as a decent limiter for both reads/writes */
1125                 max_pages = filp->f_ra.ra_pages;
1126                 if (!max_pages)
1127                         max_pages = VM_READAHEAD_PAGES;
1128                 max_pages *= 8;
1129
1130                 /* If max pages are exceeded, reset the state */
1131                 len >>= PAGE_SHIFT;
1132                 if (async_list->io_pages + len <= max_pages) {
1133                         req->flags |= REQ_F_SEQ_PREV;
1134                         async_list->io_pages += len;
1135                 } else {
1136                         io_end = 0;
1137                         async_list->io_pages = 0;
1138                 }
1139         }
1140
1141         /* New file? Reset state. */
1142         if (async_list->file != filp) {
1143                 async_list->io_pages = 0;
1144                 async_list->file = filp;
1145         }
1146         async_list->io_end = io_end;
1147 }
1148
1149 static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
1150                    bool force_nonblock)
1151 {
1152         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1153         struct kiocb *kiocb = &req->rw;
1154         struct iov_iter iter;
1155         struct file *file;
1156         size_t iov_count;
1157         ssize_t read_size, ret;
1158
1159         ret = io_prep_rw(req, s, force_nonblock);
1160         if (ret)
1161                 return ret;
1162         file = kiocb->ki_filp;
1163
1164         if (unlikely(!(file->f_mode & FMODE_READ)))
1165                 return -EBADF;
1166         if (unlikely(!file->f_op->read_iter))
1167                 return -EINVAL;
1168
1169         ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
1170         if (ret < 0)
1171                 return ret;
1172
1173         read_size = ret;
1174         if (req->flags & REQ_F_LINK)
1175                 req->result = read_size;
1176
1177         iov_count = iov_iter_count(&iter);
1178         ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
1179         if (!ret) {
1180                 ssize_t ret2;
1181
1182                 ret2 = call_read_iter(file, kiocb, &iter);
1183                 /*
1184                  * In case of a short read, punt to async. This can happen
1185                  * if we have data partially cached. Alternatively we can
1186                  * return the short read, in which case the application will
1187                  * need to issue another SQE and wait for it. That SQE will
1188                  * need async punt anyway, so it's more efficient to do it
1189                  * here.
1190                  */
1191                 if (force_nonblock && ret2 > 0 && ret2 < read_size)
1192                         ret2 = -EAGAIN;
1193                 /* Catch -EAGAIN return for forced non-blocking submission */
1194                 if (!force_nonblock || ret2 != -EAGAIN) {
1195                         io_rw_done(kiocb, ret2);
1196                 } else {
1197                         /*
1198                          * If ->needs_lock is true, we're already in async
1199                          * context.
1200                          */
1201                         if (!s->needs_lock)
1202                                 io_async_list_note(READ, req, iov_count);
1203                         ret = -EAGAIN;
1204                 }
1205         }
1206         kfree(iovec);
1207         return ret;
1208 }
1209
1210 static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
1211                     bool force_nonblock)
1212 {
1213         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1214         struct kiocb *kiocb = &req->rw;
1215         struct iov_iter iter;
1216         struct file *file;
1217         size_t iov_count;
1218         ssize_t ret;
1219
1220         ret = io_prep_rw(req, s, force_nonblock);
1221         if (ret)
1222                 return ret;
1223
1224         file = kiocb->ki_filp;
1225         if (unlikely(!(file->f_mode & FMODE_WRITE)))
1226                 return -EBADF;
1227         if (unlikely(!file->f_op->write_iter))
1228                 return -EINVAL;
1229
1230         ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
1231         if (ret < 0)
1232                 return ret;
1233
1234         if (req->flags & REQ_F_LINK)
1235                 req->result = ret;
1236
1237         iov_count = iov_iter_count(&iter);
1238
1239         ret = -EAGAIN;
1240         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
1241                 /* If ->needs_lock is true, we're already in async context. */
1242                 if (!s->needs_lock)
1243                         io_async_list_note(WRITE, req, iov_count);
1244                 goto out_free;
1245         }
1246
1247         ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
1248         if (!ret) {
1249                 ssize_t ret2;
1250
1251                 /*
1252                  * Open-code file_start_write here to grab freeze protection,
1253                  * which will be released by another thread in
1254                  * io_complete_rw().  Fool lockdep by telling it the lock got
1255                  * released so that it doesn't complain about the held lock when
1256                  * we return to userspace.
1257                  */
1258                 if (S_ISREG(file_inode(file)->i_mode)) {
1259                         __sb_start_write(file_inode(file)->i_sb,
1260                                                 SB_FREEZE_WRITE, true);
1261                         __sb_writers_release(file_inode(file)->i_sb,
1262                                                 SB_FREEZE_WRITE);
1263                 }
1264                 kiocb->ki_flags |= IOCB_WRITE;
1265
1266                 ret2 = call_write_iter(file, kiocb, &iter);
1267                 if (!force_nonblock || ret2 != -EAGAIN) {
1268                         io_rw_done(kiocb, ret2);
1269                 } else {
1270                         /*
1271                          * If ->needs_lock is true, we're already in async
1272                          * context.
1273                          */
1274                         if (!s->needs_lock)
1275                                 io_async_list_note(WRITE, req, iov_count);
1276                         ret = -EAGAIN;
1277                 }
1278         }
1279 out_free:
1280         kfree(iovec);
1281         return ret;
1282 }
1283
1284 /*
1285  * IORING_OP_NOP just posts a completion event, nothing else.
1286  */
1287 static int io_nop(struct io_kiocb *req, u64 user_data)
1288 {
1289         struct io_ring_ctx *ctx = req->ctx;
1290         long err = 0;
1291
1292         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1293                 return -EINVAL;
1294
1295         io_cqring_add_event(ctx, user_data, err);
1296         io_put_req(req);
1297         return 0;
1298 }
1299
1300 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1301 {
1302         struct io_ring_ctx *ctx = req->ctx;
1303
1304         if (!req->file)
1305                 return -EBADF;
1306
1307         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1308                 return -EINVAL;
1309         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1310                 return -EINVAL;
1311
1312         return 0;
1313 }
1314
1315 static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1316                     bool force_nonblock)
1317 {
1318         loff_t sqe_off = READ_ONCE(sqe->off);
1319         loff_t sqe_len = READ_ONCE(sqe->len);
1320         loff_t end = sqe_off + sqe_len;
1321         unsigned fsync_flags;
1322         int ret;
1323
1324         fsync_flags = READ_ONCE(sqe->fsync_flags);
1325         if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1326                 return -EINVAL;
1327
1328         ret = io_prep_fsync(req, sqe);
1329         if (ret)
1330                 return ret;
1331
1332         /* fsync always requires a blocking context */
1333         if (force_nonblock)
1334                 return -EAGAIN;
1335
1336         ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1337                                 end > 0 ? end : LLONG_MAX,
1338                                 fsync_flags & IORING_FSYNC_DATASYNC);
1339
1340         if (ret < 0 && (req->flags & REQ_F_LINK))
1341                 req->flags |= REQ_F_FAIL_LINK;
1342         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1343         io_put_req(req);
1344         return 0;
1345 }
1346
1347 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1348 {
1349         struct io_ring_ctx *ctx = req->ctx;
1350         int ret = 0;
1351
1352         if (!req->file)
1353                 return -EBADF;
1354
1355         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1356                 return -EINVAL;
1357         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1358                 return -EINVAL;
1359
1360         return ret;
1361 }
1362
1363 static int io_sync_file_range(struct io_kiocb *req,
1364                               const struct io_uring_sqe *sqe,
1365                               bool force_nonblock)
1366 {
1367         loff_t sqe_off;
1368         loff_t sqe_len;
1369         unsigned flags;
1370         int ret;
1371
1372         ret = io_prep_sfr(req, sqe);
1373         if (ret)
1374                 return ret;
1375
1376         /* sync_file_range always requires a blocking context */
1377         if (force_nonblock)
1378                 return -EAGAIN;
1379
1380         sqe_off = READ_ONCE(sqe->off);
1381         sqe_len = READ_ONCE(sqe->len);
1382         flags = READ_ONCE(sqe->sync_range_flags);
1383
1384         ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
1385
1386         if (ret < 0 && (req->flags & REQ_F_LINK))
1387                 req->flags |= REQ_F_FAIL_LINK;
1388         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1389         io_put_req(req);
1390         return 0;
1391 }
1392
1393 #if defined(CONFIG_NET)
1394 static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1395                            bool force_nonblock,
1396                    long (*fn)(struct socket *, struct user_msghdr __user *,
1397                                 unsigned int))
1398 {
1399         struct socket *sock;
1400         int ret;
1401
1402         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1403                 return -EINVAL;
1404
1405         sock = sock_from_file(req->file, &ret);
1406         if (sock) {
1407                 struct user_msghdr __user *msg;
1408                 unsigned flags;
1409
1410                 flags = READ_ONCE(sqe->msg_flags);
1411                 if (flags & MSG_DONTWAIT)
1412                         req->flags |= REQ_F_NOWAIT;
1413                 else if (force_nonblock)
1414                         flags |= MSG_DONTWAIT;
1415
1416                 msg = (struct user_msghdr __user *) (unsigned long)
1417                         READ_ONCE(sqe->addr);
1418
1419                 ret = fn(sock, msg, flags);
1420                 if (force_nonblock && ret == -EAGAIN)
1421                         return ret;
1422         }
1423
1424         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1425         io_put_req(req);
1426         return 0;
1427 }
1428 #endif
1429
1430 static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1431                       bool force_nonblock)
1432 {
1433 #if defined(CONFIG_NET)
1434         return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
1435 #else
1436         return -EOPNOTSUPP;
1437 #endif
1438 }
1439
1440 static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1441                       bool force_nonblock)
1442 {
1443 #if defined(CONFIG_NET)
1444         return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
1445 #else
1446         return -EOPNOTSUPP;
1447 #endif
1448 }
1449
1450 static void io_poll_remove_one(struct io_kiocb *req)
1451 {
1452         struct io_poll_iocb *poll = &req->poll;
1453
1454         spin_lock(&poll->head->lock);
1455         WRITE_ONCE(poll->canceled, true);
1456         if (!list_empty(&poll->wait.entry)) {
1457                 list_del_init(&poll->wait.entry);
1458                 queue_work(req->ctx->sqo_wq, &req->work);
1459         }
1460         spin_unlock(&poll->head->lock);
1461
1462         list_del_init(&req->list);
1463 }
1464
1465 static void io_poll_remove_all(struct io_ring_ctx *ctx)
1466 {
1467         struct io_kiocb *req;
1468
1469         spin_lock_irq(&ctx->completion_lock);
1470         while (!list_empty(&ctx->cancel_list)) {
1471                 req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
1472                 io_poll_remove_one(req);
1473         }
1474         spin_unlock_irq(&ctx->completion_lock);
1475 }
1476
1477 /*
1478  * Find a running poll command that matches one specified in sqe->addr,
1479  * and remove it if found.
1480  */
1481 static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1482 {
1483         struct io_ring_ctx *ctx = req->ctx;
1484         struct io_kiocb *poll_req, *next;
1485         int ret = -ENOENT;
1486
1487         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1488                 return -EINVAL;
1489         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
1490             sqe->poll_events)
1491                 return -EINVAL;
1492
1493         spin_lock_irq(&ctx->completion_lock);
1494         list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
1495                 if (READ_ONCE(sqe->addr) == poll_req->user_data) {
1496                         io_poll_remove_one(poll_req);
1497                         ret = 0;
1498                         break;
1499                 }
1500         }
1501         spin_unlock_irq(&ctx->completion_lock);
1502
1503         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1504         io_put_req(req);
1505         return 0;
1506 }
1507
1508 static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
1509                              __poll_t mask)
1510 {
1511         req->poll.done = true;
1512         io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
1513         io_commit_cqring(ctx);
1514 }
1515
1516 static void io_poll_complete_work(struct work_struct *work)
1517 {
1518         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1519         struct io_poll_iocb *poll = &req->poll;
1520         struct poll_table_struct pt = { ._key = poll->events };
1521         struct io_ring_ctx *ctx = req->ctx;
1522         __poll_t mask = 0;
1523
1524         if (!READ_ONCE(poll->canceled))
1525                 mask = vfs_poll(poll->file, &pt) & poll->events;
1526
1527         /*
1528          * Note that ->ki_cancel callers also delete iocb from active_reqs after
1529          * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
1530          * synchronize with them.  In the cancellation case the list_del_init
1531          * itself is not actually needed, but harmless so we keep it in to
1532          * avoid further branches in the fast path.
1533          */
1534         spin_lock_irq(&ctx->completion_lock);
1535         if (!mask && !READ_ONCE(poll->canceled)) {
1536                 add_wait_queue(poll->head, &poll->wait);
1537                 spin_unlock_irq(&ctx->completion_lock);
1538                 return;
1539         }
1540         list_del_init(&req->list);
1541         io_poll_complete(ctx, req, mask);
1542         spin_unlock_irq(&ctx->completion_lock);
1543
1544         io_cqring_ev_posted(ctx);
1545         io_put_req(req);
1546 }
1547
1548 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
1549                         void *key)
1550 {
1551         struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
1552                                                         wait);
1553         struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
1554         struct io_ring_ctx *ctx = req->ctx;
1555         __poll_t mask = key_to_poll(key);
1556         unsigned long flags;
1557
1558         /* for instances that support it check for an event match first: */
1559         if (mask && !(mask & poll->events))
1560                 return 0;
1561
1562         list_del_init(&poll->wait.entry);
1563
1564         if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
1565                 list_del(&req->list);
1566                 io_poll_complete(ctx, req, mask);
1567                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1568
1569                 io_cqring_ev_posted(ctx);
1570                 io_put_req(req);
1571         } else {
1572                 queue_work(ctx->sqo_wq, &req->work);
1573         }
1574
1575         return 1;
1576 }
1577
1578 struct io_poll_table {
1579         struct poll_table_struct pt;
1580         struct io_kiocb *req;
1581         int error;
1582 };
1583
1584 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
1585                                struct poll_table_struct *p)
1586 {
1587         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
1588
1589         if (unlikely(pt->req->poll.head)) {
1590                 pt->error = -EINVAL;
1591                 return;
1592         }
1593
1594         pt->error = 0;
1595         pt->req->poll.head = head;
1596         add_wait_queue(head, &pt->req->poll.wait);
1597 }
1598
1599 static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1600 {
1601         struct io_poll_iocb *poll = &req->poll;
1602         struct io_ring_ctx *ctx = req->ctx;
1603         struct io_poll_table ipt;
1604         bool cancel = false;
1605         __poll_t mask;
1606         u16 events;
1607
1608         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1609                 return -EINVAL;
1610         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
1611                 return -EINVAL;
1612         if (!poll->file)
1613                 return -EBADF;
1614
1615         INIT_WORK(&req->work, io_poll_complete_work);
1616         events = READ_ONCE(sqe->poll_events);
1617         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
1618
1619         poll->head = NULL;
1620         poll->done = false;
1621         poll->canceled = false;
1622
1623         ipt.pt._qproc = io_poll_queue_proc;
1624         ipt.pt._key = poll->events;
1625         ipt.req = req;
1626         ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
1627
1628         /* initialized the list so that we can do list_empty checks */
1629         INIT_LIST_HEAD(&poll->wait.entry);
1630         init_waitqueue_func_entry(&poll->wait, io_poll_wake);
1631
1632         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
1633
1634         spin_lock_irq(&ctx->completion_lock);
1635         if (likely(poll->head)) {
1636                 spin_lock(&poll->head->lock);
1637                 if (unlikely(list_empty(&poll->wait.entry))) {
1638                         if (ipt.error)
1639                                 cancel = true;
1640                         ipt.error = 0;
1641                         mask = 0;
1642                 }
1643                 if (mask || ipt.error)
1644                         list_del_init(&poll->wait.entry);
1645                 else if (cancel)
1646                         WRITE_ONCE(poll->canceled, true);
1647                 else if (!poll->done) /* actually waiting for an event */
1648                         list_add_tail(&req->list, &ctx->cancel_list);
1649                 spin_unlock(&poll->head->lock);
1650         }
1651         if (mask) { /* no async, we'd stolen it */
1652                 ipt.error = 0;
1653                 io_poll_complete(ctx, req, mask);
1654         }
1655         spin_unlock_irq(&ctx->completion_lock);
1656
1657         if (mask) {
1658                 io_cqring_ev_posted(ctx);
1659                 io_put_req(req);
1660         }
1661         return ipt.error;
1662 }
1663
1664 static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
1665                         const struct io_uring_sqe *sqe)
1666 {
1667         struct io_uring_sqe *sqe_copy;
1668
1669         if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
1670                 return 0;
1671
1672         sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1673         if (!sqe_copy)
1674                 return -EAGAIN;
1675
1676         spin_lock_irq(&ctx->completion_lock);
1677         if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
1678                 spin_unlock_irq(&ctx->completion_lock);
1679                 kfree(sqe_copy);
1680                 return 0;
1681         }
1682
1683         memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
1684         req->submit.sqe = sqe_copy;
1685
1686         INIT_WORK(&req->work, io_sq_wq_submit_work);
1687         list_add_tail(&req->list, &ctx->defer_list);
1688         spin_unlock_irq(&ctx->completion_lock);
1689         return -EIOCBQUEUED;
1690 }
1691
1692 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1693                            const struct sqe_submit *s, bool force_nonblock)
1694 {
1695         int ret, opcode;
1696
1697         req->user_data = READ_ONCE(s->sqe->user_data);
1698
1699         if (unlikely(s->index >= ctx->sq_entries))
1700                 return -EINVAL;
1701
1702         opcode = READ_ONCE(s->sqe->opcode);
1703         switch (opcode) {
1704         case IORING_OP_NOP:
1705                 ret = io_nop(req, req->user_data);
1706                 break;
1707         case IORING_OP_READV:
1708                 if (unlikely(s->sqe->buf_index))
1709                         return -EINVAL;
1710                 ret = io_read(req, s, force_nonblock);
1711                 break;
1712         case IORING_OP_WRITEV:
1713                 if (unlikely(s->sqe->buf_index))
1714                         return -EINVAL;
1715                 ret = io_write(req, s, force_nonblock);
1716                 break;
1717         case IORING_OP_READ_FIXED:
1718                 ret = io_read(req, s, force_nonblock);
1719                 break;
1720         case IORING_OP_WRITE_FIXED:
1721                 ret = io_write(req, s, force_nonblock);
1722                 break;
1723         case IORING_OP_FSYNC:
1724                 ret = io_fsync(req, s->sqe, force_nonblock);
1725                 break;
1726         case IORING_OP_POLL_ADD:
1727                 ret = io_poll_add(req, s->sqe);
1728                 break;
1729         case IORING_OP_POLL_REMOVE:
1730                 ret = io_poll_remove(req, s->sqe);
1731                 break;
1732         case IORING_OP_SYNC_FILE_RANGE:
1733                 ret = io_sync_file_range(req, s->sqe, force_nonblock);
1734                 break;
1735         case IORING_OP_SENDMSG:
1736                 ret = io_sendmsg(req, s->sqe, force_nonblock);
1737                 break;
1738         case IORING_OP_RECVMSG:
1739                 ret = io_recvmsg(req, s->sqe, force_nonblock);
1740                 break;
1741         default:
1742                 ret = -EINVAL;
1743                 break;
1744         }
1745
1746         if (ret)
1747                 return ret;
1748
1749         if (ctx->flags & IORING_SETUP_IOPOLL) {
1750                 if (req->result == -EAGAIN)
1751                         return -EAGAIN;
1752
1753                 /* workqueue context doesn't hold uring_lock, grab it now */
1754                 if (s->needs_lock)
1755                         mutex_lock(&ctx->uring_lock);
1756                 io_iopoll_req_issued(req);
1757                 if (s->needs_lock)
1758                         mutex_unlock(&ctx->uring_lock);
1759         }
1760
1761         return 0;
1762 }
1763
1764 static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
1765                                                  const struct io_uring_sqe *sqe)
1766 {
1767         switch (sqe->opcode) {
1768         case IORING_OP_READV:
1769         case IORING_OP_READ_FIXED:
1770                 return &ctx->pending_async[READ];
1771         case IORING_OP_WRITEV:
1772         case IORING_OP_WRITE_FIXED:
1773                 return &ctx->pending_async[WRITE];
1774         default:
1775                 return NULL;
1776         }
1777 }
1778
1779 static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
1780 {
1781         u8 opcode = READ_ONCE(sqe->opcode);
1782
1783         return !(opcode == IORING_OP_READ_FIXED ||
1784                  opcode == IORING_OP_WRITE_FIXED);
1785 }
1786
1787 static void io_sq_wq_submit_work(struct work_struct *work)
1788 {
1789         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1790         struct io_ring_ctx *ctx = req->ctx;
1791         struct mm_struct *cur_mm = NULL;
1792         struct async_list *async_list;
1793         LIST_HEAD(req_list);
1794         mm_segment_t old_fs;
1795         int ret;
1796
1797         async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
1798 restart:
1799         do {
1800                 struct sqe_submit *s = &req->submit;
1801                 const struct io_uring_sqe *sqe = s->sqe;
1802
1803                 /* Ensure we clear previously set non-block flag */
1804                 req->rw.ki_flags &= ~IOCB_NOWAIT;
1805
1806                 ret = 0;
1807                 if (io_sqe_needs_user(sqe) && !cur_mm) {
1808                         if (!mmget_not_zero(ctx->sqo_mm)) {
1809                                 ret = -EFAULT;
1810                         } else {
1811                                 cur_mm = ctx->sqo_mm;
1812                                 use_mm(cur_mm);
1813                                 old_fs = get_fs();
1814                                 set_fs(USER_DS);
1815                         }
1816                 }
1817
1818                 if (!ret) {
1819                         s->has_user = cur_mm != NULL;
1820                         s->needs_lock = true;
1821                         do {
1822                                 ret = __io_submit_sqe(ctx, req, s, false);
1823                                 /*
1824                                  * We can get EAGAIN for polled IO even though
1825                                  * we're forcing a sync submission from here,
1826                                  * since we can't wait for request slots on the
1827                                  * block side.
1828                                  */
1829                                 if (ret != -EAGAIN)
1830                                         break;
1831                                 cond_resched();
1832                         } while (1);
1833                 }
1834
1835                 /* drop submission reference */
1836                 io_put_req(req);
1837
1838                 if (ret) {
1839                         io_cqring_add_event(ctx, sqe->user_data, ret);
1840                         io_put_req(req);
1841                 }
1842
1843                 /* async context always use a copy of the sqe */
1844                 kfree(sqe);
1845
1846                 if (!async_list)
1847                         break;
1848                 if (!list_empty(&req_list)) {
1849                         req = list_first_entry(&req_list, struct io_kiocb,
1850                                                 list);
1851                         list_del(&req->list);
1852                         continue;
1853                 }
1854                 if (list_empty(&async_list->list))
1855                         break;
1856
1857                 req = NULL;
1858                 spin_lock(&async_list->lock);
1859                 if (list_empty(&async_list->list)) {
1860                         spin_unlock(&async_list->lock);
1861                         break;
1862                 }
1863                 list_splice_init(&async_list->list, &req_list);
1864                 spin_unlock(&async_list->lock);
1865
1866                 req = list_first_entry(&req_list, struct io_kiocb, list);
1867                 list_del(&req->list);
1868         } while (req);
1869
1870         /*
1871          * Rare case of racing with a submitter. If we find the count has
1872          * dropped to zero AND we have pending work items, then restart
1873          * the processing. This is a tiny race window.
1874          */
1875         if (async_list) {
1876                 ret = atomic_dec_return(&async_list->cnt);
1877                 while (!ret && !list_empty(&async_list->list)) {
1878                         spin_lock(&async_list->lock);
1879                         atomic_inc(&async_list->cnt);
1880                         list_splice_init(&async_list->list, &req_list);
1881                         spin_unlock(&async_list->lock);
1882
1883                         if (!list_empty(&req_list)) {
1884                                 req = list_first_entry(&req_list,
1885                                                         struct io_kiocb, list);
1886                                 list_del(&req->list);
1887                                 goto restart;
1888                         }
1889                         ret = atomic_dec_return(&async_list->cnt);
1890                 }
1891         }
1892
1893         if (cur_mm) {
1894                 set_fs(old_fs);
1895                 unuse_mm(cur_mm);
1896                 mmput(cur_mm);
1897         }
1898 }
1899
1900 /*
1901  * See if we can piggy back onto previously submitted work, that is still
1902  * running. We currently only allow this if the new request is sequential
1903  * to the previous one we punted.
1904  */
1905 static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
1906 {
1907         bool ret = false;
1908
1909         if (!list)
1910                 return false;
1911         if (!(req->flags & REQ_F_SEQ_PREV))
1912                 return false;
1913         if (!atomic_read(&list->cnt))
1914                 return false;
1915
1916         ret = true;
1917         spin_lock(&list->lock);
1918         list_add_tail(&req->list, &list->list);
1919         if (!atomic_read(&list->cnt)) {
1920                 list_del_init(&req->list);
1921                 ret = false;
1922         }
1923         spin_unlock(&list->lock);
1924         return ret;
1925 }
1926
1927 static bool io_op_needs_file(const struct io_uring_sqe *sqe)
1928 {
1929         int op = READ_ONCE(sqe->opcode);
1930
1931         switch (op) {
1932         case IORING_OP_NOP:
1933         case IORING_OP_POLL_REMOVE:
1934                 return false;
1935         default:
1936                 return true;
1937         }
1938 }
1939
1940 static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
1941                            struct io_submit_state *state, struct io_kiocb *req)
1942 {
1943         unsigned flags;
1944         int fd;
1945
1946         flags = READ_ONCE(s->sqe->flags);
1947         fd = READ_ONCE(s->sqe->fd);
1948
1949         if (flags & IOSQE_IO_DRAIN) {
1950                 req->flags |= REQ_F_IO_DRAIN;
1951                 req->sequence = ctx->cached_sq_head - 1;
1952         }
1953
1954         if (!io_op_needs_file(s->sqe))
1955                 return 0;
1956
1957         if (flags & IOSQE_FIXED_FILE) {
1958                 if (unlikely(!ctx->user_files ||
1959                     (unsigned) fd >= ctx->nr_user_files))
1960                         return -EBADF;
1961                 req->file = ctx->user_files[fd];
1962                 req->flags |= REQ_F_FIXED_FILE;
1963         } else {
1964                 if (s->needs_fixed_file)
1965                         return -EBADF;
1966                 req->file = io_file_get(state, fd);
1967                 if (unlikely(!req->file))
1968                         return -EBADF;
1969         }
1970
1971         return 0;
1972 }
1973
1974 static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1975                         struct sqe_submit *s)
1976 {
1977         int ret;
1978
1979         ret = __io_submit_sqe(ctx, req, s, true);
1980         if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
1981                 struct io_uring_sqe *sqe_copy;
1982
1983                 sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1984                 if (sqe_copy) {
1985                         struct async_list *list;
1986
1987                         memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
1988                         s->sqe = sqe_copy;
1989
1990                         memcpy(&req->submit, s, sizeof(*s));
1991                         list = io_async_list_from_sqe(ctx, s->sqe);
1992                         if (!io_add_to_prev_work(list, req)) {
1993                                 if (list)
1994                                         atomic_inc(&list->cnt);
1995                                 INIT_WORK(&req->work, io_sq_wq_submit_work);
1996                                 queue_work(ctx->sqo_wq, &req->work);
1997                         }
1998
1999                         /*
2000                          * Queued up for async execution, worker will release
2001                          * submit reference when the iocb is actually submitted.
2002                          */
2003                         return 0;
2004                 }
2005         }
2006
2007         /* drop submission reference */
2008         io_put_req(req);
2009
2010         /* and drop final reference, if we failed */
2011         if (ret) {
2012                 io_cqring_add_event(ctx, req->user_data, ret);
2013                 if (req->flags & REQ_F_LINK)
2014                         req->flags |= REQ_F_FAIL_LINK;
2015                 io_put_req(req);
2016         }
2017
2018         return ret;
2019 }
2020
2021 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
2022
2023 static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
2024                           struct io_submit_state *state, struct io_kiocb **link)
2025 {
2026         struct io_uring_sqe *sqe_copy;
2027         struct io_kiocb *req;
2028         int ret;
2029
2030         /* enforce forwards compatibility on users */
2031         if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
2032                 ret = -EINVAL;
2033                 goto err;
2034         }
2035
2036         req = io_get_req(ctx, state);
2037         if (unlikely(!req)) {
2038                 ret = -EAGAIN;
2039                 goto err;
2040         }
2041
2042         ret = io_req_set_file(ctx, s, state, req);
2043         if (unlikely(ret)) {
2044 err_req:
2045                 io_free_req(req);
2046 err:
2047                 io_cqring_add_event(ctx, s->sqe->user_data, ret);
2048                 return;
2049         }
2050
2051         ret = io_req_defer(ctx, req, s->sqe);
2052         if (ret) {
2053                 if (ret != -EIOCBQUEUED)
2054                         goto err_req;
2055                 return;
2056         }
2057
2058         /*
2059          * If we already have a head request, queue this one for async
2060          * submittal once the head completes. If we don't have a head but
2061          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
2062          * submitted sync once the chain is complete. If none of those
2063          * conditions are true (normal request), then just queue it.
2064          */
2065         if (*link) {
2066                 struct io_kiocb *prev = *link;
2067
2068                 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2069                 if (!sqe_copy) {
2070                         ret = -EAGAIN;
2071                         goto err_req;
2072                 }
2073
2074                 s->sqe = sqe_copy;
2075                 memcpy(&req->submit, s, sizeof(*s));
2076                 list_add_tail(&req->list, &prev->link_list);
2077         } else if (s->sqe->flags & IOSQE_IO_LINK) {
2078                 req->flags |= REQ_F_LINK;
2079
2080                 memcpy(&req->submit, s, sizeof(*s));
2081                 INIT_LIST_HEAD(&req->link_list);
2082                 *link = req;
2083         } else {
2084                 io_queue_sqe(ctx, req, s);
2085         }
2086 }
2087
2088 /*
2089  * Batched submission is done, ensure local IO is flushed out.
2090  */
2091 static void io_submit_state_end(struct io_submit_state *state)
2092 {
2093         blk_finish_plug(&state->plug);
2094         io_file_put(state);
2095         if (state->free_reqs)
2096                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
2097                                         &state->reqs[state->cur_req]);
2098 }
2099
2100 /*
2101  * Start submission side cache.
2102  */
2103 static void io_submit_state_start(struct io_submit_state *state,
2104                                   struct io_ring_ctx *ctx, unsigned max_ios)
2105 {
2106         blk_start_plug(&state->plug);
2107         state->free_reqs = 0;
2108         state->file = NULL;
2109         state->ios_left = max_ios;
2110 }
2111
2112 static void io_commit_sqring(struct io_ring_ctx *ctx)
2113 {
2114         struct io_sq_ring *ring = ctx->sq_ring;
2115
2116         if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) {
2117                 /*
2118                  * Ensure any loads from the SQEs are done at this point,
2119                  * since once we write the new head, the application could
2120                  * write new data to them.
2121                  */
2122                 smp_store_release(&ring->r.head, ctx->cached_sq_head);
2123         }
2124 }
2125
2126 /*
2127  * Fetch an sqe, if one is available. Note that s->sqe will point to memory
2128  * that is mapped by userspace. This means that care needs to be taken to
2129  * ensure that reads are stable, as we cannot rely on userspace always
2130  * being a good citizen. If members of the sqe are validated and then later
2131  * used, it's important that those reads are done through READ_ONCE() to
2132  * prevent a re-load down the line.
2133  */
2134 static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
2135 {
2136         struct io_sq_ring *ring = ctx->sq_ring;
2137         unsigned head;
2138
2139         /*
2140          * The cached sq head (or cq tail) serves two purposes:
2141          *
2142          * 1) allows us to batch the cost of updating the user visible
2143          *    head updates.
2144          * 2) allows the kernel side to track the head on its own, even
2145          *    though the application is the one updating it.
2146          */
2147         head = ctx->cached_sq_head;
2148         /* make sure SQ entry isn't read before tail */
2149         if (head == smp_load_acquire(&ring->r.tail))
2150                 return false;
2151
2152         head = READ_ONCE(ring->array[head & ctx->sq_mask]);
2153         if (head < ctx->sq_entries) {
2154                 s->index = head;
2155                 s->sqe = &ctx->sq_sqes[head];
2156                 ctx->cached_sq_head++;
2157                 return true;
2158         }
2159
2160         /* drop invalid entries */
2161         ctx->cached_sq_head++;
2162         ring->dropped++;
2163         return false;
2164 }
2165
2166 static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
2167                           unsigned int nr, bool has_user, bool mm_fault)
2168 {
2169         struct io_submit_state state, *statep = NULL;
2170         struct io_kiocb *link = NULL;
2171         bool prev_was_link = false;
2172         int i, submitted = 0;
2173
2174         if (nr > IO_PLUG_THRESHOLD) {
2175                 io_submit_state_start(&state, ctx, nr);
2176                 statep = &state;
2177         }
2178
2179         for (i = 0; i < nr; i++) {
2180                 /*
2181                  * If previous wasn't linked and we have a linked command,
2182                  * that's the end of the chain. Submit the previous link.
2183                  */
2184                 if (!prev_was_link && link) {
2185                         io_queue_sqe(ctx, link, &link->submit);
2186                         link = NULL;
2187                 }
2188                 prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
2189
2190                 if (unlikely(mm_fault)) {
2191                         io_cqring_add_event(ctx, sqes[i].sqe->user_data,
2192                                                 -EFAULT);
2193                 } else {
2194                         sqes[i].has_user = has_user;
2195                         sqes[i].needs_lock = true;
2196                         sqes[i].needs_fixed_file = true;
2197                         io_submit_sqe(ctx, &sqes[i], statep, &link);
2198                         submitted++;
2199                 }
2200         }
2201
2202         if (link)
2203                 io_queue_sqe(ctx, link, &link->submit);
2204         if (statep)
2205                 io_submit_state_end(&state);
2206
2207         return submitted;
2208 }
2209
2210 static int io_sq_thread(void *data)
2211 {
2212         struct sqe_submit sqes[IO_IOPOLL_BATCH];
2213         struct io_ring_ctx *ctx = data;
2214         struct mm_struct *cur_mm = NULL;
2215         mm_segment_t old_fs;
2216         DEFINE_WAIT(wait);
2217         unsigned inflight;
2218         unsigned long timeout;
2219
2220         complete(&ctx->sqo_thread_started);
2221
2222         old_fs = get_fs();
2223         set_fs(USER_DS);
2224
2225         timeout = inflight = 0;
2226         while (!kthread_should_park()) {
2227                 bool all_fixed, mm_fault = false;
2228                 int i;
2229
2230                 if (inflight) {
2231                         unsigned nr_events = 0;
2232
2233                         if (ctx->flags & IORING_SETUP_IOPOLL) {
2234                                 /*
2235                                  * We disallow the app entering submit/complete
2236                                  * with polling, but we still need to lock the
2237                                  * ring to prevent racing with polled issue
2238                                  * that got punted to a workqueue.
2239                                  */
2240                                 mutex_lock(&ctx->uring_lock);
2241                                 io_iopoll_check(ctx, &nr_events, 0);
2242                                 mutex_unlock(&ctx->uring_lock);
2243                         } else {
2244                                 /*
2245                                  * Normal IO, just pretend everything completed.
2246                                  * We don't have to poll completions for that.
2247                                  */
2248                                 nr_events = inflight;
2249                         }
2250
2251                         inflight -= nr_events;
2252                         if (!inflight)
2253                                 timeout = jiffies + ctx->sq_thread_idle;
2254                 }
2255
2256                 if (!io_get_sqring(ctx, &sqes[0])) {
2257                         /*
2258                          * We're polling. If we're within the defined idle
2259                          * period, then let us spin without work before going
2260                          * to sleep.
2261                          */
2262                         if (inflight || !time_after(jiffies, timeout)) {
2263                                 cpu_relax();
2264                                 continue;
2265                         }
2266
2267                         /*
2268                          * Drop cur_mm before scheduling, we can't hold it for
2269                          * long periods (or over schedule()). Do this before
2270                          * adding ourselves to the waitqueue, as the unuse/drop
2271                          * may sleep.
2272                          */
2273                         if (cur_mm) {
2274                                 unuse_mm(cur_mm);
2275                                 mmput(cur_mm);
2276                                 cur_mm = NULL;
2277                         }
2278
2279                         prepare_to_wait(&ctx->sqo_wait, &wait,
2280                                                 TASK_INTERRUPTIBLE);
2281
2282                         /* Tell userspace we may need a wakeup call */
2283                         ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP;
2284                         /* make sure to read SQ tail after writing flags */
2285                         smp_mb();
2286
2287                         if (!io_get_sqring(ctx, &sqes[0])) {
2288                                 if (kthread_should_park()) {
2289                                         finish_wait(&ctx->sqo_wait, &wait);
2290                                         break;
2291                                 }
2292                                 if (signal_pending(current))
2293                                         flush_signals(current);
2294                                 schedule();
2295                                 finish_wait(&ctx->sqo_wait, &wait);
2296
2297                                 ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
2298                                 continue;
2299                         }
2300                         finish_wait(&ctx->sqo_wait, &wait);
2301
2302                         ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP;
2303                 }
2304
2305                 i = 0;
2306                 all_fixed = true;
2307                 do {
2308                         if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
2309                                 all_fixed = false;
2310
2311                         i++;
2312                         if (i == ARRAY_SIZE(sqes))
2313                                 break;
2314                 } while (io_get_sqring(ctx, &sqes[i]));
2315
2316                 /* Unless all new commands are FIXED regions, grab mm */
2317                 if (!all_fixed && !cur_mm) {
2318                         mm_fault = !mmget_not_zero(ctx->sqo_mm);
2319                         if (!mm_fault) {
2320                                 use_mm(ctx->sqo_mm);
2321                                 cur_mm = ctx->sqo_mm;
2322                         }
2323                 }
2324
2325                 inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
2326                                                 mm_fault);
2327
2328                 /* Commit SQ ring head once we've consumed all SQEs */
2329                 io_commit_sqring(ctx);
2330         }
2331
2332         set_fs(old_fs);
2333         if (cur_mm) {
2334                 unuse_mm(cur_mm);
2335                 mmput(cur_mm);
2336         }
2337
2338         kthread_parkme();
2339
2340         return 0;
2341 }
2342
2343 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
2344 {
2345         struct io_submit_state state, *statep = NULL;
2346         struct io_kiocb *link = NULL;
2347         bool prev_was_link = false;
2348         int i, submit = 0;
2349
2350         if (to_submit > IO_PLUG_THRESHOLD) {
2351                 io_submit_state_start(&state, ctx, to_submit);
2352                 statep = &state;
2353         }
2354
2355         for (i = 0; i < to_submit; i++) {
2356                 struct sqe_submit s;
2357
2358                 if (!io_get_sqring(ctx, &s))
2359                         break;
2360
2361                 /*
2362                  * If previous wasn't linked and we have a linked command,
2363                  * that's the end of the chain. Submit the previous link.
2364                  */
2365                 if (!prev_was_link && link) {
2366                         io_queue_sqe(ctx, link, &link->submit);
2367                         link = NULL;
2368                 }
2369                 prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
2370
2371                 s.has_user = true;
2372                 s.needs_lock = false;
2373                 s.needs_fixed_file = false;
2374                 submit++;
2375                 io_submit_sqe(ctx, &s, statep, &link);
2376         }
2377         io_commit_sqring(ctx);
2378
2379         if (link)
2380                 io_queue_sqe(ctx, link, &link->submit);
2381         if (statep)
2382                 io_submit_state_end(statep);
2383
2384         return submit;
2385 }
2386
2387 static unsigned io_cqring_events(struct io_cq_ring *ring)
2388 {
2389         /* See comment at the top of this file */
2390         smp_rmb();
2391         return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
2392 }
2393
2394 /*
2395  * Wait until events become available, if we don't already have some. The
2396  * application must reap them itself, as they reside on the shared cq ring.
2397  */
2398 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
2399                           const sigset_t __user *sig, size_t sigsz)
2400 {
2401         struct io_cq_ring *ring = ctx->cq_ring;
2402         sigset_t ksigmask, sigsaved;
2403         int ret;
2404
2405         if (io_cqring_events(ring) >= min_events)
2406                 return 0;
2407
2408         if (sig) {
2409 #ifdef CONFIG_COMPAT
2410                 if (in_compat_syscall())
2411                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
2412                                                       &ksigmask, &sigsaved, sigsz);
2413                 else
2414 #endif
2415                         ret = set_user_sigmask(sig, &ksigmask,
2416                                                &sigsaved, sigsz);
2417
2418                 if (ret)
2419                         return ret;
2420         }
2421
2422         ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
2423
2424         if (sig)
2425                 restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
2426
2427         if (ret == -ERESTARTSYS)
2428                 ret = -EINTR;
2429
2430         return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0;
2431 }
2432
2433 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
2434 {
2435 #if defined(CONFIG_UNIX)
2436         if (ctx->ring_sock) {
2437                 struct sock *sock = ctx->ring_sock->sk;
2438                 struct sk_buff *skb;
2439
2440                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
2441                         kfree_skb(skb);
2442         }
2443 #else
2444         int i;
2445
2446         for (i = 0; i < ctx->nr_user_files; i++)
2447                 fput(ctx->user_files[i]);
2448 #endif
2449 }
2450
2451 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
2452 {
2453         if (!ctx->user_files)
2454                 return -ENXIO;
2455
2456         __io_sqe_files_unregister(ctx);
2457         kfree(ctx->user_files);
2458         ctx->user_files = NULL;
2459         ctx->nr_user_files = 0;
2460         return 0;
2461 }
2462
2463 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
2464 {
2465         if (ctx->sqo_thread) {
2466                 wait_for_completion(&ctx->sqo_thread_started);
2467                 /*
2468                  * The park is a bit of a work-around, without it we get
2469                  * warning spews on shutdown with SQPOLL set and affinity
2470                  * set to a single CPU.
2471                  */
2472                 kthread_park(ctx->sqo_thread);
2473                 kthread_stop(ctx->sqo_thread);
2474                 ctx->sqo_thread = NULL;
2475         }
2476 }
2477
2478 static void io_finish_async(struct io_ring_ctx *ctx)
2479 {
2480         io_sq_thread_stop(ctx);
2481
2482         if (ctx->sqo_wq) {
2483                 destroy_workqueue(ctx->sqo_wq);
2484                 ctx->sqo_wq = NULL;
2485         }
2486 }
2487
2488 #if defined(CONFIG_UNIX)
2489 static void io_destruct_skb(struct sk_buff *skb)
2490 {
2491         struct io_ring_ctx *ctx = skb->sk->sk_user_data;
2492
2493         io_finish_async(ctx);
2494         unix_destruct_scm(skb);
2495 }
2496
2497 /*
2498  * Ensure the UNIX gc is aware of our file set, so we are certain that
2499  * the io_uring can be safely unregistered on process exit, even if we have
2500  * loops in the file referencing.
2501  */
2502 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
2503 {
2504         struct sock *sk = ctx->ring_sock->sk;
2505         struct scm_fp_list *fpl;
2506         struct sk_buff *skb;
2507         int i;
2508
2509         if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
2510                 unsigned long inflight = ctx->user->unix_inflight + nr;
2511
2512                 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
2513                         return -EMFILE;
2514         }
2515
2516         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
2517         if (!fpl)
2518                 return -ENOMEM;
2519
2520         skb = alloc_skb(0, GFP_KERNEL);
2521         if (!skb) {
2522                 kfree(fpl);
2523                 return -ENOMEM;
2524         }
2525
2526         skb->sk = sk;
2527         skb->destructor = io_destruct_skb;
2528
2529         fpl->user = get_uid(ctx->user);
2530         for (i = 0; i < nr; i++) {
2531                 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
2532                 unix_inflight(fpl->user, fpl->fp[i]);
2533         }
2534
2535         fpl->max = fpl->count = nr;
2536         UNIXCB(skb).fp = fpl;
2537         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2538         skb_queue_head(&sk->sk_receive_queue, skb);
2539
2540         for (i = 0; i < nr; i++)
2541                 fput(fpl->fp[i]);
2542
2543         return 0;
2544 }
2545
2546 /*
2547  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
2548  * causes regular reference counting to break down. We rely on the UNIX
2549  * garbage collection to take care of this problem for us.
2550  */
2551 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
2552 {
2553         unsigned left, total;
2554         int ret = 0;
2555
2556         total = 0;
2557         left = ctx->nr_user_files;
2558         while (left) {
2559                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
2560
2561                 ret = __io_sqe_files_scm(ctx, this_files, total);
2562                 if (ret)
2563                         break;
2564                 left -= this_files;
2565                 total += this_files;
2566         }
2567
2568         if (!ret)
2569                 return 0;
2570
2571         while (total < ctx->nr_user_files) {
2572                 fput(ctx->user_files[total]);
2573                 total++;
2574         }
2575
2576         return ret;
2577 }
2578 #else
2579 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
2580 {
2581         return 0;
2582 }
2583 #endif
2584
2585 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
2586                                  unsigned nr_args)
2587 {
2588         __s32 __user *fds = (__s32 __user *) arg;
2589         int fd, ret = 0;
2590         unsigned i;
2591
2592         if (ctx->user_files)
2593                 return -EBUSY;
2594         if (!nr_args)
2595                 return -EINVAL;
2596         if (nr_args > IORING_MAX_FIXED_FILES)
2597                 return -EMFILE;
2598
2599         ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
2600         if (!ctx->user_files)
2601                 return -ENOMEM;
2602
2603         for (i = 0; i < nr_args; i++) {
2604                 ret = -EFAULT;
2605                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
2606                         break;
2607
2608                 ctx->user_files[i] = fget(fd);
2609
2610                 ret = -EBADF;
2611                 if (!ctx->user_files[i])
2612                         break;
2613                 /*
2614                  * Don't allow io_uring instances to be registered. If UNIX
2615                  * isn't enabled, then this causes a reference cycle and this
2616                  * instance can never get freed. If UNIX is enabled we'll
2617                  * handle it just fine, but there's still no point in allowing
2618                  * a ring fd as it doesn't support regular read/write anyway.
2619                  */
2620                 if (ctx->user_files[i]->f_op == &io_uring_fops) {
2621                         fput(ctx->user_files[i]);
2622                         break;
2623                 }
2624                 ctx->nr_user_files++;
2625                 ret = 0;
2626         }
2627
2628         if (ret) {
2629                 for (i = 0; i < ctx->nr_user_files; i++)
2630                         fput(ctx->user_files[i]);
2631
2632                 kfree(ctx->user_files);
2633                 ctx->user_files = NULL;
2634                 ctx->nr_user_files = 0;
2635                 return ret;
2636         }
2637
2638         ret = io_sqe_files_scm(ctx);
2639         if (ret)
2640                 io_sqe_files_unregister(ctx);
2641
2642         return ret;
2643 }
2644
2645 static int io_sq_offload_start(struct io_ring_ctx *ctx,
2646                                struct io_uring_params *p)
2647 {
2648         int ret;
2649
2650         init_waitqueue_head(&ctx->sqo_wait);
2651         mmgrab(current->mm);
2652         ctx->sqo_mm = current->mm;
2653
2654         if (ctx->flags & IORING_SETUP_SQPOLL) {
2655                 ret = -EPERM;
2656                 if (!capable(CAP_SYS_ADMIN))
2657                         goto err;
2658
2659                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
2660                 if (!ctx->sq_thread_idle)
2661                         ctx->sq_thread_idle = HZ;
2662
2663                 if (p->flags & IORING_SETUP_SQ_AFF) {
2664                         int cpu = p->sq_thread_cpu;
2665
2666                         ret = -EINVAL;
2667                         if (cpu >= nr_cpu_ids)
2668                                 goto err;
2669                         if (!cpu_online(cpu))
2670                                 goto err;
2671
2672                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
2673                                                         ctx, cpu,
2674                                                         "io_uring-sq");
2675                 } else {
2676                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
2677                                                         "io_uring-sq");
2678                 }
2679                 if (IS_ERR(ctx->sqo_thread)) {
2680                         ret = PTR_ERR(ctx->sqo_thread);
2681                         ctx->sqo_thread = NULL;
2682                         goto err;
2683                 }
2684                 wake_up_process(ctx->sqo_thread);
2685         } else if (p->flags & IORING_SETUP_SQ_AFF) {
2686                 /* Can't have SQ_AFF without SQPOLL */
2687                 ret = -EINVAL;
2688                 goto err;
2689         }
2690
2691         /* Do QD, or 2 * CPUS, whatever is smallest */
2692         ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE,
2693                         min(ctx->sq_entries - 1, 2 * num_online_cpus()));
2694         if (!ctx->sqo_wq) {
2695                 ret = -ENOMEM;
2696                 goto err;
2697         }
2698
2699         return 0;
2700 err:
2701         io_sq_thread_stop(ctx);
2702         mmdrop(ctx->sqo_mm);
2703         ctx->sqo_mm = NULL;
2704         return ret;
2705 }
2706
2707 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
2708 {
2709         atomic_long_sub(nr_pages, &user->locked_vm);
2710 }
2711
2712 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
2713 {
2714         unsigned long page_limit, cur_pages, new_pages;
2715
2716         /* Don't allow more pages than we can safely lock */
2717         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
2718
2719         do {
2720                 cur_pages = atomic_long_read(&user->locked_vm);
2721                 new_pages = cur_pages + nr_pages;
2722                 if (new_pages > page_limit)
2723                         return -ENOMEM;
2724         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
2725                                         new_pages) != cur_pages);
2726
2727         return 0;
2728 }
2729
2730 static void io_mem_free(void *ptr)
2731 {
2732         struct page *page;
2733
2734         if (!ptr)
2735                 return;
2736
2737         page = virt_to_head_page(ptr);
2738         if (put_page_testzero(page))
2739                 free_compound_page(page);
2740 }
2741
2742 static void *io_mem_alloc(size_t size)
2743 {
2744         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
2745                                 __GFP_NORETRY;
2746
2747         return (void *) __get_free_pages(gfp_flags, get_order(size));
2748 }
2749
2750 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
2751 {
2752         struct io_sq_ring *sq_ring;
2753         struct io_cq_ring *cq_ring;
2754         size_t bytes;
2755
2756         bytes = struct_size(sq_ring, array, sq_entries);
2757         bytes += array_size(sizeof(struct io_uring_sqe), sq_entries);
2758         bytes += struct_size(cq_ring, cqes, cq_entries);
2759
2760         return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
2761 }
2762
2763 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
2764 {
2765         int i, j;
2766
2767         if (!ctx->user_bufs)
2768                 return -ENXIO;
2769
2770         for (i = 0; i < ctx->nr_user_bufs; i++) {
2771                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
2772
2773                 for (j = 0; j < imu->nr_bvecs; j++)
2774                         put_page(imu->bvec[j].bv_page);
2775
2776                 if (ctx->account_mem)
2777                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
2778                 kvfree(imu->bvec);
2779                 imu->nr_bvecs = 0;
2780         }
2781
2782         kfree(ctx->user_bufs);
2783         ctx->user_bufs = NULL;
2784         ctx->nr_user_bufs = 0;
2785         return 0;
2786 }
2787
2788 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
2789                        void __user *arg, unsigned index)
2790 {
2791         struct iovec __user *src;
2792
2793 #ifdef CONFIG_COMPAT
2794         if (ctx->compat) {
2795                 struct compat_iovec __user *ciovs;
2796                 struct compat_iovec ciov;
2797
2798                 ciovs = (struct compat_iovec __user *) arg;
2799                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
2800                         return -EFAULT;
2801
2802                 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
2803                 dst->iov_len = ciov.iov_len;
2804                 return 0;
2805         }
2806 #endif
2807         src = (struct iovec __user *) arg;
2808         if (copy_from_user(dst, &src[index], sizeof(*dst)))
2809                 return -EFAULT;
2810         return 0;
2811 }
2812
2813 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
2814                                   unsigned nr_args)
2815 {
2816         struct vm_area_struct **vmas = NULL;
2817         struct page **pages = NULL;
2818         int i, j, got_pages = 0;
2819         int ret = -EINVAL;
2820
2821         if (ctx->user_bufs)
2822                 return -EBUSY;
2823         if (!nr_args || nr_args > UIO_MAXIOV)
2824                 return -EINVAL;
2825
2826         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
2827                                         GFP_KERNEL);
2828         if (!ctx->user_bufs)
2829                 return -ENOMEM;
2830
2831         for (i = 0; i < nr_args; i++) {
2832                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
2833                 unsigned long off, start, end, ubuf;
2834                 int pret, nr_pages;
2835                 struct iovec iov;
2836                 size_t size;
2837
2838                 ret = io_copy_iov(ctx, &iov, arg, i);
2839                 if (ret)
2840                         goto err;
2841
2842                 /*
2843                  * Don't impose further limits on the size and buffer
2844                  * constraints here, we'll -EINVAL later when IO is
2845                  * submitted if they are wrong.
2846                  */
2847                 ret = -EFAULT;
2848                 if (!iov.iov_base || !iov.iov_len)
2849                         goto err;
2850
2851                 /* arbitrary limit, but we need something */
2852                 if (iov.iov_len > SZ_1G)
2853                         goto err;
2854
2855                 ubuf = (unsigned long) iov.iov_base;
2856                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
2857                 start = ubuf >> PAGE_SHIFT;
2858                 nr_pages = end - start;
2859
2860                 if (ctx->account_mem) {
2861                         ret = io_account_mem(ctx->user, nr_pages);
2862                         if (ret)
2863                                 goto err;
2864                 }
2865
2866                 ret = 0;
2867                 if (!pages || nr_pages > got_pages) {
2868                         kfree(vmas);
2869                         kfree(pages);
2870                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
2871                                                 GFP_KERNEL);
2872                         vmas = kvmalloc_array(nr_pages,
2873                                         sizeof(struct vm_area_struct *),
2874                                         GFP_KERNEL);
2875                         if (!pages || !vmas) {
2876                                 ret = -ENOMEM;
2877                                 if (ctx->account_mem)
2878                                         io_unaccount_mem(ctx->user, nr_pages);
2879                                 goto err;
2880                         }
2881                         got_pages = nr_pages;
2882                 }
2883
2884                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
2885                                                 GFP_KERNEL);
2886                 ret = -ENOMEM;
2887                 if (!imu->bvec) {
2888                         if (ctx->account_mem)
2889                                 io_unaccount_mem(ctx->user, nr_pages);
2890                         goto err;
2891                 }
2892
2893                 ret = 0;
2894                 down_read(&current->mm->mmap_sem);
2895                 pret = get_user_pages(ubuf, nr_pages,
2896                                       FOLL_WRITE | FOLL_LONGTERM,
2897                                       pages, vmas);
2898                 if (pret == nr_pages) {
2899                         /* don't support file backed memory */
2900                         for (j = 0; j < nr_pages; j++) {
2901                                 struct vm_area_struct *vma = vmas[j];
2902
2903                                 if (vma->vm_file &&
2904                                     !is_file_hugepages(vma->vm_file)) {
2905                                         ret = -EOPNOTSUPP;
2906                                         break;
2907                                 }
2908                         }
2909                 } else {
2910                         ret = pret < 0 ? pret : -EFAULT;
2911                 }
2912                 up_read(&current->mm->mmap_sem);
2913                 if (ret) {
2914                         /*
2915                          * if we did partial map, or found file backed vmas,
2916                          * release any pages we did get
2917                          */
2918                         if (pret > 0) {
2919                                 for (j = 0; j < pret; j++)
2920                                         put_page(pages[j]);
2921                         }
2922                         if (ctx->account_mem)
2923                                 io_unaccount_mem(ctx->user, nr_pages);
2924                         kvfree(imu->bvec);
2925                         goto err;
2926                 }
2927
2928                 off = ubuf & ~PAGE_MASK;
2929                 size = iov.iov_len;
2930                 for (j = 0; j < nr_pages; j++) {
2931                         size_t vec_len;
2932
2933                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
2934                         imu->bvec[j].bv_page = pages[j];
2935                         imu->bvec[j].bv_len = vec_len;
2936                         imu->bvec[j].bv_offset = off;
2937                         off = 0;
2938                         size -= vec_len;
2939                 }
2940                 /* store original address for later verification */
2941                 imu->ubuf = ubuf;
2942                 imu->len = iov.iov_len;
2943                 imu->nr_bvecs = nr_pages;
2944
2945                 ctx->nr_user_bufs++;
2946         }
2947         kvfree(pages);
2948         kvfree(vmas);
2949         return 0;
2950 err:
2951         kvfree(pages);
2952         kvfree(vmas);
2953         io_sqe_buffer_unregister(ctx);
2954         return ret;
2955 }
2956
2957 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
2958 {
2959         __s32 __user *fds = arg;
2960         int fd;
2961
2962         if (ctx->cq_ev_fd)
2963                 return -EBUSY;
2964
2965         if (copy_from_user(&fd, fds, sizeof(*fds)))
2966                 return -EFAULT;
2967
2968         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
2969         if (IS_ERR(ctx->cq_ev_fd)) {
2970                 int ret = PTR_ERR(ctx->cq_ev_fd);
2971                 ctx->cq_ev_fd = NULL;
2972                 return ret;
2973         }
2974
2975         return 0;
2976 }
2977
2978 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
2979 {
2980         if (ctx->cq_ev_fd) {
2981                 eventfd_ctx_put(ctx->cq_ev_fd);
2982                 ctx->cq_ev_fd = NULL;
2983                 return 0;
2984         }
2985
2986         return -ENXIO;
2987 }
2988
2989 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
2990 {
2991         io_finish_async(ctx);
2992         if (ctx->sqo_mm)
2993                 mmdrop(ctx->sqo_mm);
2994
2995         io_iopoll_reap_events(ctx);
2996         io_sqe_buffer_unregister(ctx);
2997         io_sqe_files_unregister(ctx);
2998         io_eventfd_unregister(ctx);
2999
3000 #if defined(CONFIG_UNIX)
3001         if (ctx->ring_sock) {
3002                 ctx->ring_sock->file = NULL; /* so that iput() is called */
3003                 sock_release(ctx->ring_sock);
3004         }
3005 #endif
3006
3007         io_mem_free(ctx->sq_ring);
3008         io_mem_free(ctx->sq_sqes);
3009         io_mem_free(ctx->cq_ring);
3010
3011         percpu_ref_exit(&ctx->refs);
3012         if (ctx->account_mem)
3013                 io_unaccount_mem(ctx->user,
3014                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
3015         free_uid(ctx->user);
3016         kfree(ctx);
3017 }
3018
3019 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
3020 {
3021         struct io_ring_ctx *ctx = file->private_data;
3022         __poll_t mask = 0;
3023
3024         poll_wait(file, &ctx->cq_wait, wait);
3025         /*
3026          * synchronizes with barrier from wq_has_sleeper call in
3027          * io_commit_cqring
3028          */
3029         smp_rmb();
3030         if (READ_ONCE(ctx->sq_ring->r.tail) - ctx->cached_sq_head !=
3031             ctx->sq_ring->ring_entries)
3032                 mask |= EPOLLOUT | EPOLLWRNORM;
3033         if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail)
3034                 mask |= EPOLLIN | EPOLLRDNORM;
3035
3036         return mask;
3037 }
3038
3039 static int io_uring_fasync(int fd, struct file *file, int on)
3040 {
3041         struct io_ring_ctx *ctx = file->private_data;
3042
3043         return fasync_helper(fd, file, on, &ctx->cq_fasync);
3044 }
3045
3046 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
3047 {
3048         mutex_lock(&ctx->uring_lock);
3049         percpu_ref_kill(&ctx->refs);
3050         mutex_unlock(&ctx->uring_lock);
3051
3052         io_poll_remove_all(ctx);
3053         io_iopoll_reap_events(ctx);
3054         wait_for_completion(&ctx->ctx_done);
3055         io_ring_ctx_free(ctx);
3056 }
3057
3058 static int io_uring_release(struct inode *inode, struct file *file)
3059 {
3060         struct io_ring_ctx *ctx = file->private_data;
3061
3062         file->private_data = NULL;
3063         io_ring_ctx_wait_and_kill(ctx);
3064         return 0;
3065 }
3066
3067 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
3068 {
3069         loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
3070         unsigned long sz = vma->vm_end - vma->vm_start;
3071         struct io_ring_ctx *ctx = file->private_data;
3072         unsigned long pfn;
3073         struct page *page;
3074         void *ptr;
3075
3076         switch (offset) {
3077         case IORING_OFF_SQ_RING:
3078                 ptr = ctx->sq_ring;
3079                 break;
3080         case IORING_OFF_SQES:
3081                 ptr = ctx->sq_sqes;
3082                 break;
3083         case IORING_OFF_CQ_RING:
3084                 ptr = ctx->cq_ring;
3085                 break;
3086         default:
3087                 return -EINVAL;
3088         }
3089
3090         page = virt_to_head_page(ptr);
3091         if (sz > (PAGE_SIZE << compound_order(page)))
3092                 return -EINVAL;
3093
3094         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
3095         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
3096 }
3097
3098 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
3099                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
3100                 size_t, sigsz)
3101 {
3102         struct io_ring_ctx *ctx;
3103         long ret = -EBADF;
3104         int submitted = 0;
3105         struct fd f;
3106
3107         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
3108                 return -EINVAL;
3109
3110         f = fdget(fd);
3111         if (!f.file)
3112                 return -EBADF;
3113
3114         ret = -EOPNOTSUPP;
3115         if (f.file->f_op != &io_uring_fops)
3116                 goto out_fput;
3117
3118         ret = -ENXIO;
3119         ctx = f.file->private_data;
3120         if (!percpu_ref_tryget(&ctx->refs))
3121                 goto out_fput;
3122
3123         /*
3124          * For SQ polling, the thread will do all submissions and completions.
3125          * Just return the requested submit count, and wake the thread if
3126          * we were asked to.
3127          */
3128         if (ctx->flags & IORING_SETUP_SQPOLL) {
3129                 if (flags & IORING_ENTER_SQ_WAKEUP)
3130                         wake_up(&ctx->sqo_wait);
3131                 submitted = to_submit;
3132                 goto out_ctx;
3133         }
3134
3135         ret = 0;
3136         if (to_submit) {
3137                 to_submit = min(to_submit, ctx->sq_entries);
3138
3139                 mutex_lock(&ctx->uring_lock);
3140                 submitted = io_ring_submit(ctx, to_submit);
3141                 mutex_unlock(&ctx->uring_lock);
3142         }
3143         if (flags & IORING_ENTER_GETEVENTS) {
3144                 unsigned nr_events = 0;
3145
3146                 min_complete = min(min_complete, ctx->cq_entries);
3147
3148                 if (ctx->flags & IORING_SETUP_IOPOLL) {
3149                         mutex_lock(&ctx->uring_lock);
3150                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
3151                         mutex_unlock(&ctx->uring_lock);
3152                 } else {
3153                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
3154                 }
3155         }
3156
3157 out_ctx:
3158         io_ring_drop_ctx_refs(ctx, 1);
3159 out_fput:
3160         fdput(f);
3161         return submitted ? submitted : ret;
3162 }
3163
3164 static const struct file_operations io_uring_fops = {
3165         .release        = io_uring_release,
3166         .mmap           = io_uring_mmap,
3167         .poll           = io_uring_poll,
3168         .fasync         = io_uring_fasync,
3169 };
3170
3171 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
3172                                   struct io_uring_params *p)
3173 {
3174         struct io_sq_ring *sq_ring;
3175         struct io_cq_ring *cq_ring;
3176         size_t size;
3177
3178         sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries));
3179         if (!sq_ring)
3180                 return -ENOMEM;
3181
3182         ctx->sq_ring = sq_ring;
3183         sq_ring->ring_mask = p->sq_entries - 1;
3184         sq_ring->ring_entries = p->sq_entries;
3185         ctx->sq_mask = sq_ring->ring_mask;
3186         ctx->sq_entries = sq_ring->ring_entries;
3187
3188         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3189         if (size == SIZE_MAX)
3190                 return -EOVERFLOW;
3191
3192         ctx->sq_sqes = io_mem_alloc(size);
3193         if (!ctx->sq_sqes)
3194                 return -ENOMEM;
3195
3196         cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries));
3197         if (!cq_ring)
3198                 return -ENOMEM;
3199
3200         ctx->cq_ring = cq_ring;
3201         cq_ring->ring_mask = p->cq_entries - 1;
3202         cq_ring->ring_entries = p->cq_entries;
3203         ctx->cq_mask = cq_ring->ring_mask;
3204         ctx->cq_entries = cq_ring->ring_entries;
3205         return 0;
3206 }
3207
3208 /*
3209  * Allocate an anonymous fd, this is what constitutes the application
3210  * visible backing of an io_uring instance. The application mmaps this
3211  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
3212  * we have to tie this fd to a socket for file garbage collection purposes.
3213  */
3214 static int io_uring_get_fd(struct io_ring_ctx *ctx)
3215 {
3216         struct file *file;
3217         int ret;
3218
3219 #if defined(CONFIG_UNIX)
3220         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
3221                                 &ctx->ring_sock);
3222         if (ret)
3223                 return ret;
3224 #endif
3225
3226         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
3227         if (ret < 0)
3228                 goto err;
3229
3230         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
3231                                         O_RDWR | O_CLOEXEC);
3232         if (IS_ERR(file)) {
3233                 put_unused_fd(ret);
3234                 ret = PTR_ERR(file);
3235                 goto err;
3236         }
3237
3238 #if defined(CONFIG_UNIX)
3239         ctx->ring_sock->file = file;
3240         ctx->ring_sock->sk->sk_user_data = ctx;
3241 #endif
3242         fd_install(ret, file);
3243         return ret;
3244 err:
3245 #if defined(CONFIG_UNIX)
3246         sock_release(ctx->ring_sock);
3247         ctx->ring_sock = NULL;
3248 #endif
3249         return ret;
3250 }
3251
3252 static int io_uring_create(unsigned entries, struct io_uring_params *p)
3253 {
3254         struct user_struct *user = NULL;
3255         struct io_ring_ctx *ctx;
3256         bool account_mem;
3257         int ret;
3258
3259         if (!entries || entries > IORING_MAX_ENTRIES)
3260                 return -EINVAL;
3261
3262         /*
3263          * Use twice as many entries for the CQ ring. It's possible for the
3264          * application to drive a higher depth than the size of the SQ ring,
3265          * since the sqes are only used at submission time. This allows for
3266          * some flexibility in overcommitting a bit.
3267          */
3268         p->sq_entries = roundup_pow_of_two(entries);
3269         p->cq_entries = 2 * p->sq_entries;
3270
3271         user = get_uid(current_user());
3272         account_mem = !capable(CAP_IPC_LOCK);
3273
3274         if (account_mem) {
3275                 ret = io_account_mem(user,
3276                                 ring_pages(p->sq_entries, p->cq_entries));
3277                 if (ret) {
3278                         free_uid(user);
3279                         return ret;
3280                 }
3281         }
3282
3283         ctx = io_ring_ctx_alloc(p);
3284         if (!ctx) {
3285                 if (account_mem)
3286                         io_unaccount_mem(user, ring_pages(p->sq_entries,
3287                                                                 p->cq_entries));
3288                 free_uid(user);
3289                 return -ENOMEM;
3290         }
3291         ctx->compat = in_compat_syscall();
3292         ctx->account_mem = account_mem;
3293         ctx->user = user;
3294
3295         ret = io_allocate_scq_urings(ctx, p);
3296         if (ret)
3297                 goto err;
3298
3299         ret = io_sq_offload_start(ctx, p);
3300         if (ret)
3301                 goto err;
3302
3303         ret = io_uring_get_fd(ctx);
3304         if (ret < 0)
3305                 goto err;
3306
3307         memset(&p->sq_off, 0, sizeof(p->sq_off));
3308         p->sq_off.head = offsetof(struct io_sq_ring, r.head);
3309         p->sq_off.tail = offsetof(struct io_sq_ring, r.tail);
3310         p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask);
3311         p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries);
3312         p->sq_off.flags = offsetof(struct io_sq_ring, flags);
3313         p->sq_off.dropped = offsetof(struct io_sq_ring, dropped);
3314         p->sq_off.array = offsetof(struct io_sq_ring, array);
3315
3316         memset(&p->cq_off, 0, sizeof(p->cq_off));
3317         p->cq_off.head = offsetof(struct io_cq_ring, r.head);
3318         p->cq_off.tail = offsetof(struct io_cq_ring, r.tail);
3319         p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask);
3320         p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries);
3321         p->cq_off.overflow = offsetof(struct io_cq_ring, overflow);
3322         p->cq_off.cqes = offsetof(struct io_cq_ring, cqes);
3323         return ret;
3324 err:
3325         io_ring_ctx_wait_and_kill(ctx);
3326         return ret;
3327 }
3328
3329 /*
3330  * Sets up an aio uring context, and returns the fd. Applications asks for a
3331  * ring size, we return the actual sq/cq ring sizes (among other things) in the
3332  * params structure passed in.
3333  */
3334 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
3335 {
3336         struct io_uring_params p;
3337         long ret;
3338         int i;
3339
3340         if (copy_from_user(&p, params, sizeof(p)))
3341                 return -EFAULT;
3342         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
3343                 if (p.resv[i])
3344                         return -EINVAL;
3345         }
3346
3347         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
3348                         IORING_SETUP_SQ_AFF))
3349                 return -EINVAL;
3350
3351         ret = io_uring_create(entries, &p);
3352         if (ret < 0)
3353                 return ret;
3354
3355         if (copy_to_user(params, &p, sizeof(p)))
3356                 return -EFAULT;
3357
3358         return ret;
3359 }
3360
3361 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
3362                 struct io_uring_params __user *, params)
3363 {
3364         return io_uring_setup(entries, params);
3365 }
3366
3367 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
3368                                void __user *arg, unsigned nr_args)
3369         __releases(ctx->uring_lock)
3370         __acquires(ctx->uring_lock)
3371 {
3372         int ret;
3373
3374         /*
3375          * We're inside the ring mutex, if the ref is already dying, then
3376          * someone else killed the ctx or is already going through
3377          * io_uring_register().
3378          */
3379         if (percpu_ref_is_dying(&ctx->refs))
3380                 return -ENXIO;
3381
3382         percpu_ref_kill(&ctx->refs);
3383
3384         /*
3385          * Drop uring mutex before waiting for references to exit. If another
3386          * thread is currently inside io_uring_enter() it might need to grab
3387          * the uring_lock to make progress. If we hold it here across the drain
3388          * wait, then we can deadlock. It's safe to drop the mutex here, since
3389          * no new references will come in after we've killed the percpu ref.
3390          */
3391         mutex_unlock(&ctx->uring_lock);
3392         wait_for_completion(&ctx->ctx_done);
3393         mutex_lock(&ctx->uring_lock);
3394
3395         switch (opcode) {
3396         case IORING_REGISTER_BUFFERS:
3397                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
3398                 break;
3399         case IORING_UNREGISTER_BUFFERS:
3400                 ret = -EINVAL;
3401                 if (arg || nr_args)
3402                         break;
3403                 ret = io_sqe_buffer_unregister(ctx);
3404                 break;
3405         case IORING_REGISTER_FILES:
3406                 ret = io_sqe_files_register(ctx, arg, nr_args);
3407                 break;
3408         case IORING_UNREGISTER_FILES:
3409                 ret = -EINVAL;
3410                 if (arg || nr_args)
3411                         break;
3412                 ret = io_sqe_files_unregister(ctx);
3413                 break;
3414         case IORING_REGISTER_EVENTFD:
3415                 ret = -EINVAL;
3416                 if (nr_args != 1)
3417                         break;
3418                 ret = io_eventfd_register(ctx, arg);
3419                 break;
3420         case IORING_UNREGISTER_EVENTFD:
3421                 ret = -EINVAL;
3422                 if (arg || nr_args)
3423                         break;
3424                 ret = io_eventfd_unregister(ctx);
3425                 break;
3426         default:
3427                 ret = -EINVAL;
3428                 break;
3429         }
3430
3431         /* bring the ctx back to life */
3432         reinit_completion(&ctx->ctx_done);
3433         percpu_ref_reinit(&ctx->refs);
3434         return ret;
3435 }
3436
3437 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
3438                 void __user *, arg, unsigned int, nr_args)
3439 {
3440         struct io_ring_ctx *ctx;
3441         long ret = -EBADF;
3442         struct fd f;
3443
3444         f = fdget(fd);
3445         if (!f.file)
3446                 return -EBADF;
3447
3448         ret = -EOPNOTSUPP;
3449         if (f.file->f_op != &io_uring_fops)
3450                 goto out_fput;
3451
3452         ctx = f.file->private_data;
3453
3454         mutex_lock(&ctx->uring_lock);
3455         ret = __io_uring_register(ctx, opcode, arg, nr_args);
3456         mutex_unlock(&ctx->uring_lock);
3457 out_fput:
3458         fdput(f);
3459         return ret;
3460 }
3461
3462 static int __init io_uring_init(void)
3463 {
3464         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3465         return 0;
3466 };
3467 __initcall(io_uring_init);