fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes and to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <linux/refcount.h>
  48 #include <linux/uio.h>
  49
  50 #include <linux/sched/signal.h>
  51 #include <linux/fs.h>
  52 #include <linux/file.h>
  53 #include <linux/fdtable.h>
  54 #include <linux/mm.h>
  55 #include <linux/mman.h>
  56 #include <linux/mmu_context.h>
  57 #include <linux/percpu.h>
  58 #include <linux/slab.h>
  59 #include <linux/kthread.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73
  74 #define CREATE_TRACE_POINTS
  75 #include <trace/events/io_uring.h>
  76
  77 #include <uapi/linux/io_uring.h>
  78
  79 #include "internal.h"
  80 #include "io-wq.h"
  81
  /*
   * Upper bounds for the ring sizes: IORING_MAX_CQ_ENTRIES is defined as twice
   * IORING_MAX_ENTRIES. NOTE(review): where these limits are enforced is not
   * visible in this part of the file.
   */
  82 #define IORING_MAX_ENTRIES      32768
  83 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  84
  85 /*
  86  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  87  */
  88 #define IORING_FILE_TABLE_SHIFT 9
  /*
   * Derived from the shift: entries per table, the mask selecting an index
   * within one table, and the overall cap of 64 tables' worth of fixed files.
   */
  89 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
  90 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
  91 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
  92
  /*
   * Head/tail index pair of one ring (struct io_rings embeds one for the SQ
   * and one for the CQ). Both members carry ____cacheline_aligned_in_smp so
   * that, on SMP builds, head and tail do not share a cache line.
   */
  93 struct io_uring {
  94         u32 head ____cacheline_aligned_in_smp;
  95         u32 tail ____cacheline_aligned_in_smp;
  96 };
  97
  98 /*
  99  * This data is shared with the application through the mmap at offsets
 100  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 101  *
 102  * The offsets to the member fields are published through struct
 103  * io_sqring_offsets when calling io_uring_setup.
 104  */
 105 struct io_rings {
 106         /*
 107          * Head and tail offsets into the ring; the offsets need to be
 108          * masked to get valid indices.
 109          *
 110          * The kernel controls head of the sq ring and the tail of the cq ring,
 111          * and the application controls tail of the sq ring and the head of the
 112          * cq ring.
 113          */
 114         struct io_uring         sq, cq;
 115         /*
 116          * Bitmasks to apply to head and tail offsets (constant, equals
 117          * ring_entries - 1)
 118          */
 119         u32                     sq_ring_mask, cq_ring_mask;
 120         /* Ring sizes (constant, power of 2) */
 121         u32                     sq_ring_entries, cq_ring_entries;
 122         /*
 123          * Number of invalid entries dropped by the kernel due to
 124          * invalid index stored in array
 125          *
 126          * Written by the kernel, shouldn't be modified by the
 127          * application (i.e. get number of "new events" by comparing to
 128          * cached value).
 129          *
 130          * After a new SQ head value was read by the application this
 131          * counter includes all submissions that were dropped reaching
 132          * the new SQ head (and possibly more).
 133          */
 134         u32                     sq_dropped;
 135         /*
 136          * Runtime flags
 137          *
 138          * Written by the kernel, shouldn't be modified by the
 139          * application.
 140          *
 141          * The application needs a full memory barrier before checking
 142          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 143          */
 144         u32                     sq_flags;
 145         /*
 146          * Number of completion events lost because the queue was full;
 147          * this should be avoided by the application by making sure
 148          * there are not more requests pending than there is space in
 149          * the completion queue.
 150          *
 151          * Written by the kernel, shouldn't be modified by the
 152          * application (i.e. get number of "new events" by comparing to
 153          * cached value).
 154          *
 155          * As completion events come in out of order this counter is not
 156          * ordered with any other data.
 157          */
 158         u32                     cq_overflow;
 159         /*
 160          * Ring buffer of completion events.
 161          *
 162          * The kernel writes completion events fresh every time they are
 163          * produced, so the application is allowed to modify pending
 164          * entries.
 165          */
 166         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 167 };
 168
 /*
  * One element of ctx->user_bufs (the "fixed mapped user buffers" of a ring).
  * NOTE(review): the per-field notes below are inferred from the field names
  * only; confirm against the buffer registration code.
  */
 169 struct io_mapped_ubuf {
 170         u64             ubuf;           /* presumably the user address of the buffer */
 171         size_t          len;            /* presumably its length in bytes */
 172         struct          bio_vec *bvec;  /* presumably an array of nr_bvecs bio_vecs */
 173         unsigned int    nr_bvecs;
 174 };
 175
 /*
  * One table of struct file pointers; ctx->file_table points at an array of
  * these. NOTE(review): presumably each table holds IORING_MAX_FILES_TABLE
  * entries - confirm against the file registration code.
  */
 176 struct fixed_file_table {
 177         struct file             **files;
 178 };
 179
 /*
  * Kernel-side state of one io_uring instance. It is reachable from the ring
  * file through ->private_data (see io_uring_get_socket()). Every request
  * holds a reference on ->refs for its lifetime (io_get_req() /
  * __io_free_req()). The anonymous sub-structs are individually cache line
  * aligned on SMP builds.
  */
 180 struct io_ring_ctx {
 181         struct {
 182                 struct percpu_ref       refs;
 183         } ____cacheline_aligned_in_smp;
 184
 185         struct {
 186                 unsigned int            flags;
 187                 bool                    compat;
 188                 bool                    account_mem;
                 /*
                  * Set by io_cqring_overflow_flush(force == true); from then
                  * on io_cqring_fill_event() counts overflowing completions
                  * instead of queueing them on ->cq_overflow_list.
                  */
 189                 bool                    cq_overflow_flushed;
 190                 bool                    drain_next;
 191
 192                 /*
 193                  * Ring buffer of indices into array of io_uring_sqe, which is
 194                  * mmapped by the application using the IORING_OFF_SQES offset.
 195                  *
 196                  * This indirection could e.g. be used to assign fixed
 197                  * io_uring_sqe entries to operations and only submit them to
 198                  * the queue when needed.
 199                  *
 200                  * The kernel modifies neither the indices array nor the entries
 201                  * array.
 202                  */
 203                 u32                     *sq_array;
 204                 unsigned                cached_sq_head;
 205                 unsigned                sq_entries;
 206                 unsigned                sq_mask;
 207                 unsigned                sq_thread_idle;
 208                 unsigned                cached_sq_dropped;
 209                 atomic_t                cached_cq_overflow;
 210                 struct io_uring_sqe     *sq_sqes;
 211
                 /* drain requests waiting for earlier IO, see io_get_deferred_req() */
 212                 struct list_head        defer_list;
                 /* pending timeout requests, see io_get_timeout_req() */
 213                 struct list_head        timeout_list;
                 /* completions that did not fit into the CQ ring */
 214                 struct list_head        cq_overflow_list;
 215
 216                 wait_queue_head_t       inflight_wait;
 217         } ____cacheline_aligned_in_smp;
 218
 219         struct io_rings *rings;
 220
 221         /* IO offload */
 222         struct io_wq            *io_wq;
 223         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 224         struct mm_struct        *sqo_mm;
 225         wait_queue_head_t       sqo_wait;
 226
 227         /*
 228          * If used, fixed file set. Writers must ensure that ->refs is dead,
 229          * readers must ensure that ->refs is alive as long as the file* is
 230          * used. Only updated through io_uring_register(2).
 231          */
 232         struct fixed_file_table *file_table;
 233         unsigned                nr_user_files;
 234
 235         /* if used, fixed mapped user buffers */
 236         unsigned                nr_user_bufs;
 237         struct io_mapped_ubuf   *user_bufs;
 238
 239         struct user_struct      *user;
 240
 241         struct cred             *creds;
 242
 243         /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
 244         struct completion       *completions;
 245
 246         /* if all else fails... */
         /* (handed out by io_get_req() when the slab allocation fails) */
 247         struct io_kiocb         *fallback_req;
 248
 249 #if defined(CONFIG_UNIX)
 250         struct socket           *ring_sock;
 251 #endif
 252
 253         struct {
                 /* kernel-private CQ tail; published by __io_commit_cqring() */
 254                 unsigned                cached_cq_tail;
 255                 unsigned                cq_entries;
 256                 unsigned                cq_mask;
 257                 atomic_t                cq_timeouts;
 258                 struct wait_queue_head  cq_wait;
 259                 struct fasync_struct    *cq_fasync;
 260                 struct eventfd_ctx      *cq_ev_fd;
 261         } ____cacheline_aligned_in_smp;
 262
 263         struct {
 264                 struct mutex            uring_lock;
 265                 wait_queue_head_t       wait;
 266         } ____cacheline_aligned_in_smp;
 267
 268         struct {
 269                 spinlock_t              completion_lock;
 270                 bool                    poll_multi_file;
 271                 /*
 272                  * ->poll_list is protected by the ctx->uring_lock for
 273                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 274                  * For SQPOLL, only the single threaded io_sq_thread() will
 275                  * manipulate the list, hence no extra locking is needed there.
 276                  */
 277                 struct list_head        poll_list;
 278                 struct rb_root          cancel_tree;
 279
                 /* ->inflight_list is modified under ->inflight_lock */
 280                 spinlock_t              inflight_lock;
 281                 struct list_head        inflight_list;
 282         } ____cacheline_aligned_in_smp;
 283 };
 284
 285 /*
 286  * First field must be the file pointer in all the
 287  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 288  */
 /*
  * Poll state; it is the 'poll' member of the anonymous union at the start of
  * struct io_kiocb, which is why ->file has to stay the first member.
  */
 289 struct io_poll_iocb {
 290         struct file                     *file;
 291         struct wait_queue_head          *head;
 292         __poll_t                        events;
 293         bool                            done;
 294         bool                            canceled;
 295         struct wait_queue_entry         *wait;
 296 };
 297
 /*
  * Separately allocated timer state of a timeout request. It is reached via
  * req->timeout.data and kfree()d in __io_free_req() when REQ_F_TIMEOUT is
  * set. NOTE(review): the per-field notes are inferred from names and types.
  */
 298 struct io_timeout_data {
 299         struct io_kiocb                 *req;           /* presumably the owning request */
 300         struct hrtimer                  timer;          /* cancelled via hrtimer_try_to_cancel() */
 301         struct timespec64               ts;
 302         enum hrtimer_mode               mode;
 303         u32                             seq_offset;
 304 };
 305
 /*
  * 'timeout' member of the union at the start of struct io_kiocb: the file
  * pointer comes first, as in every member of that union, followed by the
  * pointer to the out-of-line timer state.
  */
 306 struct io_timeout {
 307         struct file                     *file;
 308         struct io_timeout_data          *data;
 309 };
 310
 311 /*
 312  * NOTE! Each of the iocb union members has the file pointer
 313  * as the first entry in their struct definition. So you can
 314  * access the file pointer through any of the sub-structs,
 315  * or directly as just 'file' in this struct.
 316  */
 317 struct io_kiocb {
 318         union {
 319                 struct file             *file;
 320                 struct kiocb            rw;
 321                 struct io_poll_iocb     poll;
 322                 struct io_timeout       timeout;
 323         };
 324
 325         const struct io_uring_sqe       *sqe;
 326         struct file                     *ring_file;
 327         int                             ring_fd;
 328         bool                            has_user;
 329         bool                            in_async;
 330         bool                            needs_fixed_file;
 331
 332         struct io_ring_ctx      *ctx;
         /* a request is either linked on a list or sits in an rb tree, never both */
 333         union {
 334                 struct list_head        list;
 335                 struct rb_node          rb_node;
 336         };
 337         struct list_head        link_list;
         /* bitmask of the REQ_F_* values defined below */
 338         unsigned int            flags;
         /* set to 2 by io_get_req(): one for submission, one for completion */
 339         refcount_t              refs;
 340 #define REQ_F_NOWAIT            1       /* must not punt to workers */
 341 #define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
 342 #define REQ_F_FIXED_FILE        4       /* ctx owns file */
 343 #define REQ_F_LINK_NEXT         8       /* already grabbed next link */
 344 #define REQ_F_IO_DRAIN          16      /* drain existing IO first */
 345 #define REQ_F_IO_DRAINED        32      /* drain done */
 346 #define REQ_F_LINK              64      /* linked sqes */
 347 #define REQ_F_LINK_TIMEOUT      128     /* has linked timeout */
 348 #define REQ_F_FAIL_LINK         256     /* fail rest of links */
 349 #define REQ_F_DRAIN_LINK        512     /* link should be fully drained */
 350 #define REQ_F_TIMEOUT           1024    /* timeout request */
 351 #define REQ_F_ISREG             2048    /* regular file */
 352 #define REQ_F_MUST_PUNT         4096    /* must be punted even for NONBLOCK */
 353 #define REQ_F_TIMEOUT_NOSEQ     8192    /* no timeout sequence */
 354 #define REQ_F_INFLIGHT          16384   /* on inflight list */
 355 #define REQ_F_COMP_LOCKED       32768   /* completion under lock */
 356 #define REQ_F_FREE_SQE          65536   /* free sqe if not async queued */
 357         u64                     user_data;
         /* completion result, saved while the request waits on ctx->cq_overflow_list */
 358         u32                     result;
         /* compared against the completion count in __req_need_defer() */
 359         u32                     sequence;
 360
 361         struct list_head        inflight_entry;
 362
 363         struct io_wq_work       work;
 364 };
 365
 /*
  * IO_IOPOLL_BATCH sizes the request allocation cache in struct
  * io_submit_state. NOTE(review): IO_PLUG_THRESHOLD is not used in this part
  * of the file; presumably the submission count from which a block plug is
  * started - confirm at its use site.
  */
 366 #define IO_PLUG_THRESHOLD               2
 367 #define IO_IOPOLL_BATCH                 8
 368
 /*
  * Per-submission-batch scratch state. io_get_req() uses the reqs[] cache to
  * allocate requests in bulk; the file fields cache a file reference.
  */
 369 struct io_submit_state {
 370         struct blk_plug         plug;
 371
 372         /*
 373          * io_kiocb alloc cache
 374          */
         /*
          * reqs[cur_req] is the next unused cached request and free_reqs
          * counts how many are left (see io_get_req()).
          */
 375         void                    *reqs[IO_IOPOLL_BATCH];
 376         unsigned                int free_reqs;
 377         unsigned                int cur_req;
 378
 379         /*
 380          * File reference cache
 381          */
 382         struct file             *file;
 383         unsigned int            fd;
 384         unsigned int            has_refs;
 385         unsigned int            used_refs;
         /* upper bound for the bulk allocation size in io_get_req() */
 386         unsigned int            ios_left;
 387 };
 388
 /* Forward declarations of helpers that are used before their definition. */
 389 static void io_wq_submit_work(struct io_wq_work **workptr);
 390 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 391 static void __io_free_req(struct io_kiocb *req);
 392 static void io_put_req(struct io_kiocb *req);
 393 static void io_double_put_req(struct io_kiocb *req);
 394 static void __io_double_put_req(struct io_kiocb *req);
 395 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 396 static void io_queue_linked_timeout(struct io_kiocb *req);
 397
 /* Slab cache that struct io_kiocb requests are allocated from (io_get_req()). */
 398 static struct kmem_cache *req_cachep;
 399
 /* Declared early so io_uring_get_socket() can recognise io_uring files. */
 400 static const struct file_operations io_uring_fops;
 401
 402 struct sock *io_uring_get_socket(struct file *file)
 403 {
 404 #if defined(CONFIG_UNIX)
 405         if (file->f_op == &io_uring_fops) {
 406                 struct io_ring_ctx *ctx = file->private_data;
 407
 408                 return ctx->ring_sock->sk;
 409         }
 410 #endif
 411         return NULL;
 412 }
 413 EXPORT_SYMBOL(io_uring_get_socket);
 414
 415 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 416 {
 417         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 418
 419         complete(&ctx->completions[0]);
 420 }
 421
 422 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 423 {
 424         struct io_ring_ctx *ctx;
 425
 426         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 427         if (!ctx)
 428                 return NULL;
 429
 430         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
 431         if (!ctx->fallback_req)
 432                 goto err;
 433
 434         ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
 435         if (!ctx->completions)
 436                 goto err;
 437
 438         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 439                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 440                 goto err;
 441
 442         ctx->flags = p->flags;
 443         init_waitqueue_head(&ctx->cq_wait);
 444         INIT_LIST_HEAD(&ctx->cq_overflow_list);
 445         init_completion(&ctx->completions[0]);
 446         init_completion(&ctx->completions[1]);
 447         mutex_init(&ctx->uring_lock);
 448         init_waitqueue_head(&ctx->wait);
 449         spin_lock_init(&ctx->completion_lock);
 450         INIT_LIST_HEAD(&ctx->poll_list);
 451         ctx->cancel_tree = RB_ROOT;
 452         INIT_LIST_HEAD(&ctx->defer_list);
 453         INIT_LIST_HEAD(&ctx->timeout_list);
 454         init_waitqueue_head(&ctx->inflight_wait);
 455         spin_lock_init(&ctx->inflight_lock);
 456         INIT_LIST_HEAD(&ctx->inflight_list);
 457         return ctx;
 458 err:
 459         if (ctx->fallback_req)
 460                 kmem_cache_free(req_cachep, ctx->fallback_req);
 461         kfree(ctx->completions);
 462         kfree(ctx);
 463         return NULL;
 464 }
 465
 466 static inline bool __req_need_defer(struct io_kiocb *req)
 467 {
 468         struct io_ring_ctx *ctx = req->ctx;
 469
 470         return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
 471                                         + atomic_read(&ctx->cached_cq_overflow);
 472 }
 473
 474 static inline bool req_need_defer(struct io_kiocb *req)
 475 {
 476         if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
 477                 return __req_need_defer(req);
 478
 479         return false;
 480 }
 481
 482 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 483 {
 484         struct io_kiocb *req;
 485
 486         req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
 487         if (req && !req_need_defer(req)) {
 488                 list_del_init(&req->list);
 489                 return req;
 490         }
 491
 492         return NULL;
 493 }
 494
 495 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 496 {
 497         struct io_kiocb *req;
 498
 499         req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
 500         if (req) {
 501                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
 502                         return NULL;
 503                 if (!__req_need_defer(req)) {
 504                         list_del_init(&req->list);
 505                         return req;
 506                 }
 507         }
 508
 509         return NULL;
 510 }
 511
 512 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 513 {
 514         struct io_rings *rings = ctx->rings;
 515
 516         if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
 517                 /* order cqe stores with ring update */
 518                 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
 519
 520                 if (wq_has_sleeper(&ctx->cq_wait)) {
 521                         wake_up_interruptible(&ctx->cq_wait);
 522                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 523                 }
 524         }
 525 }
 526
 527 static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
 528 {
 529         u8 opcode = READ_ONCE(sqe->opcode);
 530
 531         return !(opcode == IORING_OP_READ_FIXED ||
 532                  opcode == IORING_OP_WRITE_FIXED);
 533 }
 534
 535 static inline bool io_prep_async_work(struct io_kiocb *req,
 536                                       struct io_kiocb **link)
 537 {
 538         bool do_hashed = false;
 539
 540         if (req->sqe) {
 541                 switch (req->sqe->opcode) {
 542                 case IORING_OP_WRITEV:
 543                 case IORING_OP_WRITE_FIXED:
 544                         do_hashed = true;
 545                         /* fall-through */
 546                 case IORING_OP_READV:
 547                 case IORING_OP_READ_FIXED:
 548                 case IORING_OP_SENDMSG:
 549                 case IORING_OP_RECVMSG:
 550                 case IORING_OP_ACCEPT:
 551                 case IORING_OP_POLL_ADD:
 552                 case IORING_OP_CONNECT:
 553                         /*
 554                          * We know REQ_F_ISREG is not set on some of these
 555                          * opcodes, but this enables us to keep the check in
 556                          * just one place.
 557                          */
 558                         if (!(req->flags & REQ_F_ISREG))
 559                                 req->work.flags |= IO_WQ_WORK_UNBOUND;
 560                         break;
 561                 }
 562                 if (io_sqe_needs_user(req->sqe))
 563                         req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 564         }
 565
 566         *link = io_prep_linked_timeout(req);
 567         return do_hashed;
 568 }
 569
 570 static inline void io_queue_async_work(struct io_kiocb *req)
 571 {
 572         struct io_ring_ctx *ctx = req->ctx;
 573         struct io_kiocb *link;
 574         bool do_hashed;
 575
 576         do_hashed = io_prep_async_work(req, &link);
 577
 578         trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
 579                                         req->flags);
 580         if (!do_hashed) {
 581                 io_wq_enqueue(ctx->io_wq, &req->work);
 582         } else {
 583                 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
 584                                         file_inode(req->file));
 585         }
 586
 587         if (link)
 588                 io_queue_linked_timeout(link);
 589 }
 590
 591 static void io_kill_timeout(struct io_kiocb *req)
 592 {
 593         int ret;
 594
 595         ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
 596         if (ret != -1) {
 597                 atomic_inc(&req->ctx->cq_timeouts);
 598                 list_del_init(&req->list);
 599                 io_cqring_fill_event(req, 0);
 600                 io_put_req(req);
 601         }
 602 }
 603
 604 static void io_kill_timeouts(struct io_ring_ctx *ctx)
 605 {
 606         struct io_kiocb *req, *tmp;
 607
 608         spin_lock_irq(&ctx->completion_lock);
 609         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
 610                 io_kill_timeout(req);
 611         spin_unlock_irq(&ctx->completion_lock);
 612 }
 613
 614 static void io_commit_cqring(struct io_ring_ctx *ctx)
 615 {
 616         struct io_kiocb *req;
 617
 618         while ((req = io_get_timeout_req(ctx)) != NULL)
 619                 io_kill_timeout(req);
 620
 621         __io_commit_cqring(ctx);
 622
 623         while ((req = io_get_deferred_req(ctx)) != NULL) {
 624                 req->flags |= REQ_F_IO_DRAINED;
 625                 io_queue_async_work(req);
 626         }
 627 }
 628
 629 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 630 {
 631         struct io_rings *rings = ctx->rings;
 632         unsigned tail;
 633
 634         tail = ctx->cached_cq_tail;
 635         /*
 636          * writes to the cq entry need to come after reading head; the
 637          * control dependency is enough as we're using WRITE_ONCE to
 638          * fill the cq entry
 639          */
 640         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
 641                 return NULL;
 642
 643         ctx->cached_cq_tail++;
 644         return &rings->cqes[tail & ctx->cq_mask];
 645 }
 646
 647 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 648 {
 649         if (waitqueue_active(&ctx->wait))
 650                 wake_up(&ctx->wait);
 651         if (waitqueue_active(&ctx->sqo_wait))
 652                 wake_up(&ctx->sqo_wait);
 653         if (ctx->cq_ev_fd)
 654                 eventfd_signal(ctx->cq_ev_fd, 1);
 655 }
 656
 657 /* Returns true if there are no backlogged entries after the flush */
 658 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 659 {
 660         struct io_rings *rings = ctx->rings;
 661         struct io_uring_cqe *cqe;
 662         struct io_kiocb *req;
 663         unsigned long flags;
 664         LIST_HEAD(list);
 665
 666         if (!force) {
 667                 if (list_empty_careful(&ctx->cq_overflow_list))
 668                         return true;
 669                 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
 670                     rings->cq_ring_entries))
 671                         return false;
 672         }
 673
 674         spin_lock_irqsave(&ctx->completion_lock, flags);
 675
 676         /* if force is set, the ring is going away. always drop after that */
 677         if (force)
 678                 ctx->cq_overflow_flushed = true;
 679
 680         cqe = NULL;
 681         while (!list_empty(&ctx->cq_overflow_list)) {
 682                 cqe = io_get_cqring(ctx);
 683                 if (!cqe && !force)
 684                         break;
 685
 686                 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
 687                                                 list);
 688                 list_move(&req->list, &list);
 689                 if (cqe) {
 690                         WRITE_ONCE(cqe->user_data, req->user_data);
 691                         WRITE_ONCE(cqe->res, req->result);
 692                         WRITE_ONCE(cqe->flags, 0);
 693                 } else {
 694                         WRITE_ONCE(ctx->rings->cq_overflow,
 695                                 atomic_inc_return(&ctx->cached_cq_overflow));
 696                 }
 697         }
 698
 699         io_commit_cqring(ctx);
 700         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 701         io_cqring_ev_posted(ctx);
 702
 703         while (!list_empty(&list)) {
 704                 req = list_first_entry(&list, struct io_kiocb, list);
 705                 list_del(&req->list);
 706                 io_put_req(req);
 707         }
 708
 709         return cqe != NULL;
 710 }
 711
 712 static void io_cqring_fill_event(struct io_kiocb *req, long res)
 713 {
 714         struct io_ring_ctx *ctx = req->ctx;
 715         struct io_uring_cqe *cqe;
 716
 717         trace_io_uring_complete(ctx, req->user_data, res);
 718
 719         /*
 720          * If we can't get a cq entry, userspace overflowed the
 721          * submission (by quite a lot). Increment the overflow count in
 722          * the ring.
 723          */
 724         cqe = io_get_cqring(ctx);
 725         if (likely(cqe)) {
 726                 WRITE_ONCE(cqe->user_data, req->user_data);
 727                 WRITE_ONCE(cqe->res, res);
 728                 WRITE_ONCE(cqe->flags, 0);
 729         } else if (ctx->cq_overflow_flushed) {
 730                 WRITE_ONCE(ctx->rings->cq_overflow,
 731                                 atomic_inc_return(&ctx->cached_cq_overflow));
 732         } else {
 733                 refcount_inc(&req->refs);
 734                 req->result = res;
 735                 list_add_tail(&req->list, &ctx->cq_overflow_list);
 736         }
 737 }
 738
 739 static void io_cqring_add_event(struct io_kiocb *req, long res)
 740 {
 741         struct io_ring_ctx *ctx = req->ctx;
 742         unsigned long flags;
 743
 744         spin_lock_irqsave(&ctx->completion_lock, flags);
 745         io_cqring_fill_event(req, res);
 746         io_commit_cqring(ctx);
 747         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 748
 749         io_cqring_ev_posted(ctx);
 750 }
 751
 752 static inline bool io_is_fallback_req(struct io_kiocb *req)
 753 {
 754         return req == (struct io_kiocb *)
 755                         ((unsigned long) req->ctx->fallback_req & ~1UL);
 756 }
 757
 758 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
 759 {
 760         struct io_kiocb *req;
 761
 762         req = ctx->fallback_req;
 763         if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
 764                 return req;
 765
 766         return NULL;
 767 }
 768
 769 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 770                                    struct io_submit_state *state)
 771 {
 772         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 773         struct io_kiocb *req;
 774
 775         if (!percpu_ref_tryget(&ctx->refs))
 776                 return NULL;
 777
 778         if (!state) {
 779                 req = kmem_cache_alloc(req_cachep, gfp);
 780                 if (unlikely(!req))
 781                         goto fallback;
 782         } else if (!state->free_reqs) {
 783                 size_t sz;
 784                 int ret;
 785
 786                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
 787                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
 788
 789                 /*
 790                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
 791                  * retry single alloc to be on the safe side.
 792                  */
 793                 if (unlikely(ret <= 0)) {
 794                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 795                         if (!state->reqs[0])
 796                                 goto fallback;
 797                         ret = 1;
 798                 }
 799                 state->free_reqs = ret - 1;
 800                 state->cur_req = 1;
 801                 req = state->reqs[0];
 802         } else {
 803                 req = state->reqs[state->cur_req];
 804                 state->free_reqs--;
 805                 state->cur_req++;
 806         }
 807
 808 got_it:
 809         req->ring_file = NULL;
 810         req->file = NULL;
 811         req->ctx = ctx;
 812         req->flags = 0;
 813         /* one is dropped after submission, the other at completion */
 814         refcount_set(&req->refs, 2);
 815         req->result = 0;
 816         INIT_IO_WORK(&req->work, io_wq_submit_work);
 817         return req;
 818 fallback:
 819         req = io_get_fallback_req(ctx);
 820         if (req)
 821                 goto got_it;
 822         percpu_ref_put(&ctx->refs);
 823         return NULL;
 824 }
 825
 826 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 827 {
 828         if (*nr) {
 829                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
 830                 percpu_ref_put_many(&ctx->refs, *nr);
 831                 *nr = 0;
 832         }
 833 }
 834
 835 static void __io_free_req(struct io_kiocb *req)
 836 {
 837         struct io_ring_ctx *ctx = req->ctx;
 838
 839         if (req->flags & REQ_F_FREE_SQE)
 840                 kfree(req->sqe);
 841         if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 842                 fput(req->file);
 843         if (req->flags & REQ_F_INFLIGHT) {
 844                 unsigned long flags;
 845
 846                 spin_lock_irqsave(&ctx->inflight_lock, flags);
 847                 list_del(&req->inflight_entry);
 848                 if (waitqueue_active(&ctx->inflight_wait))
 849                         wake_up(&ctx->inflight_wait);
 850                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 851         }
 852         if (req->flags & REQ_F_TIMEOUT)
 853                 kfree(req->timeout.data);
 854         percpu_ref_put(&ctx->refs);
 855         if (likely(!io_is_fallback_req(req)))
 856                 kmem_cache_free(req_cachep, req);
 857         else
 858                 clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
 859 }
 860
 861 static bool io_link_cancel_timeout(struct io_kiocb *req)
 862 {
 863         struct io_ring_ctx *ctx = req->ctx;
 864         int ret;
 865
 866         ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
 867         if (ret != -1) {
 868                 io_cqring_fill_event(req, -ECANCELED);
 869                 io_commit_cqring(ctx);
 870                 req->flags &= ~REQ_F_LINK;
 871                 io_put_req(req);
 872                 return true;
 873         }
 874
 875         return false;
 876 }
 877
/*
 * Detach the next runnable request from req's link chain and hand it back
 * through @nxtptr.
 *
 * Runs at most once per request: REQ_F_LINK_NEXT is set on the way out and
 * checked on entry. A timeout entry found while req has REQ_F_LINK_TIMEOUT
 * set is removed from the chain and a cancel is attempted; it is never
 * returned. Any requests remaining on req's chain are moved onto the
 * returned request. *nxtptr is left untouched when nothing runnable is
 * found.
 */
static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *nxt;
        bool wake_ev = false;

        /* Already got next link */
        if (req->flags & REQ_F_LINK_NEXT)
                return;

        /*
         * The list should never be empty when we are called here. But could
         * potentially happen if the chain is messed up, check to be on the
         * safe side.
         */
        nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
        while (nxt) {
                list_del_init(&nxt->list);

                if ((req->flags & REQ_F_LINK_TIMEOUT) &&
                    (nxt->flags & REQ_F_TIMEOUT)) {
                        /*
                         * Linked timeout: try to cancel it, remember whether
                         * a CQE was posted, then look at the entry after it.
                         */
                        wake_ev |= io_link_cancel_timeout(nxt);
                        nxt = list_first_entry_or_null(&req->link_list,
                                                        struct io_kiocb, list);
                        req->flags &= ~REQ_F_LINK_TIMEOUT;
                        continue;
                }
                /* Hand the remainder of the chain over to the new head */
                if (!list_empty(&req->link_list)) {
                        INIT_LIST_HEAD(&nxt->link_list);
                        list_splice(&req->link_list, &nxt->link_list);
                        nxt->flags |= REQ_F_LINK;
                }

                *nxtptr = nxt;
                break;
        }

        req->flags |= REQ_F_LINK_NEXT;
        /* Signal CQ waiters only if cancelling a timeout posted an event */
        if (wake_ev)
                io_cqring_ev_posted(ctx);
}
 919
/*
 * Called if REQ_F_LINK is set, and we fail the head request.
 *
 * Walks req's link chain under ctx->completion_lock and terminates every
 * entry: a linked timeout (while req has REQ_F_LINK_TIMEOUT set) goes
 * through io_link_cancel_timeout(), everything else gets an -ECANCELED
 * completion and has both of its references dropped. The CQ ring is
 * committed once at the end and waiters are notified after the lock is
 * released.
 */
static void io_fail_links(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link;
        unsigned long flags;

        spin_lock_irqsave(&ctx->completion_lock, flags);

        while (!list_empty(&req->link_list)) {
                link = list_first_entry(&req->link_list, struct io_kiocb, list);
                list_del_init(&link->list);

                trace_io_uring_fail_link(req, link);

                if ((req->flags & REQ_F_LINK_TIMEOUT) &&
                    link->sqe->opcode == IORING_OP_LINK_TIMEOUT) {
                        io_link_cancel_timeout(link);
                } else {
                        io_cqring_fill_event(link, -ECANCELED);
                        /* links are not followed here, plain double put */
                        __io_double_put_req(link);
                }
                /* cleared after the first entry, whatever that entry was */
                req->flags &= ~REQ_F_LINK_TIMEOUT;
        }

        io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
        io_cqring_ev_posted(ctx);
}
 951
 952 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 953 {
 954         if (likely(!(req->flags & REQ_F_LINK)))
 955                 return;
 956
 957         /*
 958          * If LINK is set, we have dependent requests in this chain. If we
 959          * didn't fail this request, queue the first one up, moving any other
 960          * dependencies to the next request. In case of failure, fail the rest
 961          * of the chain.
 962          */
 963         if (req->flags & REQ_F_FAIL_LINK) {
 964                 io_fail_links(req);
 965         } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
 966                         REQ_F_LINK_TIMEOUT) {
 967                 struct io_ring_ctx *ctx = req->ctx;
 968                 unsigned long flags;
 969
 970                 /*
 971                  * If this is a timeout link, we could be racing with the
 972                  * timeout timer. Grab the completion lock for this case to
 973                  * protect against that.
 974                  */
 975                 spin_lock_irqsave(&ctx->completion_lock, flags);
 976                 io_req_link_next(req, nxt);
 977                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
 978         } else {
 979                 io_req_link_next(req, nxt);
 980         }
 981 }
 982
 983 static void io_free_req(struct io_kiocb *req)
 984 {
 985         struct io_kiocb *nxt = NULL;
 986
 987         io_req_find_next(req, &nxt);
 988         __io_free_req(req);
 989
 990         if (nxt)
 991                 io_queue_async_work(nxt);
 992 }
 993
 994 /*
 995  * Drop reference to request, return next in chain (if there is one) if this
 996  * was the last reference to this request.
 997  */
 998 __attribute__((nonnull))
 999 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1000 {
1001         io_req_find_next(req, nxtptr);
1002
1003         if (refcount_dec_and_test(&req->refs))
1004                 __io_free_req(req);
1005 }
1006
1007 static void io_put_req(struct io_kiocb *req)
1008 {
1009         if (refcount_dec_and_test(&req->refs))
1010                 io_free_req(req);
1011 }
1012
1013 /*
1014  * Must only be used if we don't need to care about links, usually from
1015  * within the completion handling itself.
1016  */
1017 static void __io_double_put_req(struct io_kiocb *req)
1018 {
1019         /* drop both submit and complete references */
1020         if (refcount_sub_and_test(2, &req->refs))
1021                 __io_free_req(req);
1022 }
1023
1024 static void io_double_put_req(struct io_kiocb *req)
1025 {
1026         /* drop both submit and complete references */
1027         if (refcount_sub_and_test(2, &req->refs))
1028                 io_free_req(req);
1029 }
1030
1031 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
1032 {
1033         struct io_rings *rings = ctx->rings;
1034
1035         /*
1036          * noflush == true is from the waitqueue handler, just ensure we wake
1037          * up the task, and the next invocation will flush the entries. We
1038          * cannot safely to it from here.
1039          */
1040         if (noflush && !list_empty(&ctx->cq_overflow_list))
1041                 return -1U;
1042
1043         io_cqring_overflow_flush(ctx, false);
1044
1045         /* See comment at the top of this file */
1046         smp_rmb();
1047         return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
1048 }
1049
1050 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1051 {
1052         struct io_rings *rings = ctx->rings;
1053
1054         /* make sure SQ entry isn't read before tail */
1055         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1056 }
1057
1058 /*
1059  * Find and free completed poll iocbs
1060  */
1061 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1062                                struct list_head *done)
1063 {
1064         void *reqs[IO_IOPOLL_BATCH];
1065         struct io_kiocb *req;
1066         int to_free;
1067
1068         to_free = 0;
1069         while (!list_empty(done)) {
1070                 req = list_first_entry(done, struct io_kiocb, list);
1071                 list_del(&req->list);
1072
1073                 io_cqring_fill_event(req, req->result);
1074                 (*nr_events)++;
1075
1076                 if (refcount_dec_and_test(&req->refs)) {
1077                         /* If we're not using fixed files, we have to pair the
1078                          * completion part with the file put. Use regular
1079                          * completions for those, only batch free for fixed
1080                          * file and non-linked commands.
1081                          */
1082                         if (((req->flags &
1083                                 (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) ==
1084                             REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
1085                                 reqs[to_free++] = req;
1086                                 if (to_free == ARRAY_SIZE(reqs))
1087                                         io_free_req_many(ctx, reqs, &to_free);
1088                         } else {
1089                                 io_free_req(req);
1090                         }
1091                 }
1092         }
1093
1094         io_commit_cqring(ctx);
1095         io_free_req_many(ctx, reqs, &to_free);
1096 }
1097
/*
 * Make one pass over ctx->poll_list: collect requests already flagged
 * REQ_F_IOPOLL_COMPLETED onto a local list and poll the file of the first
 * request that is still pending.
 *
 * @nr_events is incremented (via io_iopoll_complete()) for every request
 * that gets completed. Returns 0, or the negative value returned by the
 * file's ->iopoll() handler.
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                        long min)
{
        struct io_kiocb *req, *tmp;
        LIST_HEAD(done);
        bool spin;
        int ret;

        /*
         * Only spin for completions if we don't have multiple devices hanging
         * off our complete list, and we're under the requested amount.
         */
        spin = !ctx->poll_multi_file && *nr_events < min;

        ret = 0;
        list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
                struct kiocb *kiocb = &req->rw;

                /*
                 * Move completed entries to our local list. If we find a
                 * request that requires polling, break out and complete
                 * the done list first, if we have entries there.
                 */
                if (req->flags & REQ_F_IOPOLL_COMPLETED) {
                        list_move_tail(&req->list, &done);
                        continue;
                }
                if (!list_empty(&done))
                        break;

                ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
                if (ret < 0)
                        break;

                /* a non-zero result stops us from spinning on later entries */
                if (ret && spin)
                        spin = false;
                ret = 0;
        }

        if (!list_empty(&done))
                io_iopoll_complete(ctx, nr_events, &done);

        return ret;
}
1142
1143 /*
1144  * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
1145  * non-spinning poll check - we'll still enter the driver poll loop, but only
1146  * as a non-spinning completion check.
1147  */
1148 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1149                                 long min)
1150 {
1151         while (!list_empty(&ctx->poll_list) && !need_resched()) {
1152                 int ret;
1153
1154                 ret = io_do_iopoll(ctx, nr_events, min);
1155                 if (ret < 0)
1156                         return ret;
1157                 if (!min || *nr_events >= min)
1158                         return 0;
1159         }
1160
1161         return 1;
1162 }
1163
1164 /*
1165  * We can't just wait for polled events to come to us, we have to actively
1166  * find and complete them.
1167  */
1168 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1169 {
1170         if (!(ctx->flags & IORING_SETUP_IOPOLL))
1171                 return;
1172
1173         mutex_lock(&ctx->uring_lock);
1174         while (!list_empty(&ctx->poll_list)) {
1175                 unsigned int nr_events = 0;
1176
1177                 io_iopoll_getevents(ctx, &nr_events, 1);
1178
1179                 /*
1180                  * Ensure we allow local-to-the-cpu processing to take place,
1181                  * in this case we need to ensure that we reap all events.
1182                  */
1183                 cond_resched();
1184         }
1185         mutex_unlock(&ctx->uring_lock);
1186 }
1187
/*
 * Poll loop behind io_iopoll_check(). The function unlocks and re-locks
 * ctx->uring_lock itself, so the caller must hold that lock on entry.
 *
 * Loops until events are already pending in the CQ ring, the poll pass
 * returns <= 0, events have been reaped, 'min' is 0, or a reschedule is due.
 * Returns 0 or a negative error from io_iopoll_getevents().
 */
static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
                            long min)
{
        int iters = 0, ret = 0;

        do {
                int tmin = 0;

                /*
                 * Don't enter poll loop if we already have events pending.
                 * If we do, we can potentially be spinning for commands that
                 * already triggered a CQE (eg in error).
                 */
                if (io_cqring_events(ctx, false))
                        break;

                /*
                 * If a submit got punted to a workqueue, we can have the
                 * application entering polling for a command before it gets
                 * issued. That app will hold the uring_lock for the duration
                 * of the poll right here, so we need to take a breather every
                 * now and then to ensure that the issue has a chance to add
                 * the poll to the issued list. Otherwise we can spin here
                 * forever, while the workqueue is stuck trying to acquire the
                 * very same mutex.
                 */
                if (!(++iters & 7)) {
                        mutex_unlock(&ctx->uring_lock);
                        mutex_lock(&ctx->uring_lock);
                }

                /* only ask for the events that are still missing */
                if (*nr_events < min)
                        tmin = min - *nr_events;

                ret = io_iopoll_getevents(ctx, nr_events, tmin);
                if (ret <= 0)
                        break;
                /* a positive return is not an error, don't leak it to the caller */
                ret = 0;
        } while (min && !*nr_events && !need_resched());

        return ret;
}
1230
1231 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1232                            long min)
1233 {
1234         int ret;
1235
1236         /*
1237          * We disallow the app entering submit/complete with polling, but we
1238          * still need to lock the ring to prevent racing with polled issue
1239          * that got punted to a workqueue.
1240          */
1241         mutex_lock(&ctx->uring_lock);
1242         ret = __io_iopoll_check(ctx, nr_events, min);
1243         mutex_unlock(&ctx->uring_lock);
1244         return ret;
1245 }
1246
1247 static void kiocb_end_write(struct io_kiocb *req)
1248 {
1249         /*
1250          * Tell lockdep we inherited freeze protection from submission
1251          * thread.
1252          */
1253         if (req->flags & REQ_F_ISREG) {
1254                 struct inode *inode = file_inode(req->file);
1255
1256                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
1257         }
1258         file_end_write(req->file);
1259 }
1260
1261 static void io_complete_rw_common(struct kiocb *kiocb, long res)
1262 {
1263         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
1264
1265         if (kiocb->ki_flags & IOCB_WRITE)
1266                 kiocb_end_write(req);
1267
1268         if ((req->flags & REQ_F_LINK) && res != req->result)
1269                 req->flags |= REQ_F_FAIL_LINK;
1270         io_cqring_add_event(req, res);
1271 }
1272
1273 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1274 {
1275         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
1276
1277         io_complete_rw_common(kiocb, res);
1278         io_put_req(req);
1279 }
1280
1281 static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
1282 {
1283         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
1284         struct io_kiocb *nxt = NULL;
1285
1286         io_complete_rw_common(kiocb, res);
1287         io_put_req_find_next(req, &nxt);
1288
1289         return nxt;
1290 }
1291
1292 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1293 {
1294         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
1295
1296         if (kiocb->ki_flags & IOCB_WRITE)
1297                 kiocb_end_write(req);
1298
1299         if ((req->flags & REQ_F_LINK) && res != req->result)
1300                 req->flags |= REQ_F_FAIL_LINK;
1301         req->result = res;
1302         if (res != -EAGAIN)
1303                 req->flags |= REQ_F_IOPOLL_COMPLETED;
1304 }
1305
1306 /*
1307  * After the iocb has been issued, it's safe to be found on the poll list.
1308  * Adding the kiocb to the list AFTER submission ensures that we don't
1309  * find it from a io_iopoll_getevents() thread before the issuer is done
1310  * accessing the kiocb cookie.
1311  */
1312 static void io_iopoll_req_issued(struct io_kiocb *req)
1313 {
1314         struct io_ring_ctx *ctx = req->ctx;
1315
1316         /*
1317          * Track whether we have multiple files in our lists. This will impact
1318          * how we do polling eventually, not spinning if we're on potentially
1319          * different devices.
1320          */
1321         if (list_empty(&ctx->poll_list)) {
1322                 ctx->poll_multi_file = false;
1323         } else if (!ctx->poll_multi_file) {
1324                 struct io_kiocb *list_req;
1325
1326                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1327                                                 list);
1328                 if (list_req->rw.ki_filp != req->rw.ki_filp)
1329                         ctx->poll_multi_file = true;
1330         }
1331
1332         /*
1333          * For fast devices, IO may have already completed. If it has, add
1334          * it to the front so we find it first.
1335          */
1336         if (req->flags & REQ_F_IOPOLL_COMPLETED)
1337                 list_add(&req->list, &ctx->poll_list);
1338         else
1339                 list_add_tail(&req->list, &ctx->poll_list);
1340 }
1341
1342 static void io_file_put(struct io_submit_state *state)
1343 {
1344         if (state->file) {
1345                 int diff = state->has_refs - state->used_refs;
1346
1347                 if (diff)
1348                         fput_many(state->file, diff);
1349                 state->file = NULL;
1350         }
1351 }
1352
1353 /*
1354  * Get as many references to a file as we have IOs left in this submission,
1355  * assuming most submissions are for one file, or at least that each file
1356  * has more than one submission.
1357  */
1358 static struct file *io_file_get(struct io_submit_state *state, int fd)
1359 {
1360         if (!state)
1361                 return fget(fd);
1362
1363         if (state->file) {
1364                 if (state->fd == fd) {
1365                         state->used_refs++;
1366                         state->ios_left--;
1367                         return state->file;
1368                 }
1369                 io_file_put(state);
1370         }
1371         state->file = fget_many(fd, state->ios_left);
1372         if (!state->file)
1373                 return NULL;
1374
1375         state->fd = fd;
1376         state->has_refs = state->ios_left;
1377         state->used_refs = 1;
1378         state->ios_left--;
1379         return state->file;
1380 }
1381
1382 /*
1383  * If we tracked the file through the SCM inflight mechanism, we could support
1384  * any file. For now, just ensure that anything potentially problematic is done
1385  * inline.
1386  */
1387 static bool io_file_supports_async(struct file *file)
1388 {
1389         umode_t mode = file_inode(file)->i_mode;
1390
1391         if (S_ISBLK(mode) || S_ISCHR(mode))
1392                 return true;
1393         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1394                 return true;
1395
1396         return false;
1397 }
1398
/*
 * Set up req->rw (the kiocb) for a read or write from the request's SQE:
 * file position, flags, write hint, IO priority and completion handler.
 *
 * @force_nonblock: the caller wants a non-blocking attempt; IOCB_NOWAIT is
 * then forced on, and files that don't support async are rejected with
 * -EAGAIN after being marked REQ_F_MUST_PUNT.
 *
 * Returns 0 on success, -EBADF without a file, -EAGAIN as described above,
 * -EOPNOTSUPP for an IOPOLL ring when the IO is not O_DIRECT or the file
 * has no ->iopoll, -EINVAL for IOCB_HIPRI on a non-IOPOLL ring, or an error
 * from ioprio_check_cap() / kiocb_set_rw_flags().
 */
static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
{
        const struct io_uring_sqe *sqe = req->sqe;
        struct io_ring_ctx *ctx = req->ctx;
        struct kiocb *kiocb = &req->rw;
        unsigned ioprio;
        int ret;

        if (!req->file)
                return -EBADF;

        if (S_ISREG(file_inode(req->file)->i_mode))
                req->flags |= REQ_F_ISREG;

        /*
         * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
         * we know to async punt it even if it was opened O_NONBLOCK
         */
        if (force_nonblock && !io_file_supports_async(req->file)) {
                req->flags |= REQ_F_MUST_PUNT;
                return -EAGAIN;
        }

        kiocb->ki_pos = READ_ONCE(sqe->off);
        kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
        kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));

        /* an explicit SQE priority must pass the capability check */
        ioprio = READ_ONCE(sqe->ioprio);
        if (ioprio) {
                ret = ioprio_check_cap(ioprio);
                if (ret)
                        return ret;

                kiocb->ki_ioprio = ioprio;
        } else
                kiocb->ki_ioprio = get_current_ioprio();

        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
        if (unlikely(ret))
                return ret;

        /* don't allow async punt if RWF_NOWAIT was requested */
        if ((kiocb->ki_flags & IOCB_NOWAIT) ||
            (req->file->f_flags & O_NONBLOCK))
                req->flags |= REQ_F_NOWAIT;

        /* set after the REQ_F_NOWAIT check so it only reflects the user's ask */
        if (force_nonblock)
                kiocb->ki_flags |= IOCB_NOWAIT;

        if (ctx->flags & IORING_SETUP_IOPOLL) {
                if (!(kiocb->ki_flags & IOCB_DIRECT) ||
                    !kiocb->ki_filp->f_op->iopoll)
                        return -EOPNOTSUPP;

                kiocb->ki_flags |= IOCB_HIPRI;
                kiocb->ki_complete = io_complete_rw_iopoll;
                req->result = 0;
        } else {
                if (kiocb->ki_flags & IOCB_HIPRI)
                        return -EINVAL;
                kiocb->ki_complete = io_complete_rw;
        }
        return 0;
}
1463
1464 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1465 {
1466         switch (ret) {
1467         case -EIOCBQUEUED:
1468                 break;
1469         case -ERESTARTSYS:
1470         case -ERESTARTNOINTR:
1471         case -ERESTARTNOHAND:
1472         case -ERESTART_RESTARTBLOCK:
1473                 /*
1474                  * We can't just restart the syscall, since previously
1475                  * submitted sqes may already be in progress. Just fail this
1476                  * IO with EINTR.
1477                  */
1478                 ret = -EINTR;
1479                 /* fall through */
1480         default:
1481                 kiocb->ki_complete(kiocb, ret, 0);
1482         }
1483 }
1484
1485 static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
1486                        bool in_async)
1487 {
1488         if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
1489                 *nxt = __io_complete_rw(kiocb, ret);
1490         else
1491                 io_rw_done(kiocb, ret);
1492 }
1493
/*
 * Set up @iter as a bvec iterator over the registered (fixed) buffer that
 * the SQE refers to, positioned at the SQE's address and limited to its
 * length.
 *
 * Returns the SQE length on success, or -EFAULT when no buffers are
 * registered, the buffer index is out of range, addr + len overflows, or
 * the range is not fully inside the selected buffer.
 */
static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
                               const struct io_uring_sqe *sqe,
                               struct iov_iter *iter)
{
        size_t len = READ_ONCE(sqe->len);
        struct io_mapped_ubuf *imu;
        unsigned index, buf_index;
        size_t offset;
        u64 buf_addr;

        /* attempt to use fixed buffers without having provided iovecs */
        if (unlikely(!ctx->user_bufs))
                return -EFAULT;

        buf_index = READ_ONCE(sqe->buf_index);
        if (unlikely(buf_index >= ctx->nr_user_bufs))
                return -EFAULT;

        /* clamp the index under speculation before using it */
        index = array_index_nospec(buf_index, ctx->nr_user_bufs);
        imu = &ctx->user_bufs[index];
        buf_addr = READ_ONCE(sqe->addr);

        /* overflow */
        if (buf_addr + len < buf_addr)
                return -EFAULT;
        /* not inside the mapped region */
        if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
                return -EFAULT;

        /*
         * May not be a start of buffer, set size appropriately
         * and advance us to the beginning.
         */
        offset = buf_addr - imu->ubuf;
        iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

        if (offset) {
                /*
                 * Don't use iov_iter_advance() here, as it's really slow for
                 * using the latter parts of a big fixed buffer - it iterates
                 * over each segment manually. We can cheat a bit here, because
                 * we know that:
                 *
                 * 1) it's a BVEC iter, we set it up
                 * 2) all bvecs are PAGE_SIZE in size, except potentially the
                 *    first and last bvec
                 *
                 * So just find our index, and adjust the iterator afterwards.
                 * If the offset is within the first bvec (or the whole first
                 * bvec), just use iov_iter_advance(). This makes it easier
                 * since we can just skip the first segment, which may not
                 * be PAGE_SIZE aligned.
                 */
                const struct bio_vec *bvec = imu->bvec;

                if (offset <= bvec->bv_len) {
                        iov_iter_advance(iter, offset);
                } else {
                        unsigned long seg_skip;

                        /* skip first vec */
                        offset -= bvec->bv_len;
                        seg_skip = 1 + (offset >> PAGE_SHIFT);

                        /*
                         * Jump straight to the target segment: drop the
                         * skipped segments and bytes from the iterator and
                         * keep only the offset within the target page.
                         */
                        iter->bvec = bvec + seg_skip;
                        iter->nr_segs -= seg_skip;
                        iter->count -= bvec->bv_len + offset;
                        iter->iov_offset = offset & ~PAGE_MASK;
                }
        }

        return len;
}
1567
1568 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
1569                                struct iovec **iovec, struct iov_iter *iter)
1570 {
1571         const struct io_uring_sqe *sqe = req->sqe;
1572         void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
1573         size_t sqe_len = READ_ONCE(sqe->len);
1574         u8 opcode;
1575
1576         /*
1577          * We're reading ->opcode for the second time, but the first read
1578          * doesn't care whether it's _FIXED or not, so it doesn't matter
1579          * whether ->opcode changes concurrently. The first read does care
1580          * about whether it is a READ or a WRITE, so we don't trust this read
1581          * for that purpose and instead let the caller pass in the read/write
1582          * flag.
1583          */
1584         opcode = READ_ONCE(sqe->opcode);
1585         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
1586                 *iovec = NULL;
1587                 return io_import_fixed(req->ctx, rw, sqe, iter);
1588         }
1589
1590         if (!req->has_user)
1591                 return -EFAULT;
1592
1593 #ifdef CONFIG_COMPAT
1594         if (req->ctx->compat)
1595                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
1596                                                 iovec, iter);
1597 #endif
1598
1599         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
1600 }
1601
/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 *
 * Each pass transfers one segment of @iter at the kiocb's file position.
 * Returns the total number of bytes transferred; a negative error is only
 * returned if nothing was transferred before it occurred. -EOPNOTSUPP is
 * returned for IOCB_HIPRI and -EAGAIN for IOCB_NOWAIT requests.
 */
static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
                           struct iov_iter *iter)
{
        ssize_t ret = 0;

        /*
         * Don't support polled IO through this interface, and we can't
         * support non-blocking either. For the latter, this just causes
         * the kiocb to be handled from an async context.
         */
        if (kiocb->ki_flags & IOCB_HIPRI)
                return -EOPNOTSUPP;
        if (kiocb->ki_flags & IOCB_NOWAIT)
                return -EAGAIN;

        while (iov_iter_count(iter)) {
                struct iovec iovec;
                ssize_t nr;

                if (!iov_iter_is_bvec(iter)) {
                        iovec = iov_iter_iovec(iter);
                } else {
                        /* fixed buffers import bvec */
                        iovec.iov_base = kmap(iter->bvec->bv_page)
                                                + iter->iov_offset;
                        /* never go beyond the current bvec in one call */
                        iovec.iov_len = min(iter->count,
                                        iter->bvec->bv_len - iter->iov_offset);
                }

                if (rw == READ) {
                        nr = file->f_op->read(file, iovec.iov_base,
                                              iovec.iov_len, &kiocb->ki_pos);
                } else {
                        nr = file->f_op->write(file, iovec.iov_base,
                                               iovec.iov_len, &kiocb->ki_pos);
                }

                /* pairs with the kmap() above, before the iterator advances */
                if (iov_iter_is_bvec(iter))
                        kunmap(iter->bvec->bv_page);

                if (nr < 0) {
                        /* report the error only if no data was moved so far */
                        if (!ret)
                                ret = nr;
                        break;
                }
                ret += nr;
                /* short transfer: stop and return what we have */
                if (nr != iovec.iov_len)
                        break;
                iov_iter_advance(iter, nr);
        }

        return ret;
}
1659
1660 static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
1661                    bool force_nonblock)
1662 {
1663         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1664         struct kiocb *kiocb = &req->rw;
1665         struct iov_iter iter;
1666         struct file *file;
1667         size_t iov_count;
1668         ssize_t read_size, ret;
1669
1670         ret = io_prep_rw(req, force_nonblock);
1671         if (ret)
1672                 return ret;
1673         file = kiocb->ki_filp;
1674
1675         if (unlikely(!(file->f_mode & FMODE_READ)))
1676                 return -EBADF;
1677
1678         ret = io_import_iovec(READ, req, &iovec, &iter);
1679         if (ret < 0)
1680                 return ret;
1681
1682         read_size = ret;
1683         if (req->flags & REQ_F_LINK)
1684                 req->result = read_size;
1685
1686         iov_count = iov_iter_count(&iter);
1687         ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
1688         if (!ret) {
1689                 ssize_t ret2;
1690
1691                 if (file->f_op->read_iter)
1692                         ret2 = call_read_iter(file, kiocb, &iter);
1693                 else
1694                         ret2 = loop_rw_iter(READ, file, kiocb, &iter);
1695
1696                 /*
1697                  * In case of a short read, punt to async. This can happen
1698                  * if we have data partially cached. Alternatively we can
1699                  * return the short read, in which case the application will
1700                  * need to issue another SQE and wait for it. That SQE will
1701                  * need async punt anyway, so it's more efficient to do it
1702                  * here.
1703                  */
1704                 if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
1705                     (req->flags & REQ_F_ISREG) &&
1706                     ret2 > 0 && ret2 < read_size)
1707                         ret2 = -EAGAIN;
1708                 /* Catch -EAGAIN return for forced non-blocking submission */
1709                 if (!force_nonblock || ret2 != -EAGAIN)
1710                         kiocb_done(kiocb, ret2, nxt, req->in_async);
1711                 else
1712                         ret = -EAGAIN;
1713         }
1714         kfree(iovec);
1715         return ret;
1716 }
1717
1718 static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
1719                     bool force_nonblock)
1720 {
1721         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1722         struct kiocb *kiocb = &req->rw;
1723         struct iov_iter iter;
1724         struct file *file;
1725         size_t iov_count;
1726         ssize_t ret;
1727
1728         ret = io_prep_rw(req, force_nonblock);
1729         if (ret)
1730                 return ret;
1731
1732         file = kiocb->ki_filp;
1733         if (unlikely(!(file->f_mode & FMODE_WRITE)))
1734                 return -EBADF;
1735
1736         ret = io_import_iovec(WRITE, req, &iovec, &iter);
1737         if (ret < 0)
1738                 return ret;
1739
1740         if (req->flags & REQ_F_LINK)
1741                 req->result = ret;
1742
1743         iov_count = iov_iter_count(&iter);
1744
1745         ret = -EAGAIN;
1746         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
1747                 goto out_free;
1748
1749         ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
1750         if (!ret) {
1751                 ssize_t ret2;
1752
1753                 /*
1754                  * Open-code file_start_write here to grab freeze protection,
1755                  * which will be released by another thread in
1756                  * io_complete_rw().  Fool lockdep by telling it the lock got
1757                  * released so that it doesn't complain about the held lock when
1758                  * we return to userspace.
1759                  */
1760                 if (req->flags & REQ_F_ISREG) {
1761                         __sb_start_write(file_inode(file)->i_sb,
1762                                                 SB_FREEZE_WRITE, true);
1763                         __sb_writers_release(file_inode(file)->i_sb,
1764                                                 SB_FREEZE_WRITE);
1765                 }
1766                 kiocb->ki_flags |= IOCB_WRITE;
1767
1768                 if (file->f_op->write_iter)
1769                         ret2 = call_write_iter(file, kiocb, &iter);
1770                 else
1771                         ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
1772                 if (!force_nonblock || ret2 != -EAGAIN)
1773                         kiocb_done(kiocb, ret2, nxt, req->in_async);
1774                 else
1775                         ret = -EAGAIN;
1776         }
1777 out_free:
1778         kfree(iovec);
1779         return ret;
1780 }
1781
1782 /*
1783  * IORING_OP_NOP just posts a completion event, nothing else.
1784  */
1785 static int io_nop(struct io_kiocb *req)
1786 {
1787         struct io_ring_ctx *ctx = req->ctx;
1788
1789         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1790                 return -EINVAL;
1791
1792         io_cqring_add_event(req, 0);
1793         io_put_req(req);
1794         return 0;
1795 }
1796
1797 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1798 {
1799         struct io_ring_ctx *ctx = req->ctx;
1800
1801         if (!req->file)
1802                 return -EBADF;
1803
1804         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1805                 return -EINVAL;
1806         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1807                 return -EINVAL;
1808
1809         return 0;
1810 }
1811
1812 static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1813                     struct io_kiocb **nxt, bool force_nonblock)
1814 {
1815         loff_t sqe_off = READ_ONCE(sqe->off);
1816         loff_t sqe_len = READ_ONCE(sqe->len);
1817         loff_t end = sqe_off + sqe_len;
1818         unsigned fsync_flags;
1819         int ret;
1820
1821         fsync_flags = READ_ONCE(sqe->fsync_flags);
1822         if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1823                 return -EINVAL;
1824
1825         ret = io_prep_fsync(req, sqe);
1826         if (ret)
1827                 return ret;
1828
1829         /* fsync always requires a blocking context */
1830         if (force_nonblock)
1831                 return -EAGAIN;
1832
1833         ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1834                                 end > 0 ? end : LLONG_MAX,
1835                                 fsync_flags & IORING_FSYNC_DATASYNC);
1836
1837         if (ret < 0 && (req->flags & REQ_F_LINK))
1838                 req->flags |= REQ_F_FAIL_LINK;
1839         io_cqring_add_event(req, ret);
1840         io_put_req_find_next(req, nxt);
1841         return 0;
1842 }
1843
1844 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1845 {
1846         struct io_ring_ctx *ctx = req->ctx;
1847         int ret = 0;
1848
1849         if (!req->file)
1850                 return -EBADF;
1851
1852         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1853                 return -EINVAL;
1854         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1855                 return -EINVAL;
1856
1857         return ret;
1858 }
1859
1860 static int io_sync_file_range(struct io_kiocb *req,
1861                               const struct io_uring_sqe *sqe,
1862                               struct io_kiocb **nxt,
1863                               bool force_nonblock)
1864 {
1865         loff_t sqe_off;
1866         loff_t sqe_len;
1867         unsigned flags;
1868         int ret;
1869
1870         ret = io_prep_sfr(req, sqe);
1871         if (ret)
1872                 return ret;
1873
1874         /* sync_file_range always requires a blocking context */
1875         if (force_nonblock)
1876                 return -EAGAIN;
1877
1878         sqe_off = READ_ONCE(sqe->off);
1879         sqe_len = READ_ONCE(sqe->len);
1880         flags = READ_ONCE(sqe->sync_range_flags);
1881
1882         ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
1883
1884         if (ret < 0 && (req->flags & REQ_F_LINK))
1885                 req->flags |= REQ_F_FAIL_LINK;
1886         io_cqring_add_event(req, ret);
1887         io_put_req_find_next(req, nxt);
1888         return 0;
1889 }
1890
#if defined(CONFIG_NET)
/*
 * Shared body of io_sendmsg() and io_recvmsg().
 *
 * @req:            the request; req->file must refer to a socket
 * @sqe:            submission entry supplying msg_flags and the user
 *                  msghdr pointer (sqe->addr)
 * @nxt:            receives a follow-up request from io_put_req_find_next()
 * @force_nonblock: when true, MSG_DONTWAIT is added to the flags (unless the
 *                  application already set it) and an -EAGAIN result is
 *                  returned to the caller instead of being completed
 * @fn:             the send or receive helper to invoke on the socket
 *
 * Returns -EINVAL on IORING_SETUP_IOPOLL rings, -EAGAIN when a forced
 * non-blocking attempt would block, and 0 once a completion has been posted.
 * If req->file is not a socket, the error reported by sock_from_file() is
 * posted as the completion result.
 */
static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			   struct io_kiocb **nxt, bool force_nonblock,
		   long (*fn)(struct socket *, struct user_msghdr __user *,
				unsigned int))
{
	struct socket *sock;
	int ret;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	sock = sock_from_file(req->file, &ret);
	if (sock) {
		struct user_msghdr __user *msg;
		unsigned flags;

		flags = READ_ONCE(sqe->msg_flags);
		if (flags & MSG_DONTWAIT)
			req->flags |= REQ_F_NOWAIT;
		else if (force_nonblock)
			flags |= MSG_DONTWAIT;

		msg = (struct user_msghdr __user *) (unsigned long)
			READ_ONCE(sqe->addr);

		ret = fn(sock, msg, flags);
		if (force_nonblock && ret == -EAGAIN)
			return ret;
		/*
		 * -ERESTARTSYS is a kernel-internal code and must not be
		 * posted to the application; report an interrupted call as
		 * -EINTR, as io_accept() and io_connect() already do.
		 */
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
	}

	io_cqring_add_event(req, ret);
	if (ret < 0 && (req->flags & REQ_F_LINK))
		req->flags |= REQ_F_FAIL_LINK;
	io_put_req_find_next(req, nxt);
	return 0;
}
#endif
1929
1930 static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1931                       struct io_kiocb **nxt, bool force_nonblock)
1932 {
1933 #if defined(CONFIG_NET)
1934         return io_send_recvmsg(req, sqe, nxt, force_nonblock,
1935                                 __sys_sendmsg_sock);
1936 #else
1937         return -EOPNOTSUPP;
1938 #endif
1939 }
1940
1941 static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1942                       struct io_kiocb **nxt, bool force_nonblock)
1943 {
1944 #if defined(CONFIG_NET)
1945         return io_send_recvmsg(req, sqe, nxt, force_nonblock,
1946                                 __sys_recvmsg_sock);
1947 #else
1948         return -EOPNOTSUPP;
1949 #endif
1950 }
1951
1952 static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1953                      struct io_kiocb **nxt, bool force_nonblock)
1954 {
1955 #if defined(CONFIG_NET)
1956         struct sockaddr __user *addr;
1957         int __user *addr_len;
1958         unsigned file_flags;
1959         int flags, ret;
1960
1961         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
1962                 return -EINVAL;
1963         if (sqe->ioprio || sqe->len || sqe->buf_index)
1964                 return -EINVAL;
1965
1966         addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
1967         addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2);
1968         flags = READ_ONCE(sqe->accept_flags);
1969         file_flags = force_nonblock ? O_NONBLOCK : 0;
1970
1971         ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags);
1972         if (ret == -EAGAIN && force_nonblock) {
1973                 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
1974                 return -EAGAIN;
1975         }
1976         if (ret == -ERESTARTSYS)
1977                 ret = -EINTR;
1978         if (ret < 0 && (req->flags & REQ_F_LINK))
1979                 req->flags |= REQ_F_FAIL_LINK;
1980         io_cqring_add_event(req, ret);
1981         io_put_req_find_next(req, nxt);
1982         return 0;
1983 #else
1984         return -EOPNOTSUPP;
1985 #endif
1986 }
1987
1988 static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1989                       struct io_kiocb **nxt, bool force_nonblock)
1990 {
1991 #if defined(CONFIG_NET)
1992         struct sockaddr __user *addr;
1993         unsigned file_flags;
1994         int addr_len, ret;
1995
1996         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
1997                 return -EINVAL;
1998         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
1999                 return -EINVAL;
2000
2001         addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
2002         addr_len = READ_ONCE(sqe->addr2);
2003         file_flags = force_nonblock ? O_NONBLOCK : 0;
2004
2005         ret = __sys_connect_file(req->file, addr, addr_len, file_flags);
2006         if (ret == -EAGAIN && force_nonblock)
2007                 return -EAGAIN;
2008         if (ret == -ERESTARTSYS)
2009                 ret = -EINTR;
2010         if (ret < 0 && (req->flags & REQ_F_LINK))
2011                 req->flags |= REQ_F_FAIL_LINK;
2012         io_cqring_add_event(req, ret);
2013         io_put_req_find_next(req, nxt);
2014         return 0;
2015 #else
2016         return -EOPNOTSUPP;
2017 #endif
2018 }
2019
2020 static inline void io_poll_remove_req(struct io_kiocb *req)
2021 {
2022         if (!RB_EMPTY_NODE(&req->rb_node)) {
2023                 rb_erase(&req->rb_node, &req->ctx->cancel_tree);
2024                 RB_CLEAR_NODE(&req->rb_node);
2025         }
2026 }
2027
2028 static void io_poll_remove_one(struct io_kiocb *req)
2029 {
2030         struct io_poll_iocb *poll = &req->poll;
2031
2032         spin_lock(&poll->head->lock);
2033         WRITE_ONCE(poll->canceled, true);
2034         if (!list_empty(&poll->wait->entry)) {
2035                 list_del_init(&poll->wait->entry);
2036                 io_queue_async_work(req);
2037         }
2038         spin_unlock(&poll->head->lock);
2039         io_poll_remove_req(req);
2040 }
2041
2042 static void io_poll_remove_all(struct io_ring_ctx *ctx)
2043 {
2044         struct rb_node *node;
2045         struct io_kiocb *req;
2046
2047         spin_lock_irq(&ctx->completion_lock);
2048         while ((node = rb_first(&ctx->cancel_tree)) != NULL) {
2049                 req = rb_entry(node, struct io_kiocb, rb_node);
2050                 io_poll_remove_one(req);
2051         }
2052         spin_unlock_irq(&ctx->completion_lock);
2053 }
2054
2055 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
2056 {
2057         struct rb_node *p, *parent = NULL;
2058         struct io_kiocb *req;
2059
2060         p = ctx->cancel_tree.rb_node;
2061         while (p) {
2062                 parent = p;
2063                 req = rb_entry(parent, struct io_kiocb, rb_node);
2064                 if (sqe_addr < req->user_data) {
2065                         p = p->rb_left;
2066                 } else if (sqe_addr > req->user_data) {
2067                         p = p->rb_right;
2068                 } else {
2069                         io_poll_remove_one(req);
2070                         return 0;
2071                 }
2072         }
2073
2074         return -ENOENT;
2075 }
2076
2077 /*
2078  * Find a running poll command that matches one specified in sqe->addr,
2079  * and remove it if found.
2080  */
2081 static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2082 {
2083         struct io_ring_ctx *ctx = req->ctx;
2084         int ret;
2085
2086         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2087                 return -EINVAL;
2088         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
2089             sqe->poll_events)
2090                 return -EINVAL;
2091
2092         spin_lock_irq(&ctx->completion_lock);
2093         ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr));
2094         spin_unlock_irq(&ctx->completion_lock);
2095
2096         io_cqring_add_event(req, ret);
2097         if (ret < 0 && (req->flags & REQ_F_LINK))
2098                 req->flags |= REQ_F_FAIL_LINK;
2099         io_put_req(req);
2100         return 0;
2101 }
2102
2103 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
2104 {
2105         struct io_ring_ctx *ctx = req->ctx;
2106
2107         req->poll.done = true;
2108         kfree(req->poll.wait);
2109         if (error)
2110                 io_cqring_fill_event(req, error);
2111         else
2112                 io_cqring_fill_event(req, mangle_poll(mask));
2113         io_commit_cqring(ctx);
2114 }
2115
2116 static void io_poll_complete_work(struct io_wq_work **workptr)
2117 {
2118         struct io_wq_work *work = *workptr;
2119         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2120         struct io_poll_iocb *poll = &req->poll;
2121         struct poll_table_struct pt = { ._key = poll->events };
2122         struct io_ring_ctx *ctx = req->ctx;
2123         struct io_kiocb *nxt = NULL;
2124         __poll_t mask = 0;
2125         int ret = 0;
2126
2127         if (work->flags & IO_WQ_WORK_CANCEL) {
2128                 WRITE_ONCE(poll->canceled, true);
2129                 ret = -ECANCELED;
2130         } else if (READ_ONCE(poll->canceled)) {
2131                 ret = -ECANCELED;
2132         }
2133
2134         if (ret != -ECANCELED)
2135                 mask = vfs_poll(poll->file, &pt) & poll->events;
2136
2137         /*
2138          * Note that ->ki_cancel callers also delete iocb from active_reqs after
2139          * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
2140          * synchronize with them.  In the cancellation case the list_del_init
2141          * itself is not actually needed, but harmless so we keep it in to
2142          * avoid further branches in the fast path.
2143          */
2144         spin_lock_irq(&ctx->completion_lock);
2145         if (!mask && ret != -ECANCELED) {
2146                 add_wait_queue(poll->head, poll->wait);
2147                 spin_unlock_irq(&ctx->completion_lock);
2148                 return;
2149         }
2150         io_poll_remove_req(req);
2151         io_poll_complete(req, mask, ret);
2152         spin_unlock_irq(&ctx->completion_lock);
2153
2154         io_cqring_ev_posted(ctx);
2155
2156         if (ret < 0 && req->flags & REQ_F_LINK)
2157                 req->flags |= REQ_F_FAIL_LINK;
2158         io_put_req_find_next(req, &nxt);
2159         if (nxt)
2160                 *workptr = &nxt->work;
2161 }
2162
2163 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
2164                         void *key)
2165 {
2166         struct io_poll_iocb *poll = wait->private;
2167         struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
2168         struct io_ring_ctx *ctx = req->ctx;
2169         __poll_t mask = key_to_poll(key);
2170         unsigned long flags;
2171
2172         /* for instances that support it check for an event match first: */
2173         if (mask && !(mask & poll->events))
2174                 return 0;
2175
2176         list_del_init(&poll->wait->entry);
2177
2178         /*
2179          * Run completion inline if we can. We're using trylock here because
2180          * we are violating the completion_lock -> poll wq lock ordering.
2181          * If we have a link timeout we're going to need the completion_lock
2182          * for finalizing the request, mark us as having grabbed that already.
2183          */
2184         if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
2185                 io_poll_remove_req(req);
2186                 io_poll_complete(req, mask, 0);
2187                 req->flags |= REQ_F_COMP_LOCKED;
2188                 io_put_req(req);
2189                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
2190
2191                 io_cqring_ev_posted(ctx);
2192         } else {
2193                 io_queue_async_work(req);
2194         }
2195
2196         return 1;
2197 }
2198
2199 struct io_poll_table {
2200         struct poll_table_struct pt;
2201         struct io_kiocb *req;
2202         int error;
2203 };
2204
2205 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
2206                                struct poll_table_struct *p)
2207 {
2208         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
2209
2210         if (unlikely(pt->req->poll.head)) {
2211                 pt->error = -EINVAL;
2212                 return;
2213         }
2214
2215         pt->error = 0;
2216         pt->req->poll.head = head;
2217         add_wait_queue(head, pt->req->poll.wait);
2218 }
2219
2220 static void io_poll_req_insert(struct io_kiocb *req)
2221 {
2222         struct io_ring_ctx *ctx = req->ctx;
2223         struct rb_node **p = &ctx->cancel_tree.rb_node;
2224         struct rb_node *parent = NULL;
2225         struct io_kiocb *tmp;
2226
2227         while (*p) {
2228                 parent = *p;
2229                 tmp = rb_entry(parent, struct io_kiocb, rb_node);
2230                 if (req->user_data < tmp->user_data)
2231                         p = &(*p)->rb_left;
2232                 else
2233                         p = &(*p)->rb_right;
2234         }
2235         rb_link_node(&req->rb_node, parent, p);
2236         rb_insert_color(&req->rb_node, &ctx->cancel_tree);
2237 }
2238
2239 static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2240                        struct io_kiocb **nxt)
2241 {
2242         struct io_poll_iocb *poll = &req->poll;
2243         struct io_ring_ctx *ctx = req->ctx;
2244         struct io_poll_table ipt;
2245         bool cancel = false;
2246         __poll_t mask;
2247         u16 events;
2248
2249         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2250                 return -EINVAL;
2251         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
2252                 return -EINVAL;
2253         if (!poll->file)
2254                 return -EBADF;
2255
2256         poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL);
2257         if (!poll->wait)
2258                 return -ENOMEM;
2259
2260         req->sqe = NULL;
2261         INIT_IO_WORK(&req->work, io_poll_complete_work);
2262         events = READ_ONCE(sqe->poll_events);
2263         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
2264         RB_CLEAR_NODE(&req->rb_node);
2265
2266         poll->head = NULL;
2267         poll->done = false;
2268         poll->canceled = false;
2269
2270         ipt.pt._qproc = io_poll_queue_proc;
2271         ipt.pt._key = poll->events;
2272         ipt.req = req;
2273         ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
2274
2275         /* initialized the list so that we can do list_empty checks */
2276         INIT_LIST_HEAD(&poll->wait->entry);
2277         init_waitqueue_func_entry(poll->wait, io_poll_wake);
2278         poll->wait->private = poll;
2279
2280         INIT_LIST_HEAD(&req->list);
2281
2282         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
2283
2284         spin_lock_irq(&ctx->completion_lock);
2285         if (likely(poll->head)) {
2286                 spin_lock(&poll->head->lock);
2287                 if (unlikely(list_empty(&poll->wait->entry))) {
2288                         if (ipt.error)
2289                                 cancel = true;
2290                         ipt.error = 0;
2291                         mask = 0;
2292                 }
2293                 if (mask || ipt.error)
2294                         list_del_init(&poll->wait->entry);
2295                 else if (cancel)
2296                         WRITE_ONCE(poll->canceled, true);
2297                 else if (!poll->done) /* actually waiting for an event */
2298                         io_poll_req_insert(req);
2299                 spin_unlock(&poll->head->lock);
2300         }
2301         if (mask) { /* no async, we'd stolen it */
2302                 ipt.error = 0;
2303                 io_poll_complete(req, mask, 0);
2304         }
2305         spin_unlock_irq(&ctx->completion_lock);
2306
2307         if (mask) {
2308                 io_cqring_ev_posted(ctx);
2309                 io_put_req_find_next(req, nxt);
2310         }
2311         return ipt.error;
2312 }
2313
2314 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
2315 {
2316         struct io_timeout_data *data = container_of(timer,
2317                                                 struct io_timeout_data, timer);
2318         struct io_kiocb *req = data->req;
2319         struct io_ring_ctx *ctx = req->ctx;
2320         unsigned long flags;
2321
2322         atomic_inc(&ctx->cq_timeouts);
2323
2324         spin_lock_irqsave(&ctx->completion_lock, flags);
2325         /*
2326          * We could be racing with timeout deletion. If the list is empty,
2327          * then timeout lookup already found it and will be handling it.
2328          */
2329         if (!list_empty(&req->list)) {
2330                 struct io_kiocb *prev;
2331
2332                 /*
2333                  * Adjust the reqs sequence before the current one because it
2334                  * will consume a slot in the cq_ring and the the cq_tail
2335                  * pointer will be increased, otherwise other timeout reqs may
2336                  * return in advance without waiting for enough wait_nr.
2337                  */
2338                 prev = req;
2339                 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
2340                         prev->sequence++;
2341                 list_del_init(&req->list);
2342         }
2343
2344         io_cqring_fill_event(req, -ETIME);
2345         io_commit_cqring(ctx);
2346         spin_unlock_irqrestore(&ctx->completion_lock, flags);
2347
2348         io_cqring_ev_posted(ctx);
2349         if (req->flags & REQ_F_LINK)
2350                 req->flags |= REQ_F_FAIL_LINK;
2351         io_put_req(req);
2352         return HRTIMER_NORESTART;
2353 }
2354
2355 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
2356 {
2357         struct io_kiocb *req;
2358         int ret = -ENOENT;
2359
2360         list_for_each_entry(req, &ctx->timeout_list, list) {
2361                 if (user_data == req->user_data) {
2362                         list_del_init(&req->list);
2363                         ret = 0;
2364                         break;
2365                 }
2366         }
2367
2368         if (ret == -ENOENT)
2369                 return ret;
2370
2371         ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
2372         if (ret == -1)
2373                 return -EALREADY;
2374
2375         if (req->flags & REQ_F_LINK)
2376                 req->flags |= REQ_F_FAIL_LINK;
2377         io_cqring_fill_event(req, -ECANCELED);
2378         io_put_req(req);
2379         return 0;
2380 }
2381
2382 /*
2383  * Remove or update an existing timeout command
2384  */
2385 static int io_timeout_remove(struct io_kiocb *req,
2386                              const struct io_uring_sqe *sqe)
2387 {
2388         struct io_ring_ctx *ctx = req->ctx;
2389         unsigned flags;
2390         int ret;
2391
2392         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2393                 return -EINVAL;
2394         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
2395                 return -EINVAL;
2396         flags = READ_ONCE(sqe->timeout_flags);
2397         if (flags)
2398                 return -EINVAL;
2399
2400         spin_lock_irq(&ctx->completion_lock);
2401         ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr));
2402
2403         io_cqring_fill_event(req, ret);
2404         io_commit_cqring(ctx);
2405         spin_unlock_irq(&ctx->completion_lock);
2406         io_cqring_ev_posted(ctx);
2407         if (ret < 0 && req->flags & REQ_F_LINK)
2408                 req->flags |= REQ_F_FAIL_LINK;
2409         io_put_req(req);
2410         return 0;
2411 }
2412
2413 static int io_timeout_setup(struct io_kiocb *req)
2414 {
2415         const struct io_uring_sqe *sqe = req->sqe;
2416         struct io_timeout_data *data;
2417         unsigned flags;
2418
2419         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2420                 return -EINVAL;
2421         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
2422                 return -EINVAL;
2423         flags = READ_ONCE(sqe->timeout_flags);
2424         if (flags & ~IORING_TIMEOUT_ABS)
2425                 return -EINVAL;
2426
2427         data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL);
2428         if (!data)
2429                 return -ENOMEM;
2430         data->req = req;
2431         req->timeout.data = data;
2432         req->flags |= REQ_F_TIMEOUT;
2433
2434         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
2435                 return -EFAULT;
2436
2437         if (flags & IORING_TIMEOUT_ABS)
2438                 data->mode = HRTIMER_MODE_ABS;
2439         else
2440                 data->mode = HRTIMER_MODE_REL;
2441
2442         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
2443         return 0;
2444 }
2445
2446 static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2447 {
2448         unsigned count;
2449         struct io_ring_ctx *ctx = req->ctx;
2450         struct io_timeout_data *data;
2451         struct list_head *entry;
2452         unsigned span = 0;
2453         int ret;
2454
2455         ret = io_timeout_setup(req);
2456         /* common setup allows flags (like links) set, we don't */
2457         if (!ret && sqe->flags)
2458                 ret = -EINVAL;
2459         if (ret)
2460                 return ret;
2461
2462         /*
2463          * sqe->off holds how many events that need to occur for this
2464          * timeout event to be satisfied. If it isn't set, then this is
2465          * a pure timeout request, sequence isn't used.
2466          */
2467         count = READ_ONCE(sqe->off);
2468         if (!count) {
2469                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
2470                 spin_lock_irq(&ctx->completion_lock);
2471                 entry = ctx->timeout_list.prev;
2472                 goto add;
2473         }
2474
2475         req->sequence = ctx->cached_sq_head + count - 1;
2476         req->timeout.data->seq_offset = count;
2477
2478         /*
2479          * Insertion sort, ensuring the first entry in the list is always
2480          * the one we need first.
2481          */
2482         spin_lock_irq(&ctx->completion_lock);
2483         list_for_each_prev(entry, &ctx->timeout_list) {
2484                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
2485                 unsigned nxt_sq_head;
2486                 long long tmp, tmp_nxt;
2487                 u32 nxt_offset = nxt->timeout.data->seq_offset;
2488
2489                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
2490                         continue;
2491
2492                 /*
2493                  * Since cached_sq_head + count - 1 can overflow, use type long
2494                  * long to store it.
2495                  */
2496                 tmp = (long long)ctx->cached_sq_head + count - 1;
2497                 nxt_sq_head = nxt->sequence - nxt_offset + 1;
2498                 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
2499
2500                 /*
2501                  * cached_sq_head may overflow, and it will never overflow twice
2502                  * once there is some timeout req still be valid.
2503                  */
2504                 if (ctx->cached_sq_head < nxt_sq_head)
2505                         tmp += UINT_MAX;
2506
2507                 if (tmp > tmp_nxt)
2508                         break;
2509
2510                 /*
2511                  * Sequence of reqs after the insert one and itself should
2512                  * be adjusted because each timeout req consumes a slot.
2513                  */
2514                 span++;
2515                 nxt->sequence++;
2516         }
2517         req->sequence -= span;
2518 add:
2519         list_add(&req->list, entry);
2520         data = req->timeout.data;
2521         data->timer.function = io_timeout_fn;
2522         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
2523         spin_unlock_irq(&ctx->completion_lock);
2524         return 0;
2525 }
2526
2527 static bool io_cancel_cb(struct io_wq_work *work, void *data)
2528 {
2529         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2530
2531         return req->user_data == (unsigned long) data;
2532 }
2533
2534 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
2535 {
2536         enum io_wq_cancel cancel_ret;
2537         int ret = 0;
2538
2539         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
2540         switch (cancel_ret) {
2541         case IO_WQ_CANCEL_OK:
2542                 ret = 0;
2543                 break;
2544         case IO_WQ_CANCEL_RUNNING:
2545                 ret = -EALREADY;
2546                 break;
2547         case IO_WQ_CANCEL_NOTFOUND:
2548                 ret = -ENOENT;
2549                 break;
2550         }
2551
2552         return ret;
2553 }
2554
2555 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
2556                                      struct io_kiocb *req, __u64 sqe_addr,
2557                                      struct io_kiocb **nxt, int success_ret)
2558 {
2559         unsigned long flags;
2560         int ret;
2561
2562         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
2563         if (ret != -ENOENT) {
2564                 spin_lock_irqsave(&ctx->completion_lock, flags);
2565                 goto done;
2566         }
2567
2568         spin_lock_irqsave(&ctx->completion_lock, flags);
2569         ret = io_timeout_cancel(ctx, sqe_addr);
2570         if (ret != -ENOENT)
2571                 goto done;
2572         ret = io_poll_cancel(ctx, sqe_addr);
2573 done:
2574         if (!ret)
2575                 ret = success_ret;
2576         io_cqring_fill_event(req, ret);
2577         io_commit_cqring(ctx);
2578         spin_unlock_irqrestore(&ctx->completion_lock, flags);
2579         io_cqring_ev_posted(ctx);
2580
2581         if (ret < 0 && (req->flags & REQ_F_LINK))
2582                 req->flags |= REQ_F_FAIL_LINK;
2583         io_put_req_find_next(req, nxt);
2584 }
2585
2586 static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2587                            struct io_kiocb **nxt)
2588 {
2589         struct io_ring_ctx *ctx = req->ctx;
2590
2591         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2592                 return -EINVAL;
2593         if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
2594             sqe->cancel_flags)
2595                 return -EINVAL;
2596
2597         io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0);
2598         return 0;
2599 }
2600
2601 static int io_req_defer(struct io_kiocb *req)
2602 {
2603         struct io_uring_sqe *sqe_copy;
2604         struct io_ring_ctx *ctx = req->ctx;
2605
2606         /* Still need defer if there is pending req in defer list. */
2607         if (!req_need_defer(req) && list_empty(&ctx->defer_list))
2608                 return 0;
2609
2610         sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
2611         if (!sqe_copy)
2612                 return -EAGAIN;
2613
2614         spin_lock_irq(&ctx->completion_lock);
2615         if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
2616                 spin_unlock_irq(&ctx->completion_lock);
2617                 kfree(sqe_copy);
2618                 return 0;
2619         }
2620
2621         memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy));
2622         req->flags |= REQ_F_FREE_SQE;
2623         req->sqe = sqe_copy;
2624
2625         trace_io_uring_defer(ctx, req, req->user_data);
2626         list_add_tail(&req->list, &ctx->defer_list);
2627         spin_unlock_irq(&ctx->completion_lock);
2628         return -EIOCBQUEUED;
2629 }
2630
2631 __attribute__((nonnull))
2632 static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
2633                         bool force_nonblock)
2634 {
2635         int ret, opcode;
2636         struct io_ring_ctx *ctx = req->ctx;
2637
2638         opcode = READ_ONCE(req->sqe->opcode);
2639         switch (opcode) {
2640         case IORING_OP_NOP:
2641                 ret = io_nop(req);
2642                 break;
2643         case IORING_OP_READV:
2644                 if (unlikely(req->sqe->buf_index))
2645                         return -EINVAL;
2646                 ret = io_read(req, nxt, force_nonblock);
2647                 break;
2648         case IORING_OP_WRITEV:
2649                 if (unlikely(req->sqe->buf_index))
2650                         return -EINVAL;
2651                 ret = io_write(req, nxt, force_nonblock);
2652                 break;
2653         case IORING_OP_READ_FIXED:
2654                 ret = io_read(req, nxt, force_nonblock);
2655                 break;
2656         case IORING_OP_WRITE_FIXED:
2657                 ret = io_write(req, nxt, force_nonblock);
2658                 break;
2659         case IORING_OP_FSYNC:
2660                 ret = io_fsync(req, req->sqe, nxt, force_nonblock);
2661                 break;
2662         case IORING_OP_POLL_ADD:
2663                 ret = io_poll_add(req, req->sqe, nxt);
2664                 break;
2665         case IORING_OP_POLL_REMOVE:
2666                 ret = io_poll_remove(req, req->sqe);
2667                 break;
2668         case IORING_OP_SYNC_FILE_RANGE:
2669                 ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock);
2670                 break;
2671         case IORING_OP_SENDMSG:
2672                 ret = io_sendmsg(req, req->sqe, nxt, force_nonblock);
2673                 break;
2674         case IORING_OP_RECVMSG:
2675                 ret = io_recvmsg(req, req->sqe, nxt, force_nonblock);
2676                 break;
2677         case IORING_OP_TIMEOUT:
2678                 ret = io_timeout(req, req->sqe);
2679                 break;
2680         case IORING_OP_TIMEOUT_REMOVE:
2681                 ret = io_timeout_remove(req, req->sqe);
2682                 break;
2683         case IORING_OP_ACCEPT:
2684                 ret = io_accept(req, req->sqe, nxt, force_nonblock);
2685                 break;
2686         case IORING_OP_CONNECT:
2687                 ret = io_connect(req, req->sqe, nxt, force_nonblock);
2688                 break;
2689         case IORING_OP_ASYNC_CANCEL:
2690                 ret = io_async_cancel(req, req->sqe, nxt);
2691                 break;
2692         default:
2693                 ret = -EINVAL;
2694                 break;
2695         }
2696
2697         if (ret)
2698                 return ret;
2699
2700         if (ctx->flags & IORING_SETUP_IOPOLL) {
2701                 if (req->result == -EAGAIN)
2702                         return -EAGAIN;
2703
2704                 /* workqueue context doesn't hold uring_lock, grab it now */
2705                 if (req->in_async)
2706                         mutex_lock(&ctx->uring_lock);
2707                 io_iopoll_req_issued(req);
2708                 if (req->in_async)
2709                         mutex_unlock(&ctx->uring_lock);
2710         }
2711
2712         return 0;
2713 }
2714
2715 static void io_link_work_cb(struct io_wq_work **workptr)
2716 {
2717         struct io_wq_work *work = *workptr;
2718         struct io_kiocb *link = work->data;
2719
2720         io_queue_linked_timeout(link);
2721         work->func = io_wq_submit_work;
2722 }
2723
2724 static void io_wq_submit_work(struct io_wq_work **workptr)
2725 {
2726         struct io_wq_work *work = *workptr;
2727         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2728         struct io_kiocb *nxt = NULL;
2729         int ret = 0;
2730
2731         /* Ensure we clear previously set non-block flag */
2732         req->rw.ki_flags &= ~IOCB_NOWAIT;
2733
2734         if (work->flags & IO_WQ_WORK_CANCEL)
2735                 ret = -ECANCELED;
2736
2737         if (!ret) {
2738                 req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
2739                 req->in_async = true;
2740                 do {
2741                         ret = io_issue_sqe(req, &nxt, false);
2742                         /*
2743                          * We can get EAGAIN for polled IO even though we're
2744                          * forcing a sync submission from here, since we can't
2745                          * wait for request slots on the block side.
2746                          */
2747                         if (ret != -EAGAIN)
2748                                 break;
2749                         cond_resched();
2750                 } while (1);
2751         }
2752
2753         /* drop submission reference */
2754         io_put_req(req);
2755
2756         if (ret) {
2757                 if (req->flags & REQ_F_LINK)
2758                         req->flags |= REQ_F_FAIL_LINK;
2759                 io_cqring_add_event(req, ret);
2760                 io_put_req(req);
2761         }
2762
2763         /* if a dependent link is ready, pass it back */
2764         if (!ret && nxt) {
2765                 struct io_kiocb *link;
2766
2767                 io_prep_async_work(nxt, &link);
2768                 *workptr = &nxt->work;
2769                 if (link) {
2770                         nxt->work.flags |= IO_WQ_WORK_CB;
2771                         nxt->work.func = io_link_work_cb;
2772                         nxt->work.data = link;
2773                 }
2774         }
2775 }
2776
2777 static bool io_op_needs_file(const struct io_uring_sqe *sqe)
2778 {
2779         int op = READ_ONCE(sqe->opcode);
2780
2781         switch (op) {
2782         case IORING_OP_NOP:
2783         case IORING_OP_POLL_REMOVE:
2784         case IORING_OP_TIMEOUT:
2785         case IORING_OP_TIMEOUT_REMOVE:
2786         case IORING_OP_ASYNC_CANCEL:
2787         case IORING_OP_LINK_TIMEOUT:
2788                 return false;
2789         default:
2790                 return true;
2791         }
2792 }
2793
2794 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
2795                                               int index)
2796 {
2797         struct fixed_file_table *table;
2798
2799         table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
2800         return table->files[index & IORING_FILE_TABLE_MASK];
2801 }
2802
2803 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
2804 {
2805         struct io_ring_ctx *ctx = req->ctx;
2806         unsigned flags;
2807         int fd;
2808
2809         flags = READ_ONCE(req->sqe->flags);
2810         fd = READ_ONCE(req->sqe->fd);
2811
2812         if (flags & IOSQE_IO_DRAIN)
2813                 req->flags |= REQ_F_IO_DRAIN;
2814
2815         if (!io_op_needs_file(req->sqe))
2816                 return 0;
2817
2818         if (flags & IOSQE_FIXED_FILE) {
2819                 if (unlikely(!ctx->file_table ||
2820                     (unsigned) fd >= ctx->nr_user_files))
2821                         return -EBADF;
2822                 fd = array_index_nospec(fd, ctx->nr_user_files);
2823                 req->file = io_file_from_index(ctx, fd);
2824                 if (!req->file)
2825                         return -EBADF;
2826                 req->flags |= REQ_F_FIXED_FILE;
2827         } else {
2828                 if (req->needs_fixed_file)
2829                         return -EBADF;
2830                 trace_io_uring_file_get(ctx, fd);
2831                 req->file = io_file_get(state, fd);
2832                 if (unlikely(!req->file))
2833                         return -EBADF;
2834         }
2835
2836         return 0;
2837 }
2838
2839 static int io_grab_files(struct io_kiocb *req)
2840 {
2841         int ret = -EBADF;
2842         struct io_ring_ctx *ctx = req->ctx;
2843
2844         rcu_read_lock();
2845         spin_lock_irq(&ctx->inflight_lock);
2846         /*
2847          * We use the f_ops->flush() handler to ensure that we can flush
2848          * out work accessing these files if the fd is closed. Check if
2849          * the fd has changed since we started down this path, and disallow
2850          * this operation if it has.
2851          */
2852         if (fcheck(req->ring_fd) == req->ring_file) {
2853                 list_add(&req->inflight_entry, &ctx->inflight_list);
2854                 req->flags |= REQ_F_INFLIGHT;
2855                 req->work.files = current->files;
2856                 ret = 0;
2857         }
2858         spin_unlock_irq(&ctx->inflight_lock);
2859         rcu_read_unlock();
2860
2861         return ret;
2862 }
2863
2864 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
2865 {
2866         struct io_timeout_data *data = container_of(timer,
2867                                                 struct io_timeout_data, timer);
2868         struct io_kiocb *req = data->req;
2869         struct io_ring_ctx *ctx = req->ctx;
2870         struct io_kiocb *prev = NULL;
2871         unsigned long flags;
2872
2873         spin_lock_irqsave(&ctx->completion_lock, flags);
2874
2875         /*
2876          * We don't expect the list to be empty, that will only happen if we
2877          * race with the completion of the linked work.
2878          */
2879         if (!list_empty(&req->list)) {
2880                 prev = list_entry(req->list.prev, struct io_kiocb, link_list);
2881                 if (refcount_inc_not_zero(&prev->refs)) {
2882                         list_del_init(&req->list);
2883                         prev->flags &= ~REQ_F_LINK_TIMEOUT;
2884                 } else
2885                         prev = NULL;
2886         }
2887
2888         spin_unlock_irqrestore(&ctx->completion_lock, flags);
2889
2890         if (prev) {
2891                 if (prev->flags & REQ_F_LINK)
2892                         prev->flags |= REQ_F_FAIL_LINK;
2893                 io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
2894                                                 -ETIME);
2895                 io_put_req(prev);
2896         } else {
2897                 io_cqring_add_event(req, -ETIME);
2898                 io_put_req(req);
2899         }
2900         return HRTIMER_NORESTART;
2901 }
2902
2903 static void io_queue_linked_timeout(struct io_kiocb *req)
2904 {
2905         struct io_ring_ctx *ctx = req->ctx;
2906
2907         /*
2908          * If the list is now empty, then our linked request finished before
2909          * we got a chance to setup the timer
2910          */
2911         spin_lock_irq(&ctx->completion_lock);
2912         if (!list_empty(&req->list)) {
2913                 struct io_timeout_data *data = req->timeout.data;
2914
2915                 data->timer.function = io_link_timeout_fn;
2916                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
2917                                 data->mode);
2918         }
2919         spin_unlock_irq(&ctx->completion_lock);
2920
2921         /* drop submission reference */
2922         io_put_req(req);
2923 }
2924
2925 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
2926 {
2927         struct io_kiocb *nxt;
2928
2929         if (!(req->flags & REQ_F_LINK))
2930                 return NULL;
2931
2932         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
2933         if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT)
2934                 return NULL;
2935
2936         req->flags |= REQ_F_LINK_TIMEOUT;
2937         return nxt;
2938 }
2939
2940 static void __io_queue_sqe(struct io_kiocb *req)
2941 {
2942         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
2943         struct io_kiocb *nxt = NULL;
2944         int ret;
2945
2946         ret = io_issue_sqe(req, &nxt, true);
2947         if (nxt)
2948                 io_queue_async_work(nxt);
2949
2950         /*
2951          * We async punt it if the file wasn't marked NOWAIT, or if the file
2952          * doesn't support non-blocking read/write attempts
2953          */
2954         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
2955             (req->flags & REQ_F_MUST_PUNT))) {
2956                 struct io_uring_sqe *sqe_copy;
2957
2958                 sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2959                 if (!sqe_copy)
2960                         goto err;
2961
2962                 req->sqe = sqe_copy;
2963                 req->flags |= REQ_F_FREE_SQE;
2964
2965                 if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
2966                         ret = io_grab_files(req);
2967                         if (ret)
2968                                 goto err;
2969                 }
2970
2971                 /*
2972                  * Queued up for async execution, worker will release
2973                  * submit reference when the iocb is actually submitted.
2974                  */
2975                 io_queue_async_work(req);
2976                 return;
2977         }
2978
2979 err:
2980         /* drop submission reference */
2981         io_put_req(req);
2982
2983         if (linked_timeout) {
2984                 if (!ret)
2985                         io_queue_linked_timeout(linked_timeout);
2986                 else
2987                         io_put_req(linked_timeout);
2988         }
2989
2990         /* and drop final reference, if we failed */
2991         if (ret) {
2992                 io_cqring_add_event(req, ret);
2993                 if (req->flags & REQ_F_LINK)
2994                         req->flags |= REQ_F_FAIL_LINK;
2995                 io_put_req(req);
2996         }
2997 }
2998
2999 static void io_queue_sqe(struct io_kiocb *req)
3000 {
3001         int ret;
3002
3003         if (unlikely(req->ctx->drain_next)) {
3004                 req->flags |= REQ_F_IO_DRAIN;
3005                 req->ctx->drain_next = false;
3006         }
3007         req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
3008
3009         ret = io_req_defer(req);
3010         if (ret) {
3011                 if (ret != -EIOCBQUEUED) {
3012                         io_cqring_add_event(req, ret);
3013                         if (req->flags & REQ_F_LINK)
3014                                 req->flags |= REQ_F_FAIL_LINK;
3015                         io_double_put_req(req);
3016                 }
3017         } else
3018                 __io_queue_sqe(req);
3019 }
3020
3021 static inline void io_queue_link_head(struct io_kiocb *req)
3022 {
3023         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
3024                 io_cqring_add_event(req, -ECANCELED);
3025                 io_double_put_req(req);
3026         } else
3027                 io_queue_sqe(req);
3028 }
3029
3030
3031 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
3032
3033 static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
3034                           struct io_kiocb **link)
3035 {
3036         struct io_ring_ctx *ctx = req->ctx;
3037         int ret;
3038
3039         req->user_data = req->sqe->user_data;
3040
3041         /* enforce forwards compatibility on users */
3042         if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) {
3043                 ret = -EINVAL;
3044                 goto err_req;
3045         }
3046
3047         ret = io_req_set_file(state, req);
3048         if (unlikely(ret)) {
3049 err_req:
3050                 io_cqring_add_event(req, ret);
3051                 io_double_put_req(req);
3052                 return;
3053         }
3054
3055         /*
3056          * If we already have a head request, queue this one for async
3057          * submittal once the head completes. If we don't have a head but
3058          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
3059          * submitted sync once the chain is complete. If none of those
3060          * conditions are true (normal request), then just queue it.
3061          */
3062         if (*link) {
3063                 struct io_kiocb *prev = *link;
3064                 struct io_uring_sqe *sqe_copy;
3065
3066                 if (req->sqe->flags & IOSQE_IO_DRAIN)
3067                         (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
3068
3069                 if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
3070                         ret = io_timeout_setup(req);
3071                         /* common setup allows offset being set, we don't */
3072                         if (!ret && req->sqe->off)
3073                                 ret = -EINVAL;
3074                         if (ret) {
3075                                 prev->flags |= REQ_F_FAIL_LINK;
3076                                 goto err_req;
3077                         }
3078                 }
3079
3080                 sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
3081                 if (!sqe_copy) {
3082                         ret = -EAGAIN;
3083                         goto err_req;
3084                 }
3085
3086                 req->sqe = sqe_copy;
3087                 req->flags |= REQ_F_FREE_SQE;
3088                 trace_io_uring_link(ctx, req, prev);
3089                 list_add_tail(&req->list, &prev->link_list);
3090         } else if (req->sqe->flags & IOSQE_IO_LINK) {
3091                 req->flags |= REQ_F_LINK;
3092
3093                 INIT_LIST_HEAD(&req->link_list);
3094                 *link = req;
3095         } else {
3096                 io_queue_sqe(req);
3097         }
3098 }
3099
3100 /*
3101  * Batched submission is done, ensure local IO is flushed out.
3102  */
3103 static void io_submit_state_end(struct io_submit_state *state)
3104 {
3105         blk_finish_plug(&state->plug);
3106         io_file_put(state);
3107         if (state->free_reqs)
3108                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
3109                                         &state->reqs[state->cur_req]);
3110 }
3111
3112 /*
3113  * Start submission side cache.
3114  */
3115 static void io_submit_state_start(struct io_submit_state *state,
3116                                   struct io_ring_ctx *ctx, unsigned max_ios)
3117 {
3118         blk_start_plug(&state->plug);
3119         state->free_reqs = 0;
3120         state->file = NULL;
3121         state->ios_left = max_ios;
3122 }
3123
3124 static void io_commit_sqring(struct io_ring_ctx *ctx)
3125 {
3126         struct io_rings *rings = ctx->rings;
3127
3128         if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
3129                 /*
3130                  * Ensure any loads from the SQEs are done at this point,
3131                  * since once we write the new head, the application could
3132                  * write new data to them.
3133                  */
3134                 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
3135         }
3136 }
3137
3138 /*
3139  * Fetch an sqe, if one is available. Note that s->sqe will point to memory
3140  * that is mapped by userspace. This means that care needs to be taken to
3141  * ensure that reads are stable, as we cannot rely on userspace always
3142  * being a good citizen. If members of the sqe are validated and then later
3143  * used, it's important that those reads are done through READ_ONCE() to
3144  * prevent a re-load down the line.
3145  */
3146 static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req)
3147 {
3148         struct io_rings *rings = ctx->rings;
3149         u32 *sq_array = ctx->sq_array;
3150         unsigned head;
3151
3152         /*
3153          * The cached sq head (or cq tail) serves two purposes:
3154          *
3155          * 1) allows us to batch the cost of updating the user visible
3156          *    head updates.
3157          * 2) allows the kernel side to track the head on its own, even
3158          *    though the application is the one updating it.
3159          */
3160         head = ctx->cached_sq_head;
3161         /* make sure SQ entry isn't read before tail */
3162         if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
3163                 return false;
3164
3165         head = READ_ONCE(sq_array[head & ctx->sq_mask]);
3166         if (likely(head < ctx->sq_entries)) {
3167                 /*
3168                  * All io need record the previous position, if LINK vs DARIN,
3169                  * it can be used to mark the position of the first IO in the
3170                  * link list.
3171                  */
3172                 req->sequence = ctx->cached_sq_head;
3173                 req->sqe = &ctx->sq_sqes[head];
3174                 ctx->cached_sq_head++;
3175                 return true;
3176         }
3177
3178         /* drop invalid entries */
3179         ctx->cached_sq_head++;
3180         ctx->cached_sq_dropped++;
3181         WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
3182         return false;
3183 }
3184
3185 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
3186                           struct file *ring_file, int ring_fd,
3187                           struct mm_struct **mm, bool async)
3188 {
3189         struct io_submit_state state, *statep = NULL;
3190         struct io_kiocb *link = NULL;
3191         int i, submitted = 0;
3192         bool mm_fault = false;
3193
3194         /* if we have a backlog and couldn't flush it all, return BUSY */
3195         if (!list_empty(&ctx->cq_overflow_list) &&
3196             !io_cqring_overflow_flush(ctx, false))
3197                 return -EBUSY;
3198
3199         if (nr > IO_PLUG_THRESHOLD) {
3200                 io_submit_state_start(&state, ctx, nr);
3201                 statep = &state;
3202         }
3203
3204         for (i = 0; i < nr; i++) {
3205                 struct io_kiocb *req;
3206                 unsigned int sqe_flags;
3207
3208                 req = io_get_req(ctx, statep);
3209                 if (unlikely(!req)) {
3210                         if (!submitted)
3211                                 submitted = -EAGAIN;
3212                         break;
3213                 }
3214                 if (!io_get_sqring(ctx, req)) {
3215                         __io_free_req(req);
3216                         break;
3217                 }
3218
3219                 if (io_sqe_needs_user(req->sqe) && !*mm) {
3220                         mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
3221                         if (!mm_fault) {
3222                                 use_mm(ctx->sqo_mm);
3223                                 *mm = ctx->sqo_mm;
3224                         }
3225                 }
3226
3227                 sqe_flags = req->sqe->flags;
3228
3229                 req->ring_file = ring_file;
3230                 req->ring_fd = ring_fd;
3231                 req->has_user = *mm != NULL;
3232                 req->in_async = async;
3233                 req->needs_fixed_file = async;
3234                 trace_io_uring_submit_sqe(ctx, req->sqe->user_data,
3235                                           true, async);
3236                 io_submit_sqe(req, statep, &link);
3237                 submitted++;
3238
3239                 /*
3240                  * If previous wasn't linked and we have a linked command,
3241                  * that's the end of the chain. Submit the previous link.
3242                  */
3243                 if (!(sqe_flags & IOSQE_IO_LINK) && link) {
3244                         io_queue_link_head(link);
3245                         link = NULL;
3246                 }
3247         }
3248
3249         if (link)
3250                 io_queue_link_head(link);
3251         if (statep)
3252                 io_submit_state_end(&state);
3253
3254          /* Commit SQ ring head once we've consumed and submitted all SQEs */
3255         io_commit_sqring(ctx);
3256
3257         return submitted;
3258 }
3259
3260 static int io_sq_thread(void *data)
3261 {
3262         struct io_ring_ctx *ctx = data;
3263         struct mm_struct *cur_mm = NULL;
3264         const struct cred *old_cred;
3265         mm_segment_t old_fs;
3266         DEFINE_WAIT(wait);
3267         unsigned inflight;
3268         unsigned long timeout;
3269         int ret;
3270
3271         complete(&ctx->completions[1]);
3272
3273         old_fs = get_fs();
3274         set_fs(USER_DS);
3275         old_cred = override_creds(ctx->creds);
3276
3277         ret = timeout = inflight = 0;
3278         while (!kthread_should_park()) {
3279                 unsigned int to_submit;
3280
3281                 if (inflight) {
3282                         unsigned nr_events = 0;
3283
3284                         if (ctx->flags & IORING_SETUP_IOPOLL) {
3285                                 /*
3286                                  * inflight is the count of the maximum possible
3287                                  * entries we submitted, but it can be smaller
3288                                  * if we dropped some of them. If we don't have
3289                                  * poll entries available, then we know that we
3290                                  * have nothing left to poll for. Reset the
3291                                  * inflight count to zero in that case.
3292                                  */
3293                                 mutex_lock(&ctx->uring_lock);
3294                                 if (!list_empty(&ctx->poll_list))
3295                                         __io_iopoll_check(ctx, &nr_events, 0);
3296                                 else
3297                                         inflight = 0;
3298                                 mutex_unlock(&ctx->uring_lock);
3299                         } else {
3300                                 /*
3301                                  * Normal IO, just pretend everything completed.
3302                                  * We don't have to poll completions for that.
3303                                  */
3304                                 nr_events = inflight;
3305                         }
3306
3307                         inflight -= nr_events;
3308                         if (!inflight)
3309                                 timeout = jiffies + ctx->sq_thread_idle;
3310                 }
3311
3312                 to_submit = io_sqring_entries(ctx);
3313
3314                 /*
3315                  * If submit got -EBUSY, flag us as needing the application
3316                  * to enter the kernel to reap and flush events.
3317                  */
3318                 if (!to_submit || ret == -EBUSY) {
3319                         /*
3320                          * We're polling. If we're within the defined idle
3321                          * period, then let us spin without work before going
3322                          * to sleep. The exception is if we got EBUSY doing
3323                          * more IO, we should wait for the application to
3324                          * reap events and wake us up.
3325                          */
3326                         if (inflight ||
3327                             (!time_after(jiffies, timeout) && ret != -EBUSY)) {
3328                                 cond_resched();
3329                                 continue;
3330                         }
3331
3332                         /*
3333                          * Drop cur_mm before scheduling, we can't hold it for
3334                          * long periods (or over schedule()). Do this before
3335                          * adding ourselves to the waitqueue, as the unuse/drop
3336                          * may sleep.
3337                          */
3338                         if (cur_mm) {
3339                                 unuse_mm(cur_mm);
3340                                 mmput(cur_mm);
3341                                 cur_mm = NULL;
3342                         }
3343
3344                         prepare_to_wait(&ctx->sqo_wait, &wait,
3345                                                 TASK_INTERRUPTIBLE);
3346
3347                         /* Tell userspace we may need a wakeup call */
3348                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
3349                         /* make sure to read SQ tail after writing flags */
3350                         smp_mb();
3351
3352                         to_submit = io_sqring_entries(ctx);
3353                         if (!to_submit || ret == -EBUSY) {
3354                                 if (kthread_should_park()) {
3355                                         finish_wait(&ctx->sqo_wait, &wait);
3356                                         break;
3357                                 }
3358                                 if (signal_pending(current))
3359                                         flush_signals(current);
3360                                 schedule();
3361                                 finish_wait(&ctx->sqo_wait, &wait);
3362
3363                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
3364                                 continue;
3365                         }
3366                         finish_wait(&ctx->sqo_wait, &wait);
3367
3368                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
3369                 }
3370
3371                 to_submit = min(to_submit, ctx->sq_entries);
3372                 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
3373                 if (ret > 0)
3374                         inflight += ret;
3375         }
3376
3377         set_fs(old_fs);
3378         if (cur_mm) {
3379                 unuse_mm(cur_mm);
3380                 mmput(cur_mm);
3381         }
3382         revert_creds(old_cred);
3383
3384         kthread_parkme();
3385
3386         return 0;
3387 }
3388
     /*
      * Per-waiter state used by io_cqring_wait(). The wait queue entry is embedded
      * so that io_wake_function() can get back to this structure with
      * container_of().
      */
3389 struct io_wait_queue {
3390         struct wait_queue_entry wq;
3391         struct io_ring_ctx *ctx;
             /* number of CQ events the waiter asked for (min_events) */
3392         unsigned to_wait;
             /* value of ctx->cq_timeouts sampled just before waiting */
3393         unsigned nr_timeouts;
3394 };
3395
     /*
      * Return true when the waiter described by @iowq should stop waiting.
      * @noflush is handed to io_cqring_events() unchanged.
      */
3396 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
3397 {
3398         struct io_ring_ctx *ctx = iowq->ctx;
3399
3400         /*
3401          * Wake up if we have enough events, or if a timeout occurred since we
3402          * started waiting. For timeouts, we always want to return to userspace,
3403          * regardless of event count.
3404          */
3405         return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
3406                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
3407 }
3408
     /*
      * Wake callback installed on the wait queue entry by io_cqring_wait().
      * Returns -1 without waking the task while io_should_wake() is false;
      * otherwise hands off to autoremove_wake_function() and returns its result.
      */
3409 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
3410                             int wake_flags, void *key)
3411 {
3412         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
3413                                                         wq);
3414
3415         /* use noflush == true, as we can't safely rely on locking context */
3416         if (!io_should_wake(iowq, true))
3417                 return -1;
3418
3419         return autoremove_wake_function(curr, mode, wake_flags, key);
3420 }
3421
3422 /*
3423  * Wait until events become available, if we don't already have some. The
3424  * application must reap them itself, as they reside on the shared cq ring.
      *
      * @ctx:        ring whose completion queue is waited on
      * @min_events: number of CQ events to wait for
      * @sig:        optional user signal mask to install for the wait (may be NULL)
      * @sigsz:      size of @sig, passed through to the sigmask helpers
      *
      * Returns 0 right away if enough events are already present, or the sigmask
      * helper's error if installing @sig failed. After waiting, returns 0 whenever
      * the CQ ring is non-empty; if it is empty, returns the wait result (0, or
      * -EINTR when a signal was pending after schedule()).
3425  */
3426 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
3427                           const sigset_t __user *sig, size_t sigsz)
3428 {
3429         struct io_wait_queue iowq = {
3430                 .wq = {
3431                         .private        = current,
3432                         .func           = io_wake_function,
3433                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
3434                 },
3435                 .ctx            = ctx,
3436                 .to_wait        = min_events,
3437         };
3438         struct io_rings *rings = ctx->rings;
3439         int ret = 0;
3440
             /* fast path: nothing to wait for */
3441         if (io_cqring_events(ctx, false) >= min_events)
3442                 return 0;
3443
3444         if (sig) {
3445 #ifdef CONFIG_COMPAT
3446                 if (in_compat_syscall())
3447                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
3448                                                       sigsz);
3449                 else
3450 #endif
3451                         ret = set_user_sigmask(sig, sigsz);
3452
3453                 if (ret)
3454                         return ret;
3455         }
3456
             /* snapshot the timeout count so io_should_wake() can detect a change */
3457         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
3458         trace_io_uring_cqring_wait(ctx, min_events);
3459         do {
3460                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
3461                                                 TASK_INTERRUPTIBLE);
                     /* re-check after queueing ourselves, before going to sleep */
3462                 if (io_should_wake(&iowq, false))
3463                         break;
3464                 schedule();
3465                 if (signal_pending(current)) {
3466                         ret = -EINTR;
3467                         break;
3468                 }
3469         } while (1);
3470         finish_wait(&ctx->wait, &iowq.wq);
3471
3472         restore_saved_sigmask_unless(ret == -EINTR);
3473
             /* only report ret (e.g. -EINTR) when there is nothing to reap */
3474         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
3475 }
3476
     /*
      * Release the registered file set without touching the table memory.
      *
      * With CONFIG_UNIX, dequeues and frees every skb on the ring socket's
      * receive queue (the queue __io_sqe_files_scm() adds skbs to).
      * NOTE(review): the file references are presumably dropped by the skb
      * destructor rather than here - confirm.
      * Without CONFIG_UNIX, fput()s each non-NULL entry of the file table.
      */
3477 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
3478 {
3479 #if defined(CONFIG_UNIX)
3480         if (ctx->ring_sock) {
3481                 struct sock *sock = ctx->ring_sock->sk;
3482                 struct sk_buff *skb;
3483
3484                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
3485                         kfree_skb(skb);
3486         }
3487 #else
3488         int i;
3489
3490         for (i = 0; i < ctx->nr_user_files; i++) {
3491                 struct file *file;
3492
3493                 file = io_file_from_index(ctx, i);
3494                 if (file)
3495                         fput(file);
3496         }
3497 #endif
3498 }
3499
     /*
      * Tear down the registered file set: release the files, free every
      * second-level files array and the table itself, and reset the counters.
      *
      * Returns -ENXIO if no file table is registered, 0 otherwise.
      */
3500 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
3501 {
3502         unsigned nr_tables, i;
3503
3504         if (!ctx->file_table)
3505                 return -ENXIO;
3506
3507         __io_sqe_files_unregister(ctx);
             /* same table count computation as io_sqe_files_register() */
3508         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
3509         for (i = 0; i < nr_tables; i++)
3510                 kfree(ctx->file_table[i].files);
3511         kfree(ctx->file_table);
3512         ctx->file_table = NULL;
3513         ctx->nr_user_files = 0;
3514         return 0;
3515 }
3516
     /*
      * Stop the SQ thread, if one was created: wait on completions[1], park the
      * thread, stop it, and clear ctx->sqo_thread. Does nothing when no thread
      * exists.
      */
3517 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
3518 {
3519         if (ctx->sqo_thread) {
                     /*
                      * NOTE(review): completions[1] is presumably completed by the
                      * SQ thread itself - confirm against io_sq_thread().
                      */
3520                 wait_for_completion(&ctx->completions[1]);
3521                 /*
3522                  * The park is a bit of a work-around, without it we get
3523                  * warning spews on shutdown with SQPOLL set and affinity
3524                  * set to a single CPU.
3525                  */
3526                 kthread_park(ctx->sqo_thread);
3527                 kthread_stop(ctx->sqo_thread);
3528                 ctx->sqo_thread = NULL;
3529         }
3530 }
3531
     /*
      * Shut down the asynchronous helpers of @ctx: stop the SQ thread, then
      * destroy the io_wq if one exists. The pointers are cleared afterwards, so
      * a repeated call finds nothing left to do.
      */
3532 static void io_finish_async(struct io_ring_ctx *ctx)
3533 {
3534         io_sq_thread_stop(ctx);
3535
3536         if (ctx->io_wq) {
3537                 io_wq_destroy(ctx->io_wq);
3538                 ctx->io_wq = NULL;
3539         }
3540 }
3541
3542 #if defined(CONFIG_UNIX)
3543 static void io_destruct_skb(struct sk_buff *skb)
3544 {
             /*
              * skb destructor installed by __io_sqe_files_scm(). The ring context
              * is read from the owning socket's sk_user_data; its io_wq (if any)
              * is flushed before the regular unix_destruct_scm() runs.
              */
3545         struct io_ring_ctx *ctx = skb->sk->sk_user_data;
3546
3547         if (ctx->io_wq)
3548                 io_wq_flush(ctx->io_wq);
3549
3550         unix_destruct_scm(skb);
3551 }
3552
3553 /*
3554  * Ensure the UNIX gc is aware of our file set, so we are certain that
3555  * the io_uring can be safely unregistered on process exit, even if we have
3556  * loops in the file referencing.
      *
      * Packs the non-NULL registered files in table slots [@offset, @offset + @nr)
      * into one scm_fp_list attached to a fresh skb, and queues that skb at the
      * head of the ring socket's receive queue. The caller's file references are
      * dropped once the skb holds its own.
      *
      * Returns 0 on success (including when the range holds no files), -EMFILE
      * if an unprivileged caller would exceed RLIMIT_NOFILE in-flight files, or
      * -ENOMEM on allocation failure. No file reference is changed on failure.
3557  */
3558 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
3559 {
3560         struct sock *sk = ctx->ring_sock->sk;
3561         struct scm_fp_list *fpl;
3562         struct sk_buff *skb;
3563         int i, nr_files;
3564
3565         if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
3566                 unsigned long inflight = ctx->user->unix_inflight + nr;
3567
3568                 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
3569                         return -EMFILE;
3570         }
3571
3572         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
3573         if (!fpl)
3574                 return -ENOMEM;
3575
3576         skb = alloc_skb(0, GFP_KERNEL);
3577         if (!skb) {
3578                 kfree(fpl);
3579                 return -ENOMEM;
3580         }
3581
3582         skb->sk = sk;
3583
3584         nr_files = 0;
3585         fpl->user = get_uid(ctx->user);
3586         for (i = 0; i < nr; i++) {
3587                 struct file *file = io_file_from_index(ctx, i + offset);
3588
                     /* sparse sets: empty slots are simply skipped */
3589                 if (!file)
3590                         continue;
3591                 fpl->fp[nr_files] = get_file(file);
3592                 unix_inflight(fpl->user, fpl->fp[nr_files]);
3593                 nr_files++;
3594         }
3595
3596         if (nr_files) {
3597                 fpl->max = SCM_MAX_FD;
3598                 fpl->count = nr_files;
3599                 UNIXCB(skb).fp = fpl;
3600                 skb->destructor = io_destruct_skb;
3601                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
3602                 skb_queue_head(&sk->sk_receive_queue, skb);
3603
                     /* the skb now holds its own references; drop the caller's */
3604                 for (i = 0; i < nr_files; i++)
3605                         fput(fpl->fp[i]);
3606         } else {
3607                 kfree_skb(skb);
                     /*
                      * fpl never got attached to the skb, so nothing else will
                      * release the uid reference taken by get_uid() above.
                      */
                     free_uid(fpl->user);
3608                 kfree(fpl);
3609         }
3610
3611         return 0;
3612 }
3613
3614 /*
3615  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
3616  * causes regular reference counting to break down. We rely on the UNIX
3617  * garbage collection to take care of this problem for us.
      *
      * Hands all ctx->nr_user_files registered files to __io_sqe_files_scm() in
      * chunks of at most SCM_MAX_FD. Returns 0 on success. On failure, fput()s
      * the files from the failing chunk onwards and returns the error; the
      * chunks already queued are left for the caller to clean up.
3618  */
3619 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
3620 {
3621         unsigned left, total;
3622         int ret = 0;
3623
3624         total = 0;
3625         left = ctx->nr_user_files;
3626         while (left) {
3627                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
3628
                     /* total doubles as the table offset of the current chunk */
3629                 ret = __io_sqe_files_scm(ctx, this_files, total);
3630                 if (ret)
3631                         break;
3632                 left -= this_files;
3633                 total += this_files;
3634         }
3635
3636         if (!ret)
3637                 return 0;
3638
             /* files at index >= total were never handed to an skb */
3639         while (total < ctx->nr_user_files) {
3640                 struct file *file = io_file_from_index(ctx, total);
3641
3642                 if (file)
3643                         fput(file);
3644                 total++;
3645         }
3646
3647         return ret;
3648 }
3649 #else
3650 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
3651 {
             /* !CONFIG_UNIX variant: nothing to hand over, always succeeds */
3652         return 0;
3653 }
3654 #endif
3655
     /*
      * Allocate the zeroed second-level file arrays for the first @nr_tables
      * entries of ctx->file_table, sized to hold @nr_files pointers in total with
      * at most IORING_MAX_FILES_TABLE per table.
      *
      * Returns 0 on success. On allocation failure, frees every table's array
      * and returns 1 (not an errno); the caller still owns ctx->file_table.
      */
3656 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
3657                                     unsigned nr_files)
3658 {
3659         int i;
3660
3661         for (i = 0; i < nr_tables; i++) {
3662                 struct fixed_file_table *table = &ctx->file_table[i];
3663                 unsigned this_files;
3664
3665                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
3666                 table->files = kcalloc(this_files, sizeof(struct file *),
3667                                         GFP_KERNEL);
3668                 if (!table->files)
3669                         break;
3670                 nr_files -= this_files;
3671         }
3672
             /* loop ran to completion: every table was allocated */
3673         if (i == nr_tables)
3674                 return 0;
3675
             /* entries past the failure point were never assigned here */
3676         for (i = 0; i < nr_tables; i++) {
3677                 struct fixed_file_table *table = &ctx->file_table[i];
3678                 kfree(table->files);
3679         }
3680         return 1;
3681 }
3682
     /*
      * Register a fixed file set for @ctx.
      *
      * @arg points to a user array of @nr_args __s32 file descriptors; an entry
      * of -1 leaves that slot empty. Every other fd is resolved with fget() and
      * stored in the two-level ctx->file_table.
      *
      * Returns 0 on success, or:
      *   -EBUSY   a file table is already registered
      *   -EINVAL  @nr_args is 0
      *   -EMFILE  @nr_args exceeds IORING_MAX_FIXED_FILES
      *   -ENOMEM  table allocation failed
      *   -EFAULT  copying an fd from user space failed
      *   -EBADF   an fd is invalid or refers to an io_uring instance
      * or the error returned by io_sqe_files_scm(). On any failure the table is
      * torn down again.
      */
3683 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
3684                                  unsigned nr_args)
3685 {
3686         __s32 __user *fds = (__s32 __user *) arg;
3687         unsigned nr_tables;
3688         int fd, ret = 0;
3689         unsigned i;
3690
3691         if (ctx->file_table)
3692                 return -EBUSY;
3693         if (!nr_args)
3694                 return -EINVAL;
3695         if (nr_args > IORING_MAX_FIXED_FILES)
3696                 return -EMFILE;
3697
3698         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
3699         ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
3700                                         GFP_KERNEL);
3701         if (!ctx->file_table)
3702                 return -ENOMEM;
3703
3704         if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
3705                 kfree(ctx->file_table);
3706                 ctx->file_table = NULL;
3707                 return -ENOMEM;
3708         }
3709
             /*
              * nr_user_files only advances when an iteration completes, so on a
              * break it counts exactly the slots that were fully processed.
              */
3710         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
3711                 struct fixed_file_table *table;
3712                 unsigned index;
3713
3714                 ret = -EFAULT;
3715                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
3716                         break;
3717                 /* allow sparse sets */
3718                 if (fd == -1) {
3719                         ret = 0;
3720                         continue;
3721                 }
3722
3723                 table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
3724                 index = i & IORING_FILE_TABLE_MASK;
3725                 table->files[index] = fget(fd);
3726
3727                 ret = -EBADF;
3728                 if (!table->files[index])
3729                         break;
3730                 /*
3731                  * Don't allow io_uring instances to be registered. If UNIX
3732                  * isn't enabled, then this causes a reference cycle and this
3733                  * instance can never get freed. If UNIX is enabled we'll
3734                  * handle it just fine, but there's still no point in allowing
3735                  * a ring fd as it doesn't support regular read/write anyway.
3736                  */
3737                 if (table->files[index]->f_op == &io_uring_fops) {
3738                         fput(table->files[index]);
3739                         break;
3740                 }
3741                 ret = 0;
3742         }
3743
3744         if (ret) {
                     /* put only the slots counted so far; slot i itself is excluded */
3745                 for (i = 0; i < ctx->nr_user_files; i++) {
3746                         struct file *file;
3747
3748                         file = io_file_from_index(ctx, i);
3749                         if (file)
3750                                 fput(file);
3751                 }
3752                 for (i = 0; i < nr_tables; i++)
3753                         kfree(ctx->file_table[i].files);
3754
3755                 kfree(ctx->file_table);
3756                 ctx->file_table = NULL;
3757                 ctx->nr_user_files = 0;
3758                 return ret;
3759         }
3760
3761         ret = io_sqe_files_scm(ctx);
3762         if (ret)
3763                 io_sqe_files_unregister(ctx);
3764
3765         return ret;
3766 }
3767
     /*
      * Release the registered file found at table slot @index. The slot itself
      * is not cleared here.
      *
      * With CONFIG_UNIX, the skbs on the ring socket's receive queue are pulled
      * off one by one until the one carrying the file is found; the file is
      * removed from that skb's fp array and put, and the skbs that were pulled
      * off are appended back to the queue. Without CONFIG_UNIX this is a plain
      * fput().
      */
3768 static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
3769 {
3770 #if defined(CONFIG_UNIX)
3771         struct file *file = io_file_from_index(ctx, index);
3772         struct sock *sock = ctx->ring_sock->sk;
3773         struct sk_buff_head list, *head = &sock->sk_receive_queue;
3774         struct sk_buff *skb;
3775         int i;
3776
             /* private list collecting the skbs taken off the receive queue */
3777         __skb_queue_head_init(&list);
3778
3779         /*
3780          * Find the skb that holds this file in its SCM_RIGHTS. When found,
3781          * remove this entry and rearrange the file array.
3782          */
3783         skb = skb_dequeue(head);
3784         while (skb) {
3785                 struct scm_fp_list *fp;
3786
3787                 fp = UNIXCB(skb).fp;
3788                 for (i = 0; i < fp->count; i++) {
3789                         int left;
3790
3791                         if (fp->fp[i] != file)
3792                                 continue;
3793
3794                         unix_notinflight(fp->user, fp->fp[i]);
                             /* close the gap left by the removed entry */
3795                         left = fp->count - 1 - i;
3796                         if (left) {
3797                                 memmove(&fp->fp[i], &fp->fp[i + 1],
3798                                                 left * sizeof(struct file *));
3799                         }
3800                         fp->count--;
                             /* an skb with no files left is freed, not re-queued */
3801                         if (!fp->count) {
3802                                 kfree_skb(skb);
3803                                 skb = NULL;
3804                         } else {
3805                                 __skb_queue_tail(&list, skb);
3806                         }
3807                         fput(file);
                             /* file == NULL marks "found" for the check below */
3808                         file = NULL;
3809                         break;
3810                 }
3811
3812                 if (!file)
3813                         break;
3814
                     /* not in this skb: park it and look at the next one */
3815                 __skb_queue_tail(&list, skb);
3816
3817                 skb = skb_dequeue(head);
3818         }
3819
             /* append the parked skbs to the tail of the receive queue */
3820         if (skb_peek(&list)) {
3821                 spin_lock_irq(&head->lock);
3822                 while ((skb = __skb_dequeue(&list)) != NULL)
3823                         __skb_queue_tail(head, skb);
3824                 spin_unlock_irq(&head->lock);
3825         }
3826 #else
3827         fput(io_file_from_index(ctx, index));
3828 #endif
3829 }
3830
     /*
      * Account a newly installed fixed file. @file must already be stored in
      * table slot @index, because the fallback path looks it up by index.
      *
      * With CONFIG_UNIX, tries to add @file to the skb at the head of the ring
      * socket's receive queue; if the queue is empty or that skb is full, a new
      * skb is set up through __io_sqe_files_scm(). In both successful cases the
      * caller's reference is dropped once the skb holds its own.
      *
      * Returns 0 on success or the error from __io_sqe_files_scm(). Without
      * CONFIG_UNIX this always returns 0 and keeps the caller's reference.
      */
3831 static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
3832                                 int index)
3833 {
3834 #if defined(CONFIG_UNIX)
3835         struct sock *sock = ctx->ring_sock->sk;
3836         struct sk_buff_head *head = &sock->sk_receive_queue;
3837         struct sk_buff *skb;
3838
3839         /*
3840          * See if we can merge this file into an existing skb SCM_RIGHTS
3841          * file set. If there's no room, fall back to allocating a new skb
3842          * and filling it in.
3843          */
3844         spin_lock_irq(&head->lock);
3845         skb = skb_peek(head);
3846         if (skb) {
3847                 struct scm_fp_list *fpl = UNIXCB(skb).fp;
3848
3849                 if (fpl->count < SCM_MAX_FD) {
                             /*
                              * Take the skb off the queue so the fp list can be
                              * updated with the queue lock dropped, then put it
                              * back at the head.
                              */
3850                         __skb_unlink(skb, head);
3851                         spin_unlock_irq(&head->lock);
3852                         fpl->fp[fpl->count] = get_file(file);
3853                         unix_inflight(fpl->user, fpl->fp[fpl->count]);
3854                         fpl->count++;
3855                         spin_lock_irq(&head->lock);
3856                         __skb_queue_head(head, skb);
3857                 } else {
                             /* head skb is full: skb == NULL selects the fallback */
3858                         skb = NULL;
3859                 }
3860         }
3861         spin_unlock_irq(&head->lock);
3862
3863         if (skb) {
                     /* merged: the skb has its own reference, drop the caller's */
3864                 fput(file);
3865                 return 0;
3866         }
3867
3868         return __io_sqe_files_scm(ctx, 1, index);
3869 #else
3870         return 0;
3871 #endif
3872 }
3873
     /*
      * Replace entries of the registered file set.
      *
      * @arg points to a struct io_uring_files_update giving the first table
      * slot (offset) and a user array of @nr_args __s32 fds. For each slot the
      * current file, if any, is unregistered; an fd of -1 leaves the slot empty,
      * any other fd is installed in its place.
      *
      * Returns the number of slots processed if at least one was, otherwise the
      * error of the first slot:
      *   -ENXIO     no file table is registered
      *   -EINVAL    @nr_args is 0 or the range exceeds the registered set
      *   -EFAULT    copying from user space failed
      *   -EOVERFLOW offset + @nr_args overflows
      *   -EBADF     an fd is invalid or refers to an io_uring instance
      * or the error from io_sqe_file_register().
      */
3874 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
3875                                unsigned nr_args)
3876 {
3877         struct io_uring_files_update up;
3878         __s32 __user *fds;
3879         int fd, i, err;
3880         __u32 done;
3881
3882         if (!ctx->file_table)
3883                 return -ENXIO;
3884         if (!nr_args)
3885                 return -EINVAL;
3886         if (copy_from_user(&up, arg, sizeof(up)))
3887                 return -EFAULT;
             /* done is only scratch space for the range check here */
3888         if (check_add_overflow(up.offset, nr_args, &done))
3889                 return -EOVERFLOW;
3890         if (done > ctx->nr_user_files)
3891                 return -EINVAL;
3892
3893         done = 0;
3894         fds = (__s32 __user *) up.fds;
3895         while (nr_args) {
3896                 struct fixed_file_table *table;
3897                 unsigned index;
3898
3899                 err = 0;
3900                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
3901                         err = -EFAULT;
3902                         break;
3903                 }
3904                 i = array_index_nospec(up.offset, ctx->nr_user_files);
3905                 table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
3906                 index = i & IORING_FILE_TABLE_MASK;
                     /* drop whatever currently occupies the slot */
3907                 if (table->files[index]) {
3908                         io_sqe_file_unregister(ctx, i);
3909                         table->files[index] = NULL;
3910                 }
3911                 if (fd != -1) {
3912                         struct file *file;
3913
3914                         file = fget(fd);
3915                         if (!file) {
3916                                 err = -EBADF;
3917                                 break;
3918                         }
3919                         /*
3920                          * Don't allow io_uring instances to be registered. If
3921                          * UNIX isn't enabled, then this causes a reference
3922                          * cycle and this instance can never get freed. If UNIX
3923                          * is enabled we'll handle it just fine, but there's
3924                          * still no point in allowing a ring fd as it doesn't
3925                          * support regular read/write anyway.
3926                          */
3927                         if (file->f_op == &io_uring_fops) {
3928                                 fput(file);
3929                                 err = -EBADF;
3930                                 break;
3931                         }
                             /* must be in the table first: registration looks it up by index */
3932                         table->files[index] = file;
3933                         err = io_sqe_file_register(ctx, file, i);
3934                         if (err) {
                                     /*
                                      * Registration failed before any skb took a
                                      * reference, so the fget() reference is still
                                      * ours. Empty the slot and drop it; otherwise
                                      * the file is left in the table with no skb
                                      * owning it and is never released.
                                      */
                                     table->files[index] = NULL;
                                     fput(file);
3935                                 break;
                             }
3936                 }
3937                 nr_args--;
3938                 done++;
3939                 up.offset++;
3940         }
3941
             /* partial progress is reported in preference to the error */
3942         return done ? done : err;
3943 }
3944
     /*
      * io_wq put_work callback (see io_sq_offload_start()): drops one reference
      * on the request that embeds @work.
      */
3945 static void io_put_work(struct io_wq_work *work)
3946 {
3947         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3948
3949         io_put_req(req);
3950 }
3951
     /*
      * io_wq get_work callback (see io_sq_offload_start()): takes one extra
      * reference on the request that embeds @work.
      */
3952 static void io_get_work(struct io_wq_work *work)
3953 {
3954         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3955
3956         refcount_inc(&req->refs);
3957 }
3958
     /*
      * Set up the asynchronous machinery for a new ring: pin the creating task's
      * mm as ctx->sqo_mm, start the SQ polling thread if IORING_SETUP_SQPOLL is
      * set, and create the io_wq worker pool.
      *
      * Returns 0 on success, or:
      *   -EPERM   SQPOLL requested without CAP_SYS_ADMIN
      *   -EINVAL  SQ_AFF with a CPU that is out of range or offline, or SQ_AFF
      *            without SQPOLL
      * or the error from thread / io_wq creation. On failure everything set up
      * here is undone and ctx->sqo_mm is released.
      */
3959 static int io_sq_offload_start(struct io_ring_ctx *ctx,
3960                                struct io_uring_params *p)
3961 {
3962         struct io_wq_data data;
3963         unsigned concurrency;
3964         int ret;
3965
3966         init_waitqueue_head(&ctx->sqo_wait);
3967         mmgrab(current->mm);
3968         ctx->sqo_mm = current->mm;
3969
3970         if (ctx->flags & IORING_SETUP_SQPOLL) {
3971                 ret = -EPERM;
3972                 if (!capable(CAP_SYS_ADMIN))
3973                         goto err;
3974
                     /* an idle time of 0 selects the default of HZ jiffies */
3975                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
3976                 if (!ctx->sq_thread_idle)
3977                         ctx->sq_thread_idle = HZ;
3978
3979                 if (p->flags & IORING_SETUP_SQ_AFF) {
3980                         int cpu = p->sq_thread_cpu;
3981
3982                         ret = -EINVAL;
3983                         if (cpu >= nr_cpu_ids)
3984                                 goto err;
3985                         if (!cpu_online(cpu))
3986                                 goto err;
3987
3988                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
3989                                                         ctx, cpu,
3990                                                         "io_uring-sq");
3991                 } else {
3992                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
3993                                                         "io_uring-sq");
3994                 }
3995                 if (IS_ERR(ctx->sqo_thread)) {
3996                         ret = PTR_ERR(ctx->sqo_thread);
                             /* keep io_sq_thread_stop() from touching an ERR_PTR */
3997                         ctx->sqo_thread = NULL;
3998                         goto err;
3999                 }
4000                 wake_up_process(ctx->sqo_thread);
4001         } else if (p->flags & IORING_SETUP_SQ_AFF) {
4002                 /* Can't have SQ_AFF without SQPOLL */
4003                 ret = -EINVAL;
4004                 goto err;
4005         }
4006
4007         data.mm = ctx->sqo_mm;
4008         data.user = ctx->user;
4009         data.creds = ctx->creds;
4010         data.get_work = io_get_work;
4011         data.put_work = io_put_work;
4012
4013         /* Do QD, or 4 * CPUS, whatever is smallest */
4014         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
4015         ctx->io_wq = io_wq_create(concurrency, &data);
4016         if (IS_ERR(ctx->io_wq)) {
4017                 ret = PTR_ERR(ctx->io_wq);
4018                 ctx->io_wq = NULL;
4019                 goto err;
4020         }
4021
4022         return 0;
4023 err:
             /* stops the SQ thread (if started) and destroys the io_wq (if any) */
4024         io_finish_async(ctx);
4025         mmdrop(ctx->sqo_mm);
4026         ctx->sqo_mm = NULL;
4027         return ret;
4028 }
4029
     /* Subtract @nr_pages from @user's locked_vm count; pairs with io_account_mem(). */
4030 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
4031 {
4032         atomic_long_sub(nr_pages, &user->locked_vm);
4033 }
4034
     /*
      * Charge @nr_pages to @user's locked_vm count, bounded by RLIMIT_MEMLOCK
      * (converted to pages).
      *
      * Returns 0 on success, or -ENOMEM without changing the count if the new
      * total would exceed the limit.
      */
4035 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
4036 {
4037         unsigned long page_limit, cur_pages, new_pages;
4038
4039         /* Don't allow more pages than we can safely lock */
4040         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
4041
             /* lockless update: retry if locked_vm changed since it was read */
4042         do {
4043                 cur_pages = atomic_long_read(&user->locked_vm);
4044                 new_pages = cur_pages + nr_pages;
4045                 if (new_pages > page_limit)
4046                         return -ENOMEM;
4047         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
4048                                         new_pages) != cur_pages);
4049
4050         return 0;
4051 }
4052
     /*
      * Release memory obtained from io_mem_alloc(). A NULL @ptr is ignored.
      * One reference is dropped on the head page of the compound page that
      * contains @ptr; the page is freed only when that was the last reference.
      */
4053 static void io_mem_free(void *ptr)
4054 {
4055         struct page *page;
4056
4057         if (!ptr)
4058                 return;
4059
4060         page = virt_to_head_page(ptr);
4061         if (put_page_testzero(page))
4062                 free_compound_page(page);
4063 }
4064
     /*
      * Allocate zeroed pages covering at least @size bytes as one compound page
      * (__GFP_COMP), without allocation-failure warnings and without retrying
      * hard. Returns the address handed back by __get_free_pages() as a pointer.
      */
4065 static void *io_mem_alloc(size_t size)
4066 {
4067         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
4068                                 __GFP_NORETRY;
4069
4070         return (void *) __get_free_pages(gfp_flags, get_order(size));
4071 }
4072
4073 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
4074                                 size_t *sq_offset)
4075 {
4076         struct io_rings *rings;
4077         size_t off, sq_array_size;
4078
4079         off = struct_size(rings, cqes, cq_entries);
4080         if (off == SIZE_MAX)
4081                 return SIZE_MAX;
4082
4083 #ifdef CONFIG_SMP
4084         off = ALIGN(off, SMP_CACHE_BYTES);
4085         if (off == 0)
4086                 return SIZE_MAX;
4087 #endif
4088
4089         sq_array_size = array_size(sizeof(u32), sq_entries);
4090         if (sq_array_size == SIZE_MAX)
4091                 return SIZE_MAX;
4092
4093         if (check_add_overflow(off, sq_array_size, &off))
4094                 return SIZE_MAX;
4095
4096         if (sq_offset)
4097                 *sq_offset = off;
4098
4099         return off;
4100 }
4101
     /*
      * Number of pages needed for a ring with the given queue sizes: the rings
      * area plus the SQE array, each rounded up to its own power-of-two page
      * order as computed by get_order().
      */
4102 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
4103 {
4104         size_t pages;
4105
             /* rings area (struct io_rings + CQEs + SQ index array) */
4106         pages = (size_t)1 << get_order(
4107                 rings_size(sq_entries, cq_entries, NULL));
             /* SQE array */
4108         pages += (size_t)1 << get_order(
4109                 array_size(sizeof(struct io_uring_sqe), sq_entries));
4110
4111         return pages;
4112 }
4113
     /*
      * Unregister all fixed buffers of @ctx: release every pinned page, undo the
      * locked-memory accounting when ctx->account_mem is set, and free the bvec
      * arrays and the buffer table.
      *
      * Returns -ENXIO if no buffers are registered, 0 otherwise.
      */
4114 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
4115 {
4116         int i, j;
4117
4118         if (!ctx->user_bufs)
4119                 return -ENXIO;
4120
4121         for (i = 0; i < ctx->nr_user_bufs; i++) {
4122                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
4123
4124                 for (j = 0; j < imu->nr_bvecs; j++)
4125                         put_user_page(imu->bvec[j].bv_page);
4126
                     /* one accounted page per bvec entry */
4127                 if (ctx->account_mem)
4128                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
4129                 kvfree(imu->bvec);
4130                 imu->nr_bvecs = 0;
4131         }
4132
4133         kfree(ctx->user_bufs);
4134         ctx->user_bufs = NULL;
4135         ctx->nr_user_bufs = 0;
4136         return 0;
4137 }
4138
     /*
      * Copy entry @index of the user iovec array at @arg into @dst. When
      * CONFIG_COMPAT is enabled and ctx->compat is set, the array is read as
      * struct compat_iovec and widened into @dst.
      *
      * Returns 0 on success or -EFAULT if the user copy fails.
      */
4139 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
4140                        void __user *arg, unsigned index)
4141 {
4142         struct iovec __user *src;
4143
4144 #ifdef CONFIG_COMPAT
4145         if (ctx->compat) {
4146                 struct compat_iovec __user *ciovs;
4147                 struct compat_iovec ciov;
4148
4149                 ciovs = (struct compat_iovec __user *) arg;
4150                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
4151                         return -EFAULT;
4152
4153                 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
4154                 dst->iov_len = ciov.iov_len;
4155                 return 0;
4156         }
4157 #endif
4158         src = (struct iovec __user *) arg;
4159         if (copy_from_user(dst, &src[index], sizeof(*dst)))
4160                 return -EFAULT;
4161         return 0;
4162 }
4163
4164 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
4165                                   unsigned nr_args)
4166 {
4167         struct vm_area_struct **vmas = NULL;
4168         struct page **pages = NULL;
4169         int i, j, got_pages = 0;
4170         int ret = -EINVAL;
4171
4172         if (ctx->user_bufs)
4173                 return -EBUSY;
4174         if (!nr_args || nr_args > UIO_MAXIOV)
4175                 return -EINVAL;
4176
4177         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
4178                                         GFP_KERNEL);
4179         if (!ctx->user_bufs)
4180                 return -ENOMEM;
4181
4182         for (i = 0; i < nr_args; i++) {
4183                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
4184                 unsigned long off, start, end, ubuf;
4185                 int pret, nr_pages;
4186                 struct iovec iov;
4187                 size_t size;
4188
4189                 ret = io_copy_iov(ctx, &iov, arg, i);
4190                 if (ret)
4191                         goto err;
4192
4193                 /*
4194                  * Don't impose further limits on the size and buffer
4195                  * constraints here, we'll -EINVAL later when IO is
4196                  * submitted if they are wrong.
4197                  */
4198                 ret = -EFAULT;
4199                 if (!iov.iov_base || !iov.iov_len)
4200                         goto err;
4201
4202                 /* arbitrary limit, but we need something */
4203                 if (iov.iov_len > SZ_1G)
4204                         goto err;
4205
4206                 ubuf = (unsigned long) iov.iov_base;
4207                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
4208                 start = ubuf >> PAGE_SHIFT;
4209                 nr_pages = end - start;
4210
4211                 if (ctx->account_mem) {
4212                         ret = io_account_mem(ctx->user, nr_pages);
4213                         if (ret)
4214                                 goto err;
4215                 }
4216
4217                 ret = 0;
4218                 if (!pages || nr_pages > got_pages) {
4219                         kfree(vmas);
4220                         kfree(pages);
4221                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
4222                                                 GFP_KERNEL);
4223                         vmas = kvmalloc_array(nr_pages,
4224                                         sizeof(struct vm_area_struct *),
4225                                         GFP_KERNEL);
4226                         if (!pages || !vmas) {
4227                                 ret = -ENOMEM;
4228                                 if (ctx->account_mem)
4229                                         io_unaccount_mem(ctx->user, nr_pages);
4230                                 goto err;
4231                         }
4232                         got_pages = nr_pages;
4233                 }
4234
4235                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
4236                                                 GFP_KERNEL);
4237                 ret = -ENOMEM;
4238                 if (!imu->bvec) {
4239                         if (ctx->account_mem)
4240                                 io_unaccount_mem(ctx->user, nr_pages);
4241                         goto err;
4242                 }
4243
4244                 ret = 0;
4245                 down_read(&current->mm->mmap_sem);
4246                 pret = get_user_pages(ubuf, nr_pages,
4247                                       FOLL_WRITE | FOLL_LONGTERM,
4248                                       pages, vmas);
4249                 if (pret == nr_pages) {
4250                         /* don't support file backed memory */
4251                         for (j = 0; j < nr_pages; j++) {
4252                                 struct vm_area_struct *vma = vmas[j];
4253
4254                                 if (vma->vm_file &&
4255                                     !is_file_hugepages(vma->vm_file)) {
4256                                         ret = -EOPNOTSUPP;
4257                                         break;
4258                                 }
4259                         }
4260                 } else {
4261                         ret = pret < 0 ? pret : -EFAULT;
4262                 }
4263                 up_read(&current->mm->mmap_sem);
4264                 if (ret) {
4265                         /*
4266                          * if we did partial map, or found file backed vmas,
4267                          * release any pages we did get
4268                          */
4269                         if (pret > 0)
4270                                 put_user_pages(pages, pret);
4271                         if (ctx->account_mem)
4272                                 io_unaccount_mem(ctx->user, nr_pages);
4273                         kvfree(imu->bvec);
4274                         goto err;
4275                 }
4276
4277                 off = ubuf & ~PAGE_MASK;
4278                 size = iov.iov_len;
4279                 for (j = 0; j < nr_pages; j++) {
4280                         size_t vec_len;
4281
4282                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
4283                         imu->bvec[j].bv_page = pages[j];
4284                         imu->bvec[j].bv_len = vec_len;
4285                         imu->bvec[j].bv_offset = off;
4286                         off = 0;
4287                         size -= vec_len;
4288                 }
4289                 /* store original address for later verification */
4290                 imu->ubuf = ubuf;
4291                 imu->len = iov.iov_len;
4292                 imu->nr_bvecs = nr_pages;
4293
4294                 ctx->nr_user_bufs++;
4295         }
4296         kvfree(pages);
4297         kvfree(vmas);
4298         return 0;
4299 err:
4300         kvfree(pages);
4301         kvfree(vmas);
4302         io_sqe_buffer_unregister(ctx);
4303         return ret;
4304 }
4305
4306 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
4307 {
4308         __s32 __user *fds = arg;
4309         int fd;
4310
4311         if (ctx->cq_ev_fd)
4312                 return -EBUSY;
4313
4314         if (copy_from_user(&fd, fds, sizeof(*fds)))
4315                 return -EFAULT;
4316
4317         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
4318         if (IS_ERR(ctx->cq_ev_fd)) {
4319                 int ret = PTR_ERR(ctx->cq_ev_fd);
4320                 ctx->cq_ev_fd = NULL;
4321                 return ret;
4322         }
4323
4324         return 0;
4325 }
4326
4327 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
4328 {
4329         if (ctx->cq_ev_fd) {
4330                 eventfd_ctx_put(ctx->cq_ev_fd);
4331                 ctx->cq_ev_fd = NULL;
4332                 return 0;
4333         }
4334
4335         return -ENXIO;
4336 }
4337
4338 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
4339 {
4340         io_finish_async(ctx);
4341         if (ctx->sqo_mm)
4342                 mmdrop(ctx->sqo_mm);
4343
4344         io_iopoll_reap_events(ctx);
4345         io_sqe_buffer_unregister(ctx);
4346         io_sqe_files_unregister(ctx);
4347         io_eventfd_unregister(ctx);
4348
4349 #if defined(CONFIG_UNIX)
4350         if (ctx->ring_sock) {
4351                 ctx->ring_sock->file = NULL; /* so that iput() is called */
4352                 sock_release(ctx->ring_sock);
4353         }
4354 #endif
4355
4356         io_mem_free(ctx->rings);
4357         io_mem_free(ctx->sq_sqes);
4358
4359         percpu_ref_exit(&ctx->refs);
4360         if (ctx->account_mem)
4361                 io_unaccount_mem(ctx->user,
4362                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
4363         free_uid(ctx->user);
4364         put_cred(ctx->creds);
4365         kfree(ctx->completions);
4366         kmem_cache_free(req_cachep, ctx->fallback_req);
4367         kfree(ctx);
4368 }
4369
4370 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
4371 {
4372         struct io_ring_ctx *ctx = file->private_data;
4373         __poll_t mask = 0;
4374
4375         poll_wait(file, &ctx->cq_wait, wait);
4376         /*
4377          * synchronizes with barrier from wq_has_sleeper call in
4378          * io_commit_cqring
4379          */
4380         smp_rmb();
4381         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
4382             ctx->rings->sq_ring_entries)
4383                 mask |= EPOLLOUT | EPOLLWRNORM;
4384         if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
4385                 mask |= EPOLLIN | EPOLLRDNORM;
4386
4387         return mask;
4388 }
4389
4390 static int io_uring_fasync(int fd, struct file *file, int on)
4391 {
4392         struct io_ring_ctx *ctx = file->private_data;
4393
4394         return fasync_helper(fd, file, on, &ctx->cq_fasync);
4395 }
4396
4397 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
4398 {
4399         mutex_lock(&ctx->uring_lock);
4400         percpu_ref_kill(&ctx->refs);
4401         mutex_unlock(&ctx->uring_lock);
4402
4403         io_kill_timeouts(ctx);
4404         io_poll_remove_all(ctx);
4405
4406         if (ctx->io_wq)
4407                 io_wq_cancel_all(ctx->io_wq);
4408
4409         io_iopoll_reap_events(ctx);
4410         /* if we failed setting up the ctx, we might not have any rings */
4411         if (ctx->rings)
4412                 io_cqring_overflow_flush(ctx, true);
4413         wait_for_completion(&ctx->completions[0]);
4414         io_ring_ctx_free(ctx);
4415 }
4416
4417 static int io_uring_release(struct inode *inode, struct file *file)
4418 {
4419         struct io_ring_ctx *ctx = file->private_data;
4420
4421         file->private_data = NULL;
4422         io_ring_ctx_wait_and_kill(ctx);
4423         return 0;
4424 }
4425
4426 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
4427                                   struct files_struct *files)
4428 {
4429         struct io_kiocb *req;
4430         DEFINE_WAIT(wait);
4431
4432         while (!list_empty_careful(&ctx->inflight_list)) {
4433                 struct io_kiocb *cancel_req = NULL;
4434
4435                 spin_lock_irq(&ctx->inflight_lock);
4436                 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
4437                         if (req->work.files != files)
4438                                 continue;
4439                         /* req is being completed, ignore */
4440                         if (!refcount_inc_not_zero(&req->refs))
4441                                 continue;
4442                         cancel_req = req;
4443                         break;
4444                 }
4445                 if (cancel_req)
4446                         prepare_to_wait(&ctx->inflight_wait, &wait,
4447                                                 TASK_UNINTERRUPTIBLE);
4448                 spin_unlock_irq(&ctx->inflight_lock);
4449
4450                 /* We need to keep going until we don't find a matching req */
4451                 if (!cancel_req)
4452                         break;
4453
4454                 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
4455                 io_put_req(cancel_req);
4456                 schedule();
4457         }
4458         finish_wait(&ctx->inflight_wait, &wait);
4459 }
4460
4461 static int io_uring_flush(struct file *file, void *data)
4462 {
4463         struct io_ring_ctx *ctx = file->private_data;
4464
4465         io_uring_cancel_files(ctx, data);
4466         if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
4467                 io_cqring_overflow_flush(ctx, true);
4468                 io_wq_cancel_all(ctx->io_wq);
4469         }
4470         return 0;
4471 }
4472
4473 static void *io_uring_validate_mmap_request(struct file *file,
4474                                             loff_t pgoff, size_t sz)
4475 {
4476         struct io_ring_ctx *ctx = file->private_data;
4477         loff_t offset = pgoff << PAGE_SHIFT;
4478         struct page *page;
4479         void *ptr;
4480
4481         switch (offset) {
4482         case IORING_OFF_SQ_RING:
4483         case IORING_OFF_CQ_RING:
4484                 ptr = ctx->rings;
4485                 break;
4486         case IORING_OFF_SQES:
4487                 ptr = ctx->sq_sqes;
4488                 break;
4489         default:
4490                 return ERR_PTR(-EINVAL);
4491         }
4492
4493         page = virt_to_head_page(ptr);
4494         if (sz > page_size(page))
4495                 return ERR_PTR(-EINVAL);
4496
4497         return ptr;
4498 }
4499
4500 #ifdef CONFIG_MMU
4501
4502 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
4503 {
4504         size_t sz = vma->vm_end - vma->vm_start;
4505         unsigned long pfn;
4506         void *ptr;
4507
4508         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
4509         if (IS_ERR(ptr))
4510                 return PTR_ERR(ptr);
4511
4512         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
4513         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
4514 }
4515
4516 #else /* !CONFIG_MMU */
4517
4518 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
4519 {
4520         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
4521 }
4522
4523 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
4524 {
4525         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
4526 }
4527
/*
 * ->get_unmapped_area() handler (!MMU): the mapping address is simply the
 * kernel address of the requested ring region. A validation failure is
 * returned as the error value cast to unsigned long.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *region = io_uring_validate_mmap_request(file, pgoff, len);

	if (IS_ERR(region))
		return PTR_ERR(region);

	return (unsigned long) region;
}
4540
4541 #endif /* !CONFIG_MMU */
4542
4543 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
4544                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
4545                 size_t, sigsz)
4546 {
4547         struct io_ring_ctx *ctx;
4548         long ret = -EBADF;
4549         int submitted = 0;
4550         struct fd f;
4551
4552         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
4553                 return -EINVAL;
4554
4555         f = fdget(fd);
4556         if (!f.file)
4557                 return -EBADF;
4558
4559         ret = -EOPNOTSUPP;
4560         if (f.file->f_op != &io_uring_fops)
4561                 goto out_fput;
4562
4563         ret = -ENXIO;
4564         ctx = f.file->private_data;
4565         if (!percpu_ref_tryget(&ctx->refs))
4566                 goto out_fput;
4567
4568         /*
4569          * For SQ polling, the thread will do all submissions and completions.
4570          * Just return the requested submit count, and wake the thread if
4571          * we were asked to.
4572          */
4573         ret = 0;
4574         if (ctx->flags & IORING_SETUP_SQPOLL) {
4575                 if (!list_empty_careful(&ctx->cq_overflow_list))
4576                         io_cqring_overflow_flush(ctx, false);
4577                 if (flags & IORING_ENTER_SQ_WAKEUP)
4578                         wake_up(&ctx->sqo_wait);
4579                 submitted = to_submit;
4580         } else if (to_submit) {
4581                 struct mm_struct *cur_mm;
4582
4583                 to_submit = min(to_submit, ctx->sq_entries);
4584                 mutex_lock(&ctx->uring_lock);
4585                 /* already have mm, so io_submit_sqes() won't try to grab it */
4586                 cur_mm = ctx->sqo_mm;
4587                 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
4588                                            &cur_mm, false);
4589                 mutex_unlock(&ctx->uring_lock);
4590         }
4591         if (flags & IORING_ENTER_GETEVENTS) {
4592                 unsigned nr_events = 0;
4593
4594                 min_complete = min(min_complete, ctx->cq_entries);
4595
4596                 if (ctx->flags & IORING_SETUP_IOPOLL) {
4597                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
4598                 } else {
4599                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
4600                 }
4601         }
4602
4603         percpu_ref_put(&ctx->refs);
4604 out_fput:
4605         fdput(f);
4606         return submitted ? submitted : ret;
4607 }
4608
4609 static const struct file_operations io_uring_fops = {
4610         .release        = io_uring_release,
4611         .flush          = io_uring_flush,
4612         .mmap           = io_uring_mmap,
4613 #ifndef CONFIG_MMU
4614         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
4615         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
4616 #endif
4617         .poll           = io_uring_poll,
4618         .fasync         = io_uring_fasync,
4619 };
4620
4621 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
4622                                   struct io_uring_params *p)
4623 {
4624         struct io_rings *rings;
4625         size_t size, sq_array_offset;
4626
4627         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
4628         if (size == SIZE_MAX)
4629                 return -EOVERFLOW;
4630
4631         rings = io_mem_alloc(size);
4632         if (!rings)
4633                 return -ENOMEM;
4634
4635         ctx->rings = rings;
4636         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
4637         rings->sq_ring_mask = p->sq_entries - 1;
4638         rings->cq_ring_mask = p->cq_entries - 1;
4639         rings->sq_ring_entries = p->sq_entries;
4640         rings->cq_ring_entries = p->cq_entries;
4641         ctx->sq_mask = rings->sq_ring_mask;
4642         ctx->cq_mask = rings->cq_ring_mask;
4643         ctx->sq_entries = rings->sq_ring_entries;
4644         ctx->cq_entries = rings->cq_ring_entries;
4645
4646         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
4647         if (size == SIZE_MAX) {
4648                 io_mem_free(ctx->rings);
4649                 ctx->rings = NULL;
4650                 return -EOVERFLOW;
4651         }
4652
4653         ctx->sq_sqes = io_mem_alloc(size);
4654         if (!ctx->sq_sqes) {
4655                 io_mem_free(ctx->rings);
4656                 ctx->rings = NULL;
4657                 return -ENOMEM;
4658         }
4659
4660         return 0;
4661 }
4662
4663 /*
4664  * Allocate an anonymous fd, this is what constitutes the application
4665  * visible backing of an io_uring instance. The application mmaps this
4666  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
4667  * we have to tie this fd to a socket for file garbage collection purposes.
4668  */
4669 static int io_uring_get_fd(struct io_ring_ctx *ctx)
4670 {
4671         struct file *file;
4672         int ret;
4673
4674 #if defined(CONFIG_UNIX)
4675         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
4676                                 &ctx->ring_sock);
4677         if (ret)
4678                 return ret;
4679 #endif
4680
4681         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
4682         if (ret < 0)
4683                 goto err;
4684
4685         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
4686                                         O_RDWR | O_CLOEXEC);
4687         if (IS_ERR(file)) {
4688                 put_unused_fd(ret);
4689                 ret = PTR_ERR(file);
4690                 goto err;
4691         }
4692
4693 #if defined(CONFIG_UNIX)
4694         ctx->ring_sock->file = file;
4695         ctx->ring_sock->sk->sk_user_data = ctx;
4696 #endif
4697         fd_install(ret, file);
4698         return ret;
4699 err:
4700 #if defined(CONFIG_UNIX)
4701         sock_release(ctx->ring_sock);
4702         ctx->ring_sock = NULL;
4703 #endif
4704         return ret;
4705 }
4706
4707 static int io_uring_create(unsigned entries, struct io_uring_params *p)
4708 {
4709         struct user_struct *user = NULL;
4710         struct io_ring_ctx *ctx;
4711         bool account_mem;
4712         int ret;
4713
4714         if (!entries || entries > IORING_MAX_ENTRIES)
4715                 return -EINVAL;
4716
4717         /*
4718          * Use twice as many entries for the CQ ring. It's possible for the
4719          * application to drive a higher depth than the size of the SQ ring,
4720          * since the sqes are only used at submission time. This allows for
4721          * some flexibility in overcommitting a bit. If the application has
4722          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
4723          * of CQ ring entries manually.
4724          */
4725         p->sq_entries = roundup_pow_of_two(entries);
4726         if (p->flags & IORING_SETUP_CQSIZE) {
4727                 /*
4728                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
4729                  * to a power-of-two, if it isn't already. We do NOT impose
4730                  * any cq vs sq ring sizing.
4731                  */
4732                 if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
4733                         return -EINVAL;
4734                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
4735         } else {
4736                 p->cq_entries = 2 * p->sq_entries;
4737         }
4738
4739         user = get_uid(current_user());
4740         account_mem = !capable(CAP_IPC_LOCK);
4741
4742         if (account_mem) {
4743                 ret = io_account_mem(user,
4744                                 ring_pages(p->sq_entries, p->cq_entries));
4745                 if (ret) {
4746                         free_uid(user);
4747                         return ret;
4748                 }
4749         }
4750
4751         ctx = io_ring_ctx_alloc(p);
4752         if (!ctx) {
4753                 if (account_mem)
4754                         io_unaccount_mem(user, ring_pages(p->sq_entries,
4755                                                                 p->cq_entries));
4756                 free_uid(user);
4757                 return -ENOMEM;
4758         }
4759         ctx->compat = in_compat_syscall();
4760         ctx->account_mem = account_mem;
4761         ctx->user = user;
4762         ctx->creds = prepare_creds();
4763
4764         ret = io_allocate_scq_urings(ctx, p);
4765         if (ret)
4766                 goto err;
4767
4768         ret = io_sq_offload_start(ctx, p);
4769         if (ret)
4770                 goto err;
4771
4772         memset(&p->sq_off, 0, sizeof(p->sq_off));
4773         p->sq_off.head = offsetof(struct io_rings, sq.head);
4774         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
4775         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
4776         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
4777         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
4778         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
4779         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
4780
4781         memset(&p->cq_off, 0, sizeof(p->cq_off));
4782         p->cq_off.head = offsetof(struct io_rings, cq.head);
4783         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
4784         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
4785         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
4786         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
4787         p->cq_off.cqes = offsetof(struct io_rings, cqes);
4788
4789         /*
4790          * Install ring fd as the very last thing, so we don't risk someone
4791          * having closed it before we finish setup
4792          */
4793         ret = io_uring_get_fd(ctx);
4794         if (ret < 0)
4795                 goto err;
4796
4797         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP;
4798         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
4799         return ret;
4800 err:
4801         io_ring_ctx_wait_and_kill(ctx);
4802         return ret;
4803 }
4804
4805 /*
4806  * Sets up an aio uring context, and returns the fd. Applications asks for a
4807  * ring size, we return the actual sq/cq ring sizes (among other things) in the
4808  * params structure passed in.
4809  */
4810 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
4811 {
4812         struct io_uring_params p;
4813         long ret;
4814         int i;
4815
4816         if (copy_from_user(&p, params, sizeof(p)))
4817                 return -EFAULT;
4818         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
4819                 if (p.resv[i])
4820                         return -EINVAL;
4821         }
4822
4823         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
4824                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
4825                 return -EINVAL;
4826
4827         ret = io_uring_create(entries, &p);
4828         if (ret < 0)
4829                 return ret;
4830
4831         if (copy_to_user(params, &p, sizeof(p)))
4832                 return -EFAULT;
4833
4834         return ret;
4835 }
4836
4837 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
4838                 struct io_uring_params __user *, params)
4839 {
4840         return io_uring_setup(entries, params);
4841 }
4842
4843 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
4844                                void __user *arg, unsigned nr_args)
4845         __releases(ctx->uring_lock)
4846         __acquires(ctx->uring_lock)
4847 {
4848         int ret;
4849
4850         /*
4851          * We're inside the ring mutex, if the ref is already dying, then
4852          * someone else killed the ctx or is already going through
4853          * io_uring_register().
4854          */
4855         if (percpu_ref_is_dying(&ctx->refs))
4856                 return -ENXIO;
4857
4858         percpu_ref_kill(&ctx->refs);
4859
4860         /*
4861          * Drop uring mutex before waiting for references to exit. If another
4862          * thread is currently inside io_uring_enter() it might need to grab
4863          * the uring_lock to make progress. If we hold it here across the drain
4864          * wait, then we can deadlock. It's safe to drop the mutex here, since
4865          * no new references will come in after we've killed the percpu ref.
4866          */
4867         mutex_unlock(&ctx->uring_lock);
4868         wait_for_completion(&ctx->completions[0]);
4869         mutex_lock(&ctx->uring_lock);
4870
4871         switch (opcode) {
4872         case IORING_REGISTER_BUFFERS:
4873                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
4874                 break;
4875         case IORING_UNREGISTER_BUFFERS:
4876                 ret = -EINVAL;
4877                 if (arg || nr_args)
4878                         break;
4879                 ret = io_sqe_buffer_unregister(ctx);
4880                 break;
4881         case IORING_REGISTER_FILES:
4882                 ret = io_sqe_files_register(ctx, arg, nr_args);
4883                 break;
4884         case IORING_UNREGISTER_FILES:
4885                 ret = -EINVAL;
4886                 if (arg || nr_args)
4887                         break;
4888                 ret = io_sqe_files_unregister(ctx);
4889                 break;
4890         case IORING_REGISTER_FILES_UPDATE:
4891                 ret = io_sqe_files_update(ctx, arg, nr_args);
4892                 break;
4893         case IORING_REGISTER_EVENTFD:
4894                 ret = -EINVAL;
4895                 if (nr_args != 1)
4896                         break;
4897                 ret = io_eventfd_register(ctx, arg);
4898                 break;
4899         case IORING_UNREGISTER_EVENTFD:
4900                 ret = -EINVAL;
4901                 if (arg || nr_args)
4902                         break;
4903                 ret = io_eventfd_unregister(ctx);
4904                 break;
4905         default:
4906                 ret = -EINVAL;
4907                 break;
4908         }
4909
4910         /* bring the ctx back to life */
4911         reinit_completion(&ctx->completions[0]);
4912         percpu_ref_reinit(&ctx->refs);
4913         return ret;
4914 }
4915
4916 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
4917                 void __user *, arg, unsigned int, nr_args)
4918 {
4919         struct io_ring_ctx *ctx;
4920         long ret = -EBADF;
4921         struct fd f;
4922
4923         f = fdget(fd);
4924         if (!f.file)
4925                 return -EBADF;
4926
4927         ret = -EOPNOTSUPP;
4928         if (f.file->f_op != &io_uring_fops)
4929                 goto out_fput;
4930
4931         ctx = f.file->private_data;
4932
4933         mutex_lock(&ctx->uring_lock);
4934         ret = __io_uring_register(ctx, opcode, arg, nr_args);
4935         mutex_unlock(&ctx->uring_lock);
4936         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
4937                                                         ctx->cq_ev_fd != NULL, ret);
4938 out_fput:
4939         fdput(f);
4940         return ret;
4941 }
4942
4943 static int __init io_uring_init(void)
4944 {
4945         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
4946         return 0;
4947 };
4948 __initcall(io_uring_init);