fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <linux/refcount.h>
  48 #include <linux/uio.h>
  49
  50 #include <linux/sched/signal.h>
  51 #include <linux/fs.h>
  52 #include <linux/file.h>
  53 #include <linux/fdtable.h>
  54 #include <linux/mm.h>
  55 #include <linux/mman.h>
  56 #include <linux/mmu_context.h>
  57 #include <linux/percpu.h>
  58 #include <linux/slab.h>
  59 #include <linux/workqueue.h>
  60 #include <linux/kthread.h>
  61 #include <linux/blkdev.h>
  62 #include <linux/bvec.h>
  63 #include <linux/net.h>
  64 #include <net/sock.h>
  65 #include <net/af_unix.h>
  66 #include <net/scm.h>
  67 #include <linux/anon_inodes.h>
  68 #include <linux/sched/mm.h>
  69 #include <linux/uaccess.h>
  70 #include <linux/nospec.h>
  71 #include <linux/sizes.h>
  72 #include <linux/hugetlb.h>
  73
  74 #include <uapi/linux/io_uring.h>
  75
  76 #include "internal.h"
  77
/*
 * Compile-time limits.
 * NOTE(review): nothing in the visible part of the file consumes these;
 * going by the names they cap the number of ring entries and the number
 * of registered ("fixed") files - confirm at the use sites.
 */
#define IORING_MAX_ENTRIES      32768
#define IORING_MAX_FIXED_FILES  1024
  80
/*
 * One head/tail index pair, embedded twice in struct io_rings (once for
 * the SQ ring, once for the CQ ring). Both members carry
 * ____cacheline_aligned_in_smp, so on SMP builds each index starts on a
 * cache-line boundary of its own.
 */
struct io_uring {
        u32 head ____cacheline_aligned_in_smp;
        u32 tail ____cacheline_aligned_in_smp;
};
  85
/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 *
 * Because the application can see (and write) this memory, the kernel
 * side accesses the fields below through READ_ONCE()/WRITE_ONCE(); see
 * the comment at the top of this file.
 */
struct io_rings {
        /*
         * Head and tail offsets into the ring; the offsets need to be
         * masked to get valid indices.
         *
         * The kernel controls head of the sq ring and the tail of the cq ring,
         * and the application controls tail of the sq ring and the head of the
         * cq ring.
         */
        struct io_uring         sq, cq;
        /*
         * Bitmasks to apply to head and tail offsets (constant, equals
         * ring_entries - 1)
         */
        u32                     sq_ring_mask, cq_ring_mask;
        /* Ring sizes (constant, power of 2) */
        u32                     sq_ring_entries, cq_ring_entries;
        /*
         * Number of invalid entries dropped by the kernel due to
         * invalid index stored in array
         *
         * Written by the kernel, shouldn't be modified by the
         * application (i.e. get number of "new events" by comparing to
         * cached value).
         *
         * After a new SQ head value was read by the application this
         * counter includes all submissions that were dropped reaching
         * the new SQ head (and possibly more).
         */
        u32                     sq_dropped;
        /*
         * Runtime flags
         *
         * Written by the kernel, shouldn't be modified by the
         * application.
         *
         * The application needs a full memory barrier before checking
         * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
         */
        u32                     sq_flags;
        /*
         * Number of completion events lost because the queue was full;
         * this should be avoided by the application by making sure
         * there are not more requests pending than there is space in
         * the completion queue.
         *
         * Written by the kernel, shouldn't be modified by the
         * application (i.e. get number of "new events" by comparing to
         * cached value).
         *
         * As completion events come in out of order this counter is not
         * ordered with any other data.
         */
        u32                     cq_overflow;
        /*
         * Ring buffer of completion events.
         *
         * The kernel writes completion events fresh every time they are
         * produced, so the application is allowed to modify pending
         * entries.
         */
        struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
};
 156
/*
 * Element type of io_ring_ctx->user_bufs ("fixed mapped user buffers").
 * NOTE(review): the code that fills these fields in is outside this part
 * of the file; the per-field notes are read off names and types only -
 * confirm against the buffer registration code.
 */
struct io_mapped_ubuf {
        u64             ubuf;           /* presumably the user-space start address */
        size_t          len;            /* presumably the buffer length in bytes */
        struct          bio_vec *bvec;  /* presumably an array of nr_bvecs segments */
        unsigned int    nr_bvecs;
};
 163
/*
 * Async-work bookkeeping; io_ring_ctx embeds two of these as
 * pending_async[]. io_ring_ctx_alloc() initialises lock, list and cnt
 * (cnt starts at 0); the remaining members start out zeroed by kzalloc().
 * NOTE(review): the code that uses file/io_start/io_len is not visible
 * here - judging by the names they describe the most recent IO range on
 * a file; confirm at the use sites.
 */
struct async_list {
        spinlock_t              lock;
        atomic_t                cnt;
        struct list_head        list;

        struct file             *file;
        off_t                   io_start;
        size_t                  io_len;
};
 173
/*
 * Kernel-private state of one io_uring instance. Allocated zeroed and
 * initialised by io_ring_ctx_alloc(); io_uring_get_socket() reaches it
 * through file->private_data of the ring file.
 *
 * The anonymous sub-structs are each ____cacheline_aligned_in_smp, which
 * groups the submission-side, completion-side and locking members on
 * separate cache lines.
 */
struct io_ring_ctx {
        struct {
                /*
                 * Every io_kiocb handed out by io_get_req() holds one
                 * reference, dropped again when the request is freed.
                 * The release callback (io_ring_ctx_ref_free) completes
                 * ctx_done.
                 */
                struct percpu_ref       refs;
        } ____cacheline_aligned_in_smp;

        struct {
                /* copy of io_uring_params.flags, e.g. IORING_SETUP_IOPOLL */
                unsigned int            flags;
                bool                    compat;
                bool                    account_mem;

                /*
                 * Ring buffer of indices into array of io_uring_sqe, which is
                 * mmapped by the application using the IORING_OFF_SQES offset.
                 *
                 * This indirection could e.g. be used to assign fixed
                 * io_uring_sqe entries to operations and only submit them to
                 * the queue when needed.
                 *
                 * The kernel modifies neither the indices array nor the entries
                 * array.
                 */
                u32                     *sq_array;
                unsigned                cached_sq_head;
                unsigned                sq_entries;
                unsigned                sq_mask;
                unsigned                sq_thread_idle;
                struct io_uring_sqe     *sq_sqes;

                /* requests held back by io_sequence_defer() until drained */
                struct list_head        defer_list;
                /* pending timeout requests, see io_get_timeout_req() */
                struct list_head        timeout_list;
        } ____cacheline_aligned_in_smp;

        /* IO offload */
        /*
         * io_queue_async_work() picks the queue: index 1 for WRITEV /
         * WRITE_FIXED requests without IOCB_DIRECT, index 0 for all else.
         */
        struct workqueue_struct *sqo_wq[2];
        struct task_struct      *sqo_thread;    /* if using sq thread polling */
        struct mm_struct        *sqo_mm;
        wait_queue_head_t       sqo_wait;
        struct completion       sqo_thread_started;

        struct {
                /*
                 * Kernel-private CQ tail; advanced by io_get_cqring() and
                 * published to rings->cq.tail by __io_commit_cqring().
                 */
                unsigned                cached_cq_tail;
                unsigned                cq_entries;
                unsigned                cq_mask;
                struct wait_queue_head  cq_wait;
                struct fasync_struct    *cq_fasync;
                /* optional eventfd signalled from io_cqring_ev_posted() */
                struct eventfd_ctx      *cq_ev_fd;
                /* bumped by io_kill_timeout() for each timeout it completes */
                atomic_t                cq_timeouts;
        } ____cacheline_aligned_in_smp;

        /* the ring memory shared with the application */
        struct io_rings *rings;

        /*
         * If used, fixed file set. Writers must ensure that ->refs is dead,
         * readers must ensure that ->refs is alive as long as the file* is
         * used. Only updated through io_uring_register(2).
         */
        struct file             **user_files;
        unsigned                nr_user_files;

        /* if used, fixed mapped user buffers */
        unsigned                nr_user_bufs;
        struct io_mapped_ubuf   *user_bufs;

        struct user_struct      *user;

        /* completed by io_ring_ctx_ref_free() once ->refs is released */
        struct completion       ctx_done;

        struct {
                struct mutex            uring_lock;
                wait_queue_head_t       wait;
        } ____cacheline_aligned_in_smp;

        struct {
                /*
                 * Held by io_cqring_add_event() around filling and
                 * committing a CQE, and by io_kill_timeouts().
                 */
                spinlock_t              completion_lock;
                bool                    poll_multi_file;
                /*
                 * ->poll_list is protected by the ctx->uring_lock for
                 * io_uring instances that don't use IORING_SETUP_SQPOLL.
                 * For SQPOLL, only the single threaded io_sq_thread() will
                 * manipulate the list, hence no extra locking is needed there.
                 */
                struct list_head        poll_list;
                struct list_head        cancel_list;
        } ____cacheline_aligned_in_smp;

        struct async_list       pending_async[2];

#if defined(CONFIG_UNIX)
        /* socket returned (as ->sk) by io_uring_get_socket() */
        struct socket           *ring_sock;
#endif
};
 265
/*
 * Submission details carried inside every io_kiocb (member 'submit').
 * io_queue_async_work() reads ->sqe (which may be NULL) to look at the
 * opcode.
 * NOTE(review): the other members are only consumed outside this part of
 * the file; their meaning is not documented here to avoid guessing.
 */
struct sqe_submit {
        const struct io_uring_sqe       *sqe;
        unsigned short                  index;
        u32                             sequence;
        bool                            has_user;
        bool                            needs_lock;
        bool                            needs_fixed_file;
};
 274
/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 *
 * This is the poll flavour of the union inside struct io_kiocb.
 * NOTE(review): the poll handling code is not in this part of the file,
 * so the members other than 'file' are left undocumented here.
 */
struct io_poll_iocb {
        struct file                     *file;
        struct wait_queue_head          *head;
        __poll_t                        events;
        bool                            done;
        bool                            canceled;
        struct wait_queue_entry         wait;
};
 287
/*
 * Timeout flavour of the union inside struct io_kiocb. As with the other
 * union members the file pointer comes first; io_kill_timeout() cancels
 * ->timer with hrtimer_try_to_cancel().
 */
struct io_timeout {
        struct file                     *file;
        struct hrtimer                  timer;
};
 292
/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 *
 * One io_kiocb represents one request. It is allocated from req_cachep
 * by io_get_req() and released by __io_free_req().
 */
struct io_kiocb {
        union {
                struct file             *file;
                struct kiocb            rw;
                struct io_poll_iocb     poll;
                struct io_timeout       timeout;
        };

        struct sqe_submit       submit;

        /* owning ring; the request pins it via ctx->refs */
        struct io_ring_ctx      *ctx;
        /*
         * Linkage on whichever list currently holds the request: the ctx
         * defer/timeout/poll lists, a local 'done' list while reaping
         * polled IO, or the link_list of a link head.
         */
        struct list_head        list;
        /* for a REQ_F_LINK head: the requests chained behind it */
        struct list_head        link_list;
        unsigned int            flags;
        /*
         * Starts at 2 in io_get_req(): one reference is dropped after
         * submission, the other at completion.
         */
        refcount_t              refs;
#define REQ_F_NOWAIT            1       /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
#define REQ_F_FIXED_FILE        4       /* ctx owns file */
#define REQ_F_SEQ_PREV          8       /* sequential with previous */
#define REQ_F_IO_DRAIN          16      /* drain existing IO first */
#define REQ_F_IO_DRAINED        32      /* drain done */
#define REQ_F_LINK              64      /* linked sqes */
#define REQ_F_LINK_DONE         128     /* linked sqes done */
#define REQ_F_FAIL_LINK         256     /* fail rest of links */
#define REQ_F_SHADOW_DRAIN      512     /* link-drain shadow req */
#define REQ_F_TIMEOUT           1024    /* timeout request */
#define REQ_F_ISREG             2048    /* regular file */
#define REQ_F_MUST_PUNT         4096    /* must be punted even for NONBLOCK */
        /* copied into cqe->user_data when the completion is posted */
        u64                     user_data;
        /* completion result; reset to 0 by io_get_req() */
        u32                     result;
        /* compared against the CQ position by __io_sequence_defer() */
        u32                     sequence;

        struct work_struct      work;
};
 333
/*
 * NOTE(review): IO_PLUG_THRESHOLD is not used in the visible part of the
 * file; going by the name it is the submission count from which block
 * plugging is used - confirm at the use site.
 */
#define IO_PLUG_THRESHOLD               2
/*
 * Size of the request batches: the io_kiocb alloc cache in
 * struct io_submit_state and the bulk-free array in io_iopoll_complete().
 */
#define IO_IOPOLL_BATCH                 8
 336
/*
 * Optional per-submission scratch state. io_get_req() accepts a NULL
 * state and then allocates requests one at a time.
 */
struct io_submit_state {
        struct blk_plug         plug;

        /*
         * io_kiocb alloc cache
         *
         * Refilled in bulk by io_get_req(): free_reqs is the number of
         * cached requests not handed out yet, cur_req the index of the
         * next one to hand out.
         */
        void                    *reqs[IO_IOPOLL_BATCH];
        unsigned                int free_reqs;
        unsigned                int cur_req;

        /*
         * File reference cache
         *
         * NOTE(review): maintained by code outside this part of the file.
         * Only ios_left is read here: io_get_req() uses it to bound the
         * size of a bulk allocation.
         */
        struct file             *file;
        unsigned int            fd;
        unsigned int            has_refs;
        unsigned int            used_refs;
        unsigned int            ios_left;
};
 356
/* Forward declarations for helpers referenced before their definitions. */
static void io_sq_wq_submit_work(struct work_struct *work);
static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
                                 long res);
static void __io_free_req(struct io_kiocb *req);

/* Slab cache that all struct io_kiocb allocations come from. */
static struct kmem_cache *req_cachep;

/*
 * Tentative declaration, so that io_uring_get_socket() can compare a
 * file's f_op against the io_uring file operations.
 */
static const struct file_operations io_uring_fops;
 365
 366 struct sock *io_uring_get_socket(struct file *file)
 367 {
 368 #if defined(CONFIG_UNIX)
 369         if (file->f_op == &io_uring_fops) {
 370                 struct io_ring_ctx *ctx = file->private_data;
 371
 372                 return ctx->ring_sock->sk;
 373         }
 374 #endif
 375         return NULL;
 376 }
 377 EXPORT_SYMBOL(io_uring_get_socket);
 378
 379 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 380 {
 381         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 382
 383         complete(&ctx->ctx_done);
 384 }
 385
 386 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 387 {
 388         struct io_ring_ctx *ctx;
 389         int i;
 390
 391         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 392         if (!ctx)
 393                 return NULL;
 394
 395         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 396                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
 397                 kfree(ctx);
 398                 return NULL;
 399         }
 400
 401         ctx->flags = p->flags;
 402         init_waitqueue_head(&ctx->cq_wait);
 403         init_completion(&ctx->ctx_done);
 404         init_completion(&ctx->sqo_thread_started);
 405         mutex_init(&ctx->uring_lock);
 406         init_waitqueue_head(&ctx->wait);
 407         for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
 408                 spin_lock_init(&ctx->pending_async[i].lock);
 409                 INIT_LIST_HEAD(&ctx->pending_async[i].list);
 410                 atomic_set(&ctx->pending_async[i].cnt, 0);
 411         }
 412         spin_lock_init(&ctx->completion_lock);
 413         INIT_LIST_HEAD(&ctx->poll_list);
 414         INIT_LIST_HEAD(&ctx->cancel_list);
 415         INIT_LIST_HEAD(&ctx->defer_list);
 416         INIT_LIST_HEAD(&ctx->timeout_list);
 417         return ctx;
 418 }
 419
 420 static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
 421                                        struct io_kiocb *req)
 422 {
 423         return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
 424 }
 425
 426 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
 427                                      struct io_kiocb *req)
 428 {
 429         if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
 430                 return false;
 431
 432         return __io_sequence_defer(ctx, req);
 433 }
 434
 435 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 436 {
 437         struct io_kiocb *req;
 438
 439         req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
 440         if (req && !io_sequence_defer(ctx, req)) {
 441                 list_del_init(&req->list);
 442                 return req;
 443         }
 444
 445         return NULL;
 446 }
 447
 448 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 449 {
 450         struct io_kiocb *req;
 451
 452         req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
 453         if (req && !__io_sequence_defer(ctx, req)) {
 454                 list_del_init(&req->list);
 455                 return req;
 456         }
 457
 458         return NULL;
 459 }
 460
 461 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 462 {
 463         struct io_rings *rings = ctx->rings;
 464
 465         if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
 466                 /* order cqe stores with ring update */
 467                 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
 468
 469                 if (wq_has_sleeper(&ctx->cq_wait)) {
 470                         wake_up_interruptible(&ctx->cq_wait);
 471                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 472                 }
 473         }
 474 }
 475
 476 static inline void io_queue_async_work(struct io_ring_ctx *ctx,
 477                                        struct io_kiocb *req)
 478 {
 479         int rw = 0;
 480
 481         if (req->submit.sqe) {
 482                 switch (req->submit.sqe->opcode) {
 483                 case IORING_OP_WRITEV:
 484                 case IORING_OP_WRITE_FIXED:
 485                         rw = !(req->rw.ki_flags & IOCB_DIRECT);
 486                         break;
 487                 }
 488         }
 489
 490         queue_work(ctx->sqo_wq[rw], &req->work);
 491 }
 492
 493 static void io_kill_timeout(struct io_kiocb *req)
 494 {
 495         int ret;
 496
 497         ret = hrtimer_try_to_cancel(&req->timeout.timer);
 498         if (ret != -1) {
 499                 atomic_inc(&req->ctx->cq_timeouts);
 500                 list_del(&req->list);
 501                 io_cqring_fill_event(req->ctx, req->user_data, 0);
 502                 __io_free_req(req);
 503         }
 504 }
 505
 506 static void io_kill_timeouts(struct io_ring_ctx *ctx)
 507 {
 508         struct io_kiocb *req, *tmp;
 509
 510         spin_lock_irq(&ctx->completion_lock);
 511         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
 512                 io_kill_timeout(req);
 513         spin_unlock_irq(&ctx->completion_lock);
 514 }
 515
 516 static void io_commit_cqring(struct io_ring_ctx *ctx)
 517 {
 518         struct io_kiocb *req;
 519
 520         while ((req = io_get_timeout_req(ctx)) != NULL)
 521                 io_kill_timeout(req);
 522
 523         __io_commit_cqring(ctx);
 524
 525         while ((req = io_get_deferred_req(ctx)) != NULL) {
 526                 if (req->flags & REQ_F_SHADOW_DRAIN) {
 527                         /* Just for drain, free it. */
 528                         __io_free_req(req);
 529                         continue;
 530                 }
 531                 req->flags |= REQ_F_IO_DRAINED;
 532                 io_queue_async_work(ctx, req);
 533         }
 534 }
 535
 536 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 537 {
 538         struct io_rings *rings = ctx->rings;
 539         unsigned tail;
 540
 541         tail = ctx->cached_cq_tail;
 542         /*
 543          * writes to the cq entry need to come after reading head; the
 544          * control dependency is enough as we're using WRITE_ONCE to
 545          * fill the cq entry
 546          */
 547         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
 548                 return NULL;
 549
 550         ctx->cached_cq_tail++;
 551         return &rings->cqes[tail & ctx->cq_mask];
 552 }
 553
 554 static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
 555                                  long res)
 556 {
 557         struct io_uring_cqe *cqe;
 558
 559         /*
 560          * If we can't get a cq entry, userspace overflowed the
 561          * submission (by quite a lot). Increment the overflow count in
 562          * the ring.
 563          */
 564         cqe = io_get_cqring(ctx);
 565         if (cqe) {
 566                 WRITE_ONCE(cqe->user_data, ki_user_data);
 567                 WRITE_ONCE(cqe->res, res);
 568                 WRITE_ONCE(cqe->flags, 0);
 569         } else {
 570                 unsigned overflow = READ_ONCE(ctx->rings->cq_overflow);
 571
 572                 WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1);
 573         }
 574 }
 575
 576 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 577 {
 578         if (waitqueue_active(&ctx->wait))
 579                 wake_up(&ctx->wait);
 580         if (waitqueue_active(&ctx->sqo_wait))
 581                 wake_up(&ctx->sqo_wait);
 582         if (ctx->cq_ev_fd)
 583                 eventfd_signal(ctx->cq_ev_fd, 1);
 584 }
 585
 586 static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
 587                                 long res)
 588 {
 589         unsigned long flags;
 590
 591         spin_lock_irqsave(&ctx->completion_lock, flags);
 592         io_cqring_fill_event(ctx, user_data, res);
 593         io_commit_cqring(ctx);
 594         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 595
 596         io_cqring_ev_posted(ctx);
 597 }
 598
 599 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 600                                    struct io_submit_state *state)
 601 {
 602         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 603         struct io_kiocb *req;
 604
 605         if (!percpu_ref_tryget(&ctx->refs))
 606                 return NULL;
 607
 608         if (!state) {
 609                 req = kmem_cache_alloc(req_cachep, gfp);
 610                 if (unlikely(!req))
 611                         goto out;
 612         } else if (!state->free_reqs) {
 613                 size_t sz;
 614                 int ret;
 615
 616                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
 617                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
 618
 619                 /*
 620                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
 621                  * retry single alloc to be on the safe side.
 622                  */
 623                 if (unlikely(ret <= 0)) {
 624                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 625                         if (!state->reqs[0])
 626                                 goto out;
 627                         ret = 1;
 628                 }
 629                 state->free_reqs = ret - 1;
 630                 state->cur_req = 1;
 631                 req = state->reqs[0];
 632         } else {
 633                 req = state->reqs[state->cur_req];
 634                 state->free_reqs--;
 635                 state->cur_req++;
 636         }
 637
 638         req->file = NULL;
 639         req->ctx = ctx;
 640         req->flags = 0;
 641         /* one is dropped after submission, the other at completion */
 642         refcount_set(&req->refs, 2);
 643         req->result = 0;
 644         return req;
 645 out:
 646         percpu_ref_put(&ctx->refs);
 647         return NULL;
 648 }
 649
 650 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 651 {
 652         if (*nr) {
 653                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
 654                 percpu_ref_put_many(&ctx->refs, *nr);
 655                 *nr = 0;
 656         }
 657 }
 658
 659 static void __io_free_req(struct io_kiocb *req)
 660 {
 661         if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 662                 fput(req->file);
 663         percpu_ref_put(&req->ctx->refs);
 664         kmem_cache_free(req_cachep, req);
 665 }
 666
 667 static void io_req_link_next(struct io_kiocb *req)
 668 {
 669         struct io_kiocb *nxt;
 670
 671         /*
 672          * The list should never be empty when we are called here. But could
 673          * potentially happen if the chain is messed up, check to be on the
 674          * safe side.
 675          */
 676         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
 677         if (nxt) {
 678                 list_del(&nxt->list);
 679                 if (!list_empty(&req->link_list)) {
 680                         INIT_LIST_HEAD(&nxt->link_list);
 681                         list_splice(&req->link_list, &nxt->link_list);
 682                         nxt->flags |= REQ_F_LINK;
 683                 }
 684
 685                 nxt->flags |= REQ_F_LINK_DONE;
 686                 INIT_WORK(&nxt->work, io_sq_wq_submit_work);
 687                 io_queue_async_work(req->ctx, nxt);
 688         }
 689 }
 690
 691 /*
 692  * Called if REQ_F_LINK is set, and we fail the head request
 693  */
 694 static void io_fail_links(struct io_kiocb *req)
 695 {
 696         struct io_kiocb *link;
 697
 698         while (!list_empty(&req->link_list)) {
 699                 link = list_first_entry(&req->link_list, struct io_kiocb, list);
 700                 list_del(&link->list);
 701
 702                 io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
 703                 __io_free_req(link);
 704         }
 705 }
 706
 707 static void io_free_req(struct io_kiocb *req)
 708 {
 709         /*
 710          * If LINK is set, we have dependent requests in this chain. If we
 711          * didn't fail this request, queue the first one up, moving any other
 712          * dependencies to the next request. In case of failure, fail the rest
 713          * of the chain.
 714          */
 715         if (req->flags & REQ_F_LINK) {
 716                 if (req->flags & REQ_F_FAIL_LINK)
 717                         io_fail_links(req);
 718                 else
 719                         io_req_link_next(req);
 720         }
 721
 722         __io_free_req(req);
 723 }
 724
 725 static void io_put_req(struct io_kiocb *req)
 726 {
 727         if (refcount_dec_and_test(&req->refs))
 728                 io_free_req(req);
 729 }
 730
 731 static unsigned io_cqring_events(struct io_rings *rings)
 732 {
 733         /* See comment at the top of this file */
 734         smp_rmb();
 735         return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
 736 }
 737
 738 /*
 739  * Find and free completed poll iocbs
 740  */
 741 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 742                                struct list_head *done)
 743 {
 744         void *reqs[IO_IOPOLL_BATCH];
 745         struct io_kiocb *req;
 746         int to_free;
 747
 748         to_free = 0;
 749         while (!list_empty(done)) {
 750                 req = list_first_entry(done, struct io_kiocb, list);
 751                 list_del(&req->list);
 752
 753                 io_cqring_fill_event(ctx, req->user_data, req->result);
 754                 (*nr_events)++;
 755
 756                 if (refcount_dec_and_test(&req->refs)) {
 757                         /* If we're not using fixed files, we have to pair the
 758                          * completion part with the file put. Use regular
 759                          * completions for those, only batch free for fixed
 760                          * file and non-linked commands.
 761                          */
 762                         if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
 763                             REQ_F_FIXED_FILE) {
 764                                 reqs[to_free++] = req;
 765                                 if (to_free == ARRAY_SIZE(reqs))
 766                                         io_free_req_many(ctx, reqs, &to_free);
 767                         } else {
 768                                 io_free_req(req);
 769                         }
 770                 }
 771         }
 772
 773         io_commit_cqring(ctx);
 774         io_free_req_many(ctx, reqs, &to_free);
 775 }
 776
 777 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 778                         long min)
 779 {
 780         struct io_kiocb *req, *tmp;
 781         LIST_HEAD(done);
 782         bool spin;
 783         int ret;
 784
 785         /*
 786          * Only spin for completions if we don't have multiple devices hanging
 787          * off our complete list, and we're under the requested amount.
 788          */
 789         spin = !ctx->poll_multi_file && *nr_events < min;
 790
 791         ret = 0;
 792         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
 793                 struct kiocb *kiocb = &req->rw;
 794
 795                 /*
 796                  * Move completed entries to our local list. If we find a
 797                  * request that requires polling, break out and complete
 798                  * the done list first, if we have entries there.
 799                  */
 800                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
 801                         list_move_tail(&req->list, &done);
 802                         continue;
 803                 }
 804                 if (!list_empty(&done))
 805                         break;
 806
 807                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
 808                 if (ret < 0)
 809                         break;
 810
 811                 if (ret && spin)
 812                         spin = false;
 813                 ret = 0;
 814         }
 815
 816         if (!list_empty(&done))
 817                 io_iopoll_complete(ctx, nr_events, &done);
 818
 819         return ret;
 820 }
 821
 822 /*
 823  * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a
 824  * non-spinning poll check - we'll still enter the driver poll loop, but only
 825  * as a non-spinning completion check.
 826  */
 827 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 828                                 long min)
 829 {
 830         while (!list_empty(&ctx->poll_list) && !need_resched()) {
 831                 int ret;
 832
 833                 ret = io_do_iopoll(ctx, nr_events, min);
 834                 if (ret < 0)
 835                         return ret;
 836                 if (!min || *nr_events >= min)
 837                         return 0;
 838         }
 839
 840         return 1;
 841 }
 842
 843 /*
 844  * We can't just wait for polled events to come to us, we have to actively
 845  * find and complete them.
 846  */
 847 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 848 {
 849         if (!(ctx->flags & IORING_SETUP_IOPOLL))
 850                 return;
 851
 852         mutex_lock(&ctx->uring_lock);
 853         while (!list_empty(&ctx->poll_list)) {
 854                 unsigned int nr_events = 0;
 855
 856                 io_iopoll_getevents(ctx, &nr_events, 1);
 857
 858                 /*
 859                  * Ensure we allow local-to-the-cpu processing to take place,
 860                  * in this case we need to ensure that we reap all events.
 861                  */
 862                 cond_resched();
 863         }
 864         mutex_unlock(&ctx->uring_lock);
 865 }
 866
 867 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 868                            long min)
 869 {
 870         int iters, ret = 0;
 871
 872         /*
 873          * We disallow the app entering submit/complete with polling, but we
 874          * still need to lock the ring to prevent racing with polled issue
 875          * that got punted to a workqueue.
 876          */
 877         mutex_lock(&ctx->uring_lock);
 878
 879         iters = 0;
 880         do {
 881                 int tmin = 0;
 882
 883                 /*
 884                  * Don't enter poll loop if we already have events pending.
 885                  * If we do, we can potentially be spinning for commands that
 886                  * already triggered a CQE (eg in error).
 887                  */
 888                 if (io_cqring_events(ctx->rings))
 889                         break;
 890
 891                 /*
 892                  * If a submit got punted to a workqueue, we can have the
 893                  * application entering polling for a command before it gets
 894                  * issued. That app will hold the uring_lock for the duration
 895                  * of the poll right here, so we need to take a breather every
 896                  * now and then to ensure that the issue has a chance to add
 897                  * the poll to the issued list. Otherwise we can spin here
 898                  * forever, while the workqueue is stuck trying to acquire the
 899                  * very same mutex.
 900                  */
 901                 if (!(++iters & 7)) {
 902                         mutex_unlock(&ctx->uring_lock);
 903                         mutex_lock(&ctx->uring_lock);
 904                 }
 905
 906                 if (*nr_events < min)
 907                         tmin = min - *nr_events;
 908
 909                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
 910                 if (ret <= 0)
 911                         break;
 912                 ret = 0;
 913         } while (min && !*nr_events && !need_resched());
 914
 915         mutex_unlock(&ctx->uring_lock);
 916         return ret;
 917 }
 918
 919 static void kiocb_end_write(struct io_kiocb *req)
 920 {
 921         /*
 922          * Tell lockdep we inherited freeze protection from submission
 923          * thread.
 924          */
 925         if (req->flags & REQ_F_ISREG) {
 926                 struct inode *inode = file_inode(req->file);
 927
 928                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
 929         }
 930         file_end_write(req->file);
 931 }
 932
 933 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 934 {
 935         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 936
 937         if (kiocb->ki_flags & IOCB_WRITE)
 938                 kiocb_end_write(req);
 939
 940         if ((req->flags & REQ_F_LINK) && res != req->result)
 941                 req->flags |= REQ_F_FAIL_LINK;
 942         io_cqring_add_event(req->ctx, req->user_data, res);
 943         io_put_req(req);
 944 }
 945
 946 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 947 {
 948         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 949
 950         if (kiocb->ki_flags & IOCB_WRITE)
 951                 kiocb_end_write(req);
 952
 953         if ((req->flags & REQ_F_LINK) && res != req->result)
 954                 req->flags |= REQ_F_FAIL_LINK;
 955         req->result = res;
 956         if (res != -EAGAIN)
 957                 req->flags |= REQ_F_IOPOLL_COMPLETED;
 958 }
 959
 960 /*
 961  * After the iocb has been issued, it's safe to be found on the poll list.
 962  * Adding the kiocb to the list AFTER submission ensures that we don't
 963  * find it from a io_iopoll_getevents() thread before the issuer is done
 964  * accessing the kiocb cookie.
 965  */
 966 static void io_iopoll_req_issued(struct io_kiocb *req)
 967 {
 968         struct io_ring_ctx *ctx = req->ctx;
 969
 970         /*
 971          * Track whether we have multiple files in our lists. This will impact
 972          * how we do polling eventually, not spinning if we're on potentially
 973          * different devices.
 974          */
 975         if (list_empty(&ctx->poll_list)) {
 976                 ctx->poll_multi_file = false;
 977         } else if (!ctx->poll_multi_file) {
 978                 struct io_kiocb *list_req;
 979
 980                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
 981                                                 list);
 982                 if (list_req->rw.ki_filp != req->rw.ki_filp)
 983                         ctx->poll_multi_file = true;
 984         }
 985
 986         /*
 987          * For fast devices, IO may have already completed. If it has, add
 988          * it to the front so we find it first.
 989          */
 990         if (req->flags & REQ_F_IOPOLL_COMPLETED)
 991                 list_add(&req->list, &ctx->poll_list);
 992         else
 993                 list_add_tail(&req->list, &ctx->poll_list);
 994 }
 995
 996 static void io_file_put(struct io_submit_state *state)
 997 {
 998         if (state->file) {
 999                 int diff = state->has_refs - state->used_refs;
1000
1001                 if (diff)
1002                         fput_many(state->file, diff);
1003                 state->file = NULL;
1004         }
1005 }
1006
1007 /*
1008  * Get as many references to a file as we have IOs left in this submission,
1009  * assuming most submissions are for one file, or at least that each file
1010  * has more than one submission.
1011  */
1012 static struct file *io_file_get(struct io_submit_state *state, int fd)
1013 {
1014         if (!state)
1015                 return fget(fd);
1016
1017         if (state->file) {
1018                 if (state->fd == fd) {
1019                         state->used_refs++;
1020                         state->ios_left--;
1021                         return state->file;
1022                 }
1023                 io_file_put(state);
1024         }
1025         state->file = fget_many(fd, state->ios_left);
1026         if (!state->file)
1027                 return NULL;
1028
1029         state->fd = fd;
1030         state->has_refs = state->ios_left;
1031         state->used_refs = 1;
1032         state->ios_left--;
1033         return state->file;
1034 }
1035
1036 /*
1037  * If we tracked the file through the SCM inflight mechanism, we could support
1038  * any file. For now, just ensure that anything potentially problematic is done
1039  * inline.
1040  */
1041 static bool io_file_supports_async(struct file *file)
1042 {
1043         umode_t mode = file_inode(file)->i_mode;
1044
1045         if (S_ISBLK(mode) || S_ISCHR(mode))
1046                 return true;
1047         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1048                 return true;
1049
1050         return false;
1051 }
1052
1053 static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
1054                       bool force_nonblock)
1055 {
1056         const struct io_uring_sqe *sqe = s->sqe;
1057         struct io_ring_ctx *ctx = req->ctx;
1058         struct kiocb *kiocb = &req->rw;
1059         unsigned ioprio;
1060         int ret;
1061
1062         if (!req->file)
1063                 return -EBADF;
1064
1065         if (S_ISREG(file_inode(req->file)->i_mode))
1066                 req->flags |= REQ_F_ISREG;
1067
1068         /*
1069          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
1070          * we know to async punt it even if it was opened O_NONBLOCK
1071          */
1072         if (force_nonblock && !io_file_supports_async(req->file)) {
1073                 req->flags |= REQ_F_MUST_PUNT;
1074                 return -EAGAIN;
1075         }
1076
1077         kiocb->ki_pos = READ_ONCE(sqe->off);
1078         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1079         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
1080
1081         ioprio = READ_ONCE(sqe->ioprio);
1082         if (ioprio) {
1083                 ret = ioprio_check_cap(ioprio);
1084                 if (ret)
1085                         return ret;
1086
1087                 kiocb->ki_ioprio = ioprio;
1088         } else
1089                 kiocb->ki_ioprio = get_current_ioprio();
1090
1091         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1092         if (unlikely(ret))
1093                 return ret;
1094
1095         /* don't allow async punt if RWF_NOWAIT was requested */
1096         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1097             (req->file->f_flags & O_NONBLOCK))
1098                 req->flags |= REQ_F_NOWAIT;
1099
1100         if (force_nonblock)
1101                 kiocb->ki_flags |= IOCB_NOWAIT;
1102
1103         if (ctx->flags & IORING_SETUP_IOPOLL) {
1104                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1105                     !kiocb->ki_filp->f_op->iopoll)
1106                         return -EOPNOTSUPP;
1107
1108                 kiocb->ki_flags |= IOCB_HIPRI;
1109                 kiocb->ki_complete = io_complete_rw_iopoll;
1110         } else {
1111                 if (kiocb->ki_flags & IOCB_HIPRI)
1112                         return -EINVAL;
1113                 kiocb->ki_complete = io_complete_rw;
1114         }
1115         return 0;
1116 }
1117
1118 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1119 {
1120         switch (ret) {
1121         case -EIOCBQUEUED:
1122                 break;
1123         case -ERESTARTSYS:
1124         case -ERESTARTNOINTR:
1125         case -ERESTARTNOHAND:
1126         case -ERESTART_RESTARTBLOCK:
1127                 /*
1128                  * We can't just restart the syscall, since previously
1129                  * submitted sqes may already be in progress. Just fail this
1130                  * IO with EINTR.
1131                  */
1132                 ret = -EINTR;
1133                 /* fall through */
1134         default:
1135                 kiocb->ki_complete(kiocb, ret, 0);
1136         }
1137 }
1138
1139 static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
1140                            const struct io_uring_sqe *sqe,
1141                            struct iov_iter *iter)
1142 {
1143         size_t len = READ_ONCE(sqe->len);
1144         struct io_mapped_ubuf *imu;
1145         unsigned index, buf_index;
1146         size_t offset;
1147         u64 buf_addr;
1148
1149         /* attempt to use fixed buffers without having provided iovecs */
1150         if (unlikely(!ctx->user_bufs))
1151                 return -EFAULT;
1152
1153         buf_index = READ_ONCE(sqe->buf_index);
1154         if (unlikely(buf_index >= ctx->nr_user_bufs))
1155                 return -EFAULT;
1156
1157         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1158         imu = &ctx->user_bufs[index];
1159         buf_addr = READ_ONCE(sqe->addr);
1160
1161         /* overflow */
1162         if (buf_addr + len < buf_addr)
1163                 return -EFAULT;
1164         /* not inside the mapped region */
1165         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1166                 return -EFAULT;
1167
1168         /*
1169          * May not be a start of buffer, set size appropriately
1170          * and advance us to the beginning.
1171          */
1172         offset = buf_addr - imu->ubuf;
1173         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
1174
1175         if (offset) {
1176                 /*
1177                  * Don't use iov_iter_advance() here, as it's really slow for
1178                  * using the latter parts of a big fixed buffer - it iterates
1179                  * over each segment manually. We can cheat a bit here, because
1180                  * we know that:
1181                  *
1182                  * 1) it's a BVEC iter, we set it up
1183                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
1184                  *    first and last bvec
1185                  *
1186                  * So just find our index, and adjust the iterator afterwards.
1187                  * If the offset is within the first bvec (or the whole first
1188                  * bvec, just use iov_iter_advance(). This makes it easier
1189                  * since we can just skip the first segment, which may not
1190                  * be PAGE_SIZE aligned.
1191                  */
1192                 const struct bio_vec *bvec = imu->bvec;
1193
1194                 if (offset <= bvec->bv_len) {
1195                         iov_iter_advance(iter, offset);
1196                 } else {
1197                         unsigned long seg_skip;
1198
1199                         /* skip first vec */
1200                         offset -= bvec->bv_len;
1201                         seg_skip = 1 + (offset >> PAGE_SHIFT);
1202
1203                         iter->bvec = bvec + seg_skip;
1204                         iter->nr_segs -= seg_skip;
1205                         iter->count -= bvec->bv_len + offset;
1206                         iter->iov_offset = offset & ~PAGE_MASK;
1207                 }
1208         }
1209
1210         return 0;
1211 }
1212
1213 static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
1214                                const struct sqe_submit *s, struct iovec **iovec,
1215                                struct iov_iter *iter)
1216 {
1217         const struct io_uring_sqe *sqe = s->sqe;
1218         void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
1219         size_t sqe_len = READ_ONCE(sqe->len);
1220         u8 opcode;
1221
1222         /*
1223          * We're reading ->opcode for the second time, but the first read
1224          * doesn't care whether it's _FIXED or not, so it doesn't matter
1225          * whether ->opcode changes concurrently. The first read does care
1226          * about whether it is a READ or a WRITE, so we don't trust this read
1227          * for that purpose and instead let the caller pass in the read/write
1228          * flag.
1229          */
1230         opcode = READ_ONCE(sqe->opcode);
1231         if (opcode == IORING_OP_READ_FIXED ||
1232             opcode == IORING_OP_WRITE_FIXED) {
1233                 ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
1234                 *iovec = NULL;
1235                 return ret;
1236         }
1237
1238         if (!s->has_user)
1239                 return -EFAULT;
1240
1241 #ifdef CONFIG_COMPAT
1242         if (ctx->compat)
1243                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
1244                                                 iovec, iter);
1245 #endif
1246
1247         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
1248 }
1249
1250 static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
1251 {
1252         if (al->file == kiocb->ki_filp) {
1253                 off_t start, end;
1254
1255                 /*
1256                  * Allow merging if we're anywhere in the range of the same
1257                  * page. Generally this happens for sub-page reads or writes,
1258                  * and it's beneficial to allow the first worker to bring the
1259                  * page in and the piggy backed work can then work on the
1260                  * cached page.
1261                  */
1262                 start = al->io_start & PAGE_MASK;
1263                 end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
1264                 if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
1265                         return true;
1266         }
1267
1268         al->file = NULL;
1269         return false;
1270 }
1271
1272 /*
1273  * Make a note of the last file/offset/direction we punted to async
1274  * context. We'll use this information to see if we can piggy back a
1275  * sequential request onto the previous one, if it's still hasn't been
1276  * completed by the async worker.
1277  */
1278 static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
1279 {
1280         struct async_list *async_list = &req->ctx->pending_async[rw];
1281         struct kiocb *kiocb = &req->rw;
1282         struct file *filp = kiocb->ki_filp;
1283
1284         if (io_should_merge(async_list, kiocb)) {
1285                 unsigned long max_bytes;
1286
1287                 /* Use 8x RA size as a decent limiter for both reads/writes */
1288                 max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
1289                 if (!max_bytes)
1290                         max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
1291
1292                 /* If max len are exceeded, reset the state */
1293                 if (async_list->io_len + len <= max_bytes) {
1294                         req->flags |= REQ_F_SEQ_PREV;
1295                         async_list->io_len += len;
1296                 } else {
1297                         async_list->file = NULL;
1298                 }
1299         }
1300
1301         /* New file? Reset state. */
1302         if (async_list->file != filp) {
1303                 async_list->io_start = kiocb->ki_pos;
1304                 async_list->io_len = len;
1305                 async_list->file = filp;
1306         }
1307 }
1308
1309 /*
1310  * For files that don't have ->read_iter() and ->write_iter(), handle them
1311  * by looping over ->read() or ->write() manually.
1312  */
1313 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
1314                            struct iov_iter *iter)
1315 {
1316         ssize_t ret = 0;
1317
1318         /*
1319          * Don't support polled IO through this interface, and we can't
1320          * support non-blocking either. For the latter, this just causes
1321          * the kiocb to be handled from an async context.
1322          */
1323         if (kiocb->ki_flags & IOCB_HIPRI)
1324                 return -EOPNOTSUPP;
1325         if (kiocb->ki_flags & IOCB_NOWAIT)
1326                 return -EAGAIN;
1327
1328         while (iov_iter_count(iter)) {
1329                 struct iovec iovec = iov_iter_iovec(iter);
1330                 ssize_t nr;
1331
1332                 if (rw == READ) {
1333                         nr = file->f_op->read(file, iovec.iov_base,
1334                                               iovec.iov_len, &kiocb->ki_pos);
1335                 } else {
1336                         nr = file->f_op->write(file, iovec.iov_base,
1337                                                iovec.iov_len, &kiocb->ki_pos);
1338                 }
1339
1340                 if (nr < 0) {
1341                         if (!ret)
1342                                 ret = nr;
1343                         break;
1344                 }
1345                 ret += nr;
1346                 if (nr != iovec.iov_len)
1347                         break;
1348                 iov_iter_advance(iter, nr);
1349         }
1350
1351         return ret;
1352 }
1353
1354 static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
1355                    bool force_nonblock)
1356 {
1357         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1358         struct kiocb *kiocb = &req->rw;
1359         struct iov_iter iter;
1360         struct file *file;
1361         size_t iov_count;
1362         ssize_t read_size, ret;
1363
1364         ret = io_prep_rw(req, s, force_nonblock);
1365         if (ret)
1366                 return ret;
1367         file = kiocb->ki_filp;
1368
1369         if (unlikely(!(file->f_mode & FMODE_READ)))
1370                 return -EBADF;
1371
1372         ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
1373         if (ret < 0)
1374                 return ret;
1375
1376         read_size = ret;
1377         if (req->flags & REQ_F_LINK)
1378                 req->result = read_size;
1379
1380         iov_count = iov_iter_count(&iter);
1381         ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
1382         if (!ret) {
1383                 ssize_t ret2;
1384
1385                 if (file->f_op->read_iter)
1386                         ret2 = call_read_iter(file, kiocb, &iter);
1387                 else
1388                         ret2 = loop_rw_iter(READ, file, kiocb, &iter);
1389
1390                 /*
1391                  * In case of a short read, punt to async. This can happen
1392                  * if we have data partially cached. Alternatively we can
1393                  * return the short read, in which case the application will
1394                  * need to issue another SQE and wait for it. That SQE will
1395                  * need async punt anyway, so it's more efficient to do it
1396                  * here.
1397                  */
1398                 if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
1399                     (req->flags & REQ_F_ISREG) &&
1400                     ret2 > 0 && ret2 < read_size)
1401                         ret2 = -EAGAIN;
1402                 /* Catch -EAGAIN return for forced non-blocking submission */
1403                 if (!force_nonblock || ret2 != -EAGAIN) {
1404                         io_rw_done(kiocb, ret2);
1405                 } else {
1406                         /*
1407                          * If ->needs_lock is true, we're already in async
1408                          * context.
1409                          */
1410                         if (!s->needs_lock)
1411                                 io_async_list_note(READ, req, iov_count);
1412                         ret = -EAGAIN;
1413                 }
1414         }
1415         kfree(iovec);
1416         return ret;
1417 }
1418
1419 static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
1420                     bool force_nonblock)
1421 {
1422         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1423         struct kiocb *kiocb = &req->rw;
1424         struct iov_iter iter;
1425         struct file *file;
1426         size_t iov_count;
1427         ssize_t ret;
1428
1429         ret = io_prep_rw(req, s, force_nonblock);
1430         if (ret)
1431                 return ret;
1432
1433         file = kiocb->ki_filp;
1434         if (unlikely(!(file->f_mode & FMODE_WRITE)))
1435                 return -EBADF;
1436
1437         ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
1438         if (ret < 0)
1439                 return ret;
1440
1441         if (req->flags & REQ_F_LINK)
1442                 req->result = ret;
1443
1444         iov_count = iov_iter_count(&iter);
1445
1446         ret = -EAGAIN;
1447         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
1448                 /* If ->needs_lock is true, we're already in async context. */
1449                 if (!s->needs_lock)
1450                         io_async_list_note(WRITE, req, iov_count);
1451                 goto out_free;
1452         }
1453
1454         ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
1455         if (!ret) {
1456                 ssize_t ret2;
1457
1458                 /*
1459                  * Open-code file_start_write here to grab freeze protection,
1460                  * which will be released by another thread in
1461                  * io_complete_rw().  Fool lockdep by telling it the lock got
1462                  * released so that it doesn't complain about the held lock when
1463                  * we return to userspace.
1464                  */
1465                 if (req->flags & REQ_F_ISREG) {
1466                         __sb_start_write(file_inode(file)->i_sb,
1467                                                 SB_FREEZE_WRITE, true);
1468                         __sb_writers_release(file_inode(file)->i_sb,
1469                                                 SB_FREEZE_WRITE);
1470                 }
1471                 kiocb->ki_flags |= IOCB_WRITE;
1472
1473                 if (file->f_op->write_iter)
1474                         ret2 = call_write_iter(file, kiocb, &iter);
1475                 else
1476                         ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
1477                 if (!force_nonblock || ret2 != -EAGAIN) {
1478                         io_rw_done(kiocb, ret2);
1479                 } else {
1480                         /*
1481                          * If ->needs_lock is true, we're already in async
1482                          * context.
1483                          */
1484                         if (!s->needs_lock)
1485                                 io_async_list_note(WRITE, req, iov_count);
1486                         ret = -EAGAIN;
1487                 }
1488         }
1489 out_free:
1490         kfree(iovec);
1491         return ret;
1492 }
1493
1494 /*
1495  * IORING_OP_NOP just posts a completion event, nothing else.
1496  */
1497 static int io_nop(struct io_kiocb *req, u64 user_data)
1498 {
1499         struct io_ring_ctx *ctx = req->ctx;
1500         long err = 0;
1501
1502         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1503                 return -EINVAL;
1504
1505         io_cqring_add_event(ctx, user_data, err);
1506         io_put_req(req);
1507         return 0;
1508 }
1509
1510 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1511 {
1512         struct io_ring_ctx *ctx = req->ctx;
1513
1514         if (!req->file)
1515                 return -EBADF;
1516
1517         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1518                 return -EINVAL;
1519         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1520                 return -EINVAL;
1521
1522         return 0;
1523 }
1524
1525 static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1526                     bool force_nonblock)
1527 {
1528         loff_t sqe_off = READ_ONCE(sqe->off);
1529         loff_t sqe_len = READ_ONCE(sqe->len);
1530         loff_t end = sqe_off + sqe_len;
1531         unsigned fsync_flags;
1532         int ret;
1533
1534         fsync_flags = READ_ONCE(sqe->fsync_flags);
1535         if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC))
1536                 return -EINVAL;
1537
1538         ret = io_prep_fsync(req, sqe);
1539         if (ret)
1540                 return ret;
1541
1542         /* fsync always requires a blocking context */
1543         if (force_nonblock)
1544                 return -EAGAIN;
1545
1546         ret = vfs_fsync_range(req->rw.ki_filp, sqe_off,
1547                                 end > 0 ? end : LLONG_MAX,
1548                                 fsync_flags & IORING_FSYNC_DATASYNC);
1549
1550         if (ret < 0 && (req->flags & REQ_F_LINK))
1551                 req->flags |= REQ_F_FAIL_LINK;
1552         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1553         io_put_req(req);
1554         return 0;
1555 }
1556
1557 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1558 {
1559         struct io_ring_ctx *ctx = req->ctx;
1560         int ret = 0;
1561
1562         if (!req->file)
1563                 return -EBADF;
1564
1565         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1566                 return -EINVAL;
1567         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
1568                 return -EINVAL;
1569
1570         return ret;
1571 }
1572
1573 static int io_sync_file_range(struct io_kiocb *req,
1574                               const struct io_uring_sqe *sqe,
1575                               bool force_nonblock)
1576 {
1577         loff_t sqe_off;
1578         loff_t sqe_len;
1579         unsigned flags;
1580         int ret;
1581
1582         ret = io_prep_sfr(req, sqe);
1583         if (ret)
1584                 return ret;
1585
1586         /* sync_file_range always requires a blocking context */
1587         if (force_nonblock)
1588                 return -EAGAIN;
1589
1590         sqe_off = READ_ONCE(sqe->off);
1591         sqe_len = READ_ONCE(sqe->len);
1592         flags = READ_ONCE(sqe->sync_range_flags);
1593
1594         ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
1595
1596         if (ret < 0 && (req->flags & REQ_F_LINK))
1597                 req->flags |= REQ_F_FAIL_LINK;
1598         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1599         io_put_req(req);
1600         return 0;
1601 }
1602
#if defined(CONFIG_NET)
/*
 * Common implementation of sendmsg/recvmsg: 'fn' is the socket helper that
 * does the actual work, called with the user msghdr taken from sqe->addr
 * and the flags from sqe->msg_flags.
 *
 * If the application asked for MSG_DONTWAIT itself, the request is marked
 * REQ_F_NOWAIT; otherwise MSG_DONTWAIT is added for a force_nonblock
 * attempt.
 *
 * Returns -EINVAL on an IORING_SETUP_IOPOLL ring, and -EAGAIN if a
 * force_nonblock attempt would block. In every other case a completion is
 * posted and 0 is returned; the completion carries the result of 'fn', or
 * the sock_from_file() error if the file is not a socket.
 */
static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			   bool force_nonblock,
		   long (*fn)(struct socket *, struct user_msghdr __user *,
				unsigned int))
{
	struct socket *sock;
	int ret;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	sock = sock_from_file(req->file, &ret);
	if (sock) {
		struct user_msghdr __user *msg;
		unsigned flags;

		flags = READ_ONCE(sqe->msg_flags);
		if (flags & MSG_DONTWAIT)
			req->flags |= REQ_F_NOWAIT;
		else if (force_nonblock)
			flags |= MSG_DONTWAIT;

		msg = (struct user_msghdr __user *) (unsigned long)
			READ_ONCE(sqe->addr);

		ret = fn(sock, msg, flags);
		if (force_nonblock && ret == -EAGAIN)
			return ret;

		/*
		 * -ERESTARTSYS is a kernel internal code that must never be
		 * handed to userspace, and we can't restart the syscall
		 * from here anyway. As for read/write, fail the request
		 * with -EINTR instead.
		 */
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
	}

	io_cqring_add_event(req->ctx, sqe->user_data, ret);
	io_put_req(req);
	return 0;
}
#endif
1639
1640 static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1641                       bool force_nonblock)
1642 {
1643 #if defined(CONFIG_NET)
1644         return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
1645 #else
1646         return -EOPNOTSUPP;
1647 #endif
1648 }
1649
1650 static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1651                       bool force_nonblock)
1652 {
1653 #if defined(CONFIG_NET)
1654         return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
1655 #else
1656         return -EOPNOTSUPP;
1657 #endif
1658 }
1659
1660 static void io_poll_remove_one(struct io_kiocb *req)
1661 {
1662         struct io_poll_iocb *poll = &req->poll;
1663
1664         spin_lock(&poll->head->lock);
1665         WRITE_ONCE(poll->canceled, true);
1666         if (!list_empty(&poll->wait.entry)) {
1667                 list_del_init(&poll->wait.entry);
1668                 io_queue_async_work(req->ctx, req);
1669         }
1670         spin_unlock(&poll->head->lock);
1671
1672         list_del_init(&req->list);
1673 }
1674
1675 static void io_poll_remove_all(struct io_ring_ctx *ctx)
1676 {
1677         struct io_kiocb *req;
1678
1679         spin_lock_irq(&ctx->completion_lock);
1680         while (!list_empty(&ctx->cancel_list)) {
1681                 req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
1682                 io_poll_remove_one(req);
1683         }
1684         spin_unlock_irq(&ctx->completion_lock);
1685 }
1686
1687 /*
1688  * Find a running poll command that matches one specified in sqe->addr,
1689  * and remove it if found.
1690  */
1691 static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1692 {
1693         struct io_ring_ctx *ctx = req->ctx;
1694         struct io_kiocb *poll_req, *next;
1695         int ret = -ENOENT;
1696
1697         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1698                 return -EINVAL;
1699         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
1700             sqe->poll_events)
1701                 return -EINVAL;
1702
1703         spin_lock_irq(&ctx->completion_lock);
1704         list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
1705                 if (READ_ONCE(sqe->addr) == poll_req->user_data) {
1706                         io_poll_remove_one(poll_req);
1707                         ret = 0;
1708                         break;
1709                 }
1710         }
1711         spin_unlock_irq(&ctx->completion_lock);
1712
1713         io_cqring_add_event(req->ctx, sqe->user_data, ret);
1714         io_put_req(req);
1715         return 0;
1716 }
1717
1718 static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
1719                              __poll_t mask)
1720 {
1721         req->poll.done = true;
1722         io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
1723         io_commit_cqring(ctx);
1724 }
1725
1726 static void io_poll_complete_work(struct work_struct *work)
1727 {
1728         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
1729         struct io_poll_iocb *poll = &req->poll;
1730         struct poll_table_struct pt = { ._key = poll->events };
1731         struct io_ring_ctx *ctx = req->ctx;
1732         __poll_t mask = 0;
1733
1734         if (!READ_ONCE(poll->canceled))
1735                 mask = vfs_poll(poll->file, &pt) & poll->events;
1736
1737         /*
1738          * Note that ->ki_cancel callers also delete iocb from active_reqs after
1739          * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
1740          * synchronize with them.  In the cancellation case the list_del_init
1741          * itself is not actually needed, but harmless so we keep it in to
1742          * avoid further branches in the fast path.
1743          */
1744         spin_lock_irq(&ctx->completion_lock);
1745         if (!mask && !READ_ONCE(poll->canceled)) {
1746                 add_wait_queue(poll->head, &poll->wait);
1747                 spin_unlock_irq(&ctx->completion_lock);
1748                 return;
1749         }
1750         list_del_init(&req->list);
1751         io_poll_complete(ctx, req, mask);
1752         spin_unlock_irq(&ctx->completion_lock);
1753
1754         io_cqring_ev_posted(ctx);
1755         io_put_req(req);
1756 }
1757
1758 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
1759                         void *key)
1760 {
1761         struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
1762                                                         wait);
1763         struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
1764         struct io_ring_ctx *ctx = req->ctx;
1765         __poll_t mask = key_to_poll(key);
1766         unsigned long flags;
1767
1768         /* for instances that support it check for an event match first: */
1769         if (mask && !(mask & poll->events))
1770                 return 0;
1771
1772         list_del_init(&poll->wait.entry);
1773
1774         if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
1775                 list_del(&req->list);
1776                 io_poll_complete(ctx, req, mask);
1777                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1778
1779                 io_cqring_ev_posted(ctx);
1780                 io_put_req(req);
1781         } else {
1782                 io_queue_async_work(ctx, req);
1783         }
1784
1785         return 1;
1786 }
1787
/*
 * Wrapper around the poll_table_struct that io_poll_add() passes to
 * vfs_poll(); io_poll_queue_proc() uses container_of() on the embedded
 * table to get back to the request and to report an error.
 */
1788 struct io_poll_table {
	/* embedded table; io_poll_add() fills in _qproc and _key */
1789         struct poll_table_struct pt;
	/* request whose poll.wait entry gets queued */
1790         struct io_kiocb *req;
	/*
	 * Preset to -EINVAL by io_poll_add(); io_poll_queue_proc() stores 0
	 * after queueing on the first wait queue head, or -EINVAL if it is
	 * invoked again once a head has already been recorded.
	 */
1791         int error;
1792 };
1793
1794 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
1795                                struct poll_table_struct *p)
1796 {
1797         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
1798
1799         if (unlikely(pt->req->poll.head)) {
1800                 pt->error = -EINVAL;
1801                 return;
1802         }
1803
1804         pt->error = 0;
1805         pt->req->poll.head = head;
1806         add_wait_queue(head, &pt->req->poll.wait);
1807 }
1808
1809 static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1810 {
1811         struct io_poll_iocb *poll = &req->poll;
1812         struct io_ring_ctx *ctx = req->ctx;
1813         struct io_poll_table ipt;
1814         bool cancel = false;
1815         __poll_t mask;
1816         u16 events;
1817
1818         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
1819                 return -EINVAL;
1820         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
1821                 return -EINVAL;
1822         if (!poll->file)
1823                 return -EBADF;
1824
1825         req->submit.sqe = NULL;
1826         INIT_WORK(&req->work, io_poll_complete_work);
1827         events = READ_ONCE(sqe->poll_events);
1828         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
1829
1830         poll->head = NULL;
1831         poll->done = false;
1832         poll->canceled = false;
1833
1834         ipt.pt._qproc = io_poll_queue_proc;
1835         ipt.pt._key = poll->events;
1836         ipt.req = req;
1837         ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
1838
1839         /* initialized the list so that we can do list_empty checks */
1840         INIT_LIST_HEAD(&poll->wait.entry);
1841         init_waitqueue_func_entry(&poll->wait, io_poll_wake);
1842
1843         INIT_LIST_HEAD(&req->list);
1844
1845         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
1846
1847         spin_lock_irq(&ctx->completion_lock);
1848         if (likely(poll->head)) {
1849                 spin_lock(&poll->head->lock);
1850                 if (unlikely(list_empty(&poll->wait.entry))) {
1851                         if (ipt.error)
1852                                 cancel = true;
1853                         ipt.error = 0;
1854                         mask = 0;
1855                 }
1856                 if (mask || ipt.error)
1857                         list_del_init(&poll->wait.entry);
1858                 else if (cancel)
1859                         WRITE_ONCE(poll->canceled, true);
1860                 else if (!poll->done) /* actually waiting for an event */
1861                         list_add_tail(&req->list, &ctx->cancel_list);
1862                 spin_unlock(&poll->head->lock);
1863         }
1864         if (mask) { /* no async, we'd stolen it */
1865                 ipt.error = 0;
1866                 io_poll_complete(ctx, req, mask);
1867         }
1868         spin_unlock_irq(&ctx->completion_lock);
1869
1870         if (mask) {
1871                 io_cqring_ev_posted(ctx);
1872                 io_put_req(req);
1873         }
1874         return ipt.error;
1875 }
1876
1877 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
1878 {
1879         struct io_ring_ctx *ctx;
1880         struct io_kiocb *req;
1881         unsigned long flags;
1882
1883         req = container_of(timer, struct io_kiocb, timeout.timer);
1884         ctx = req->ctx;
1885         atomic_inc(&ctx->cq_timeouts);
1886
1887         spin_lock_irqsave(&ctx->completion_lock, flags);
1888         list_del(&req->list);
1889
1890         io_cqring_fill_event(ctx, req->user_data, -ETIME);
1891         io_commit_cqring(ctx);
1892         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1893
1894         io_cqring_ev_posted(ctx);
1895
1896         io_put_req(req);
1897         return HRTIMER_NORESTART;
1898 }
1899
1900 static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1901 {
1902         unsigned count;
1903         struct io_ring_ctx *ctx = req->ctx;
1904         struct list_head *entry;
1905         struct timespec64 ts;
1906
1907         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1908                 return -EINVAL;
1909         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
1910             sqe->len != 1)
1911                 return -EINVAL;
1912
1913         if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
1914                 return -EFAULT;
1915
1916         /*
1917          * sqe->off holds how many events that need to occur for this
1918          * timeout event to be satisfied.
1919          */
1920         count = READ_ONCE(sqe->off);
1921         if (!count)
1922                 count = 1;
1923
1924         req->sequence = ctx->cached_sq_head + count - 1;
1925         /* reuse it to store the count */
1926         req->submit.sequence = count;
1927         req->flags |= REQ_F_TIMEOUT;
1928
1929         /*
1930          * Insertion sort, ensuring the first entry in the list is always
1931          * the one we need first.
1932          */
1933         spin_lock_irq(&ctx->completion_lock);
1934         list_for_each_prev(entry, &ctx->timeout_list) {
1935                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
1936                 unsigned nxt_sq_head;
1937                 long long tmp, tmp_nxt;
1938
1939                 /*
1940                  * Since cached_sq_head + count - 1 can overflow, use type long
1941                  * long to store it.
1942                  */
1943                 tmp = (long long)ctx->cached_sq_head + count - 1;
1944                 nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
1945                 tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
1946
1947                 /*
1948                  * cached_sq_head may overflow, and it will never overflow twice
1949                  * once there is some timeout req still be valid.
1950                  */
1951                 if (ctx->cached_sq_head < nxt_sq_head)
1952                         tmp += UINT_MAX;
1953
1954                 if (tmp >= tmp_nxt)
1955                         break;
1956         }
1957         list_add(&req->list, entry);
1958         spin_unlock_irq(&ctx->completion_lock);
1959
1960         hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1961         req->timeout.timer.function = io_timeout_fn;
1962         hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
1963                         HRTIMER_MODE_REL);
1964         return 0;
1965 }
1966
1967 static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
1968                         const struct io_uring_sqe *sqe)
1969 {
1970         struct io_uring_sqe *sqe_copy;
1971
1972         if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
1973                 return 0;
1974
1975         sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
1976         if (!sqe_copy)
1977                 return -EAGAIN;
1978
1979         spin_lock_irq(&ctx->completion_lock);
1980         if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
1981                 spin_unlock_irq(&ctx->completion_lock);
1982                 kfree(sqe_copy);
1983                 return 0;
1984         }
1985
1986         memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
1987         req->submit.sqe = sqe_copy;
1988
1989         INIT_WORK(&req->work, io_sq_wq_submit_work);
1990         list_add_tail(&req->list, &ctx->defer_list);
1991         spin_unlock_irq(&ctx->completion_lock);
1992         return -EIOCBQUEUED;
1993 }
1994
1995 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
1996                            const struct sqe_submit *s, bool force_nonblock)
1997 {
1998         int ret, opcode;
1999
2000         req->user_data = READ_ONCE(s->sqe->user_data);
2001
2002         if (unlikely(s->index >= ctx->sq_entries))
2003                 return -EINVAL;
2004
2005         opcode = READ_ONCE(s->sqe->opcode);
2006         switch (opcode) {
2007         case IORING_OP_NOP:
2008                 ret = io_nop(req, req->user_data);
2009                 break;
2010         case IORING_OP_READV:
2011                 if (unlikely(s->sqe->buf_index))
2012                         return -EINVAL;
2013                 ret = io_read(req, s, force_nonblock);
2014                 break;
2015         case IORING_OP_WRITEV:
2016                 if (unlikely(s->sqe->buf_index))
2017                         return -EINVAL;
2018                 ret = io_write(req, s, force_nonblock);
2019                 break;
2020         case IORING_OP_READ_FIXED:
2021                 ret = io_read(req, s, force_nonblock);
2022                 break;
2023         case IORING_OP_WRITE_FIXED:
2024                 ret = io_write(req, s, force_nonblock);
2025                 break;
2026         case IORING_OP_FSYNC:
2027                 ret = io_fsync(req, s->sqe, force_nonblock);
2028                 break;
2029         case IORING_OP_POLL_ADD:
2030                 ret = io_poll_add(req, s->sqe);
2031                 break;
2032         case IORING_OP_POLL_REMOVE:
2033                 ret = io_poll_remove(req, s->sqe);
2034                 break;
2035         case IORING_OP_SYNC_FILE_RANGE:
2036                 ret = io_sync_file_range(req, s->sqe, force_nonblock);
2037                 break;
2038         case IORING_OP_SENDMSG:
2039                 ret = io_sendmsg(req, s->sqe, force_nonblock);
2040                 break;
2041         case IORING_OP_RECVMSG:
2042                 ret = io_recvmsg(req, s->sqe, force_nonblock);
2043                 break;
2044         case IORING_OP_TIMEOUT:
2045                 ret = io_timeout(req, s->sqe);
2046                 break;
2047         default:
2048                 ret = -EINVAL;
2049                 break;
2050         }
2051
2052         if (ret)
2053                 return ret;
2054
2055         if (ctx->flags & IORING_SETUP_IOPOLL) {
2056                 if (req->result == -EAGAIN)
2057                         return -EAGAIN;
2058
2059                 /* workqueue context doesn't hold uring_lock, grab it now */
2060                 if (s->needs_lock)
2061                         mutex_lock(&ctx->uring_lock);
2062                 io_iopoll_req_issued(req);
2063                 if (s->needs_lock)
2064                         mutex_unlock(&ctx->uring_lock);
2065         }
2066
2067         return 0;
2068 }
2069
2070 static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
2071                                                  const struct io_uring_sqe *sqe)
2072 {
2073         switch (sqe->opcode) {
2074         case IORING_OP_READV:
2075         case IORING_OP_READ_FIXED:
2076                 return &ctx->pending_async[READ];
2077         case IORING_OP_WRITEV:
2078         case IORING_OP_WRITE_FIXED:
2079                 return &ctx->pending_async[WRITE];
2080         default:
2081                 return NULL;
2082         }
2083 }
2084
2085 static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
2086 {
2087         u8 opcode = READ_ONCE(sqe->opcode);
2088
2089         return !(opcode == IORING_OP_READ_FIXED ||
2090                  opcode == IORING_OP_WRITE_FIXED);
2091 }
2092
/*
 * Workqueue handler that issues requests which were punted or deferred.
 *
 * The request embedded in @work is issued with force_nonblock == false and
 * retried for as long as __io_submit_sqe() returns -EAGAIN. For read/write
 * opcodes the handler then keeps going: it drains further requests that
 * were added to the matching ctx->pending_async list, and only stops once
 * that list is empty and the list's cnt has been dropped.
 *
 * The submitter's mm and USER_DS are adopted the first time a request needs
 * them (see io_sqe_needs_user()) and are released at "out".
 */
2093 static void io_sq_wq_submit_work(struct work_struct *work)
2094 {
2095         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2096         struct io_ring_ctx *ctx = req->ctx;
2097         struct mm_struct *cur_mm = NULL;
2098         struct async_list *async_list;
2099         LIST_HEAD(req_list);
2100         mm_segment_t old_fs;
2101         int ret;
2102
	/* NULL unless the first request is a read or write opcode */
2103         async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
2104 restart:
2105         do {
2106                 struct sqe_submit *s = &req->submit;
2107                 const struct io_uring_sqe *sqe = s->sqe;
		/*
		 * Snapshot taken before the io_put_req() calls below;
		 * presumably because req may no longer be valid after them.
		 */
2108                 unsigned int flags = req->flags;
2109
2110                 /* Ensure we clear previously set non-block flag */
2111                 req->rw.ki_flags &= ~IOCB_NOWAIT;
2112
2113                 ret = 0;
		/* attach to the submitter's mm once, on first need */
2114                 if (io_sqe_needs_user(sqe) && !cur_mm) {
2115                         if (!mmget_not_zero(ctx->sqo_mm)) {
2116                                 ret = -EFAULT;
2117                         } else {
2118                                 cur_mm = ctx->sqo_mm;
2119                                 use_mm(cur_mm);
2120                                 old_fs = get_fs();
2121                                 set_fs(USER_DS);
2122                         }
2123                 }
2124
2125                 if (!ret) {
2126                         s->has_user = cur_mm != NULL;
2127                         s->needs_lock = true;
2128                         do {
2129                                 ret = __io_submit_sqe(ctx, req, s, false);
2130                                 /*
2131                                  * We can get EAGAIN for polled IO even though
2132                                  * we're forcing a sync submission from here,
2133                                  * since we can't wait for request slots on the
2134                                  * block side.
2135                                  */
2136                                 if (ret != -EAGAIN)
2137                                         break;
2138                                 cond_resched();
2139                         } while (1);
2140                 }
2141
2142                 /* drop submission reference */
2143                 io_put_req(req);
2144
		/* failed: post the error and put the request a second time */
2145                 if (ret) {
2146                         io_cqring_add_event(ctx, sqe->user_data, ret);
2147                         io_put_req(req);
2148                 }
2149
2150                 /* async context always use a copy of the sqe */
2151                 kfree(sqe);
2152
2153                 /* req from defer and link list needn't decrease async cnt */
2154                 if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
2155                         goto out;
2156
2157                 if (!async_list)
2158                         break;
		/* requests already spliced onto our private list go first */
2159                 if (!list_empty(&req_list)) {
2160                         req = list_first_entry(&req_list, struct io_kiocb,
2161                                                 list);
2162                         list_del(&req->list);
2163                         continue;
2164                 }
		/* unlocked peek; re-checked under the lock below */
2165                 if (list_empty(&async_list->list))
2166                         break;
2167
2168                 req = NULL;
2169                 spin_lock(&async_list->lock);
2170                 if (list_empty(&async_list->list)) {
2171                         spin_unlock(&async_list->lock);
2172                         break;
2173                 }
		/* take everything that is pending in one go */
2174                 list_splice_init(&async_list->list, &req_list);
2175                 spin_unlock(&async_list->lock);
2176
2177                 req = list_first_entry(&req_list, struct io_kiocb, list);
2178                 list_del(&req->list);
2179         } while (req);
2180
2181         /*
2182          * Rare case of racing with a submitter. If we find the count has
2183          * dropped to zero AND we have pending work items, then restart
2184          * the processing. This is a tiny race window.
2185          */
2186         if (async_list) {
2187                 ret = atomic_dec_return(&async_list->cnt);
2188                 while (!ret && !list_empty(&async_list->list)) {
2189                         spin_lock(&async_list->lock);
			/* re-take the count dropped above before restarting */
2190                         atomic_inc(&async_list->cnt);
2191                         list_splice_init(&async_list->list, &req_list);
2192                         spin_unlock(&async_list->lock);
2193
2194                         if (!list_empty(&req_list)) {
2195                                 req = list_first_entry(&req_list,
2196                                                         struct io_kiocb, list);
2197                                 list_del(&req->list);
2198                                 goto restart;
2199                         }
2200                         ret = atomic_dec_return(&async_list->cnt);
2201                 }
2202         }
2203
2204 out:
	/* undo use_mm()/set_fs() if a request needed user context */
2205         if (cur_mm) {
2206                 set_fs(old_fs);
2207                 unuse_mm(cur_mm);
2208                 mmput(cur_mm);
2209         }
2210 }
2211
2212 /*
2213  * See if we can piggy back onto previously submitted work, that is still
2214  * running. We currently only allow this if the new request is sequential
2215  * to the previous one we punted.
2216  */
2217 static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
2218 {
2219         bool ret;
2220
2221         if (!list)
2222                 return false;
2223         if (!(req->flags & REQ_F_SEQ_PREV))
2224                 return false;
2225         if (!atomic_read(&list->cnt))
2226                 return false;
2227
2228         ret = true;
2229         spin_lock(&list->lock);
2230         list_add_tail(&req->list, &list->list);
2231         /*
2232          * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
2233          */
2234         smp_mb();
2235         if (!atomic_read(&list->cnt)) {
2236                 list_del_init(&req->list);
2237                 ret = false;
2238         }
2239         spin_unlock(&list->lock);
2240         return ret;
2241 }
2242
2243 static bool io_op_needs_file(const struct io_uring_sqe *sqe)
2244 {
2245         int op = READ_ONCE(sqe->opcode);
2246
2247         switch (op) {
2248         case IORING_OP_NOP:
2249         case IORING_OP_POLL_REMOVE:
2250                 return false;
2251         default:
2252                 return true;
2253         }
2254 }
2255
2256 static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
2257                            struct io_submit_state *state, struct io_kiocb *req)
2258 {
2259         unsigned flags;
2260         int fd;
2261
2262         flags = READ_ONCE(s->sqe->flags);
2263         fd = READ_ONCE(s->sqe->fd);
2264
2265         if (flags & IOSQE_IO_DRAIN)
2266                 req->flags |= REQ_F_IO_DRAIN;
2267         /*
2268          * All io need record the previous position, if LINK vs DARIN,
2269          * it can be used to mark the position of the first IO in the
2270          * link list.
2271          */
2272         req->sequence = s->sequence;
2273
2274         if (!io_op_needs_file(s->sqe))
2275                 return 0;
2276
2277         if (flags & IOSQE_FIXED_FILE) {
2278                 if (unlikely(!ctx->user_files ||
2279                     (unsigned) fd >= ctx->nr_user_files))
2280                         return -EBADF;
2281                 req->file = ctx->user_files[fd];
2282                 req->flags |= REQ_F_FIXED_FILE;
2283         } else {
2284                 if (s->needs_fixed_file)
2285                         return -EBADF;
2286                 req->file = io_file_get(state, fd);
2287                 if (unlikely(!req->file))
2288                         return -EBADF;
2289         }
2290
2291         return 0;
2292 }
2293
2294 static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
2295                         struct sqe_submit *s, bool force_nonblock)
2296 {
2297         int ret;
2298
2299         ret = __io_submit_sqe(ctx, req, s, force_nonblock);
2300
2301         /*
2302          * We async punt it if the file wasn't marked NOWAIT, or if the file
2303          * doesn't support non-blocking read/write attempts
2304          */
2305         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
2306             (req->flags & REQ_F_MUST_PUNT))) {
2307                 struct io_uring_sqe *sqe_copy;
2308
2309                 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2310                 if (sqe_copy) {
2311                         struct async_list *list;
2312
2313                         s->sqe = sqe_copy;
2314                         memcpy(&req->submit, s, sizeof(*s));
2315                         list = io_async_list_from_sqe(ctx, s->sqe);
2316                         if (!io_add_to_prev_work(list, req)) {
2317                                 if (list)
2318                                         atomic_inc(&list->cnt);
2319                                 INIT_WORK(&req->work, io_sq_wq_submit_work);
2320                                 io_queue_async_work(ctx, req);
2321                         }
2322
2323                         /*
2324                          * Queued up for async execution, worker will release
2325                          * submit reference when the iocb is actually submitted.
2326                          */
2327                         return 0;
2328                 }
2329         }
2330
2331         /* drop submission reference */
2332         io_put_req(req);
2333
2334         /* and drop final reference, if we failed */
2335         if (ret) {
2336                 io_cqring_add_event(ctx, req->user_data, ret);
2337                 if (req->flags & REQ_F_LINK)
2338                         req->flags |= REQ_F_FAIL_LINK;
2339                 io_put_req(req);
2340         }
2341
2342         return ret;
2343 }
2344
2345 static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
2346                         struct sqe_submit *s, bool force_nonblock)
2347 {
2348         int ret;
2349
2350         ret = io_req_defer(ctx, req, s->sqe);
2351         if (ret) {
2352                 if (ret != -EIOCBQUEUED) {
2353                         io_free_req(req);
2354                         io_cqring_add_event(ctx, s->sqe->user_data, ret);
2355                 }
2356                 return 0;
2357         }
2358
2359         return __io_queue_sqe(ctx, req, s, force_nonblock);
2360 }
2361
2362 static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
2363                               struct sqe_submit *s, struct io_kiocb *shadow,
2364                               bool force_nonblock)
2365 {
2366         int ret;
2367         int need_submit = false;
2368
2369         if (!shadow)
2370                 return io_queue_sqe(ctx, req, s, force_nonblock);
2371
2372         /*
2373          * Mark the first IO in link list as DRAIN, let all the following
2374          * IOs enter the defer list. all IO needs to be completed before link
2375          * list.
2376          */
2377         req->flags |= REQ_F_IO_DRAIN;
2378         ret = io_req_defer(ctx, req, s->sqe);
2379         if (ret) {
2380                 if (ret != -EIOCBQUEUED) {
2381                         io_free_req(req);
2382                         io_cqring_add_event(ctx, s->sqe->user_data, ret);
2383                         return 0;
2384                 }
2385         } else {
2386                 /*
2387                  * If ret == 0 means that all IOs in front of link io are
2388                  * running done. let's queue link head.
2389                  */
2390                 need_submit = true;
2391         }
2392
2393         /* Insert shadow req to defer_list, blocking next IOs */
2394         spin_lock_irq(&ctx->completion_lock);
2395         list_add_tail(&shadow->list, &ctx->defer_list);
2396         spin_unlock_irq(&ctx->completion_lock);
2397
2398         if (need_submit)
2399                 return __io_queue_sqe(ctx, req, s, force_nonblock);
2400
2401         return 0;
2402 }
2403
2404 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
2405
2406 static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
2407                           struct io_submit_state *state, struct io_kiocb **link,
2408                           bool force_nonblock)
2409 {
2410         struct io_uring_sqe *sqe_copy;
2411         struct io_kiocb *req;
2412         int ret;
2413
2414         /* enforce forwards compatibility on users */
2415         if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
2416                 ret = -EINVAL;
2417                 goto err;
2418         }
2419
2420         req = io_get_req(ctx, state);
2421         if (unlikely(!req)) {
2422                 ret = -EAGAIN;
2423                 goto err;
2424         }
2425
2426         ret = io_req_set_file(ctx, s, state, req);
2427         if (unlikely(ret)) {
2428 err_req:
2429                 io_free_req(req);
2430 err:
2431                 io_cqring_add_event(ctx, s->sqe->user_data, ret);
2432                 return;
2433         }
2434
2435         /*
2436          * If we already have a head request, queue this one for async
2437          * submittal once the head completes. If we don't have a head but
2438          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
2439          * submitted sync once the chain is complete. If none of those
2440          * conditions are true (normal request), then just queue it.
2441          */
2442         if (*link) {
2443                 struct io_kiocb *prev = *link;
2444
2445                 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
2446                 if (!sqe_copy) {
2447                         ret = -EAGAIN;
2448                         goto err_req;
2449                 }
2450
2451                 s->sqe = sqe_copy;
2452                 memcpy(&req->submit, s, sizeof(*s));
2453                 list_add_tail(&req->list, &prev->link_list);
2454         } else if (s->sqe->flags & IOSQE_IO_LINK) {
2455                 req->flags |= REQ_F_LINK;
2456
2457                 memcpy(&req->submit, s, sizeof(*s));
2458                 INIT_LIST_HEAD(&req->link_list);
2459                 *link = req;
2460         } else {
2461                 io_queue_sqe(ctx, req, s, force_nonblock);
2462         }
2463 }
2464
2465 /*
2466  * Batched submission is done, ensure local IO is flushed out.
2467  */
2468 static void io_submit_state_end(struct io_submit_state *state)
2469 {
2470         blk_finish_plug(&state->plug);
2471         io_file_put(state);
2472         if (state->free_reqs)
2473                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
2474                                         &state->reqs[state->cur_req]);
2475 }
2476
2477 /*
2478  * Start submission side cache.
2479  */
2480 static void io_submit_state_start(struct io_submit_state *state,
2481                                   struct io_ring_ctx *ctx, unsigned max_ios)
2482 {
2483         blk_start_plug(&state->plug);
2484         state->free_reqs = 0;
2485         state->file = NULL;
2486         state->ios_left = max_ios;
2487 }
2488
2489 static void io_commit_sqring(struct io_ring_ctx *ctx)
2490 {
2491         struct io_rings *rings = ctx->rings;
2492
2493         if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
2494                 /*
2495                  * Ensure any loads from the SQEs are done at this point,
2496                  * since once we write the new head, the application could
2497                  * write new data to them.
2498                  */
2499                 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
2500         }
2501 }
2502
2503 /*
2504  * Fetch an sqe, if one is available. Note that s->sqe will point to memory
2505  * that is mapped by userspace. This means that care needs to be taken to
2506  * ensure that reads are stable, as we cannot rely on userspace always
2507  * being a good citizen. If members of the sqe are validated and then later
2508  * used, it's important that those reads are done through READ_ONCE() to
2509  * prevent a re-load down the line.
2510  */
2511 static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
2512 {
2513         struct io_rings *rings = ctx->rings;
2514         u32 *sq_array = ctx->sq_array;
2515         unsigned head;
2516
2517         /*
2518          * The cached sq head (or cq tail) serves two purposes:
2519          *
2520          * 1) allows us to batch the cost of updating the user visible
2521          *    head updates.
2522          * 2) allows the kernel side to track the head on its own, even
2523          *    though the application is the one updating it.
2524          */
2525         head = ctx->cached_sq_head;
2526         /* make sure SQ entry isn't read before tail */
2527         if (head == smp_load_acquire(&rings->sq.tail))
2528                 return false;
2529
2530         head = READ_ONCE(sq_array[head & ctx->sq_mask]);
2531         if (head < ctx->sq_entries) {
2532                 s->index = head;
2533                 s->sqe = &ctx->sq_sqes[head];
2534                 s->sequence = ctx->cached_sq_head;
2535                 ctx->cached_sq_head++;
2536                 return true;
2537         }
2538
2539         /* drop invalid entries */
2540         ctx->cached_sq_head++;
2541         rings->sq_dropped++;
2542         return false;
2543 }
2544
2545 static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
2546                           unsigned int nr, bool has_user, bool mm_fault)
2547 {
2548         struct io_submit_state state, *statep = NULL;
2549         struct io_kiocb *link = NULL;
2550         struct io_kiocb *shadow_req = NULL;
2551         bool prev_was_link = false;
2552         int i, submitted = 0;
2553
2554         if (nr > IO_PLUG_THRESHOLD) {
2555                 io_submit_state_start(&state, ctx, nr);
2556                 statep = &state;
2557         }
2558
2559         for (i = 0; i < nr; i++) {
2560                 /*
2561                  * If previous wasn't linked and we have a linked command,
2562                  * that's the end of the chain. Submit the previous link.
2563                  */
2564                 if (!prev_was_link && link) {
2565                         io_queue_link_head(ctx, link, &link->submit, shadow_req,
2566                                                 true);
2567                         link = NULL;
2568                         shadow_req = NULL;
2569                 }
2570                 prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
2571
2572                 if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) {
2573                         if (!shadow_req) {
2574                                 shadow_req = io_get_req(ctx, NULL);
2575                                 if (unlikely(!shadow_req))
2576                                         goto out;
2577                                 shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
2578                                 refcount_dec(&shadow_req->refs);
2579                         }
2580                         shadow_req->sequence = sqes[i].sequence;
2581                 }
2582
2583 out:
2584                 if (unlikely(mm_fault)) {
2585                         io_cqring_add_event(ctx, sqes[i].sqe->user_data,
2586                                                 -EFAULT);
2587                 } else {
2588                         sqes[i].has_user = has_user;
2589                         sqes[i].needs_lock = true;
2590                         sqes[i].needs_fixed_file = true;
2591                         io_submit_sqe(ctx, &sqes[i], statep, &link, true);
2592                         submitted++;
2593                 }
2594         }
2595
2596         if (link)
2597                 io_queue_link_head(ctx, link, &link->submit, shadow_req, true);
2598         if (statep)
2599                 io_submit_state_end(&state);
2600
2601         return submitted;
2602 }
2603
/*
 * Thread function of the submission queue polling thread; @data is the
 * struct io_ring_ctx it serves.
 *
 * Until asked to park, the loop:
 *  - accounts for completions of previously submitted requests (polling
 *    for them when the ring uses IORING_SETUP_IOPOLL),
 *  - pulls up to ARRAY_SIZE(sqes) entries off the SQ ring,
 *  - attaches to ctx->sqo_mm if io_sqe_needs_user() says any entry needs it,
 *  - submits the batch and commits the new SQ head.
 *
 * When the ring stays empty past the idle timeout, the thread drops the mm,
 * sets IORING_SQ_NEED_WAKEUP in the shared SQ flags, re-checks the ring and
 * then sleeps on ctx->sqo_wait.
 *
 * Always returns 0.
 */
2604 static int io_sq_thread(void *data)
2605 {
2606         struct sqe_submit sqes[IO_IOPOLL_BATCH];
2607         struct io_ring_ctx *ctx = data;
2608         struct mm_struct *cur_mm = NULL;
2609         mm_segment_t old_fs;
2610         DEFINE_WAIT(wait);
2611         unsigned inflight;
2612         unsigned long timeout;
2613
        /* Signal the completion that io_sq_thread_stop() waits on. */
2614         complete(&ctx->sqo_thread_started);
2615
        /* Run with USER_DS for the thread's lifetime; restored before exit. */
2616         old_fs = get_fs();
2617         set_fs(USER_DS);
2618
2619         timeout = inflight = 0;
2620         while (!kthread_should_park()) {
2621                 bool all_fixed, mm_fault = false;
2622                 int i;
2623
2624                 if (inflight) {
2625                         unsigned nr_events = 0;
2626
2627                         if (ctx->flags & IORING_SETUP_IOPOLL) {
2628                                 io_iopoll_check(ctx, &nr_events, 0);
2629                         } else {
2630                                 /*
2631                                  * Normal IO, just pretend everything completed.
2632                                  * We don't have to poll completions for that.
2633                                  */
2634                                 nr_events = inflight;
2635                         }
2636
2637                         inflight -= nr_events;
                        /* Everything completed: (re)start the idle period. */
2638                         if (!inflight)
2639                                 timeout = jiffies + ctx->sq_thread_idle;
2640                 }
2641
2642                 if (!io_get_sqring(ctx, &sqes[0])) {
2643                         /*
2644                          * We're polling. If we're within the defined idle
2645                          * period, then let us spin without work before going
2646                          * to sleep.
2647                          */
2648                         if (inflight || !time_after(jiffies, timeout)) {
2649                                 cond_resched();
2650                                 continue;
2651                         }
2652
2653                         /*
2654                          * Drop cur_mm before scheduling, we can't hold it for
2655                          * long periods (or over schedule()). Do this before
2656                          * adding ourselves to the waitqueue, as the unuse/drop
2657                          * may sleep.
2658                          */
2659                         if (cur_mm) {
2660                                 unuse_mm(cur_mm);
2661                                 mmput(cur_mm);
2662                                 cur_mm = NULL;
2663                         }
2664
2665                         prepare_to_wait(&ctx->sqo_wait, &wait,
2666                                                 TASK_INTERRUPTIBLE);
2667
2668                         /* Tell userspace we may need a wakeup call */
                        /*
                         * NOTE(review): plain read-modify-write on a field in
                         * the shared ring; the file header asks for
                         * READ/WRITE_ONCE() on shared data - confirm whether
                         * this access should be annotated.
                         */
2669                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
2670                         /* make sure to read SQ tail after writing flags */
2671                         smp_mb();
2672
                        /* Re-check the ring now that the flag is visible. */
2673                         if (!io_get_sqring(ctx, &sqes[0])) {
2674                                 if (kthread_should_park()) {
2675                                         finish_wait(&ctx->sqo_wait, &wait);
2676                                         break;
2677                                 }
2678                                 if (signal_pending(current))
2679                                         flush_signals(current);
2680                                 schedule();
2681                                 finish_wait(&ctx->sqo_wait, &wait);
2682
2683                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
2684                                 continue;
2685                         }
2686                         finish_wait(&ctx->sqo_wait, &wait);
2687
2688                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
2689                 }
2690
                /*
                 * sqes[0] is valid at this point. Keep fetching until the
                 * ring runs dry or the local array is full, noting whether
                 * any entry needs the user mm.
                 */
2691                 i = 0;
2692                 all_fixed = true;
2693                 do {
2694                         if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
2695                                 all_fixed = false;
2696
2697                         i++;
2698                         if (i == ARRAY_SIZE(sqes))
2699                                 break;
2700                 } while (io_get_sqring(ctx, &sqes[i]));
2701
2702                 /* Unless all new commands are FIXED regions, grab mm */
2703                 if (!all_fixed && !cur_mm) {
                        /* Failing to get the mm makes the batch fail with -EFAULT. */
2704                         mm_fault = !mmget_not_zero(ctx->sqo_mm);
2705                         if (!mm_fault) {
2706                                 use_mm(ctx->sqo_mm);
2707                                 cur_mm = ctx->sqo_mm;
2708                         }
2709                 }
2710
2711                 inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
2712                                                 mm_fault);
2713
2714                 /* Commit SQ ring head once we've consumed all SQEs */
2715                 io_commit_sqring(ctx);
2716         }
2717
2718         set_fs(old_fs);
2719         if (cur_mm) {
2720                 unuse_mm(cur_mm);
2721                 mmput(cur_mm);
2722         }
2723
        /* The loop only ends on a park request; honour it before returning. */
2724         kthread_parkme();
2725
2726         return 0;
2727 }
2728
2729 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
2730                           bool block_for_last)
2731 {
2732         struct io_submit_state state, *statep = NULL;
2733         struct io_kiocb *link = NULL;
2734         struct io_kiocb *shadow_req = NULL;
2735         bool prev_was_link = false;
2736         int i, submit = 0;
2737
2738         if (to_submit > IO_PLUG_THRESHOLD) {
2739                 io_submit_state_start(&state, ctx, to_submit);
2740                 statep = &state;
2741         }
2742
2743         for (i = 0; i < to_submit; i++) {
2744                 bool force_nonblock = true;
2745                 struct sqe_submit s;
2746
2747                 if (!io_get_sqring(ctx, &s))
2748                         break;
2749
2750                 /*
2751                  * If previous wasn't linked and we have a linked command,
2752                  * that's the end of the chain. Submit the previous link.
2753                  */
2754                 if (!prev_was_link && link) {
2755                         io_queue_link_head(ctx, link, &link->submit, shadow_req,
2756                                                 force_nonblock);
2757                         link = NULL;
2758                         shadow_req = NULL;
2759                 }
2760                 prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
2761
2762                 if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
2763                         if (!shadow_req) {
2764                                 shadow_req = io_get_req(ctx, NULL);
2765                                 if (unlikely(!shadow_req))
2766                                         goto out;
2767                                 shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
2768                                 refcount_dec(&shadow_req->refs);
2769                         }
2770                         shadow_req->sequence = s.sequence;
2771                 }
2772
2773 out:
2774                 s.has_user = true;
2775                 s.needs_lock = false;
2776                 s.needs_fixed_file = false;
2777                 submit++;
2778
2779                 /*
2780                  * The caller will block for events after submit, submit the
2781                  * last IO non-blocking. This is either the only IO it's
2782                  * submitting, or it already submitted the previous ones. This
2783                  * improves performance by avoiding an async punt that we don't
2784                  * need to do.
2785                  */
2786                 if (block_for_last && submit == to_submit)
2787                         force_nonblock = false;
2788
2789                 io_submit_sqe(ctx, &s, statep, &link, force_nonblock);
2790         }
2791         io_commit_sqring(ctx);
2792
2793         if (link)
2794                 io_queue_link_head(ctx, link, &link->submit, shadow_req,
2795                                         !block_for_last);
2796         if (statep)
2797                 io_submit_state_end(statep);
2798
2799         return submit;
2800 }
2801
2802 struct io_wait_queue {
2803         struct wait_queue_entry wq;
2804         struct io_ring_ctx *ctx;
2805         unsigned to_wait;
2806         unsigned nr_timeouts;
2807 };
2808
2809 static inline bool io_should_wake(struct io_wait_queue *iowq)
2810 {
2811         struct io_ring_ctx *ctx = iowq->ctx;
2812
2813         /*
2814          * Wake up if we have enough events, or if a timeout occured since we
2815          * started waiting. For timeouts, we always want to return to userspace,
2816          * regardless of event count.
2817          */
2818         return io_cqring_events(ctx->rings) >= iowq->to_wait ||
2819                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
2820 }
2821
2822 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
2823                             int wake_flags, void *key)
2824 {
2825         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
2826                                                         wq);
2827
2828         if (!io_should_wake(iowq))
2829                 return -1;
2830
2831         return autoremove_wake_function(curr, mode, wake_flags, key);
2832 }
2833
2834 /*
2835  * Wait until events become available, if we don't already have some. The
2836  * application must reap them itself, as they reside on the shared cq ring.
2837  */
2838 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
2839                           const sigset_t __user *sig, size_t sigsz)
2840 {
2841         struct io_wait_queue iowq = {
2842                 .wq = {
2843                         .private        = current,
2844                         .func           = io_wake_function,
2845                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
2846                 },
2847                 .ctx            = ctx,
2848                 .to_wait        = min_events,
2849         };
2850         struct io_rings *rings = ctx->rings;
2851         int ret;
2852
2853         if (io_cqring_events(rings) >= min_events)
2854                 return 0;
2855
2856         if (sig) {
2857 #ifdef CONFIG_COMPAT
2858                 if (in_compat_syscall())
2859                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
2860                                                       sigsz);
2861                 else
2862 #endif
2863                         ret = set_user_sigmask(sig, sigsz);
2864
2865                 if (ret)
2866                         return ret;
2867         }
2868
2869         ret = 0;
2870         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
2871         do {
2872                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
2873                                                 TASK_INTERRUPTIBLE);
2874                 if (io_should_wake(&iowq))
2875                         break;
2876                 schedule();
2877                 if (signal_pending(current)) {
2878                         ret = -ERESTARTSYS;
2879                         break;
2880                 }
2881         } while (1);
2882         finish_wait(&ctx->wait, &iowq.wq);
2883
2884         restore_saved_sigmask_unless(ret == -ERESTARTSYS);
2885         if (ret == -ERESTARTSYS)
2886                 ret = -EINTR;
2887
2888         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
2889 }
2890
2891 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
2892 {
2893 #if defined(CONFIG_UNIX)
2894         if (ctx->ring_sock) {
2895                 struct sock *sock = ctx->ring_sock->sk;
2896                 struct sk_buff *skb;
2897
2898                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
2899                         kfree_skb(skb);
2900         }
2901 #else
2902         int i;
2903
2904         for (i = 0; i < ctx->nr_user_files; i++)
2905                 fput(ctx->user_files[i]);
2906 #endif
2907 }
2908
2909 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
2910 {
2911         if (!ctx->user_files)
2912                 return -ENXIO;
2913
2914         __io_sqe_files_unregister(ctx);
2915         kfree(ctx->user_files);
2916         ctx->user_files = NULL;
2917         ctx->nr_user_files = 0;
2918         return 0;
2919 }
2920
2921 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
2922 {
2923         if (ctx->sqo_thread) {
2924                 wait_for_completion(&ctx->sqo_thread_started);
2925                 /*
2926                  * The park is a bit of a work-around, without it we get
2927                  * warning spews on shutdown with SQPOLL set and affinity
2928                  * set to a single CPU.
2929                  */
2930                 kthread_park(ctx->sqo_thread);
2931                 kthread_stop(ctx->sqo_thread);
2932                 ctx->sqo_thread = NULL;
2933         }
2934 }
2935
2936 static void io_finish_async(struct io_ring_ctx *ctx)
2937 {
2938         int i;
2939
2940         io_sq_thread_stop(ctx);
2941
2942         for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
2943                 if (ctx->sqo_wq[i]) {
2944                         destroy_workqueue(ctx->sqo_wq[i]);
2945                         ctx->sqo_wq[i] = NULL;
2946                 }
2947         }
2948 }
2949
2950 #if defined(CONFIG_UNIX)
2951 static void io_destruct_skb(struct sk_buff *skb)
2952 {
2953         struct io_ring_ctx *ctx = skb->sk->sk_user_data;
2954         int i;
2955
2956         for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
2957                 if (ctx->sqo_wq[i])
2958                         flush_workqueue(ctx->sqo_wq[i]);
2959
2960         unix_destruct_scm(skb);
2961 }
2962
2963 /*
2964  * Ensure the UNIX gc is aware of our file set, so we are certain that
2965  * the io_uring can be safely unregistered on process exit, even if we have
2966  * loops in the file referencing.
2967  */
2968 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
2969 {
2970         struct sock *sk = ctx->ring_sock->sk;
2971         struct scm_fp_list *fpl;
2972         struct sk_buff *skb;
2973         int i;
2974
2975         if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
2976                 unsigned long inflight = ctx->user->unix_inflight + nr;
2977
2978                 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
2979                         return -EMFILE;
2980         }
2981
2982         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
2983         if (!fpl)
2984                 return -ENOMEM;
2985
2986         skb = alloc_skb(0, GFP_KERNEL);
2987         if (!skb) {
2988                 kfree(fpl);
2989                 return -ENOMEM;
2990         }
2991
2992         skb->sk = sk;
2993         skb->destructor = io_destruct_skb;
2994
2995         fpl->user = get_uid(ctx->user);
2996         for (i = 0; i < nr; i++) {
2997                 fpl->fp[i] = get_file(ctx->user_files[i + offset]);
2998                 unix_inflight(fpl->user, fpl->fp[i]);
2999         }
3000
3001         fpl->max = fpl->count = nr;
3002         UNIXCB(skb).fp = fpl;
3003         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
3004         skb_queue_head(&sk->sk_receive_queue, skb);
3005
3006         for (i = 0; i < nr; i++)
3007                 fput(fpl->fp[i]);
3008
3009         return 0;
3010 }
3011
3012 /*
3013  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
3014  * causes regular reference counting to break down. We rely on the UNIX
3015  * garbage collection to take care of this problem for us.
3016  */
3017 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
3018 {
3019         unsigned left, total;
3020         int ret = 0;
3021
3022         total = 0;
3023         left = ctx->nr_user_files;
3024         while (left) {
3025                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
3026
3027                 ret = __io_sqe_files_scm(ctx, this_files, total);
3028                 if (ret)
3029                         break;
3030                 left -= this_files;
3031                 total += this_files;
3032         }
3033
3034         if (!ret)
3035                 return 0;
3036
3037         while (total < ctx->nr_user_files) {
3038                 fput(ctx->user_files[total]);
3039                 total++;
3040         }
3041
3042         return ret;
3043 }
3044 #else
/*
 * Variant built when CONFIG_UNIX is not defined: performs no work and
 * always returns 0.
 */
3045 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
3046 {
3047         return 0;
3048 }
3049 #endif
3050
3051 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
3052                                  unsigned nr_args)
3053 {
3054         __s32 __user *fds = (__s32 __user *) arg;
3055         int fd, ret = 0;
3056         unsigned i;
3057
3058         if (ctx->user_files)
3059                 return -EBUSY;
3060         if (!nr_args)
3061                 return -EINVAL;
3062         if (nr_args > IORING_MAX_FIXED_FILES)
3063                 return -EMFILE;
3064
3065         ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
3066         if (!ctx->user_files)
3067                 return -ENOMEM;
3068
3069         for (i = 0; i < nr_args; i++) {
3070                 ret = -EFAULT;
3071                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
3072                         break;
3073
3074                 ctx->user_files[i] = fget(fd);
3075
3076                 ret = -EBADF;
3077                 if (!ctx->user_files[i])
3078                         break;
3079                 /*
3080                  * Don't allow io_uring instances to be registered. If UNIX
3081                  * isn't enabled, then this causes a reference cycle and this
3082                  * instance can never get freed. If UNIX is enabled we'll
3083                  * handle it just fine, but there's still no point in allowing
3084                  * a ring fd as it doesn't support regular read/write anyway.
3085                  */
3086                 if (ctx->user_files[i]->f_op == &io_uring_fops) {
3087                         fput(ctx->user_files[i]);
3088                         break;
3089                 }
3090                 ctx->nr_user_files++;
3091                 ret = 0;
3092         }
3093
3094         if (ret) {
3095                 for (i = 0; i < ctx->nr_user_files; i++)
3096                         fput(ctx->user_files[i]);
3097
3098                 kfree(ctx->user_files);
3099                 ctx->user_files = NULL;
3100                 ctx->nr_user_files = 0;
3101                 return ret;
3102         }
3103
3104         ret = io_sqe_files_scm(ctx);
3105         if (ret)
3106                 io_sqe_files_unregister(ctx);
3107
3108         return ret;
3109 }
3110
3111 static int io_sq_offload_start(struct io_ring_ctx *ctx,
3112                                struct io_uring_params *p)
3113 {
3114         int ret;
3115
3116         init_waitqueue_head(&ctx->sqo_wait);
3117         mmgrab(current->mm);
3118         ctx->sqo_mm = current->mm;
3119
3120         if (ctx->flags & IORING_SETUP_SQPOLL) {
3121                 ret = -EPERM;
3122                 if (!capable(CAP_SYS_ADMIN))
3123                         goto err;
3124
3125                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
3126                 if (!ctx->sq_thread_idle)
3127                         ctx->sq_thread_idle = HZ;
3128
3129                 if (p->flags & IORING_SETUP_SQ_AFF) {
3130                         int cpu = p->sq_thread_cpu;
3131
3132                         ret = -EINVAL;
3133                         if (cpu >= nr_cpu_ids)
3134                                 goto err;
3135                         if (!cpu_online(cpu))
3136                                 goto err;
3137
3138                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
3139                                                         ctx, cpu,
3140                                                         "io_uring-sq");
3141                 } else {
3142                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
3143                                                         "io_uring-sq");
3144                 }
3145                 if (IS_ERR(ctx->sqo_thread)) {
3146                         ret = PTR_ERR(ctx->sqo_thread);
3147                         ctx->sqo_thread = NULL;
3148                         goto err;
3149                 }
3150                 wake_up_process(ctx->sqo_thread);
3151         } else if (p->flags & IORING_SETUP_SQ_AFF) {
3152                 /* Can't have SQ_AFF without SQPOLL */
3153                 ret = -EINVAL;
3154                 goto err;
3155         }
3156
3157         /* Do QD, or 2 * CPUS, whatever is smallest */
3158         ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
3159                         WQ_UNBOUND | WQ_FREEZABLE,
3160                         min(ctx->sq_entries - 1, 2 * num_online_cpus()));
3161         if (!ctx->sqo_wq[0]) {
3162                 ret = -ENOMEM;
3163                 goto err;
3164         }
3165
3166         /*
3167          * This is for buffered writes, where we want to limit the parallelism
3168          * due to file locking in file systems. As "normal" buffered writes
3169          * should parellelize on writeout quite nicely, limit us to having 2
3170          * pending. This avoids massive contention on the inode when doing
3171          * buffered async writes.
3172          */
3173         ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
3174                                                 WQ_UNBOUND | WQ_FREEZABLE, 2);
3175         if (!ctx->sqo_wq[1]) {
3176                 ret = -ENOMEM;
3177                 goto err;
3178         }
3179
3180         return 0;
3181 err:
3182         io_finish_async(ctx);
3183         mmdrop(ctx->sqo_mm);
3184         ctx->sqo_mm = NULL;
3185         return ret;
3186 }
3187
/*
 * Atomically subtract @nr_pages from @user's locked_vm counter.
 */
3188 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
3189 {
3190         atomic_long_sub(nr_pages, &user->locked_vm);
3191 }
3192
3193 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
3194 {
3195         unsigned long page_limit, cur_pages, new_pages;
3196
3197         /* Don't allow more pages than we can safely lock */
3198         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
3199
3200         do {
3201                 cur_pages = atomic_long_read(&user->locked_vm);
3202                 new_pages = cur_pages + nr_pages;
3203                 if (new_pages > page_limit)
3204                         return -ENOMEM;
3205         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
3206                                         new_pages) != cur_pages);
3207
3208         return 0;
3209 }
3210
/*
 * Drop a reference on the compound page backing @ptr and free it when that
 * was the last one. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	struct page *page;

	if (ptr) {
		page = virt_to_head_page(ptr);
		if (put_page_testzero(page))
			free_compound_page(page);
	}
}
3222
3223 static void *io_mem_alloc(size_t size)
3224 {
3225         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
3226                                 __GFP_NORETRY;
3227
3228         return (void *) __get_free_pages(gfp_flags, get_order(size));
3229 }
3230
3231 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
3232                                 size_t *sq_offset)
3233 {
3234         struct io_rings *rings;
3235         size_t off, sq_array_size;
3236
3237         off = struct_size(rings, cqes, cq_entries);
3238         if (off == SIZE_MAX)
3239                 return SIZE_MAX;
3240
3241 #ifdef CONFIG_SMP
3242         off = ALIGN(off, SMP_CACHE_BYTES);
3243         if (off == 0)
3244                 return SIZE_MAX;
3245 #endif
3246
3247         sq_array_size = array_size(sizeof(u32), sq_entries);
3248         if (sq_array_size == SIZE_MAX)
3249                 return SIZE_MAX;
3250
3251         if (check_add_overflow(off, sq_array_size, &off))
3252                 return SIZE_MAX;
3253
3254         if (sq_offset)
3255                 *sq_offset = off;
3256
3257         return off;
3258 }
3259
3260 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
3261 {
3262         size_t pages;
3263
3264         pages = (size_t)1 << get_order(
3265                 rings_size(sq_entries, cq_entries, NULL));
3266         pages += (size_t)1 << get_order(
3267                 array_size(sizeof(struct io_uring_sqe), sq_entries));
3268
3269         return pages;
3270 }
3271
3272 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
3273 {
3274         int i, j;
3275
3276         if (!ctx->user_bufs)
3277                 return -ENXIO;
3278
3279         for (i = 0; i < ctx->nr_user_bufs; i++) {
3280                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
3281
3282                 for (j = 0; j < imu->nr_bvecs; j++)
3283                         put_user_page(imu->bvec[j].bv_page);
3284
3285                 if (ctx->account_mem)
3286                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
3287                 kvfree(imu->bvec);
3288                 imu->nr_bvecs = 0;
3289         }
3290
3291         kfree(ctx->user_bufs);
3292         ctx->user_bufs = NULL;
3293         ctx->nr_user_bufs = 0;
3294         return 0;
3295 }
3296
3297 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
3298                        void __user *arg, unsigned index)
3299 {
3300         struct iovec __user *src;
3301
3302 #ifdef CONFIG_COMPAT
3303         if (ctx->compat) {
3304                 struct compat_iovec __user *ciovs;
3305                 struct compat_iovec ciov;
3306
3307                 ciovs = (struct compat_iovec __user *) arg;
3308                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
3309                         return -EFAULT;
3310
3311                 dst->iov_base = (void __user *) (unsigned long) ciov.iov_base;
3312                 dst->iov_len = ciov.iov_len;
3313                 return 0;
3314         }
3315 #endif
3316         src = (struct iovec __user *) arg;
3317         if (copy_from_user(dst, &src[index], sizeof(*dst)))
3318                 return -EFAULT;
3319         return 0;
3320 }
3321
3322 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
3323                                   unsigned nr_args)
3324 {
3325         struct vm_area_struct **vmas = NULL;
3326         struct page **pages = NULL;
3327         int i, j, got_pages = 0;
3328         int ret = -EINVAL;
3329
3330         if (ctx->user_bufs)
3331                 return -EBUSY;
3332         if (!nr_args || nr_args > UIO_MAXIOV)
3333                 return -EINVAL;
3334
3335         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
3336                                         GFP_KERNEL);
3337         if (!ctx->user_bufs)
3338                 return -ENOMEM;
3339
3340         for (i = 0; i < nr_args; i++) {
3341                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
3342                 unsigned long off, start, end, ubuf;
3343                 int pret, nr_pages;
3344                 struct iovec iov;
3345                 size_t size;
3346
3347                 ret = io_copy_iov(ctx, &iov, arg, i);
3348                 if (ret)
3349                         goto err;
3350
3351                 /*
3352                  * Don't impose further limits on the size and buffer
3353                  * constraints here, we'll -EINVAL later when IO is
3354                  * submitted if they are wrong.
3355                  */
3356                 ret = -EFAULT;
3357                 if (!iov.iov_base || !iov.iov_len)
3358                         goto err;
3359
3360                 /* arbitrary limit, but we need something */
3361                 if (iov.iov_len > SZ_1G)
3362                         goto err;
3363
3364                 ubuf = (unsigned long) iov.iov_base;
3365                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
3366                 start = ubuf >> PAGE_SHIFT;
3367                 nr_pages = end - start;
3368
3369                 if (ctx->account_mem) {
3370                         ret = io_account_mem(ctx->user, nr_pages);
3371                         if (ret)
3372                                 goto err;
3373                 }
3374
3375                 ret = 0;
3376                 if (!pages || nr_pages > got_pages) {
3377                         kfree(vmas);
3378                         kfree(pages);
3379                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
3380                                                 GFP_KERNEL);
3381                         vmas = kvmalloc_array(nr_pages,
3382                                         sizeof(struct vm_area_struct *),
3383                                         GFP_KERNEL);
3384                         if (!pages || !vmas) {
3385                                 ret = -ENOMEM;
3386                                 if (ctx->account_mem)
3387                                         io_unaccount_mem(ctx->user, nr_pages);
3388                                 goto err;
3389                         }
3390                         got_pages = nr_pages;
3391                 }
3392
3393                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
3394                                                 GFP_KERNEL);
3395                 ret = -ENOMEM;
3396                 if (!imu->bvec) {
3397                         if (ctx->account_mem)
3398                                 io_unaccount_mem(ctx->user, nr_pages);
3399                         goto err;
3400                 }
3401
3402                 ret = 0;
3403                 down_read(&current->mm->mmap_sem);
3404                 pret = get_user_pages(ubuf, nr_pages,
3405                                       FOLL_WRITE | FOLL_LONGTERM,
3406                                       pages, vmas);
3407                 if (pret == nr_pages) {
3408                         /* don't support file backed memory */
3409                         for (j = 0; j < nr_pages; j++) {
3410                                 struct vm_area_struct *vma = vmas[j];
3411
3412                                 if (vma->vm_file &&
3413                                     !is_file_hugepages(vma->vm_file)) {
3414                                         ret = -EOPNOTSUPP;
3415                                         break;
3416                                 }
3417                         }
3418                 } else {
3419                         ret = pret < 0 ? pret : -EFAULT;
3420                 }
3421                 up_read(&current->mm->mmap_sem);
3422                 if (ret) {
3423                         /*
3424                          * if we did partial map, or found file backed vmas,
3425                          * release any pages we did get
3426                          */
3427                         if (pret > 0)
3428                                 put_user_pages(pages, pret);
3429                         if (ctx->account_mem)
3430                                 io_unaccount_mem(ctx->user, nr_pages);
3431                         kvfree(imu->bvec);
3432                         goto err;
3433                 }
3434
3435                 off = ubuf & ~PAGE_MASK;
3436                 size = iov.iov_len;
3437                 for (j = 0; j < nr_pages; j++) {
3438                         size_t vec_len;
3439
3440                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
3441                         imu->bvec[j].bv_page = pages[j];
3442                         imu->bvec[j].bv_len = vec_len;
3443                         imu->bvec[j].bv_offset = off;
3444                         off = 0;
3445                         size -= vec_len;
3446                 }
3447                 /* store original address for later verification */
3448                 imu->ubuf = ubuf;
3449                 imu->len = iov.iov_len;
3450                 imu->nr_bvecs = nr_pages;
3451
3452                 ctx->nr_user_bufs++;
3453         }
3454         kvfree(pages);
3455         kvfree(vmas);
3456         return 0;
3457 err:
3458         kvfree(pages);
3459         kvfree(vmas);
3460         io_sqe_buffer_unregister(ctx);
3461         return ret;
3462 }
3463
3464 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
3465 {
3466         __s32 __user *fds = arg;
3467         int fd;
3468
3469         if (ctx->cq_ev_fd)
3470                 return -EBUSY;
3471
3472         if (copy_from_user(&fd, fds, sizeof(*fds)))
3473                 return -EFAULT;
3474
3475         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
3476         if (IS_ERR(ctx->cq_ev_fd)) {
3477                 int ret = PTR_ERR(ctx->cq_ev_fd);
3478                 ctx->cq_ev_fd = NULL;
3479                 return ret;
3480         }
3481
3482         return 0;
3483 }
3484
3485 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
3486 {
3487         if (ctx->cq_ev_fd) {
3488                 eventfd_ctx_put(ctx->cq_ev_fd);
3489                 ctx->cq_ev_fd = NULL;
3490                 return 0;
3491         }
3492
3493         return -ENXIO;
3494 }
3495
3496 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
3497 {
3498         io_finish_async(ctx);
3499         if (ctx->sqo_mm)
3500                 mmdrop(ctx->sqo_mm);
3501
3502         io_iopoll_reap_events(ctx);
3503         io_sqe_buffer_unregister(ctx);
3504         io_sqe_files_unregister(ctx);
3505         io_eventfd_unregister(ctx);
3506
3507 #if defined(CONFIG_UNIX)
3508         if (ctx->ring_sock) {
3509                 ctx->ring_sock->file = NULL; /* so that iput() is called */
3510                 sock_release(ctx->ring_sock);
3511         }
3512 #endif
3513
3514         io_mem_free(ctx->rings);
3515         io_mem_free(ctx->sq_sqes);
3516
3517         percpu_ref_exit(&ctx->refs);
3518         if (ctx->account_mem)
3519                 io_unaccount_mem(ctx->user,
3520                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
3521         free_uid(ctx->user);
3522         kfree(ctx);
3523 }
3524
3525 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
3526 {
3527         struct io_ring_ctx *ctx = file->private_data;
3528         __poll_t mask = 0;
3529
3530         poll_wait(file, &ctx->cq_wait, wait);
3531         /*
3532          * synchronizes with barrier from wq_has_sleeper call in
3533          * io_commit_cqring
3534          */
3535         smp_rmb();
3536         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
3537             ctx->rings->sq_ring_entries)
3538                 mask |= EPOLLOUT | EPOLLWRNORM;
3539         if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
3540                 mask |= EPOLLIN | EPOLLRDNORM;
3541
3542         return mask;
3543 }
3544
3545 static int io_uring_fasync(int fd, struct file *file, int on)
3546 {
3547         struct io_ring_ctx *ctx = file->private_data;
3548
3549         return fasync_helper(fd, file, on, &ctx->cq_fasync);
3550 }
3551
3552 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
3553 {
3554         mutex_lock(&ctx->uring_lock);
3555         percpu_ref_kill(&ctx->refs);
3556         mutex_unlock(&ctx->uring_lock);
3557
3558         io_kill_timeouts(ctx);
3559         io_poll_remove_all(ctx);
3560         io_iopoll_reap_events(ctx);
3561         wait_for_completion(&ctx->ctx_done);
3562         io_ring_ctx_free(ctx);
3563 }
3564
3565 static int io_uring_release(struct inode *inode, struct file *file)
3566 {
3567         struct io_ring_ctx *ctx = file->private_data;
3568
3569         file->private_data = NULL;
3570         io_ring_ctx_wait_and_kill(ctx);
3571         return 0;
3572 }
3573
3574 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
3575 {
3576         loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
3577         unsigned long sz = vma->vm_end - vma->vm_start;
3578         struct io_ring_ctx *ctx = file->private_data;
3579         unsigned long pfn;
3580         struct page *page;
3581         void *ptr;
3582
3583         switch (offset) {
3584         case IORING_OFF_SQ_RING:
3585         case IORING_OFF_CQ_RING:
3586                 ptr = ctx->rings;
3587                 break;
3588         case IORING_OFF_SQES:
3589                 ptr = ctx->sq_sqes;
3590                 break;
3591         default:
3592                 return -EINVAL;
3593         }
3594
3595         page = virt_to_head_page(ptr);
3596         if (sz > page_size(page))
3597                 return -EINVAL;
3598
3599         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
3600         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
3601 }
3602
3603 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
3604                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
3605                 size_t, sigsz)
3606 {
3607         struct io_ring_ctx *ctx;
3608         long ret = -EBADF;
3609         int submitted = 0;
3610         struct fd f;
3611
3612         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
3613                 return -EINVAL;
3614
3615         f = fdget(fd);
3616         if (!f.file)
3617                 return -EBADF;
3618
3619         ret = -EOPNOTSUPP;
3620         if (f.file->f_op != &io_uring_fops)
3621                 goto out_fput;
3622
3623         ret = -ENXIO;
3624         ctx = f.file->private_data;
3625         if (!percpu_ref_tryget(&ctx->refs))
3626                 goto out_fput;
3627
3628         /*
3629          * For SQ polling, the thread will do all submissions and completions.
3630          * Just return the requested submit count, and wake the thread if
3631          * we were asked to.
3632          */
3633         ret = 0;
3634         if (ctx->flags & IORING_SETUP_SQPOLL) {
3635                 if (flags & IORING_ENTER_SQ_WAKEUP)
3636                         wake_up(&ctx->sqo_wait);
3637                 submitted = to_submit;
3638         } else if (to_submit) {
3639                 bool block_for_last = false;
3640
3641                 to_submit = min(to_submit, ctx->sq_entries);
3642
3643                 /*
3644                  * Allow last submission to block in a series, IFF the caller
3645                  * asked to wait for events and we don't currently have
3646                  * enough. This potentially avoids an async punt.
3647                  */
3648                 if (to_submit == min_complete &&
3649                     io_cqring_events(ctx->rings) < min_complete)
3650                         block_for_last = true;
3651
3652                 mutex_lock(&ctx->uring_lock);
3653                 submitted = io_ring_submit(ctx, to_submit, block_for_last);
3654                 mutex_unlock(&ctx->uring_lock);
3655         }
3656         if (flags & IORING_ENTER_GETEVENTS) {
3657                 unsigned nr_events = 0;
3658
3659                 min_complete = min(min_complete, ctx->cq_entries);
3660
3661                 if (ctx->flags & IORING_SETUP_IOPOLL) {
3662                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
3663                 } else {
3664                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
3665                 }
3666         }
3667
3668         percpu_ref_put(&ctx->refs);
3669 out_fput:
3670         fdput(f);
3671         return submitted ? submitted : ret;
3672 }
3673
3674 static const struct file_operations io_uring_fops = {
3675         .release        = io_uring_release,
3676         .mmap           = io_uring_mmap,
3677         .poll           = io_uring_poll,
3678         .fasync         = io_uring_fasync,
3679 };
3680
3681 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
3682                                   struct io_uring_params *p)
3683 {
3684         struct io_rings *rings;
3685         size_t size, sq_array_offset;
3686
3687         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
3688         if (size == SIZE_MAX)
3689                 return -EOVERFLOW;
3690
3691         rings = io_mem_alloc(size);
3692         if (!rings)
3693                 return -ENOMEM;
3694
3695         ctx->rings = rings;
3696         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
3697         rings->sq_ring_mask = p->sq_entries - 1;
3698         rings->cq_ring_mask = p->cq_entries - 1;
3699         rings->sq_ring_entries = p->sq_entries;
3700         rings->cq_ring_entries = p->cq_entries;
3701         ctx->sq_mask = rings->sq_ring_mask;
3702         ctx->cq_mask = rings->cq_ring_mask;
3703         ctx->sq_entries = rings->sq_ring_entries;
3704         ctx->cq_entries = rings->cq_ring_entries;
3705
3706         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
3707         if (size == SIZE_MAX)
3708                 return -EOVERFLOW;
3709
3710         ctx->sq_sqes = io_mem_alloc(size);
3711         if (!ctx->sq_sqes)
3712                 return -ENOMEM;
3713
3714         return 0;
3715 }
3716
3717 /*
3718  * Allocate an anonymous fd, this is what constitutes the application
3719  * visible backing of an io_uring instance. The application mmaps this
3720  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
3721  * we have to tie this fd to a socket for file garbage collection purposes.
3722  */
3723 static int io_uring_get_fd(struct io_ring_ctx *ctx)
3724 {
3725         struct file *file;
3726         int ret;
3727
3728 #if defined(CONFIG_UNIX)
3729         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
3730                                 &ctx->ring_sock);
3731         if (ret)
3732                 return ret;
3733 #endif
3734
3735         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
3736         if (ret < 0)
3737                 goto err;
3738
3739         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
3740                                         O_RDWR | O_CLOEXEC);
3741         if (IS_ERR(file)) {
3742                 put_unused_fd(ret);
3743                 ret = PTR_ERR(file);
3744                 goto err;
3745         }
3746
3747 #if defined(CONFIG_UNIX)
3748         ctx->ring_sock->file = file;
3749         ctx->ring_sock->sk->sk_user_data = ctx;
3750 #endif
3751         fd_install(ret, file);
3752         return ret;
3753 err:
3754 #if defined(CONFIG_UNIX)
3755         sock_release(ctx->ring_sock);
3756         ctx->ring_sock = NULL;
3757 #endif
3758         return ret;
3759 }
3760
3761 static int io_uring_create(unsigned entries, struct io_uring_params *p)
3762 {
3763         struct user_struct *user = NULL;
3764         struct io_ring_ctx *ctx;
3765         bool account_mem;
3766         int ret;
3767
3768         if (!entries || entries > IORING_MAX_ENTRIES)
3769                 return -EINVAL;
3770
3771         /*
3772          * Use twice as many entries for the CQ ring. It's possible for the
3773          * application to drive a higher depth than the size of the SQ ring,
3774          * since the sqes are only used at submission time. This allows for
3775          * some flexibility in overcommitting a bit.
3776          */
3777         p->sq_entries = roundup_pow_of_two(entries);
3778         p->cq_entries = 2 * p->sq_entries;
3779
3780         user = get_uid(current_user());
3781         account_mem = !capable(CAP_IPC_LOCK);
3782
3783         if (account_mem) {
3784                 ret = io_account_mem(user,
3785                                 ring_pages(p->sq_entries, p->cq_entries));
3786                 if (ret) {
3787                         free_uid(user);
3788                         return ret;
3789                 }
3790         }
3791
3792         ctx = io_ring_ctx_alloc(p);
3793         if (!ctx) {
3794                 if (account_mem)
3795                         io_unaccount_mem(user, ring_pages(p->sq_entries,
3796                                                                 p->cq_entries));
3797                 free_uid(user);
3798                 return -ENOMEM;
3799         }
3800         ctx->compat = in_compat_syscall();
3801         ctx->account_mem = account_mem;
3802         ctx->user = user;
3803
3804         ret = io_allocate_scq_urings(ctx, p);
3805         if (ret)
3806                 goto err;
3807
3808         ret = io_sq_offload_start(ctx, p);
3809         if (ret)
3810                 goto err;
3811
3812         ret = io_uring_get_fd(ctx);
3813         if (ret < 0)
3814                 goto err;
3815
3816         memset(&p->sq_off, 0, sizeof(p->sq_off));
3817         p->sq_off.head = offsetof(struct io_rings, sq.head);
3818         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
3819         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
3820         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
3821         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
3822         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
3823         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
3824
3825         memset(&p->cq_off, 0, sizeof(p->cq_off));
3826         p->cq_off.head = offsetof(struct io_rings, cq.head);
3827         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
3828         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
3829         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
3830         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
3831         p->cq_off.cqes = offsetof(struct io_rings, cqes);
3832
3833         p->features = IORING_FEAT_SINGLE_MMAP;
3834         return ret;
3835 err:
3836         io_ring_ctx_wait_and_kill(ctx);
3837         return ret;
3838 }
3839
3840 /*
3841  * Sets up an aio uring context, and returns the fd. Applications asks for a
3842  * ring size, we return the actual sq/cq ring sizes (among other things) in the
3843  * params structure passed in.
3844  */
3845 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
3846 {
3847         struct io_uring_params p;
3848         long ret;
3849         int i;
3850
3851         if (copy_from_user(&p, params, sizeof(p)))
3852                 return -EFAULT;
3853         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
3854                 if (p.resv[i])
3855                         return -EINVAL;
3856         }
3857
3858         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
3859                         IORING_SETUP_SQ_AFF))
3860                 return -EINVAL;
3861
3862         ret = io_uring_create(entries, &p);
3863         if (ret < 0)
3864                 return ret;
3865
3866         if (copy_to_user(params, &p, sizeof(p)))
3867                 return -EFAULT;
3868
3869         return ret;
3870 }
3871
3872 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
3873                 struct io_uring_params __user *, params)
3874 {
3875         return io_uring_setup(entries, params);
3876 }
3877
3878 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
3879                                void __user *arg, unsigned nr_args)
3880         __releases(ctx->uring_lock)
3881         __acquires(ctx->uring_lock)
3882 {
3883         int ret;
3884
3885         /*
3886          * We're inside the ring mutex, if the ref is already dying, then
3887          * someone else killed the ctx or is already going through
3888          * io_uring_register().
3889          */
3890         if (percpu_ref_is_dying(&ctx->refs))
3891                 return -ENXIO;
3892
3893         percpu_ref_kill(&ctx->refs);
3894
3895         /*
3896          * Drop uring mutex before waiting for references to exit. If another
3897          * thread is currently inside io_uring_enter() it might need to grab
3898          * the uring_lock to make progress. If we hold it here across the drain
3899          * wait, then we can deadlock. It's safe to drop the mutex here, since
3900          * no new references will come in after we've killed the percpu ref.
3901          */
3902         mutex_unlock(&ctx->uring_lock);
3903         wait_for_completion(&ctx->ctx_done);
3904         mutex_lock(&ctx->uring_lock);
3905
3906         switch (opcode) {
3907         case IORING_REGISTER_BUFFERS:
3908                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
3909                 break;
3910         case IORING_UNREGISTER_BUFFERS:
3911                 ret = -EINVAL;
3912                 if (arg || nr_args)
3913                         break;
3914                 ret = io_sqe_buffer_unregister(ctx);
3915                 break;
3916         case IORING_REGISTER_FILES:
3917                 ret = io_sqe_files_register(ctx, arg, nr_args);
3918                 break;
3919         case IORING_UNREGISTER_FILES:
3920                 ret = -EINVAL;
3921                 if (arg || nr_args)
3922                         break;
3923                 ret = io_sqe_files_unregister(ctx);
3924                 break;
3925         case IORING_REGISTER_EVENTFD:
3926                 ret = -EINVAL;
3927                 if (nr_args != 1)
3928                         break;
3929                 ret = io_eventfd_register(ctx, arg);
3930                 break;
3931         case IORING_UNREGISTER_EVENTFD:
3932                 ret = -EINVAL;
3933                 if (arg || nr_args)
3934                         break;
3935                 ret = io_eventfd_unregister(ctx);
3936                 break;
3937         default:
3938                 ret = -EINVAL;
3939                 break;
3940         }
3941
3942         /* bring the ctx back to life */
3943         reinit_completion(&ctx->ctx_done);
3944         percpu_ref_reinit(&ctx->refs);
3945         return ret;
3946 }
3947
3948 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
3949                 void __user *, arg, unsigned int, nr_args)
3950 {
3951         struct io_ring_ctx *ctx;
3952         long ret = -EBADF;
3953         struct fd f;
3954
3955         f = fdget(fd);
3956         if (!f.file)
3957                 return -EBADF;
3958
3959         ret = -EOPNOTSUPP;
3960         if (f.file->f_op != &io_uring_fops)
3961                 goto out_fput;
3962
3963         ctx = f.file->private_data;
3964
3965         mutex_lock(&ctx->uring_lock);
3966         ret = __io_uring_register(ctx, opcode, arg, nr_args);
3967         mutex_unlock(&ctx->uring_lock);
3968 out_fput:
3969         fdput(f);
3970         return ret;
3971 }
3972
3973 static int __init io_uring_init(void)
3974 {
3975         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3976         return 0;
3977 };
3978 __initcall(io_uring_init);