fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <linux/refcount.h>
  48 #include <linux/uio.h>
  49
  50 #include <linux/sched/signal.h>
  51 #include <linux/fs.h>
  52 #include <linux/file.h>
  53 #include <linux/fdtable.h>
  54 #include <linux/mm.h>
  55 #include <linux/mman.h>
  56 #include <linux/mmu_context.h>
  57 #include <linux/percpu.h>
  58 #include <linux/slab.h>
  59 #include <linux/kthread.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73
  74 #define CREATE_TRACE_POINTS
  75 #include <trace/events/io_uring.h>
  76
  77 #include <uapi/linux/io_uring.h>
  78
  79 #include "internal.h"
  80 #include "io-wq.h"
  81
  82 #define IORING_MAX_ENTRIES      32768
  83 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  84
  85 /*
  86  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  87  */
  88 #define IORING_FILE_TABLE_SHIFT 9
  89 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
  90 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
  91 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
  92
  93 struct io_uring {
  94         u32 head ____cacheline_aligned_in_smp;
  95         u32 tail ____cacheline_aligned_in_smp;
  96 };
  97
  98 /*
  99  * This data is shared with the application through the mmap at offsets
 100  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 101  *
 102  * The offsets to the member fields are published through struct
 103  * io_sqring_offsets when calling io_uring_setup.
 104  */
 105 struct io_rings {
 106         /*
 107          * Head and tail offsets into the ring; the offsets need to be
 108          * masked to get valid indices.
 109          *
 110          * The kernel controls head of the sq ring and the tail of the cq ring,
 111          * and the application controls tail of the sq ring and the head of the
 112          * cq ring.
 113          */
 114         struct io_uring         sq, cq;
 115         /*
 116          * Bitmasks to apply to head and tail offsets (constant, equals
 117          * ring_entries - 1)
 118          */
 119         u32                     sq_ring_mask, cq_ring_mask;
 120         /* Ring sizes (constant, power of 2) */
 121         u32                     sq_ring_entries, cq_ring_entries;
 122         /*
 123          * Number of invalid entries dropped by the kernel due to
 124          * invalid index stored in array
 125          *
 126          * Written by the kernel, shouldn't be modified by the
 127          * application (i.e. get number of "new events" by comparing to
 128          * cached value).
 129          *
 130          * After a new SQ head value was read by the application this
 131          * counter includes all submissions that were dropped reaching
 132          * the new SQ head (and possibly more).
 133          */
 134         u32                     sq_dropped;
 135         /*
 136          * Runtime flags
 137          *
 138          * Written by the kernel, shouldn't be modified by the
 139          * application.
 140          *
 141          * The application needs a full memory barrier before checking
 142          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 143          */
 144         u32                     sq_flags;
 145         /*
 146          * Number of completion events lost because the queue was full;
 147          * this should be avoided by the application by making sure
 148          * there are not more requests pending than there is space in
 149          * the completion queue.
 150          *
 151          * Written by the kernel, shouldn't be modified by the
 152          * application (i.e. get number of "new events" by comparing to
 153          * cached value).
 154          *
 155          * As completion events come in out of order this counter is not
 156          * ordered with any other data.
 157          */
 158         u32                     cq_overflow;
 159         /*
 160          * Ring buffer of completion events.
 161          *
 162          * The kernel writes completion events fresh every time they are
 163          * produced, so the application is allowed to modify pending
 164          * entries.
 165          */
 166         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 167 };
 168
 169 struct io_mapped_ubuf {
 170         u64             ubuf;
 171         size_t          len;
 172         struct          bio_vec *bvec;
 173         unsigned int    nr_bvecs;
 174 };
 175
 176 struct fixed_file_table {
 177         struct file             **files;
 178 };
 179
 180 struct io_ring_ctx {
 181         struct {
 182                 struct percpu_ref       refs;
 183         } ____cacheline_aligned_in_smp;
 184
 185         struct {
 186                 unsigned int            flags;
 187                 bool                    compat;
 188                 bool                    account_mem;
 189                 bool                    cq_overflow_flushed;
 190                 bool                    drain_next;
 191
 192                 /*
 193                  * Ring buffer of indices into array of io_uring_sqe, which is
 194                  * mmapped by the application using the IORING_OFF_SQES offset.
 195                  *
 196                  * This indirection could e.g. be used to assign fixed
 197                  * io_uring_sqe entries to operations and only submit them to
 198                  * the queue when needed.
 199                  *
 200                  * The kernel modifies neither the indices array nor the entries
 201                  * array.
 202                  */
 203                 u32                     *sq_array;
 204                 unsigned                cached_sq_head;
 205                 unsigned                sq_entries;
 206                 unsigned                sq_mask;
 207                 unsigned                sq_thread_idle;
 208                 unsigned                cached_sq_dropped;
 209                 atomic_t                cached_cq_overflow;
 210                 struct io_uring_sqe     *sq_sqes;
 211
 212                 struct list_head        defer_list;
 213                 struct list_head        timeout_list;
 214                 struct list_head        cq_overflow_list;
 215
 216                 wait_queue_head_t       inflight_wait;
 217         } ____cacheline_aligned_in_smp;
 218
 219         struct io_rings *rings;
 220
 221         /* IO offload */
 222         struct io_wq            *io_wq;
 223         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 224         struct mm_struct        *sqo_mm;
 225         wait_queue_head_t       sqo_wait;
 226
 227         /*
 228          * If used, fixed file set. Writers must ensure that ->refs is dead,
 229          * readers must ensure that ->refs is alive as long as the file* is
 230          * used. Only updated through io_uring_register(2).
 231          */
 232         struct fixed_file_table *file_table;
 233         unsigned                nr_user_files;
 234
 235         /* if used, fixed mapped user buffers */
 236         unsigned                nr_user_bufs;
 237         struct io_mapped_ubuf   *user_bufs;
 238
 239         struct user_struct      *user;
 240
 241         const struct cred       *creds;
 242
 243         /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
 244         struct completion       *completions;
 245
 246         /* if all else fails... */
 247         struct io_kiocb         *fallback_req;
 248
 249 #if defined(CONFIG_UNIX)
 250         struct socket           *ring_sock;
 251 #endif
 252
 253         struct {
 254                 unsigned                cached_cq_tail;
 255                 unsigned                cq_entries;
 256                 unsigned                cq_mask;
 257                 atomic_t                cq_timeouts;
 258                 struct wait_queue_head  cq_wait;
 259                 struct fasync_struct    *cq_fasync;
 260                 struct eventfd_ctx      *cq_ev_fd;
 261         } ____cacheline_aligned_in_smp;
 262
 263         struct {
 264                 struct mutex            uring_lock;
 265                 wait_queue_head_t       wait;
 266         } ____cacheline_aligned_in_smp;
 267
 268         struct {
 269                 spinlock_t              completion_lock;
 270                 bool                    poll_multi_file;
 271                 /*
 272                  * ->poll_list is protected by the ctx->uring_lock for
 273                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 274                  * For SQPOLL, only the single threaded io_sq_thread() will
 275                  * manipulate the list, hence no extra locking is needed there.
 276                  */
 277                 struct list_head        poll_list;
 278                 struct hlist_head       *cancel_hash;
 279                 unsigned                cancel_hash_bits;
 280
 281                 spinlock_t              inflight_lock;
 282                 struct list_head        inflight_list;
 283         } ____cacheline_aligned_in_smp;
 284 };
 285
 286 /*
 287  * First field must be the file pointer in all the
 288  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 289  */
 290 struct io_poll_iocb {
 291         struct file                     *file;
 292         union {
 293                 struct wait_queue_head  *head;
 294                 u64                     addr;
 295         };
 296         __poll_t                        events;
 297         bool                            done;
 298         bool                            canceled;
 299         struct wait_queue_entry         wait;
 300 };
 301
 302 struct io_timeout_data {
 303         struct io_kiocb                 *req;
 304         struct hrtimer                  timer;
 305         struct timespec64               ts;
 306         enum hrtimer_mode               mode;
 307         u32                             seq_offset;
 308 };
 309
 310 struct io_accept {
 311         struct file                     *file;
 312         struct sockaddr __user          *addr;
 313         int __user                      *addr_len;
 314         int                             flags;
 315 };
 316
 317 struct io_sync {
 318         struct file                     *file;
 319         loff_t                          len;
 320         loff_t                          off;
 321         int                             flags;
 322 };
 323
 324 struct io_cancel {
 325         struct file                     *file;
 326         u64                             addr;
 327 };
 328
 329 struct io_timeout {
 330         struct file                     *file;
 331         u64                             addr;
 332         int                             flags;
 333         unsigned                        count;
 334 };
 335
 336 struct io_rw {
 337         /* NOTE: kiocb has the file as the first member, so don't do it here */
 338         struct kiocb                    kiocb;
 339         u64                             addr;
 340         u64                             len;
 341 };
 342
 343 struct io_connect {
 344         struct file                     *file;
 345         struct sockaddr __user          *addr;
 346         int                             addr_len;
 347 };
 348
 349 struct io_sr_msg {
 350         struct file                     *file;
 351         struct user_msghdr __user       *msg;
 352         int                             msg_flags;
 353 };
 354
 355 struct io_async_connect {
 356         struct sockaddr_storage         address;
 357 };
 358
 359 struct io_async_msghdr {
 360         struct iovec                    fast_iov[UIO_FASTIOV];
 361         struct iovec                    *iov;
 362         struct sockaddr __user          *uaddr;
 363         struct msghdr                   msg;
 364 };
 365
 366 struct io_async_rw {
 367         struct iovec                    fast_iov[UIO_FASTIOV];
 368         struct iovec                    *iov;
 369         ssize_t                         nr_segs;
 370         ssize_t                         size;
 371 };
 372
 373 struct io_async_ctx {
 374         union {
 375                 struct io_async_rw      rw;
 376                 struct io_async_msghdr  msg;
 377                 struct io_async_connect connect;
 378                 struct io_timeout_data  timeout;
 379         };
 380 };
 381
 382 /*
 383  * NOTE! Each of the iocb union members has the file pointer
 384  * as the first entry in their struct definition. So you can
 385  * access the file pointer through any of the sub-structs,
 386  * or directly as just 'ki_filp' in this struct.
 387  */
 388 struct io_kiocb {
 389         union {
 390                 struct file             *file;
 391                 struct io_rw            rw;
 392                 struct io_poll_iocb     poll;
 393                 struct io_accept        accept;
 394                 struct io_sync          sync;
 395                 struct io_cancel        cancel;
 396                 struct io_timeout       timeout;
 397                 struct io_connect       connect;
 398                 struct io_sr_msg        sr_msg;
 399         };
 400
 401         struct io_async_ctx             *io;
 402         struct file                     *ring_file;
 403         int                             ring_fd;
 404         bool                            has_user;
 405         bool                            in_async;
 406         bool                            needs_fixed_file;
 407         u8                              opcode;
 408
 409         struct io_ring_ctx      *ctx;
 410         union {
 411                 struct list_head        list;
 412                 struct hlist_node       hash_node;
 413         };
 414         struct list_head        link_list;
 415         unsigned int            flags;
 416         refcount_t              refs;
 417 #define REQ_F_NOWAIT            1       /* must not punt to workers */
 418 #define REQ_F_IOPOLL_COMPLETED  2       /* polled IO has completed */
 419 #define REQ_F_FIXED_FILE        4       /* ctx owns file */
 420 #define REQ_F_LINK_NEXT         8       /* already grabbed next link */
 421 #define REQ_F_IO_DRAIN          16      /* drain existing IO first */
 422 #define REQ_F_IO_DRAINED        32      /* drain done */
 423 #define REQ_F_LINK              64      /* linked sqes */
 424 #define REQ_F_LINK_TIMEOUT      128     /* has linked timeout */
 425 #define REQ_F_FAIL_LINK         256     /* fail rest of links */
 426 #define REQ_F_DRAIN_LINK        512     /* link should be fully drained */
 427 #define REQ_F_TIMEOUT           1024    /* timeout request */
 428 #define REQ_F_ISREG             2048    /* regular file */
 429 #define REQ_F_MUST_PUNT         4096    /* must be punted even for NONBLOCK */
 430 #define REQ_F_TIMEOUT_NOSEQ     8192    /* no timeout sequence */
 431 #define REQ_F_INFLIGHT          16384   /* on inflight list */
 432 #define REQ_F_COMP_LOCKED       32768   /* completion under lock */
 433 #define REQ_F_HARDLINK          65536   /* doesn't sever on completion < 0 */
 434         u64                     user_data;
 435         u32                     result;
 436         u32                     sequence;
 437
 438         struct list_head        inflight_entry;
 439
 440         struct io_wq_work       work;
 441 };
 442
 443 #define IO_PLUG_THRESHOLD               2
 444 #define IO_IOPOLL_BATCH                 8
 445
 446 struct io_submit_state {
 447         struct blk_plug         plug;
 448
 449         /*
 450          * io_kiocb alloc cache
 451          */
 452         void                    *reqs[IO_IOPOLL_BATCH];
 453         unsigned                int free_reqs;
 454         unsigned                int cur_req;
 455
 456         /*
 457          * File reference cache
 458          */
 459         struct file             *file;
 460         unsigned int            fd;
 461         unsigned int            has_refs;
 462         unsigned int            used_refs;
 463         unsigned int            ios_left;
 464 };
 465
 466 static void io_wq_submit_work(struct io_wq_work **workptr);
 467 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 468 static void __io_free_req(struct io_kiocb *req);
 469 static void io_put_req(struct io_kiocb *req);
 470 static void io_double_put_req(struct io_kiocb *req);
 471 static void __io_double_put_req(struct io_kiocb *req);
 472 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 473 static void io_queue_linked_timeout(struct io_kiocb *req);
 474
 475 static struct kmem_cache *req_cachep;
 476
 477 static const struct file_operations io_uring_fops;
 478
 479 struct sock *io_uring_get_socket(struct file *file)
 480 {
 481 #if defined(CONFIG_UNIX)
 482         if (file->f_op == &io_uring_fops) {
 483                 struct io_ring_ctx *ctx = file->private_data;
 484
 485                 return ctx->ring_sock->sk;
 486         }
 487 #endif
 488         return NULL;
 489 }
 490 EXPORT_SYMBOL(io_uring_get_socket);
 491
 492 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 493 {
 494         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 495
 496         complete(&ctx->completions[0]);
 497 }
 498
 499 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 500 {
 501         struct io_ring_ctx *ctx;
 502         int hash_bits;
 503
 504         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 505         if (!ctx)
 506                 return NULL;
 507
 508         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
 509         if (!ctx->fallback_req)
 510                 goto err;
 511
 512         ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
 513         if (!ctx->completions)
 514                 goto err;
 515
 516         /*
 517          * Use 5 bits less than the max cq entries, that should give us around
 518          * 32 entries per hash list if totally full and uniformly spread.
 519          */
 520         hash_bits = ilog2(p->cq_entries);
 521         hash_bits -= 5;
 522         if (hash_bits <= 0)
 523                 hash_bits = 1;
 524         ctx->cancel_hash_bits = hash_bits;
 525         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 526                                         GFP_KERNEL);
 527         if (!ctx->cancel_hash)
 528                 goto err;
 529         __hash_init(ctx->cancel_hash, 1U << hash_bits);
 530
 531         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 532                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 533                 goto err;
 534
 535         ctx->flags = p->flags;
 536         init_waitqueue_head(&ctx->cq_wait);
 537         INIT_LIST_HEAD(&ctx->cq_overflow_list);
 538         init_completion(&ctx->completions[0]);
 539         init_completion(&ctx->completions[1]);
 540         mutex_init(&ctx->uring_lock);
 541         init_waitqueue_head(&ctx->wait);
 542         spin_lock_init(&ctx->completion_lock);
 543         INIT_LIST_HEAD(&ctx->poll_list);
 544         INIT_LIST_HEAD(&ctx->defer_list);
 545         INIT_LIST_HEAD(&ctx->timeout_list);
 546         init_waitqueue_head(&ctx->inflight_wait);
 547         spin_lock_init(&ctx->inflight_lock);
 548         INIT_LIST_HEAD(&ctx->inflight_list);
 549         return ctx;
 550 err:
 551         if (ctx->fallback_req)
 552                 kmem_cache_free(req_cachep, ctx->fallback_req);
 553         kfree(ctx->completions);
 554         kfree(ctx->cancel_hash);
 555         kfree(ctx);
 556         return NULL;
 557 }
 558
 559 static inline bool __req_need_defer(struct io_kiocb *req)
 560 {
 561         struct io_ring_ctx *ctx = req->ctx;
 562
 563         return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
 564                                         + atomic_read(&ctx->cached_cq_overflow);
 565 }
 566
 567 static inline bool req_need_defer(struct io_kiocb *req)
 568 {
 569         if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
 570                 return __req_need_defer(req);
 571
 572         return false;
 573 }
 574
 575 static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
 576 {
 577         struct io_kiocb *req;
 578
 579         req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
 580         if (req && !req_need_defer(req)) {
 581                 list_del_init(&req->list);
 582                 return req;
 583         }
 584
 585         return NULL;
 586 }
 587
 588 static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
 589 {
 590         struct io_kiocb *req;
 591
 592         req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
 593         if (req) {
 594                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
 595                         return NULL;
 596                 if (!__req_need_defer(req)) {
 597                         list_del_init(&req->list);
 598                         return req;
 599                 }
 600         }
 601
 602         return NULL;
 603 }
 604
 605 static void __io_commit_cqring(struct io_ring_ctx *ctx)
 606 {
 607         struct io_rings *rings = ctx->rings;
 608
 609         if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
 610                 /* order cqe stores with ring update */
 611                 smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
 612
 613                 if (wq_has_sleeper(&ctx->cq_wait)) {
 614                         wake_up_interruptible(&ctx->cq_wait);
 615                         kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 616                 }
 617         }
 618 }
 619
 620 static inline bool io_req_needs_user(struct io_kiocb *req)
 621 {
 622         return !(req->opcode == IORING_OP_READ_FIXED ||
 623                  req->opcode == IORING_OP_WRITE_FIXED);
 624 }
 625
 626 static inline bool io_prep_async_work(struct io_kiocb *req,
 627                                       struct io_kiocb **link)
 628 {
 629         bool do_hashed = false;
 630
 631         switch (req->opcode) {
 632         case IORING_OP_WRITEV:
 633         case IORING_OP_WRITE_FIXED:
 634                 /* only regular files should be hashed for writes */
 635                 if (req->flags & REQ_F_ISREG)
 636                         do_hashed = true;
 637                 /* fall-through */
 638         case IORING_OP_READV:
 639         case IORING_OP_READ_FIXED:
 640         case IORING_OP_SENDMSG:
 641         case IORING_OP_RECVMSG:
 642         case IORING_OP_ACCEPT:
 643         case IORING_OP_POLL_ADD:
 644         case IORING_OP_CONNECT:
 645                 /*
 646                  * We know REQ_F_ISREG is not set on some of these
 647                  * opcodes, but this enables us to keep the check in
 648                  * just one place.
 649                  */
 650                 if (!(req->flags & REQ_F_ISREG))
 651                         req->work.flags |= IO_WQ_WORK_UNBOUND;
 652                 break;
 653         }
 654         if (io_req_needs_user(req))
 655                 req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 656
 657         *link = io_prep_linked_timeout(req);
 658         return do_hashed;
 659 }
 660
 661 static inline void io_queue_async_work(struct io_kiocb *req)
 662 {
 663         struct io_ring_ctx *ctx = req->ctx;
 664         struct io_kiocb *link;
 665         bool do_hashed;
 666
 667         do_hashed = io_prep_async_work(req, &link);
 668
 669         trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
 670                                         req->flags);
 671         if (!do_hashed) {
 672                 io_wq_enqueue(ctx->io_wq, &req->work);
 673         } else {
 674                 io_wq_enqueue_hashed(ctx->io_wq, &req->work,
 675                                         file_inode(req->file));
 676         }
 677
 678         if (link)
 679                 io_queue_linked_timeout(link);
 680 }
 681
 682 static void io_kill_timeout(struct io_kiocb *req)
 683 {
 684         int ret;
 685
 686         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
 687         if (ret != -1) {
 688                 atomic_inc(&req->ctx->cq_timeouts);
 689                 list_del_init(&req->list);
 690                 io_cqring_fill_event(req, 0);
 691                 io_put_req(req);
 692         }
 693 }
 694
 695 static void io_kill_timeouts(struct io_ring_ctx *ctx)
 696 {
 697         struct io_kiocb *req, *tmp;
 698
 699         spin_lock_irq(&ctx->completion_lock);
 700         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
 701                 io_kill_timeout(req);
 702         spin_unlock_irq(&ctx->completion_lock);
 703 }
 704
 705 static void io_commit_cqring(struct io_ring_ctx *ctx)
 706 {
 707         struct io_kiocb *req;
 708
 709         while ((req = io_get_timeout_req(ctx)) != NULL)
 710                 io_kill_timeout(req);
 711
 712         __io_commit_cqring(ctx);
 713
 714         while ((req = io_get_deferred_req(ctx)) != NULL) {
 715                 req->flags |= REQ_F_IO_DRAINED;
 716                 io_queue_async_work(req);
 717         }
 718 }
 719
 720 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 721 {
 722         struct io_rings *rings = ctx->rings;
 723         unsigned tail;
 724
 725         tail = ctx->cached_cq_tail;
 726         /*
 727          * writes to the cq entry need to come after reading head; the
 728          * control dependency is enough as we're using WRITE_ONCE to
 729          * fill the cq entry
 730          */
 731         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
 732                 return NULL;
 733
 734         ctx->cached_cq_tail++;
 735         return &rings->cqes[tail & ctx->cq_mask];
 736 }
 737
 738 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 739 {
 740         if (waitqueue_active(&ctx->wait))
 741                 wake_up(&ctx->wait);
 742         if (waitqueue_active(&ctx->sqo_wait))
 743                 wake_up(&ctx->sqo_wait);
 744         if (ctx->cq_ev_fd)
 745                 eventfd_signal(ctx->cq_ev_fd, 1);
 746 }
 747
 748 /* Returns true if there are no backlogged entries after the flush */
 749 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 750 {
 751         struct io_rings *rings = ctx->rings;
 752         struct io_uring_cqe *cqe;
 753         struct io_kiocb *req;
 754         unsigned long flags;
 755         LIST_HEAD(list);
 756
 757         if (!force) {
 758                 if (list_empty_careful(&ctx->cq_overflow_list))
 759                         return true;
 760                 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
 761                     rings->cq_ring_entries))
 762                         return false;
 763         }
 764
 765         spin_lock_irqsave(&ctx->completion_lock, flags);
 766
 767         /* if force is set, the ring is going away. always drop after that */
 768         if (force)
 769                 ctx->cq_overflow_flushed = true;
 770
 771         cqe = NULL;
 772         while (!list_empty(&ctx->cq_overflow_list)) {
 773                 cqe = io_get_cqring(ctx);
 774                 if (!cqe && !force)
 775                         break;
 776
 777                 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
 778                                                 list);
 779                 list_move(&req->list, &list);
 780                 if (cqe) {
 781                         WRITE_ONCE(cqe->user_data, req->user_data);
 782                         WRITE_ONCE(cqe->res, req->result);
 783                         WRITE_ONCE(cqe->flags, 0);
 784                 } else {
 785                         WRITE_ONCE(ctx->rings->cq_overflow,
 786                                 atomic_inc_return(&ctx->cached_cq_overflow));
 787                 }
 788         }
 789
 790         io_commit_cqring(ctx);
 791         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 792         io_cqring_ev_posted(ctx);
 793
 794         while (!list_empty(&list)) {
 795                 req = list_first_entry(&list, struct io_kiocb, list);
 796                 list_del(&req->list);
 797                 io_put_req(req);
 798         }
 799
 800         return cqe != NULL;
 801 }
 802
 803 static void io_cqring_fill_event(struct io_kiocb *req, long res)
 804 {
 805         struct io_ring_ctx *ctx = req->ctx;
 806         struct io_uring_cqe *cqe;
 807
 808         trace_io_uring_complete(ctx, req->user_data, res);
 809
 810         /*
 811          * If we can't get a cq entry, userspace overflowed the
 812          * submission (by quite a lot). Increment the overflow count in
 813          * the ring.
 814          */
 815         cqe = io_get_cqring(ctx);
 816         if (likely(cqe)) {
 817                 WRITE_ONCE(cqe->user_data, req->user_data);
 818                 WRITE_ONCE(cqe->res, res);
 819                 WRITE_ONCE(cqe->flags, 0);
 820         } else if (ctx->cq_overflow_flushed) {
 821                 WRITE_ONCE(ctx->rings->cq_overflow,
 822                                 atomic_inc_return(&ctx->cached_cq_overflow));
 823         } else {
 824                 refcount_inc(&req->refs);
 825                 req->result = res;
 826                 list_add_tail(&req->list, &ctx->cq_overflow_list);
 827         }
 828 }
 829
 830 static void io_cqring_add_event(struct io_kiocb *req, long res)
 831 {
 832         struct io_ring_ctx *ctx = req->ctx;
 833         unsigned long flags;
 834
 835         spin_lock_irqsave(&ctx->completion_lock, flags);
 836         io_cqring_fill_event(req, res);
 837         io_commit_cqring(ctx);
 838         spin_unlock_irqrestore(&ctx->completion_lock, flags);
 839
 840         io_cqring_ev_posted(ctx);
 841 }
 842
 843 static inline bool io_is_fallback_req(struct io_kiocb *req)
 844 {
 845         return req == (struct io_kiocb *)
 846                         ((unsigned long) req->ctx->fallback_req & ~1UL);
 847 }
 848
 849 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
 850 {
 851         struct io_kiocb *req;
 852
 853         req = ctx->fallback_req;
 854         if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
 855                 return req;
 856
 857         return NULL;
 858 }
 859
 860 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 861                                    struct io_submit_state *state)
 862 {
 863         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 864         struct io_kiocb *req;
 865
 866         if (!percpu_ref_tryget(&ctx->refs))
 867                 return NULL;
 868
 869         if (!state) {
 870                 req = kmem_cache_alloc(req_cachep, gfp);
 871                 if (unlikely(!req))
 872                         goto fallback;
 873         } else if (!state->free_reqs) {
 874                 size_t sz;
 875                 int ret;
 876
 877                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
 878                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
 879
 880                 /*
 881                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
 882                  * retry single alloc to be on the safe side.
 883                  */
 884                 if (unlikely(ret <= 0)) {
 885                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 886                         if (!state->reqs[0])
 887                                 goto fallback;
 888                         ret = 1;
 889                 }
 890                 state->free_reqs = ret - 1;
 891                 state->cur_req = 1;
 892                 req = state->reqs[0];
 893         } else {
 894                 req = state->reqs[state->cur_req];
 895                 state->free_reqs--;
 896                 state->cur_req++;
 897         }
 898
 899 got_it:
 900         req->io = NULL;
 901         req->ring_file = NULL;
 902         req->file = NULL;
 903         req->ctx = ctx;
 904         req->flags = 0;
 905         /* one is dropped after submission, the other at completion */
 906         refcount_set(&req->refs, 2);
 907         req->result = 0;
 908         INIT_IO_WORK(&req->work, io_wq_submit_work);
 909         return req;
 910 fallback:
 911         req = io_get_fallback_req(ctx);
 912         if (req)
 913                 goto got_it;
 914         percpu_ref_put(&ctx->refs);
 915         return NULL;
 916 }
 917
 918 static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 919 {
 920         if (*nr) {
 921                 kmem_cache_free_bulk(req_cachep, *nr, reqs);
 922                 percpu_ref_put_many(&ctx->refs, *nr);
 923                 *nr = 0;
 924         }
 925 }
 926
 927 static void __io_free_req(struct io_kiocb *req)
 928 {
 929         struct io_ring_ctx *ctx = req->ctx;
 930
 931         if (req->io)
 932                 kfree(req->io);
 933         if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 934                 fput(req->file);
 935         if (req->flags & REQ_F_INFLIGHT) {
 936                 unsigned long flags;
 937
 938                 spin_lock_irqsave(&ctx->inflight_lock, flags);
 939                 list_del(&req->inflight_entry);
 940                 if (waitqueue_active(&ctx->inflight_wait))
 941                         wake_up(&ctx->inflight_wait);
 942                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 943         }
 944         percpu_ref_put(&ctx->refs);
 945         if (likely(!io_is_fallback_req(req)))
 946                 kmem_cache_free(req_cachep, req);
 947         else
 948                 clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
 949 }
 950
 951 static bool io_link_cancel_timeout(struct io_kiocb *req)
 952 {
 953         struct io_ring_ctx *ctx = req->ctx;
 954         int ret;
 955
 956         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
 957         if (ret != -1) {
 958                 io_cqring_fill_event(req, -ECANCELED);
 959                 io_commit_cqring(ctx);
 960                 req->flags &= ~REQ_F_LINK;
 961                 io_put_req(req);
 962                 return true;
 963         }
 964
 965         return false;
 966 }
 967
 968 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 969 {
 970         struct io_ring_ctx *ctx = req->ctx;
 971         bool wake_ev = false;
 972
 973         /* Already got next link */
 974         if (req->flags & REQ_F_LINK_NEXT)
 975                 return;
 976
 977         /*
 978          * The list should never be empty when we are called here. But could
 979          * potentially happen if the chain is messed up, check to be on the
 980          * safe side.
 981          */
 982         while (!list_empty(&req->link_list)) {
 983                 struct io_kiocb *nxt = list_first_entry(&req->link_list,
 984                                                 struct io_kiocb, link_list);
 985
 986                 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
 987                              (nxt->flags & REQ_F_TIMEOUT))) {
 988                         list_del_init(&nxt->link_list);
 989                         wake_ev |= io_link_cancel_timeout(nxt);
 990                         req->flags &= ~REQ_F_LINK_TIMEOUT;
 991                         continue;
 992                 }
 993
 994                 list_del_init(&req->link_list);
 995                 if (!list_empty(&nxt->link_list))
 996                         nxt->flags |= REQ_F_LINK;
 997                 *nxtptr = nxt;
 998                 break;
 999         }
1000
1001         req->flags |= REQ_F_LINK_NEXT;
1002         if (wake_ev)
1003                 io_cqring_ev_posted(ctx);
1004 }
1005
1006 /*
1007  * Called if REQ_F_LINK is set, and we fail the head request
1008  */
1009 static void io_fail_links(struct io_kiocb *req)
1010 {
1011         struct io_ring_ctx *ctx = req->ctx;
1012         unsigned long flags;
1013
1014         spin_lock_irqsave(&ctx->completion_lock, flags);
1015
1016         while (!list_empty(&req->link_list)) {
1017                 struct io_kiocb *link = list_first_entry(&req->link_list,
1018                                                 struct io_kiocb, link_list);
1019
1020                 list_del_init(&link->link_list);
1021                 trace_io_uring_fail_link(req, link);
1022
1023                 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
1024                     link->opcode == IORING_OP_LINK_TIMEOUT) {
1025                         io_link_cancel_timeout(link);
1026                 } else {
1027                         io_cqring_fill_event(link, -ECANCELED);
1028                         __io_double_put_req(link);
1029                 }
1030                 req->flags &= ~REQ_F_LINK_TIMEOUT;
1031         }
1032
1033         io_commit_cqring(ctx);
1034         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1035         io_cqring_ev_posted(ctx);
1036 }
1037
1038 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
1039 {
1040         if (likely(!(req->flags & REQ_F_LINK)))
1041                 return;
1042
1043         /*
1044          * If LINK is set, we have dependent requests in this chain. If we
1045          * didn't fail this request, queue the first one up, moving any other
1046          * dependencies to the next request. In case of failure, fail the rest
1047          * of the chain.
1048          */
1049         if (req->flags & REQ_F_FAIL_LINK) {
1050                 io_fail_links(req);
1051         } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1052                         REQ_F_LINK_TIMEOUT) {
1053                 struct io_ring_ctx *ctx = req->ctx;
1054                 unsigned long flags;
1055
1056                 /*
1057                  * If this is a timeout link, we could be racing with the
1058                  * timeout timer. Grab the completion lock for this case to
1059                  * protect against that.
1060                  */
1061                 spin_lock_irqsave(&ctx->completion_lock, flags);
1062                 io_req_link_next(req, nxt);
1063                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1064         } else {
1065                 io_req_link_next(req, nxt);
1066         }
1067 }
1068
1069 static void io_free_req(struct io_kiocb *req)
1070 {
1071         struct io_kiocb *nxt = NULL;
1072
1073         io_req_find_next(req, &nxt);
1074         __io_free_req(req);
1075
1076         if (nxt)
1077                 io_queue_async_work(nxt);
1078 }
1079
1080 /*
1081  * Drop reference to request, return next in chain (if there is one) if this
1082  * was the last reference to this request.
1083  */
1084 __attribute__((nonnull))
1085 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1086 {
1087         io_req_find_next(req, nxtptr);
1088
1089         if (refcount_dec_and_test(&req->refs))
1090                 __io_free_req(req);
1091 }
1092
1093 static void io_put_req(struct io_kiocb *req)
1094 {
1095         if (refcount_dec_and_test(&req->refs))
1096                 io_free_req(req);
1097 }
1098
1099 /*
1100  * Must only be used if we don't need to care about links, usually from
1101  * within the completion handling itself.
1102  */
1103 static void __io_double_put_req(struct io_kiocb *req)
1104 {
1105         /* drop both submit and complete references */
1106         if (refcount_sub_and_test(2, &req->refs))
1107                 __io_free_req(req);
1108 }
1109
1110 static void io_double_put_req(struct io_kiocb *req)
1111 {
1112         /* drop both submit and complete references */
1113         if (refcount_sub_and_test(2, &req->refs))
1114                 io_free_req(req);
1115 }
1116
1117 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
1118 {
1119         struct io_rings *rings = ctx->rings;
1120
1121         /*
1122          * noflush == true is from the waitqueue handler, just ensure we wake
1123          * up the task, and the next invocation will flush the entries. We
1124          * cannot safely to it from here.
1125          */
1126         if (noflush && !list_empty(&ctx->cq_overflow_list))
1127                 return -1U;
1128
1129         io_cqring_overflow_flush(ctx, false);
1130
1131         /* See comment at the top of this file */
1132         smp_rmb();
1133         return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
1134 }
1135
1136 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1137 {
1138         struct io_rings *rings = ctx->rings;
1139
1140         /* make sure SQ entry isn't read before tail */
1141         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1142 }
1143
1144 /*
1145  * Find and free completed poll iocbs
1146  */
1147 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1148                                struct list_head *done)
1149 {
1150         void *reqs[IO_IOPOLL_BATCH];
1151         struct io_kiocb *req;
1152         int to_free;
1153
1154         to_free = 0;
1155         while (!list_empty(done)) {
1156                 req = list_first_entry(done, struct io_kiocb, list);
1157                 list_del(&req->list);
1158
1159                 io_cqring_fill_event(req, req->result);
1160                 (*nr_events)++;
1161
1162                 if (refcount_dec_and_test(&req->refs)) {
1163                         /* If we're not using fixed files, we have to pair the
1164                          * completion part with the file put. Use regular
1165                          * completions for those, only batch free for fixed
1166                          * file and non-linked commands.
1167                          */
1168                         if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
1169                             REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
1170                             !req->io) {
1171                                 reqs[to_free++] = req;
1172                                 if (to_free == ARRAY_SIZE(reqs))
1173                                         io_free_req_many(ctx, reqs, &to_free);
1174                         } else {
1175                                 io_free_req(req);
1176                         }
1177                 }
1178         }
1179
1180         io_commit_cqring(ctx);
1181         io_free_req_many(ctx, reqs, &to_free);
1182 }
1183
1184 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1185                         long min)
1186 {
1187         struct io_kiocb *req, *tmp;
1188         LIST_HEAD(done);
1189         bool spin;
1190         int ret;
1191
1192         /*
1193          * Only spin for completions if we don't have multiple devices hanging
1194          * off our complete list, and we're under the requested amount.
1195          */
1196         spin = !ctx->poll_multi_file && *nr_events < min;
1197
1198         ret = 0;
1199         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
1200                 struct kiocb *kiocb = &req->rw.kiocb;
1201
1202                 /*
1203                  * Move completed entries to our local list. If we find a
1204                  * request that requires polling, break out and complete
1205                  * the done list first, if we have entries there.
1206                  */
1207                 if (req->flags & REQ_F_IOPOLL_COMPLETED) {
1208                         list_move_tail(&req->list, &done);
1209                         continue;
1210                 }
1211                 if (!list_empty(&done))
1212                         break;
1213
1214                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1215                 if (ret < 0)
1216                         break;
1217
1218                 if (ret && spin)
1219                         spin = false;
1220                 ret = 0;
1221         }
1222
1223         if (!list_empty(&done))
1224                 io_iopoll_complete(ctx, nr_events, &done);
1225
1226         return ret;
1227 }
1228
1229 /*
1230  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1231  * non-spinning poll check - we'll still enter the driver poll loop, but only
1232  * as a non-spinning completion check.
1233  */
1234 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1235                                 long min)
1236 {
1237         while (!list_empty(&ctx->poll_list) && !need_resched()) {
1238                 int ret;
1239
1240                 ret = io_do_iopoll(ctx, nr_events, min);
1241                 if (ret < 0)
1242                         return ret;
1243                 if (!min || *nr_events >= min)
1244                         return 0;
1245         }
1246
1247         return 1;
1248 }
1249
1250 /*
1251  * We can't just wait for polled events to come to us, we have to actively
1252  * find and complete them.
1253  */
1254 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1255 {
1256         if (!(ctx->flags & IORING_SETUP_IOPOLL))
1257                 return;
1258
1259         mutex_lock(&ctx->uring_lock);
1260         while (!list_empty(&ctx->poll_list)) {
1261                 unsigned int nr_events = 0;
1262
1263                 io_iopoll_getevents(ctx, &nr_events, 1);
1264
1265                 /*
1266                  * Ensure we allow local-to-the-cpu processing to take place,
1267                  * in this case we need to ensure that we reap all events.
1268                  */
1269                 cond_resched();
1270         }
1271         mutex_unlock(&ctx->uring_lock);
1272 }
1273
1274 static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1275                             long min)
1276 {
1277         int iters = 0, ret = 0;
1278
1279         do {
1280                 int tmin = 0;
1281
1282                 /*
1283                  * Don't enter poll loop if we already have events pending.
1284                  * If we do, we can potentially be spinning for commands that
1285                  * already triggered a CQE (eg in error).
1286                  */
1287                 if (io_cqring_events(ctx, false))
1288                         break;
1289
1290                 /*
1291                  * If a submit got punted to a workqueue, we can have the
1292                  * application entering polling for a command before it gets
1293                  * issued. That app will hold the uring_lock for the duration
1294                  * of the poll right here, so we need to take a breather every
1295                  * now and then to ensure that the issue has a chance to add
1296                  * the poll to the issued list. Otherwise we can spin here
1297                  * forever, while the workqueue is stuck trying to acquire the
1298                  * very same mutex.
1299                  */
1300                 if (!(++iters & 7)) {
1301                         mutex_unlock(&ctx->uring_lock);
1302                         mutex_lock(&ctx->uring_lock);
1303                 }
1304
1305                 if (*nr_events < min)
1306                         tmin = min - *nr_events;
1307
1308                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1309                 if (ret <= 0)
1310                         break;
1311                 ret = 0;
1312         } while (min && !*nr_events && !need_resched());
1313
1314         return ret;
1315 }
1316
1317 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1318                            long min)
1319 {
1320         int ret;
1321
1322         /*
1323          * We disallow the app entering submit/complete with polling, but we
1324          * still need to lock the ring to prevent racing with polled issue
1325          * that got punted to a workqueue.
1326          */
1327         mutex_lock(&ctx->uring_lock);
1328         ret = __io_iopoll_check(ctx, nr_events, min);
1329         mutex_unlock(&ctx->uring_lock);
1330         return ret;
1331 }
1332
1333 static void kiocb_end_write(struct io_kiocb *req)
1334 {
1335         /*
1336          * Tell lockdep we inherited freeze protection from submission
1337          * thread.
1338          */
1339         if (req->flags & REQ_F_ISREG) {
1340                 struct inode *inode = file_inode(req->file);
1341
1342                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
1343         }
1344         file_end_write(req->file);
1345 }
1346
1347 static inline void req_set_fail_links(struct io_kiocb *req)
1348 {
1349         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1350                 req->flags |= REQ_F_FAIL_LINK;
1351 }
1352
1353 static void io_complete_rw_common(struct kiocb *kiocb, long res)
1354 {
1355         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1356
1357         if (kiocb->ki_flags & IOCB_WRITE)
1358                 kiocb_end_write(req);
1359
1360         if (res != req->result)
1361                 req_set_fail_links(req);
1362         io_cqring_add_event(req, res);
1363 }
1364
1365 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1366 {
1367         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1368
1369         io_complete_rw_common(kiocb, res);
1370         io_put_req(req);
1371 }
1372
1373 static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
1374 {
1375         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1376         struct io_kiocb *nxt = NULL;
1377
1378         io_complete_rw_common(kiocb, res);
1379         io_put_req_find_next(req, &nxt);
1380
1381         return nxt;
1382 }
1383
1384 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1385 {
1386         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1387
1388         if (kiocb->ki_flags & IOCB_WRITE)
1389                 kiocb_end_write(req);
1390
1391         if (res != req->result)
1392                 req_set_fail_links(req);
1393         req->result = res;
1394         if (res != -EAGAIN)
1395                 req->flags |= REQ_F_IOPOLL_COMPLETED;
1396 }
1397
1398 /*
1399  * After the iocb has been issued, it's safe to be found on the poll list.
1400  * Adding the kiocb to the list AFTER submission ensures that we don't
1401  * find it from a io_iopoll_getevents() thread before the issuer is done
1402  * accessing the kiocb cookie.
1403  */
1404 static void io_iopoll_req_issued(struct io_kiocb *req)
1405 {
1406         struct io_ring_ctx *ctx = req->ctx;
1407
1408         /*
1409          * Track whether we have multiple files in our lists. This will impact
1410          * how we do polling eventually, not spinning if we're on potentially
1411          * different devices.
1412          */
1413         if (list_empty(&ctx->poll_list)) {
1414                 ctx->poll_multi_file = false;
1415         } else if (!ctx->poll_multi_file) {
1416                 struct io_kiocb *list_req;
1417
1418                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
1419                                                 list);
1420                 if (list_req->file != req->file)
1421                         ctx->poll_multi_file = true;
1422         }
1423
1424         /*
1425          * For fast devices, IO may have already completed. If it has, add
1426          * it to the front so we find it first.
1427          */
1428         if (req->flags & REQ_F_IOPOLL_COMPLETED)
1429                 list_add(&req->list, &ctx->poll_list);
1430         else
1431                 list_add_tail(&req->list, &ctx->poll_list);
1432 }
1433
1434 static void io_file_put(struct io_submit_state *state)
1435 {
1436         if (state->file) {
1437                 int diff = state->has_refs - state->used_refs;
1438
1439                 if (diff)
1440                         fput_many(state->file, diff);
1441                 state->file = NULL;
1442         }
1443 }
1444
1445 /*
1446  * Get as many references to a file as we have IOs left in this submission,
1447  * assuming most submissions are for one file, or at least that each file
1448  * has more than one submission.
1449  */
1450 static struct file *io_file_get(struct io_submit_state *state, int fd)
1451 {
1452         if (!state)
1453                 return fget(fd);
1454
1455         if (state->file) {
1456                 if (state->fd == fd) {
1457                         state->used_refs++;
1458                         state->ios_left--;
1459                         return state->file;
1460                 }
1461                 io_file_put(state);
1462         }
1463         state->file = fget_many(fd, state->ios_left);
1464         if (!state->file)
1465                 return NULL;
1466
1467         state->fd = fd;
1468         state->has_refs = state->ios_left;
1469         state->used_refs = 1;
1470         state->ios_left--;
1471         return state->file;
1472 }
1473
1474 /*
1475  * If we tracked the file through the SCM inflight mechanism, we could support
1476  * any file. For now, just ensure that anything potentially problematic is done
1477  * inline.
1478  */
1479 static bool io_file_supports_async(struct file *file)
1480 {
1481         umode_t mode = file_inode(file)->i_mode;
1482
1483         if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
1484                 return true;
1485         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
1486                 return true;
1487
1488         return false;
1489 }
1490
1491 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1492                       bool force_nonblock)
1493 {
1494         struct io_ring_ctx *ctx = req->ctx;
1495         struct kiocb *kiocb = &req->rw.kiocb;
1496         unsigned ioprio;
1497         int ret;
1498
1499         if (!req->file)
1500                 return -EBADF;
1501
1502         if (S_ISREG(file_inode(req->file)->i_mode))
1503                 req->flags |= REQ_F_ISREG;
1504
1505         kiocb->ki_pos = READ_ONCE(sqe->off);
1506         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
1507         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
1508
1509         ioprio = READ_ONCE(sqe->ioprio);
1510         if (ioprio) {
1511                 ret = ioprio_check_cap(ioprio);
1512                 if (ret)
1513                         return ret;
1514
1515                 kiocb->ki_ioprio = ioprio;
1516         } else
1517                 kiocb->ki_ioprio = get_current_ioprio();
1518
1519         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
1520         if (unlikely(ret))
1521                 return ret;
1522
1523         /* don't allow async punt if RWF_NOWAIT was requested */
1524         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
1525             (req->file->f_flags & O_NONBLOCK))
1526                 req->flags |= REQ_F_NOWAIT;
1527
1528         if (force_nonblock)
1529                 kiocb->ki_flags |= IOCB_NOWAIT;
1530
1531         if (ctx->flags & IORING_SETUP_IOPOLL) {
1532                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
1533                     !kiocb->ki_filp->f_op->iopoll)
1534                         return -EOPNOTSUPP;
1535
1536                 kiocb->ki_flags |= IOCB_HIPRI;
1537                 kiocb->ki_complete = io_complete_rw_iopoll;
1538                 req->result = 0;
1539         } else {
1540                 if (kiocb->ki_flags & IOCB_HIPRI)
1541                         return -EINVAL;
1542                 kiocb->ki_complete = io_complete_rw;
1543         }
1544
1545         req->rw.addr = READ_ONCE(sqe->addr);
1546         req->rw.len = READ_ONCE(sqe->len);
1547         /* we own ->private, reuse it for the buffer index */
1548         req->rw.kiocb.private = (void *) (unsigned long)
1549                                         READ_ONCE(sqe->buf_index);
1550         return 0;
1551 }
1552
1553 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
1554 {
1555         switch (ret) {
1556         case -EIOCBQUEUED:
1557                 break;
1558         case -ERESTARTSYS:
1559         case -ERESTARTNOINTR:
1560         case -ERESTARTNOHAND:
1561         case -ERESTART_RESTARTBLOCK:
1562                 /*
1563                  * We can't just restart the syscall, since previously
1564                  * submitted sqes may already be in progress. Just fail this
1565                  * IO with EINTR.
1566                  */
1567                 ret = -EINTR;
1568                 /* fall through */
1569         default:
1570                 kiocb->ki_complete(kiocb, ret, 0);
1571         }
1572 }
1573
1574 static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
1575                        bool in_async)
1576 {
1577         if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
1578                 *nxt = __io_complete_rw(kiocb, ret);
1579         else
1580                 io_rw_done(kiocb, ret);
1581 }
1582
1583 static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
1584                                struct iov_iter *iter)
1585 {
1586         struct io_ring_ctx *ctx = req->ctx;
1587         size_t len = req->rw.len;
1588         struct io_mapped_ubuf *imu;
1589         unsigned index, buf_index;
1590         size_t offset;
1591         u64 buf_addr;
1592
1593         /* attempt to use fixed buffers without having provided iovecs */
1594         if (unlikely(!ctx->user_bufs))
1595                 return -EFAULT;
1596
1597         buf_index = (unsigned long) req->rw.kiocb.private;
1598         if (unlikely(buf_index >= ctx->nr_user_bufs))
1599                 return -EFAULT;
1600
1601         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
1602         imu = &ctx->user_bufs[index];
1603         buf_addr = req->rw.addr;
1604
1605         /* overflow */
1606         if (buf_addr + len < buf_addr)
1607                 return -EFAULT;
1608         /* not inside the mapped region */
1609         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
1610                 return -EFAULT;
1611
1612         /*
1613          * May not be a start of buffer, set size appropriately
1614          * and advance us to the beginning.
1615          */
1616         offset = buf_addr - imu->ubuf;
1617         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
1618
1619         if (offset) {
1620                 /*
1621                  * Don't use iov_iter_advance() here, as it's really slow for
1622                  * using the latter parts of a big fixed buffer - it iterates
1623                  * over each segment manually. We can cheat a bit here, because
1624                  * we know that:
1625                  *
1626                  * 1) it's a BVEC iter, we set it up
1627                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
1628                  *    first and last bvec
1629                  *
1630                  * So just find our index, and adjust the iterator afterwards.
1631                  * If the offset is within the first bvec (or the whole first
1632                  * bvec, just use iov_iter_advance(). This makes it easier
1633                  * since we can just skip the first segment, which may not
1634                  * be PAGE_SIZE aligned.
1635                  */
1636                 const struct bio_vec *bvec = imu->bvec;
1637
1638                 if (offset <= bvec->bv_len) {
1639                         iov_iter_advance(iter, offset);
1640                 } else {
1641                         unsigned long seg_skip;
1642
1643                         /* skip first vec */
1644                         offset -= bvec->bv_len;
1645                         seg_skip = 1 + (offset >> PAGE_SHIFT);
1646
1647                         iter->bvec = bvec + seg_skip;
1648                         iter->nr_segs -= seg_skip;
1649                         iter->count -= bvec->bv_len + offset;
1650                         iter->iov_offset = offset & ~PAGE_MASK;
1651                 }
1652         }
1653
1654         return len;
1655 }
1656
1657 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
1658                                struct iovec **iovec, struct iov_iter *iter)
1659 {
1660         void __user *buf = u64_to_user_ptr(req->rw.addr);
1661         size_t sqe_len = req->rw.len;
1662         u8 opcode;
1663
1664         opcode = req->opcode;
1665         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
1666                 *iovec = NULL;
1667                 return io_import_fixed(req, rw, iter);
1668         }
1669
1670         /* buffer index only valid with fixed read/write */
1671         if (req->rw.kiocb.private)
1672                 return -EINVAL;
1673
1674         if (req->io) {
1675                 struct io_async_rw *iorw = &req->io->rw;
1676
1677                 *iovec = iorw->iov;
1678                 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
1679                 if (iorw->iov == iorw->fast_iov)
1680                         *iovec = NULL;
1681                 return iorw->size;
1682         }
1683
1684         if (!req->has_user)
1685                 return -EFAULT;
1686
1687 #ifdef CONFIG_COMPAT
1688         if (req->ctx->compat)
1689                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
1690                                                 iovec, iter);
1691 #endif
1692
1693         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
1694 }
1695
1696 /*
1697  * For files that don't have ->read_iter() and ->write_iter(), handle them
1698  * by looping over ->read() or ->write() manually.
1699  */
1700 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
1701                            struct iov_iter *iter)
1702 {
1703         ssize_t ret = 0;
1704
1705         /*
1706          * Don't support polled IO through this interface, and we can't
1707          * support non-blocking either. For the latter, this just causes
1708          * the kiocb to be handled from an async context.
1709          */
1710         if (kiocb->ki_flags & IOCB_HIPRI)
1711                 return -EOPNOTSUPP;
1712         if (kiocb->ki_flags & IOCB_NOWAIT)
1713                 return -EAGAIN;
1714
1715         while (iov_iter_count(iter)) {
1716                 struct iovec iovec;
1717                 ssize_t nr;
1718
1719                 if (!iov_iter_is_bvec(iter)) {
1720                         iovec = iov_iter_iovec(iter);
1721                 } else {
1722                         /* fixed buffers import bvec */
1723                         iovec.iov_base = kmap(iter->bvec->bv_page)
1724                                                 + iter->iov_offset;
1725                         iovec.iov_len = min(iter->count,
1726                                         iter->bvec->bv_len - iter->iov_offset);
1727                 }
1728
1729                 if (rw == READ) {
1730                         nr = file->f_op->read(file, iovec.iov_base,
1731                                               iovec.iov_len, &kiocb->ki_pos);
1732                 } else {
1733                         nr = file->f_op->write(file, iovec.iov_base,
1734                                                iovec.iov_len, &kiocb->ki_pos);
1735                 }
1736
1737                 if (iov_iter_is_bvec(iter))
1738                         kunmap(iter->bvec->bv_page);
1739
1740                 if (nr < 0) {
1741                         if (!ret)
1742                                 ret = nr;
1743                         break;
1744                 }
1745                 ret += nr;
1746                 if (nr != iovec.iov_len)
1747                         break;
1748                 iov_iter_advance(iter, nr);
1749         }
1750
1751         return ret;
1752 }
1753
1754 static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
1755                           struct iovec *iovec, struct iovec *fast_iov,
1756                           struct iov_iter *iter)
1757 {
1758         req->io->rw.nr_segs = iter->nr_segs;
1759         req->io->rw.size = io_size;
1760         req->io->rw.iov = iovec;
1761         if (!req->io->rw.iov) {
1762                 req->io->rw.iov = req->io->rw.fast_iov;
1763                 memcpy(req->io->rw.iov, fast_iov,
1764                         sizeof(struct iovec) * iter->nr_segs);
1765         }
1766 }
1767
1768 static int io_alloc_async_ctx(struct io_kiocb *req)
1769 {
1770         req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
1771         return req->io == NULL;
1772 }
1773
1774 static void io_rw_async(struct io_wq_work **workptr)
1775 {
1776         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
1777         struct iovec *iov = NULL;
1778
1779         if (req->io->rw.iov != req->io->rw.fast_iov)
1780                 iov = req->io->rw.iov;
1781         io_wq_submit_work(workptr);
1782         kfree(iov);
1783 }
1784
1785 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
1786                              struct iovec *iovec, struct iovec *fast_iov,
1787                              struct iov_iter *iter)
1788 {
1789         if (!req->io && io_alloc_async_ctx(req))
1790                 return -ENOMEM;
1791
1792         io_req_map_rw(req, io_size, iovec, fast_iov, iter);
1793         req->work.func = io_rw_async;
1794         return 0;
1795 }
1796
1797 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1798                         bool force_nonblock)
1799 {
1800         struct io_async_ctx *io;
1801         struct iov_iter iter;
1802         ssize_t ret;
1803
1804         ret = io_prep_rw(req, sqe, force_nonblock);
1805         if (ret)
1806                 return ret;
1807
1808         if (unlikely(!(req->file->f_mode & FMODE_READ)))
1809                 return -EBADF;
1810
1811         if (!req->io)
1812                 return 0;
1813
1814         io = req->io;
1815         io->rw.iov = io->rw.fast_iov;
1816         req->io = NULL;
1817         ret = io_import_iovec(READ, req, &io->rw.iov, &iter);
1818         req->io = io;
1819         if (ret < 0)
1820                 return ret;
1821
1822         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
1823         return 0;
1824 }
1825
1826 static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
1827                    bool force_nonblock)
1828 {
1829         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1830         struct kiocb *kiocb = &req->rw.kiocb;
1831         struct iov_iter iter;
1832         size_t iov_count;
1833         ssize_t io_size, ret;
1834
1835         ret = io_import_iovec(READ, req, &iovec, &iter);
1836         if (ret < 0)
1837                 return ret;
1838
1839         /* Ensure we clear previously set non-block flag */
1840         if (!force_nonblock)
1841                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
1842
1843         io_size = ret;
1844         if (req->flags & REQ_F_LINK)
1845                 req->result = io_size;
1846
1847         /*
1848          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
1849          * we know to async punt it even if it was opened O_NONBLOCK
1850          */
1851         if (force_nonblock && !io_file_supports_async(req->file)) {
1852                 req->flags |= REQ_F_MUST_PUNT;
1853                 goto copy_iov;
1854         }
1855
1856         iov_count = iov_iter_count(&iter);
1857         ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
1858         if (!ret) {
1859                 ssize_t ret2;
1860
1861                 if (req->file->f_op->read_iter)
1862                         ret2 = call_read_iter(req->file, kiocb, &iter);
1863                 else
1864                         ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
1865
1866                 /* Catch -EAGAIN return for forced non-blocking submission */
1867                 if (!force_nonblock || ret2 != -EAGAIN) {
1868                         kiocb_done(kiocb, ret2, nxt, req->in_async);
1869                 } else {
1870 copy_iov:
1871                         ret = io_setup_async_rw(req, io_size, iovec,
1872                                                 inline_vecs, &iter);
1873                         if (ret)
1874                                 goto out_free;
1875                         return -EAGAIN;
1876                 }
1877         }
1878 out_free:
1879         if (!io_wq_current_is_worker())
1880                 kfree(iovec);
1881         return ret;
1882 }
1883
1884 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
1885                          bool force_nonblock)
1886 {
1887         struct io_async_ctx *io;
1888         struct iov_iter iter;
1889         ssize_t ret;
1890
1891         ret = io_prep_rw(req, sqe, force_nonblock);
1892         if (ret)
1893                 return ret;
1894
1895         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
1896                 return -EBADF;
1897
1898         if (!req->io)
1899                 return 0;
1900
1901         io = req->io;
1902         io->rw.iov = io->rw.fast_iov;
1903         req->io = NULL;
1904         ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter);
1905         req->io = io;
1906         if (ret < 0)
1907                 return ret;
1908
1909         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
1910         return 0;
1911 }
1912
1913 static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
1914                     bool force_nonblock)
1915 {
1916         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1917         struct kiocb *kiocb = &req->rw.kiocb;
1918         struct iov_iter iter;
1919         size_t iov_count;
1920         ssize_t ret, io_size;
1921
1922         ret = io_import_iovec(WRITE, req, &iovec, &iter);
1923         if (ret < 0)
1924                 return ret;
1925
1926         /* Ensure we clear previously set non-block flag */
1927         if (!force_nonblock)
1928                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
1929
1930         io_size = ret;
1931         if (req->flags & REQ_F_LINK)
1932                 req->result = io_size;
1933
1934         /*
1935          * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
1936          * we know to async punt it even if it was opened O_NONBLOCK
1937          */
1938         if (force_nonblock && !io_file_supports_async(req->file)) {
1939                 req->flags |= REQ_F_MUST_PUNT;
1940                 goto copy_iov;
1941         }
1942
1943         /* file path doesn't support NOWAIT for non-direct_IO */
1944         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
1945             (req->flags & REQ_F_ISREG))
1946                 goto copy_iov;
1947
1948         iov_count = iov_iter_count(&iter);
1949         ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
1950         if (!ret) {
1951                 ssize_t ret2;
1952
1953                 /*
1954                  * Open-code file_start_write here to grab freeze protection,
1955                  * which will be released by another thread in
1956                  * io_complete_rw().  Fool lockdep by telling it the lock got
1957                  * released so that it doesn't complain about the held lock when
1958                  * we return to userspace.
1959                  */
1960                 if (req->flags & REQ_F_ISREG) {
1961                         __sb_start_write(file_inode(req->file)->i_sb,
1962                                                 SB_FREEZE_WRITE, true);
1963                         __sb_writers_release(file_inode(req->file)->i_sb,
1964                                                 SB_FREEZE_WRITE);
1965                 }
1966                 kiocb->ki_flags |= IOCB_WRITE;
1967
1968                 if (req->file->f_op->write_iter)
1969                         ret2 = call_write_iter(req->file, kiocb, &iter);
1970                 else
1971                         ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
1972                 if (!force_nonblock || ret2 != -EAGAIN) {
1973                         kiocb_done(kiocb, ret2, nxt, req->in_async);
1974                 } else {
1975 copy_iov:
1976                         ret = io_setup_async_rw(req, io_size, iovec,
1977                                                 inline_vecs, &iter);
1978                         if (ret)
1979                                 goto out_free;
1980                         return -EAGAIN;
1981                 }
1982         }
1983 out_free:
1984         if (!io_wq_current_is_worker())
1985                 kfree(iovec);
1986         return ret;
1987 }
1988
1989 /*
1990  * IORING_OP_NOP just posts a completion event, nothing else.
1991  */
1992 static int io_nop(struct io_kiocb *req)
1993 {
1994         struct io_ring_ctx *ctx = req->ctx;
1995
1996         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
1997                 return -EINVAL;
1998
1999         io_cqring_add_event(req, 0);
2000         io_put_req(req);
2001         return 0;
2002 }
2003
2004 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2005 {
2006         struct io_ring_ctx *ctx = req->ctx;
2007
2008         if (!req->file)
2009                 return -EBADF;
2010
2011         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2012                 return -EINVAL;
2013         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2014                 return -EINVAL;
2015
2016         req->sync.flags = READ_ONCE(sqe->fsync_flags);
2017         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2018                 return -EINVAL;
2019
2020         req->sync.off = READ_ONCE(sqe->off);
2021         req->sync.len = READ_ONCE(sqe->len);
2022         return 0;
2023 }
2024
2025 static bool io_req_cancelled(struct io_kiocb *req)
2026 {
2027         if (req->work.flags & IO_WQ_WORK_CANCEL) {
2028                 req_set_fail_links(req);
2029                 io_cqring_add_event(req, -ECANCELED);
2030                 io_put_req(req);
2031                 return true;
2032         }
2033
2034         return false;
2035 }
2036
2037 static void io_fsync_finish(struct io_wq_work **workptr)
2038 {
2039         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2040         loff_t end = req->sync.off + req->sync.len;
2041         struct io_kiocb *nxt = NULL;
2042         int ret;
2043
2044         if (io_req_cancelled(req))
2045                 return;
2046
2047         ret = vfs_fsync_range(req->file, req->sync.off,
2048                                 end > 0 ? end : LLONG_MAX,
2049                                 req->sync.flags & IORING_FSYNC_DATASYNC);
2050         if (ret < 0)
2051                 req_set_fail_links(req);
2052         io_cqring_add_event(req, ret);
2053         io_put_req_find_next(req, &nxt);
2054         if (nxt)
2055                 *workptr = &nxt->work;
2056 }
2057
2058 static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
2059                     bool force_nonblock)
2060 {
2061         struct io_wq_work *work, *old_work;
2062
2063         /* fsync always requires a blocking context */
2064         if (force_nonblock) {
2065                 io_put_req(req);
2066                 req->work.func = io_fsync_finish;
2067                 return -EAGAIN;
2068         }
2069
2070         work = old_work = &req->work;
2071         io_fsync_finish(&work);
2072         if (work && work != old_work)
2073                 *nxt = container_of(work, struct io_kiocb, work);
2074         return 0;
2075 }
2076
2077 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2078 {
2079         struct io_ring_ctx *ctx = req->ctx;
2080
2081         if (!req->file)
2082                 return -EBADF;
2083
2084         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2085                 return -EINVAL;
2086         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2087                 return -EINVAL;
2088
2089         req->sync.off = READ_ONCE(sqe->off);
2090         req->sync.len = READ_ONCE(sqe->len);
2091         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
2092         return 0;
2093 }
2094
2095 static void io_sync_file_range_finish(struct io_wq_work **workptr)
2096 {
2097         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2098         struct io_kiocb *nxt = NULL;
2099         int ret;
2100
2101         if (io_req_cancelled(req))
2102                 return;
2103
2104         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
2105                                 req->sync.flags);
2106         if (ret < 0)
2107                 req_set_fail_links(req);
2108         io_cqring_add_event(req, ret);
2109         io_put_req_find_next(req, &nxt);
2110         if (nxt)
2111                 *workptr = &nxt->work;
2112 }
2113
2114 static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
2115                               bool force_nonblock)
2116 {
2117         struct io_wq_work *work, *old_work;
2118
2119         /* sync_file_range always requires a blocking context */
2120         if (force_nonblock) {
2121                 io_put_req(req);
2122                 req->work.func = io_sync_file_range_finish;
2123                 return -EAGAIN;
2124         }
2125
2126         work = old_work = &req->work;
2127         io_sync_file_range_finish(&work);
2128         if (work && work != old_work)
2129                 *nxt = container_of(work, struct io_kiocb, work);
2130         return 0;
2131 }
2132
2133 #if defined(CONFIG_NET)
2134 static void io_sendrecv_async(struct io_wq_work **workptr)
2135 {
2136         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2137         struct iovec *iov = NULL;
2138
2139         if (req->io->rw.iov != req->io->rw.fast_iov)
2140                 iov = req->io->msg.iov;
2141         io_wq_submit_work(workptr);
2142         kfree(iov);
2143 }
2144 #endif
2145
2146 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2147 {
2148 #if defined(CONFIG_NET)
2149         struct io_sr_msg *sr = &req->sr_msg;
2150         struct io_async_ctx *io = req->io;
2151
2152         sr->msg_flags = READ_ONCE(sqe->msg_flags);
2153         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
2154
2155         if (!io)
2156                 return 0;
2157
2158         io->msg.iov = io->msg.fast_iov;
2159         return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
2160                                         &io->msg.iov);
2161 #else
2162         return -EOPNOTSUPP;
2163 #endif
2164 }
2165
2166 static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2167                       bool force_nonblock)
2168 {
2169 #if defined(CONFIG_NET)
2170         struct io_async_msghdr *kmsg = NULL;
2171         struct socket *sock;
2172         int ret;
2173
2174         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2175                 return -EINVAL;
2176
2177         sock = sock_from_file(req->file, &ret);
2178         if (sock) {
2179                 struct io_async_ctx io;
2180                 struct sockaddr_storage addr;
2181                 unsigned flags;
2182
2183                 if (req->io) {
2184                         kmsg = &req->io->msg;
2185                         kmsg->msg.msg_name = &addr;
2186                         /* if iov is set, it's allocated already */
2187                         if (!kmsg->iov)
2188                                 kmsg->iov = kmsg->fast_iov;
2189                         kmsg->msg.msg_iter.iov = kmsg->iov;
2190                 } else {
2191                         struct io_sr_msg *sr = &req->sr_msg;
2192
2193                         kmsg = &io.msg;
2194                         kmsg->msg.msg_name = &addr;
2195
2196                         io.msg.iov = io.msg.fast_iov;
2197                         ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
2198                                         sr->msg_flags, &io.msg.iov);
2199                         if (ret)
2200                                 return ret;
2201                 }
2202
2203                 flags = req->sr_msg.msg_flags;
2204                 if (flags & MSG_DONTWAIT)
2205                         req->flags |= REQ_F_NOWAIT;
2206                 else if (force_nonblock)
2207                         flags |= MSG_DONTWAIT;
2208
2209                 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
2210                 if (force_nonblock && ret == -EAGAIN) {
2211                         if (req->io)
2212                                 return -EAGAIN;
2213                         if (io_alloc_async_ctx(req))
2214                                 return -ENOMEM;
2215                         memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
2216                         req->work.func = io_sendrecv_async;
2217                         return -EAGAIN;
2218                 }
2219                 if (ret == -ERESTARTSYS)
2220                         ret = -EINTR;
2221         }
2222
2223         if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
2224                 kfree(kmsg->iov);
2225         io_cqring_add_event(req, ret);
2226         if (ret < 0)
2227                 req_set_fail_links(req);
2228         io_put_req_find_next(req, nxt);
2229         return 0;
2230 #else
2231         return -EOPNOTSUPP;
2232 #endif
2233 }
2234
2235 static int io_recvmsg_prep(struct io_kiocb *req,
2236                            const struct io_uring_sqe *sqe)
2237 {
2238 #if defined(CONFIG_NET)
2239         struct io_sr_msg *sr = &req->sr_msg;
2240         struct io_async_ctx *io = req->io;
2241
2242         sr->msg_flags = READ_ONCE(sqe->msg_flags);
2243         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
2244
2245         if (!io)
2246                 return 0;
2247
2248         io->msg.iov = io->msg.fast_iov;
2249         return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
2250                                         &io->msg.uaddr, &io->msg.iov);
2251 #else
2252         return -EOPNOTSUPP;
2253 #endif
2254 }
2255
2256 static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
2257                       bool force_nonblock)
2258 {
2259 #if defined(CONFIG_NET)
2260         struct io_async_msghdr *kmsg = NULL;
2261         struct socket *sock;
2262         int ret;
2263
2264         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2265                 return -EINVAL;
2266
2267         sock = sock_from_file(req->file, &ret);
2268         if (sock) {
2269                 struct io_async_ctx io;
2270                 struct sockaddr_storage addr;
2271                 unsigned flags;
2272
2273                 if (req->io) {
2274                         kmsg = &req->io->msg;
2275                         kmsg->msg.msg_name = &addr;
2276                         /* if iov is set, it's allocated already */
2277                         if (!kmsg->iov)
2278                                 kmsg->iov = kmsg->fast_iov;
2279                         kmsg->msg.msg_iter.iov = kmsg->iov;
2280                 } else {
2281                         struct io_sr_msg *sr = &req->sr_msg;
2282
2283                         kmsg = &io.msg;
2284                         kmsg->msg.msg_name = &addr;
2285
2286                         io.msg.iov = io.msg.fast_iov;
2287                         ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg,
2288                                         sr->msg_flags, &io.msg.uaddr,
2289                                         &io.msg.iov);
2290                         if (ret)
2291                                 return ret;
2292                 }
2293
2294                 flags = req->sr_msg.msg_flags;
2295                 if (flags & MSG_DONTWAIT)
2296                         req->flags |= REQ_F_NOWAIT;
2297                 else if (force_nonblock)
2298                         flags |= MSG_DONTWAIT;
2299
2300                 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
2301                                                 kmsg->uaddr, flags);
2302                 if (force_nonblock && ret == -EAGAIN) {
2303                         if (req->io)
2304                                 return -EAGAIN;
2305                         if (io_alloc_async_ctx(req))
2306                                 return -ENOMEM;
2307                         memcpy(&req->io->msg, &io.msg, sizeof(io.msg));
2308                         req->work.func = io_sendrecv_async;
2309                         return -EAGAIN;
2310                 }
2311                 if (ret == -ERESTARTSYS)
2312                         ret = -EINTR;
2313         }
2314
2315         if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov)
2316                 kfree(kmsg->iov);
2317         io_cqring_add_event(req, ret);
2318         if (ret < 0)
2319                 req_set_fail_links(req);
2320         io_put_req_find_next(req, nxt);
2321         return 0;
2322 #else
2323         return -EOPNOTSUPP;
2324 #endif
2325 }
2326
2327 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2328 {
2329 #if defined(CONFIG_NET)
2330         struct io_accept *accept = &req->accept;
2331
2332         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
2333                 return -EINVAL;
2334         if (sqe->ioprio || sqe->len || sqe->buf_index)
2335                 return -EINVAL;
2336
2337         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
2338         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
2339         accept->flags = READ_ONCE(sqe->accept_flags);
2340         return 0;
2341 #else
2342         return -EOPNOTSUPP;
2343 #endif
2344 }
2345
2346 #if defined(CONFIG_NET)
2347 static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
2348                        bool force_nonblock)
2349 {
2350         struct io_accept *accept = &req->accept;
2351         unsigned file_flags;
2352         int ret;
2353
2354         file_flags = force_nonblock ? O_NONBLOCK : 0;
2355         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
2356                                         accept->addr_len, accept->flags);
2357         if (ret == -EAGAIN && force_nonblock)
2358                 return -EAGAIN;
2359         if (ret == -ERESTARTSYS)
2360                 ret = -EINTR;
2361         if (ret < 0)
2362                 req_set_fail_links(req);
2363         io_cqring_add_event(req, ret);
2364         io_put_req_find_next(req, nxt);
2365         return 0;
2366 }
2367
2368 static void io_accept_finish(struct io_wq_work **workptr)
2369 {
2370         struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
2371         struct io_kiocb *nxt = NULL;
2372
2373         if (io_req_cancelled(req))
2374                 return;
2375         __io_accept(req, &nxt, false);
2376         if (nxt)
2377                 *workptr = &nxt->work;
2378 }
2379 #endif
2380
2381 static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
2382                      bool force_nonblock)
2383 {
2384 #if defined(CONFIG_NET)
2385         int ret;
2386
2387         ret = __io_accept(req, nxt, force_nonblock);
2388         if (ret == -EAGAIN && force_nonblock) {
2389                 req->work.func = io_accept_finish;
2390                 req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
2391                 io_put_req(req);
2392                 return -EAGAIN;
2393         }
2394         return 0;
2395 #else
2396         return -EOPNOTSUPP;
2397 #endif
2398 }
2399
2400 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2401 {
2402 #if defined(CONFIG_NET)
2403         struct io_connect *conn = &req->connect;
2404         struct io_async_ctx *io = req->io;
2405
2406         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
2407                 return -EINVAL;
2408         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
2409                 return -EINVAL;
2410
2411         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
2412         conn->addr_len =  READ_ONCE(sqe->addr2);
2413
2414         if (!io)
2415                 return 0;
2416
2417         return move_addr_to_kernel(conn->addr, conn->addr_len,
2418                                         &io->connect.address);
2419 #else
2420         return -EOPNOTSUPP;
2421 #endif
2422 }
2423
2424 static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt,
2425                       bool force_nonblock)
2426 {
2427 #if defined(CONFIG_NET)
2428         struct io_async_ctx __io, *io;
2429         unsigned file_flags;
2430         int ret;
2431
2432         if (req->io) {
2433                 io = req->io;
2434         } else {
2435                 ret = move_addr_to_kernel(req->connect.addr,
2436                                                 req->connect.addr_len,
2437                                                 &__io.connect.address);
2438                 if (ret)
2439                         goto out;
2440                 io = &__io;
2441         }
2442
2443         file_flags = force_nonblock ? O_NONBLOCK : 0;
2444
2445         ret = __sys_connect_file(req->file, &io->connect.address,
2446                                         req->connect.addr_len, file_flags);
2447         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
2448                 if (req->io)
2449                         return -EAGAIN;
2450                 if (io_alloc_async_ctx(req)) {
2451                         ret = -ENOMEM;
2452                         goto out;
2453                 }
2454                 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
2455                 return -EAGAIN;
2456         }
2457         if (ret == -ERESTARTSYS)
2458                 ret = -EINTR;
2459 out:
2460         if (ret < 0)
2461                 req_set_fail_links(req);
2462         io_cqring_add_event(req, ret);
2463         io_put_req_find_next(req, nxt);
2464         return 0;
2465 #else
2466         return -EOPNOTSUPP;
2467 #endif
2468 }
2469
2470 static void io_poll_remove_one(struct io_kiocb *req)
2471 {
2472         struct io_poll_iocb *poll = &req->poll;
2473
2474         spin_lock(&poll->head->lock);
2475         WRITE_ONCE(poll->canceled, true);
2476         if (!list_empty(&poll->wait.entry)) {
2477                 list_del_init(&poll->wait.entry);
2478                 io_queue_async_work(req);
2479         }
2480         spin_unlock(&poll->head->lock);
2481         hash_del(&req->hash_node);
2482 }
2483
2484 static void io_poll_remove_all(struct io_ring_ctx *ctx)
2485 {
2486         struct hlist_node *tmp;
2487         struct io_kiocb *req;
2488         int i;
2489
2490         spin_lock_irq(&ctx->completion_lock);
2491         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
2492                 struct hlist_head *list;
2493
2494                 list = &ctx->cancel_hash[i];
2495                 hlist_for_each_entry_safe(req, tmp, list, hash_node)
2496                         io_poll_remove_one(req);
2497         }
2498         spin_unlock_irq(&ctx->completion_lock);
2499 }
2500
2501 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
2502 {
2503         struct hlist_head *list;
2504         struct io_kiocb *req;
2505
2506         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
2507         hlist_for_each_entry(req, list, hash_node) {
2508                 if (sqe_addr == req->user_data) {
2509                         io_poll_remove_one(req);
2510                         return 0;
2511                 }
2512         }
2513
2514         return -ENOENT;
2515 }
2516
2517 static int io_poll_remove_prep(struct io_kiocb *req,
2518                                const struct io_uring_sqe *sqe)
2519 {
2520         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2521                 return -EINVAL;
2522         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
2523             sqe->poll_events)
2524                 return -EINVAL;
2525
2526         req->poll.addr = READ_ONCE(sqe->addr);
2527         return 0;
2528 }
2529
2530 /*
2531  * Find a running poll command that matches one specified in sqe->addr,
2532  * and remove it if found.
2533  */
2534 static int io_poll_remove(struct io_kiocb *req)
2535 {
2536         struct io_ring_ctx *ctx = req->ctx;
2537         u64 addr;
2538         int ret;
2539
2540         addr = req->poll.addr;
2541         spin_lock_irq(&ctx->completion_lock);
2542         ret = io_poll_cancel(ctx, addr);
2543         spin_unlock_irq(&ctx->completion_lock);
2544
2545         io_cqring_add_event(req, ret);
2546         if (ret < 0)
2547                 req_set_fail_links(req);
2548         io_put_req(req);
2549         return 0;
2550 }
2551
2552 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
2553 {
2554         struct io_ring_ctx *ctx = req->ctx;
2555
2556         req->poll.done = true;
2557         if (error)
2558                 io_cqring_fill_event(req, error);
2559         else
2560                 io_cqring_fill_event(req, mangle_poll(mask));
2561         io_commit_cqring(ctx);
2562 }
2563
2564 static void io_poll_complete_work(struct io_wq_work **workptr)
2565 {
2566         struct io_wq_work *work = *workptr;
2567         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2568         struct io_poll_iocb *poll = &req->poll;
2569         struct poll_table_struct pt = { ._key = poll->events };
2570         struct io_ring_ctx *ctx = req->ctx;
2571         struct io_kiocb *nxt = NULL;
2572         __poll_t mask = 0;
2573         int ret = 0;
2574
2575         if (work->flags & IO_WQ_WORK_CANCEL) {
2576                 WRITE_ONCE(poll->canceled, true);
2577                 ret = -ECANCELED;
2578         } else if (READ_ONCE(poll->canceled)) {
2579                 ret = -ECANCELED;
2580         }
2581
2582         if (ret != -ECANCELED)
2583                 mask = vfs_poll(poll->file, &pt) & poll->events;
2584
2585         /*
2586          * Note that ->ki_cancel callers also delete iocb from active_reqs after
2587          * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
2588          * synchronize with them.  In the cancellation case the list_del_init
2589          * itself is not actually needed, but harmless so we keep it in to
2590          * avoid further branches in the fast path.
2591          */
2592         spin_lock_irq(&ctx->completion_lock);
2593         if (!mask && ret != -ECANCELED) {
2594                 add_wait_queue(poll->head, &poll->wait);
2595                 spin_unlock_irq(&ctx->completion_lock);
2596                 return;
2597         }
2598         hash_del(&req->hash_node);
2599         io_poll_complete(req, mask, ret);
2600         spin_unlock_irq(&ctx->completion_lock);
2601
2602         io_cqring_ev_posted(ctx);
2603
2604         if (ret < 0)
2605                 req_set_fail_links(req);
2606         io_put_req_find_next(req, &nxt);
2607         if (nxt)
2608                 *workptr = &nxt->work;
2609 }
2610
2611 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
2612                         void *key)
2613 {
2614         struct io_poll_iocb *poll = wait->private;
2615         struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
2616         struct io_ring_ctx *ctx = req->ctx;
2617         __poll_t mask = key_to_poll(key);
2618         unsigned long flags;
2619
2620         /* for instances that support it check for an event match first: */
2621         if (mask && !(mask & poll->events))
2622                 return 0;
2623
2624         list_del_init(&poll->wait.entry);
2625
2626         /*
2627          * Run completion inline if we can. We're using trylock here because
2628          * we are violating the completion_lock -> poll wq lock ordering.
2629          * If we have a link timeout we're going to need the completion_lock
2630          * for finalizing the request, mark us as having grabbed that already.
2631          */
2632         if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
2633                 hash_del(&req->hash_node);
2634                 io_poll_complete(req, mask, 0);
2635                 req->flags |= REQ_F_COMP_LOCKED;
2636                 io_put_req(req);
2637                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
2638
2639                 io_cqring_ev_posted(ctx);
2640         } else {
2641                 io_queue_async_work(req);
2642         }
2643
2644         return 1;
2645 }
2646
2647 struct io_poll_table {
2648         struct poll_table_struct pt;
2649         struct io_kiocb *req;
2650         int error;
2651 };
2652
2653 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
2654                                struct poll_table_struct *p)
2655 {
2656         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
2657
2658         if (unlikely(pt->req->poll.head)) {
2659                 pt->error = -EINVAL;
2660                 return;
2661         }
2662
2663         pt->error = 0;
2664         pt->req->poll.head = head;
2665         add_wait_queue(head, &pt->req->poll.wait);
2666 }
2667
2668 static void io_poll_req_insert(struct io_kiocb *req)
2669 {
2670         struct io_ring_ctx *ctx = req->ctx;
2671         struct hlist_head *list;
2672
2673         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
2674         hlist_add_head(&req->hash_node, list);
2675 }
2676
2677 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2678 {
2679         struct io_poll_iocb *poll = &req->poll;
2680         u16 events;
2681
2682         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2683                 return -EINVAL;
2684         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
2685                 return -EINVAL;
2686         if (!poll->file)
2687                 return -EBADF;
2688
2689         events = READ_ONCE(sqe->poll_events);
2690         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
2691         return 0;
2692 }
2693
2694 static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt)
2695 {
2696         struct io_poll_iocb *poll = &req->poll;
2697         struct io_ring_ctx *ctx = req->ctx;
2698         struct io_poll_table ipt;
2699         bool cancel = false;
2700         __poll_t mask;
2701
2702         INIT_IO_WORK(&req->work, io_poll_complete_work);
2703         INIT_HLIST_NODE(&req->hash_node);
2704
2705         poll->head = NULL;
2706         poll->done = false;
2707         poll->canceled = false;
2708
2709         ipt.pt._qproc = io_poll_queue_proc;
2710         ipt.pt._key = poll->events;
2711         ipt.req = req;
2712         ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
2713
2714         /* initialized the list so that we can do list_empty checks */
2715         INIT_LIST_HEAD(&poll->wait.entry);
2716         init_waitqueue_func_entry(&poll->wait, io_poll_wake);
2717         poll->wait.private = poll;
2718
2719         INIT_LIST_HEAD(&req->list);
2720
2721         mask = vfs_poll(poll->file, &ipt.pt) & poll->events;
2722
2723         spin_lock_irq(&ctx->completion_lock);
2724         if (likely(poll->head)) {
2725                 spin_lock(&poll->head->lock);
2726                 if (unlikely(list_empty(&poll->wait.entry))) {
2727                         if (ipt.error)
2728                                 cancel = true;
2729                         ipt.error = 0;
2730                         mask = 0;
2731                 }
2732                 if (mask || ipt.error)
2733                         list_del_init(&poll->wait.entry);
2734                 else if (cancel)
2735                         WRITE_ONCE(poll->canceled, true);
2736                 else if (!poll->done) /* actually waiting for an event */
2737                         io_poll_req_insert(req);
2738                 spin_unlock(&poll->head->lock);
2739         }
2740         if (mask) { /* no async, we'd stolen it */
2741                 ipt.error = 0;
2742                 io_poll_complete(req, mask, 0);
2743         }
2744         spin_unlock_irq(&ctx->completion_lock);
2745
2746         if (mask) {
2747                 io_cqring_ev_posted(ctx);
2748                 io_put_req_find_next(req, nxt);
2749         }
2750         return ipt.error;
2751 }
2752
2753 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
2754 {
2755         struct io_timeout_data *data = container_of(timer,
2756                                                 struct io_timeout_data, timer);
2757         struct io_kiocb *req = data->req;
2758         struct io_ring_ctx *ctx = req->ctx;
2759         unsigned long flags;
2760
2761         atomic_inc(&ctx->cq_timeouts);
2762
2763         spin_lock_irqsave(&ctx->completion_lock, flags);
2764         /*
2765          * We could be racing with timeout deletion. If the list is empty,
2766          * then timeout lookup already found it and will be handling it.
2767          */
2768         if (!list_empty(&req->list)) {
2769                 struct io_kiocb *prev;
2770
2771                 /*
2772                  * Adjust the reqs sequence before the current one because it
2773                  * will consume a slot in the cq_ring and the cq_tail
2774                  * pointer will be increased, otherwise other timeout reqs may
2775                  * return in advance without waiting for enough wait_nr.
2776                  */
2777                 prev = req;
2778                 list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
2779                         prev->sequence++;
2780                 list_del_init(&req->list);
2781         }
2782
2783         io_cqring_fill_event(req, -ETIME);
2784         io_commit_cqring(ctx);
2785         spin_unlock_irqrestore(&ctx->completion_lock, flags);
2786
2787         io_cqring_ev_posted(ctx);
2788         req_set_fail_links(req);
2789         io_put_req(req);
2790         return HRTIMER_NORESTART;
2791 }
2792
2793 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
2794 {
2795         struct io_kiocb *req;
2796         int ret = -ENOENT;
2797
2798         list_for_each_entry(req, &ctx->timeout_list, list) {
2799                 if (user_data == req->user_data) {
2800                         list_del_init(&req->list);
2801                         ret = 0;
2802                         break;
2803                 }
2804         }
2805
2806         if (ret == -ENOENT)
2807                 return ret;
2808
2809         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
2810         if (ret == -1)
2811                 return -EALREADY;
2812
2813         req_set_fail_links(req);
2814         io_cqring_fill_event(req, -ECANCELED);
2815         io_put_req(req);
2816         return 0;
2817 }
2818
2819 static int io_timeout_remove_prep(struct io_kiocb *req,
2820                                   const struct io_uring_sqe *sqe)
2821 {
2822         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2823                 return -EINVAL;
2824         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
2825                 return -EINVAL;
2826
2827         req->timeout.addr = READ_ONCE(sqe->addr);
2828         req->timeout.flags = READ_ONCE(sqe->timeout_flags);
2829         if (req->timeout.flags)
2830                 return -EINVAL;
2831
2832         return 0;
2833 }
2834
2835 /*
2836  * Remove or update an existing timeout command
2837  */
2838 static int io_timeout_remove(struct io_kiocb *req)
2839 {
2840         struct io_ring_ctx *ctx = req->ctx;
2841         int ret;
2842
2843         spin_lock_irq(&ctx->completion_lock);
2844         ret = io_timeout_cancel(ctx, req->timeout.addr);
2845
2846         io_cqring_fill_event(req, ret);
2847         io_commit_cqring(ctx);
2848         spin_unlock_irq(&ctx->completion_lock);
2849         io_cqring_ev_posted(ctx);
2850         if (ret < 0)
2851                 req_set_fail_links(req);
2852         io_put_req(req);
2853         return 0;
2854 }
2855
2856 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2857                            bool is_timeout_link)
2858 {
2859         struct io_timeout_data *data;
2860         unsigned flags;
2861
2862         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2863                 return -EINVAL;
2864         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
2865                 return -EINVAL;
2866         if (sqe->off && is_timeout_link)
2867                 return -EINVAL;
2868         flags = READ_ONCE(sqe->timeout_flags);
2869         if (flags & ~IORING_TIMEOUT_ABS)
2870                 return -EINVAL;
2871
2872         req->timeout.count = READ_ONCE(sqe->off);
2873
2874         if (!req->io && io_alloc_async_ctx(req))
2875                 return -ENOMEM;
2876
2877         data = &req->io->timeout;
2878         data->req = req;
2879         req->flags |= REQ_F_TIMEOUT;
2880
2881         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
2882                 return -EFAULT;
2883
2884         if (flags & IORING_TIMEOUT_ABS)
2885                 data->mode = HRTIMER_MODE_ABS;
2886         else
2887                 data->mode = HRTIMER_MODE_REL;
2888
2889         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
2890         return 0;
2891 }
2892
2893 static int io_timeout(struct io_kiocb *req)
2894 {
2895         unsigned count;
2896         struct io_ring_ctx *ctx = req->ctx;
2897         struct io_timeout_data *data;
2898         struct list_head *entry;
2899         unsigned span = 0;
2900
2901         data = &req->io->timeout;
2902
2903         /*
2904          * sqe->off holds how many events that need to occur for this
2905          * timeout event to be satisfied. If it isn't set, then this is
2906          * a pure timeout request, sequence isn't used.
2907          */
2908         count = req->timeout.count;
2909         if (!count) {
2910                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
2911                 spin_lock_irq(&ctx->completion_lock);
2912                 entry = ctx->timeout_list.prev;
2913                 goto add;
2914         }
2915
2916         req->sequence = ctx->cached_sq_head + count - 1;
2917         data->seq_offset = count;
2918
2919         /*
2920          * Insertion sort, ensuring the first entry in the list is always
2921          * the one we need first.
2922          */
2923         spin_lock_irq(&ctx->completion_lock);
2924         list_for_each_prev(entry, &ctx->timeout_list) {
2925                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
2926                 unsigned nxt_sq_head;
2927                 long long tmp, tmp_nxt;
2928                 u32 nxt_offset = nxt->io->timeout.seq_offset;
2929
2930                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
2931                         continue;
2932
2933                 /*
2934                  * Since cached_sq_head + count - 1 can overflow, use type long
2935                  * long to store it.
2936                  */
2937                 tmp = (long long)ctx->cached_sq_head + count - 1;
2938                 nxt_sq_head = nxt->sequence - nxt_offset + 1;
2939                 tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
2940
2941                 /*
2942                  * cached_sq_head may overflow, and it will never overflow twice
2943                  * once there is some timeout req still be valid.
2944                  */
2945                 if (ctx->cached_sq_head < nxt_sq_head)
2946                         tmp += UINT_MAX;
2947
2948                 if (tmp > tmp_nxt)
2949                         break;
2950
2951                 /*
2952                  * Sequence of reqs after the insert one and itself should
2953                  * be adjusted because each timeout req consumes a slot.
2954                  */
2955                 span++;
2956                 nxt->sequence++;
2957         }
2958         req->sequence -= span;
2959 add:
2960         list_add(&req->list, entry);
2961         data->timer.function = io_timeout_fn;
2962         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
2963         spin_unlock_irq(&ctx->completion_lock);
2964         return 0;
2965 }
2966
2967 static bool io_cancel_cb(struct io_wq_work *work, void *data)
2968 {
2969         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
2970
2971         return req->user_data == (unsigned long) data;
2972 }
2973
2974 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
2975 {
2976         enum io_wq_cancel cancel_ret;
2977         int ret = 0;
2978
2979         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
2980         switch (cancel_ret) {
2981         case IO_WQ_CANCEL_OK:
2982                 ret = 0;
2983                 break;
2984         case IO_WQ_CANCEL_RUNNING:
2985                 ret = -EALREADY;
2986                 break;
2987         case IO_WQ_CANCEL_NOTFOUND:
2988                 ret = -ENOENT;
2989                 break;
2990         }
2991
2992         return ret;
2993 }
2994
2995 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
2996                                      struct io_kiocb *req, __u64 sqe_addr,
2997                                      struct io_kiocb **nxt, int success_ret)
2998 {
2999         unsigned long flags;
3000         int ret;
3001
3002         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
3003         if (ret != -ENOENT) {
3004                 spin_lock_irqsave(&ctx->completion_lock, flags);
3005                 goto done;
3006         }
3007
3008         spin_lock_irqsave(&ctx->completion_lock, flags);
3009         ret = io_timeout_cancel(ctx, sqe_addr);
3010         if (ret != -ENOENT)
3011                 goto done;
3012         ret = io_poll_cancel(ctx, sqe_addr);
3013 done:
3014         if (!ret)
3015                 ret = success_ret;
3016         io_cqring_fill_event(req, ret);
3017         io_commit_cqring(ctx);
3018         spin_unlock_irqrestore(&ctx->completion_lock, flags);
3019         io_cqring_ev_posted(ctx);
3020
3021         if (ret < 0)
3022                 req_set_fail_links(req);
3023         io_put_req_find_next(req, nxt);
3024 }
3025
3026 static int io_async_cancel_prep(struct io_kiocb *req,
3027                                 const struct io_uring_sqe *sqe)
3028 {
3029         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3030                 return -EINVAL;
3031         if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
3032             sqe->cancel_flags)
3033                 return -EINVAL;
3034
3035         req->cancel.addr = READ_ONCE(sqe->addr);
3036         return 0;
3037 }
3038
3039 static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
3040 {
3041         struct io_ring_ctx *ctx = req->ctx;
3042
3043         io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
3044         return 0;
3045 }
3046
3047 static int io_req_defer_prep(struct io_kiocb *req,
3048                              const struct io_uring_sqe *sqe)
3049 {
3050         ssize_t ret = 0;
3051
3052         switch (req->opcode) {
3053         case IORING_OP_NOP:
3054                 break;
3055         case IORING_OP_READV:
3056         case IORING_OP_READ_FIXED:
3057                 ret = io_read_prep(req, sqe, true);
3058                 break;
3059         case IORING_OP_WRITEV:
3060         case IORING_OP_WRITE_FIXED:
3061                 ret = io_write_prep(req, sqe, true);
3062                 break;
3063         case IORING_OP_POLL_ADD:
3064                 ret = io_poll_add_prep(req, sqe);
3065                 break;
3066         case IORING_OP_POLL_REMOVE:
3067                 ret = io_poll_remove_prep(req, sqe);
3068                 break;
3069         case IORING_OP_FSYNC:
3070                 ret = io_prep_fsync(req, sqe);
3071                 break;
3072         case IORING_OP_SYNC_FILE_RANGE:
3073                 ret = io_prep_sfr(req, sqe);
3074                 break;
3075         case IORING_OP_SENDMSG:
3076                 ret = io_sendmsg_prep(req, sqe);
3077                 break;
3078         case IORING_OP_RECVMSG:
3079                 ret = io_recvmsg_prep(req, sqe);
3080                 break;
3081         case IORING_OP_CONNECT:
3082                 ret = io_connect_prep(req, sqe);
3083                 break;
3084         case IORING_OP_TIMEOUT:
3085                 ret = io_timeout_prep(req, sqe, false);
3086                 break;
3087         case IORING_OP_TIMEOUT_REMOVE:
3088                 ret = io_timeout_remove_prep(req, sqe);
3089                 break;
3090         case IORING_OP_ASYNC_CANCEL:
3091                 ret = io_async_cancel_prep(req, sqe);
3092                 break;
3093         case IORING_OP_LINK_TIMEOUT:
3094                 ret = io_timeout_prep(req, sqe, true);
3095                 break;
3096         case IORING_OP_ACCEPT:
3097                 ret = io_accept_prep(req, sqe);
3098                 break;
3099         default:
3100                 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
3101                                 req->opcode);
3102                 ret = -EINVAL;
3103                 break;
3104         }
3105
3106         return ret;
3107 }
3108
3109 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3110 {
3111         struct io_ring_ctx *ctx = req->ctx;
3112         int ret;
3113
3114         /* Still need defer if there is pending req in defer list. */
3115         if (!req_need_defer(req) && list_empty(&ctx->defer_list))
3116                 return 0;
3117
3118         if (!req->io && io_alloc_async_ctx(req))
3119                 return -EAGAIN;
3120
3121         ret = io_req_defer_prep(req, sqe);
3122         if (ret < 0)
3123                 return ret;
3124
3125         spin_lock_irq(&ctx->completion_lock);
3126         if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
3127                 spin_unlock_irq(&ctx->completion_lock);
3128                 return 0;
3129         }
3130
3131         trace_io_uring_defer(ctx, req, req->user_data);
3132         list_add_tail(&req->list, &ctx->defer_list);
3133         spin_unlock_irq(&ctx->completion_lock);
3134         return -EIOCBQUEUED;
3135 }
3136
3137 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3138                         struct io_kiocb **nxt, bool force_nonblock)
3139 {
3140         struct io_ring_ctx *ctx = req->ctx;
3141         int ret;
3142
3143         switch (req->opcode) {
3144         case IORING_OP_NOP:
3145                 ret = io_nop(req);
3146                 break;
3147         case IORING_OP_READV:
3148         case IORING_OP_READ_FIXED:
3149                 if (sqe) {
3150                         ret = io_read_prep(req, sqe, force_nonblock);
3151                         if (ret < 0)
3152                                 break;
3153                 }
3154                 ret = io_read(req, nxt, force_nonblock);
3155                 break;
3156         case IORING_OP_WRITEV:
3157         case IORING_OP_WRITE_FIXED:
3158                 if (sqe) {
3159                         ret = io_write_prep(req, sqe, force_nonblock);
3160                         if (ret < 0)
3161                                 break;
3162                 }
3163                 ret = io_write(req, nxt, force_nonblock);
3164                 break;
3165         case IORING_OP_FSYNC:
3166                 if (sqe) {
3167                         ret = io_prep_fsync(req, sqe);
3168                         if (ret < 0)
3169                                 break;
3170                 }
3171                 ret = io_fsync(req, nxt, force_nonblock);
3172                 break;
3173         case IORING_OP_POLL_ADD:
3174                 if (sqe) {
3175                         ret = io_poll_add_prep(req, sqe);
3176                         if (ret)
3177                                 break;
3178                 }
3179                 ret = io_poll_add(req, nxt);
3180                 break;
3181         case IORING_OP_POLL_REMOVE:
3182                 if (sqe) {
3183                         ret = io_poll_remove_prep(req, sqe);
3184                         if (ret < 0)
3185                                 break;
3186                 }
3187                 ret = io_poll_remove(req);
3188                 break;
3189         case IORING_OP_SYNC_FILE_RANGE:
3190                 if (sqe) {
3191                         ret = io_prep_sfr(req, sqe);
3192                         if (ret < 0)
3193                                 break;
3194                 }
3195                 ret = io_sync_file_range(req, nxt, force_nonblock);
3196                 break;
3197         case IORING_OP_SENDMSG:
3198                 if (sqe) {
3199                         ret = io_sendmsg_prep(req, sqe);
3200                         if (ret < 0)
3201                                 break;
3202                 }
3203                 ret = io_sendmsg(req, nxt, force_nonblock);
3204                 break;
3205         case IORING_OP_RECVMSG:
3206                 if (sqe) {
3207                         ret = io_recvmsg_prep(req, sqe);
3208                         if (ret)
3209                                 break;
3210                 }
3211                 ret = io_recvmsg(req, nxt, force_nonblock);
3212                 break;
3213         case IORING_OP_TIMEOUT:
3214                 if (sqe) {
3215                         ret = io_timeout_prep(req, sqe, false);
3216                         if (ret)
3217                                 break;
3218                 }
3219                 ret = io_timeout(req);
3220                 break;
3221         case IORING_OP_TIMEOUT_REMOVE:
3222                 if (sqe) {
3223                         ret = io_timeout_remove_prep(req, sqe);
3224                         if (ret)
3225                                 break;
3226                 }
3227                 ret = io_timeout_remove(req);
3228                 break;
3229         case IORING_OP_ACCEPT:
3230                 if (sqe) {
3231                         ret = io_accept_prep(req, sqe);
3232                         if (ret)
3233                                 break;
3234                 }
3235                 ret = io_accept(req, nxt, force_nonblock);
3236                 break;
3237         case IORING_OP_CONNECT:
3238                 if (sqe) {
3239                         ret = io_connect_prep(req, sqe);
3240                         if (ret)
3241                                 break;
3242                 }
3243                 ret = io_connect(req, nxt, force_nonblock);
3244                 break;
3245         case IORING_OP_ASYNC_CANCEL:
3246                 if (sqe) {
3247                         ret = io_async_cancel_prep(req, sqe);
3248                         if (ret)
3249                                 break;
3250                 }
3251                 ret = io_async_cancel(req, nxt);
3252                 break;
3253         default:
3254                 ret = -EINVAL;
3255                 break;
3256         }
3257
3258         if (ret)
3259                 return ret;
3260
3261         if (ctx->flags & IORING_SETUP_IOPOLL) {
3262                 if (req->result == -EAGAIN)
3263                         return -EAGAIN;
3264
3265                 io_iopoll_req_issued(req);
3266         }
3267
3268         return 0;
3269 }
3270
3271 static void io_link_work_cb(struct io_wq_work **workptr)
3272 {
3273         struct io_wq_work *work = *workptr;
3274         struct io_kiocb *link = work->data;
3275
3276         io_queue_linked_timeout(link);
3277         work->func = io_wq_submit_work;
3278 }
3279
3280 static void io_wq_submit_work(struct io_wq_work **workptr)
3281 {
3282         struct io_wq_work *work = *workptr;
3283         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
3284         struct io_kiocb *nxt = NULL;
3285         int ret = 0;
3286
3287         if (work->flags & IO_WQ_WORK_CANCEL)
3288                 ret = -ECANCELED;
3289
3290         if (!ret) {
3291                 req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
3292                 req->in_async = true;
3293                 do {
3294                         ret = io_issue_sqe(req, NULL, &nxt, false);
3295                         /*
3296                          * We can get EAGAIN for polled IO even though we're
3297                          * forcing a sync submission from here, since we can't
3298                          * wait for request slots on the block side.
3299                          */
3300                         if (ret != -EAGAIN)
3301                                 break;
3302                         cond_resched();
3303                 } while (1);
3304         }
3305
3306         /* drop submission reference */
3307         io_put_req(req);
3308
3309         if (ret) {
3310                 req_set_fail_links(req);
3311                 io_cqring_add_event(req, ret);
3312                 io_put_req(req);
3313         }
3314
3315         /* if a dependent link is ready, pass it back */
3316         if (!ret && nxt) {
3317                 struct io_kiocb *link;
3318
3319                 io_prep_async_work(nxt, &link);
3320                 *workptr = &nxt->work;
3321                 if (link) {
3322                         nxt->work.flags |= IO_WQ_WORK_CB;
3323                         nxt->work.func = io_link_work_cb;
3324                         nxt->work.data = link;
3325                 }
3326         }
3327 }
3328
3329 static bool io_req_op_valid(int op)
3330 {
3331         return op >= IORING_OP_NOP && op < IORING_OP_LAST;
3332 }
3333
3334 static int io_req_needs_file(struct io_kiocb *req)
3335 {
3336         switch (req->opcode) {
3337         case IORING_OP_NOP:
3338         case IORING_OP_POLL_REMOVE:
3339         case IORING_OP_TIMEOUT:
3340         case IORING_OP_TIMEOUT_REMOVE:
3341         case IORING_OP_ASYNC_CANCEL:
3342         case IORING_OP_LINK_TIMEOUT:
3343                 return 0;
3344         default:
3345                 if (io_req_op_valid(req->opcode))
3346                         return 1;
3347                 return -EINVAL;
3348         }
3349 }
3350
3351 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
3352                                               int index)
3353 {
3354         struct fixed_file_table *table;
3355
3356         table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
3357         return table->files[index & IORING_FILE_TABLE_MASK];
3358 }
3359
3360 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
3361                            const struct io_uring_sqe *sqe)
3362 {
3363         struct io_ring_ctx *ctx = req->ctx;
3364         unsigned flags;
3365         int fd, ret;
3366
3367         flags = READ_ONCE(sqe->flags);
3368         fd = READ_ONCE(sqe->fd);
3369
3370         if (flags & IOSQE_IO_DRAIN)
3371                 req->flags |= REQ_F_IO_DRAIN;
3372
3373         ret = io_req_needs_file(req);
3374         if (ret <= 0)
3375                 return ret;
3376
3377         if (flags & IOSQE_FIXED_FILE) {
3378                 if (unlikely(!ctx->file_table ||
3379                     (unsigned) fd >= ctx->nr_user_files))
3380                         return -EBADF;
3381                 fd = array_index_nospec(fd, ctx->nr_user_files);
3382                 req->file = io_file_from_index(ctx, fd);
3383                 if (!req->file)
3384                         return -EBADF;
3385                 req->flags |= REQ_F_FIXED_FILE;
3386         } else {
3387                 if (req->needs_fixed_file)
3388                         return -EBADF;
3389                 trace_io_uring_file_get(ctx, fd);
3390                 req->file = io_file_get(state, fd);
3391                 if (unlikely(!req->file))
3392                         return -EBADF;
3393         }
3394
3395         return 0;
3396 }
3397
3398 static int io_grab_files(struct io_kiocb *req)
3399 {
3400         int ret = -EBADF;
3401         struct io_ring_ctx *ctx = req->ctx;
3402
3403         rcu_read_lock();
3404         spin_lock_irq(&ctx->inflight_lock);
3405         /*
3406          * We use the f_ops->flush() handler to ensure that we can flush
3407          * out work accessing these files if the fd is closed. Check if
3408          * the fd has changed since we started down this path, and disallow
3409          * this operation if it has.
3410          */
3411         if (fcheck(req->ring_fd) == req->ring_file) {
3412                 list_add(&req->inflight_entry, &ctx->inflight_list);
3413                 req->flags |= REQ_F_INFLIGHT;
3414                 req->work.files = current->files;
3415                 ret = 0;
3416         }
3417         spin_unlock_irq(&ctx->inflight_lock);
3418         rcu_read_unlock();
3419
3420         return ret;
3421 }
3422
/*
 * hrtimer callback installed by io_queue_linked_timeout().
 *
 * The request preceding the timeout on req->link_list is looked up under
 * ctx->completion_lock. If a reference on it can be taken, the timeout is
 * unlinked from the list and, after dropping the lock, that request is
 * handed to io_async_find_and_cancel() with -ETIME. Otherwise the timeout
 * request itself gets a -ETIME completion.
 *
 * Always returns HRTIMER_NORESTART.
 */
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
        struct io_timeout_data *data = container_of(timer,
                                                struct io_timeout_data, timer);
        struct io_kiocb *req = data->req;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *prev = NULL;
        unsigned long flags;

        spin_lock_irqsave(&ctx->completion_lock, flags);

        /*
         * We don't expect the list to be empty, that will only happen if we
         * race with the completion of the linked work.
         */
        if (!list_empty(&req->link_list)) {
                prev = list_entry(req->link_list.prev, struct io_kiocb,
                                  link_list);
                /* only act on prev if its refcount has not already hit zero */
                if (refcount_inc_not_zero(&prev->refs)) {
                        list_del_init(&req->link_list);
                        prev->flags &= ~REQ_F_LINK_TIMEOUT;
                } else
                        prev = NULL;
        }

        spin_unlock_irqrestore(&ctx->completion_lock, flags);

        if (prev) {
                req_set_fail_links(prev);
                io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
                                                -ETIME);
                /* drop the reference taken under the lock above */
                io_put_req(prev);
        } else {
                io_cqring_add_event(req, -ETIME);
                io_put_req(req);
        }
        return HRTIMER_NORESTART;
}
3461
3462 static void io_queue_linked_timeout(struct io_kiocb *req)
3463 {
3464         struct io_ring_ctx *ctx = req->ctx;
3465
3466         /*
3467          * If the list is now empty, then our linked request finished before
3468          * we got a chance to setup the timer
3469          */
3470         spin_lock_irq(&ctx->completion_lock);
3471         if (!list_empty(&req->link_list)) {
3472                 struct io_timeout_data *data = &req->io->timeout;
3473
3474                 data->timer.function = io_link_timeout_fn;
3475                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
3476                                 data->mode);
3477         }
3478         spin_unlock_irq(&ctx->completion_lock);
3479
3480         /* drop submission reference */
3481         io_put_req(req);
3482 }
3483
3484 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
3485 {
3486         struct io_kiocb *nxt;
3487
3488         if (!(req->flags & REQ_F_LINK))
3489                 return NULL;
3490
3491         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
3492                                         link_list);
3493         if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
3494                 return NULL;
3495
3496         req->flags |= REQ_F_LINK_TIMEOUT;
3497         return nxt;
3498 }
3499
/*
 * Issue a request through io_issue_sqe().
 *
 * If that returns -EAGAIN and the request may be punted (REQ_F_NOWAIT is
 * not set, or REQ_F_MUST_PUNT is set), the request is handed to
 * io_queue_async_work() instead. For any other outcome the submission
 * reference is dropped here; on error a completion carrying the error is
 * posted, links are failed and the final reference is dropped as well.
 *
 * A linked timeout found by io_prep_linked_timeout() is armed when the
 * issue succeeded and released otherwise.
 *
 * If io_issue_sqe() hands back a follow-on request through nxt, the same
 * sequence is repeated for it.
 */
static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_kiocb *linked_timeout;
        struct io_kiocb *nxt = NULL;
        int ret;

again:
        linked_timeout = io_prep_linked_timeout(req);

        /*
         * NOTE(review): on the 'again' iterations req is the follow-on
         * request but sqe is still the one passed in by the caller; confirm
         * io_issue_sqe() does not re-read it for such requests.
         */
        ret = io_issue_sqe(req, sqe, &nxt, true);

        /*
         * We async punt it if the file wasn't marked NOWAIT, or if the file
         * doesn't support non-blocking read/write attempts
         */
        if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
            (req->flags & REQ_F_MUST_PUNT))) {
                if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
                        ret = io_grab_files(req);
                        if (ret)
                                goto err;
                }

                /*
                 * Queued up for async execution, worker will release
                 * submit reference when the iocb is actually submitted.
                 */
                io_queue_async_work(req);
                goto done_req;
        }

err:
        /* drop submission reference */
        io_put_req(req);

        if (linked_timeout) {
                if (!ret)
                        io_queue_linked_timeout(linked_timeout);
                else
                        io_put_req(linked_timeout);
        }

        /* and drop final reference, if we failed */
        if (ret) {
                io_cqring_add_event(req, ret);
                req_set_fail_links(req);
                io_put_req(req);
        }
done_req:
        /* continue with the request io_issue_sqe() handed back, if any */
        if (nxt) {
                req = nxt;
                nxt = NULL;
                goto again;
        }
}
3555
3556 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3557 {
3558         int ret;
3559
3560         if (unlikely(req->ctx->drain_next)) {
3561                 req->flags |= REQ_F_IO_DRAIN;
3562                 req->ctx->drain_next = false;
3563         }
3564         req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
3565
3566         ret = io_req_defer(req, sqe);
3567         if (ret) {
3568                 if (ret != -EIOCBQUEUED) {
3569                         io_cqring_add_event(req, ret);
3570                         req_set_fail_links(req);
3571                         io_double_put_req(req);
3572                 }
3573         } else
3574                 __io_queue_sqe(req, sqe);
3575 }
3576
3577 static inline void io_queue_link_head(struct io_kiocb *req)
3578 {
3579         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
3580                 io_cqring_add_event(req, -ECANCELED);
3581                 io_double_put_req(req);
3582         } else
3583                 io_queue_sqe(req, NULL);
3584 }
3585
/*
 * sqe->flags bits accepted at submission; io_submit_sqe() fails a request
 * with -EINVAL if any other bit is set.
 */
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
                                IOSQE_IO_HARDLINK)
3588
3589 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3590                           struct io_submit_state *state, struct io_kiocb **link)
3591 {
3592         struct io_ring_ctx *ctx = req->ctx;
3593         int ret;
3594
3595         /* enforce forwards compatibility on users */
3596         if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) {
3597                 ret = -EINVAL;
3598                 goto err_req;
3599         }
3600
3601         ret = io_req_set_file(state, req, sqe);
3602         if (unlikely(ret)) {
3603 err_req:
3604                 io_cqring_add_event(req, ret);
3605                 io_double_put_req(req);
3606                 return false;
3607         }
3608
3609         /*
3610          * If we already have a head request, queue this one for async
3611          * submittal once the head completes. If we don't have a head but
3612          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
3613          * submitted sync once the chain is complete. If none of those
3614          * conditions are true (normal request), then just queue it.
3615          */
3616         if (*link) {
3617                 struct io_kiocb *prev = *link;
3618
3619                 if (sqe->flags & IOSQE_IO_DRAIN)
3620                         (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
3621
3622                 if (sqe->flags & IOSQE_IO_HARDLINK)
3623                         req->flags |= REQ_F_HARDLINK;
3624
3625                 if (io_alloc_async_ctx(req)) {
3626                         ret = -EAGAIN;
3627                         goto err_req;
3628                 }
3629
3630                 ret = io_req_defer_prep(req, sqe);
3631                 if (ret) {
3632                         /* fail even hard links since we don't submit */
3633                         prev->flags |= REQ_F_FAIL_LINK;
3634                         goto err_req;
3635                 }
3636                 trace_io_uring_link(ctx, req, prev);
3637                 list_add_tail(&req->link_list, &prev->link_list);
3638         } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
3639                 req->flags |= REQ_F_LINK;
3640                 if (sqe->flags & IOSQE_IO_HARDLINK)
3641                         req->flags |= REQ_F_HARDLINK;
3642
3643                 INIT_LIST_HEAD(&req->link_list);
3644                 ret = io_req_defer_prep(req, sqe);
3645                 if (ret)
3646                         req->flags |= REQ_F_FAIL_LINK;
3647                 *link = req;
3648         } else {
3649                 io_queue_sqe(req, sqe);
3650         }
3651
3652         return true;
3653 }
3654
3655 /*
3656  * Batched submission is done, ensure local IO is flushed out.
3657  */
3658 static void io_submit_state_end(struct io_submit_state *state)
3659 {
3660         blk_finish_plug(&state->plug);
3661         io_file_put(state);
3662         if (state->free_reqs)
3663                 kmem_cache_free_bulk(req_cachep, state->free_reqs,
3664                                         &state->reqs[state->cur_req]);
3665 }
3666
3667 /*
3668  * Start submission side cache.
3669  */
3670 static void io_submit_state_start(struct io_submit_state *state,
3671                                   unsigned int max_ios)
3672 {
3673         blk_start_plug(&state->plug);
3674         state->free_reqs = 0;
3675         state->file = NULL;
3676         state->ios_left = max_ios;
3677 }
3678
3679 static void io_commit_sqring(struct io_ring_ctx *ctx)
3680 {
3681         struct io_rings *rings = ctx->rings;
3682
3683         if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
3684                 /*
3685                  * Ensure any loads from the SQEs are done at this point,
3686                  * since once we write the new head, the application could
3687                  * write new data to them.
3688                  */
3689                 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
3690         }
3691 }
3692
/*
 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 *
 * Returns true with *sqe_ptr set and req->sequence, req->opcode and
 * req->user_data filled in. Returns false if the ring is empty, or if the
 * sq_array slot holds an out-of-range index; in the latter case the slot is
 * still consumed and counted in rings->sq_dropped.
 */
static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
                          const struct io_uring_sqe **sqe_ptr)
{
        struct io_rings *rings = ctx->rings;
        u32 *sq_array = ctx->sq_array;
        unsigned head;

        /*
         * The cached sq head (or cq tail) serves two purposes:
         *
         * 1) allows us to batch the cost of updating the user visible
         *    head updates.
         * 2) allows the kernel side to track the head on its own, even
         *    though the application is the one updating it.
         */
        head = ctx->cached_sq_head;
        /* make sure SQ entry isn't read before tail */
        if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
                return false;

        /* from here on 'head' is the sqe index stored in the sq_array slot */
        head = READ_ONCE(sq_array[head & ctx->sq_mask]);
        if (likely(head < ctx->sq_entries)) {
                /*
                 * Every request records the ring position it came from. For
                 * LINK combined with DRAIN, this marks the position of the
                 * first IO in the link list.
                 */
                req->sequence = ctx->cached_sq_head;
                *sqe_ptr = &ctx->sq_sqes[head];
                req->opcode = READ_ONCE((*sqe_ptr)->opcode);
                req->user_data = READ_ONCE((*sqe_ptr)->user_data);
                ctx->cached_sq_head++;
                return true;
        }

        /* drop invalid entries */
        ctx->cached_sq_head++;
        ctx->cached_sq_dropped++;
        WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
        return false;
}
3742
3743 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
3744                           struct file *ring_file, int ring_fd,
3745                           struct mm_struct **mm, bool async)
3746 {
3747         struct io_submit_state state, *statep = NULL;
3748         struct io_kiocb *link = NULL;
3749         int i, submitted = 0;
3750         bool mm_fault = false;
3751
3752         /* if we have a backlog and couldn't flush it all, return BUSY */
3753         if (!list_empty(&ctx->cq_overflow_list) &&
3754             !io_cqring_overflow_flush(ctx, false))
3755                 return -EBUSY;
3756
3757         if (nr > IO_PLUG_THRESHOLD) {
3758                 io_submit_state_start(&state, nr);
3759                 statep = &state;
3760         }
3761
3762         for (i = 0; i < nr; i++) {
3763                 const struct io_uring_sqe *sqe;
3764                 struct io_kiocb *req;
3765                 unsigned int sqe_flags;
3766
3767                 req = io_get_req(ctx, statep);
3768                 if (unlikely(!req)) {
3769                         if (!submitted)
3770                                 submitted = -EAGAIN;
3771                         break;
3772                 }
3773                 if (!io_get_sqring(ctx, req, &sqe)) {
3774                         __io_free_req(req);
3775                         break;
3776                 }
3777
3778                 if (io_req_needs_user(req) && !*mm) {
3779                         mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
3780                         if (!mm_fault) {
3781                                 use_mm(ctx->sqo_mm);
3782                                 *mm = ctx->sqo_mm;
3783                         }
3784                 }
3785
3786                 submitted++;
3787                 sqe_flags = sqe->flags;
3788
3789                 req->ring_file = ring_file;
3790                 req->ring_fd = ring_fd;
3791                 req->has_user = *mm != NULL;
3792                 req->in_async = async;
3793                 req->needs_fixed_file = async;
3794                 trace_io_uring_submit_sqe(ctx, req->user_data, true, async);
3795                 if (!io_submit_sqe(req, sqe, statep, &link))
3796                         break;
3797                 /*
3798                  * If previous wasn't linked and we have a linked command,
3799                  * that's the end of the chain. Submit the previous link.
3800                  */
3801                 if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
3802                         io_queue_link_head(link);
3803                         link = NULL;
3804                 }
3805         }
3806
3807         if (link)
3808                 io_queue_link_head(link);
3809         if (statep)
3810                 io_submit_state_end(&state);
3811
3812          /* Commit SQ ring head once we've consumed and submitted all SQEs */
3813         io_commit_sqring(ctx);
3814
3815         return submitted;
3816 }
3817
/*
 * Thread function of the SQ thread.
 *
 * Runs with USER_DS and ctx->creds, and loops until asked to park:
 *  - if requests are counted as inflight, polls for completions in
 *    IORING_SETUP_IOPOLL mode, or simply treats them all as done otherwise;
 *  - submits whatever is on the SQ ring through io_submit_sqes();
 *  - with nothing to submit (or after -EBUSY), keeps spinning while there
 *    is inflight work or the idle period (ctx->sq_thread_idle jiffies) has
 *    not expired, and otherwise sets IORING_SQ_NEED_WAKEUP in the shared SQ
 *    flags and sleeps on ctx->sqo_wait.
 *
 * @data is the struct io_ring_ctx. Always returns 0.
 */
static int io_sq_thread(void *data)
{
        struct io_ring_ctx *ctx = data;
        struct mm_struct *cur_mm = NULL;
        const struct cred *old_cred;
        mm_segment_t old_fs;
        DEFINE_WAIT(wait);
        unsigned inflight;
        unsigned long timeout;
        int ret;

        /* signal that the thread is running; io_sq_thread_stop() waits on this */
        complete(&ctx->completions[1]);

        old_fs = get_fs();
        set_fs(USER_DS);
        old_cred = override_creds(ctx->creds);

        ret = timeout = inflight = 0;
        while (!kthread_should_park()) {
                unsigned int to_submit;

                if (inflight) {
                        unsigned nr_events = 0;

                        if (ctx->flags & IORING_SETUP_IOPOLL) {
                                /*
                                 * inflight is the count of the maximum possible
                                 * entries we submitted, but it can be smaller
                                 * if we dropped some of them. If we don't have
                                 * poll entries available, then we know that we
                                 * have nothing left to poll for. Reset the
                                 * inflight count to zero in that case.
                                 */
                                mutex_lock(&ctx->uring_lock);
                                if (!list_empty(&ctx->poll_list))
                                        __io_iopoll_check(ctx, &nr_events, 0);
                                else
                                        inflight = 0;
                                mutex_unlock(&ctx->uring_lock);
                        } else {
                                /*
                                 * Normal IO, just pretend everything completed.
                                 * We don't have to poll completions for that.
                                 */
                                nr_events = inflight;
                        }

                        inflight -= nr_events;
                        /* idle period starts once nothing is inflight any more */
                        if (!inflight)
                                timeout = jiffies + ctx->sq_thread_idle;
                }

                to_submit = io_sqring_entries(ctx);

                /*
                 * If submit got -EBUSY, flag us as needing the application
                 * to enter the kernel to reap and flush events.
                 */
                if (!to_submit || ret == -EBUSY) {
                        /*
                         * We're polling. If we're within the defined idle
                         * period, then let us spin without work before going
                         * to sleep. The exception is if we got EBUSY doing
                         * more IO, we should wait for the application to
                         * reap events and wake us up.
                         */
                        if (inflight ||
                            (!time_after(jiffies, timeout) && ret != -EBUSY)) {
                                cond_resched();
                                continue;
                        }

                        /*
                         * Drop cur_mm before scheduling, we can't hold it for
                         * long periods (or over schedule()). Do this before
                         * adding ourselves to the waitqueue, as the unuse/drop
                         * may sleep.
                         */
                        if (cur_mm) {
                                unuse_mm(cur_mm);
                                mmput(cur_mm);
                                cur_mm = NULL;
                        }

                        prepare_to_wait(&ctx->sqo_wait, &wait,
                                                TASK_INTERRUPTIBLE);

                        /* Tell userspace we may need a wakeup call */
                        ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
                        /* make sure to read SQ tail after writing flags */
                        smp_mb();

                        /* re-check for entries added before the flag was visible */
                        to_submit = io_sqring_entries(ctx);
                        if (!to_submit || ret == -EBUSY) {
                                if (kthread_should_park()) {
                                        finish_wait(&ctx->sqo_wait, &wait);
                                        break;
                                }
                                if (signal_pending(current))
                                        flush_signals(current);
                                schedule();
                                finish_wait(&ctx->sqo_wait, &wait);

                                ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
                                continue;
                        }
                        finish_wait(&ctx->sqo_wait, &wait);

                        ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
                }

                /* never submit more than the ring holds in one go */
                to_submit = min(to_submit, ctx->sq_entries);
                mutex_lock(&ctx->uring_lock);
                ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
                mutex_unlock(&ctx->uring_lock);
                if (ret > 0)
                        inflight += ret;
        }

        /* undo the address limit, mm and credential overrides from above */
        set_fs(old_fs);
        if (cur_mm) {
                unuse_mm(cur_mm);
                mmput(cur_mm);
        }
        revert_creds(old_cred);

        kthread_parkme();

        return 0;
}
3948
/*
 * Per-waiter state used by io_cqring_wait(); io_wake_function() recovers it
 * from the embedded wait queue entry.
 */
struct io_wait_queue {
        /* wait queue entry queued on ctx->wait */
        struct wait_queue_entry wq;
        /* ring being waited on */
        struct io_ring_ctx *ctx;
        /* number of CQ events the waiter asked for */
        unsigned to_wait;
        /* value of ctx->cq_timeouts sampled before waiting started */
        unsigned nr_timeouts;
};
3955
3956 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
3957 {
3958         struct io_ring_ctx *ctx = iowq->ctx;
3959
3960         /*
3961          * Wake up if we have enough events, or if a timeout occurred since we
3962          * started waiting. For timeouts, we always want to return to userspace,
3963          * regardless of event count.
3964          */
3965         return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
3966                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
3967 }
3968
3969 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
3970                             int wake_flags, void *key)
3971 {
3972         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
3973                                                         wq);
3974
3975         /* use noflush == true, as we can't safely rely on locking context */
3976         if (!io_should_wake(iowq, true))
3977                 return -1;
3978
3979         return autoremove_wake_function(curr, mode, wake_flags, key);
3980 }
3981
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 *
 * @min_events: number of CQ events to wait for
 * @sig/@sigsz: optional user signal mask, installed for the duration of the
 *              wait
 *
 * Returns 0 if enough events were already available or if the CQ ring is
 * non-empty when the wait ends, the error from installing the signal mask,
 * or -EINTR if a signal interrupted the wait while the ring was empty.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                          const sigset_t __user *sig, size_t sigsz)
{
        struct io_wait_queue iowq = {
                .wq = {
                        .private        = current,
                        .func           = io_wake_function,
                        .entry          = LIST_HEAD_INIT(iowq.wq.entry),
                },
                .ctx            = ctx,
                .to_wait        = min_events,
        };
        struct io_rings *rings = ctx->rings;
        int ret = 0;

        /* fast path: nothing to wait for */
        if (io_cqring_events(ctx, false) >= min_events)
                return 0;

        if (sig) {
#ifdef CONFIG_COMPAT
                if (in_compat_syscall())
                        ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
                                                      sigsz);
                else
#endif
                        ret = set_user_sigmask(sig, sigsz);

                if (ret)
                        return ret;
        }

        /* baseline for io_should_wake() to detect timeouts firing meanwhile */
        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
        trace_io_uring_cqring_wait(ctx, min_events);
        do {
                prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
                                                TASK_INTERRUPTIBLE);
                if (io_should_wake(&iowq, false))
                        break;
                schedule();
                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
        } while (1);
        finish_wait(&ctx->wait, &iowq.wq);

        restore_saved_sigmask_unless(ret == -EINTR);

        /* only report an error if there is nothing on the CQ ring to reap */
        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
4036
4037 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
4038 {
4039 #if defined(CONFIG_UNIX)
4040         if (ctx->ring_sock) {
4041                 struct sock *sock = ctx->ring_sock->sk;
4042                 struct sk_buff *skb;
4043
4044                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
4045                         kfree_skb(skb);
4046         }
4047 #else
4048         int i;
4049
4050         for (i = 0; i < ctx->nr_user_files; i++) {
4051                 struct file *file;
4052
4053                 file = io_file_from_index(ctx, i);
4054                 if (file)
4055                         fput(file);
4056         }
4057 #endif
4058 }
4059
4060 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
4061 {
4062         unsigned nr_tables, i;
4063
4064         if (!ctx->file_table)
4065                 return -ENXIO;
4066
4067         __io_sqe_files_unregister(ctx);
4068         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
4069         for (i = 0; i < nr_tables; i++)
4070                 kfree(ctx->file_table[i].files);
4071         kfree(ctx->file_table);
4072         ctx->file_table = NULL;
4073         ctx->nr_user_files = 0;
4074         return 0;
4075 }
4076
4077 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
4078 {
4079         if (ctx->sqo_thread) {
4080                 wait_for_completion(&ctx->completions[1]);
4081                 /*
4082                  * The park is a bit of a work-around, without it we get
4083                  * warning spews on shutdown with SQPOLL set and affinity
4084                  * set to a single CPU.
4085                  */
4086                 kthread_park(ctx->sqo_thread);
4087                 kthread_stop(ctx->sqo_thread);
4088                 ctx->sqo_thread = NULL;
4089         }
4090 }
4091
4092 static void io_finish_async(struct io_ring_ctx *ctx)
4093 {
4094         io_sq_thread_stop(ctx);
4095
4096         if (ctx->io_wq) {
4097                 io_wq_destroy(ctx->io_wq);
4098                 ctx->io_wq = NULL;
4099         }
4100 }
4101
4102 #if defined(CONFIG_UNIX)
/*
 * skb destructor installed by __io_sqe_files_scm(). Flushes the ring's
 * io-wq, if it has one, before handing the skb to unix_destruct_scm().
 */
static void io_destruct_skb(struct sk_buff *skb)
{
        /* NOTE(review): relies on sk_user_data of the ring socket being the ctx */
        struct io_ring_ctx *ctx = skb->sk->sk_user_data;

        if (ctx->io_wq)
                io_wq_flush(ctx->io_wq);

        unix_destruct_scm(skb);
}
4112
4113 /*
4114  * Ensure the UNIX gc is aware of our file set, so we are certain that
4115  * the io_uring can be safely unregistered on process exit, even if we have
4116  * loops in the file referencing.
4117  */
4118 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
4119 {
4120         struct sock *sk = ctx->ring_sock->sk;
4121         struct scm_fp_list *fpl;
4122         struct sk_buff *skb;
4123         int i, nr_files;
4124
4125         if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
4126                 unsigned long inflight = ctx->user->unix_inflight + nr;
4127
4128                 if (inflight > task_rlimit(current, RLIMIT_NOFILE))
4129                         return -EMFILE;
4130         }
4131
4132         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
4133         if (!fpl)
4134                 return -ENOMEM;
4135
4136         skb = alloc_skb(0, GFP_KERNEL);
4137         if (!skb) {
4138                 kfree(fpl);
4139                 return -ENOMEM;
4140         }
4141
4142         skb->sk = sk;
4143
4144         nr_files = 0;
4145         fpl->user = get_uid(ctx->user);
4146         for (i = 0; i < nr; i++) {
4147                 struct file *file = io_file_from_index(ctx, i + offset);
4148
4149                 if (!file)
4150                         continue;
4151                 fpl->fp[nr_files] = get_file(file);
4152                 unix_inflight(fpl->user, fpl->fp[nr_files]);
4153                 nr_files++;
4154         }
4155
4156         if (nr_files) {
4157                 fpl->max = SCM_MAX_FD;
4158                 fpl->count = nr_files;
4159                 UNIXCB(skb).fp = fpl;
4160                 skb->destructor = io_destruct_skb;
4161                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
4162                 skb_queue_head(&sk->sk_receive_queue, skb);
4163
4164                 for (i = 0; i < nr_files; i++)
4165                         fput(fpl->fp[i]);
4166         } else {
4167                 kfree_skb(skb);
4168                 kfree(fpl);
4169         }
4170
4171         return 0;
4172 }
4173
4174 /*
4175  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
4176  * causes regular reference counting to break down. We rely on the UNIX
4177  * garbage collection to take care of this problem for us.
4178  */
4179 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
4180 {
4181         unsigned left, total;
4182         int ret = 0;
4183
4184         total = 0;
4185         left = ctx->nr_user_files;
4186         while (left) {
4187                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
4188
4189                 ret = __io_sqe_files_scm(ctx, this_files, total);
4190                 if (ret)
4191                         break;
4192                 left -= this_files;
4193                 total += this_files;
4194         }
4195
4196         if (!ret)
4197                 return 0;
4198
4199         while (total < ctx->nr_user_files) {
4200                 struct file *file = io_file_from_index(ctx, total);
4201
4202                 if (file)
4203                         fput(file);
4204                 total++;
4205         }
4206
4207         return ret;
4208 }
4209 #else
/* Without CONFIG_UNIX there is nothing to set up; always returns 0. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
        return 0;
}
4214 #endif
4215
4216 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
4217                                     unsigned nr_files)
4218 {
4219         int i;
4220
4221         for (i = 0; i < nr_tables; i++) {
4222                 struct fixed_file_table *table = &ctx->file_table[i];
4223                 unsigned this_files;
4224
4225                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
4226                 table->files = kcalloc(this_files, sizeof(struct file *),
4227                                         GFP_KERNEL);
4228                 if (!table->files)
4229                         break;
4230                 nr_files -= this_files;
4231         }
4232
4233         if (i == nr_tables)
4234                 return 0;
4235
4236         for (i = 0; i < nr_tables; i++) {
4237                 struct fixed_file_table *table = &ctx->file_table[i];
4238                 kfree(table->files);
4239         }
4240         return 1;
4241 }
4242
4243 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
4244                                  unsigned nr_args)
4245 {
4246         __s32 __user *fds = (__s32 __user *) arg;
4247         unsigned nr_tables;
4248         int fd, ret = 0;
4249         unsigned i;
4250
4251         if (ctx->file_table)
4252                 return -EBUSY;
4253         if (!nr_args)
4254                 return -EINVAL;
4255         if (nr_args > IORING_MAX_FIXED_FILES)
4256                 return -EMFILE;
4257
4258         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
4259         ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
4260                                         GFP_KERNEL);
4261         if (!ctx->file_table)
4262                 return -ENOMEM;
4263
4264         if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
4265                 kfree(ctx->file_table);
4266                 ctx->file_table = NULL;
4267                 return -ENOMEM;
4268         }
4269
4270         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
4271                 struct fixed_file_table *table;
4272                 unsigned index;
4273
4274                 ret = -EFAULT;
4275                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
4276                         break;
4277                 /* allow sparse sets */
4278                 if (fd == -1) {
4279                         ret = 0;
4280                         continue;
4281                 }
4282
4283                 table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
4284                 index = i & IORING_FILE_TABLE_MASK;
4285                 table->files[index] = fget(fd);
4286
4287                 ret = -EBADF;
4288                 if (!table->files[index])
4289                         break;
4290                 /*
4291                  * Don't allow io_uring instances to be registered. If UNIX
4292                  * isn't enabled, then this causes a reference cycle and this
4293                  * instance can never get freed. If UNIX is enabled we'll
4294                  * handle it just fine, but there's still no point in allowing
4295                  * a ring fd as it doesn't support regular read/write anyway.
4296                  */
4297                 if (table->files[index]->f_op == &io_uring_fops) {
4298                         fput(table->files[index]);
4299                         break;
4300                 }
4301                 ret = 0;
4302         }
4303
4304         if (ret) {
4305                 for (i = 0; i < ctx->nr_user_files; i++) {
4306                         struct file *file;
4307
4308                         file = io_file_from_index(ctx, i);
4309                         if (file)
4310                                 fput(file);
4311                 }
4312                 for (i = 0; i < nr_tables; i++)
4313                         kfree(ctx->file_table[i].files);
4314
4315                 kfree(ctx->file_table);
4316                 ctx->file_table = NULL;
4317                 ctx->nr_user_files = 0;
4318                 return ret;
4319         }
4320
4321         ret = io_sqe_files_scm(ctx);
4322         if (ret)
4323                 io_sqe_files_unregister(ctx);
4324
4325         return ret;
4326 }
4327
/*
 * Drop the registered file at @index.
 *
 * With CONFIG_UNIX the file is looked up in the SCM_RIGHTS file lists of the
 * skbs queued on the ring socket, removed from the list that holds it, and
 * put. Without CONFIG_UNIX the file is simply put.
 */
static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
{
#if defined(CONFIG_UNIX)
	struct file *file = io_file_from_index(ctx, index);
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head pending, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int pos;

	__skb_queue_head_init(&pending);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array. Skbs that were
	 * inspected are parked on the local list in the meantime.
	 */
	for (skb = skb_dequeue(head); skb; skb = skb_dequeue(head)) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		for (pos = 0; pos < fpl->count; pos++) {
			if (fpl->fp[pos] == file)
				break;
		}

		if (pos < fpl->count) {
			int tail;

			unix_notinflight(fpl->user, fpl->fp[pos]);
			/* close the gap left by the removed entry */
			tail = fpl->count - 1 - pos;
			if (tail) {
				memmove(&fpl->fp[pos], &fpl->fp[pos + 1],
						tail * sizeof(struct file *));
			}
			fpl->count--;
			if (fpl->count) {
				__skb_queue_tail(&pending, skb);
			} else {
				/* that was the last file in this skb */
				kfree_skb(skb);
				skb = NULL;
			}
			fput(file);
			file = NULL;
		}

		/* file doubles as the "still searching" marker */
		if (!file)
			break;

		__skb_queue_tail(&pending, skb);
	}

	/* Move everything parked locally back onto the socket queue */
	if (skb_peek(&pending)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&pending)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(io_file_from_index(ctx, index));
#endif
}
4390
/*
 * Account a newly installed fixed @file (stored at @index) on the ring socket.
 *
 * With CONFIG_UNIX, the file is appended to the SCM_RIGHTS file list of the
 * first queued skb if that list has fewer than SCM_MAX_FD entries; otherwise
 * the work is delegated to __io_sqe_files_scm() for a single file. Without
 * CONFIG_UNIX this is a no-op.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	bool merged = false;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		/* the skb is off the queue while its file list is modified */
		__skb_unlink(skb, head);
		spin_unlock_irq(&head->lock);
		fpl->fp[fpl->count] = get_file(file);
		unix_inflight(fpl->user, fpl->fp[fpl->count]);
		fpl->count++;
		spin_lock_irq(&head->lock);
		__skb_queue_head(head, skb);
		merged = true;
	}
	spin_unlock_irq(&head->lock);

	if (!merged)
		return __io_sqe_files_scm(ctx, 1, index);

	/* the skb took its own reference via get_file() above */
	fput(file);
	return 0;
#else
	return 0;
#endif
}
4433
4434 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
4435                                unsigned nr_args)
4436 {
4437         struct io_uring_files_update up;
4438         __s32 __user *fds;
4439         int fd, i, err;
4440         __u32 done;
4441
4442         if (!ctx->file_table)
4443                 return -ENXIO;
4444         if (!nr_args)
4445                 return -EINVAL;
4446         if (copy_from_user(&up, arg, sizeof(up)))
4447                 return -EFAULT;
4448         if (check_add_overflow(up.offset, nr_args, &done))
4449                 return -EOVERFLOW;
4450         if (done > ctx->nr_user_files)
4451                 return -EINVAL;
4452
4453         done = 0;
4454         fds = (__s32 __user *) up.fds;
4455         while (nr_args) {
4456                 struct fixed_file_table *table;
4457                 unsigned index;
4458
4459                 err = 0;
4460                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
4461                         err = -EFAULT;
4462                         break;
4463                 }
4464                 i = array_index_nospec(up.offset, ctx->nr_user_files);
4465                 table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
4466                 index = i & IORING_FILE_TABLE_MASK;
4467                 if (table->files[index]) {
4468                         io_sqe_file_unregister(ctx, i);
4469                         table->files[index] = NULL;
4470                 }
4471                 if (fd != -1) {
4472                         struct file *file;
4473
4474                         file = fget(fd);
4475                         if (!file) {
4476                                 err = -EBADF;
4477                                 break;
4478                         }
4479                         /*
4480                          * Don't allow io_uring instances to be registered. If
4481                          * UNIX isn't enabled, then this causes a reference
4482                          * cycle and this instance can never get freed. If UNIX
4483                          * is enabled we'll handle it just fine, but there's
4484                          * still no point in allowing a ring fd as it doesn't
4485                          * support regular read/write anyway.
4486                          */
4487                         if (file->f_op == &io_uring_fops) {
4488                                 fput(file);
4489                                 err = -EBADF;
4490                                 break;
4491                         }
4492                         table->files[index] = file;
4493                         err = io_sqe_file_register(ctx, file, i);
4494                         if (err)
4495                                 break;
4496                 }
4497                 nr_args--;
4498                 done++;
4499                 up.offset++;
4500         }
4501
4502         return done ? done : err;
4503 }
4504
4505 static void io_put_work(struct io_wq_work *work)
4506 {
4507         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4508
4509         io_put_req(req);
4510 }
4511
4512 static void io_get_work(struct io_wq_work *work)
4513 {
4514         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4515
4516         refcount_inc(&req->refs);
4517 }
4518
4519 static int io_sq_offload_start(struct io_ring_ctx *ctx,
4520                                struct io_uring_params *p)
4521 {
4522         struct io_wq_data data;
4523         unsigned concurrency;
4524         int ret;
4525
4526         init_waitqueue_head(&ctx->sqo_wait);
4527         mmgrab(current->mm);
4528         ctx->sqo_mm = current->mm;
4529
4530         if (ctx->flags & IORING_SETUP_SQPOLL) {
4531                 ret = -EPERM;
4532                 if (!capable(CAP_SYS_ADMIN))
4533                         goto err;
4534
4535                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
4536                 if (!ctx->sq_thread_idle)
4537                         ctx->sq_thread_idle = HZ;
4538
4539                 if (p->flags & IORING_SETUP_SQ_AFF) {
4540                         int cpu = p->sq_thread_cpu;
4541
4542                         ret = -EINVAL;
4543                         if (cpu >= nr_cpu_ids)
4544                                 goto err;
4545                         if (!cpu_online(cpu))
4546                                 goto err;
4547
4548                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
4549                                                         ctx, cpu,
4550                                                         "io_uring-sq");
4551                 } else {
4552                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
4553                                                         "io_uring-sq");
4554                 }
4555                 if (IS_ERR(ctx->sqo_thread)) {
4556                         ret = PTR_ERR(ctx->sqo_thread);
4557                         ctx->sqo_thread = NULL;
4558                         goto err;
4559                 }
4560                 wake_up_process(ctx->sqo_thread);
4561         } else if (p->flags & IORING_SETUP_SQ_AFF) {
4562                 /* Can't have SQ_AFF without SQPOLL */
4563                 ret = -EINVAL;
4564                 goto err;
4565         }
4566
4567         data.mm = ctx->sqo_mm;
4568         data.user = ctx->user;
4569         data.creds = ctx->creds;
4570         data.get_work = io_get_work;
4571         data.put_work = io_put_work;
4572
4573         /* Do QD, or 4 * CPUS, whatever is smallest */
4574         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
4575         ctx->io_wq = io_wq_create(concurrency, &data);
4576         if (IS_ERR(ctx->io_wq)) {
4577                 ret = PTR_ERR(ctx->io_wq);
4578                 ctx->io_wq = NULL;
4579                 goto err;
4580         }
4581
4582         return 0;
4583 err:
4584         io_finish_async(ctx);
4585         mmdrop(ctx->sqo_mm);
4586         ctx->sqo_mm = NULL;
4587         return ret;
4588 }
4589
4590 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
4591 {
4592         atomic_long_sub(nr_pages, &user->locked_vm);
4593 }
4594
4595 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
4596 {
4597         unsigned long page_limit, cur_pages, new_pages;
4598
4599         /* Don't allow more pages than we can safely lock */
4600         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
4601
4602         do {
4603                 cur_pages = atomic_long_read(&user->locked_vm);
4604                 new_pages = cur_pages + nr_pages;
4605                 if (new_pages > page_limit)
4606                         return -ENOMEM;
4607         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
4608                                         new_pages) != cur_pages);
4609
4610         return 0;
4611 }
4612
/*
 * Release memory obtained from io_mem_alloc(). A NULL @ptr is ignored.
 * The head page's reference is dropped and the compound page is freed once
 * that was the last reference.
 */
static void io_mem_free(void *ptr)
{
	if (ptr) {
		struct page *head = virt_to_head_page(ptr);

		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
4624
4625 static void *io_mem_alloc(size_t size)
4626 {
4627         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
4628                                 __GFP_NORETRY;
4629
4630         return (void *) __get_free_pages(gfp_flags, get_order(size));
4631 }
4632
4633 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
4634                                 size_t *sq_offset)
4635 {
4636         struct io_rings *rings;
4637         size_t off, sq_array_size;
4638
4639         off = struct_size(rings, cqes, cq_entries);
4640         if (off == SIZE_MAX)
4641                 return SIZE_MAX;
4642
4643 #ifdef CONFIG_SMP
4644         off = ALIGN(off, SMP_CACHE_BYTES);
4645         if (off == 0)
4646                 return SIZE_MAX;
4647 #endif
4648
4649         sq_array_size = array_size(sizeof(u32), sq_entries);
4650         if (sq_array_size == SIZE_MAX)
4651                 return SIZE_MAX;
4652
4653         if (check_add_overflow(off, sq_array_size, &off))
4654                 return SIZE_MAX;
4655
4656         if (sq_offset)
4657                 *sq_offset = off;
4658
4659         return off;
4660 }
4661
4662 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
4663 {
4664         size_t pages;
4665
4666         pages = (size_t)1 << get_order(
4667                 rings_size(sq_entries, cq_entries, NULL));
4668         pages += (size_t)1 << get_order(
4669                 array_size(sizeof(struct io_uring_sqe), sq_entries));
4670
4671         return pages;
4672 }
4673
4674 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
4675 {
4676         int i, j;
4677
4678         if (!ctx->user_bufs)
4679                 return -ENXIO;
4680
4681         for (i = 0; i < ctx->nr_user_bufs; i++) {
4682                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
4683
4684                 for (j = 0; j < imu->nr_bvecs; j++)
4685                         put_user_page(imu->bvec[j].bv_page);
4686
4687                 if (ctx->account_mem)
4688                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
4689                 kvfree(imu->bvec);
4690                 imu->nr_bvecs = 0;
4691         }
4692
4693         kfree(ctx->user_bufs);
4694         ctx->user_bufs = NULL;
4695         ctx->nr_user_bufs = 0;
4696         return 0;
4697 }
4698
4699 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
4700                        void __user *arg, unsigned index)
4701 {
4702         struct iovec __user *src;
4703
4704 #ifdef CONFIG_COMPAT
4705         if (ctx->compat) {
4706                 struct compat_iovec __user *ciovs;
4707                 struct compat_iovec ciov;
4708
4709                 ciovs = (struct compat_iovec __user *) arg;
4710                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
4711                         return -EFAULT;
4712
4713                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
4714                 dst->iov_len = ciov.iov_len;
4715                 return 0;
4716         }
4717 #endif
4718         src = (struct iovec __user *) arg;
4719         if (copy_from_user(dst, &src[index], sizeof(*dst)))
4720                 return -EFAULT;
4721         return 0;
4722 }
4723
4724 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
4725                                   unsigned nr_args)
4726 {
4727         struct vm_area_struct **vmas = NULL;
4728         struct page **pages = NULL;
4729         int i, j, got_pages = 0;
4730         int ret = -EINVAL;
4731
4732         if (ctx->user_bufs)
4733                 return -EBUSY;
4734         if (!nr_args || nr_args > UIO_MAXIOV)
4735                 return -EINVAL;
4736
4737         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
4738                                         GFP_KERNEL);
4739         if (!ctx->user_bufs)
4740                 return -ENOMEM;
4741
4742         for (i = 0; i < nr_args; i++) {
4743                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
4744                 unsigned long off, start, end, ubuf;
4745                 int pret, nr_pages;
4746                 struct iovec iov;
4747                 size_t size;
4748
4749                 ret = io_copy_iov(ctx, &iov, arg, i);
4750                 if (ret)
4751                         goto err;
4752
4753                 /*
4754                  * Don't impose further limits on the size and buffer
4755                  * constraints here, we'll -EINVAL later when IO is
4756                  * submitted if they are wrong.
4757                  */
4758                 ret = -EFAULT;
4759                 if (!iov.iov_base || !iov.iov_len)
4760                         goto err;
4761
4762                 /* arbitrary limit, but we need something */
4763                 if (iov.iov_len > SZ_1G)
4764                         goto err;
4765
4766                 ubuf = (unsigned long) iov.iov_base;
4767                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
4768                 start = ubuf >> PAGE_SHIFT;
4769                 nr_pages = end - start;
4770
4771                 if (ctx->account_mem) {
4772                         ret = io_account_mem(ctx->user, nr_pages);
4773                         if (ret)
4774                                 goto err;
4775                 }
4776
4777                 ret = 0;
4778                 if (!pages || nr_pages > got_pages) {
4779                         kfree(vmas);
4780                         kfree(pages);
4781                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
4782                                                 GFP_KERNEL);
4783                         vmas = kvmalloc_array(nr_pages,
4784                                         sizeof(struct vm_area_struct *),
4785                                         GFP_KERNEL);
4786                         if (!pages || !vmas) {
4787                                 ret = -ENOMEM;
4788                                 if (ctx->account_mem)
4789                                         io_unaccount_mem(ctx->user, nr_pages);
4790                                 goto err;
4791                         }
4792                         got_pages = nr_pages;
4793                 }
4794
4795                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
4796                                                 GFP_KERNEL);
4797                 ret = -ENOMEM;
4798                 if (!imu->bvec) {
4799                         if (ctx->account_mem)
4800                                 io_unaccount_mem(ctx->user, nr_pages);
4801                         goto err;
4802                 }
4803
4804                 ret = 0;
4805                 down_read(&current->mm->mmap_sem);
4806                 pret = get_user_pages(ubuf, nr_pages,
4807                                       FOLL_WRITE | FOLL_LONGTERM,
4808                                       pages, vmas);
4809                 if (pret == nr_pages) {
4810                         /* don't support file backed memory */
4811                         for (j = 0; j < nr_pages; j++) {
4812                                 struct vm_area_struct *vma = vmas[j];
4813
4814                                 if (vma->vm_file &&
4815                                     !is_file_hugepages(vma->vm_file)) {
4816                                         ret = -EOPNOTSUPP;
4817                                         break;
4818                                 }
4819                         }
4820                 } else {
4821                         ret = pret < 0 ? pret : -EFAULT;
4822                 }
4823                 up_read(&current->mm->mmap_sem);
4824                 if (ret) {
4825                         /*
4826                          * if we did partial map, or found file backed vmas,
4827                          * release any pages we did get
4828                          */
4829                         if (pret > 0)
4830                                 put_user_pages(pages, pret);
4831                         if (ctx->account_mem)
4832                                 io_unaccount_mem(ctx->user, nr_pages);
4833                         kvfree(imu->bvec);
4834                         goto err;
4835                 }
4836
4837                 off = ubuf & ~PAGE_MASK;
4838                 size = iov.iov_len;
4839                 for (j = 0; j < nr_pages; j++) {
4840                         size_t vec_len;
4841
4842                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
4843                         imu->bvec[j].bv_page = pages[j];
4844                         imu->bvec[j].bv_len = vec_len;
4845                         imu->bvec[j].bv_offset = off;
4846                         off = 0;
4847                         size -= vec_len;
4848                 }
4849                 /* store original address for later verification */
4850                 imu->ubuf = ubuf;
4851                 imu->len = iov.iov_len;
4852                 imu->nr_bvecs = nr_pages;
4853
4854                 ctx->nr_user_bufs++;
4855         }
4856         kvfree(pages);
4857         kvfree(vmas);
4858         return 0;
4859 err:
4860         kvfree(pages);
4861         kvfree(vmas);
4862         io_sqe_buffer_unregister(ctx);
4863         return ret;
4864 }
4865
4866 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
4867 {
4868         __s32 __user *fds = arg;
4869         int fd;
4870
4871         if (ctx->cq_ev_fd)
4872                 return -EBUSY;
4873
4874         if (copy_from_user(&fd, fds, sizeof(*fds)))
4875                 return -EFAULT;
4876
4877         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
4878         if (IS_ERR(ctx->cq_ev_fd)) {
4879                 int ret = PTR_ERR(ctx->cq_ev_fd);
4880                 ctx->cq_ev_fd = NULL;
4881                 return ret;
4882         }
4883
4884         return 0;
4885 }
4886
4887 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
4888 {
4889         if (ctx->cq_ev_fd) {
4890                 eventfd_ctx_put(ctx->cq_ev_fd);
4891                 ctx->cq_ev_fd = NULL;
4892                 return 0;
4893         }
4894
4895         return -ENXIO;
4896 }
4897
4898 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
4899 {
4900         io_finish_async(ctx);
4901         if (ctx->sqo_mm)
4902                 mmdrop(ctx->sqo_mm);
4903
4904         io_iopoll_reap_events(ctx);
4905         io_sqe_buffer_unregister(ctx);
4906         io_sqe_files_unregister(ctx);
4907         io_eventfd_unregister(ctx);
4908
4909 #if defined(CONFIG_UNIX)
4910         if (ctx->ring_sock) {
4911                 ctx->ring_sock->file = NULL; /* so that iput() is called */
4912                 sock_release(ctx->ring_sock);
4913         }
4914 #endif
4915
4916         io_mem_free(ctx->rings);
4917         io_mem_free(ctx->sq_sqes);
4918
4919         percpu_ref_exit(&ctx->refs);
4920         if (ctx->account_mem)
4921                 io_unaccount_mem(ctx->user,
4922                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
4923         free_uid(ctx->user);
4924         put_cred(ctx->creds);
4925         kfree(ctx->completions);
4926         kfree(ctx->cancel_hash);
4927         kmem_cache_free(req_cachep, ctx->fallback_req);
4928         kfree(ctx);
4929 }
4930
4931 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
4932 {
4933         struct io_ring_ctx *ctx = file->private_data;
4934         __poll_t mask = 0;
4935
4936         poll_wait(file, &ctx->cq_wait, wait);
4937         /*
4938          * synchronizes with barrier from wq_has_sleeper call in
4939          * io_commit_cqring
4940          */
4941         smp_rmb();
4942         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
4943             ctx->rings->sq_ring_entries)
4944                 mask |= EPOLLOUT | EPOLLWRNORM;
4945         if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail)
4946                 mask |= EPOLLIN | EPOLLRDNORM;
4947
4948         return mask;
4949 }
4950
4951 static int io_uring_fasync(int fd, struct file *file, int on)
4952 {
4953         struct io_ring_ctx *ctx = file->private_data;
4954
4955         return fasync_helper(fd, file, on, &ctx->cq_fasync);
4956 }
4957
4958 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
4959 {
4960         mutex_lock(&ctx->uring_lock);
4961         percpu_ref_kill(&ctx->refs);
4962         mutex_unlock(&ctx->uring_lock);
4963
4964         io_kill_timeouts(ctx);
4965         io_poll_remove_all(ctx);
4966
4967         if (ctx->io_wq)
4968                 io_wq_cancel_all(ctx->io_wq);
4969
4970         io_iopoll_reap_events(ctx);
4971         /* if we failed setting up the ctx, we might not have any rings */
4972         if (ctx->rings)
4973                 io_cqring_overflow_flush(ctx, true);
4974         wait_for_completion(&ctx->completions[0]);
4975         io_ring_ctx_free(ctx);
4976 }
4977
4978 static int io_uring_release(struct inode *inode, struct file *file)
4979 {
4980         struct io_ring_ctx *ctx = file->private_data;
4981
4982         file->private_data = NULL;
4983         io_ring_ctx_wait_and_kill(ctx);
4984         return 0;
4985 }
4986
4987 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
4988                                   struct files_struct *files)
4989 {
4990         struct io_kiocb *req;
4991         DEFINE_WAIT(wait);
4992
4993         while (!list_empty_careful(&ctx->inflight_list)) {
4994                 struct io_kiocb *cancel_req = NULL;
4995
4996                 spin_lock_irq(&ctx->inflight_lock);
4997                 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
4998                         if (req->work.files != files)
4999                                 continue;
5000                         /* req is being completed, ignore */
5001                         if (!refcount_inc_not_zero(&req->refs))
5002                                 continue;
5003                         cancel_req = req;
5004                         break;
5005                 }
5006                 if (cancel_req)
5007                         prepare_to_wait(&ctx->inflight_wait, &wait,
5008                                                 TASK_UNINTERRUPTIBLE);
5009                 spin_unlock_irq(&ctx->inflight_lock);
5010
5011                 /* We need to keep going until we don't find a matching req */
5012                 if (!cancel_req)
5013                         break;
5014
5015                 io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
5016                 io_put_req(cancel_req);
5017                 schedule();
5018         }
5019         finish_wait(&ctx->inflight_wait, &wait);
5020 }
5021
5022 static int io_uring_flush(struct file *file, void *data)
5023 {
5024         struct io_ring_ctx *ctx = file->private_data;
5025
5026         io_uring_cancel_files(ctx, data);
5027         if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
5028                 io_cqring_overflow_flush(ctx, true);
5029                 io_wq_cancel_all(ctx->io_wq);
5030         }
5031         return 0;
5032 }
5033
5034 static void *io_uring_validate_mmap_request(struct file *file,
5035                                             loff_t pgoff, size_t sz)
5036 {
5037         struct io_ring_ctx *ctx = file->private_data;
5038         loff_t offset = pgoff << PAGE_SHIFT;
5039         struct page *page;
5040         void *ptr;
5041
5042         switch (offset) {
5043         case IORING_OFF_SQ_RING:
5044         case IORING_OFF_CQ_RING:
5045                 ptr = ctx->rings;
5046                 break;
5047         case IORING_OFF_SQES:
5048                 ptr = ctx->sq_sqes;
5049                 break;
5050         default:
5051                 return ERR_PTR(-EINVAL);
5052         }
5053
5054         page = virt_to_head_page(ptr);
5055         if (sz > page_size(page))
5056                 return ERR_PTR(-EINVAL);
5057
5058         return ptr;
5059 }
5060
5061 #ifdef CONFIG_MMU
5062
5063 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
5064 {
5065         size_t sz = vma->vm_end - vma->vm_start;
5066         unsigned long pfn;
5067         void *ptr;
5068
5069         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
5070         if (IS_ERR(ptr))
5071                 return PTR_ERR(ptr);
5072
5073         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
5074         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
5075 }
5076
5077 #else /* !CONFIG_MMU */
5078
5079 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
5080 {
5081         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
5082 }
5083
5084 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
5085 {
5086         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
5087 }
5088
/*
 * !CONFIG_MMU: return the kernel address backing the request described by
 * @pgoff and @len, or the negative error from
 * io_uring_validate_mmap_request() converted to unsigned long.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *area = io_uring_validate_mmap_request(file, pgoff, len);

	if (IS_ERR(area))
		return PTR_ERR(area);

	return (unsigned long) area;
}
5101
5102 #endif /* !CONFIG_MMU */
5103
5104 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
5105                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
5106                 size_t, sigsz)
5107 {
5108         struct io_ring_ctx *ctx;
5109         long ret = -EBADF;
5110         int submitted = 0;
5111         struct fd f;
5112
5113         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
5114                 return -EINVAL;
5115
5116         f = fdget(fd);
5117         if (!f.file)
5118                 return -EBADF;
5119
5120         ret = -EOPNOTSUPP;
5121         if (f.file->f_op != &io_uring_fops)
5122                 goto out_fput;
5123
5124         ret = -ENXIO;
5125         ctx = f.file->private_data;
5126         if (!percpu_ref_tryget(&ctx->refs))
5127                 goto out_fput;
5128
5129         /*
5130          * For SQ polling, the thread will do all submissions and completions.
5131          * Just return the requested submit count, and wake the thread if
5132          * we were asked to.
5133          */
5134         ret = 0;
5135         if (ctx->flags & IORING_SETUP_SQPOLL) {
5136                 if (!list_empty_careful(&ctx->cq_overflow_list))
5137                         io_cqring_overflow_flush(ctx, false);
5138                 if (flags & IORING_ENTER_SQ_WAKEUP)
5139                         wake_up(&ctx->sqo_wait);
5140                 submitted = to_submit;
5141         } else if (to_submit) {
5142                 struct mm_struct *cur_mm;
5143
5144                 to_submit = min(to_submit, ctx->sq_entries);
5145                 mutex_lock(&ctx->uring_lock);
5146                 /* already have mm, so io_submit_sqes() won't try to grab it */
5147                 cur_mm = ctx->sqo_mm;
5148                 submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
5149                                            &cur_mm, false);
5150                 mutex_unlock(&ctx->uring_lock);
5151
5152                 if (submitted != to_submit)
5153                         goto out;
5154         }
5155         if (flags & IORING_ENTER_GETEVENTS) {
5156                 unsigned nr_events = 0;
5157
5158                 min_complete = min(min_complete, ctx->cq_entries);
5159
5160                 if (ctx->flags & IORING_SETUP_IOPOLL) {
5161                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
5162                 } else {
5163                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
5164                 }
5165         }
5166
5167 out:
5168         percpu_ref_put(&ctx->refs);
5169 out_fput:
5170         fdput(f);
5171         return submitted ? submitted : ret;
5172 }
5173
5174 static const struct file_operations io_uring_fops = {
5175         .release        = io_uring_release,
5176         .flush          = io_uring_flush,
5177         .mmap           = io_uring_mmap,
5178 #ifndef CONFIG_MMU
5179         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
5180         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
5181 #endif
5182         .poll           = io_uring_poll,
5183         .fasync         = io_uring_fasync,
5184 };
5185
5186 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
5187                                   struct io_uring_params *p)
5188 {
5189         struct io_rings *rings;
5190         size_t size, sq_array_offset;
5191
5192         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
5193         if (size == SIZE_MAX)
5194                 return -EOVERFLOW;
5195
5196         rings = io_mem_alloc(size);
5197         if (!rings)
5198                 return -ENOMEM;
5199
5200         ctx->rings = rings;
5201         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
5202         rings->sq_ring_mask = p->sq_entries - 1;
5203         rings->cq_ring_mask = p->cq_entries - 1;
5204         rings->sq_ring_entries = p->sq_entries;
5205         rings->cq_ring_entries = p->cq_entries;
5206         ctx->sq_mask = rings->sq_ring_mask;
5207         ctx->cq_mask = rings->cq_ring_mask;
5208         ctx->sq_entries = rings->sq_ring_entries;
5209         ctx->cq_entries = rings->cq_ring_entries;
5210
5211         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
5212         if (size == SIZE_MAX) {
5213                 io_mem_free(ctx->rings);
5214                 ctx->rings = NULL;
5215                 return -EOVERFLOW;
5216         }
5217
5218         ctx->sq_sqes = io_mem_alloc(size);
5219         if (!ctx->sq_sqes) {
5220                 io_mem_free(ctx->rings);
5221                 ctx->rings = NULL;
5222                 return -ENOMEM;
5223         }
5224
5225         return 0;
5226 }
5227
5228 /*
5229  * Allocate an anonymous fd, this is what constitutes the application
5230  * visible backing of an io_uring instance. The application mmaps this
5231  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
5232  * we have to tie this fd to a socket for file garbage collection purposes.
5233  */
5234 static int io_uring_get_fd(struct io_ring_ctx *ctx)
5235 {
5236         struct file *file;
5237         int ret;
5238
5239 #if defined(CONFIG_UNIX)
5240         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
5241                                 &ctx->ring_sock);
5242         if (ret)
5243                 return ret;
5244 #endif
5245
5246         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
5247         if (ret < 0)
5248                 goto err;
5249
5250         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
5251                                         O_RDWR | O_CLOEXEC);
5252         if (IS_ERR(file)) {
5253                 put_unused_fd(ret);
5254                 ret = PTR_ERR(file);
5255                 goto err;
5256         }
5257
5258 #if defined(CONFIG_UNIX)
5259         ctx->ring_sock->file = file;
5260         ctx->ring_sock->sk->sk_user_data = ctx;
5261 #endif
5262         fd_install(ret, file);
5263         return ret;
5264 err:
5265 #if defined(CONFIG_UNIX)
5266         sock_release(ctx->ring_sock);
5267         ctx->ring_sock = NULL;
5268 #endif
5269         return ret;
5270 }
5271
5272 static int io_uring_create(unsigned entries, struct io_uring_params *p)
5273 {
5274         struct user_struct *user = NULL;
5275         struct io_ring_ctx *ctx;
5276         bool account_mem;
5277         int ret;
5278
5279         if (!entries || entries > IORING_MAX_ENTRIES)
5280                 return -EINVAL;
5281
5282         /*
5283          * Use twice as many entries for the CQ ring. It's possible for the
5284          * application to drive a higher depth than the size of the SQ ring,
5285          * since the sqes are only used at submission time. This allows for
5286          * some flexibility in overcommitting a bit. If the application has
5287          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
5288          * of CQ ring entries manually.
5289          */
5290         p->sq_entries = roundup_pow_of_two(entries);
5291         if (p->flags & IORING_SETUP_CQSIZE) {
5292                 /*
5293                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
5294                  * to a power-of-two, if it isn't already. We do NOT impose
5295                  * any cq vs sq ring sizing.
5296                  */
5297                 if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
5298                         return -EINVAL;
5299                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
5300         } else {
5301                 p->cq_entries = 2 * p->sq_entries;
5302         }
5303
5304         user = get_uid(current_user());
5305         account_mem = !capable(CAP_IPC_LOCK);
5306
5307         if (account_mem) {
5308                 ret = io_account_mem(user,
5309                                 ring_pages(p->sq_entries, p->cq_entries));
5310                 if (ret) {
5311                         free_uid(user);
5312                         return ret;
5313                 }
5314         }
5315
5316         ctx = io_ring_ctx_alloc(p);
5317         if (!ctx) {
5318                 if (account_mem)
5319                         io_unaccount_mem(user, ring_pages(p->sq_entries,
5320                                                                 p->cq_entries));
5321                 free_uid(user);
5322                 return -ENOMEM;
5323         }
5324         ctx->compat = in_compat_syscall();
5325         ctx->account_mem = account_mem;
5326         ctx->user = user;
5327         ctx->creds = get_current_cred();
5328
5329         ret = io_allocate_scq_urings(ctx, p);
5330         if (ret)
5331                 goto err;
5332
5333         ret = io_sq_offload_start(ctx, p);
5334         if (ret)
5335                 goto err;
5336
5337         memset(&p->sq_off, 0, sizeof(p->sq_off));
5338         p->sq_off.head = offsetof(struct io_rings, sq.head);
5339         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
5340         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
5341         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
5342         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
5343         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
5344         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
5345
5346         memset(&p->cq_off, 0, sizeof(p->cq_off));
5347         p->cq_off.head = offsetof(struct io_rings, cq.head);
5348         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
5349         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
5350         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
5351         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
5352         p->cq_off.cqes = offsetof(struct io_rings, cqes);
5353
5354         /*
5355          * Install ring fd as the very last thing, so we don't risk someone
5356          * having closed it before we finish setup
5357          */
5358         ret = io_uring_get_fd(ctx);
5359         if (ret < 0)
5360                 goto err;
5361
5362         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
5363                         IORING_FEAT_SUBMIT_STABLE;
5364         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
5365         return ret;
5366 err:
5367         io_ring_ctx_wait_and_kill(ctx);
5368         return ret;
5369 }
5370
5371 /*
5372  * Sets up an aio uring context, and returns the fd. Applications asks for a
5373  * ring size, we return the actual sq/cq ring sizes (among other things) in the
5374  * params structure passed in.
5375  */
5376 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
5377 {
5378         struct io_uring_params p;
5379         long ret;
5380         int i;
5381
5382         if (copy_from_user(&p, params, sizeof(p)))
5383                 return -EFAULT;
5384         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
5385                 if (p.resv[i])
5386                         return -EINVAL;
5387         }
5388
5389         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
5390                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
5391                 return -EINVAL;
5392
5393         ret = io_uring_create(entries, &p);
5394         if (ret < 0)
5395                 return ret;
5396
5397         if (copy_to_user(params, &p, sizeof(p)))
5398                 return -EFAULT;
5399
5400         return ret;
5401 }
5402
5403 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
5404                 struct io_uring_params __user *, params)
5405 {
5406         return io_uring_setup(entries, params);
5407 }
5408
5409 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
5410                                void __user *arg, unsigned nr_args)
5411         __releases(ctx->uring_lock)
5412         __acquires(ctx->uring_lock)
5413 {
5414         int ret;
5415
5416         /*
5417          * We're inside the ring mutex, if the ref is already dying, then
5418          * someone else killed the ctx or is already going through
5419          * io_uring_register().
5420          */
5421         if (percpu_ref_is_dying(&ctx->refs))
5422                 return -ENXIO;
5423
5424         percpu_ref_kill(&ctx->refs);
5425
5426         /*
5427          * Drop uring mutex before waiting for references to exit. If another
5428          * thread is currently inside io_uring_enter() it might need to grab
5429          * the uring_lock to make progress. If we hold it here across the drain
5430          * wait, then we can deadlock. It's safe to drop the mutex here, since
5431          * no new references will come in after we've killed the percpu ref.
5432          */
5433         mutex_unlock(&ctx->uring_lock);
5434         wait_for_completion(&ctx->completions[0]);
5435         mutex_lock(&ctx->uring_lock);
5436
5437         switch (opcode) {
5438         case IORING_REGISTER_BUFFERS:
5439                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
5440                 break;
5441         case IORING_UNREGISTER_BUFFERS:
5442                 ret = -EINVAL;
5443                 if (arg || nr_args)
5444                         break;
5445                 ret = io_sqe_buffer_unregister(ctx);
5446                 break;
5447         case IORING_REGISTER_FILES:
5448                 ret = io_sqe_files_register(ctx, arg, nr_args);
5449                 break;
5450         case IORING_UNREGISTER_FILES:
5451                 ret = -EINVAL;
5452                 if (arg || nr_args)
5453                         break;
5454                 ret = io_sqe_files_unregister(ctx);
5455                 break;
5456         case IORING_REGISTER_FILES_UPDATE:
5457                 ret = io_sqe_files_update(ctx, arg, nr_args);
5458                 break;
5459         case IORING_REGISTER_EVENTFD:
5460                 ret = -EINVAL;
5461                 if (nr_args != 1)
5462                         break;
5463                 ret = io_eventfd_register(ctx, arg);
5464                 break;
5465         case IORING_UNREGISTER_EVENTFD:
5466                 ret = -EINVAL;
5467                 if (arg || nr_args)
5468                         break;
5469                 ret = io_eventfd_unregister(ctx);
5470                 break;
5471         default:
5472                 ret = -EINVAL;
5473                 break;
5474         }
5475
5476         /* bring the ctx back to life */
5477         reinit_completion(&ctx->completions[0]);
5478         percpu_ref_reinit(&ctx->refs);
5479         return ret;
5480 }
5481
5482 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
5483                 void __user *, arg, unsigned int, nr_args)
5484 {
5485         struct io_ring_ctx *ctx;
5486         long ret = -EBADF;
5487         struct fd f;
5488
5489         f = fdget(fd);
5490         if (!f.file)
5491                 return -EBADF;
5492
5493         ret = -EOPNOTSUPP;
5494         if (f.file->f_op != &io_uring_fops)
5495                 goto out_fput;
5496
5497         ctx = f.file->private_data;
5498
5499         mutex_lock(&ctx->uring_lock);
5500         ret = __io_uring_register(ctx, opcode, arg, nr_args);
5501         mutex_unlock(&ctx->uring_lock);
5502         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
5503                                                         ctx->cq_ev_fd != NULL, ret);
5504 out_fput:
5505         fdput(f);
5506         return ret;
5507 }
5508
5509 static int __init io_uring_init(void)
5510 {
5511         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
5512         return 0;
5513 };
5514 __initcall(io_uring_init);