fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes, but also to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73 #include <linux/namei.h>
  74 #include <linux/fsnotify.h>
  75 #include <linux/fadvise.h>
  76 #include <linux/eventpoll.h>
  77 #include <linux/fs_struct.h>
  78 #include <linux/splice.h>
  79 #include <linux/task_work.h>
  80 #include <linux/pagemap.h>
  81 #include <linux/io_uring.h>
  82 #include <linux/blk-cgroup.h>
  83 #include <linux/audit.h>
  84
  85 #define CREATE_TRACE_POINTS
  86 #include <trace/events/io_uring.h>
  87
  88 #include <uapi/linux/io_uring.h>
  89
  90 #include "internal.h"
  91 #include "io-wq.h"
  92
  93 #define IORING_MAX_ENTRIES      32768
  94 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  95
  96 /*
  97  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  98  */
  99 #define IORING_FILE_TABLE_SHIFT 9
 100 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
 101 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
 102 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
 103 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
 104                                  IORING_REGISTER_LAST + IORING_OP_LAST)
 105
 106 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
 107                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
 108                                 IOSQE_BUFFER_SELECT)
 109
 110 struct io_uring {
 111         u32 head ____cacheline_aligned_in_smp;
 112         u32 tail ____cacheline_aligned_in_smp;
 113 };
 114
 115 /*
 116  * This data is shared with the application through the mmap at offsets
 117  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 118  *
 119  * The offsets to the member fields are published through struct
 120  * io_sqring_offsets when calling io_uring_setup.
 121  */
 122 struct io_rings {
 123         /*
 124          * Head and tail offsets into the ring; the offsets need to be
 125          * masked to get valid indices.
 126          *
 127          * The kernel controls head of the sq ring and the tail of the cq ring,
 128          * and the application controls tail of the sq ring and the head of the
 129          * cq ring.
 130          */
 131         struct io_uring         sq, cq;
 132         /*
 133          * Bitmasks to apply to head and tail offsets (constant, equals
 134          * ring_entries - 1)
 135          */
 136         u32                     sq_ring_mask, cq_ring_mask;
 137         /* Ring sizes (constant, power of 2) */
 138         u32                     sq_ring_entries, cq_ring_entries;
 139         /*
 140          * Number of invalid entries dropped by the kernel due to
 141          * invalid index stored in array
 142          *
 143          * Written by the kernel, shouldn't be modified by the
 144          * application (i.e. get number of "new events" by comparing to
 145          * cached value).
 146          *
 147          * After a new SQ head value was read by the application this
 148          * counter includes all submissions that were dropped reaching
 149          * the new SQ head (and possibly more).
 150          */
 151         u32                     sq_dropped;
 152         /*
 153          * Runtime SQ flags
 154          *
 155          * Written by the kernel, shouldn't be modified by the
 156          * application.
 157          *
 158          * The application needs a full memory barrier before checking
 159          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 160          */
 161         u32                     sq_flags;
 162         /*
 163          * Runtime CQ flags
 164          *
 165          * Written by the application, shouldn't be modified by the
 166          * kernel.
 167          */
 168         u32                     cq_flags;
 169         /*
 170          * Number of completion events lost because the queue was full;
 171          * this should be avoided by the application by making sure
 172          * there are not more requests pending than there is space in
 173          * the completion queue.
 174          *
 175          * Written by the kernel, shouldn't be modified by the
 176          * application (i.e. get number of "new events" by comparing to
 177          * cached value).
 178          *
 179          * As completion events come in out of order this counter is not
 180          * ordered with any other data.
 181          */
 182         u32                     cq_overflow;
 183         /*
 184          * Ring buffer of completion events.
 185          *
 186          * The kernel writes completion events fresh every time they are
 187          * produced, so the application is allowed to modify pending
 188          * entries.
 189          */
 190         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 191 };
 192
 193 enum io_uring_cmd_flags {
 194         IO_URING_F_NONBLOCK             = 1,
 195         IO_URING_F_COMPLETE_DEFER       = 2,
 196 };
 197
 198 struct io_mapped_ubuf {
 199         u64             ubuf;
 200         size_t          len;
 201         struct          bio_vec *bvec;
 202         unsigned int    nr_bvecs;
 203         unsigned long   acct_pages;
 204 };
 205
 206 struct io_ring_ctx;
 207
 208 struct io_rsrc_put {
 209         struct list_head list;
 210         union {
 211                 void *rsrc;
 212                 struct file *file;
 213         };
 214 };
 215
 216 struct fixed_rsrc_table {
 217         struct file             **files;
 218 };
 219
 220 struct fixed_rsrc_ref_node {
 221         struct percpu_ref               refs;
 222         struct list_head                node;
 223         struct list_head                rsrc_list;
 224         struct fixed_rsrc_data          *rsrc_data;
 225         void                            (*rsrc_put)(struct io_ring_ctx *ctx,
 226                                                     struct io_rsrc_put *prsrc);
 227         struct llist_node               llist;
 228         bool                            done;
 229 };
 230
 231 struct fixed_rsrc_data {
 232         struct fixed_rsrc_table         *table;
 233         struct io_ring_ctx              *ctx;
 234
 235         struct fixed_rsrc_ref_node      *node;
 236         struct percpu_ref               refs;
 237         struct completion               done;
 238         bool                            quiesce;
 239 };
 240
 241 struct io_buffer {
 242         struct list_head list;
 243         __u64 addr;
 244         __s32 len;
 245         __u16 bid;
 246 };
 247
 248 struct io_restriction {
 249         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 250         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 251         u8 sqe_flags_allowed;
 252         u8 sqe_flags_required;
 253         bool registered;
 254 };
 255
 256 enum {
 257         IO_SQ_THREAD_SHOULD_STOP = 0,
 258         IO_SQ_THREAD_SHOULD_PARK,
 259 };
 260
 261 struct io_sq_data {
 262         refcount_t              refs;
 263         struct mutex            lock;
 264
 265         /* ctx's that are using this sqd */
 266         struct list_head        ctx_list;
 267         struct list_head        ctx_new_list;
 268         struct mutex            ctx_lock;
 269
 270         struct task_struct      *thread;
 271         struct wait_queue_head  wait;
 272
 273         unsigned                sq_thread_idle;
 274         int                     sq_cpu;
 275         pid_t                   task_pid;
 276
 277         unsigned long           state;
 278         struct completion       startup;
 279         struct completion       completion;
 280         struct completion       exited;
 281 };
 282
 283 #define IO_IOPOLL_BATCH                 8
 284 #define IO_COMPL_BATCH                  32
 285 #define IO_REQ_CACHE_SIZE               32
 286 #define IO_REQ_ALLOC_BATCH              8
 287
 288 struct io_comp_state {
 289         struct io_kiocb         *reqs[IO_COMPL_BATCH];
 290         unsigned int            nr;
 291         unsigned int            locked_free_nr;
 292         /* inline/task_work completion list, under ->uring_lock */
 293         struct list_head        free_list;
 294         /* IRQ completion list, under ->completion_lock */
 295         struct list_head        locked_free_list;
 296 };
 297
 298 struct io_submit_link {
 299         struct io_kiocb         *head;
 300         struct io_kiocb         *last;
 301 };
 302
 303 struct io_submit_state {
 304         struct blk_plug         plug;
 305         struct io_submit_link   link;
 306
 307         /*
 308          * io_kiocb alloc cache
 309          */
 310         void                    *reqs[IO_REQ_CACHE_SIZE];
 311         unsigned int            free_reqs;
 312
 313         bool                    plug_started;
 314
 315         /*
 316          * Batch completion logic
 317          */
 318         struct io_comp_state    comp;
 319
 320         /*
 321          * File reference cache
 322          */
 323         struct file             *file;
 324         unsigned int            fd;
 325         unsigned int            file_refs;
 326         unsigned int            ios_left;
 327 };
 328
 329 struct io_ring_ctx {
 330         struct {
 331                 struct percpu_ref       refs;
 332         } ____cacheline_aligned_in_smp;
 333
 334         struct {
 335                 unsigned int            flags;
 336                 unsigned int            compat: 1;
 337                 unsigned int            cq_overflow_flushed: 1;
 338                 unsigned int            drain_next: 1;
 339                 unsigned int            eventfd_async: 1;
 340                 unsigned int            restricted: 1;
 341                 unsigned int            sqo_exec: 1;
 342
 343                 /*
 344                  * Ring buffer of indices into array of io_uring_sqe, which is
 345                  * mmapped by the application using the IORING_OFF_SQES offset.
 346                  *
 347                  * This indirection could e.g. be used to assign fixed
 348                  * io_uring_sqe entries to operations and only submit them to
 349                  * the queue when needed.
 350                  *
 351                  * The kernel modifies neither the indices array nor the entries
 352                  * array.
 353                  */
 354                 u32                     *sq_array;
 355                 unsigned                cached_sq_head;
 356                 unsigned                sq_entries;
 357                 unsigned                sq_mask;
 358                 unsigned                sq_thread_idle;
 359                 unsigned                cached_sq_dropped;
 360                 unsigned                cached_cq_overflow;
 361                 unsigned long           sq_check_overflow;
 362
 363                 /* hashed buffered write serialization */
 364                 struct io_wq_hash       *hash_map;
 365
 366                 struct list_head        defer_list;
 367                 struct list_head        timeout_list;
 368                 struct list_head        cq_overflow_list;
 369
 370                 struct io_uring_sqe     *sq_sqes;
 371         } ____cacheline_aligned_in_smp;
 372
 373         struct {
 374                 struct mutex            uring_lock;
 375                 wait_queue_head_t       wait;
 376         } ____cacheline_aligned_in_smp;
 377
 378         struct io_submit_state          submit_state;
 379
 380         struct io_rings *rings;
 381
 382         /* Only used for accounting purposes */
 383         struct mm_struct        *mm_account;
 384
 385         struct io_sq_data       *sq_data;       /* if using sq thread polling */
 386
 387         struct wait_queue_head  sqo_sq_wait;
 388         struct list_head        sqd_list;
 389
 390         /*
 391          * If used, fixed file set. Writers must ensure that ->refs is dead,
 392          * readers must ensure that ->refs is alive as long as the file* is
 393          * used. Only updated through io_uring_register(2).
 394          */
 395         struct fixed_rsrc_data  *file_data;
 396         unsigned                nr_user_files;
 397
 398         /* if used, fixed mapped user buffers */
 399         unsigned                nr_user_bufs;
 400         struct io_mapped_ubuf   *user_bufs;
 401
 402         struct user_struct      *user;
 403
 404         struct completion       ref_comp;
 405         struct completion       sq_thread_comp;
 406
 407 #if defined(CONFIG_UNIX)
 408         struct socket           *ring_sock;
 409 #endif
 410
 411         struct idr              io_buffer_idr;
 412
 413         struct idr              personality_idr;
 414
 415         struct {
 416                 unsigned                cached_cq_tail;
 417                 unsigned                cq_entries;
 418                 unsigned                cq_mask;
 419                 atomic_t                cq_timeouts;
 420                 unsigned                cq_last_tm_flush;
 421                 unsigned long           cq_check_overflow;
 422                 struct wait_queue_head  cq_wait;
 423                 struct fasync_struct    *cq_fasync;
 424                 struct eventfd_ctx      *cq_ev_fd;
 425         } ____cacheline_aligned_in_smp;
 426
 427         struct {
 428                 spinlock_t              completion_lock;
 429
 430                 /*
 431                  * ->iopoll_list is protected by the ctx->uring_lock for
 432                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 433                  * For SQPOLL, only the single threaded io_sq_thread() will
 434                  * manipulate the list, hence no extra locking is needed there.
 435                  */
 436                 struct list_head        iopoll_list;
 437                 struct hlist_head       *cancel_hash;
 438                 unsigned                cancel_hash_bits;
 439                 bool                    poll_multi_file;
 440
 441                 spinlock_t              inflight_lock;
 442                 struct list_head        inflight_list;
 443         } ____cacheline_aligned_in_smp;
 444
 445         struct delayed_work             rsrc_put_work;
 446         struct llist_head               rsrc_put_llist;
 447         struct list_head                rsrc_ref_list;
 448         spinlock_t                      rsrc_ref_lock;
 449
 450         struct io_restriction           restrictions;
 451
 452         /* exit task_work */
 453         struct callback_head            *exit_task_work;
 454
 455         struct wait_queue_head          hash_wait;
 456
 457         /* Keep this last, we don't need it for the fast path */
 458         struct work_struct              exit_work;
 459 };
 460
 461 /*
 462  * First field must be the file pointer in all the
 463  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 464  */
 465 struct io_poll_iocb {
 466         struct file                     *file;
 467         struct wait_queue_head          *head;
 468         __poll_t                        events;
 469         bool                            done;
 470         bool                            canceled;
 471         struct wait_queue_entry         wait;
 472 };
 473
 474 struct io_poll_remove {
 475         struct file                     *file;
 476         u64                             addr;
 477 };
 478
 479 struct io_close {
 480         struct file                     *file;
 481         int                             fd;
 482 };
 483
 484 struct io_timeout_data {
 485         struct io_kiocb                 *req;
 486         struct hrtimer                  timer;
 487         struct timespec64               ts;
 488         enum hrtimer_mode               mode;
 489 };
 490
 491 struct io_accept {
 492         struct file                     *file;
 493         struct sockaddr __user          *addr;
 494         int __user                      *addr_len;
 495         int                             flags;
 496         unsigned long                   nofile;
 497 };
 498
 499 struct io_sync {
 500         struct file                     *file;
 501         loff_t                          len;
 502         loff_t                          off;
 503         int                             flags;
 504         int                             mode;
 505 };
 506
 507 struct io_cancel {
 508         struct file                     *file;
 509         u64                             addr;
 510 };
 511
 512 struct io_timeout {
 513         struct file                     *file;
 514         u32                             off;
 515         u32                             target_seq;
 516         struct list_head                list;
 517         /* head of the link, used by linked timeouts only */
 518         struct io_kiocb                 *head;
 519 };
 520
 521 struct io_timeout_rem {
 522         struct file                     *file;
 523         u64                             addr;
 524
 525         /* timeout update */
 526         struct timespec64               ts;
 527         u32                             flags;
 528 };
 529
 530 struct io_rw {
 531         /* NOTE: kiocb has the file as the first member, so don't do it here */
 532         struct kiocb                    kiocb;
 533         u64                             addr;
 534         u64                             len;
 535 };
 536
 537 struct io_connect {
 538         struct file                     *file;
 539         struct sockaddr __user          *addr;
 540         int                             addr_len;
 541 };
 542
 543 struct io_sr_msg {
 544         struct file                     *file;
 545         union {
 546                 struct user_msghdr __user *umsg;
 547                 void __user             *buf;
 548         };
 549         int                             msg_flags;
 550         int                             bgid;
 551         size_t                          len;
 552         struct io_buffer                *kbuf;
 553 };
 554
 555 struct io_open {
 556         struct file                     *file;
 557         int                             dfd;
 558         struct filename                 *filename;
 559         struct open_how                 how;
 560         unsigned long                   nofile;
 561 };
 562
 563 struct io_rsrc_update {
 564         struct file                     *file;
 565         u64                             arg;
 566         u32                             nr_args;
 567         u32                             offset;
 568 };
 569
 570 struct io_fadvise {
 571         struct file                     *file;
 572         u64                             offset;
 573         u32                             len;
 574         u32                             advice;
 575 };
 576
 577 struct io_madvise {
 578         struct file                     *file;
 579         u64                             addr;
 580         u32                             len;
 581         u32                             advice;
 582 };
 583
 584 struct io_epoll {
 585         struct file                     *file;
 586         int                             epfd;
 587         int                             op;
 588         int                             fd;
 589         struct epoll_event              event;
 590 };
 591
 592 struct io_splice {
 593         struct file                     *file_out;
 594         struct file                     *file_in;
 595         loff_t                          off_out;
 596         loff_t                          off_in;
 597         u64                             len;
 598         unsigned int                    flags;
 599 };
 600
 601 struct io_provide_buf {
 602         struct file                     *file;
 603         __u64                           addr;
 604         __s32                           len;
 605         __u32                           bgid;
 606         __u16                           nbufs;
 607         __u16                           bid;
 608 };
 609
 610 struct io_statx {
 611         struct file                     *file;
 612         int                             dfd;
 613         unsigned int                    mask;
 614         unsigned int                    flags;
 615         const char __user               *filename;
 616         struct statx __user             *buffer;
 617 };
 618
 619 struct io_shutdown {
 620         struct file                     *file;
 621         int                             how;
 622 };
 623
 624 struct io_rename {
 625         struct file                     *file;
 626         int                             old_dfd;
 627         int                             new_dfd;
 628         struct filename                 *oldpath;
 629         struct filename                 *newpath;
 630         int                             flags;
 631 };
 632
 633 struct io_unlink {
 634         struct file                     *file;
 635         int                             dfd;
 636         int                             flags;
 637         struct filename                 *filename;
 638 };
 639
 640 struct io_completion {
 641         struct file                     *file;
 642         struct list_head                list;
 643         int                             cflags;
 644 };
 645
 646 struct io_async_connect {
 647         struct sockaddr_storage         address;
 648 };
 649
 650 struct io_async_msghdr {
 651         struct iovec                    fast_iov[UIO_FASTIOV];
 652         /* points to an allocated iov, if NULL we use fast_iov instead */
 653         struct iovec                    *free_iov;
 654         struct sockaddr __user          *uaddr;
 655         struct msghdr                   msg;
 656         struct sockaddr_storage         addr;
 657 };
 658
 659 struct io_async_rw {
 660         struct iovec                    fast_iov[UIO_FASTIOV];
 661         const struct iovec              *free_iovec;
 662         struct iov_iter                 iter;
 663         size_t                          bytes_done;
 664         struct wait_page_queue          wpq;
 665 };
 666
 667 enum {
 668         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 669         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 670         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 671         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 672         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 673         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 674
 675         REQ_F_FAIL_LINK_BIT,
 676         REQ_F_INFLIGHT_BIT,
 677         REQ_F_CUR_POS_BIT,
 678         REQ_F_NOWAIT_BIT,
 679         REQ_F_LINK_TIMEOUT_BIT,
 680         REQ_F_ISREG_BIT,
 681         REQ_F_NEED_CLEANUP_BIT,
 682         REQ_F_POLLED_BIT,
 683         REQ_F_BUFFER_SELECTED_BIT,
 684         REQ_F_NO_FILE_TABLE_BIT,
 685         REQ_F_LTIMEOUT_ACTIVE_BIT,
 686         REQ_F_COMPLETE_INLINE_BIT,
 687
 688         /* not a real bit, just to check we're not overflowing the space */
 689         __REQ_F_LAST_BIT,
 690 };
 691
 692 enum {
 693         /* ctx owns file */
 694         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 695         /* drain existing IO first */
 696         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 697         /* linked sqes */
 698         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 699         /* doesn't sever on completion < 0 */
 700         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 701         /* IOSQE_ASYNC */
 702         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 703         /* IOSQE_BUFFER_SELECT */
 704         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 705
 706         /* fail rest of links */
 707         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 708         /* on inflight list */
 709         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 710         /* read/write uses file position */
 711         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 712         /* must not punt to workers */
 713         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 714         /* has or had linked timeout */
 715         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 716         /* regular file */
 717         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 718         /* needs cleanup */
 719         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 720         /* already went through poll handler */
 721         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 722         /* buffer already selected */
 723         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 724         /* doesn't need file table for this request */
 725         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 726         /* linked timeout is active, i.e. prepared by link's head */
 727         REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
 728         /* completion is deferred through io_comp_state */
 729         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 730 };
 731
 732 struct async_poll {
 733         struct io_poll_iocb     poll;
 734         struct io_poll_iocb     *double_poll;
 735 };
 736
 737 struct io_task_work {
 738         struct io_wq_work_node  node;
 739         task_work_func_t        func;
 740 };
 741
 742 /*
 743  * NOTE! Each of the iocb union members has the file pointer
 744  * as the first entry in their struct definition. So you can
 745  * access the file pointer through any of the sub-structs,
 746  * or directly as just 'ki_filp' in this struct.
 747  */
 748 struct io_kiocb {
 749         union {
 750                 struct file             *file;
 751                 struct io_rw            rw;
 752                 struct io_poll_iocb     poll;
 753                 struct io_poll_remove   poll_remove;
 754                 struct io_accept        accept;
 755                 struct io_sync          sync;
 756                 struct io_cancel        cancel;
 757                 struct io_timeout       timeout;
 758                 struct io_timeout_rem   timeout_rem;
 759                 struct io_connect       connect;
 760                 struct io_sr_msg        sr_msg;
 761                 struct io_open          open;
 762                 struct io_close         close;
 763                 struct io_rsrc_update   rsrc_update;
 764                 struct io_fadvise       fadvise;
 765                 struct io_madvise       madvise;
 766                 struct io_epoll         epoll;
 767                 struct io_splice        splice;
 768                 struct io_provide_buf   pbuf;
 769                 struct io_statx         statx;
 770                 struct io_shutdown      shutdown;
 771                 struct io_rename        rename;
 772                 struct io_unlink        unlink;
 773                 /* use only after cleaning per-op data, see io_clean_op() */
 774                 struct io_completion    compl;
 775         };
 776
 777         /* opcode allocated if it needs to store data for async defer */
 778         void                            *async_data;
 779         u8                              opcode;
 780         /* polled IO has completed */
 781         u8                              iopoll_completed;
 782
 783         u16                             buf_index;
 784         u32                             result;
 785
 786         struct io_ring_ctx              *ctx;
 787         unsigned int                    flags;
 788         refcount_t                      refs;
 789         struct task_struct              *task;
 790         u64                             user_data;
 791
 792         struct io_kiocb                 *link;
 793         struct percpu_ref               *fixed_rsrc_refs;
 794
 795         /*
 796          * 1. used with ctx->iopoll_list with reads/writes
 797          * 2. to track reqs with ->files (see io_op_def::file_table)
 798          */
 799         struct list_head                inflight_entry;
 800         union {
 801                 struct io_task_work     io_task_work;
 802                 struct callback_head    task_work;
 803         };
 804         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 805         struct hlist_node               hash_node;
 806         struct async_poll               *apoll;
 807         struct io_wq_work               work;
 808 };
 809
 810 struct io_defer_entry {
 811         struct list_head        list;
 812         struct io_kiocb         *req;
 813         u32                     seq;
 814 };
 815
     /*
      * Static, per-opcode properties.  One instance exists per opcode in the
      * io_op_defs[] table, which is indexed by req->opcode.
      */
 816 struct io_op_def {
 817         /* needs req->file assigned */
 818         unsigned                needs_file : 1;
 819         /* hash wq insertion if file is a regular file */
 820         unsigned                hash_reg_file : 1;
 821         /* unbound wq insertion if file is a non-regular file */
 822         unsigned                unbound_nonreg_file : 1;
 823         /* opcode is not supported by this kernel */
 824         unsigned                not_supported : 1;
 825         /* set if opcode supports polled "wait" */
 826         unsigned                pollin : 1;
     /* counterpart of pollin, set by the write/send/connect style opcodes */
 827         unsigned                pollout : 1;
 828         /* op supports buffer selection */
 829         unsigned                buffer_select : 1;
 830         /* must always have async data allocated */
 831         unsigned                needs_async_data : 1;
 832         /* should block plug */
 833         unsigned                plug : 1;
 834         /* size of async data needed, if any */
 835         unsigned short          async_size;
 836 };
 837
     /*
      * Per-opcode property table, indexed by IORING_OP_* value and looked up
      * as io_op_defs[req->opcode].  Fields not named in an initializer are
      * zero, so an empty entry means "no file needed, no async data, no
      * special work-queue handling".
      */
 838 static const struct io_op_def io_op_defs[] = {
 839         [IORING_OP_NOP] = {},
 840         [IORING_OP_READV] = {
 841                 .needs_file             = 1,
 842                 .unbound_nonreg_file    = 1,
 843                 .pollin                 = 1,
 844                 .buffer_select          = 1,
 845                 .needs_async_data       = 1,
 846                 .plug                   = 1,
 847                 .async_size             = sizeof(struct io_async_rw),
 848         },
 849         [IORING_OP_WRITEV] = {
 850                 .needs_file             = 1,
 851                 .hash_reg_file          = 1,
 852                 .unbound_nonreg_file    = 1,
 853                 .pollout                = 1,
 854                 .needs_async_data       = 1,
 855                 .plug                   = 1,
 856                 .async_size             = sizeof(struct io_async_rw),
 857         },
 858         [IORING_OP_FSYNC] = {
 859                 .needs_file             = 1,
 860         },
 861         [IORING_OP_READ_FIXED] = {
 862                 .needs_file             = 1,
 863                 .unbound_nonreg_file    = 1,
 864                 .pollin                 = 1,
 865                 .plug                   = 1,
 866                 .async_size             = sizeof(struct io_async_rw),
 867         },
 868         [IORING_OP_WRITE_FIXED] = {
 869                 .needs_file             = 1,
 870                 .hash_reg_file          = 1,
 871                 .unbound_nonreg_file    = 1,
 872                 .pollout                = 1,
 873                 .plug                   = 1,
 874                 .async_size             = sizeof(struct io_async_rw),
 875         },
 876         [IORING_OP_POLL_ADD] = {
 877                 .needs_file             = 1,
 878                 .unbound_nonreg_file    = 1,
 879         },
 880         [IORING_OP_POLL_REMOVE] = {},
 881         [IORING_OP_SYNC_FILE_RANGE] = {
 882                 .needs_file             = 1,
 883         },
 884         [IORING_OP_SENDMSG] = {
 885                 .needs_file             = 1,
 886                 .unbound_nonreg_file    = 1,
 887                 .pollout                = 1,
 888                 .needs_async_data       = 1,
 889                 .async_size             = sizeof(struct io_async_msghdr),
 890         },
 891         [IORING_OP_RECVMSG] = {
 892                 .needs_file             = 1,
 893                 .unbound_nonreg_file    = 1,
 894                 .pollin                 = 1,
 895                 .buffer_select          = 1,
 896                 .needs_async_data       = 1,
 897                 .async_size             = sizeof(struct io_async_msghdr),
 898         },
 899         [IORING_OP_TIMEOUT] = {
 900                 .needs_async_data       = 1,
 901                 .async_size             = sizeof(struct io_timeout_data),
 902         },
 903         [IORING_OP_TIMEOUT_REMOVE] = {
 904                 /* used by timeout updates' prep() */
 905         },
 906         [IORING_OP_ACCEPT] = {
 907                 .needs_file             = 1,
 908                 .unbound_nonreg_file    = 1,
 909                 .pollin                 = 1,
 910         },
 911         [IORING_OP_ASYNC_CANCEL] = {},
 912         [IORING_OP_LINK_TIMEOUT] = {
 913                 .needs_async_data       = 1,
 914                 .async_size             = sizeof(struct io_timeout_data),
 915         },
 916         [IORING_OP_CONNECT] = {
 917                 .needs_file             = 1,
 918                 .unbound_nonreg_file    = 1,
 919                 .pollout                = 1,
 920                 .needs_async_data       = 1,
 921                 .async_size             = sizeof(struct io_async_connect),
 922         },
 923         [IORING_OP_FALLOCATE] = {
 924                 .needs_file             = 1,
 925         },
 926         [IORING_OP_OPENAT] = {},
 927         [IORING_OP_CLOSE] = {},
 928         [IORING_OP_FILES_UPDATE] = {},
 929         [IORING_OP_STATX] = {},
 930         [IORING_OP_READ] = {
 931                 .needs_file             = 1,
 932                 .unbound_nonreg_file    = 1,
 933                 .pollin                 = 1,
 934                 .buffer_select          = 1,
 935                 .plug                   = 1,
 936                 .async_size             = sizeof(struct io_async_rw),
 937         },
 938         [IORING_OP_WRITE] = {
 939                 .needs_file             = 1,
 940                 .unbound_nonreg_file    = 1,
 941                 .pollout                = 1,
 942                 .plug                   = 1,
 943                 .async_size             = sizeof(struct io_async_rw),
 944         },
 945         [IORING_OP_FADVISE] = {
 946                 .needs_file             = 1,
 947         },
 948         [IORING_OP_MADVISE] = {},
 949         [IORING_OP_SEND] = {
 950                 .needs_file             = 1,
 951                 .unbound_nonreg_file    = 1,
 952                 .pollout                = 1,
 953         },
 954         [IORING_OP_RECV] = {
 955                 .needs_file             = 1,
 956                 .unbound_nonreg_file    = 1,
 957                 .pollin                 = 1,
 958                 .buffer_select          = 1,
 959         },
 960         [IORING_OP_OPENAT2] = {
 961         },
 962         [IORING_OP_EPOLL_CTL] = {
 963                 .unbound_nonreg_file    = 1,
 964         },
 965         [IORING_OP_SPLICE] = {
 966                 .needs_file             = 1,
 967                 .hash_reg_file          = 1,
 968                 .unbound_nonreg_file    = 1,
 969         },
 970         [IORING_OP_PROVIDE_BUFFERS] = {},
 971         [IORING_OP_REMOVE_BUFFERS] = {},
 972         [IORING_OP_TEE] = {
 973                 .needs_file             = 1,
 974                 .hash_reg_file          = 1,
 975                 .unbound_nonreg_file    = 1,
 976         },
 977         [IORING_OP_SHUTDOWN] = {
 978                 .needs_file             = 1,
 979         },
 980         [IORING_OP_RENAMEAT] = {},
 981         [IORING_OP_UNLINKAT] = {},
 982 };
 983
     /*
      * Forward declarations of static helpers that are used before their
      * definitions appear in this file.
      */
 984 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 985                                          struct task_struct *task,
 986                                          struct files_struct *files);
 987 static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
 988 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
 989 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 990                         struct io_ring_ctx *ctx);
 991 static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
 992
 993 static bool io_rw_reissue(struct io_kiocb *req);
 994 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 995 static void io_put_req(struct io_kiocb *req);
 996 static void io_put_req_deferred(struct io_kiocb *req, int nr);
 997 static void io_double_put_req(struct io_kiocb *req);
 998 static void io_dismantle_req(struct io_kiocb *req);
 999 static void io_put_task(struct task_struct *task, int nr);
1000 static void io_queue_next(struct io_kiocb *req);
1001 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
1002 static void __io_queue_linked_timeout(struct io_kiocb *req);
1003 static void io_queue_linked_timeout(struct io_kiocb *req);
1004 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
1005                                  struct io_uring_rsrc_update *ip,
1006                                  unsigned nr_args);
1007 static void __io_clean_op(struct io_kiocb *req);
1008 static struct file *io_file_get(struct io_submit_state *state,
1009                                 struct io_kiocb *req, int fd, bool fixed);
1010 static void __io_queue_sqe(struct io_kiocb *req);
1011 static void io_rsrc_put_work(struct work_struct *work);
1012
1013 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
1014                            struct iov_iter *iter, bool needs_lock);
1015 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
1016                              const struct iovec *fast_iov,
1017                              struct iov_iter *iter, bool force);
1018 static void io_req_task_queue(struct io_kiocb *req);
1019 static void io_submit_flush_completions(struct io_comp_state *cs,
1020                                         struct io_ring_ctx *ctx);
1021
     /* slab cache that io_alloc_req() allocates struct io_kiocb requests from */
1022 static struct kmem_cache *req_cachep;
1023
     /*
      * Tentative declaration; comparing a file's ->f_op against this is how
      * the code below recognises a file as an io_uring instance.
      */
1024 static const struct file_operations io_uring_fops;
1025
1026 struct sock *io_uring_get_socket(struct file *file)
1027 {
1028 #if defined(CONFIG_UNIX)
1029         if (file->f_op == &io_uring_fops) {
1030                 struct io_ring_ctx *ctx = file->private_data;
1031
1032                 return ctx->ring_sock->sk;
1033         }
1034 #endif
1035         return NULL;
1036 }
1037 EXPORT_SYMBOL(io_uring_get_socket);
1038
/* Walk @pos over a chain of requests, starting at @head and following ->link */
#define io_for_each_link(pos, head) \
	for ((pos) = (head); (pos) != NULL; (pos) = (pos)->link)
1041
1042 static inline void io_clean_op(struct io_kiocb *req)
1043 {
1044         if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
1045                 __io_clean_op(req);
1046 }
1047
1048 static inline void io_set_resource_node(struct io_kiocb *req)
1049 {
1050         struct io_ring_ctx *ctx = req->ctx;
1051
1052         if (!req->fixed_rsrc_refs) {
1053                 req->fixed_rsrc_refs = &ctx->file_data->node->refs;
1054                 percpu_ref_get(req->fixed_rsrc_refs);
1055         }
1056 }
1057
1058 static bool io_match_task(struct io_kiocb *head,
1059                           struct task_struct *task,
1060                           struct files_struct *files)
1061 {
1062         struct io_kiocb *req;
1063
1064         if (task && head->task != task) {
1065                 /* in terms of cancelation, always match if req task is dead */
1066                 if (head->task->flags & PF_EXITING)
1067                         return true;
1068                 return false;
1069         }
1070         if (!files)
1071                 return true;
1072
1073         io_for_each_link(req, head) {
1074                 if (req->file && req->file->f_op == &io_uring_fops)
1075                         return true;
1076                 if (req->task->files == files)
1077                         return true;
1078         }
1079         return false;
1080 }
1081
1082 static inline void req_set_fail_links(struct io_kiocb *req)
1083 {
1084         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1085                 req->flags |= REQ_F_FAIL_LINK;
1086 }
1087
1088 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1089 {
1090         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1091
1092         complete(&ctx->ref_comp);
1093 }
1094
1095 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1096 {
1097         return !req->timeout.off;
1098 }
1099
1100 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1101 {
1102         struct io_ring_ctx *ctx;
1103         int hash_bits;
1104
1105         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1106         if (!ctx)
1107                 return NULL;
1108
1109         /*
1110          * Use 5 bits less than the max cq entries, that should give us around
1111          * 32 entries per hash list if totally full and uniformly spread.
1112          */
1113         hash_bits = ilog2(p->cq_entries);
1114         hash_bits -= 5;
1115         if (hash_bits <= 0)
1116                 hash_bits = 1;
1117         ctx->cancel_hash_bits = hash_bits;
1118         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1119                                         GFP_KERNEL);
1120         if (!ctx->cancel_hash)
1121                 goto err;
1122         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1123
1124         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1125                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1126                 goto err;
1127
1128         ctx->flags = p->flags;
1129         init_waitqueue_head(&ctx->sqo_sq_wait);
1130         INIT_LIST_HEAD(&ctx->sqd_list);
1131         init_waitqueue_head(&ctx->cq_wait);
1132         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1133         init_completion(&ctx->ref_comp);
1134         init_completion(&ctx->sq_thread_comp);
1135         idr_init(&ctx->io_buffer_idr);
1136         idr_init(&ctx->personality_idr);
1137         mutex_init(&ctx->uring_lock);
1138         init_waitqueue_head(&ctx->wait);
1139         spin_lock_init(&ctx->completion_lock);
1140         INIT_LIST_HEAD(&ctx->iopoll_list);
1141         INIT_LIST_HEAD(&ctx->defer_list);
1142         INIT_LIST_HEAD(&ctx->timeout_list);
1143         spin_lock_init(&ctx->inflight_lock);
1144         INIT_LIST_HEAD(&ctx->inflight_list);
1145         spin_lock_init(&ctx->rsrc_ref_lock);
1146         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1147         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1148         init_llist_head(&ctx->rsrc_put_llist);
1149         INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
1150         INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
1151         return ctx;
1152 err:
1153         kfree(ctx->cancel_hash);
1154         kfree(ctx);
1155         return NULL;
1156 }
1157
1158 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1159 {
1160         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1161                 struct io_ring_ctx *ctx = req->ctx;
1162
1163                 return seq != ctx->cached_cq_tail
1164                                 + READ_ONCE(ctx->cached_cq_overflow);
1165         }
1166
1167         return false;
1168 }
1169
1170 static void io_req_track_inflight(struct io_kiocb *req)
1171 {
1172         struct io_ring_ctx *ctx = req->ctx;
1173
1174         if (!(req->flags & REQ_F_INFLIGHT)) {
1175                 req->flags |= REQ_F_INFLIGHT;
1176
1177                 spin_lock_irq(&ctx->inflight_lock);
1178                 list_add(&req->inflight_entry, &ctx->inflight_list);
1179                 spin_unlock_irq(&ctx->inflight_lock);
1180         }
1181 }
1182
1183 static void io_prep_async_work(struct io_kiocb *req)
1184 {
1185         const struct io_op_def *def = &io_op_defs[req->opcode];
1186         struct io_ring_ctx *ctx = req->ctx;
1187
1188         if (req->flags & REQ_F_FORCE_ASYNC)
1189                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1190
1191         if (req->flags & REQ_F_ISREG) {
1192                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1193                         io_wq_hash_work(&req->work, file_inode(req->file));
1194         } else {
1195                 if (def->unbound_nonreg_file)
1196                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1197         }
1198 }
1199
1200 static void io_prep_async_link(struct io_kiocb *req)
1201 {
1202         struct io_kiocb *cur;
1203
1204         io_for_each_link(cur, req)
1205                 io_prep_async_work(cur);
1206 }
1207
1208 static void io_queue_async_work(struct io_kiocb *req)
1209 {
1210         struct io_ring_ctx *ctx = req->ctx;
1211         struct io_kiocb *link = io_prep_linked_timeout(req);
1212         struct io_uring_task *tctx = req->task->io_uring;
1213
1214         BUG_ON(!tctx);
1215         BUG_ON(!tctx->io_wq);
1216
1217         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1218                                         &req->work, req->flags);
1219         /* init ->work of the whole link before punting */
1220         io_prep_async_link(req);
1221         io_wq_enqueue(tctx->io_wq, &req->work);
1222         if (link)
1223                 io_queue_linked_timeout(link);
1224 }
1225
1226 static void io_kill_timeout(struct io_kiocb *req)
1227 {
1228         struct io_timeout_data *io = req->async_data;
1229         int ret;
1230
1231         ret = hrtimer_try_to_cancel(&io->timer);
1232         if (ret != -1) {
1233                 atomic_set(&req->ctx->cq_timeouts,
1234                         atomic_read(&req->ctx->cq_timeouts) + 1);
1235                 list_del_init(&req->timeout.list);
1236                 io_cqring_fill_event(req, 0);
1237                 io_put_req_deferred(req, 1);
1238         }
1239 }
1240
1241 /*
1242  * Returns true if we found and killed one or more timeouts
1243  */
1244 static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
1245                              struct files_struct *files)
1246 {
1247         struct io_kiocb *req, *tmp;
1248         int canceled = 0;
1249
1250         spin_lock_irq(&ctx->completion_lock);
1251         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1252                 if (io_match_task(req, tsk, files)) {
1253                         io_kill_timeout(req);
1254                         canceled++;
1255                 }
1256         }
1257         spin_unlock_irq(&ctx->completion_lock);
1258         return canceled != 0;
1259 }
1260
1261 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1262 {
1263         do {
1264                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1265                                                 struct io_defer_entry, list);
1266
1267                 if (req_need_defer(de->req, de->seq))
1268                         break;
1269                 list_del_init(&de->list);
1270                 io_req_task_queue(de->req);
1271                 kfree(de);
1272         } while (!list_empty(&ctx->defer_list));
1273 }
1274
1275 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1276 {
1277         u32 seq;
1278
1279         if (list_empty(&ctx->timeout_list))
1280                 return;
1281
1282         seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1283
1284         do {
1285                 u32 events_needed, events_got;
1286                 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1287                                                 struct io_kiocb, timeout.list);
1288
1289                 if (io_is_timeout_noseq(req))
1290                         break;
1291
1292                 /*
1293                  * Since seq can easily wrap around over time, subtract
1294                  * the last seq at which timeouts were flushed before comparing.
1295                  * Assuming not more than 2^31-1 events have happened since,
1296                  * these subtractions won't have wrapped, so we can check if
1297                  * target is in [last_seq, current_seq] by comparing the two.
1298                  */
1299                 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1300                 events_got = seq - ctx->cq_last_tm_flush;
1301                 if (events_got < events_needed)
1302                         break;
1303
1304                 list_del_init(&req->timeout.list);
1305                 io_kill_timeout(req);
1306         } while (!list_empty(&ctx->timeout_list));
1307
1308         ctx->cq_last_tm_flush = seq;
1309 }
1310
1311 static void io_commit_cqring(struct io_ring_ctx *ctx)
1312 {
1313         io_flush_timeouts(ctx);
1314
1315         /* order cqe stores with ring update */
1316         smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1317
1318         if (unlikely(!list_empty(&ctx->defer_list)))
1319                 __io_queue_deferred(ctx);
1320 }
1321
1322 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1323 {
1324         struct io_rings *r = ctx->rings;
1325
1326         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1327 }
1328
1329 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1330 {
1331         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1332 }
1333
1334 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1335 {
1336         struct io_rings *rings = ctx->rings;
1337         unsigned tail;
1338
1339         /*
1340          * writes to the cq entry need to come after reading head; the
1341          * control dependency is enough as we're using WRITE_ONCE to
1342          * fill the cq entry
1343          */
1344         if (__io_cqring_events(ctx) == rings->cq_ring_entries)
1345                 return NULL;
1346
1347         tail = ctx->cached_cq_tail++;
1348         return &rings->cqes[tail & ctx->cq_mask];
1349 }
1350
1351 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1352 {
1353         if (!ctx->cq_ev_fd)
1354                 return false;
1355         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1356                 return false;
1357         if (!ctx->eventfd_async)
1358                 return true;
1359         return io_wq_current_is_worker();
1360 }
1361
1362 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1363 {
1364         /* see waitqueue_active() comment */
1365         smp_mb();
1366
1367         if (waitqueue_active(&ctx->wait))
1368                 wake_up(&ctx->wait);
1369         if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1370                 wake_up(&ctx->sq_data->wait);
1371         if (io_should_trigger_evfd(ctx))
1372                 eventfd_signal(ctx->cq_ev_fd, 1);
1373         if (waitqueue_active(&ctx->cq_wait)) {
1374                 wake_up_interruptible(&ctx->cq_wait);
1375                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1376         }
1377 }
1378
1379 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1380 {
1381         /* see waitqueue_active() comment */
1382         smp_mb();
1383
1384         if (ctx->flags & IORING_SETUP_SQPOLL) {
1385                 if (waitqueue_active(&ctx->wait))
1386                         wake_up(&ctx->wait);
1387         }
1388         if (io_should_trigger_evfd(ctx))
1389                 eventfd_signal(ctx->cq_ev_fd, 1);
1390         if (waitqueue_active(&ctx->cq_wait)) {
1391                 wake_up_interruptible(&ctx->cq_wait);
1392                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1393         }
1394 }
1395
1396 /* Returns true if there are no backlogged entries after the flush */
1397 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1398                                        struct task_struct *tsk,
1399                                        struct files_struct *files)
1400 {
1401         struct io_rings *rings = ctx->rings;
1402         struct io_kiocb *req, *tmp;
1403         struct io_uring_cqe *cqe;
1404         unsigned long flags;
1405         bool all_flushed, posted;
1406         LIST_HEAD(list);
1407
1408         if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
1409                 return false;
1410
1411         posted = false;
1412         spin_lock_irqsave(&ctx->completion_lock, flags);
1413         list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
1414                 if (!io_match_task(req, tsk, files))
1415                         continue;
1416
1417                 cqe = io_get_cqring(ctx);
1418                 if (!cqe && !force)
1419                         break;
1420
1421                 list_move(&req->compl.list, &list);
1422                 if (cqe) {
1423                         WRITE_ONCE(cqe->user_data, req->user_data);
1424                         WRITE_ONCE(cqe->res, req->result);
1425                         WRITE_ONCE(cqe->flags, req->compl.cflags);
1426                 } else {
1427                         ctx->cached_cq_overflow++;
1428                         WRITE_ONCE(ctx->rings->cq_overflow,
1429                                    ctx->cached_cq_overflow);
1430                 }
1431                 posted = true;
1432         }
1433
1434         all_flushed = list_empty(&ctx->cq_overflow_list);
1435         if (all_flushed) {
1436                 clear_bit(0, &ctx->sq_check_overflow);
1437                 clear_bit(0, &ctx->cq_check_overflow);
1438                 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1439         }
1440
1441         if (posted)
1442                 io_commit_cqring(ctx);
1443         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1444         if (posted)
1445                 io_cqring_ev_posted(ctx);
1446
1447         while (!list_empty(&list)) {
1448                 req = list_first_entry(&list, struct io_kiocb, compl.list);
1449                 list_del(&req->compl.list);
1450                 io_put_req(req);
1451         }
1452
1453         return all_flushed;
1454 }
1455
1456 static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1457                                      struct task_struct *tsk,
1458                                      struct files_struct *files)
1459 {
1460         if (test_bit(0, &ctx->cq_check_overflow)) {
1461                 /* iopoll syncs against uring_lock, not completion_lock */
1462                 if (ctx->flags & IORING_SETUP_IOPOLL)
1463                         mutex_lock(&ctx->uring_lock);
1464                 __io_cqring_overflow_flush(ctx, force, tsk, files);
1465                 if (ctx->flags & IORING_SETUP_IOPOLL)
1466                         mutex_unlock(&ctx->uring_lock);
1467         }
1468 }
1469
1470 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1471 {
1472         struct io_ring_ctx *ctx = req->ctx;
1473         struct io_uring_cqe *cqe;
1474
1475         trace_io_uring_complete(ctx, req->user_data, res);
1476
1477         /*
1478          * If we can't get a cq entry, userspace overflowed the
1479          * submission (by quite a lot). Increment the overflow count in
1480          * the ring.
1481          */
1482         cqe = io_get_cqring(ctx);
1483         if (likely(cqe)) {
1484                 WRITE_ONCE(cqe->user_data, req->user_data);
1485                 WRITE_ONCE(cqe->res, res);
1486                 WRITE_ONCE(cqe->flags, cflags);
1487         } else if (ctx->cq_overflow_flushed ||
1488                    atomic_read(&req->task->io_uring->in_idle)) {
1489                 /*
1490                  * If we're in ring overflow flush mode, or in task cancel mode,
1491                  * then we cannot store the request for later flushing, we need
1492                  * to drop it on the floor.
1493                  */
1494                 ctx->cached_cq_overflow++;
1495                 WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
1496         } else {
1497                 if (list_empty(&ctx->cq_overflow_list)) {
1498                         set_bit(0, &ctx->sq_check_overflow);
1499                         set_bit(0, &ctx->cq_check_overflow);
1500                         ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
1501                 }
1502                 io_clean_op(req);
1503                 req->result = res;
1504                 req->compl.cflags = cflags;
1505                 refcount_inc(&req->refs);
1506                 list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
1507         }
1508 }
1509
/* Post a completion for @req with result @res and no CQE flags */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
	const long no_cflags = 0;

	__io_cqring_fill_event(req, res, no_cflags);
}
1514
1515 static inline void io_req_complete_post(struct io_kiocb *req, long res,
1516                                         unsigned int cflags)
1517 {
1518         struct io_ring_ctx *ctx = req->ctx;
1519         unsigned long flags;
1520
1521         spin_lock_irqsave(&ctx->completion_lock, flags);
1522         __io_cqring_fill_event(req, res, cflags);
1523         io_commit_cqring(ctx);
1524         /*
1525          * If we're the last reference to this request, add to our locked
1526          * free_list cache.
1527          */
1528         if (refcount_dec_and_test(&req->refs)) {
1529                 struct io_comp_state *cs = &ctx->submit_state.comp;
1530
1531                 io_dismantle_req(req);
1532                 io_put_task(req->task, 1);
1533                 list_add(&req->compl.list, &cs->locked_free_list);
1534                 cs->locked_free_nr++;
1535         } else
1536                 req = NULL;
1537         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1538
1539         io_cqring_ev_posted(ctx);
1540         if (req) {
1541                 io_queue_next(req);
1542                 percpu_ref_put(&ctx->refs);
1543         }
1544 }
1545
1546 static void io_req_complete_state(struct io_kiocb *req, long res,
1547                                   unsigned int cflags)
1548 {
1549         io_clean_op(req);
1550         req->result = res;
1551         req->compl.cflags = cflags;
1552         req->flags |= REQ_F_COMPLETE_INLINE;
1553 }
1554
1555 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1556                                      long res, unsigned cflags)
1557 {
1558         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1559                 io_req_complete_state(req, res, cflags);
1560         else
1561                 io_req_complete_post(req, res, cflags);
1562 }
1563
/* Complete @req with @res: no issue flags and no CQE flags */
static inline void io_req_complete(struct io_kiocb *req, long res)
{
	const unsigned int issue_flags = 0;
	const unsigned int cflags = 0;

	__io_req_complete(req, issue_flags, res, cflags);
}
1568
1569 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1570 {
1571         struct io_submit_state *state = &ctx->submit_state;
1572         struct io_comp_state *cs = &state->comp;
1573         struct io_kiocb *req = NULL;
1574
1575         /*
1576          * If we have more than a batch's worth of requests in our IRQ side
1577          * locked cache, grab the lock and move them over to our submission
1578          * side cache.
1579          */
1580         if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
1581                 spin_lock_irq(&ctx->completion_lock);
1582                 list_splice_init(&cs->locked_free_list, &cs->free_list);
1583                 cs->locked_free_nr = 0;
1584                 spin_unlock_irq(&ctx->completion_lock);
1585         }
1586
1587         while (!list_empty(&cs->free_list)) {
1588                 req = list_first_entry(&cs->free_list, struct io_kiocb,
1589                                         compl.list);
1590                 list_del(&req->compl.list);
1591                 state->reqs[state->free_reqs++] = req;
1592                 if (state->free_reqs == ARRAY_SIZE(state->reqs))
1593                         break;
1594         }
1595
1596         return req != NULL;
1597 }
1598
1599 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1600 {
1601         struct io_submit_state *state = &ctx->submit_state;
1602
1603         BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
1604
1605         if (!state->free_reqs) {
1606                 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1607                 int ret;
1608
1609                 if (io_flush_cached_reqs(ctx))
1610                         goto got_req;
1611
1612                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1613                                             state->reqs);
1614
1615                 /*
1616                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1617                  * retry single alloc to be on the safe side.
1618                  */
1619                 if (unlikely(ret <= 0)) {
1620                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1621                         if (!state->reqs[0])
1622                                 return NULL;
1623                         ret = 1;
1624                 }
1625                 state->free_reqs = ret;
1626         }
1627 got_req:
1628         state->free_reqs--;
1629         return state->reqs[state->free_reqs];
1630 }
1631
1632 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1633                           bool fixed)
1634 {
1635         if (!fixed)
1636                 fput(file);
1637 }
1638
1639 static void io_dismantle_req(struct io_kiocb *req)
1640 {
1641         io_clean_op(req);
1642
1643         if (req->async_data)
1644                 kfree(req->async_data);
1645         if (req->file)
1646                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1647         if (req->fixed_rsrc_refs)
1648                 percpu_ref_put(req->fixed_rsrc_refs);
1649
1650         if (req->flags & REQ_F_INFLIGHT) {
1651                 struct io_ring_ctx *ctx = req->ctx;
1652                 struct io_uring_task *tctx = req->task->io_uring;
1653                 unsigned long flags;
1654
1655                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1656                 list_del(&req->inflight_entry);
1657                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1658                 req->flags &= ~REQ_F_INFLIGHT;
1659                 if (atomic_read(&tctx->in_idle))
1660                         wake_up(&tctx->wait);
1661         }
1662 }
1663
1664 static inline void io_put_task(struct task_struct *task, int nr)
1665 {
1666         struct io_uring_task *tctx = task->io_uring;
1667
1668         percpu_counter_sub(&tctx->inflight, nr);
1669         if (unlikely(atomic_read(&tctx->in_idle)))
1670                 wake_up(&tctx->wait);
1671         put_task_struct_many(task, nr);
1672 }
1673
1674 static void __io_free_req(struct io_kiocb *req)
1675 {
1676         struct io_ring_ctx *ctx = req->ctx;
1677
1678         io_dismantle_req(req);
1679         io_put_task(req->task, 1);
1680
1681         kmem_cache_free(req_cachep, req);
1682         percpu_ref_put(&ctx->refs);
1683 }
1684
1685 static inline void io_remove_next_linked(struct io_kiocb *req)
1686 {
1687         struct io_kiocb *nxt = req->link;
1688
1689         req->link = nxt->link;
1690         nxt->link = NULL;
1691 }
1692
1693 static void io_kill_linked_timeout(struct io_kiocb *req)
1694 {
1695         struct io_ring_ctx *ctx = req->ctx;
1696         struct io_kiocb *link;
1697         bool cancelled = false;
1698         unsigned long flags;
1699
1700         spin_lock_irqsave(&ctx->completion_lock, flags);
1701         link = req->link;
1702
1703         /*
1704          * Can happen if a linked timeout fired and link had been like
1705          * req -> link t-out -> link t-out [-> ...]
1706          */
1707         if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1708                 struct io_timeout_data *io = link->async_data;
1709                 int ret;
1710
1711                 io_remove_next_linked(req);
1712                 link->timeout.head = NULL;
1713                 ret = hrtimer_try_to_cancel(&io->timer);
1714                 if (ret != -1) {
1715                         io_cqring_fill_event(link, -ECANCELED);
1716                         io_commit_cqring(ctx);
1717                         cancelled = true;
1718                 }
1719         }
1720         req->flags &= ~REQ_F_LINK_TIMEOUT;
1721         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1722
1723         if (cancelled) {
1724                 io_cqring_ev_posted(ctx);
1725                 io_put_req(link);
1726         }
1727 }
1728
1729
1730 static void io_fail_links(struct io_kiocb *req)
1731 {
1732         struct io_kiocb *link, *nxt;
1733         struct io_ring_ctx *ctx = req->ctx;
1734         unsigned long flags;
1735
1736         spin_lock_irqsave(&ctx->completion_lock, flags);
1737         link = req->link;
1738         req->link = NULL;
1739
1740         while (link) {
1741                 nxt = link->link;
1742                 link->link = NULL;
1743
1744                 trace_io_uring_fail_link(req, link);
1745                 io_cqring_fill_event(link, -ECANCELED);
1746
1747                 io_put_req_deferred(link, 2);
1748                 link = nxt;
1749         }
1750         io_commit_cqring(ctx);
1751         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1752
1753         io_cqring_ev_posted(ctx);
1754 }
1755
1756 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
1757 {
1758         if (req->flags & REQ_F_LINK_TIMEOUT)
1759                 io_kill_linked_timeout(req);
1760
1761         /*
1762          * If LINK is set, we have dependent requests in this chain. If we
1763          * didn't fail this request, queue the first one up, moving any other
1764          * dependencies to the next request. In case of failure, fail the rest
1765          * of the chain.
1766          */
1767         if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
1768                 struct io_kiocb *nxt = req->link;
1769
1770                 req->link = NULL;
1771                 return nxt;
1772         }
1773         io_fail_links(req);
1774         return NULL;
1775 }
1776
1777 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1778 {
1779         if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
1780                 return NULL;
1781         return __io_req_find_next(req);
1782 }
1783
1784 static void ctx_flush_and_put(struct io_ring_ctx *ctx)
1785 {
1786         if (!ctx)
1787                 return;
1788         if (ctx->submit_state.comp.nr) {
1789                 mutex_lock(&ctx->uring_lock);
1790                 io_submit_flush_completions(&ctx->submit_state.comp, ctx);
1791                 mutex_unlock(&ctx->uring_lock);
1792         }
1793         percpu_ref_put(&ctx->refs);
1794 }
1795
1796 static bool __tctx_task_work(struct io_uring_task *tctx)
1797 {
1798         struct io_ring_ctx *ctx = NULL;
1799         struct io_wq_work_list list;
1800         struct io_wq_work_node *node;
1801
1802         if (wq_list_empty(&tctx->task_list))
1803                 return false;
1804
1805         spin_lock_irq(&tctx->task_lock);
1806         list = tctx->task_list;
1807         INIT_WQ_LIST(&tctx->task_list);
1808         spin_unlock_irq(&tctx->task_lock);
1809
1810         node = list.first;
1811         while (node) {
1812                 struct io_wq_work_node *next = node->next;
1813                 struct io_kiocb *req;
1814
1815                 req = container_of(node, struct io_kiocb, io_task_work.node);
1816                 if (req->ctx != ctx) {
1817                         ctx_flush_and_put(ctx);
1818                         ctx = req->ctx;
1819                         percpu_ref_get(&ctx->refs);
1820                 }
1821
1822                 req->task_work.func(&req->task_work);
1823                 node = next;
1824         }
1825
1826         ctx_flush_and_put(ctx);
1827         return list.first != NULL;
1828 }
1829
1830 static void tctx_task_work(struct callback_head *cb)
1831 {
1832         struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
1833
1834         clear_bit(0, &tctx->task_state);
1835
1836         while (__tctx_task_work(tctx))
1837                 cond_resched();
1838 }
1839
1840 static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
1841                             enum task_work_notify_mode notify)
1842 {
1843         struct io_uring_task *tctx = tsk->io_uring;
1844         struct io_wq_work_node *node, *prev;
1845         unsigned long flags;
1846         int ret;
1847
1848         WARN_ON_ONCE(!tctx);
1849
1850         spin_lock_irqsave(&tctx->task_lock, flags);
1851         wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
1852         spin_unlock_irqrestore(&tctx->task_lock, flags);
1853
1854         /* task_work already pending, we're done */
1855         if (test_bit(0, &tctx->task_state) ||
1856             test_and_set_bit(0, &tctx->task_state))
1857                 return 0;
1858
1859         if (!task_work_add(tsk, &tctx->task_work, notify))
1860                 return 0;
1861
1862         /*
1863          * Slow path - we failed, find and delete work. if the work is not
1864          * in the list, it got run and we're fine.
1865          */
1866         ret = 0;
1867         spin_lock_irqsave(&tctx->task_lock, flags);
1868         wq_list_for_each(node, prev, &tctx->task_list) {
1869                 if (&req->io_task_work.node == node) {
1870                         wq_list_del(&tctx->task_list, node, prev);
1871                         ret = 1;
1872                         break;
1873                 }
1874         }
1875         spin_unlock_irqrestore(&tctx->task_lock, flags);
1876         clear_bit(0, &tctx->task_state);
1877         return ret;
1878 }
1879
1880 static int io_req_task_work_add(struct io_kiocb *req)
1881 {
1882         struct task_struct *tsk = req->task;
1883         struct io_ring_ctx *ctx = req->ctx;
1884         enum task_work_notify_mode notify;
1885         int ret;
1886
1887         if (tsk->flags & PF_EXITING)
1888                 return -ESRCH;
1889
1890         /*
1891          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
1892          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
1893          * processing task_work. There's no reliable way to tell if TWA_RESUME
1894          * will do the job.
1895          */
1896         notify = TWA_NONE;
1897         if (!(ctx->flags & IORING_SETUP_SQPOLL))
1898                 notify = TWA_SIGNAL;
1899
1900         ret = io_task_work_add(tsk, req, notify);
1901         if (!ret)
1902                 wake_up_process(tsk);
1903
1904         return ret;
1905 }
1906
1907 static void io_req_task_work_add_fallback(struct io_kiocb *req,
1908                                           task_work_func_t cb)
1909 {
1910         struct io_ring_ctx *ctx = req->ctx;
1911         struct callback_head *head;
1912
1913         init_task_work(&req->task_work, cb);
1914         do {
1915                 head = READ_ONCE(ctx->exit_task_work);
1916                 req->task_work.next = head;
1917         } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
1918 }
1919
1920 static void __io_req_task_cancel(struct io_kiocb *req, int error)
1921 {
1922         struct io_ring_ctx *ctx = req->ctx;
1923
1924         spin_lock_irq(&ctx->completion_lock);
1925         io_cqring_fill_event(req, error);
1926         io_commit_cqring(ctx);
1927         spin_unlock_irq(&ctx->completion_lock);
1928
1929         io_cqring_ev_posted(ctx);
1930         req_set_fail_links(req);
1931         io_double_put_req(req);
1932 }
1933
1934 static void io_req_task_cancel(struct callback_head *cb)
1935 {
1936         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1937         struct io_ring_ctx *ctx = req->ctx;
1938
1939         mutex_lock(&ctx->uring_lock);
1940         __io_req_task_cancel(req, req->result);
1941         mutex_unlock(&ctx->uring_lock);
1942         percpu_ref_put(&ctx->refs);
1943 }
1944
1945 static void __io_req_task_submit(struct io_kiocb *req)
1946 {
1947         struct io_ring_ctx *ctx = req->ctx;
1948
1949         /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
1950         mutex_lock(&ctx->uring_lock);
1951         if (!(current->flags & PF_EXITING) && !current->in_execve)
1952                 __io_queue_sqe(req);
1953         else
1954                 __io_req_task_cancel(req, -EFAULT);
1955         mutex_unlock(&ctx->uring_lock);
1956 }
1957
1958 static void io_req_task_submit(struct callback_head *cb)
1959 {
1960         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1961
1962         __io_req_task_submit(req);
1963 }
1964
1965 static void io_req_task_queue(struct io_kiocb *req)
1966 {
1967         int ret;
1968
1969         req->task_work.func = io_req_task_submit;
1970         ret = io_req_task_work_add(req);
1971         if (unlikely(ret)) {
1972                 req->result = -ECANCELED;
1973                 percpu_ref_get(&req->ctx->refs);
1974                 io_req_task_work_add_fallback(req, io_req_task_cancel);
1975         }
1976 }
1977
1978 static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
1979 {
1980         percpu_ref_get(&req->ctx->refs);
1981         req->result = ret;
1982         req->task_work.func = io_req_task_cancel;
1983
1984         if (unlikely(io_req_task_work_add(req)))
1985                 io_req_task_work_add_fallback(req, io_req_task_cancel);
1986 }
1987
/* Queue the request linked behind @req, if any, for submission. */
static inline void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *next_req = io_req_find_next(req);

	if (!next_req)
		return;
	io_req_task_queue(next_req);
}
1995
/* Free @req after its successor in the link chain has been queued. */
static void io_free_req(struct io_kiocb *req)
{
	/* the link has to be picked up before the request is torn down */
	io_queue_next(req);
	__io_free_req(req);
}
2001
/*
 * Accumulator for freeing requests in bulk: task and ctx references are
 * counted here and put in one go instead of once per request.
 */
struct req_batch {
	struct task_struct	*task;		/* task that task_refs belong to */
	int			task_refs;	/* task refs not yet put */
	int			ctx_refs;	/* ctx refs not yet put */
};

/* Reset @rb to the empty state: no task and nothing accumulated. */
static inline void io_init_req_batch(struct req_batch *rb)
{
	rb->task = NULL;
	rb->task_refs = 0;
	rb->ctx_refs = 0;
}
2014
2015 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2016                                      struct req_batch *rb)
2017 {
2018         if (rb->task)
2019                 io_put_task(rb->task, rb->task_refs);
2020         if (rb->ctx_refs)
2021                 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2022 }
2023
2024 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2025                               struct io_submit_state *state)
2026 {
2027         io_queue_next(req);
2028
2029         if (req->task != rb->task) {
2030                 if (rb->task)
2031                         io_put_task(rb->task, rb->task_refs);
2032                 rb->task = req->task;
2033                 rb->task_refs = 0;
2034         }
2035         rb->task_refs++;
2036         rb->ctx_refs++;
2037
2038         io_dismantle_req(req);
2039         if (state->free_reqs != ARRAY_SIZE(state->reqs))
2040                 state->reqs[state->free_reqs++] = req;
2041         else
2042                 list_add(&req->compl.list, &state->comp.free_list);
2043 }
2044
2045 static void io_submit_flush_completions(struct io_comp_state *cs,
2046                                         struct io_ring_ctx *ctx)
2047 {
2048         int i, nr = cs->nr;
2049         struct io_kiocb *req;
2050         struct req_batch rb;
2051
2052         io_init_req_batch(&rb);
2053         spin_lock_irq(&ctx->completion_lock);
2054         for (i = 0; i < nr; i++) {
2055                 req = cs->reqs[i];
2056                 __io_cqring_fill_event(req, req->result, req->compl.cflags);
2057         }
2058         io_commit_cqring(ctx);
2059         spin_unlock_irq(&ctx->completion_lock);
2060
2061         io_cqring_ev_posted(ctx);
2062         for (i = 0; i < nr; i++) {
2063                 req = cs->reqs[i];
2064
2065                 /* submission and completion refs */
2066                 if (refcount_sub_and_test(2, &req->refs))
2067                         io_req_free_batch(&rb, req, &ctx->submit_state);
2068         }
2069
2070         io_req_free_batch_finish(ctx, &rb);
2071         cs->nr = 0;
2072 }
2073
2074 /*
2075  * Drop reference to request, return next in chain (if there is one) if this
2076  * was the last reference to this request.
2077  */
2078 static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2079 {
2080         struct io_kiocb *nxt = NULL;
2081
2082         if (refcount_dec_and_test(&req->refs)) {
2083                 nxt = io_req_find_next(req);
2084                 __io_free_req(req);
2085         }
2086         return nxt;
2087 }
2088
2089 static void io_put_req(struct io_kiocb *req)
2090 {
2091         if (refcount_dec_and_test(&req->refs))
2092                 io_free_req(req);
2093 }
2094
2095 static void io_put_req_deferred_cb(struct callback_head *cb)
2096 {
2097         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2098
2099         io_free_req(req);
2100 }
2101
2102 static void io_free_req_deferred(struct io_kiocb *req)
2103 {
2104         int ret;
2105
2106         req->task_work.func = io_put_req_deferred_cb;
2107         ret = io_req_task_work_add(req);
2108         if (unlikely(ret))
2109                 io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
2110 }
2111
2112 static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2113 {
2114         if (refcount_sub_and_test(refs, &req->refs))
2115                 io_free_req_deferred(req);
2116 }
2117
2118 static void io_double_put_req(struct io_kiocb *req)
2119 {
2120         /* drop both submit and complete references */
2121         if (refcount_sub_and_test(2, &req->refs))
2122                 io_free_req(req);
2123 }
2124
/*
 * Return what __io_cqring_events() reports for @ctx, preceded by a read
 * barrier.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	unsigned pending;

	/* See comment at the top of this file */
	smp_rmb();
	pending = __io_cqring_events(ctx);

	return pending;
}
2131
2132 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2133 {
2134         struct io_rings *rings = ctx->rings;
2135
2136         /* make sure SQ entry isn't read before tail */
2137         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2138 }
2139
2140 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2141 {
2142         unsigned int cflags;
2143
2144         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2145         cflags |= IORING_CQE_F_BUFFER;
2146         req->flags &= ~REQ_F_BUFFER_SELECTED;
2147         kfree(kbuf);
2148         return cflags;
2149 }
2150
2151 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2152 {
2153         struct io_buffer *kbuf;
2154
2155         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2156         return io_put_kbuf(req, kbuf);
2157 }
2158
2159 static inline bool io_run_task_work(void)
2160 {
2161         /*
2162          * Not safe to run on exiting task, and the task_work handling will
2163          * not add work to such a task.
2164          */
2165         if (unlikely(current->flags & PF_EXITING))
2166                 return false;
2167         if (current->task_works) {
2168                 __set_current_state(TASK_RUNNING);
2169                 task_work_run();
2170                 return true;
2171         }
2172
2173         return false;
2174 }
2175
2176 /*
2177  * Find and free completed poll iocbs
2178  */
2179 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2180                                struct list_head *done)
2181 {
2182         struct req_batch rb;
2183         struct io_kiocb *req;
2184
2185         /* order with ->result store in io_complete_rw_iopoll() */
2186         smp_rmb();
2187
2188         io_init_req_batch(&rb);
2189         while (!list_empty(done)) {
2190                 int cflags = 0;
2191
2192                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
2193                 list_del(&req->inflight_entry);
2194
2195                 if (READ_ONCE(req->result) == -EAGAIN) {
2196                         req->iopoll_completed = 0;
2197                         if (io_rw_reissue(req))
2198                                 continue;
2199                 }
2200
2201                 if (req->flags & REQ_F_BUFFER_SELECTED)
2202                         cflags = io_put_rw_kbuf(req);
2203
2204                 __io_cqring_fill_event(req, req->result, cflags);
2205                 (*nr_events)++;
2206
2207                 if (refcount_dec_and_test(&req->refs))
2208                         io_req_free_batch(&rb, req, &ctx->submit_state);
2209         }
2210
2211         io_commit_cqring(ctx);
2212         io_cqring_ev_posted_iopoll(ctx);
2213         io_req_free_batch_finish(ctx, &rb);
2214 }
2215
2216 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2217                         long min)
2218 {
2219         struct io_kiocb *req, *tmp;
2220         LIST_HEAD(done);
2221         bool spin;
2222         int ret;
2223
2224         /*
2225          * Only spin for completions if we don't have multiple devices hanging
2226          * off our complete list, and we're under the requested amount.
2227          */
2228         spin = !ctx->poll_multi_file && *nr_events < min;
2229
2230         ret = 0;
2231         list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2232                 struct kiocb *kiocb = &req->rw.kiocb;
2233
2234                 /*
2235                  * Move completed and retryable entries to our local lists.
2236                  * If we find a request that requires polling, break out
2237                  * and complete those lists first, if we have entries there.
2238                  */
2239                 if (READ_ONCE(req->iopoll_completed)) {
2240                         list_move_tail(&req->inflight_entry, &done);
2241                         continue;
2242                 }
2243                 if (!list_empty(&done))
2244                         break;
2245
2246                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2247                 if (ret < 0)
2248                         break;
2249
2250                 /* iopoll may have completed current req */
2251                 if (READ_ONCE(req->iopoll_completed))
2252                         list_move_tail(&req->inflight_entry, &done);
2253
2254                 if (ret && spin)
2255                         spin = false;
2256                 ret = 0;
2257         }
2258
2259         if (!list_empty(&done))
2260                 io_iopoll_complete(ctx, nr_events, &done);
2261
2262         return ret;
2263 }
2264
2265 /*
2266  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
2267  * non-spinning poll check - we'll still enter the driver poll loop, but only
2268  * as a non-spinning completion check.
2269  */
2270 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2271                                 long min)
2272 {
2273         while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
2274                 int ret;
2275
2276                 ret = io_do_iopoll(ctx, nr_events, min);
2277                 if (ret < 0)
2278                         return ret;
2279                 if (*nr_events >= min)
2280                         return 0;
2281         }
2282
2283         return 1;
2284 }
2285
2286 /*
2287  * We can't just wait for polled events to come to us, we have to actively
2288  * find and complete them.
2289  */
2290 static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2291 {
2292         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2293                 return;
2294
2295         mutex_lock(&ctx->uring_lock);
2296         while (!list_empty(&ctx->iopoll_list)) {
2297                 unsigned int nr_events = 0;
2298
2299                 io_do_iopoll(ctx, &nr_events, 0);
2300
2301                 /* let it sleep and repeat later if can't complete a request */
2302                 if (nr_events == 0)
2303                         break;
2304                 /*
2305                  * Ensure we allow local-to-the-cpu processing to take place,
2306                  * in this case we need to ensure that we reap all events.
2307                  * Also let task_work, etc. to progress by releasing the mutex
2308                  */
2309                 if (need_resched()) {
2310                         mutex_unlock(&ctx->uring_lock);
2311                         cond_resched();
2312                         mutex_lock(&ctx->uring_lock);
2313                 }
2314         }
2315         mutex_unlock(&ctx->uring_lock);
2316 }
2317
2318 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2319 {
2320         unsigned int nr_events = 0;
2321         int iters = 0, ret = 0;
2322
2323         /*
2324          * We disallow the app entering submit/complete with polling, but we
2325          * still need to lock the ring to prevent racing with polled issue
2326          * that got punted to a workqueue.
2327          */
2328         mutex_lock(&ctx->uring_lock);
2329         do {
2330                 /*
2331                  * Don't enter poll loop if we already have events pending.
2332                  * If we do, we can potentially be spinning for commands that
2333                  * already triggered a CQE (eg in error).
2334                  */
2335                 if (test_bit(0, &ctx->cq_check_overflow))
2336                         __io_cqring_overflow_flush(ctx, false, NULL, NULL);
2337                 if (io_cqring_events(ctx))
2338                         break;
2339
2340                 /*
2341                  * If a submit got punted to a workqueue, we can have the
2342                  * application entering polling for a command before it gets
2343                  * issued. That app will hold the uring_lock for the duration
2344                  * of the poll right here, so we need to take a breather every
2345                  * now and then to ensure that the issue has a chance to add
2346                  * the poll to the issued list. Otherwise we can spin here
2347                  * forever, while the workqueue is stuck trying to acquire the
2348                  * very same mutex.
2349                  */
2350                 if (!(++iters & 7)) {
2351                         mutex_unlock(&ctx->uring_lock);
2352                         io_run_task_work();
2353                         mutex_lock(&ctx->uring_lock);
2354                 }
2355
2356                 ret = io_iopoll_getevents(ctx, &nr_events, min);
2357                 if (ret <= 0)
2358                         break;
2359                 ret = 0;
2360         } while (min && !nr_events && !need_resched());
2361
2362         mutex_unlock(&ctx->uring_lock);
2363         return ret;
2364 }
2365
2366 static void kiocb_end_write(struct io_kiocb *req)
2367 {
2368         /*
2369          * Tell lockdep we inherited freeze protection from submission
2370          * thread.
2371          */
2372         if (req->flags & REQ_F_ISREG) {
2373                 struct inode *inode = file_inode(req->file);
2374
2375                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2376         }
2377         file_end_write(req->file);
2378 }
2379
2380 #ifdef CONFIG_BLOCK
2381 static bool io_resubmit_prep(struct io_kiocb *req)
2382 {
2383         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2384         int rw, ret;
2385         struct iov_iter iter;
2386
2387         /* already prepared */
2388         if (req->async_data)
2389                 return true;
2390
2391         switch (req->opcode) {
2392         case IORING_OP_READV:
2393         case IORING_OP_READ_FIXED:
2394         case IORING_OP_READ:
2395                 rw = READ;
2396                 break;
2397         case IORING_OP_WRITEV:
2398         case IORING_OP_WRITE_FIXED:
2399         case IORING_OP_WRITE:
2400                 rw = WRITE;
2401                 break;
2402         default:
2403                 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2404                                 req->opcode);
2405                 return false;
2406         }
2407
2408         ret = io_import_iovec(rw, req, &iovec, &iter, false);
2409         if (ret < 0)
2410                 return false;
2411         return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
2412 }
2413
2414 static bool io_rw_should_reissue(struct io_kiocb *req)
2415 {
2416         umode_t mode = file_inode(req->file)->i_mode;
2417         struct io_ring_ctx *ctx = req->ctx;
2418
2419         if (!S_ISBLK(mode) && !S_ISREG(mode))
2420                 return false;
2421         if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2422             !(ctx->flags & IORING_SETUP_IOPOLL)))
2423                 return false;
2424         /*
2425          * If ref is dying, we might be running poll reap from the exit work.
2426          * Don't attempt to reissue from that path, just let it fail with
2427          * -EAGAIN.
2428          */
2429         if (percpu_ref_is_dying(&ctx->refs))
2430                 return false;
2431         return true;
2432 }
2433 #endif
2434
2435 static bool io_rw_reissue(struct io_kiocb *req)
2436 {
2437 #ifdef CONFIG_BLOCK
2438         if (!io_rw_should_reissue(req))
2439                 return false;
2440
2441         lockdep_assert_held(&req->ctx->uring_lock);
2442
2443         if (io_resubmit_prep(req)) {
2444                 refcount_inc(&req->refs);
2445                 io_queue_async_work(req);
2446                 return true;
2447         }
2448         req_set_fail_links(req);
2449 #endif
2450         return false;
2451 }
2452
2453 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2454                              unsigned int issue_flags)
2455 {
2456         int cflags = 0;
2457
2458         if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
2459                 return;
2460         if (res != req->result)
2461                 req_set_fail_links(req);
2462
2463         if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2464                 kiocb_end_write(req);
2465         if (req->flags & REQ_F_BUFFER_SELECTED)
2466                 cflags = io_put_rw_kbuf(req);
2467         __io_req_complete(req, issue_flags, res, cflags);
2468 }
2469
2470 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2471 {
2472         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2473
2474         __io_complete_rw(req, res, res2, 0);
2475 }
2476
2477 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2478 {
2479         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2480
2481 #ifdef CONFIG_BLOCK
2482         /* Rewind iter, if we have one. iopoll path resubmits as usual */
2483         if (res == -EAGAIN && io_rw_should_reissue(req)) {
2484                 struct io_async_rw *rw = req->async_data;
2485
2486                 if (rw)
2487                         iov_iter_revert(&rw->iter,
2488                                         req->result - iov_iter_count(&rw->iter));
2489                 else if (!io_resubmit_prep(req))
2490                         res = -EIO;
2491         }
2492 #endif
2493
2494         if (kiocb->ki_flags & IOCB_WRITE)
2495                 kiocb_end_write(req);
2496
2497         if (res != -EAGAIN && res != req->result)
2498                 req_set_fail_links(req);
2499
2500         WRITE_ONCE(req->result, res);
2501         /* order with io_poll_complete() checking ->result */
2502         smp_wmb();
2503         WRITE_ONCE(req->iopoll_completed, 1);
2504 }
2505
2506 /*
2507  * After the iocb has been issued, it's safe to be found on the poll list.
2508  * Adding the kiocb to the list AFTER submission ensures that we don't
2509  * find it from a io_iopoll_getevents() thread before the issuer is done
2510  * accessing the kiocb cookie.
2511  */
2512 static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
2513 {
2514         struct io_ring_ctx *ctx = req->ctx;
2515
2516         /*
2517          * Track whether we have multiple files in our lists. This will impact
2518          * how we do polling eventually, not spinning if we're on potentially
2519          * different devices.
2520          */
2521         if (list_empty(&ctx->iopoll_list)) {
2522                 ctx->poll_multi_file = false;
2523         } else if (!ctx->poll_multi_file) {
2524                 struct io_kiocb *list_req;
2525
2526                 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2527                                                 inflight_entry);
2528                 if (list_req->file != req->file)
2529                         ctx->poll_multi_file = true;
2530         }
2531
2532         /*
2533          * For fast devices, IO may have already completed. If it has, add
2534          * it to the front so we find it first.
2535          */
2536         if (READ_ONCE(req->iopoll_completed))
2537                 list_add(&req->inflight_entry, &ctx->iopoll_list);
2538         else
2539                 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2540
2541         /*
2542          * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
2543          * task context or in io worker task context. If current task context is
2544          * sq thread, we don't need to check whether should wake up sq thread.
2545          */
2546         if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
2547             wq_has_sleeper(&ctx->sq_data->wait))
2548                 wake_up(&ctx->sq_data->wait);
2549 }
2550
2551 static inline void io_state_file_put(struct io_submit_state *state)
2552 {
2553         if (state->file_refs) {
2554                 fput_many(state->file, state->file_refs);
2555                 state->file_refs = 0;
2556         }
2557 }
2558
2559 /*
2560  * Get as many references to a file as we have IOs left in this submission,
2561  * assuming most submissions are for one file, or at least that each file
2562  * has more than one submission.
2563  */
2564 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2565 {
2566         if (!state)
2567                 return fget(fd);
2568
2569         if (state->file_refs) {
2570                 if (state->fd == fd) {
2571                         state->file_refs--;
2572                         return state->file;
2573                 }
2574                 io_state_file_put(state);
2575         }
2576         state->file = fget_many(fd, state->ios_left);
2577         if (unlikely(!state->file))
2578                 return NULL;
2579
2580         state->fd = fd;
2581         state->file_refs = state->ios_left - 1;
2582         return state->file;
2583 }
2584
2585 static bool io_bdev_nowait(struct block_device *bdev)
2586 {
2587         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2588 }
2589
2590 /*
2591  * If we tracked the file through the SCM inflight mechanism, we could support
2592  * any file. For now, just ensure that anything potentially problematic is done
2593  * inline.
2594  */
2595 static bool io_file_supports_async(struct file *file, int rw)
2596 {
2597         umode_t mode = file_inode(file)->i_mode;
2598
2599         if (S_ISBLK(mode)) {
2600                 if (IS_ENABLED(CONFIG_BLOCK) &&
2601                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2602                         return true;
2603                 return false;
2604         }
2605         if (S_ISCHR(mode) || S_ISSOCK(mode))
2606                 return true;
2607         if (S_ISREG(mode)) {
2608                 if (IS_ENABLED(CONFIG_BLOCK) &&
2609                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2610                     file->f_op != &io_uring_fops)
2611                         return true;
2612                 return false;
2613         }
2614
2615         /* any ->read/write should understand O_NONBLOCK */
2616         if (file->f_flags & O_NONBLOCK)
2617                 return true;
2618
2619         if (!(file->f_mode & FMODE_NOWAIT))
2620                 return false;
2621
2622         if (rw == READ)
2623                 return file->f_op->read_iter != NULL;
2624
2625         return file->f_op->write_iter != NULL;
2626 }
2627
2628 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2629 {
2630         struct io_ring_ctx *ctx = req->ctx;
2631         struct kiocb *kiocb = &req->rw.kiocb;
2632         struct file *file = req->file;
2633         unsigned ioprio;
2634         int ret;
2635
2636         if (S_ISREG(file_inode(file)->i_mode))
2637                 req->flags |= REQ_F_ISREG;
2638
2639         kiocb->ki_pos = READ_ONCE(sqe->off);
2640         if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
2641                 req->flags |= REQ_F_CUR_POS;
2642                 kiocb->ki_pos = file->f_pos;
2643         }
2644         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2645         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2646         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2647         if (unlikely(ret))
2648                 return ret;
2649
2650         /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
2651         if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
2652                 req->flags |= REQ_F_NOWAIT;
2653
2654         ioprio = READ_ONCE(sqe->ioprio);
2655         if (ioprio) {
2656                 ret = ioprio_check_cap(ioprio);
2657                 if (ret)
2658                         return ret;
2659
2660                 kiocb->ki_ioprio = ioprio;
2661         } else
2662                 kiocb->ki_ioprio = get_current_ioprio();
2663
2664         if (ctx->flags & IORING_SETUP_IOPOLL) {
2665                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2666                     !kiocb->ki_filp->f_op->iopoll)
2667                         return -EOPNOTSUPP;
2668
2669                 kiocb->ki_flags |= IOCB_HIPRI;
2670                 kiocb->ki_complete = io_complete_rw_iopoll;
2671                 req->iopoll_completed = 0;
2672         } else {
2673                 if (kiocb->ki_flags & IOCB_HIPRI)
2674                         return -EINVAL;
2675                 kiocb->ki_complete = io_complete_rw;
2676         }
2677
2678         req->rw.addr = READ_ONCE(sqe->addr);
2679         req->rw.len = READ_ONCE(sqe->len);
2680         req->buf_index = READ_ONCE(sqe->buf_index);
2681         return 0;
2682 }
2683
2684 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2685 {
2686         switch (ret) {
2687         case -EIOCBQUEUED:
2688                 break;
2689         case -ERESTARTSYS:
2690         case -ERESTARTNOINTR:
2691         case -ERESTARTNOHAND:
2692         case -ERESTART_RESTARTBLOCK:
2693                 /*
2694                  * We can't just restart the syscall, since previously
2695                  * submitted sqes may already be in progress. Just fail this
2696                  * IO with EINTR.
2697                  */
2698                 ret = -EINTR;
2699                 fallthrough;
2700         default:
2701                 kiocb->ki_complete(kiocb, ret, 0);
2702         }
2703 }
2704
2705 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2706                        unsigned int issue_flags)
2707 {
2708         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2709         struct io_async_rw *io = req->async_data;
2710
2711         /* add previously done IO, if any */
2712         if (io && io->bytes_done > 0) {
2713                 if (ret < 0)
2714                         ret = io->bytes_done;
2715                 else
2716                         ret += io->bytes_done;
2717         }
2718
2719         if (req->flags & REQ_F_CUR_POS)
2720                 req->file->f_pos = kiocb->ki_pos;
2721         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2722                 __io_complete_rw(req, ret, 0, issue_flags);
2723         else
2724                 io_rw_done(kiocb, ret);
2725 }
2726
/*
 * Set up @iter as a bvec iterator over the part of a registered buffer
 * described by the request.
 *
 * The buffer is selected by req->buf_index; req->rw.addr and req->rw.len
 * give the range to use, which must lie entirely inside the mapped region.
 *
 * Returns 0 on success, -EFAULT if the index is out of range, the range
 * overflows, or it falls outside the mapped buffer.
 */
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
{
	struct io_ring_ctx *ctx = req->ctx;
	size_t len = req->rw.len;
	struct io_mapped_ubuf *imu;
	u16 index, buf_index = req->buf_index;
	size_t offset;
	u64 buf_addr;

	if (unlikely(buf_index >= ctx->nr_user_bufs))
		return -EFAULT;
	/* clamp the index under speculation before using it */
	index = array_index_nospec(buf_index, ctx->nr_user_bufs);
	imu = &ctx->user_bufs[index];
	buf_addr = req->rw.addr;

	/* overflow */
	if (buf_addr + len < buf_addr)
		return -EFAULT;
	/* not inside the mapped region */
	if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or is the whole
		 * first bvec), just use iov_iter_advance(). This makes it
		 * easier since we can just skip the first segment, which may
		 * not be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			/* first vec plus every full page covered by the rest */
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			/* drop everything skipped so that count ends up at len */
			iter->count -= bvec->bv_len + offset;
			/* remaining offset inside the page we landed on */
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}
2793
2794 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2795 {
2796         if (needs_lock)
2797                 mutex_unlock(&ctx->uring_lock);
2798 }
2799
2800 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2801 {
2802         /*
2803          * "Normal" inline submissions always hold the uring_lock, since we
2804          * grab it from the system call. Same is true for the SQPOLL offload.
2805          * The only exception is when we've detached the request and issue it
2806          * from an async worker thread, grab the lock for that case.
2807          */
2808         if (needs_lock)
2809                 mutex_lock(&ctx->uring_lock);
2810 }
2811
2812 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2813                                           int bgid, struct io_buffer *kbuf,
2814                                           bool needs_lock)
2815 {
2816         struct io_buffer *head;
2817
2818         if (req->flags & REQ_F_BUFFER_SELECTED)
2819                 return kbuf;
2820
2821         io_ring_submit_lock(req->ctx, needs_lock);
2822
2823         lockdep_assert_held(&req->ctx->uring_lock);
2824
2825         head = idr_find(&req->ctx->io_buffer_idr, bgid);
2826         if (head) {
2827                 if (!list_empty(&head->list)) {
2828                         kbuf = list_last_entry(&head->list, struct io_buffer,
2829                                                         list);
2830                         list_del(&kbuf->list);
2831                 } else {
2832                         kbuf = head;
2833                         idr_remove(&req->ctx->io_buffer_idr, bgid);
2834                 }
2835                 if (*len > kbuf->len)
2836                         *len = kbuf->len;
2837         } else {
2838                 kbuf = ERR_PTR(-ENOBUFS);
2839         }
2840
2841         io_ring_submit_unlock(req->ctx, needs_lock);
2842
2843         return kbuf;
2844 }
2845
2846 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2847                                         bool needs_lock)
2848 {
2849         struct io_buffer *kbuf;
2850         u16 bgid;
2851
2852         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2853         bgid = req->buf_index;
2854         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2855         if (IS_ERR(kbuf))
2856                 return kbuf;
2857         req->rw.addr = (u64) (unsigned long) kbuf;
2858         req->flags |= REQ_F_BUFFER_SELECTED;
2859         return u64_to_user_ptr(kbuf->addr);
2860 }
2861
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of the single-iovec buffer-select import: reads iov_len
 * from the user's compat_iovec, selects a provided buffer of at most that
 * length and fills in iov[0].
 *
 * Returns 0 on success, -EFAULT if the user iovec can't be read, -EINVAL for
 * a negative length, or the error from buffer selection.
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
				bool needs_lock)
{
	struct compat_iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
	compat_ssize_t clen;
	void __user *buf;
	ssize_t len;

	if (!access_ok(uiov, sizeof(*uiov)) ||
	    __get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	len = clen;
	buf = io_rw_buffer_select(req, &len, needs_lock);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	iov->iov_base = buf;
	iov->iov_len = (compat_size_t) len;
	return 0;
}
#endif
2888
2889 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2890                                       bool needs_lock)
2891 {
2892         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2893         void __user *buf;
2894         ssize_t len;
2895
2896         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2897                 return -EFAULT;
2898
2899         len = iov[0].iov_len;
2900         if (len < 0)
2901                 return -EINVAL;
2902         buf = io_rw_buffer_select(req, &len, needs_lock);
2903         if (IS_ERR(buf))
2904                 return PTR_ERR(buf);
2905         iov[0].iov_base = buf;
2906         iov[0].iov_len = len;
2907         return 0;
2908 }
2909
2910 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2911                                     bool needs_lock)
2912 {
2913         if (req->flags & REQ_F_BUFFER_SELECTED) {
2914                 struct io_buffer *kbuf;
2915
2916                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2917                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2918                 iov[0].iov_len = kbuf->len;
2919                 return 0;
2920         }
2921         if (req->rw.len != 1)
2922                 return -EINVAL;
2923
2924 #ifdef CONFIG_COMPAT
2925         if (req->ctx->compat)
2926                 return io_compat_import(req, iov, needs_lock);
2927 #endif
2928
2929         return __io_iov_buffer_select(req, iov, needs_lock);
2930 }
2931
2932 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
2933                            struct iov_iter *iter, bool needs_lock)
2934 {
2935         void __user *buf = u64_to_user_ptr(req->rw.addr);
2936         size_t sqe_len = req->rw.len;
2937         u8 opcode = req->opcode;
2938         ssize_t ret;
2939
2940         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2941                 *iovec = NULL;
2942                 return io_import_fixed(req, rw, iter);
2943         }
2944
2945         /* buffer index only valid with fixed read/write, or buffer select  */
2946         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2947                 return -EINVAL;
2948
2949         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2950                 if (req->flags & REQ_F_BUFFER_SELECT) {
2951                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2952                         if (IS_ERR(buf))
2953                                 return PTR_ERR(buf);
2954                         req->rw.len = sqe_len;
2955                 }
2956
2957                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2958                 *iovec = NULL;
2959                 return ret;
2960         }
2961
2962         if (req->flags & REQ_F_BUFFER_SELECT) {
2963                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2964                 if (!ret)
2965                         iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
2966                 *iovec = NULL;
2967                 return ret;
2968         }
2969
2970         return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
2971                               req->ctx->compat);
2972 }
2973
2974 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
2975 {
2976         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
2977 }
2978
2979 /*
2980  * For files that don't have ->read_iter() and ->write_iter(), handle them
2981  * by looping over ->read() or ->write() manually.
2982  */
2983 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
2984 {
2985         struct kiocb *kiocb = &req->rw.kiocb;
2986         struct file *file = req->file;
2987         ssize_t ret = 0;
2988
2989         /*
2990          * Don't support polled IO through this interface, and we can't
2991          * support non-blocking either. For the latter, this just causes
2992          * the kiocb to be handled from an async context.
2993          */
2994         if (kiocb->ki_flags & IOCB_HIPRI)
2995                 return -EOPNOTSUPP;
2996         if (kiocb->ki_flags & IOCB_NOWAIT)
2997                 return -EAGAIN;
2998
2999         while (iov_iter_count(iter)) {
3000                 struct iovec iovec;
3001                 ssize_t nr;
3002
3003                 if (!iov_iter_is_bvec(iter)) {
3004                         iovec = iov_iter_iovec(iter);
3005                 } else {
3006                         iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3007                         iovec.iov_len = req->rw.len;
3008                 }
3009
3010                 if (rw == READ) {
3011                         nr = file->f_op->read(file, iovec.iov_base,
3012                                               iovec.iov_len, io_kiocb_ppos(kiocb));
3013                 } else {
3014                         nr = file->f_op->write(file, iovec.iov_base,
3015                                                iovec.iov_len, io_kiocb_ppos(kiocb));
3016                 }
3017
3018                 if (nr < 0) {
3019                         if (!ret)
3020                                 ret = nr;
3021                         break;
3022                 }
3023                 ret += nr;
3024                 if (nr != iovec.iov_len)
3025                         break;
3026                 req->rw.len -= nr;
3027                 req->rw.addr += nr;
3028                 iov_iter_advance(iter, nr);
3029         }
3030
3031         return ret;
3032 }
3033
3034 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3035                           const struct iovec *fast_iov, struct iov_iter *iter)
3036 {
3037         struct io_async_rw *rw = req->async_data;
3038
3039         memcpy(&rw->iter, iter, sizeof(*iter));
3040         rw->free_iovec = iovec;
3041         rw->bytes_done = 0;
3042         /* can only be fixed buffers, no need to do anything */
3043         if (iov_iter_is_bvec(iter))
3044                 return;
3045         if (!iovec) {
3046                 unsigned iov_off = 0;
3047
3048                 rw->iter.iov = rw->fast_iov;
3049                 if (iter->iov != fast_iov) {
3050                         iov_off = iter->iov - fast_iov;
3051                         rw->iter.iov += iov_off;
3052                 }
3053                 if (rw->fast_iov != fast_iov)
3054                         memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3055                                sizeof(struct iovec) * iter->nr_segs);
3056         } else {
3057                 req->flags |= REQ_F_NEED_CLEANUP;
3058         }
3059 }
3060
3061 static inline int __io_alloc_async_data(struct io_kiocb *req)
3062 {
3063         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3064         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3065         return req->async_data == NULL;
3066 }
3067
3068 static int io_alloc_async_data(struct io_kiocb *req)
3069 {
3070         if (!io_op_defs[req->opcode].needs_async_data)
3071                 return 0;
3072
3073         return  __io_alloc_async_data(req);
3074 }
3075
3076 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3077                              const struct iovec *fast_iov,
3078                              struct iov_iter *iter, bool force)
3079 {
3080         if (!force && !io_op_defs[req->opcode].needs_async_data)
3081                 return 0;
3082         if (!req->async_data) {
3083                 if (__io_alloc_async_data(req)) {
3084                         kfree(iovec);
3085                         return -ENOMEM;
3086                 }
3087
3088                 io_req_map_rw(req, iovec, fast_iov, iter);
3089         }
3090         return 0;
3091 }
3092
3093 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3094 {
3095         struct io_async_rw *iorw = req->async_data;
3096         struct iovec *iov = iorw->fast_iov;
3097         int ret;
3098
3099         ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3100         if (unlikely(ret < 0))
3101                 return ret;
3102
3103         iorw->bytes_done = 0;
3104         iorw->free_iovec = iov;
3105         if (iov)
3106                 req->flags |= REQ_F_NEED_CLEANUP;
3107         return 0;
3108 }
3109
3110 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3111 {
3112         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3113                 return -EBADF;
3114         return io_prep_rw(req, sqe);
3115 }
3116
3117 /*
3118  * This is our waitqueue callback handler, registered through lock_page_async()
3119  * when we initially tried to do the IO with the iocb armed our waitqueue.
3120  * This gets called when the page is unlocked, and we generally expect that to
3121  * happen when the page IO is completed and the page is now uptodate. This will
3122  * queue a task_work based retry of the operation, attempting to copy the data
3123  * again. If the latter fails because the page was NOT uptodate, then we will
3124  * do a thread based blocking retry of the operation. That's the unexpected
3125  * slow path.
3126  */
3127 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3128                              int sync, void *arg)
3129 {
3130         struct wait_page_queue *wpq;
3131         struct io_kiocb *req = wait->private;
3132         struct wait_page_key *key = arg;
3133
3134         wpq = container_of(wait, struct wait_page_queue, wait);
3135
3136         if (!wake_page_match(wpq, key))
3137                 return 0;
3138
3139         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3140         list_del_init(&wait->entry);
3141
3142         /* submit ref gets dropped, acquire a new one */
3143         refcount_inc(&req->refs);
3144         io_req_task_queue(req);
3145         return 1;
3146 }
3147
3148 /*
3149  * This controls whether a given IO request should be armed for async page
3150  * based retry. If we return false here, the request is handed to the async
3151  * worker threads for retry. If we're doing buffered reads on a regular file,
3152  * we prepare a private wait_page_queue entry and retry the operation. This
3153  * will either succeed because the page is now uptodate and unlocked, or it
3154  * will register a callback when the page is unlocked at IO completion. Through
3155  * that callback, io_uring uses task_work to setup a retry of the operation.
3156  * That retry will attempt the buffered read again. The retry will generally
3157  * succeed, or in rare cases where it fails, we then fall back to using the
3158  * async worker threads for a blocking retry.
3159  */
3160 static bool io_rw_should_retry(struct io_kiocb *req)
3161 {
3162         struct io_async_rw *rw = req->async_data;
3163         struct wait_page_queue *wait = &rw->wpq;
3164         struct kiocb *kiocb = &req->rw.kiocb;
3165
3166         /* never retry for NOWAIT, we just complete with -EAGAIN */
3167         if (req->flags & REQ_F_NOWAIT)
3168                 return false;
3169
3170         /* Only for buffered IO */
3171         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3172                 return false;
3173
3174         /*
3175          * just use poll if we can, and don't attempt if the fs doesn't
3176          * support callback based unlocks
3177          */
3178         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3179                 return false;
3180
3181         wait->wait.func = io_async_buf_func;
3182         wait->wait.private = req;
3183         wait->wait.flags = 0;
3184         INIT_LIST_HEAD(&wait->wait.entry);
3185         kiocb->ki_flags |= IOCB_WAITQ;
3186         kiocb->ki_flags &= ~IOCB_NOWAIT;
3187         kiocb->ki_waitq = wait;
3188         return true;
3189 }
3190
3191 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3192 {
3193         if (req->file->f_op->read_iter)
3194                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3195         else if (req->file->f_op->read)
3196                 return loop_rw_iter(READ, req, iter);
3197         else
3198                 return -EINVAL;
3199 }
3200
/*
 * Issue a read request.
 *
 * Uses the iterator saved in req->async_data if there is one, otherwise
 * imports the buffers onto the stack. After the first attempt the request is
 * either completed through kiocb_done(), left queued (-EIOCBQUEUED), or its
 * state is saved into async data and the read is retried with the page
 * waitqueue callback armed.
 *
 * Returns 0 when the request was completed or queued here, -EAGAIN when the
 * state has been saved and the request should be retried from another
 * context, or another negative error from import, rw_verify_area() or async
 * data setup.
 */
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct iov_iter __iter, *iter = &__iter;
	struct io_async_rw *rw = req->async_data;
	ssize_t io_size, ret, ret2;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (rw) {
		/* state was saved earlier; nothing of ours to free */
		iter = &rw->iter;
		iovec = NULL;
	} else {
		ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
		if (ret < 0)
			return ret;
	}
	io_size = iov_iter_count(iter);
	/* expected size; the iopoll completion compares the result to it */
	req->result = io_size;

	/* Ensure we clear previously set non-block flag */
	if (!force_nonblock)
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	else
		kiocb->ki_flags |= IOCB_NOWAIT;

	/* If the file doesn't support async, just async punt */
	if (force_nonblock && !io_file_supports_async(req->file, READ)) {
		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
		return ret ?: -EAGAIN;
	}

	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	ret = io_iter_do_read(req, iter);

	if (ret == -EIOCBQUEUED) {
		/* rewind the saved iterator by whatever was consumed */
		if (req->async_data)
			iov_iter_revert(iter, io_size - iov_iter_count(iter));
		goto out_free;
	} else if (ret == -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		/* some cases will consume bytes even on error returns */
		iov_iter_revert(iter, io_size - iov_iter_count(iter));
		/* nothing done so far, as far as the retry loop is concerned */
		ret = 0;
	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/* -EAGAIN or a short read on a regular file: save state for retry */
	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
	if (ret2)
		return ret2;

	/* io_setup_async_rw() was handed iovec (or freed it on failure) */
	iovec = NULL;
	rw = req->async_data;
	/* now use our persistent iterator, if we aren't already */
	iter = &rw->iter;

	do {
		/* account for the bytes the previous attempt transferred */
		io_size -= ret;
		rw->bytes_done += ret;
		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(req, iter);
		if (ret == -EIOCBQUEUED)
			return 0;
		/* we got some bytes, but not all. retry. */
	} while (ret > 0 && ret < io_size);
done:
	kiocb_done(kiocb, ret, issue_flags);
out_free:
	/* it's faster to check here than delegate to kfree */
	if (iovec)
		kfree(iovec);
	return 0;
}
3298
3299 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3300 {
3301         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3302                 return -EBADF;
3303         return io_prep_rw(req, sqe);
3304 }
3305
3306 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3307 {
3308         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3309         struct kiocb *kiocb = &req->rw.kiocb;
3310         struct iov_iter __iter, *iter = &__iter;
3311         struct io_async_rw *rw = req->async_data;
3312         ssize_t ret, ret2, io_size;
3313         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3314
3315         if (rw) {
3316                 iter = &rw->iter;
3317                 iovec = NULL;
3318         } else {
3319                 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3320                 if (ret < 0)
3321                         return ret;
3322         }
3323         io_size = iov_iter_count(iter);
3324         req->result = io_size;
3325
3326         /* Ensure we clear previously set non-block flag */
3327         if (!force_nonblock)
3328                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3329         else
3330                 kiocb->ki_flags |= IOCB_NOWAIT;
3331
3332         /* If the file doesn't support async, just async punt */
3333         if (force_nonblock && !io_file_supports_async(req->file, WRITE))
3334                 goto copy_iov;
3335
3336         /* file path doesn't support NOWAIT for non-direct_IO */
3337         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3338             (req->flags & REQ_F_ISREG))
3339                 goto copy_iov;
3340
3341         ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
3342         if (unlikely(ret))
3343                 goto out_free;
3344
3345         /*
3346          * Open-code file_start_write here to grab freeze protection,
3347          * which will be released by another thread in
3348          * io_complete_rw().  Fool lockdep by telling it the lock got
3349          * released so that it doesn't complain about the held lock when
3350          * we return to userspace.
3351          */
3352         if (req->flags & REQ_F_ISREG) {
3353                 sb_start_write(file_inode(req->file)->i_sb);
3354                 __sb_writers_release(file_inode(req->file)->i_sb,
3355                                         SB_FREEZE_WRITE);
3356         }
3357         kiocb->ki_flags |= IOCB_WRITE;
3358
3359         if (req->file->f_op->write_iter)
3360                 ret2 = call_write_iter(req->file, kiocb, iter);
3361         else if (req->file->f_op->write)
3362                 ret2 = loop_rw_iter(WRITE, req, iter);
3363         else
3364                 ret2 = -EINVAL;
3365
3366         /*
3367          * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3368          * retry them without IOCB_NOWAIT.
3369          */
3370         if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3371                 ret2 = -EAGAIN;
3372         /* no retry on NONBLOCK nor RWF_NOWAIT */
3373         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3374                 goto done;
3375         if (ret2 == -EIOCBQUEUED && req->async_data)
3376                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3377         if (!force_nonblock || ret2 != -EAGAIN) {
3378                 /* IOPOLL retry should happen for io-wq threads */
3379                 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3380                         goto copy_iov;
3381 done:
3382                 kiocb_done(kiocb, ret2, issue_flags);
3383         } else {
3384 copy_iov:
3385                 /* some cases will consume bytes even on error returns */
3386                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3387                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3388                 return ret ?: -EAGAIN;
3389         }
3390 out_free:
3391         /* it's reportedly faster than delegating the null check to kfree() */
3392         if (iovec)
3393                 kfree(iovec);
3394         return ret;
3395 }
3396
3397 static int io_renameat_prep(struct io_kiocb *req,
3398                             const struct io_uring_sqe *sqe)
3399 {
3400         struct io_rename *ren = &req->rename;
3401         const char __user *oldf, *newf;
3402
3403         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3404                 return -EBADF;
3405
3406         ren->old_dfd = READ_ONCE(sqe->fd);
3407         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3408         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3409         ren->new_dfd = READ_ONCE(sqe->len);
3410         ren->flags = READ_ONCE(sqe->rename_flags);
3411
3412         ren->oldpath = getname(oldf);
3413         if (IS_ERR(ren->oldpath))
3414                 return PTR_ERR(ren->oldpath);
3415
3416         ren->newpath = getname(newf);
3417         if (IS_ERR(ren->newpath)) {
3418                 putname(ren->oldpath);
3419                 return PTR_ERR(ren->newpath);
3420         }
3421
3422         req->flags |= REQ_F_NEED_CLEANUP;
3423         return 0;
3424 }
3425
3426 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3427 {
3428         struct io_rename *ren = &req->rename;
3429         int ret;
3430
3431         if (issue_flags & IO_URING_F_NONBLOCK)
3432                 return -EAGAIN;
3433
3434         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3435                                 ren->newpath, ren->flags);
3436
3437         req->flags &= ~REQ_F_NEED_CLEANUP;
3438         if (ret < 0)
3439                 req_set_fail_links(req);
3440         io_req_complete(req, ret);
3441         return 0;
3442 }
3443
3444 static int io_unlinkat_prep(struct io_kiocb *req,
3445                             const struct io_uring_sqe *sqe)
3446 {
3447         struct io_unlink *un = &req->unlink;
3448         const char __user *fname;
3449
3450         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3451                 return -EBADF;
3452
3453         un->dfd = READ_ONCE(sqe->fd);
3454
3455         un->flags = READ_ONCE(sqe->unlink_flags);
3456         if (un->flags & ~AT_REMOVEDIR)
3457                 return -EINVAL;
3458
3459         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3460         un->filename = getname(fname);
3461         if (IS_ERR(un->filename))
3462                 return PTR_ERR(un->filename);
3463
3464         req->flags |= REQ_F_NEED_CLEANUP;
3465         return 0;
3466 }
3467
3468 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3469 {
3470         struct io_unlink *un = &req->unlink;
3471         int ret;
3472
3473         if (issue_flags & IO_URING_F_NONBLOCK)
3474                 return -EAGAIN;
3475
3476         if (un->flags & AT_REMOVEDIR)
3477                 ret = do_rmdir(un->dfd, un->filename);
3478         else
3479                 ret = do_unlinkat(un->dfd, un->filename);
3480
3481         req->flags &= ~REQ_F_NEED_CLEANUP;
3482         if (ret < 0)
3483                 req_set_fail_links(req);
3484         io_req_complete(req, ret);
3485         return 0;
3486 }
3487
3488 static int io_shutdown_prep(struct io_kiocb *req,
3489                             const struct io_uring_sqe *sqe)
3490 {
3491 #if defined(CONFIG_NET)
3492         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3493                 return -EINVAL;
3494         if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3495             sqe->buf_index)
3496                 return -EINVAL;
3497
3498         req->shutdown.how = READ_ONCE(sqe->len);
3499         return 0;
3500 #else
3501         return -EOPNOTSUPP;
3502 #endif
3503 }
3504
3505 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3506 {
3507 #if defined(CONFIG_NET)
3508         struct socket *sock;
3509         int ret;
3510
3511         if (issue_flags & IO_URING_F_NONBLOCK)
3512                 return -EAGAIN;
3513
3514         sock = sock_from_file(req->file);
3515         if (unlikely(!sock))
3516                 return -ENOTSOCK;
3517
3518         ret = __sys_shutdown_sock(sock, req->shutdown.how);
3519         if (ret < 0)
3520                 req_set_fail_links(req);
3521         io_req_complete(req, ret);
3522         return 0;
3523 #else
3524         return -EOPNOTSUPP;
3525 #endif
3526 }
3527
3528 static int __io_splice_prep(struct io_kiocb *req,
3529                             const struct io_uring_sqe *sqe)
3530 {
3531         struct io_splice* sp = &req->splice;
3532         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3533
3534         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3535                 return -EINVAL;
3536
3537         sp->file_in = NULL;
3538         sp->len = READ_ONCE(sqe->len);
3539         sp->flags = READ_ONCE(sqe->splice_flags);
3540
3541         if (unlikely(sp->flags & ~valid_flags))
3542                 return -EINVAL;
3543
3544         sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3545                                   (sp->flags & SPLICE_F_FD_IN_FIXED));
3546         if (!sp->file_in)
3547                 return -EBADF;
3548         req->flags |= REQ_F_NEED_CLEANUP;
3549
3550         if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3551                 /*
3552                  * Splice operation will be punted aync, and here need to
3553                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
3554                  */
3555                 req->work.flags |= IO_WQ_WORK_UNBOUND;
3556         }
3557
3558         return 0;
3559 }
3560
3561 static int io_tee_prep(struct io_kiocb *req,
3562                        const struct io_uring_sqe *sqe)
3563 {
3564         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3565                 return -EINVAL;
3566         return __io_splice_prep(req, sqe);
3567 }
3568
3569 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3570 {
3571         struct io_splice *sp = &req->splice;
3572         struct file *in = sp->file_in;
3573         struct file *out = sp->file_out;
3574         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3575         long ret = 0;
3576
3577         if (issue_flags & IO_URING_F_NONBLOCK)
3578                 return -EAGAIN;
3579         if (sp->len)
3580                 ret = do_tee(in, out, sp->len, flags);
3581
3582         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3583         req->flags &= ~REQ_F_NEED_CLEANUP;
3584
3585         if (ret != sp->len)
3586                 req_set_fail_links(req);
3587         io_req_complete(req, ret);
3588         return 0;
3589 }
3590
3591 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3592 {
3593         struct io_splice* sp = &req->splice;
3594
3595         sp->off_in = READ_ONCE(sqe->splice_off_in);
3596         sp->off_out = READ_ONCE(sqe->off);
3597         return __io_splice_prep(req, sqe);
3598 }
3599
3600 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
3601 {
3602         struct io_splice *sp = &req->splice;
3603         struct file *in = sp->file_in;
3604         struct file *out = sp->file_out;
3605         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3606         loff_t *poff_in, *poff_out;
3607         long ret = 0;
3608
3609         if (issue_flags & IO_URING_F_NONBLOCK)
3610                 return -EAGAIN;
3611
3612         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3613         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3614
3615         if (sp->len)
3616                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3617
3618         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3619         req->flags &= ~REQ_F_NEED_CLEANUP;
3620
3621         if (ret != sp->len)
3622                 req_set_fail_links(req);
3623         io_req_complete(req, ret);
3624         return 0;
3625 }
3626
3627 /*
3628  * IORING_OP_NOP just posts a completion event, nothing else.
3629  */
3630 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
3631 {
3632         struct io_ring_ctx *ctx = req->ctx;
3633
3634         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3635                 return -EINVAL;
3636
3637         __io_req_complete(req, issue_flags, 0, 0);
3638         return 0;
3639 }
3640
3641 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3642 {
3643         struct io_ring_ctx *ctx = req->ctx;
3644
3645         if (!req->file)
3646                 return -EBADF;
3647
3648         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3649                 return -EINVAL;
3650         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3651                 return -EINVAL;
3652
3653         req->sync.flags = READ_ONCE(sqe->fsync_flags);
3654         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3655                 return -EINVAL;
3656
3657         req->sync.off = READ_ONCE(sqe->off);
3658         req->sync.len = READ_ONCE(sqe->len);
3659         return 0;
3660 }
3661
3662 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
3663 {
3664         loff_t end = req->sync.off + req->sync.len;
3665         int ret;
3666
3667         /* fsync always requires a blocking context */
3668         if (issue_flags & IO_URING_F_NONBLOCK)
3669                 return -EAGAIN;
3670
3671         ret = vfs_fsync_range(req->file, req->sync.off,
3672                                 end > 0 ? end : LLONG_MAX,
3673                                 req->sync.flags & IORING_FSYNC_DATASYNC);
3674         if (ret < 0)
3675                 req_set_fail_links(req);
3676         io_req_complete(req, ret);
3677         return 0;
3678 }
3679
3680 static int io_fallocate_prep(struct io_kiocb *req,
3681                              const struct io_uring_sqe *sqe)
3682 {
3683         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3684                 return -EINVAL;
3685         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3686                 return -EINVAL;
3687
3688         req->sync.off = READ_ONCE(sqe->off);
3689         req->sync.len = READ_ONCE(sqe->addr);
3690         req->sync.mode = READ_ONCE(sqe->len);
3691         return 0;
3692 }
3693
3694 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
3695 {
3696         int ret;
3697
3698         /* fallocate always requiring blocking context */
3699         if (issue_flags & IO_URING_F_NONBLOCK)
3700                 return -EAGAIN;
3701         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3702                                 req->sync.len);
3703         if (ret < 0)
3704                 req_set_fail_links(req);
3705         io_req_complete(req, ret);
3706         return 0;
3707 }
3708
3709 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3710 {
3711         const char __user *fname;
3712         int ret;
3713
3714         if (unlikely(sqe->ioprio || sqe->buf_index))
3715                 return -EINVAL;
3716         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3717                 return -EBADF;
3718
3719         /* open.how should be already initialised */
3720         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3721                 req->open.how.flags |= O_LARGEFILE;
3722
3723         req->open.dfd = READ_ONCE(sqe->fd);
3724         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3725         req->open.filename = getname(fname);
3726         if (IS_ERR(req->open.filename)) {
3727                 ret = PTR_ERR(req->open.filename);
3728                 req->open.filename = NULL;
3729                 return ret;
3730         }
3731         req->open.nofile = rlimit(RLIMIT_NOFILE);
3732         req->flags |= REQ_F_NEED_CLEANUP;
3733         return 0;
3734 }
3735
3736 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3737 {
3738         u64 flags, mode;
3739
3740         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3741                 return -EINVAL;
3742         mode = READ_ONCE(sqe->len);
3743         flags = READ_ONCE(sqe->open_flags);
3744         req->open.how = build_open_how(flags, mode);
3745         return __io_openat_prep(req, sqe);
3746 }
3747
3748 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3749 {
3750         struct open_how __user *how;
3751         size_t len;
3752         int ret;
3753
3754         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3755                 return -EINVAL;
3756         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3757         len = READ_ONCE(sqe->len);
3758         if (len < OPEN_HOW_SIZE_VER0)
3759                 return -EINVAL;
3760
3761         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3762                                         len);
3763         if (ret)
3764                 return ret;
3765
3766         return __io_openat_prep(req, sqe);
3767 }
3768
3769 static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
3770 {
3771         struct open_flags op;
3772         struct file *file;
3773         bool nonblock_set;
3774         bool resolve_nonblock;
3775         int ret;
3776
3777         ret = build_open_flags(&req->open.how, &op);
3778         if (ret)
3779                 goto err;
3780         nonblock_set = op.open_flag & O_NONBLOCK;
3781         resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
3782         if (issue_flags & IO_URING_F_NONBLOCK) {
3783                 /*
3784                  * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
3785                  * it'll always -EAGAIN
3786                  */
3787                 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
3788                         return -EAGAIN;
3789                 op.lookup_flags |= LOOKUP_CACHED;
3790                 op.open_flag |= O_NONBLOCK;
3791         }
3792
3793         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3794         if (ret < 0)
3795                 goto err;
3796
3797         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3798         /* only retry if RESOLVE_CACHED wasn't already set by application */
3799         if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
3800             file == ERR_PTR(-EAGAIN)) {
3801                 /*
3802                  * We could hang on to this 'fd', but seems like marginal
3803                  * gain for something that is now known to be a slower path.
3804                  * So just put it, and we'll get a new one when we retry.
3805                  */
3806                 put_unused_fd(ret);
3807                 return -EAGAIN;
3808         }
3809
3810         if (IS_ERR(file)) {
3811                 put_unused_fd(ret);
3812                 ret = PTR_ERR(file);
3813         } else {
3814                 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
3815                         file->f_flags &= ~O_NONBLOCK;
3816                 fsnotify_open(file);
3817                 fd_install(ret, file);
3818         }
3819 err:
3820         putname(req->open.filename);
3821         req->flags &= ~REQ_F_NEED_CLEANUP;
3822         if (ret < 0)
3823                 req_set_fail_links(req);
3824         io_req_complete(req, ret);
3825         return 0;
3826 }
3827
3828 static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
3829 {
3830         return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK);
3831 }
3832
3833 static int io_remove_buffers_prep(struct io_kiocb *req,
3834                                   const struct io_uring_sqe *sqe)
3835 {
3836         struct io_provide_buf *p = &req->pbuf;
3837         u64 tmp;
3838
3839         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3840                 return -EINVAL;
3841
3842         tmp = READ_ONCE(sqe->fd);
3843         if (!tmp || tmp > USHRT_MAX)
3844                 return -EINVAL;
3845
3846         memset(p, 0, sizeof(*p));
3847         p->nbufs = tmp;
3848         p->bgid = READ_ONCE(sqe->buf_group);
3849         return 0;
3850 }
3851
3852 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3853                                int bgid, unsigned nbufs)
3854 {
3855         unsigned i = 0;
3856
3857         /* shouldn't happen */
3858         if (!nbufs)
3859                 return 0;
3860
3861         /* the head kbuf is the list itself */
3862         while (!list_empty(&buf->list)) {
3863                 struct io_buffer *nxt;
3864
3865                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3866                 list_del(&nxt->list);
3867                 kfree(nxt);
3868                 if (++i == nbufs)
3869                         return i;
3870         }
3871         i++;
3872         kfree(buf);
3873         idr_remove(&ctx->io_buffer_idr, bgid);
3874
3875         return i;
3876 }
3877
3878 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
3879 {
3880         struct io_provide_buf *p = &req->pbuf;
3881         struct io_ring_ctx *ctx = req->ctx;
3882         struct io_buffer *head;
3883         int ret = 0;
3884         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3885
3886         io_ring_submit_lock(ctx, !force_nonblock);
3887
3888         lockdep_assert_held(&ctx->uring_lock);
3889
3890         ret = -ENOENT;
3891         head = idr_find(&ctx->io_buffer_idr, p->bgid);
3892         if (head)
3893                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3894         if (ret < 0)
3895                 req_set_fail_links(req);
3896
3897         /* need to hold the lock to complete IOPOLL requests */
3898         if (ctx->flags & IORING_SETUP_IOPOLL) {
3899                 __io_req_complete(req, issue_flags, ret, 0);
3900                 io_ring_submit_unlock(ctx, !force_nonblock);
3901         } else {
3902                 io_ring_submit_unlock(ctx, !force_nonblock);
3903                 __io_req_complete(req, issue_flags, ret, 0);
3904         }
3905         return 0;
3906 }
3907
3908 static int io_provide_buffers_prep(struct io_kiocb *req,
3909                                    const struct io_uring_sqe *sqe)
3910 {
3911         struct io_provide_buf *p = &req->pbuf;
3912         u64 tmp;
3913
3914         if (sqe->ioprio || sqe->rw_flags)
3915                 return -EINVAL;
3916
3917         tmp = READ_ONCE(sqe->fd);
3918         if (!tmp || tmp > USHRT_MAX)
3919                 return -E2BIG;
3920         p->nbufs = tmp;
3921         p->addr = READ_ONCE(sqe->addr);
3922         p->len = READ_ONCE(sqe->len);
3923
3924         if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3925                 return -EFAULT;
3926
3927         p->bgid = READ_ONCE(sqe->buf_group);
3928         tmp = READ_ONCE(sqe->off);
3929         if (tmp > USHRT_MAX)
3930                 return -E2BIG;
3931         p->bid = tmp;
3932         return 0;
3933 }
3934
3935 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3936 {
3937         struct io_buffer *buf;
3938         u64 addr = pbuf->addr;
3939         int i, bid = pbuf->bid;
3940
3941         for (i = 0; i < pbuf->nbufs; i++) {
3942                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3943                 if (!buf)
3944                         break;
3945
3946                 buf->addr = addr;
3947                 buf->len = pbuf->len;
3948                 buf->bid = bid;
3949                 addr += pbuf->len;
3950                 bid++;
3951                 if (!*head) {
3952                         INIT_LIST_HEAD(&buf->list);
3953                         *head = buf;
3954                 } else {
3955                         list_add_tail(&buf->list, &(*head)->list);
3956                 }
3957         }
3958
3959         return i ? i : -ENOMEM;
3960 }
3961
3962 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
3963 {
3964         struct io_provide_buf *p = &req->pbuf;
3965         struct io_ring_ctx *ctx = req->ctx;
3966         struct io_buffer *head, *list;
3967         int ret = 0;
3968         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3969
3970         io_ring_submit_lock(ctx, !force_nonblock);
3971
3972         lockdep_assert_held(&ctx->uring_lock);
3973
3974         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3975
3976         ret = io_add_buffers(p, &head);
3977         if (ret < 0)
3978                 goto out;
3979
3980         if (!list) {
3981                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3982                                         GFP_KERNEL);
3983                 if (ret < 0) {
3984                         __io_remove_buffers(ctx, head, p->bgid, -1U);
3985                         goto out;
3986                 }
3987         }
3988 out:
3989         if (ret < 0)
3990                 req_set_fail_links(req);
3991
3992         /* need to hold the lock to complete IOPOLL requests */
3993         if (ctx->flags & IORING_SETUP_IOPOLL) {
3994                 __io_req_complete(req, issue_flags, ret, 0);
3995                 io_ring_submit_unlock(ctx, !force_nonblock);
3996         } else {
3997                 io_ring_submit_unlock(ctx, !force_nonblock);
3998                 __io_req_complete(req, issue_flags, ret, 0);
3999         }
4000         return 0;
4001 }
4002
4003 static int io_epoll_ctl_prep(struct io_kiocb *req,
4004                              const struct io_uring_sqe *sqe)
4005 {
4006 #if defined(CONFIG_EPOLL)
4007         if (sqe->ioprio || sqe->buf_index)
4008                 return -EINVAL;
4009         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4010                 return -EINVAL;
4011
4012         req->epoll.epfd = READ_ONCE(sqe->fd);
4013         req->epoll.op = READ_ONCE(sqe->len);
4014         req->epoll.fd = READ_ONCE(sqe->off);
4015
4016         if (ep_op_has_event(req->epoll.op)) {
4017                 struct epoll_event __user *ev;
4018
4019                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4020                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4021                         return -EFAULT;
4022         }
4023
4024         return 0;
4025 #else
4026         return -EOPNOTSUPP;
4027 #endif
4028 }
4029
4030 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4031 {
4032 #if defined(CONFIG_EPOLL)
4033         struct io_epoll *ie = &req->epoll;
4034         int ret;
4035         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4036
4037         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4038         if (force_nonblock && ret == -EAGAIN)
4039                 return -EAGAIN;
4040
4041         if (ret < 0)
4042                 req_set_fail_links(req);
4043         __io_req_complete(req, issue_flags, ret, 0);
4044         return 0;
4045 #else
4046         return -EOPNOTSUPP;
4047 #endif
4048 }
4049
4050 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4051 {
4052 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4053         if (sqe->ioprio || sqe->buf_index || sqe->off)
4054                 return -EINVAL;
4055         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4056                 return -EINVAL;
4057
4058         req->madvise.addr = READ_ONCE(sqe->addr);
4059         req->madvise.len = READ_ONCE(sqe->len);
4060         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4061         return 0;
4062 #else
4063         return -EOPNOTSUPP;
4064 #endif
4065 }
4066
4067 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4068 {
4069 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4070         struct io_madvise *ma = &req->madvise;
4071         int ret;
4072
4073         if (issue_flags & IO_URING_F_NONBLOCK)
4074                 return -EAGAIN;
4075
4076         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4077         if (ret < 0)
4078                 req_set_fail_links(req);
4079         io_req_complete(req, ret);
4080         return 0;
4081 #else
4082         return -EOPNOTSUPP;
4083 #endif
4084 }
4085
4086 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4087 {
4088         if (sqe->ioprio || sqe->buf_index || sqe->addr)
4089                 return -EINVAL;
4090         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4091                 return -EINVAL;
4092
4093         req->fadvise.offset = READ_ONCE(sqe->off);
4094         req->fadvise.len = READ_ONCE(sqe->len);
4095         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4096         return 0;
4097 }
4098
4099 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4100 {
4101         struct io_fadvise *fa = &req->fadvise;
4102         int ret;
4103
4104         if (issue_flags & IO_URING_F_NONBLOCK) {
4105                 switch (fa->advice) {
4106                 case POSIX_FADV_NORMAL:
4107                 case POSIX_FADV_RANDOM:
4108                 case POSIX_FADV_SEQUENTIAL:
4109                         break;
4110                 default:
4111                         return -EAGAIN;
4112                 }
4113         }
4114
4115         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4116         if (ret < 0)
4117                 req_set_fail_links(req);
4118         io_req_complete(req, ret);
4119         return 0;
4120 }
4121
4122 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4123 {
4124         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4125                 return -EINVAL;
4126         if (sqe->ioprio || sqe->buf_index)
4127                 return -EINVAL;
4128         if (req->flags & REQ_F_FIXED_FILE)
4129                 return -EBADF;
4130
4131         req->statx.dfd = READ_ONCE(sqe->fd);
4132         req->statx.mask = READ_ONCE(sqe->len);
4133         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4134         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4135         req->statx.flags = READ_ONCE(sqe->statx_flags);
4136
4137         return 0;
4138 }
4139
4140 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4141 {
4142         struct io_statx *ctx = &req->statx;
4143         int ret;
4144
4145         if (issue_flags & IO_URING_F_NONBLOCK) {
4146                 /* only need file table for an actual valid fd */
4147                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
4148                         req->flags |= REQ_F_NO_FILE_TABLE;
4149                 return -EAGAIN;
4150         }
4151
4152         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4153                        ctx->buffer);
4154
4155         if (ret < 0)
4156                 req_set_fail_links(req);
4157         io_req_complete(req, ret);
4158         return 0;
4159 }
4160
4161 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4162 {
4163         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4164                 return -EINVAL;
4165         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4166             sqe->rw_flags || sqe->buf_index)
4167                 return -EINVAL;
4168         if (req->flags & REQ_F_FIXED_FILE)
4169                 return -EBADF;
4170
4171         req->close.fd = READ_ONCE(sqe->fd);
4172         return 0;
4173 }
4174
4175 static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4176 {
4177         struct files_struct *files = current->files;
4178         struct io_close *close = &req->close;
4179         struct fdtable *fdt;
4180         struct file *file;
4181         int ret;
4182
4183         file = NULL;
4184         ret = -EBADF;
4185         spin_lock(&files->file_lock);
4186         fdt = files_fdtable(files);
4187         if (close->fd >= fdt->max_fds) {
4188                 spin_unlock(&files->file_lock);
4189                 goto err;
4190         }
4191         file = fdt->fd[close->fd];
4192         if (!file) {
4193                 spin_unlock(&files->file_lock);
4194                 goto err;
4195         }
4196
4197         if (file->f_op == &io_uring_fops) {
4198                 spin_unlock(&files->file_lock);
4199                 file = NULL;
4200                 goto err;
4201         }
4202
4203         /* if the file has a flush method, be safe and punt to async */
4204         if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4205                 spin_unlock(&files->file_lock);
4206                 return -EAGAIN;
4207         }
4208
4209         ret = __close_fd_get_file(close->fd, &file);
4210         spin_unlock(&files->file_lock);
4211         if (ret < 0) {
4212                 if (ret == -ENOENT)
4213                         ret = -EBADF;
4214                 goto err;
4215         }
4216
4217         /* No ->flush() or already async, safely close from here */
4218         ret = filp_close(file, current->files);
4219 err:
4220         if (ret < 0)
4221                 req_set_fail_links(req);
4222         if (file)
4223                 fput(file);
4224         __io_req_complete(req, issue_flags, ret, 0);
4225         return 0;
4226 }
4227
4228 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4229 {
4230         struct io_ring_ctx *ctx = req->ctx;
4231
4232         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4233                 return -EINVAL;
4234         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4235                 return -EINVAL;
4236
4237         req->sync.off = READ_ONCE(sqe->off);
4238         req->sync.len = READ_ONCE(sqe->len);
4239         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4240         return 0;
4241 }
4242
4243 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4244 {
4245         int ret;
4246
4247         /* sync_file_range always requires a blocking context */
4248         if (issue_flags & IO_URING_F_NONBLOCK)
4249                 return -EAGAIN;
4250
4251         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4252                                 req->sync.flags);
4253         if (ret < 0)
4254                 req_set_fail_links(req);
4255         io_req_complete(req, ret);
4256         return 0;
4257 }
4258
4259 #if defined(CONFIG_NET)
4260 static int io_setup_async_msg(struct io_kiocb *req,
4261                               struct io_async_msghdr *kmsg)
4262 {
4263         struct io_async_msghdr *async_msg = req->async_data;
4264
4265         if (async_msg)
4266                 return -EAGAIN;
4267         if (io_alloc_async_data(req)) {
4268                 kfree(kmsg->free_iov);
4269                 return -ENOMEM;
4270         }
4271         async_msg = req->async_data;
4272         req->flags |= REQ_F_NEED_CLEANUP;
4273         memcpy(async_msg, kmsg, sizeof(*kmsg));
4274         async_msg->msg.msg_name = &async_msg->addr;
4275         /* if were using fast_iov, set it to the new one */
4276         if (!async_msg->free_iov)
4277                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4278
4279         return -EAGAIN;
4280 }
4281
4282 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4283                                struct io_async_msghdr *iomsg)
4284 {
4285         iomsg->msg.msg_name = &iomsg->addr;
4286         iomsg->free_iov = iomsg->fast_iov;
4287         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4288                                    req->sr_msg.msg_flags, &iomsg->free_iov);
4289 }
4290
4291 static int io_sendmsg_prep_async(struct io_kiocb *req)
4292 {
4293         int ret;
4294
4295         if (!io_op_defs[req->opcode].needs_async_data)
4296                 return 0;
4297         ret = io_sendmsg_copy_hdr(req, req->async_data);
4298         if (!ret)
4299                 req->flags |= REQ_F_NEED_CLEANUP;
4300         return ret;
4301 }
4302
4303 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4304 {
4305         struct io_sr_msg *sr = &req->sr_msg;
4306
4307         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4308                 return -EINVAL;
4309
4310         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4311         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4312         sr->len = READ_ONCE(sqe->len);
4313
4314 #ifdef CONFIG_COMPAT
4315         if (req->ctx->compat)
4316                 sr->msg_flags |= MSG_CMSG_COMPAT;
4317 #endif
4318         return 0;
4319 }
4320
4321 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4322 {
4323         struct io_async_msghdr iomsg, *kmsg;
4324         struct socket *sock;
4325         unsigned flags;
4326         int ret;
4327
4328         sock = sock_from_file(req->file);
4329         if (unlikely(!sock))
4330                 return -ENOTSOCK;
4331
4332         kmsg = req->async_data;
4333         if (!kmsg) {
4334                 ret = io_sendmsg_copy_hdr(req, &iomsg);
4335                 if (ret)
4336                         return ret;
4337                 kmsg = &iomsg;
4338         }
4339
4340         flags = req->sr_msg.msg_flags;
4341         if (flags & MSG_DONTWAIT)
4342                 req->flags |= REQ_F_NOWAIT;
4343         else if (issue_flags & IO_URING_F_NONBLOCK)
4344                 flags |= MSG_DONTWAIT;
4345
4346         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4347         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4348                 return io_setup_async_msg(req, kmsg);
4349         if (ret == -ERESTARTSYS)
4350                 ret = -EINTR;
4351
4352         /* fast path, check for non-NULL to avoid function call */
4353         if (kmsg->free_iov)
4354                 kfree(kmsg->free_iov);
4355         req->flags &= ~REQ_F_NEED_CLEANUP;
4356         if (ret < 0)
4357                 req_set_fail_links(req);
4358         __io_req_complete(req, issue_flags, ret, 0);
4359         return 0;
4360 }
4361
4362 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4363 {
4364         struct io_sr_msg *sr = &req->sr_msg;
4365         struct msghdr msg;
4366         struct iovec iov;
4367         struct socket *sock;
4368         unsigned flags;
4369         int ret;
4370
4371         sock = sock_from_file(req->file);
4372         if (unlikely(!sock))
4373                 return -ENOTSOCK;
4374
4375         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4376         if (unlikely(ret))
4377                 return ret;
4378
4379         msg.msg_name = NULL;
4380         msg.msg_control = NULL;
4381         msg.msg_controllen = 0;
4382         msg.msg_namelen = 0;
4383
4384         flags = req->sr_msg.msg_flags;
4385         if (flags & MSG_DONTWAIT)
4386                 req->flags |= REQ_F_NOWAIT;
4387         else if (issue_flags & IO_URING_F_NONBLOCK)
4388                 flags |= MSG_DONTWAIT;
4389
4390         msg.msg_flags = flags;
4391         ret = sock_sendmsg(sock, &msg);
4392         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4393                 return -EAGAIN;
4394         if (ret == -ERESTARTSYS)
4395                 ret = -EINTR;
4396
4397         if (ret < 0)
4398                 req_set_fail_links(req);
4399         __io_req_complete(req, issue_flags, ret, 0);
4400         return 0;
4401 }
4402
4403 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4404                                  struct io_async_msghdr *iomsg)
4405 {
4406         struct io_sr_msg *sr = &req->sr_msg;
4407         struct iovec __user *uiov;
4408         size_t iov_len;
4409         int ret;
4410
4411         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4412                                         &iomsg->uaddr, &uiov, &iov_len);
4413         if (ret)
4414                 return ret;
4415
4416         if (req->flags & REQ_F_BUFFER_SELECT) {
4417                 if (iov_len > 1)
4418                         return -EINVAL;
4419                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4420                         return -EFAULT;
4421                 sr->len = iomsg->fast_iov[0].iov_len;
4422                 iomsg->free_iov = NULL;
4423         } else {
4424                 iomsg->free_iov = iomsg->fast_iov;
4425                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4426                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
4427                                      false);
4428                 if (ret > 0)
4429                         ret = 0;
4430         }
4431
4432         return ret;
4433 }
4434
4435 #ifdef CONFIG_COMPAT
4436 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4437                                         struct io_async_msghdr *iomsg)
4438 {
4439         struct compat_msghdr __user *msg_compat;
4440         struct io_sr_msg *sr = &req->sr_msg;
4441         struct compat_iovec __user *uiov;
4442         compat_uptr_t ptr;
4443         compat_size_t len;
4444         int ret;
4445
4446         msg_compat = (struct compat_msghdr __user *) sr->umsg;
4447         ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
4448                                         &ptr, &len);
4449         if (ret)
4450                 return ret;
4451
4452         uiov = compat_ptr(ptr);
4453         if (req->flags & REQ_F_BUFFER_SELECT) {
4454                 compat_ssize_t clen;
4455
4456                 if (len > 1)
4457                         return -EINVAL;
4458                 if (!access_ok(uiov, sizeof(*uiov)))
4459                         return -EFAULT;
4460                 if (__get_user(clen, &uiov->iov_len))
4461                         return -EFAULT;
4462                 if (clen < 0)
4463                         return -EINVAL;
4464                 sr->len = clen;
4465                 iomsg->free_iov = NULL;
4466         } else {
4467                 iomsg->free_iov = iomsg->fast_iov;
4468                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4469                                    UIO_FASTIOV, &iomsg->free_iov,
4470                                    &iomsg->msg.msg_iter, true);
4471                 if (ret < 0)
4472                         return ret;
4473         }
4474
4475         return 0;
4476 }
4477 #endif
4478
4479 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4480                                struct io_async_msghdr *iomsg)
4481 {
4482         iomsg->msg.msg_name = &iomsg->addr;
4483
4484 #ifdef CONFIG_COMPAT
4485         if (req->ctx->compat)
4486                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
4487 #endif
4488
4489         return __io_recvmsg_copy_hdr(req, iomsg);
4490 }
4491
4492 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4493                                                bool needs_lock)
4494 {
4495         struct io_sr_msg *sr = &req->sr_msg;
4496         struct io_buffer *kbuf;
4497
4498         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4499         if (IS_ERR(kbuf))
4500                 return kbuf;
4501
4502         sr->kbuf = kbuf;
4503         req->flags |= REQ_F_BUFFER_SELECTED;
4504         return kbuf;
4505 }
4506
4507 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4508 {
4509         return io_put_kbuf(req, req->sr_msg.kbuf);
4510 }
4511
4512 static int io_recvmsg_prep_async(struct io_kiocb *req)
4513 {
4514         int ret;
4515
4516         if (!io_op_defs[req->opcode].needs_async_data)
4517                 return 0;
4518         ret = io_recvmsg_copy_hdr(req, req->async_data);
4519         if (!ret)
4520                 req->flags |= REQ_F_NEED_CLEANUP;
4521         return ret;
4522 }
4523
4524 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4525 {
4526         struct io_sr_msg *sr = &req->sr_msg;
4527
4528         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4529                 return -EINVAL;
4530
4531         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4532         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4533         sr->len = READ_ONCE(sqe->len);
4534         sr->bgid = READ_ONCE(sqe->buf_group);
4535
4536 #ifdef CONFIG_COMPAT
4537         if (req->ctx->compat)
4538                 sr->msg_flags |= MSG_CMSG_COMPAT;
4539 #endif
4540         return 0;
4541 }
4542
4543 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4544 {
4545         struct io_async_msghdr iomsg, *kmsg;
4546         struct socket *sock;
4547         struct io_buffer *kbuf;
4548         unsigned flags;
4549         int ret, cflags = 0;
4550         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4551
4552         sock = sock_from_file(req->file);
4553         if (unlikely(!sock))
4554                 return -ENOTSOCK;
4555
4556         kmsg = req->async_data;
4557         if (!kmsg) {
4558                 ret = io_recvmsg_copy_hdr(req, &iomsg);
4559                 if (ret)
4560                         return ret;
4561                 kmsg = &iomsg;
4562         }
4563
4564         if (req->flags & REQ_F_BUFFER_SELECT) {
4565                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4566                 if (IS_ERR(kbuf))
4567                         return PTR_ERR(kbuf);
4568                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4569                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4570                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4571                                 1, req->sr_msg.len);
4572         }
4573
4574         flags = req->sr_msg.msg_flags;
4575         if (flags & MSG_DONTWAIT)
4576                 req->flags |= REQ_F_NOWAIT;
4577         else if (force_nonblock)
4578                 flags |= MSG_DONTWAIT;
4579
4580         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4581                                         kmsg->uaddr, flags);
4582         if (force_nonblock && ret == -EAGAIN)
4583                 return io_setup_async_msg(req, kmsg);
4584         if (ret == -ERESTARTSYS)
4585                 ret = -EINTR;
4586
4587         if (req->flags & REQ_F_BUFFER_SELECTED)
4588                 cflags = io_put_recv_kbuf(req);
4589         /* fast path, check for non-NULL to avoid function call */
4590         if (kmsg->free_iov)
4591                 kfree(kmsg->free_iov);
4592         req->flags &= ~REQ_F_NEED_CLEANUP;
4593         if (ret < 0)
4594                 req_set_fail_links(req);
4595         __io_req_complete(req, issue_flags, ret, cflags);
4596         return 0;
4597 }
4598
4599 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
4600 {
4601         struct io_buffer *kbuf;
4602         struct io_sr_msg *sr = &req->sr_msg;
4603         struct msghdr msg;
4604         void __user *buf = sr->buf;
4605         struct socket *sock;
4606         struct iovec iov;
4607         unsigned flags;
4608         int ret, cflags = 0;
4609         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4610
4611         sock = sock_from_file(req->file);
4612         if (unlikely(!sock))
4613                 return -ENOTSOCK;
4614
4615         if (req->flags & REQ_F_BUFFER_SELECT) {
4616                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4617                 if (IS_ERR(kbuf))
4618                         return PTR_ERR(kbuf);
4619                 buf = u64_to_user_ptr(kbuf->addr);
4620         }
4621
4622         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4623         if (unlikely(ret))
4624                 goto out_free;
4625
4626         msg.msg_name = NULL;
4627         msg.msg_control = NULL;
4628         msg.msg_controllen = 0;
4629         msg.msg_namelen = 0;
4630         msg.msg_iocb = NULL;
4631         msg.msg_flags = 0;
4632
4633         flags = req->sr_msg.msg_flags;
4634         if (flags & MSG_DONTWAIT)
4635                 req->flags |= REQ_F_NOWAIT;
4636         else if (force_nonblock)
4637                 flags |= MSG_DONTWAIT;
4638
4639         ret = sock_recvmsg(sock, &msg, flags);
4640         if (force_nonblock && ret == -EAGAIN)
4641                 return -EAGAIN;
4642         if (ret == -ERESTARTSYS)
4643                 ret = -EINTR;
4644 out_free:
4645         if (req->flags & REQ_F_BUFFER_SELECTED)
4646                 cflags = io_put_recv_kbuf(req);
4647         if (ret < 0)
4648                 req_set_fail_links(req);
4649         __io_req_complete(req, issue_flags, ret, cflags);
4650         return 0;
4651 }
4652
4653 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4654 {
4655         struct io_accept *accept = &req->accept;
4656
4657         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4658                 return -EINVAL;
4659         if (sqe->ioprio || sqe->len || sqe->buf_index)
4660                 return -EINVAL;
4661
4662         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4663         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4664         accept->flags = READ_ONCE(sqe->accept_flags);
4665         accept->nofile = rlimit(RLIMIT_NOFILE);
4666         return 0;
4667 }
4668
4669 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
4670 {
4671         struct io_accept *accept = &req->accept;
4672         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4673         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4674         int ret;
4675
4676         if (req->file->f_flags & O_NONBLOCK)
4677                 req->flags |= REQ_F_NOWAIT;
4678
4679         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4680                                         accept->addr_len, accept->flags,
4681                                         accept->nofile);
4682         if (ret == -EAGAIN && force_nonblock)
4683                 return -EAGAIN;
4684         if (ret < 0) {
4685                 if (ret == -ERESTARTSYS)
4686                         ret = -EINTR;
4687                 req_set_fail_links(req);
4688         }
4689         __io_req_complete(req, issue_flags, ret, 0);
4690         return 0;
4691 }
4692
4693 static int io_connect_prep_async(struct io_kiocb *req)
4694 {
4695         struct io_async_connect *io = req->async_data;
4696         struct io_connect *conn = &req->connect;
4697
4698         return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4699 }
4700
4701 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4702 {
4703         struct io_connect *conn = &req->connect;
4704
4705         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4706                 return -EINVAL;
4707         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4708                 return -EINVAL;
4709
4710         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4711         conn->addr_len =  READ_ONCE(sqe->addr2);
4712         return 0;
4713 }
4714
4715 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
4716 {
4717         struct io_async_connect __io, *io;
4718         unsigned file_flags;
4719         int ret;
4720         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4721
4722         if (req->async_data) {
4723                 io = req->async_data;
4724         } else {
4725                 ret = move_addr_to_kernel(req->connect.addr,
4726                                                 req->connect.addr_len,
4727                                                 &__io.address);
4728                 if (ret)
4729                         goto out;
4730                 io = &__io;
4731         }
4732
4733         file_flags = force_nonblock ? O_NONBLOCK : 0;
4734
4735         ret = __sys_connect_file(req->file, &io->address,
4736                                         req->connect.addr_len, file_flags);
4737         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4738                 if (req->async_data)
4739                         return -EAGAIN;
4740                 if (io_alloc_async_data(req)) {
4741                         ret = -ENOMEM;
4742                         goto out;
4743                 }
4744                 io = req->async_data;
4745                 memcpy(req->async_data, &__io, sizeof(__io));
4746                 return -EAGAIN;
4747         }
4748         if (ret == -ERESTARTSYS)
4749                 ret = -EINTR;
4750 out:
4751         if (ret < 0)
4752                 req_set_fail_links(req);
4753         __io_req_complete(req, issue_flags, ret, 0);
4754         return 0;
4755 }
4756 #else /* !CONFIG_NET */
/*
 * Stubs used when CONFIG_NET is not set: every networking handler simply
 * returns -EOPNOTSUPP.
 *
 * IO_NETOP_FN(op) defines the issue handler io_<op>().
 */
#define IO_NETOP_FN(op)                                                 \
static int io_##op(struct io_kiocb *req, unsigned int issue_flags)      \
{                                                                       \
        return -EOPNOTSUPP;                                             \
}

/* IO_NETOP_PREP(op): IO_NETOP_FN(op) plus the io_<op>_prep() handler. */
#define IO_NETOP_PREP(op)                                               \
IO_NETOP_FN(op)                                                         \
static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
{                                                                       \
        return -EOPNOTSUPP;                                             \
}                                                                       \

/* IO_NETOP_PREP_ASYNC(op): IO_NETOP_PREP(op) plus io_<op>_prep_async(). */
#define IO_NETOP_PREP_ASYNC(op)                                         \
IO_NETOP_PREP(op)                                                       \
static int io_##op##_prep_async(struct io_kiocb *req)                   \
{                                                                       \
        return -EOPNOTSUPP;                                             \
}

/* Each op gets the same set of handlers its CONFIG_NET version defines. */
IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
4783 #endif /* CONFIG_NET */
4784
/*
 * Poll table handed to vfs_poll() when arming a poll handler. The queue
 * callback recovers this structure from the embedded poll_table_struct via
 * container_of().
 */
struct io_poll_table {
        /* embedded poll table passed to vfs_poll() */
        struct poll_table_struct pt;
        /* request the poll is being armed for */
        struct io_kiocb *req;
        /* result of queueing: 0 on success, negative errno otherwise */
        int error;
};
4790
4791 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4792                            __poll_t mask, task_work_func_t func)
4793 {
4794         int ret;
4795
4796         /* for instances that support it check for an event match first: */
4797         if (mask && !(mask & poll->events))
4798                 return 0;
4799
4800         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4801
4802         list_del_init(&poll->wait.entry);
4803
4804         req->result = mask;
4805         req->task_work.func = func;
4806         percpu_ref_get(&req->ctx->refs);
4807
4808         /*
4809          * If this fails, then the task is exiting. When a task exits, the
4810          * work gets canceled, so just cancel this request as well instead
4811          * of executing it. We can't safely execute it anyway, as we may not
4812          * have the needed state needed for it anyway.
4813          */
4814         ret = io_req_task_work_add(req);
4815         if (unlikely(ret)) {
4816                 WRITE_ONCE(poll->canceled, true);
4817                 io_req_task_work_add_fallback(req, func);
4818         }
4819         return 1;
4820 }
4821
4822 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4823         __acquires(&req->ctx->completion_lock)
4824 {
4825         struct io_ring_ctx *ctx = req->ctx;
4826
4827         if (!req->result && !READ_ONCE(poll->canceled)) {
4828                 struct poll_table_struct pt = { ._key = poll->events };
4829
4830                 req->result = vfs_poll(req->file, &pt) & poll->events;
4831         }
4832
4833         spin_lock_irq(&ctx->completion_lock);
4834         if (!req->result && !READ_ONCE(poll->canceled)) {
4835                 add_wait_queue(poll->head, &poll->wait);
4836                 return true;
4837         }
4838
4839         return false;
4840 }
4841
4842 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
4843 {
4844         /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
4845         if (req->opcode == IORING_OP_POLL_ADD)
4846                 return req->async_data;
4847         return req->apoll->double_poll;
4848 }
4849
4850 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4851 {
4852         if (req->opcode == IORING_OP_POLL_ADD)
4853                 return &req->poll;
4854         return &req->apoll->poll;
4855 }
4856
4857 static void io_poll_remove_double(struct io_kiocb *req)
4858 {
4859         struct io_poll_iocb *poll = io_poll_get_double(req);
4860
4861         lockdep_assert_held(&req->ctx->completion_lock);
4862
4863         if (poll && poll->head) {
4864                 struct wait_queue_head *head = poll->head;
4865
4866                 spin_lock(&head->lock);
4867                 list_del_init(&poll->wait.entry);
4868                 if (poll->wait.private)
4869                         refcount_dec(&req->refs);
4870                 poll->head = NULL;
4871                 spin_unlock(&head->lock);
4872         }
4873 }
4874
4875 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4876 {
4877         struct io_ring_ctx *ctx = req->ctx;
4878
4879         io_poll_remove_double(req);
4880         req->poll.done = true;
4881         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4882         io_commit_cqring(ctx);
4883 }
4884
4885 static void io_poll_task_func(struct callback_head *cb)
4886 {
4887         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4888         struct io_ring_ctx *ctx = req->ctx;
4889         struct io_kiocb *nxt;
4890
4891         if (io_poll_rewait(req, &req->poll)) {
4892                 spin_unlock_irq(&ctx->completion_lock);
4893         } else {
4894                 hash_del(&req->hash_node);
4895                 io_poll_complete(req, req->result, 0);
4896                 spin_unlock_irq(&ctx->completion_lock);
4897
4898                 nxt = io_put_req_find_next(req);
4899                 io_cqring_ev_posted(ctx);
4900                 if (nxt)
4901                         __io_req_task_submit(nxt);
4902         }
4903
4904         percpu_ref_put(&ctx->refs);
4905 }
4906
/*
 * Wake function of the secondary ("double") poll entry.
 *
 * wait->private holds the request. Returns 0 when the wakeup key carries a
 * non-zero mask that matches none of the primary entry's events. Otherwise
 * this entry is removed from its waitqueue, the primary entry is removed
 * too if it is still queued and its own wake function is called, the
 * request reference is dropped, and 1 is returned.
 */
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
                               int sync, void *key)
{
        struct io_kiocb *req = wait->private;
        struct io_poll_iocb *poll = io_poll_get_single(req);
        __poll_t mask = key_to_poll(key);

        /* for instances that support it check for an event match first: */
        if (mask && !(mask & poll->events))
                return 0;

        list_del_init(&wait->entry);

        if (poll && poll->head) {
                bool done;

                /* the primary entry's list state is checked under its head lock */
                spin_lock(&poll->head->lock);
                done = list_empty(&poll->wait.entry);
                if (!done)
                        list_del_init(&poll->wait.entry);
                /* make sure double remove sees this as being gone */
                wait->private = NULL;
                spin_unlock(&poll->head->lock);
                if (!done) {
                        /* use wait func handler, so it matches the rq type */
                        poll->wait.func(&poll->wait, mode, sync, key);
                }
        }
        /* drop a request reference; wait->private was cleared above if we got there */
        refcount_dec(&req->refs);
        return 1;
}
4938
4939 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4940                               wait_queue_func_t wake_func)
4941 {
4942         poll->head = NULL;
4943         poll->done = false;
4944         poll->canceled = false;
4945         poll->events = events;
4946         INIT_LIST_HEAD(&poll->wait.entry);
4947         init_waitqueue_func_entry(&poll->wait, wake_func);
4948 }
4949
/*
 * Queue callback helper: add @poll to waitqueue @head on behalf of pt->req.
 *
 * If @poll is already queued on a different head, a second io_poll_iocb is
 * allocated, stored through @poll_ptr and queued instead. A third head
 * fails with -EINVAL and an allocation failure with -ENOMEM, both reported
 * in pt->error. A repeated add on the same head is ignored. pt->error is
 * set to 0 when an entry is queued.
 */
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                            struct wait_queue_head *head,
                            struct io_poll_iocb **poll_ptr)
{
        struct io_kiocb *req = pt->req;

        /*
         * If poll->head is already set, it's because the file being polled
         * uses multiple waitqueues for poll handling (eg one for read, one
         * for write). Setup a separate io_poll_iocb if this happens.
         */
        if (unlikely(poll->head)) {
                struct io_poll_iocb *poll_one = poll;

                /* already have a 2nd entry, fail a third attempt */
                if (*poll_ptr) {
                        pt->error = -EINVAL;
                        return;
                }
                /* double add on the same waitqueue head, ignore */
                if (poll->head == head)
                        return;
                poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
                if (!poll) {
                        pt->error = -ENOMEM;
                        return;
                }
                io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
                /* the second entry takes its own reference on the request */
                refcount_inc(&req->refs);
                poll->wait.private = req;
                *poll_ptr = poll;
        }

        pt->error = 0;
        poll->head = head;

        /* queue as an exclusive waiter when EPOLLEXCLUSIVE was requested */
        if (poll->events & EPOLLEXCLUSIVE)
                add_wait_queue_exclusive(head, &poll->wait);
        else
                add_wait_queue(head, &poll->wait);
}
4991
4992 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4993                                struct poll_table_struct *p)
4994 {
4995         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4996         struct async_poll *apoll = pt->req->apoll;
4997
4998         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
4999 }
5000
/*
 * Task work handler for an internally armed (async) poll.
 *
 * If io_poll_rewait() re-armed the poll, only the lock and the ctx
 * reference are dropped. Otherwise the request is unhashed, its secondary
 * poll entry is removed, and the request is submitted, or canceled with
 * -ECANCELED if the poll was canceled. The apoll data is freed at the end.
 */
static void io_async_task_func(struct callback_head *cb)
{
        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
        struct async_poll *apoll = req->apoll;
        struct io_ring_ctx *ctx = req->ctx;

        trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);

        /* io_poll_rewait() returns with completion_lock held */
        if (io_poll_rewait(req, &apoll->poll)) {
                spin_unlock_irq(&ctx->completion_lock);
                percpu_ref_put(&ctx->refs);
                return;
        }

        /* If req is still hashed, it cannot have been canceled. Don't check. */
        if (hash_hashed(&req->hash_node))
                hash_del(&req->hash_node);

        io_poll_remove_double(req);
        spin_unlock_irq(&ctx->completion_lock);

        if (!READ_ONCE(apoll->poll.canceled))
                __io_req_task_submit(req);
        else
                __io_req_task_cancel(req, -ECANCELED);

        percpu_ref_put(&ctx->refs);
        /* apoll was read from req up front; free it and its second entry */
        kfree(apoll->double_poll);
        kfree(apoll);
}
5031
5032 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5033                         void *key)
5034 {
5035         struct io_kiocb *req = wait->private;
5036         struct io_poll_iocb *poll = &req->apoll->poll;
5037
5038         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5039                                         key_to_poll(key));
5040
5041         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5042 }
5043
5044 static void io_poll_req_insert(struct io_kiocb *req)
5045 {
5046         struct io_ring_ctx *ctx = req->ctx;
5047         struct hlist_head *list;
5048
5049         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5050         hlist_add_head(&req->hash_node, list);
5051 }
5052
/*
 * Arm @poll for @req: initialise it for @mask with @wake_func, poll the
 * file once through @ipt, and return the events that are already pending.
 *
 * ctx->completion_lock is taken here and is still held on return. If
 * nothing is pending and no error was recorded, the request is inserted
 * into the cancel hash so that it can be found while it waits.
 */
static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
                                      struct io_poll_iocb *poll,
                                      struct io_poll_table *ipt, __poll_t mask,
                                      wait_queue_func_t wake_func)
        __acquires(&ctx->completion_lock)
{
        struct io_ring_ctx *ctx = req->ctx;
        bool cancel = false;

        INIT_HLIST_NODE(&req->hash_node);
        io_init_poll_iocb(poll, mask, wake_func);
        poll->file = req->file;
        poll->wait.private = req;

        ipt->pt._key = mask;
        ipt->req = req;
        /* stays -EINVAL unless the queue callback runs and sets it */
        ipt->error = -EINVAL;

        mask = vfs_poll(req->file, &ipt->pt) & poll->events;

        spin_lock_irq(&ctx->completion_lock);
        if (likely(poll->head)) {
                spin_lock(&poll->head->lock);
                /* entry already off the waitqueue: a wakeup has removed it */
                if (unlikely(list_empty(&poll->wait.entry))) {
                        if (ipt->error)
                                cancel = true;
                        ipt->error = 0;
                        mask = 0;
                }
                if (mask || ipt->error)
                        list_del_init(&poll->wait.entry);
                else if (cancel)
                        WRITE_ONCE(poll->canceled, true);
                else if (!poll->done) /* actually waiting for an event */
                        io_poll_req_insert(req);
                spin_unlock(&poll->head->lock);
        }

        return mask;
}
5093
5094 static bool io_arm_poll_handler(struct io_kiocb *req)
5095 {
5096         const struct io_op_def *def = &io_op_defs[req->opcode];
5097         struct io_ring_ctx *ctx = req->ctx;
5098         struct async_poll *apoll;
5099         struct io_poll_table ipt;
5100         __poll_t mask, ret;
5101         int rw;
5102
5103         if (!req->file || !file_can_poll(req->file))
5104                 return false;
5105         if (req->flags & REQ_F_POLLED)
5106                 return false;
5107         if (def->pollin)
5108                 rw = READ;
5109         else if (def->pollout)
5110                 rw = WRITE;
5111         else
5112                 return false;
5113         /* if we can't nonblock try, then no point in arming a poll handler */
5114         if (!io_file_supports_async(req->file, rw))
5115                 return false;
5116
5117         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5118         if (unlikely(!apoll))
5119                 return false;
5120         apoll->double_poll = NULL;
5121
5122         req->flags |= REQ_F_POLLED;
5123         req->apoll = apoll;
5124
5125         mask = 0;
5126         if (def->pollin)
5127                 mask |= POLLIN | POLLRDNORM;
5128         if (def->pollout)
5129                 mask |= POLLOUT | POLLWRNORM;
5130
5131         /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5132         if ((req->opcode == IORING_OP_RECVMSG) &&
5133             (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5134                 mask &= ~POLLIN;
5135
5136         mask |= POLLERR | POLLPRI;
5137
5138         ipt.pt._qproc = io_async_queue_proc;
5139
5140         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5141                                         io_async_wake);
5142         if (ret || ipt.error) {
5143                 io_poll_remove_double(req);
5144                 spin_unlock_irq(&ctx->completion_lock);
5145                 kfree(apoll->double_poll);
5146                 kfree(apoll);
5147                 return false;
5148         }
5149         spin_unlock_irq(&ctx->completion_lock);
5150         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5151                                         apoll->poll.events);
5152         return true;
5153 }
5154
5155 static bool __io_poll_remove_one(struct io_kiocb *req,
5156                                  struct io_poll_iocb *poll)
5157 {
5158         bool do_complete = false;
5159
5160         spin_lock(&poll->head->lock);
5161         WRITE_ONCE(poll->canceled, true);
5162         if (!list_empty(&poll->wait.entry)) {
5163                 list_del_init(&poll->wait.entry);
5164                 do_complete = true;
5165         }
5166         spin_unlock(&poll->head->lock);
5167         hash_del(&req->hash_node);
5168         return do_complete;
5169 }
5170
5171 static bool io_poll_remove_one(struct io_kiocb *req)
5172 {
5173         bool do_complete;
5174
5175         io_poll_remove_double(req);
5176
5177         if (req->opcode == IORING_OP_POLL_ADD) {
5178                 do_complete = __io_poll_remove_one(req, &req->poll);
5179         } else {
5180                 struct async_poll *apoll = req->apoll;
5181
5182                 /* non-poll requests have submit ref still */
5183                 do_complete = __io_poll_remove_one(req, &apoll->poll);
5184                 if (do_complete) {
5185                         io_put_req(req);
5186                         kfree(apoll->double_poll);
5187                         kfree(apoll);
5188                 }
5189         }
5190
5191         if (do_complete) {
5192                 io_cqring_fill_event(req, -ECANCELED);
5193                 io_commit_cqring(req->ctx);
5194                 req_set_fail_links(req);
5195                 io_put_req_deferred(req, 1);
5196         }
5197
5198         return do_complete;
5199 }
5200
5201 /*
5202  * Returns true if we found and killed one or more poll requests
5203  */
5204 static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5205                                struct files_struct *files)
5206 {
5207         struct hlist_node *tmp;
5208         struct io_kiocb *req;
5209         int posted = 0, i;
5210
5211         spin_lock_irq(&ctx->completion_lock);
5212         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5213                 struct hlist_head *list;
5214
5215                 list = &ctx->cancel_hash[i];
5216                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5217                         if (io_match_task(req, tsk, files))
5218                                 posted += io_poll_remove_one(req);
5219                 }
5220         }
5221         spin_unlock_irq(&ctx->completion_lock);
5222
5223         if (posted)
5224                 io_cqring_ev_posted(ctx);
5225
5226         return posted != 0;
5227 }
5228
5229 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5230 {
5231         struct hlist_head *list;
5232         struct io_kiocb *req;
5233
5234         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5235         hlist_for_each_entry(req, list, hash_node) {
5236                 if (sqe_addr != req->user_data)
5237                         continue;
5238                 if (io_poll_remove_one(req))
5239                         return 0;
5240                 return -EALREADY;
5241         }
5242
5243         return -ENOENT;
5244 }
5245
5246 static int io_poll_remove_prep(struct io_kiocb *req,
5247                                const struct io_uring_sqe *sqe)
5248 {
5249         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5250                 return -EINVAL;
5251         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5252             sqe->poll_events)
5253                 return -EINVAL;
5254
5255         req->poll_remove.addr = READ_ONCE(sqe->addr);
5256         return 0;
5257 }
5258
5259 /*
5260  * Find a running poll command that matches one specified in sqe->addr,
5261  * and remove it if found.
5262  */
5263 static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
5264 {
5265         struct io_ring_ctx *ctx = req->ctx;
5266         int ret;
5267
5268         spin_lock_irq(&ctx->completion_lock);
5269         ret = io_poll_cancel(ctx, req->poll_remove.addr);
5270         spin_unlock_irq(&ctx->completion_lock);
5271
5272         if (ret < 0)
5273                 req_set_fail_links(req);
5274         io_req_complete(req, ret);
5275         return 0;
5276 }
5277
5278 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5279                         void *key)
5280 {
5281         struct io_kiocb *req = wait->private;
5282         struct io_poll_iocb *poll = &req->poll;
5283
5284         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5285 }
5286
5287 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5288                                struct poll_table_struct *p)
5289 {
5290         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5291
5292         __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5293 }
5294
5295 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5296 {
5297         struct io_poll_iocb *poll = &req->poll;
5298         u32 events;
5299
5300         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5301                 return -EINVAL;
5302         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5303                 return -EINVAL;
5304
5305         events = READ_ONCE(sqe->poll32_events);
5306 #ifdef __BIG_ENDIAN
5307         events = swahw32(events);
5308 #endif
5309         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5310                        (events & EPOLLEXCLUSIVE);
5311         return 0;
5312 }
5313
5314 static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5315 {
5316         struct io_poll_iocb *poll = &req->poll;
5317         struct io_ring_ctx *ctx = req->ctx;
5318         struct io_poll_table ipt;
5319         __poll_t mask;
5320
5321         ipt.pt._qproc = io_poll_queue_proc;
5322
5323         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5324                                         io_poll_wake);
5325
5326         if (mask) { /* no async, we'd stolen it */
5327                 ipt.error = 0;
5328                 io_poll_complete(req, mask, 0);
5329         }
5330         spin_unlock_irq(&ctx->completion_lock);
5331
5332         if (mask) {
5333                 io_cqring_ev_posted(ctx);
5334                 io_put_req(req);
5335         }
5336         return ipt.error;
5337 }
5338
5339 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5340 {
5341         struct io_timeout_data *data = container_of(timer,
5342                                                 struct io_timeout_data, timer);
5343         struct io_kiocb *req = data->req;
5344         struct io_ring_ctx *ctx = req->ctx;
5345         unsigned long flags;
5346
5347         spin_lock_irqsave(&ctx->completion_lock, flags);
5348         list_del_init(&req->timeout.list);
5349         atomic_set(&req->ctx->cq_timeouts,
5350                 atomic_read(&req->ctx->cq_timeouts) + 1);
5351
5352         io_cqring_fill_event(req, -ETIME);
5353         io_commit_cqring(ctx);
5354         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5355
5356         io_cqring_ev_posted(ctx);
5357         req_set_fail_links(req);
5358         io_put_req(req);
5359         return HRTIMER_NORESTART;
5360 }
5361
5362 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5363                                            __u64 user_data)
5364 {
5365         struct io_timeout_data *io;
5366         struct io_kiocb *req;
5367         int ret = -ENOENT;
5368
5369         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5370                 if (user_data == req->user_data) {
5371                         ret = 0;
5372                         break;
5373                 }
5374         }
5375
5376         if (ret == -ENOENT)
5377                 return ERR_PTR(ret);
5378
5379         io = req->async_data;
5380         ret = hrtimer_try_to_cancel(&io->timer);
5381         if (ret == -1)
5382                 return ERR_PTR(-EALREADY);
5383         list_del_init(&req->timeout.list);
5384         return req;
5385 }
5386
5387 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5388 {
5389         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5390
5391         if (IS_ERR(req))
5392                 return PTR_ERR(req);
5393
5394         req_set_fail_links(req);
5395         io_cqring_fill_event(req, -ECANCELED);
5396         io_put_req_deferred(req, 1);
5397         return 0;
5398 }
5399
5400 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5401                              struct timespec64 *ts, enum hrtimer_mode mode)
5402 {
5403         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5404         struct io_timeout_data *data;
5405
5406         if (IS_ERR(req))
5407                 return PTR_ERR(req);
5408
5409         req->timeout.off = 0; /* noseq */
5410         data = req->async_data;
5411         list_add_tail(&req->timeout.list, &ctx->timeout_list);
5412         hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5413         data->timer.function = io_timeout_fn;
5414         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5415         return 0;
5416 }
5417
5418 static int io_timeout_remove_prep(struct io_kiocb *req,
5419                                   const struct io_uring_sqe *sqe)
5420 {
5421         struct io_timeout_rem *tr = &req->timeout_rem;
5422
5423         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5424                 return -EINVAL;
5425         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5426                 return -EINVAL;
5427         if (sqe->ioprio || sqe->buf_index || sqe->len)
5428                 return -EINVAL;
5429
5430         tr->addr = READ_ONCE(sqe->addr);
5431         tr->flags = READ_ONCE(sqe->timeout_flags);
5432         if (tr->flags & IORING_TIMEOUT_UPDATE) {
5433                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5434                         return -EINVAL;
5435                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5436                         return -EFAULT;
5437         } else if (tr->flags) {
5438                 /* timeout removal doesn't support flags */
5439                 return -EINVAL;
5440         }
5441
5442         return 0;
5443 }
5444
5445 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5446 {
5447         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5448                                             : HRTIMER_MODE_REL;
5449 }
5450
5451 /*
5452  * Remove or update an existing timeout command
5453  */
5454 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
5455 {
5456         struct io_timeout_rem *tr = &req->timeout_rem;
5457         struct io_ring_ctx *ctx = req->ctx;
5458         int ret;
5459
5460         spin_lock_irq(&ctx->completion_lock);
5461         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
5462                 ret = io_timeout_cancel(ctx, tr->addr);
5463         else
5464                 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5465                                         io_translate_timeout_mode(tr->flags));
5466
5467         io_cqring_fill_event(req, ret);
5468         io_commit_cqring(ctx);
5469         spin_unlock_irq(&ctx->completion_lock);
5470         io_cqring_ev_posted(ctx);
5471         if (ret < 0)
5472                 req_set_fail_links(req);
5473         io_put_req(req);
5474         return 0;
5475 }
5476
5477 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5478                            bool is_timeout_link)
5479 {
5480         struct io_timeout_data *data;
5481         unsigned flags;
5482         u32 off = READ_ONCE(sqe->off);
5483
5484         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5485                 return -EINVAL;
5486         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5487                 return -EINVAL;
5488         if (off && is_timeout_link)
5489                 return -EINVAL;
5490         flags = READ_ONCE(sqe->timeout_flags);
5491         if (flags & ~IORING_TIMEOUT_ABS)
5492                 return -EINVAL;
5493
5494         req->timeout.off = off;
5495
5496         if (!req->async_data && io_alloc_async_data(req))
5497                 return -ENOMEM;
5498
5499         data = req->async_data;
5500         data->req = req;
5501
5502         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5503                 return -EFAULT;
5504
5505         data->mode = io_translate_timeout_mode(flags);
5506         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5507         return 0;
5508 }
5509
5510 static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
5511 {
5512         struct io_ring_ctx *ctx = req->ctx;
5513         struct io_timeout_data *data = req->async_data;
5514         struct list_head *entry;
5515         u32 tail, off = req->timeout.off;
5516
5517         spin_lock_irq(&ctx->completion_lock);
5518
5519         /*
5520          * sqe->off holds how many events that need to occur for this
5521          * timeout event to be satisfied. If it isn't set, then this is
5522          * a pure timeout request, sequence isn't used.
5523          */
5524         if (io_is_timeout_noseq(req)) {
5525                 entry = ctx->timeout_list.prev;
5526                 goto add;
5527         }
5528
5529         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5530         req->timeout.target_seq = tail + off;
5531
5532         /* Update the last seq here in case io_flush_timeouts() hasn't.
5533          * This is safe because ->completion_lock is held, and submissions
5534          * and completions are never mixed in the same ->completion_lock section.
5535          */
5536         ctx->cq_last_tm_flush = tail;
5537
5538         /*
5539          * Insertion sort, ensuring the first entry in the list is always
5540          * the one we need first.
5541          */
5542         list_for_each_prev(entry, &ctx->timeout_list) {
5543                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5544                                                   timeout.list);
5545
5546                 if (io_is_timeout_noseq(nxt))
5547                         continue;
5548                 /* nxt.seq is behind @tail, otherwise would've been completed */
5549                 if (off >= nxt->timeout.target_seq - tail)
5550                         break;
5551         }
5552 add:
5553         list_add(&req->timeout.list, entry);
5554         data->timer.function = io_timeout_fn;
5555         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5556         spin_unlock_irq(&ctx->completion_lock);
5557         return 0;
5558 }
5559
5560 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5561 {
5562         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5563
5564         return req->user_data == (unsigned long) data;
5565 }
5566
5567 static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
5568 {
5569         enum io_wq_cancel cancel_ret;
5570         int ret = 0;
5571
5572         if (!tctx->io_wq)
5573                 return -ENOENT;
5574
5575         cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
5576         switch (cancel_ret) {
5577         case IO_WQ_CANCEL_OK:
5578                 ret = 0;
5579                 break;
5580         case IO_WQ_CANCEL_RUNNING:
5581                 ret = -EALREADY;
5582                 break;
5583         case IO_WQ_CANCEL_NOTFOUND:
5584                 ret = -ENOENT;
5585                 break;
5586         }
5587
5588         return ret;
5589 }
5590
5591 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5592                                      struct io_kiocb *req, __u64 sqe_addr,
5593                                      int success_ret)
5594 {
5595         unsigned long flags;
5596         int ret;
5597
5598         ret = io_async_cancel_one(req->task->io_uring,
5599                                         (void *) (unsigned long) sqe_addr);
5600         if (ret != -ENOENT) {
5601                 spin_lock_irqsave(&ctx->completion_lock, flags);
5602                 goto done;
5603         }
5604
5605         spin_lock_irqsave(&ctx->completion_lock, flags);
5606         ret = io_timeout_cancel(ctx, sqe_addr);
5607         if (ret != -ENOENT)
5608                 goto done;
5609         ret = io_poll_cancel(ctx, sqe_addr);
5610 done:
5611         if (!ret)
5612                 ret = success_ret;
5613         io_cqring_fill_event(req, ret);
5614         io_commit_cqring(ctx);
5615         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5616         io_cqring_ev_posted(ctx);
5617
5618         if (ret < 0)
5619                 req_set_fail_links(req);
5620         io_put_req(req);
5621 }
5622
5623 static int io_async_cancel_prep(struct io_kiocb *req,
5624                                 const struct io_uring_sqe *sqe)
5625 {
5626         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5627                 return -EINVAL;
5628         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5629                 return -EINVAL;
5630         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
5631                 return -EINVAL;
5632
5633         req->cancel.addr = READ_ONCE(sqe->addr);
5634         return 0;
5635 }
5636
5637 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
5638 {
5639         struct io_ring_ctx *ctx = req->ctx;
5640
5641         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5642         return 0;
5643 }
5644
5645 static int io_rsrc_update_prep(struct io_kiocb *req,
5646                                 const struct io_uring_sqe *sqe)
5647 {
5648         if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
5649                 return -EINVAL;
5650         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5651                 return -EINVAL;
5652         if (sqe->ioprio || sqe->rw_flags)
5653                 return -EINVAL;
5654
5655         req->rsrc_update.offset = READ_ONCE(sqe->off);
5656         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
5657         if (!req->rsrc_update.nr_args)
5658                 return -EINVAL;
5659         req->rsrc_update.arg = READ_ONCE(sqe->addr);
5660         return 0;
5661 }
5662
5663 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
5664 {
5665         struct io_ring_ctx *ctx = req->ctx;
5666         struct io_uring_rsrc_update up;
5667         int ret;
5668
5669         if (issue_flags & IO_URING_F_NONBLOCK)
5670                 return -EAGAIN;
5671
5672         up.offset = req->rsrc_update.offset;
5673         up.data = req->rsrc_update.arg;
5674
5675         mutex_lock(&ctx->uring_lock);
5676         ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
5677         mutex_unlock(&ctx->uring_lock);
5678
5679         if (ret < 0)
5680                 req_set_fail_links(req);
5681         __io_req_complete(req, issue_flags, ret, 0);
5682         return 0;
5683 }
5684
5685 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5686 {
5687         switch (req->opcode) {
5688         case IORING_OP_NOP:
5689                 return 0;
5690         case IORING_OP_READV:
5691         case IORING_OP_READ_FIXED:
5692         case IORING_OP_READ:
5693                 return io_read_prep(req, sqe);
5694         case IORING_OP_WRITEV:
5695         case IORING_OP_WRITE_FIXED:
5696         case IORING_OP_WRITE:
5697                 return io_write_prep(req, sqe);
5698         case IORING_OP_POLL_ADD:
5699                 return io_poll_add_prep(req, sqe);
5700         case IORING_OP_POLL_REMOVE:
5701                 return io_poll_remove_prep(req, sqe);
5702         case IORING_OP_FSYNC:
5703                 return io_fsync_prep(req, sqe);
5704         case IORING_OP_SYNC_FILE_RANGE:
5705                 return io_sfr_prep(req, sqe);
5706         case IORING_OP_SENDMSG:
5707         case IORING_OP_SEND:
5708                 return io_sendmsg_prep(req, sqe);
5709         case IORING_OP_RECVMSG:
5710         case IORING_OP_RECV:
5711                 return io_recvmsg_prep(req, sqe);
5712         case IORING_OP_CONNECT:
5713                 return io_connect_prep(req, sqe);
5714         case IORING_OP_TIMEOUT:
5715                 return io_timeout_prep(req, sqe, false);
5716         case IORING_OP_TIMEOUT_REMOVE:
5717                 return io_timeout_remove_prep(req, sqe);
5718         case IORING_OP_ASYNC_CANCEL:
5719                 return io_async_cancel_prep(req, sqe);
5720         case IORING_OP_LINK_TIMEOUT:
5721                 return io_timeout_prep(req, sqe, true);
5722         case IORING_OP_ACCEPT:
5723                 return io_accept_prep(req, sqe);
5724         case IORING_OP_FALLOCATE:
5725                 return io_fallocate_prep(req, sqe);
5726         case IORING_OP_OPENAT:
5727                 return io_openat_prep(req, sqe);
5728         case IORING_OP_CLOSE:
5729                 return io_close_prep(req, sqe);
5730         case IORING_OP_FILES_UPDATE:
5731                 return io_rsrc_update_prep(req, sqe);
5732         case IORING_OP_STATX:
5733                 return io_statx_prep(req, sqe);
5734         case IORING_OP_FADVISE:
5735                 return io_fadvise_prep(req, sqe);
5736         case IORING_OP_MADVISE:
5737                 return io_madvise_prep(req, sqe);
5738         case IORING_OP_OPENAT2:
5739                 return io_openat2_prep(req, sqe);
5740         case IORING_OP_EPOLL_CTL:
5741                 return io_epoll_ctl_prep(req, sqe);
5742         case IORING_OP_SPLICE:
5743                 return io_splice_prep(req, sqe);
5744         case IORING_OP_PROVIDE_BUFFERS:
5745                 return io_provide_buffers_prep(req, sqe);
5746         case IORING_OP_REMOVE_BUFFERS:
5747                 return io_remove_buffers_prep(req, sqe);
5748         case IORING_OP_TEE:
5749                 return io_tee_prep(req, sqe);
5750         case IORING_OP_SHUTDOWN:
5751                 return io_shutdown_prep(req, sqe);
5752         case IORING_OP_RENAMEAT:
5753                 return io_renameat_prep(req, sqe);
5754         case IORING_OP_UNLINKAT:
5755                 return io_unlinkat_prep(req, sqe);
5756         }
5757
5758         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5759                         req->opcode);
5760         return-EINVAL;
5761 }
5762
5763 static int io_req_prep_async(struct io_kiocb *req)
5764 {
5765         switch (req->opcode) {
5766         case IORING_OP_READV:
5767         case IORING_OP_READ_FIXED:
5768         case IORING_OP_READ:
5769                 return io_rw_prep_async(req, READ);
5770         case IORING_OP_WRITEV:
5771         case IORING_OP_WRITE_FIXED:
5772         case IORING_OP_WRITE:
5773                 return io_rw_prep_async(req, WRITE);
5774         case IORING_OP_SENDMSG:
5775         case IORING_OP_SEND:
5776                 return io_sendmsg_prep_async(req);
5777         case IORING_OP_RECVMSG:
5778         case IORING_OP_RECV:
5779                 return io_recvmsg_prep_async(req);
5780         case IORING_OP_CONNECT:
5781                 return io_connect_prep_async(req);
5782         }
5783         return 0;
5784 }
5785
5786 static int io_req_defer_prep(struct io_kiocb *req)
5787 {
5788         if (!io_op_defs[req->opcode].needs_async_data)
5789                 return 0;
5790         /* some opcodes init it during the inital prep */
5791         if (req->async_data)
5792                 return 0;
5793         if (__io_alloc_async_data(req))
5794                 return -EAGAIN;
5795         return io_req_prep_async(req);
5796 }
5797
5798 static u32 io_get_sequence(struct io_kiocb *req)
5799 {
5800         struct io_kiocb *pos;
5801         struct io_ring_ctx *ctx = req->ctx;
5802         u32 total_submitted, nr_reqs = 0;
5803
5804         io_for_each_link(pos, req)
5805                 nr_reqs++;
5806
5807         total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
5808         return total_submitted - nr_reqs;
5809 }
5810
5811 static int io_req_defer(struct io_kiocb *req)
5812 {
5813         struct io_ring_ctx *ctx = req->ctx;
5814         struct io_defer_entry *de;
5815         int ret;
5816         u32 seq;
5817
5818         /* Still need defer if there is pending req in defer list. */
5819         if (likely(list_empty_careful(&ctx->defer_list) &&
5820                 !(req->flags & REQ_F_IO_DRAIN)))
5821                 return 0;
5822
5823         seq = io_get_sequence(req);
5824         /* Still a chance to pass the sequence check */
5825         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
5826                 return 0;
5827
5828         ret = io_req_defer_prep(req);
5829         if (ret)
5830                 return ret;
5831         io_prep_async_link(req);
5832         de = kmalloc(sizeof(*de), GFP_KERNEL);
5833         if (!de)
5834                 return -ENOMEM;
5835
5836         spin_lock_irq(&ctx->completion_lock);
5837         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
5838                 spin_unlock_irq(&ctx->completion_lock);
5839                 kfree(de);
5840                 io_queue_async_work(req);
5841                 return -EIOCBQUEUED;
5842         }
5843
5844         trace_io_uring_defer(ctx, req, req->user_data);
5845         de->req = req;
5846         de->seq = seq;
5847         list_add_tail(&de->list, &ctx->defer_list);
5848         spin_unlock_irq(&ctx->completion_lock);
5849         return -EIOCBQUEUED;
5850 }
5851
5852 static void __io_clean_op(struct io_kiocb *req)
5853 {
5854         if (req->flags & REQ_F_BUFFER_SELECTED) {
5855                 switch (req->opcode) {
5856                 case IORING_OP_READV:
5857                 case IORING_OP_READ_FIXED:
5858                 case IORING_OP_READ:
5859                         kfree((void *)(unsigned long)req->rw.addr);
5860                         break;
5861                 case IORING_OP_RECVMSG:
5862                 case IORING_OP_RECV:
5863                         kfree(req->sr_msg.kbuf);
5864                         break;
5865                 }
5866                 req->flags &= ~REQ_F_BUFFER_SELECTED;
5867         }
5868
5869         if (req->flags & REQ_F_NEED_CLEANUP) {
5870                 switch (req->opcode) {
5871                 case IORING_OP_READV:
5872                 case IORING_OP_READ_FIXED:
5873                 case IORING_OP_READ:
5874                 case IORING_OP_WRITEV:
5875                 case IORING_OP_WRITE_FIXED:
5876                 case IORING_OP_WRITE: {
5877                         struct io_async_rw *io = req->async_data;
5878                         if (io->free_iovec)
5879                                 kfree(io->free_iovec);
5880                         break;
5881                         }
5882                 case IORING_OP_RECVMSG:
5883                 case IORING_OP_SENDMSG: {
5884                         struct io_async_msghdr *io = req->async_data;
5885
5886                         kfree(io->free_iov);
5887                         break;
5888                         }
5889                 case IORING_OP_SPLICE:
5890                 case IORING_OP_TEE:
5891                         io_put_file(req, req->splice.file_in,
5892                                     (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5893                         break;
5894                 case IORING_OP_OPENAT:
5895                 case IORING_OP_OPENAT2:
5896                         if (req->open.filename)
5897                                 putname(req->open.filename);
5898                         break;
5899                 case IORING_OP_RENAMEAT:
5900                         putname(req->rename.oldpath);
5901                         putname(req->rename.newpath);
5902                         break;
5903                 case IORING_OP_UNLINKAT:
5904                         putname(req->unlink.filename);
5905                         break;
5906                 }
5907                 req->flags &= ~REQ_F_NEED_CLEANUP;
5908         }
5909 }
5910
5911 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
5912 {
5913         struct io_ring_ctx *ctx = req->ctx;
5914         const struct cred *creds = NULL;
5915         int ret;
5916
5917         if (req->work.personality) {
5918                 const struct cred *new_creds;
5919
5920                 if (!(issue_flags & IO_URING_F_NONBLOCK))
5921                         mutex_lock(&ctx->uring_lock);
5922                 new_creds = idr_find(&ctx->personality_idr, req->work.personality);
5923                 if (!(issue_flags & IO_URING_F_NONBLOCK))
5924                         mutex_unlock(&ctx->uring_lock);
5925                 if (!new_creds)
5926                         return -EINVAL;
5927                 creds = override_creds(new_creds);
5928         }
5929
5930         switch (req->opcode) {
5931         case IORING_OP_NOP:
5932                 ret = io_nop(req, issue_flags);
5933                 break;
5934         case IORING_OP_READV:
5935         case IORING_OP_READ_FIXED:
5936         case IORING_OP_READ:
5937                 ret = io_read(req, issue_flags);
5938                 break;
5939         case IORING_OP_WRITEV:
5940         case IORING_OP_WRITE_FIXED:
5941         case IORING_OP_WRITE:
5942                 ret = io_write(req, issue_flags);
5943                 break;
5944         case IORING_OP_FSYNC:
5945                 ret = io_fsync(req, issue_flags);
5946                 break;
5947         case IORING_OP_POLL_ADD:
5948                 ret = io_poll_add(req, issue_flags);
5949                 break;
5950         case IORING_OP_POLL_REMOVE:
5951                 ret = io_poll_remove(req, issue_flags);
5952                 break;
5953         case IORING_OP_SYNC_FILE_RANGE:
5954                 ret = io_sync_file_range(req, issue_flags);
5955                 break;
5956         case IORING_OP_SENDMSG:
5957                 ret = io_sendmsg(req, issue_flags);
5958                 break;
5959         case IORING_OP_SEND:
5960                 ret = io_send(req, issue_flags);
5961                 break;
5962         case IORING_OP_RECVMSG:
5963                 ret = io_recvmsg(req, issue_flags);
5964                 break;
5965         case IORING_OP_RECV:
5966                 ret = io_recv(req, issue_flags);
5967                 break;
5968         case IORING_OP_TIMEOUT:
5969                 ret = io_timeout(req, issue_flags);
5970                 break;
5971         case IORING_OP_TIMEOUT_REMOVE:
5972                 ret = io_timeout_remove(req, issue_flags);
5973                 break;
5974         case IORING_OP_ACCEPT:
5975                 ret = io_accept(req, issue_flags);
5976                 break;
5977         case IORING_OP_CONNECT:
5978                 ret = io_connect(req, issue_flags);
5979                 break;
5980         case IORING_OP_ASYNC_CANCEL:
5981                 ret = io_async_cancel(req, issue_flags);
5982                 break;
5983         case IORING_OP_FALLOCATE:
5984                 ret = io_fallocate(req, issue_flags);
5985                 break;
5986         case IORING_OP_OPENAT:
5987                 ret = io_openat(req, issue_flags);
5988                 break;
5989         case IORING_OP_CLOSE:
5990                 ret = io_close(req, issue_flags);
5991                 break;
5992         case IORING_OP_FILES_UPDATE:
5993                 ret = io_files_update(req, issue_flags);
5994                 break;
5995         case IORING_OP_STATX:
5996                 ret = io_statx(req, issue_flags);
5997                 break;
5998         case IORING_OP_FADVISE:
5999                 ret = io_fadvise(req, issue_flags);
6000                 break;
6001         case IORING_OP_MADVISE:
6002                 ret = io_madvise(req, issue_flags);
6003                 break;
6004         case IORING_OP_OPENAT2:
6005                 ret = io_openat2(req, issue_flags);
6006                 break;
6007         case IORING_OP_EPOLL_CTL:
6008                 ret = io_epoll_ctl(req, issue_flags);
6009                 break;
6010         case IORING_OP_SPLICE:
6011                 ret = io_splice(req, issue_flags);
6012                 break;
6013         case IORING_OP_PROVIDE_BUFFERS:
6014                 ret = io_provide_buffers(req, issue_flags);
6015                 break;
6016         case IORING_OP_REMOVE_BUFFERS:
6017                 ret = io_remove_buffers(req, issue_flags);
6018                 break;
6019         case IORING_OP_TEE:
6020                 ret = io_tee(req, issue_flags);
6021                 break;
6022         case IORING_OP_SHUTDOWN:
6023                 ret = io_shutdown(req, issue_flags);
6024                 break;
6025         case IORING_OP_RENAMEAT:
6026                 ret = io_renameat(req, issue_flags);
6027                 break;
6028         case IORING_OP_UNLINKAT:
6029                 ret = io_unlinkat(req, issue_flags);
6030                 break;
6031         default:
6032                 ret = -EINVAL;
6033                 break;
6034         }
6035
6036         if (creds)
6037                 revert_creds(creds);
6038
6039         if (ret)
6040                 return ret;
6041
6042         /* If the op doesn't have a file, we're not polling for it */
6043         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
6044                 const bool in_async = io_wq_current_is_worker();
6045
6046                 /* workqueue context doesn't hold uring_lock, grab it now */
6047                 if (in_async)
6048                         mutex_lock(&ctx->uring_lock);
6049
6050                 io_iopoll_req_issued(req, in_async);
6051
6052                 if (in_async)
6053                         mutex_unlock(&ctx->uring_lock);
6054         }
6055
6056         return 0;
6057 }
6058
6059 static void io_wq_submit_work(struct io_wq_work *work)
6060 {
6061         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6062         struct io_kiocb *timeout;
6063         int ret = 0;
6064
6065         timeout = io_prep_linked_timeout(req);
6066         if (timeout)
6067                 io_queue_linked_timeout(timeout);
6068
6069         if (work->flags & IO_WQ_WORK_CANCEL)
6070                 ret = -ECANCELED;
6071
6072         if (!ret) {
6073                 do {
6074                         ret = io_issue_sqe(req, 0);
6075                         /*
6076                          * We can get EAGAIN for polled IO even though we're
6077                          * forcing a sync submission from here, since we can't
6078                          * wait for request slots on the block side.
6079                          */
6080                         if (ret != -EAGAIN)
6081                                 break;
6082                         cond_resched();
6083                 } while (1);
6084         }
6085
6086         /* avoid locking problems by failing it from a clean context */
6087         if (ret) {
6088                 /* io-wq is going to take one down */
6089                 refcount_inc(&req->refs);
6090                 io_req_task_queue_fail(req, ret);
6091         }
6092 }
6093
6094 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6095                                               int index)
6096 {
6097         struct fixed_rsrc_table *table;
6098
6099         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
6100         return table->files[index & IORING_FILE_TABLE_MASK];
6101 }
6102
6103 static struct file *io_file_get(struct io_submit_state *state,
6104                                 struct io_kiocb *req, int fd, bool fixed)
6105 {
6106         struct io_ring_ctx *ctx = req->ctx;
6107         struct file *file;
6108
6109         if (fixed) {
6110                 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6111                         return NULL;
6112                 fd = array_index_nospec(fd, ctx->nr_user_files);
6113                 file = io_file_from_index(ctx, fd);
6114                 io_set_resource_node(req);
6115         } else {
6116                 trace_io_uring_file_get(ctx, fd);
6117                 file = __io_file_get(state, fd);
6118         }
6119
6120         if (file && unlikely(file->f_op == &io_uring_fops))
6121                 io_req_track_inflight(req);
6122         return file;
6123 }
6124
6125 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6126 {
6127         struct io_timeout_data *data = container_of(timer,
6128                                                 struct io_timeout_data, timer);
6129         struct io_kiocb *prev, *req = data->req;
6130         struct io_ring_ctx *ctx = req->ctx;
6131         unsigned long flags;
6132
6133         spin_lock_irqsave(&ctx->completion_lock, flags);
6134         prev = req->timeout.head;
6135         req->timeout.head = NULL;
6136
6137         /*
6138          * We don't expect the list to be empty, that will only happen if we
6139          * race with the completion of the linked work.
6140          */
6141         if (prev && refcount_inc_not_zero(&prev->refs))
6142                 io_remove_next_linked(prev);
6143         else
6144                 prev = NULL;
6145         spin_unlock_irqrestore(&ctx->completion_lock, flags);
6146
6147         if (prev) {
6148                 req_set_fail_links(prev);
6149                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
6150                 io_put_req_deferred(prev, 1);
6151         } else {
6152                 io_req_complete_post(req, -ETIME, 0);
6153                 io_put_req_deferred(req, 1);
6154         }
6155         return HRTIMER_NORESTART;
6156 }
6157
6158 static void __io_queue_linked_timeout(struct io_kiocb *req)
6159 {
6160         /*
6161          * If the back reference is NULL, then our linked request finished
6162          * before we got a chance to setup the timer
6163          */
6164         if (req->timeout.head) {
6165                 struct io_timeout_data *data = req->async_data;
6166
6167                 data->timer.function = io_link_timeout_fn;
6168                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6169                                 data->mode);
6170         }
6171 }
6172
6173 static void io_queue_linked_timeout(struct io_kiocb *req)
6174 {
6175         struct io_ring_ctx *ctx = req->ctx;
6176
6177         spin_lock_irq(&ctx->completion_lock);
6178         __io_queue_linked_timeout(req);
6179         spin_unlock_irq(&ctx->completion_lock);
6180
6181         /* drop submission reference */
6182         io_put_req(req);
6183 }
6184
6185 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
6186 {
6187         struct io_kiocb *nxt = req->link;
6188
6189         if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
6190             nxt->opcode != IORING_OP_LINK_TIMEOUT)
6191                 return NULL;
6192
6193         nxt->timeout.head = req;
6194         nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
6195         req->flags |= REQ_F_LINK_TIMEOUT;
6196         return nxt;
6197 }
6198
6199 static void __io_queue_sqe(struct io_kiocb *req)
6200 {
6201         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6202         int ret;
6203
6204         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
6205
6206         /*
6207          * We async punt it if the file wasn't marked NOWAIT, or if the file
6208          * doesn't support non-blocking read/write attempts
6209          */
6210         if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
6211                 if (!io_arm_poll_handler(req)) {
6212                         /*
6213                          * Queued up for async execution, worker will release
6214                          * submit reference when the iocb is actually submitted.
6215                          */
6216                         io_queue_async_work(req);
6217                 }
6218         } else if (likely(!ret)) {
6219                 /* drop submission reference */
6220                 if (req->flags & REQ_F_COMPLETE_INLINE) {
6221                         struct io_ring_ctx *ctx = req->ctx;
6222                         struct io_comp_state *cs = &ctx->submit_state.comp;
6223
6224                         cs->reqs[cs->nr++] = req;
6225                         if (cs->nr == ARRAY_SIZE(cs->reqs))
6226                                 io_submit_flush_completions(cs, ctx);
6227                 } else {
6228                         io_put_req(req);
6229                 }
6230         } else {
6231                 req_set_fail_links(req);
6232                 io_put_req(req);
6233                 io_req_complete(req, ret);
6234         }
6235         if (linked_timeout)
6236                 io_queue_linked_timeout(linked_timeout);
6237 }
6238
6239 static void io_queue_sqe(struct io_kiocb *req)
6240 {
6241         int ret;
6242
6243         ret = io_req_defer(req);
6244         if (ret) {
6245                 if (ret != -EIOCBQUEUED) {
6246 fail_req:
6247                         req_set_fail_links(req);
6248                         io_put_req(req);
6249                         io_req_complete(req, ret);
6250                 }
6251         } else if (req->flags & REQ_F_FORCE_ASYNC) {
6252                 ret = io_req_defer_prep(req);
6253                 if (unlikely(ret))
6254                         goto fail_req;
6255                 io_queue_async_work(req);
6256         } else {
6257                 __io_queue_sqe(req);
6258         }
6259 }
6260
6261 /*
6262  * Check SQE restrictions (opcode and flags).
6263  *
6264  * Returns 'true' if SQE is allowed, 'false' otherwise.
6265  */
6266 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6267                                         struct io_kiocb *req,
6268                                         unsigned int sqe_flags)
6269 {
6270         if (!ctx->restricted)
6271                 return true;
6272
6273         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6274                 return false;
6275
6276         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6277             ctx->restrictions.sqe_flags_required)
6278                 return false;
6279
6280         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6281                           ctx->restrictions.sqe_flags_required))
6282                 return false;
6283
6284         return true;
6285 }
6286
6287 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6288                        const struct io_uring_sqe *sqe)
6289 {
6290         struct io_submit_state *state;
6291         unsigned int sqe_flags;
6292         int ret = 0;
6293
6294         req->opcode = READ_ONCE(sqe->opcode);
6295         /* same numerical values with corresponding REQ_F_*, safe to copy */
6296         req->flags = sqe_flags = READ_ONCE(sqe->flags);
6297         req->user_data = READ_ONCE(sqe->user_data);
6298         req->async_data = NULL;
6299         req->file = NULL;
6300         req->ctx = ctx;
6301         req->link = NULL;
6302         req->fixed_rsrc_refs = NULL;
6303         /* one is dropped after submission, the other at completion */
6304         refcount_set(&req->refs, 2);
6305         req->task = current;
6306         req->result = 0;
6307
6308         /* enforce forwards compatibility on users */
6309         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
6310                 req->flags = 0;
6311                 return -EINVAL;
6312         }
6313
6314         if (unlikely(req->opcode >= IORING_OP_LAST))
6315                 return -EINVAL;
6316
6317         if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6318                 return -EACCES;
6319
6320         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6321             !io_op_defs[req->opcode].buffer_select)
6322                 return -EOPNOTSUPP;
6323
6324         req->work.list.next = NULL;
6325         req->work.flags = 0;
6326         req->work.personality = READ_ONCE(sqe->personality);
6327         state = &ctx->submit_state;
6328
6329         /*
6330          * Plug now if we have more than 1 IO left after this, and the target
6331          * is potentially a read/write to block based storage.
6332          */
6333         if (!state->plug_started && state->ios_left > 1 &&
6334             io_op_defs[req->opcode].plug) {
6335                 blk_start_plug(&state->plug);
6336                 state->plug_started = true;
6337         }
6338
6339         if (io_op_defs[req->opcode].needs_file) {
6340                 bool fixed = req->flags & REQ_F_FIXED_FILE;
6341
6342                 req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
6343                 if (unlikely(!req->file))
6344                         ret = -EBADF;
6345         }
6346
6347         state->ios_left--;
6348         return ret;
6349 }
6350
6351 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
6352                          const struct io_uring_sqe *sqe)
6353 {
6354         struct io_submit_link *link = &ctx->submit_state.link;
6355         int ret;
6356
6357         ret = io_init_req(ctx, req, sqe);
6358         if (unlikely(ret)) {
6359 fail_req:
6360                 io_put_req(req);
6361                 io_req_complete(req, ret);
6362                 if (link->head) {
6363                         /* fail even hard links since we don't submit */
6364                         link->head->flags |= REQ_F_FAIL_LINK;
6365                         io_put_req(link->head);
6366                         io_req_complete(link->head, -ECANCELED);
6367                         link->head = NULL;
6368                 }
6369                 return ret;
6370         }
6371         ret = io_req_prep(req, sqe);
6372         if (unlikely(ret))
6373                 goto fail_req;
6374
6375         /* don't need @sqe from now on */
6376         trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6377                                 true, ctx->flags & IORING_SETUP_SQPOLL);
6378
6379         /*
6380          * If we already have a head request, queue this one for async
6381          * submittal once the head completes. If we don't have a head but
6382          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6383          * submitted sync once the chain is complete. If none of those
6384          * conditions are true (normal request), then just queue it.
6385          */
6386         if (link->head) {
6387                 struct io_kiocb *head = link->head;
6388
6389                 /*
6390                  * Taking sequential execution of a link, draining both sides
6391                  * of the link also fullfils IOSQE_IO_DRAIN semantics for all
6392                  * requests in the link. So, it drains the head and the
6393                  * next after the link request. The last one is done via
6394                  * drain_next flag to persist the effect across calls.
6395                  */
6396                 if (req->flags & REQ_F_IO_DRAIN) {
6397                         head->flags |= REQ_F_IO_DRAIN;
6398                         ctx->drain_next = 1;
6399                 }
6400                 ret = io_req_defer_prep(req);
6401                 if (unlikely(ret))
6402                         goto fail_req;
6403                 trace_io_uring_link(ctx, req, head);
6404                 link->last->link = req;
6405                 link->last = req;
6406
6407                 /* last request of a link, enqueue the link */
6408                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6409                         io_queue_sqe(head);
6410                         link->head = NULL;
6411                 }
6412         } else {
6413                 if (unlikely(ctx->drain_next)) {
6414                         req->flags |= REQ_F_IO_DRAIN;
6415                         ctx->drain_next = 0;
6416                 }
6417                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6418                         link->head = req;
6419                         link->last = req;
6420                 } else {
6421                         io_queue_sqe(req);
6422                 }
6423         }
6424
6425         return 0;
6426 }
6427
6428 /*
6429  * Batched submission is done, ensure local IO is flushed out.
6430  */
6431 static void io_submit_state_end(struct io_submit_state *state,
6432                                 struct io_ring_ctx *ctx)
6433 {
6434         if (state->link.head)
6435                 io_queue_sqe(state->link.head);
6436         if (state->comp.nr)
6437                 io_submit_flush_completions(&state->comp, ctx);
6438         if (state->plug_started)
6439                 blk_finish_plug(&state->plug);
6440         io_state_file_put(state);
6441 }
6442
6443 /*
6444  * Start submission side cache.
6445  */
6446 static void io_submit_state_start(struct io_submit_state *state,
6447                                   unsigned int max_ios)
6448 {
6449         state->plug_started = false;
6450         state->ios_left = max_ios;
6451         /* set only head, no need to init link_last in advance */
6452         state->link.head = NULL;
6453 }
6454
6455 static void io_commit_sqring(struct io_ring_ctx *ctx)
6456 {
6457         struct io_rings *rings = ctx->rings;
6458
6459         /*
6460          * Ensure any loads from the SQEs are done at this point,
6461          * since once we write the new head, the application could
6462          * write new data to them.
6463          */
6464         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6465 }
6466
6467 /*
6468  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
6469  * that is mapped by userspace. This means that care needs to be taken to
6470  * ensure that reads are stable, as we cannot rely on userspace always
6471  * being a good citizen. If members of the sqe are validated and then later
6472  * used, it's important that those reads are done through READ_ONCE() to
6473  * prevent a re-load down the line.
6474  */
6475 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6476 {
6477         u32 *sq_array = ctx->sq_array;
6478         unsigned head;
6479
6480         /*
6481          * The cached sq head (or cq tail) serves two purposes:
6482          *
6483          * 1) allows us to batch the cost of updating the user visible
6484          *    head updates.
6485          * 2) allows the kernel side to track the head on its own, even
6486          *    though the application is the one updating it.
6487          */
6488         head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
6489         if (likely(head < ctx->sq_entries))
6490                 return &ctx->sq_sqes[head];
6491
6492         /* drop invalid entries */
6493         ctx->cached_sq_dropped++;
6494         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6495         return NULL;
6496 }
6497
6498 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6499 {
6500         int submitted = 0;
6501
6502         /* if we have a backlog and couldn't flush it all, return BUSY */
6503         if (test_bit(0, &ctx->sq_check_overflow)) {
6504                 if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
6505                         return -EBUSY;
6506         }
6507
6508         /* make sure SQ entry isn't read before tail */
6509         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6510
6511         if (!percpu_ref_tryget_many(&ctx->refs, nr))
6512                 return -EAGAIN;
6513
6514         percpu_counter_add(&current->io_uring->inflight, nr);
6515         refcount_add(nr, &current->usage);
6516         io_submit_state_start(&ctx->submit_state, nr);
6517
6518         while (submitted < nr) {
6519                 const struct io_uring_sqe *sqe;
6520                 struct io_kiocb *req;
6521
6522                 req = io_alloc_req(ctx);
6523                 if (unlikely(!req)) {
6524                         if (!submitted)
6525                                 submitted = -EAGAIN;
6526                         break;
6527                 }
6528                 sqe = io_get_sqe(ctx);
6529                 if (unlikely(!sqe)) {
6530                         kmem_cache_free(req_cachep, req);
6531                         break;
6532                 }
6533                 /* will complete beyond this point, count as submitted */
6534                 submitted++;
6535                 if (io_submit_sqe(ctx, req, sqe))
6536                         break;
6537         }
6538
6539         if (unlikely(submitted != nr)) {
6540                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6541                 struct io_uring_task *tctx = current->io_uring;
6542                 int unused = nr - ref_used;
6543
6544                 percpu_ref_put_many(&ctx->refs, unused);
6545                 percpu_counter_sub(&tctx->inflight, unused);
6546                 put_task_struct_many(current, unused);
6547         }
6548
6549         io_submit_state_end(&ctx->submit_state, ctx);
6550          /* Commit SQ ring head once we've consumed and submitted all SQEs */
6551         io_commit_sqring(ctx);
6552
6553         return submitted;
6554 }
6555
6556 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6557 {
6558         /* Tell userspace we may need a wakeup call */
6559         spin_lock_irq(&ctx->completion_lock);
6560         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6561         spin_unlock_irq(&ctx->completion_lock);
6562 }
6563
6564 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6565 {
6566         spin_lock_irq(&ctx->completion_lock);
6567         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6568         spin_unlock_irq(&ctx->completion_lock);
6569 }
6570
6571 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6572 {
6573         unsigned int to_submit;
6574         int ret = 0;
6575
6576         to_submit = io_sqring_entries(ctx);
6577         /* if we're handling multiple rings, cap submit size for fairness */
6578         if (cap_entries && to_submit > 8)
6579                 to_submit = 8;
6580
6581         if (!list_empty(&ctx->iopoll_list) || to_submit) {
6582                 unsigned nr_events = 0;
6583
6584                 mutex_lock(&ctx->uring_lock);
6585                 if (!list_empty(&ctx->iopoll_list))
6586                         io_do_iopoll(ctx, &nr_events, 0);
6587
6588                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
6589                         ret = io_submit_sqes(ctx, to_submit);
6590                 mutex_unlock(&ctx->uring_lock);
6591         }
6592
6593         if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
6594                 wake_up(&ctx->sqo_sq_wait);
6595
6596         return ret;
6597 }
6598
6599 static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
6600 {
6601         struct io_ring_ctx *ctx;
6602         unsigned sq_thread_idle = 0;
6603
6604         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6605                 if (sq_thread_idle < ctx->sq_thread_idle)
6606                         sq_thread_idle = ctx->sq_thread_idle;
6607         }
6608
6609         sqd->sq_thread_idle = sq_thread_idle;
6610 }
6611
6612 static void io_sqd_init_new(struct io_sq_data *sqd)
6613 {
6614         struct io_ring_ctx *ctx;
6615
6616         while (!list_empty(&sqd->ctx_new_list)) {
6617                 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
6618                 list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
6619                 complete(&ctx->sq_thread_comp);
6620         }
6621
6622         io_sqd_update_thread_idle(sqd);
6623 }
6624
6625 static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
6626 {
6627         return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
6628 }
6629
6630 static bool io_sq_thread_should_park(struct io_sq_data *sqd)
6631 {
6632         return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
6633 }
6634
6635 static void io_sq_thread_parkme(struct io_sq_data *sqd)
6636 {
6637         for (;;) {
6638                 /*
6639                  * TASK_PARKED is a special state; we must serialize against
6640                  * possible pending wakeups to avoid store-store collisions on
6641                  * task->state.
6642                  *
6643                  * Such a collision might possibly result in the task state
6644                  * changin from TASK_PARKED and us failing the
6645                  * wait_task_inactive() in kthread_park().
6646                  */
6647                 set_special_state(TASK_PARKED);
6648                 if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
6649                         break;
6650
6651                 /*
6652                  * Thread is going to call schedule(), do not preempt it,
6653                  * or the caller of kthread_park() may spend more time in
6654                  * wait_task_inactive().
6655                  */
6656                 preempt_disable();
6657                 complete(&sqd->completion);
6658                 schedule_preempt_disabled();
6659                 preempt_enable();
6660         }
6661         __set_current_state(TASK_RUNNING);
6662 }
6663
6664 static int io_sq_thread(void *data)
6665 {
6666         struct io_sq_data *sqd = data;
6667         struct io_ring_ctx *ctx;
6668         unsigned long timeout = 0;
6669         char buf[TASK_COMM_LEN];
6670         DEFINE_WAIT(wait);
6671
6672         sprintf(buf, "iou-sqp-%d", sqd->task_pid);
6673         set_task_comm(current, buf);
6674         sqd->thread = current;
6675         current->pf_io_worker = NULL;
6676
6677         if (sqd->sq_cpu != -1)
6678                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
6679         else
6680                 set_cpus_allowed_ptr(current, cpu_online_mask);
6681         current->flags |= PF_NO_SETAFFINITY;
6682
6683         complete(&sqd->completion);
6684
6685         wait_for_completion(&sqd->startup);
6686
6687         while (!io_sq_thread_should_stop(sqd)) {
6688                 int ret;
6689                 bool cap_entries, sqt_spin, needs_sched;
6690
6691                 /*
6692                  * Any changes to the sqd lists are synchronized through the
6693                  * thread parking. This synchronizes the thread vs users,
6694                  * the users are synchronized on the sqd->ctx_lock.
6695                  */
6696                 if (io_sq_thread_should_park(sqd)) {
6697                         io_sq_thread_parkme(sqd);
6698                         continue;
6699                 }
6700                 if (unlikely(!list_empty(&sqd->ctx_new_list))) {
6701                         io_sqd_init_new(sqd);
6702                         timeout = jiffies + sqd->sq_thread_idle;
6703                 }
6704                 if (fatal_signal_pending(current))
6705                         break;
6706                 sqt_spin = false;
6707                 cap_entries = !list_is_singular(&sqd->ctx_list);
6708                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6709                         ret = __io_sq_thread(ctx, cap_entries);
6710                         if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
6711                                 sqt_spin = true;
6712                 }
6713
6714                 if (sqt_spin || !time_after(jiffies, timeout)) {
6715                         io_run_task_work();
6716                         cond_resched();
6717                         if (sqt_spin)
6718                                 timeout = jiffies + sqd->sq_thread_idle;
6719                         continue;
6720                 }
6721
6722                 needs_sched = true;
6723                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
6724                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6725                         if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6726                             !list_empty_careful(&ctx->iopoll_list)) {
6727                                 needs_sched = false;
6728                                 break;
6729                         }
6730                         if (io_sqring_entries(ctx)) {
6731                                 needs_sched = false;
6732                                 break;
6733                         }
6734                 }
6735
6736                 if (needs_sched && !io_sq_thread_should_park(sqd)) {
6737                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6738                                 io_ring_set_wakeup_flag(ctx);
6739
6740                         schedule();
6741                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6742                                 io_ring_clear_wakeup_flag(ctx);
6743                 }
6744
6745                 finish_wait(&sqd->wait, &wait);
6746                 timeout = jiffies + sqd->sq_thread_idle;
6747         }
6748
6749         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6750                 io_uring_cancel_sqpoll(ctx);
6751
6752         io_run_task_work();
6753
6754         if (io_sq_thread_should_park(sqd))
6755                 io_sq_thread_parkme(sqd);
6756
6757         /*
6758          * Clear thread under lock so that concurrent parks work correctly
6759          */
6760         complete(&sqd->completion);
6761         mutex_lock(&sqd->lock);
6762         sqd->thread = NULL;
6763         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6764                 ctx->sqo_exec = 1;
6765                 io_ring_set_wakeup_flag(ctx);
6766         }
6767
6768         complete(&sqd->exited);
6769         mutex_unlock(&sqd->lock);
6770         do_exit(0);
6771 }
6772
/*
 * Per-waiter state for a task sleeping in io_cqring_wait().
 */
struct io_wait_queue {
        /* wait queue entry; io_wake_function() maps it back to this struct */
        struct wait_queue_entry wq;
        /* ring whose completion queue is being waited on */
        struct io_ring_ctx *ctx;
        /* number of CQ events that satisfies the wait (caller's min_events) */
        unsigned to_wait;
        /* ctx->cq_timeouts value sampled just before the wait loop starts */
        unsigned nr_timeouts;
};
6779
6780 static inline bool io_should_wake(struct io_wait_queue *iowq)
6781 {
6782         struct io_ring_ctx *ctx = iowq->ctx;
6783
6784         /*
6785          * Wake up if we have enough events, or if a timeout occurred since we
6786          * started waiting. For timeouts, we always want to return to userspace,
6787          * regardless of event count.
6788          */
6789         return io_cqring_events(ctx) >= iowq->to_wait ||
6790                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6791 }
6792
6793 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6794                             int wake_flags, void *key)
6795 {
6796         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6797                                                         wq);
6798
6799         /*
6800          * Cannot safely flush overflowed CQEs from here, ensure we wake up
6801          * the task, and the next invocation will do it.
6802          */
6803         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
6804                 return autoremove_wake_function(curr, mode, wake_flags, key);
6805         return -1;
6806 }
6807
6808 static int io_run_task_work_sig(void)
6809 {
6810         if (io_run_task_work())
6811                 return 1;
6812         if (!signal_pending(current))
6813                 return 0;
6814         if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
6815                 return -ERESTARTSYS;
6816         return -EINTR;
6817 }
6818
6819 /* when returns >0, the caller should retry */
6820 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
6821                                           struct io_wait_queue *iowq,
6822                                           signed long *timeout)
6823 {
6824         int ret;
6825
6826         /* make sure we run task_work before checking for signals */
6827         ret = io_run_task_work_sig();
6828         if (ret || io_should_wake(iowq))
6829                 return ret;
6830         /* let the caller flush overflows, retry */
6831         if (test_bit(0, &ctx->cq_check_overflow))
6832                 return 1;
6833
6834         *timeout = schedule_timeout(*timeout);
6835         return !*timeout ? -ETIME : 1;
6836 }
6837
6838 /*
6839  * Wait until events become available, if we don't already have some. The
6840  * application must reap them itself, as they reside on the shared cq ring.
6841  */
6842 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6843                           const sigset_t __user *sig, size_t sigsz,
6844                           struct __kernel_timespec __user *uts)
6845 {
6846         struct io_wait_queue iowq = {
6847                 .wq = {
6848                         .private        = current,
6849                         .func           = io_wake_function,
6850                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
6851                 },
6852                 .ctx            = ctx,
6853                 .to_wait        = min_events,
6854         };
6855         struct io_rings *rings = ctx->rings;
6856         signed long timeout = MAX_SCHEDULE_TIMEOUT;
6857         int ret;
6858
6859         do {
6860                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
6861                 if (io_cqring_events(ctx) >= min_events)
6862                         return 0;
6863                 if (!io_run_task_work())
6864                         break;
6865         } while (1);
6866
6867         if (sig) {
6868 #ifdef CONFIG_COMPAT
6869                 if (in_compat_syscall())
6870                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
6871                                                       sigsz);
6872                 else
6873 #endif
6874                         ret = set_user_sigmask(sig, sigsz);
6875
6876                 if (ret)
6877                         return ret;
6878         }
6879
6880         if (uts) {
6881                 struct timespec64 ts;
6882
6883                 if (get_timespec64(&ts, uts))
6884                         return -EFAULT;
6885                 timeout = timespec64_to_jiffies(&ts);
6886         }
6887
6888         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
6889         trace_io_uring_cqring_wait(ctx, min_events);
6890         do {
6891                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
6892                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6893                                                 TASK_INTERRUPTIBLE);
6894                 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
6895                 finish_wait(&ctx->wait, &iowq.wq);
6896         } while (ret > 0);
6897
6898         restore_saved_sigmask_unless(ret == -EINTR);
6899
6900         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
6901 }
6902
6903 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6904 {
6905 #if defined(CONFIG_UNIX)
6906         if (ctx->ring_sock) {
6907                 struct sock *sock = ctx->ring_sock->sk;
6908                 struct sk_buff *skb;
6909
6910                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6911                         kfree_skb(skb);
6912         }
6913 #else
6914         int i;
6915
6916         for (i = 0; i < ctx->nr_user_files; i++) {
6917                 struct file *file;
6918
6919                 file = io_file_from_index(ctx, i);
6920                 if (file)
6921                         fput(file);
6922         }
6923 #endif
6924 }
6925
6926 static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
6927 {
6928         struct fixed_rsrc_data *data;
6929
6930         data = container_of(ref, struct fixed_rsrc_data, refs);
6931         complete(&data->done);
6932 }
6933
/*
 * Take ctx->rsrc_ref_lock with bottom halves disabled. Callers in this file
 * hold it while touching fixed_rsrc_data->node and ctx->rsrc_ref_list.
 */
static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
{
        spin_lock_bh(&ctx->rsrc_ref_lock);
}
6938
/*
 * Release ctx->rsrc_ref_lock and re-enable bottom halves; counterpart of
 * io_rsrc_ref_lock().
 */
static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
{
        spin_unlock_bh(&ctx->rsrc_ref_lock);
}
6943
/*
 * Make @ref_node the current node of @rsrc_data.
 *
 * Under the rsrc ref lock, the node is recorded in rsrc_data->node and
 * appended to ctx->rsrc_ref_list. Afterwards a reference is taken on
 * rsrc_data->refs; __io_rsrc_put_work() drops one such reference when it
 * frees a node.
 */
static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
                                 struct fixed_rsrc_data *rsrc_data,
                                 struct fixed_rsrc_ref_node *ref_node)
{
        io_rsrc_ref_lock(ctx);
        rsrc_data->node = ref_node;
        list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
        io_rsrc_ref_unlock(ctx);
        percpu_ref_get(&rsrc_data->refs);
}
6954
6955 static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
6956 {
6957         struct fixed_rsrc_ref_node *ref_node = NULL;
6958
6959         io_rsrc_ref_lock(ctx);
6960         ref_node = data->node;
6961         data->node = NULL;
6962         io_rsrc_ref_unlock(ctx);
6963         if (ref_node)
6964                 percpu_ref_kill(&ref_node->refs);
6965 }
6966
/*
 * Wait until all references on @data have been dropped.
 *
 * @rsrc_put is stored in the backup ref node that gets installed if the
 * wait has to be restarted.
 *
 * ctx->uring_lock is released and re-acquired while task work runs, so the
 * caller is expected to hold it on entry. data->quiesce marks a quiesce in
 * progress and makes a second, concurrent caller fail.
 *
 * Returns 0 once data->done has completed, -ENXIO if a quiesce is already
 * in progress, -ENOMEM if the backup node cannot be allocated, or the
 * negative value reported by io_run_task_work_sig().
 */
static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
                               struct io_ring_ctx *ctx,
                               void (*rsrc_put)(struct io_ring_ctx *ctx,
                                                struct io_rsrc_put *prsrc))
{
        struct fixed_rsrc_ref_node *backup_node;
        int ret;

        if (data->quiesce)
                return -ENXIO;

        data->quiesce = true;
        do {
                /* allocate the replacement node up front, before killing refs */
                ret = -ENOMEM;
                backup_node = alloc_fixed_rsrc_ref_node(ctx);
                if (!backup_node)
                        break;
                backup_node->rsrc_data = data;
                backup_node->rsrc_put = rsrc_put;

                /* kill the current node and the data ref, push out pending puts */
                io_sqe_rsrc_kill_node(ctx, data);
                percpu_ref_kill(&data->refs);
                flush_delayed_work(&ctx->rsrc_put_work);

                ret = wait_for_completion_interruptible(&data->done);
                if (!ret)
                        break;

                /*
                 * The wait was interrupted: bring the data ref back to life
                 * and install the backup node so the resource stays usable,
                 * then run task work with the uring lock dropped before
                 * trying again.
                 */
                percpu_ref_resurrect(&data->refs);
                io_sqe_rsrc_set_node(ctx, data, backup_node);
                backup_node = NULL;
                reinit_completion(&data->done);
                mutex_unlock(&ctx->uring_lock);
                ret = io_run_task_work_sig();
                mutex_lock(&ctx->uring_lock);
        } while (ret >= 0);
        data->quiesce = false;

        /* still set only if the backup node was never installed */
        if (backup_node)
                destroy_fixed_rsrc_ref_node(backup_node);
        return ret;
}
7009
7010 static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
7011 {
7012         struct fixed_rsrc_data *data;
7013
7014         data = kzalloc(sizeof(*data), GFP_KERNEL);
7015         if (!data)
7016                 return NULL;
7017
7018         if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
7019                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
7020                 kfree(data);
7021                 return NULL;
7022         }
7023         data->ctx = ctx;
7024         init_completion(&data->done);
7025         return data;
7026 }
7027
/*
 * Tear down @data: exit its percpu ref, then free the table array and the
 * structure itself. The per-table ->files arrays are not freed here; the
 * callers in this file release them before calling this function.
 */
static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
{
        percpu_ref_exit(&data->refs);
        kfree(data->table);
        kfree(data);
}
7034
7035 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7036 {
7037         struct fixed_rsrc_data *data = ctx->file_data;
7038         unsigned nr_tables, i;
7039         int ret;
7040
7041         /*
7042          * percpu_ref_is_dying() is to stop parallel files unregister
7043          * Since we possibly drop uring lock later in this function to
7044          * run task work.
7045          */
7046         if (!data || percpu_ref_is_dying(&data->refs))
7047                 return -ENXIO;
7048         ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
7049         if (ret)
7050                 return ret;
7051
7052         __io_sqe_files_unregister(ctx);
7053         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
7054         for (i = 0; i < nr_tables; i++)
7055                 kfree(data->table[i].files);
7056         free_fixed_rsrc_data(data);
7057         ctx->file_data = NULL;
7058         ctx->nr_user_files = 0;
7059         return 0;
7060 }
7061
/*
 * Undo io_sq_thread_park(): clear the park request, wake the SQ thread out
 * of TASK_PARKED and release sqd->lock.
 *
 * Returns early, without touching the lock, when there is no SQ thread or
 * when called from the SQ thread itself (io_sq_thread_park() does not take
 * the lock in the latter case either).
 */
static void io_sq_thread_unpark(struct io_sq_data *sqd)
        __releases(&sqd->lock)
{
        if (!sqd->thread)
                return;
        if (sqd->thread == current)
                return;
        clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        wake_up_state(sqd->thread, TASK_PARKED);
        mutex_unlock(&sqd->lock);
}
7073
/*
 * Ask the SQ thread to park and wait until it signals sqd->completion.
 *
 * Returns true when called from the SQ thread itself (nothing is done and
 * no lock is taken) or once the thread has acknowledged the request; in the
 * latter case sqd->lock is left held for io_sq_thread_unpark() to release.
 * Returns false, with the lock dropped, if there is no SQ thread.
 */
static bool io_sq_thread_park(struct io_sq_data *sqd)
        __acquires(&sqd->lock)
{
        if (sqd->thread == current)
                return true;
        mutex_lock(&sqd->lock);
        if (!sqd->thread) {
                mutex_unlock(&sqd->lock);
                return false;
        }
        set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        wake_up_process(sqd->thread);
        wait_for_completion(&sqd->completion);
        return true;
}
7089
7090 static void io_sq_thread_stop(struct io_sq_data *sqd)
7091 {
7092         if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state))
7093                 return;
7094         mutex_lock(&sqd->lock);
7095         if (sqd->thread) {
7096                 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7097                 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
7098                 wake_up_process(sqd->thread);
7099                 mutex_unlock(&sqd->lock);
7100                 wait_for_completion(&sqd->exited);
7101                 WARN_ON_ONCE(sqd->thread);
7102         } else {
7103                 mutex_unlock(&sqd->lock);
7104         }
7105 }
7106
7107 static void io_put_sq_data(struct io_sq_data *sqd)
7108 {
7109         if (refcount_dec_and_test(&sqd->refs)) {
7110                 io_sq_thread_stop(sqd);
7111                 kfree(sqd);
7112         }
7113 }
7114
7115 static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7116 {
7117         struct io_sq_data *sqd = ctx->sq_data;
7118
7119         if (sqd) {
7120                 complete(&sqd->startup);
7121                 if (sqd->thread) {
7122                         wait_for_completion(&ctx->sq_thread_comp);
7123                         io_sq_thread_park(sqd);
7124                 }
7125
7126                 mutex_lock(&sqd->ctx_lock);
7127                 list_del(&ctx->sqd_list);
7128                 io_sqd_update_thread_idle(sqd);
7129                 mutex_unlock(&sqd->ctx_lock);
7130
7131                 if (sqd->thread)
7132                         io_sq_thread_unpark(sqd);
7133
7134                 io_put_sq_data(sqd);
7135                 ctx->sq_data = NULL;
7136         }
7137 }
7138
7139 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7140 {
7141         struct io_ring_ctx *ctx_attach;
7142         struct io_sq_data *sqd;
7143         struct fd f;
7144
7145         f = fdget(p->wq_fd);
7146         if (!f.file)
7147                 return ERR_PTR(-ENXIO);
7148         if (f.file->f_op != &io_uring_fops) {
7149                 fdput(f);
7150                 return ERR_PTR(-EINVAL);
7151         }
7152
7153         ctx_attach = f.file->private_data;
7154         sqd = ctx_attach->sq_data;
7155         if (!sqd) {
7156                 fdput(f);
7157                 return ERR_PTR(-EINVAL);
7158         }
7159
7160         refcount_inc(&sqd->refs);
7161         fdput(f);
7162         return sqd;
7163 }
7164
7165 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
7166 {
7167         struct io_sq_data *sqd;
7168
7169         if (p->flags & IORING_SETUP_ATTACH_WQ)
7170                 return io_attach_sq_data(p);
7171
7172         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7173         if (!sqd)
7174                 return ERR_PTR(-ENOMEM);
7175
7176         refcount_set(&sqd->refs, 1);
7177         INIT_LIST_HEAD(&sqd->ctx_list);
7178         INIT_LIST_HEAD(&sqd->ctx_new_list);
7179         mutex_init(&sqd->ctx_lock);
7180         mutex_init(&sqd->lock);
7181         init_waitqueue_head(&sqd->wait);
7182         init_completion(&sqd->startup);
7183         init_completion(&sqd->completion);
7184         init_completion(&sqd->exited);
7185         return sqd;
7186 }
7187
7188 #if defined(CONFIG_UNIX)
7189 /*
7190  * Ensure the UNIX gc is aware of our file set, so we are certain that
7191  * the io_uring can be safely unregistered on process exit, even if we have
7192  * loops in the file referencing.
7193  */
7194 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7195 {
7196         struct sock *sk = ctx->ring_sock->sk;
7197         struct scm_fp_list *fpl;
7198         struct sk_buff *skb;
7199         int i, nr_files;
7200
7201         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7202         if (!fpl)
7203                 return -ENOMEM;
7204
7205         skb = alloc_skb(0, GFP_KERNEL);
7206         if (!skb) {
7207                 kfree(fpl);
7208                 return -ENOMEM;
7209         }
7210
7211         skb->sk = sk;
7212
7213         nr_files = 0;
7214         fpl->user = get_uid(current_user());
7215         for (i = 0; i < nr; i++) {
7216                 struct file *file = io_file_from_index(ctx, i + offset);
7217
7218                 if (!file)
7219                         continue;
7220                 fpl->fp[nr_files] = get_file(file);
7221                 unix_inflight(fpl->user, fpl->fp[nr_files]);
7222                 nr_files++;
7223         }
7224
7225         if (nr_files) {
7226                 fpl->max = SCM_MAX_FD;
7227                 fpl->count = nr_files;
7228                 UNIXCB(skb).fp = fpl;
7229                 skb->destructor = unix_destruct_scm;
7230                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7231                 skb_queue_head(&sk->sk_receive_queue, skb);
7232
7233                 for (i = 0; i < nr_files; i++)
7234                         fput(fpl->fp[i]);
7235         } else {
7236                 kfree_skb(skb);
7237                 kfree(fpl);
7238         }
7239
7240         return 0;
7241 }
7242
7243 /*
7244  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7245  * causes regular reference counting to break down. We rely on the UNIX
7246  * garbage collection to take care of this problem for us.
7247  */
7248 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7249 {
7250         unsigned left, total;
7251         int ret = 0;
7252
7253         total = 0;
7254         left = ctx->nr_user_files;
7255         while (left) {
7256                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
7257
7258                 ret = __io_sqe_files_scm(ctx, this_files, total);
7259                 if (ret)
7260                         break;
7261                 left -= this_files;
7262                 total += this_files;
7263         }
7264
7265         if (!ret)
7266                 return 0;
7267
7268         while (total < ctx->nr_user_files) {
7269                 struct file *file = io_file_from_index(ctx, total);
7270
7271                 if (file)
7272                         fput(file);
7273                 total++;
7274         }
7275
7276         return ret;
7277 }
7278 #else
/*
 * Without CONFIG_UNIX there is no SCM accounting to do; always succeeds.
 */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
        return 0;
}
7283 #endif
7284
7285 static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
7286                                     unsigned nr_tables, unsigned nr_files)
7287 {
7288         int i;
7289
7290         for (i = 0; i < nr_tables; i++) {
7291                 struct fixed_rsrc_table *table = &file_data->table[i];
7292                 unsigned this_files;
7293
7294                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7295                 table->files = kcalloc(this_files, sizeof(struct file *),
7296                                         GFP_KERNEL);
7297                 if (!table->files)
7298                         break;
7299                 nr_files -= this_files;
7300         }
7301
7302         if (i == nr_tables)
7303                 return 0;
7304
7305         for (i = 0; i < nr_tables; i++) {
7306                 struct fixed_rsrc_table *table = &file_data->table[i];
7307                 kfree(table->files);
7308         }
7309         return 1;
7310 }
7311
/*
 * Release the registered file described by @prsrc.
 *
 * With CONFIG_UNIX the file is referenced from the SCM_RIGHTS fp list of an
 * skb on the ring socket's receive queue: that skb is located, the file is
 * removed from its list and then fput(). Without CONFIG_UNIX this is a
 * plain fput().
 */
static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head list, *head = &sock->sk_receive_queue;
        struct sk_buff *skb;
        int i;

        /* private list for the skbs that were inspected and must be kept */
        __skb_queue_head_init(&list);

        /*
         * Find the skb that holds this file in its SCM_RIGHTS. When found,
         * remove this entry and rearrange the file array.
         */
        skb = skb_dequeue(head);
        while (skb) {
                struct scm_fp_list *fp;

                fp = UNIXCB(skb).fp;
                for (i = 0; i < fp->count; i++) {
                        int left;

                        if (fp->fp[i] != file)
                                continue;

                        unix_notinflight(fp->user, fp->fp[i]);
                        /* close the gap left by the removed entry */
                        left = fp->count - 1 - i;
                        if (left) {
                                memmove(&fp->fp[i], &fp->fp[i + 1],
                                                left * sizeof(struct file *));
                        }
                        fp->count--;
                        if (!fp->count) {
                                /* that was the last file carried by this skb */
                                kfree_skb(skb);
                                skb = NULL;
                        } else {
                                __skb_queue_tail(&list, skb);
                        }
                        fput(file);
                        /* NULL doubles as the "found" marker for the outer loop */
                        file = NULL;
                        break;
                }

                if (!file)
                        break;

                /* no match in this skb: keep it aside and look at the next one */
                __skb_queue_tail(&list, skb);

                skb = skb_dequeue(head);
        }

        /* move the skbs that were set aside to the tail of the receive queue */
        if (skb_peek(&list)) {
                spin_lock_irq(&head->lock);
                while ((skb = __skb_dequeue(&list)) != NULL)
                        __skb_queue_tail(head, skb);
                spin_unlock_irq(&head->lock);
        }
#else
        fput(file);
#endif
}
7374
/*
 * Retire one ref node: release every resource queued on its rsrc_list via
 * the node's rsrc_put callback, free the node, and drop one reference on
 * the owning fixed_rsrc_data.
 */
static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
{
        struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
        struct io_ring_ctx *ctx = rsrc_data->ctx;
        struct io_rsrc_put *prsrc, *tmp;

        list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
                list_del(&prsrc->list);
                ref_node->rsrc_put(ctx, prsrc);
                kfree(prsrc);
        }

        percpu_ref_exit(&ref_node->refs);
        kfree(ref_node);
        /* rsrc_data was read up front because ref_node is already freed here */
        percpu_ref_put(&rsrc_data->refs);
}
7391
7392 static void io_rsrc_put_work(struct work_struct *work)
7393 {
7394         struct io_ring_ctx *ctx;
7395         struct llist_node *node;
7396
7397         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7398         node = llist_del_all(&ctx->rsrc_put_llist);
7399
7400         while (node) {
7401                 struct fixed_rsrc_ref_node *ref_node;
7402                 struct llist_node *next = node->next;
7403
7404                 ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
7405                 __io_rsrc_put_work(ref_node);
7406                 node = next;
7407         }
7408 }
7409
7410 static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
7411                                         unsigned i)
7412 {
7413         struct fixed_rsrc_table *table;
7414
7415         table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7416         return &table->files[i & IORING_FILE_TABLE_MASK];
7417 }
7418
7419 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7420 {
7421         struct fixed_rsrc_ref_node *ref_node;
7422         struct fixed_rsrc_data *data;
7423         struct io_ring_ctx *ctx;
7424         bool first_add = false;
7425         int delay = HZ;
7426
7427         ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
7428         data = ref_node->rsrc_data;
7429         ctx = data->ctx;
7430
7431         io_rsrc_ref_lock(ctx);
7432         ref_node->done = true;
7433
7434         while (!list_empty(&ctx->rsrc_ref_list)) {
7435                 ref_node = list_first_entry(&ctx->rsrc_ref_list,
7436                                         struct fixed_rsrc_ref_node, node);
7437                 /* recycle ref nodes in order */
7438                 if (!ref_node->done)
7439                         break;
7440                 list_del(&ref_node->node);
7441                 first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
7442         }
7443         io_rsrc_ref_unlock(ctx);
7444
7445         if (percpu_ref_is_dying(&data->refs))
7446                 delay = 0;
7447
7448         if (!delay)
7449                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
7450         else if (first_add)
7451                 queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
7452 }
7453
7454 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
7455                         struct io_ring_ctx *ctx)
7456 {
7457         struct fixed_rsrc_ref_node *ref_node;
7458
7459         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7460         if (!ref_node)
7461                 return NULL;
7462
7463         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7464                             0, GFP_KERNEL)) {
7465                 kfree(ref_node);
7466                 return NULL;
7467         }
7468         INIT_LIST_HEAD(&ref_node->node);
7469         INIT_LIST_HEAD(&ref_node->rsrc_list);
7470         ref_node->done = false;
7471         return ref_node;
7472 }
7473
/*
 * Bind @ref_node to the fixed file set of @ctx: point it at ctx->file_data
 * and make io_ring_file_put() its release callback.
 */
static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
                                     struct fixed_rsrc_ref_node *ref_node)
{
        ref_node->rsrc_data = ctx->file_data;
        ref_node->rsrc_put = io_ring_file_put;
}
7480
/*
 * Free a ref node: exits its percpu ref and frees the structure. Entries
 * on the node's rsrc_list are not touched here.
 */
static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
{
        percpu_ref_exit(&ref_node->refs);
        kfree(ref_node);
}
7486
7487
/*
 * Register a fixed file set for @ctx.
 *
 * @arg points to a user array of @nr_args __s32 file descriptors; an entry
 * of -1 leaves the corresponding slot empty (sparse set).
 *
 * Returns 0 on success, or:
 *   -EBUSY   a file set is already registered
 *   -EINVAL  @nr_args is zero
 *   -EMFILE  @nr_args exceeds IORING_MAX_FIXED_FILES
 *   -ENOMEM  an allocation failed
 *   -EFAULT  the fd array could not be read
 *   -EBADF   an fd is invalid or refers to an io_uring instance
 * or the error returned by io_sqe_files_scm().
 */
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                                 unsigned nr_args)
{
        __s32 __user *fds = (__s32 __user *) arg;
        unsigned nr_tables, i;
        struct file *file;
        int fd, ret = -ENOMEM;
        struct fixed_rsrc_ref_node *ref_node;
        struct fixed_rsrc_data *file_data;

        if (ctx->file_data)
                return -EBUSY;
        if (!nr_args)
                return -EINVAL;
        if (nr_args > IORING_MAX_FIXED_FILES)
                return -EMFILE;

        file_data = alloc_fixed_rsrc_data(ctx);
        if (!file_data)
                return -ENOMEM;
        ctx->file_data = file_data;

        nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
        file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
                                   GFP_KERNEL);
        if (!file_data->table)
                goto out_free;

        if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
                goto out_free;

        /*
         * ctx->nr_user_files advances with i, also for sparse entries, so
         * on error it tells out_fput how many slots have been processed.
         */
        for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
                if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
                        ret = -EFAULT;
                        goto out_fput;
                }
                /* allow sparse sets */
                if (fd == -1)
                        continue;

                file = fget(fd);
                ret = -EBADF;
                if (!file)
                        goto out_fput;

                /*
                 * Don't allow io_uring instances to be registered. If UNIX
                 * isn't enabled, then this causes a reference cycle and this
                 * instance can never get freed. If UNIX is enabled we'll
                 * handle it just fine, but there's still no point in allowing
                 * a ring fd as it doesn't support regular read/write anyway.
                 */
                if (file->f_op == &io_uring_fops) {
                        /* ret is still -EBADF from above */
                        fput(file);
                        goto out_fput;
                }
                *io_fixed_file_slot(file_data, i) = file;
        }

        ret = io_sqe_files_scm(ctx);
        if (ret) {
                io_sqe_files_unregister(ctx);
                return ret;
        }

        ref_node = alloc_fixed_rsrc_ref_node(ctx);
        if (!ref_node) {
                io_sqe_files_unregister(ctx);
                return -ENOMEM;
        }
        init_fixed_file_ref_node(ctx, ref_node);

        io_sqe_rsrc_set_node(ctx, file_data, ref_node);
        return ret;
out_fput:
        /* drop the references taken by fget() for the slots filled so far */
        for (i = 0; i < ctx->nr_user_files; i++) {
                file = io_file_from_index(ctx, i);
                if (file)
                        fput(file);
        }
        for (i = 0; i < nr_tables; i++)
                kfree(file_data->table[i].files);
        ctx->nr_user_files = 0;
out_free:
        free_fixed_rsrc_data(ctx->file_data);
        ctx->file_data = NULL;
        return ret;
}
7576
/*
 * Account a single newly registered @file, stored at table slot @index,
 * with the UNIX in-flight machinery.
 *
 * With CONFIG_UNIX, the file is appended to the fp list of the first skb on
 * the ring socket's receive queue if that list still has room; otherwise a
 * new skb is set up through __io_sqe_files_scm(). Without CONFIG_UNIX there
 * is nothing to do.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
                                int index)
{
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head *head = &sock->sk_receive_queue;
        struct sk_buff *skb;

        /*
         * See if we can merge this file into an existing skb SCM_RIGHTS
         * file set. If there's no room, fall back to allocating a new skb
         * and filling it in.
         */
        spin_lock_irq(&head->lock);
        skb = skb_peek(head);
        if (skb) {
                struct scm_fp_list *fpl = UNIXCB(skb).fp;

                if (fpl->count < SCM_MAX_FD) {
                        /*
                         * Take the skb off the queue so that the lock can be
                         * dropped while the fp list is updated, then put it
                         * back at the head.
                         */
                        __skb_unlink(skb, head);
                        spin_unlock_irq(&head->lock);
                        fpl->fp[fpl->count] = get_file(file);
                        unix_inflight(fpl->user, fpl->fp[fpl->count]);
                        fpl->count++;
                        spin_lock_irq(&head->lock);
                        __skb_queue_head(head, skb);
                } else {
                        skb = NULL;
                }
        }
        spin_unlock_irq(&head->lock);

        /* merged: the skb holds its own reference, drop one on @file */
        if (skb) {
                fput(file);
                return 0;
        }

        return __io_sqe_files_scm(ctx, 1, index);
#else
        return 0;
#endif
}
7619
7620 static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
7621 {
7622         struct io_rsrc_put *prsrc;
7623         struct fixed_rsrc_ref_node *ref_node = data->node;
7624
7625         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
7626         if (!prsrc)
7627                 return -ENOMEM;
7628
7629         prsrc->rsrc = rsrc;
7630         list_add(&prsrc->list, &ref_node->rsrc_list);
7631
7632         return 0;
7633 }
7634
/* Typed wrapper: queue a struct file through io_queue_rsrc_removal() */
static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
                                        struct file *file)
{
        void *rsrc = file;

        return io_queue_rsrc_removal(data, rsrc);
}
7640
7641 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7642                                  struct io_uring_rsrc_update *up,
7643                                  unsigned nr_args)
7644 {
7645         struct fixed_rsrc_data *data = ctx->file_data;
7646         struct fixed_rsrc_ref_node *ref_node;
7647         struct file *file, **file_slot;
7648         __s32 __user *fds;
7649         int fd, i, err;
7650         __u32 done;
7651         bool needs_switch = false;
7652
7653         if (check_add_overflow(up->offset, nr_args, &done))
7654                 return -EOVERFLOW;
7655         if (done > ctx->nr_user_files)
7656                 return -EINVAL;
7657
7658         ref_node = alloc_fixed_rsrc_ref_node(ctx);
7659         if (!ref_node)
7660                 return -ENOMEM;
7661         init_fixed_file_ref_node(ctx, ref_node);
7662
7663         fds = u64_to_user_ptr(up->data);
7664         for (done = 0; done < nr_args; done++) {
7665                 err = 0;
7666                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7667                         err = -EFAULT;
7668                         break;
7669                 }
7670                 if (fd == IORING_REGISTER_FILES_SKIP)
7671                         continue;
7672
7673                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
7674                 file_slot = io_fixed_file_slot(ctx->file_data, i);
7675
7676                 if (*file_slot) {
7677                         err = io_queue_file_removal(data, *file_slot);
7678                         if (err)
7679                                 break;
7680                         *file_slot = NULL;
7681                         needs_switch = true;
7682                 }
7683                 if (fd != -1) {
7684                         file = fget(fd);
7685                         if (!file) {
7686                                 err = -EBADF;
7687                                 break;
7688                         }
7689                         /*
7690                          * Don't allow io_uring instances to be registered. If
7691                          * UNIX isn't enabled, then this causes a reference
7692                          * cycle and this instance can never get freed. If UNIX
7693                          * is enabled we'll handle it just fine, but there's
7694                          * still no point in allowing a ring fd as it doesn't
7695                          * support regular read/write anyway.
7696                          */
7697                         if (file->f_op == &io_uring_fops) {
7698                                 fput(file);
7699                                 err = -EBADF;
7700                                 break;
7701                         }
7702                         *file_slot = file;
7703                         err = io_sqe_file_register(ctx, file, i);
7704                         if (err) {
7705                                 *file_slot = NULL;
7706                                 fput(file);
7707                                 break;
7708                         }
7709                 }
7710         }
7711
7712         if (needs_switch) {
7713                 percpu_ref_kill(&data->node->refs);
7714                 io_sqe_rsrc_set_node(ctx, data, ref_node);
7715         } else
7716                 destroy_fixed_rsrc_ref_node(ref_node);
7717
7718         return done ? done : err;
7719 }
7720
7721 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
7722                                unsigned nr_args)
7723 {
7724         struct io_uring_rsrc_update up;
7725
7726         if (!ctx->file_data)
7727                 return -ENXIO;
7728         if (!nr_args)
7729                 return -EINVAL;
7730         if (copy_from_user(&up, arg, sizeof(up)))
7731                 return -EFAULT;
7732         if (up.resv)
7733                 return -EINVAL;
7734
7735         return __io_sqe_files_update(ctx, &up, nr_args);
7736 }
7737
7738 static struct io_wq_work *io_free_work(struct io_wq_work *work)
7739 {
7740         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7741
7742         req = io_put_req_find_next(req);
7743         return req ? &req->work : NULL;
7744 }
7745
7746 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
7747 {
7748         struct io_wq_hash *hash;
7749         struct io_wq_data data;
7750         unsigned int concurrency;
7751
7752         hash = ctx->hash_map;
7753         if (!hash) {
7754                 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
7755                 if (!hash)
7756                         return ERR_PTR(-ENOMEM);
7757                 refcount_set(&hash->refs, 1);
7758                 init_waitqueue_head(&hash->wait);
7759                 ctx->hash_map = hash;
7760         }
7761
7762         data.hash = hash;
7763         data.free_work = io_free_work;
7764         data.do_work = io_wq_submit_work;
7765
7766         /* Do QD, or 4 * CPUS, whatever is smallest */
7767         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7768
7769         return io_wq_create(concurrency, &data);
7770 }
7771
7772 static int io_uring_alloc_task_context(struct task_struct *task,
7773                                        struct io_ring_ctx *ctx)
7774 {
7775         struct io_uring_task *tctx;
7776         int ret;
7777
7778         tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
7779         if (unlikely(!tctx))
7780                 return -ENOMEM;
7781
7782         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
7783         if (unlikely(ret)) {
7784                 kfree(tctx);
7785                 return ret;
7786         }
7787
7788         tctx->io_wq = io_init_wq_offload(ctx);
7789         if (IS_ERR(tctx->io_wq)) {
7790                 ret = PTR_ERR(tctx->io_wq);
7791                 percpu_counter_destroy(&tctx->inflight);
7792                 kfree(tctx);
7793                 return ret;
7794         }
7795
7796         xa_init(&tctx->xa);
7797         init_waitqueue_head(&tctx->wait);
7798         tctx->last = NULL;
7799         atomic_set(&tctx->in_idle, 0);
7800         tctx->sqpoll = false;
7801         task->io_uring = tctx;
7802         spin_lock_init(&tctx->task_lock);
7803         INIT_WQ_LIST(&tctx->task_list);
7804         tctx->task_state = 0;
7805         init_task_work(&tctx->task_work, tctx_task_work);
7806         return 0;
7807 }
7808
7809 void __io_uring_free(struct task_struct *tsk)
7810 {
7811         struct io_uring_task *tctx = tsk->io_uring;
7812
7813         WARN_ON_ONCE(!xa_empty(&tctx->xa));
7814         WARN_ON_ONCE(tctx->io_wq);
7815
7816         percpu_counter_destroy(&tctx->inflight);
7817         kfree(tctx);
7818         tsk->io_uring = NULL;
7819 }
7820
7821 static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
7822 {
7823         int ret;
7824
7825         clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7826         reinit_completion(&sqd->completion);
7827         ctx->sqo_exec = 0;
7828         sqd->task_pid = current->pid;
7829         current->flags |= PF_IO_WORKER;
7830         ret = io_wq_fork_thread(io_sq_thread, sqd);
7831         current->flags &= ~PF_IO_WORKER;
7832         if (ret < 0) {
7833                 sqd->thread = NULL;
7834                 return ret;
7835         }
7836         wait_for_completion(&sqd->completion);
7837         return io_uring_alloc_task_context(sqd->thread, ctx);
7838 }
7839
7840 static int io_sq_offload_create(struct io_ring_ctx *ctx,
7841                                 struct io_uring_params *p)
7842 {
7843         int ret;
7844
7845         /* Retain compatibility with failing for an invalid attach attempt */
7846         if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
7847                                 IORING_SETUP_ATTACH_WQ) {
7848                 struct fd f;
7849
7850                 f = fdget(p->wq_fd);
7851                 if (!f.file)
7852                         return -ENXIO;
7853                 if (f.file->f_op != &io_uring_fops) {
7854                         fdput(f);
7855                         return -EINVAL;
7856                 }
7857                 fdput(f);
7858         }
7859         if (ctx->flags & IORING_SETUP_SQPOLL) {
7860                 struct io_sq_data *sqd;
7861
7862                 ret = -EPERM;
7863                 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
7864                         goto err;
7865
7866                 sqd = io_get_sq_data(p);
7867                 if (IS_ERR(sqd)) {
7868                         ret = PTR_ERR(sqd);
7869                         goto err;
7870                 }
7871
7872                 ctx->sq_data = sqd;
7873                 io_sq_thread_park(sqd);
7874                 mutex_lock(&sqd->ctx_lock);
7875                 list_add(&ctx->sqd_list, &sqd->ctx_new_list);
7876                 mutex_unlock(&sqd->ctx_lock);
7877                 io_sq_thread_unpark(sqd);
7878
7879                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
7880                 if (!ctx->sq_thread_idle)
7881                         ctx->sq_thread_idle = HZ;
7882
7883                 if (sqd->thread)
7884                         return 0;
7885
7886                 if (p->flags & IORING_SETUP_SQ_AFF) {
7887                         int cpu = p->sq_thread_cpu;
7888
7889                         ret = -EINVAL;
7890                         if (cpu >= nr_cpu_ids)
7891                                 goto err;
7892                         if (!cpu_online(cpu))
7893                                 goto err;
7894
7895                         sqd->sq_cpu = cpu;
7896                 } else {
7897                         sqd->sq_cpu = -1;
7898                 }
7899
7900                 sqd->task_pid = current->pid;
7901                 current->flags |= PF_IO_WORKER;
7902                 ret = io_wq_fork_thread(io_sq_thread, sqd);
7903                 current->flags &= ~PF_IO_WORKER;
7904                 if (ret < 0) {
7905                         sqd->thread = NULL;
7906                         goto err;
7907                 }
7908                 wait_for_completion(&sqd->completion);
7909                 ret = io_uring_alloc_task_context(sqd->thread, ctx);
7910                 if (ret)
7911                         goto err;
7912         } else if (p->flags & IORING_SETUP_SQ_AFF) {
7913                 /* Can't have SQ_AFF without SQPOLL */
7914                 ret = -EINVAL;
7915                 goto err;
7916         }
7917
7918         return 0;
7919 err:
7920         io_sq_thread_finish(ctx);
7921         return ret;
7922 }
7923
7924 static void io_sq_offload_start(struct io_ring_ctx *ctx)
7925 {
7926         struct io_sq_data *sqd = ctx->sq_data;
7927
7928         ctx->flags &= ~IORING_SETUP_R_DISABLED;
7929         if (ctx->flags & IORING_SETUP_SQPOLL)
7930                 complete(&sqd->startup);
7931 }
7932
7933 static inline void __io_unaccount_mem(struct user_struct *user,
7934                                       unsigned long nr_pages)
7935 {
7936         atomic_long_sub(nr_pages, &user->locked_vm);
7937 }
7938
7939 static inline int __io_account_mem(struct user_struct *user,
7940                                    unsigned long nr_pages)
7941 {
7942         unsigned long page_limit, cur_pages, new_pages;
7943
7944         /* Don't allow more pages than we can safely lock */
7945         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7946
7947         do {
7948                 cur_pages = atomic_long_read(&user->locked_vm);
7949                 new_pages = cur_pages + nr_pages;
7950                 if (new_pages > page_limit)
7951                         return -ENOMEM;
7952         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7953                                         new_pages) != cur_pages);
7954
7955         return 0;
7956 }
7957
7958 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
7959 {
7960         if (ctx->user)
7961                 __io_unaccount_mem(ctx->user, nr_pages);
7962
7963         if (ctx->mm_account)
7964                 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
7965 }
7966
7967 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
7968 {
7969         int ret;
7970
7971         if (ctx->user) {
7972                 ret = __io_account_mem(ctx->user, nr_pages);
7973                 if (ret)
7974                         return ret;
7975         }
7976
7977         if (ctx->mm_account)
7978                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
7979
7980         return 0;
7981 }
7982
7983 static void io_mem_free(void *ptr)
7984 {
7985         struct page *page;
7986
7987         if (!ptr)
7988                 return;
7989
7990         page = virt_to_head_page(ptr);
7991         if (put_page_testzero(page))
7992                 free_compound_page(page);
7993 }
7994
7995 static void *io_mem_alloc(size_t size)
7996 {
7997         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
7998                                 __GFP_NORETRY | __GFP_ACCOUNT;
7999
8000         return (void *) __get_free_pages(gfp_flags, get_order(size));
8001 }
8002
8003 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8004                                 size_t *sq_offset)
8005 {
8006         struct io_rings *rings;
8007         size_t off, sq_array_size;
8008
8009         off = struct_size(rings, cqes, cq_entries);
8010         if (off == SIZE_MAX)
8011                 return SIZE_MAX;
8012
8013 #ifdef CONFIG_SMP
8014         off = ALIGN(off, SMP_CACHE_BYTES);
8015         if (off == 0)
8016                 return SIZE_MAX;
8017 #endif
8018
8019         if (sq_offset)
8020                 *sq_offset = off;
8021
8022         sq_array_size = array_size(sizeof(u32), sq_entries);
8023         if (sq_array_size == SIZE_MAX)
8024                 return SIZE_MAX;
8025
8026         if (check_add_overflow(off, sq_array_size, &off))
8027                 return SIZE_MAX;
8028
8029         return off;
8030 }
8031
8032 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8033 {
8034         int i, j;
8035
8036         if (!ctx->user_bufs)
8037                 return -ENXIO;
8038
8039         for (i = 0; i < ctx->nr_user_bufs; i++) {
8040                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8041
8042                 for (j = 0; j < imu->nr_bvecs; j++)
8043                         unpin_user_page(imu->bvec[j].bv_page);
8044
8045                 if (imu->acct_pages)
8046                         io_unaccount_mem(ctx, imu->acct_pages);
8047                 kvfree(imu->bvec);
8048                 imu->nr_bvecs = 0;
8049         }
8050
8051         kfree(ctx->user_bufs);
8052         ctx->user_bufs = NULL;
8053         ctx->nr_user_bufs = 0;
8054         return 0;
8055 }
8056
8057 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8058                        void __user *arg, unsigned index)
8059 {
8060         struct iovec __user *src;
8061
8062 #ifdef CONFIG_COMPAT
8063         if (ctx->compat) {
8064                 struct compat_iovec __user *ciovs;
8065                 struct compat_iovec ciov;
8066
8067                 ciovs = (struct compat_iovec __user *) arg;
8068                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8069                         return -EFAULT;
8070
8071                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8072                 dst->iov_len = ciov.iov_len;
8073                 return 0;
8074         }
8075 #endif
8076         src = (struct iovec __user *) arg;
8077         if (copy_from_user(dst, &src[index], sizeof(*dst)))
8078                 return -EFAULT;
8079         return 0;
8080 }
8081
8082 /*
8083  * Not super efficient, but this is just a registration time. And we do cache
8084  * the last compound head, so generally we'll only do a full search if we don't
8085  * match that one.
8086  *
8087  * We check if the given compound head page has already been accounted, to
8088  * avoid double accounting it. This allows us to account the full size of the
8089  * page, not just the constituent pages of a huge page.
8090  */
8091 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8092                                   int nr_pages, struct page *hpage)
8093 {
8094         int i, j;
8095
8096         /* check current page array */
8097         for (i = 0; i < nr_pages; i++) {
8098                 if (!PageCompound(pages[i]))
8099                         continue;
8100                 if (compound_head(pages[i]) == hpage)
8101                         return true;
8102         }
8103
8104         /* check previously registered pages */
8105         for (i = 0; i < ctx->nr_user_bufs; i++) {
8106                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8107
8108                 for (j = 0; j < imu->nr_bvecs; j++) {
8109                         if (!PageCompound(imu->bvec[j].bv_page))
8110                                 continue;
8111                         if (compound_head(imu->bvec[j].bv_page) == hpage)
8112                                 return true;
8113                 }
8114         }
8115
8116         return false;
8117 }
8118
8119 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8120                                  int nr_pages, struct io_mapped_ubuf *imu,
8121                                  struct page **last_hpage)
8122 {
8123         int i, ret;
8124
8125         for (i = 0; i < nr_pages; i++) {
8126                 if (!PageCompound(pages[i])) {
8127                         imu->acct_pages++;
8128                 } else {
8129                         struct page *hpage;
8130
8131                         hpage = compound_head(pages[i]);
8132                         if (hpage == *last_hpage)
8133                                 continue;
8134                         *last_hpage = hpage;
8135                         if (headpage_already_acct(ctx, pages, i, hpage))
8136                                 continue;
8137                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8138                 }
8139         }
8140
8141         if (!imu->acct_pages)
8142                 return 0;
8143
8144         ret = io_account_mem(ctx, imu->acct_pages);
8145         if (ret)
8146                 imu->acct_pages = 0;
8147         return ret;
8148 }
8149
8150 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8151                                   struct io_mapped_ubuf *imu,
8152                                   struct page **last_hpage)
8153 {
8154         struct vm_area_struct **vmas = NULL;
8155         struct page **pages = NULL;
8156         unsigned long off, start, end, ubuf;
8157         size_t size;
8158         int ret, pret, nr_pages, i;
8159
8160         ubuf = (unsigned long) iov->iov_base;
8161         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8162         start = ubuf >> PAGE_SHIFT;
8163         nr_pages = end - start;
8164
8165         ret = -ENOMEM;
8166
8167         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8168         if (!pages)
8169                 goto done;
8170
8171         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8172                               GFP_KERNEL);
8173         if (!vmas)
8174                 goto done;
8175
8176         imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
8177                                    GFP_KERNEL);
8178         if (!imu->bvec)
8179                 goto done;
8180
8181         ret = 0;
8182         mmap_read_lock(current->mm);
8183         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8184                               pages, vmas);
8185         if (pret == nr_pages) {
8186                 /* don't support file backed memory */
8187                 for (i = 0; i < nr_pages; i++) {
8188                         struct vm_area_struct *vma = vmas[i];
8189
8190                         if (vma->vm_file &&
8191                             !is_file_hugepages(vma->vm_file)) {
8192                                 ret = -EOPNOTSUPP;
8193                                 break;
8194                         }
8195                 }
8196         } else {
8197                 ret = pret < 0 ? pret : -EFAULT;
8198         }
8199         mmap_read_unlock(current->mm);
8200         if (ret) {
8201                 /*
8202                  * if we did partial map, or found file backed vmas,
8203                  * release any pages we did get
8204                  */
8205                 if (pret > 0)
8206                         unpin_user_pages(pages, pret);
8207                 kvfree(imu->bvec);
8208                 goto done;
8209         }
8210
8211         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8212         if (ret) {
8213                 unpin_user_pages(pages, pret);
8214                 kvfree(imu->bvec);
8215                 goto done;
8216         }
8217
8218         off = ubuf & ~PAGE_MASK;
8219         size = iov->iov_len;
8220         for (i = 0; i < nr_pages; i++) {
8221                 size_t vec_len;
8222
8223                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8224                 imu->bvec[i].bv_page = pages[i];
8225                 imu->bvec[i].bv_len = vec_len;
8226                 imu->bvec[i].bv_offset = off;
8227                 off = 0;
8228                 size -= vec_len;
8229         }
8230         /* store original address for later verification */
8231         imu->ubuf = ubuf;
8232         imu->len = iov->iov_len;
8233         imu->nr_bvecs = nr_pages;
8234         ret = 0;
8235 done:
8236         kvfree(pages);
8237         kvfree(vmas);
8238         return ret;
8239 }
8240
8241 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
8242 {
8243         if (ctx->user_bufs)
8244                 return -EBUSY;
8245         if (!nr_args || nr_args > UIO_MAXIOV)
8246                 return -EINVAL;
8247
8248         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
8249                                         GFP_KERNEL);
8250         if (!ctx->user_bufs)
8251                 return -ENOMEM;
8252
8253         return 0;
8254 }
8255
8256 static int io_buffer_validate(struct iovec *iov)
8257 {
8258         /*
8259          * Don't impose further limits on the size and buffer
8260          * constraints here, we'll -EINVAL later when IO is
8261          * submitted if they are wrong.
8262          */
8263         if (!iov->iov_base || !iov->iov_len)
8264                 return -EFAULT;
8265
8266         /* arbitrary limit, but we need something */
8267         if (iov->iov_len > SZ_1G)
8268                 return -EFAULT;
8269
8270         return 0;
8271 }
8272
8273 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
8274                                    unsigned int nr_args)
8275 {
8276         int i, ret;
8277         struct iovec iov;
8278         struct page *last_hpage = NULL;
8279
8280         ret = io_buffers_map_alloc(ctx, nr_args);
8281         if (ret)
8282                 return ret;
8283
8284         for (i = 0; i < nr_args; i++) {
8285                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8286
8287                 ret = io_copy_iov(ctx, &iov, arg, i);
8288                 if (ret)
8289                         break;
8290
8291                 ret = io_buffer_validate(&iov);
8292                 if (ret)
8293                         break;
8294
8295                 ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
8296                 if (ret)
8297                         break;
8298
8299                 ctx->nr_user_bufs++;
8300         }
8301
8302         if (ret)
8303                 io_sqe_buffers_unregister(ctx);
8304
8305         return ret;
8306 }
8307
8308 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8309 {
8310         __s32 __user *fds = arg;
8311         int fd;
8312
8313         if (ctx->cq_ev_fd)
8314                 return -EBUSY;
8315
8316         if (copy_from_user(&fd, fds, sizeof(*fds)))
8317                 return -EFAULT;
8318
8319         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8320         if (IS_ERR(ctx->cq_ev_fd)) {
8321                 int ret = PTR_ERR(ctx->cq_ev_fd);
8322                 ctx->cq_ev_fd = NULL;
8323                 return ret;
8324         }
8325
8326         return 0;
8327 }
8328
8329 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8330 {
8331         if (ctx->cq_ev_fd) {
8332                 eventfd_ctx_put(ctx->cq_ev_fd);
8333                 ctx->cq_ev_fd = NULL;
8334                 return 0;
8335         }
8336
8337         return -ENXIO;
8338 }
8339
/*
 * idr_for_each() callback: @p is the io_buffer stored under @id and @data
 * is the ring context. Removes up to -1U buffers via __io_remove_buffers()
 * and always returns 0 so that the iteration continues.
 */
static int __io_destroy_buffers(int id, void *p, void *data)
{
        struct io_buffer *first = p;
        struct io_ring_ctx *ring = data;

        __io_remove_buffers(ring, first, id, -1U);
        return 0;
}
8348
8349 static void io_destroy_buffers(struct io_ring_ctx *ctx)
8350 {
8351         idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
8352         idr_destroy(&ctx->io_buffer_idr);
8353 }
8354
8355 static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
8356 {
8357         struct io_kiocb *req, *nxt;
8358
8359         list_for_each_entry_safe(req, nxt, list, compl.list) {
8360                 if (tsk && req->task != tsk)
8361                         continue;
8362                 list_del(&req->compl.list);
8363                 kmem_cache_free(req_cachep, req);
8364         }
8365 }
8366
8367 static void io_req_caches_free(struct io_ring_ctx *ctx)
8368 {
8369         struct io_submit_state *submit_state = &ctx->submit_state;
8370         struct io_comp_state *cs = &ctx->submit_state.comp;
8371
8372         mutex_lock(&ctx->uring_lock);
8373
8374         if (submit_state->free_reqs) {
8375                 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
8376                                      submit_state->reqs);
8377                 submit_state->free_reqs = 0;
8378         }
8379
8380         spin_lock_irq(&ctx->completion_lock);
8381         list_splice_init(&cs->locked_free_list, &cs->free_list);
8382         cs->locked_free_nr = 0;
8383         spin_unlock_irq(&ctx->completion_lock);
8384
8385         io_req_cache_free(&cs->free_list, NULL);
8386
8387         mutex_unlock(&ctx->uring_lock);
8388 }
8389
/*
 * Final teardown of a ring context: release every resource still attached
 * to @ctx and free the context itself. @ctx must not be used afterwards.
 */
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
        /*
         * Some may use context even when all refs and requests have been put,
         * and they are free to do so while still holding uring_lock, see
         * __io_req_task_submit(). Wait for them to finish.
         */
        mutex_lock(&ctx->uring_lock);
        mutex_unlock(&ctx->uring_lock);

        io_sq_thread_finish(ctx);
        /*
         * Unregistering buffers unaccounts their pages, which also updates
         * ctx->mm_account, so the mm reference is only dropped afterwards.
         * A return of -ENXIO (nothing registered) is fine here.
         */
        io_sqe_buffers_unregister(ctx);

        if (ctx->mm_account) {
                mmdrop(ctx->mm_account);
                ctx->mm_account = NULL;
        }

        /* files are unregistered with uring_lock held */
        mutex_lock(&ctx->uring_lock);
        io_sqe_files_unregister(ctx);
        mutex_unlock(&ctx->uring_lock);
        /* -ENXIO from here just means no eventfd was registered */
        io_eventfd_unregister(ctx);
        io_destroy_buffers(ctx);
        idr_destroy(&ctx->personality_idr);

#if defined(CONFIG_UNIX)
        if (ctx->ring_sock) {
                ctx->ring_sock->file = NULL; /* so that iput() is called */
                sock_release(ctx->ring_sock);
        }
#endif

        /* io_mem_free() ignores NULL, so unset ring memory is fine */
        io_mem_free(ctx->rings);
        io_mem_free(ctx->sq_sqes);

        percpu_ref_exit(&ctx->refs);
        free_uid(ctx->user);
        /* takes uring_lock itself, so ctx must still be intact here */
        io_req_caches_free(ctx);
        if (ctx->hash_map)
                io_wq_put_hash(ctx->hash_map);
        kfree(ctx->cancel_hash);
        kfree(ctx);
}
8433
/*
 * poll() handler for an io_uring file.
 *
 * Adds the caller to ctx->cq_wait and returns EPOLLOUT | EPOLLWRNORM while
 * the SQ ring is not full, plus EPOLLIN | EPOLLRDNORM when
 * io_cqring_events() is non-zero or bit 0 of ctx->cq_check_overflow is set.
 */
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
        struct io_ring_ctx *ctx = file->private_data;
        __poll_t mask = 0;

        poll_wait(file, &ctx->cq_wait, wait);
        /*
         * synchronizes with barrier from wq_has_sleeper call in
         * io_commit_cqring
         */
        smp_rmb();
        if (!io_sqring_full(ctx))
                mask |= EPOLLOUT | EPOLLWRNORM;

        /*
         * Don't flush the cqring overflow list here, just do a simple check.
         * Otherwise there could possibly be an ABBA deadlock:
         *      CPU0                    CPU1
         *      ----                    ----
         * lock(&ctx->uring_lock);
         *                              lock(&ep->mtx);
         *                              lock(&ctx->uring_lock);
         * lock(&ep->mtx);
         *
         * Users may get EPOLLIN while seeing nothing in the cqring; this
         * pushes them to do the flush.
         */
        if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
                mask |= EPOLLIN | EPOLLRDNORM;

        return mask;
}
8466
8467 static int io_uring_fasync(int fd, struct file *file, int on)
8468 {
8469         struct io_ring_ctx *ctx = file->private_data;
8470
8471         return fasync_helper(fd, file, on, &ctx->cq_fasync);
8472 }
8473
8474 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8475 {
8476         const struct cred *creds;
8477
8478         creds = idr_remove(&ctx->personality_idr, id);
8479         if (creds) {
8480                 put_cred(creds);
8481                 return 0;
8482         }
8483
8484         return -EINVAL;
8485 }
8486
/*
 * idr_for_each() callback: unregister personality @id of the ring context
 * passed in @data. The result is ignored and 0 is always returned, so the
 * iteration never stops early.
 */
static int io_remove_personalities(int id, void *p, void *data)
{
        struct io_ring_ctx *ring = data;

        io_unregister_personality(ring, id);
        return 0;
}
8494
8495 static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
8496 {
8497         struct callback_head *work, *next;
8498         bool executed = false;
8499
8500         do {
8501                 work = xchg(&ctx->exit_task_work, NULL);
8502                 if (!work)
8503                         break;
8504
8505                 do {
8506                         next = work->next;
8507                         work->func(work);
8508                         work = next;
8509                         cond_resched();
8510                 } while (work);
8511                 executed = true;
8512         } while (1);
8513
8514         return executed;
8515 }
8516
8517 static void io_ring_exit_work(struct work_struct *work)
8518 {
8519         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
8520                                                exit_work);
8521
8522         /*
8523          * If we're doing polled IO and end up having requests being
8524          * submitted async (out-of-line), then completions can come in while
8525          * we're waiting for refs to drop. We need to reap these manually,
8526          * as nobody else will be looking for them.
8527          */
8528         do {
8529                 io_uring_try_cancel_requests(ctx, NULL, NULL);
8530         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
8531         io_ring_ctx_free(ctx);
8532 }
8533
/*
 * Begin tearing down a ring.
 *
 * Under uring_lock: kill the ctx percpu ref, mark CQ overflow as flushed,
 * force-flush any overflowed completions (only if the rings were allocated)
 * and unregister all personalities. Afterwards cancel timeouts and poll
 * requests, reap iopoll events, and queue io_ring_exit_work() to do the
 * remaining waiting and the final free.
 */
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
	/* if force is set, the ring is going away. always drop after that */
	ctx->cq_overflow_flushed = 1;
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true, NULL, NULL);
	idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
	mutex_unlock(&ctx->uring_lock);

	/* NULL task/files: not restricted to a particular task or files */
	io_kill_timeouts(ctx, NULL, NULL);
	io_poll_remove_all(ctx, NULL, NULL);

	/* if we failed setting up the ctx, we might not have any rings */
	io_iopoll_try_reap_events(ctx);

	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
	/*
	 * Use system_unbound_wq to avoid spawning tons of event kworkers
	 * if we're exiting a ton of rings at the same time. It just adds
	 * noise and overhead, there's no discernible change in runtime
	 * over using system_wq.
	 */
	queue_work(system_unbound_wq, &ctx->exit_work);
}
8560
8561 static int io_uring_release(struct inode *inode, struct file *file)
8562 {
8563         struct io_ring_ctx *ctx = file->private_data;
8564
8565         file->private_data = NULL;
8566         io_ring_ctx_wait_and_kill(ctx);
8567         return 0;
8568 }
8569
/*
 * Match criteria handed to io_cancel_task_cb() as its opaque data pointer.
 * Both fields are forwarded to io_match_task() unchanged.
 */
struct io_task_cancel {
	/* task to match; may be NULL (callers pass NULL on ring exit) */
	struct task_struct *task;
	/*
	 * files to match; when non-NULL, io_cancel_task_cb() additionally
	 * takes completion_lock for requests flagged REQ_F_LINK_TIMEOUT
	 */
	struct files_struct *files;
};
8574
8575 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
8576 {
8577         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8578         struct io_task_cancel *cancel = data;
8579         bool ret;
8580
8581         if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
8582                 unsigned long flags;
8583                 struct io_ring_ctx *ctx = req->ctx;
8584
8585                 /* protect against races with linked timeouts */
8586                 spin_lock_irqsave(&ctx->completion_lock, flags);
8587                 ret = io_match_task(req, cancel->task, cancel->files);
8588                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8589         } else {
8590                 ret = io_match_task(req, cancel->task, cancel->files);
8591         }
8592         return ret;
8593 }
8594
8595 static void io_cancel_defer_files(struct io_ring_ctx *ctx,
8596                                   struct task_struct *task,
8597                                   struct files_struct *files)
8598 {
8599         struct io_defer_entry *de = NULL;
8600         LIST_HEAD(list);
8601
8602         spin_lock_irq(&ctx->completion_lock);
8603         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
8604                 if (io_match_task(de->req, task, files)) {
8605                         list_cut_position(&list, &ctx->defer_list, &de->list);
8606                         break;
8607                 }
8608         }
8609         spin_unlock_irq(&ctx->completion_lock);
8610
8611         while (!list_empty(&list)) {
8612                 de = list_first_entry(&list, struct io_defer_entry, list);
8613                 list_del_init(&de->list);
8614                 req_set_fail_links(de->req);
8615                 io_put_req(de->req);
8616                 io_req_complete(de->req, -ECANCELED);
8617                 kfree(de);
8618         }
8619 }
8620
8621 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
8622                                          struct task_struct *task,
8623                                          struct files_struct *files)
8624 {
8625         struct io_task_cancel cancel = { .task = task, .files = files, };
8626         struct task_struct *tctx_task = task ?: current;
8627         struct io_uring_task *tctx = tctx_task->io_uring;
8628
8629         while (1) {
8630                 enum io_wq_cancel cret;
8631                 bool ret = false;
8632
8633                 if (tctx && tctx->io_wq) {
8634                         cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
8635                                                &cancel, true);
8636                         ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8637                 }
8638
8639                 /* SQPOLL thread does its own polling */
8640                 if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
8641                         while (!list_empty_careful(&ctx->iopoll_list)) {
8642                                 io_iopoll_try_reap_events(ctx);
8643                                 ret = true;
8644                         }
8645                 }
8646
8647                 ret |= io_poll_remove_all(ctx, task, files);
8648                 ret |= io_kill_timeouts(ctx, task, files);
8649                 ret |= io_run_task_work();
8650                 ret |= io_run_ctx_fallback(ctx);
8651                 io_cqring_overflow_flush(ctx, true, task, files);
8652                 if (!ret)
8653                         break;
8654                 cond_resched();
8655         }
8656 }
8657
8658 static int io_uring_count_inflight(struct io_ring_ctx *ctx,
8659                                    struct task_struct *task,
8660                                    struct files_struct *files)
8661 {
8662         struct io_kiocb *req;
8663         int cnt = 0;
8664
8665         spin_lock_irq(&ctx->inflight_lock);
8666         list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
8667                 cnt += io_match_task(req, task, files);
8668         spin_unlock_irq(&ctx->inflight_lock);
8669         return cnt;
8670 }
8671
/*
 * Cancel inflight requests of @ctx matching @task/@files and wait until
 * none are left on ctx->inflight_list.
 *
 * Each pass counts the matching requests, tries to cancel them, and then
 * sleeps on @task's io_uring wait queue - but only if the count did not
 * change in the meantime.
 */
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
				  struct task_struct *task,
				  struct files_struct *files)
{
	while (!list_empty_careful(&ctx->inflight_list)) {
		DEFINE_WAIT(wait);
		int inflight;

		inflight = io_uring_count_inflight(ctx, task, files);
		if (!inflight)
			break;

		io_uring_try_cancel_requests(ctx, task, files);

		/*
		 * NOTE(review): the SQ thread is unparked around the sleep and
		 * parked again afterwards, presumably so it can run while we
		 * wait - confirm against io_sq_thread_park()/unpark().
		 */
		if (ctx->sq_data)
			io_sq_thread_unpark(ctx->sq_data);
		prepare_to_wait(&task->io_uring->wait, &wait,
				TASK_UNINTERRUPTIBLE);
		/*
		 * Recount after queueing ourselves: if the number changed,
		 * something completed in between, so retry without sleeping.
		 */
		if (inflight == io_uring_count_inflight(ctx, task, files))
			schedule();
		finish_wait(&task->io_uring->wait, &wait);
		if (ctx->sq_data)
			io_sq_thread_park(ctx->sq_data);
	}
}
8697
/*
 * Cancel the requests on @ctx that belong to the current task (or, for an
 * SQPOLL ring whose SQ thread could be parked, to the SQ thread) and match
 * @files.
 *
 * We need to iteratively cancel requests, in case a request has dependent
 * hard links. These persist even for failure of cancelations, hence keep
 * looping until none are found.
 */
static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
					  struct files_struct *files)
{
	struct task_struct *task = current;
	bool did_park = false;

	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
		/* never started, nothing to cancel */
		if (ctx->flags & IORING_SETUP_R_DISABLED) {
			io_sq_offload_start(ctx);
			return;
		}
		did_park = io_sq_thread_park(ctx->sq_data);
		if (did_park) {
			/* cancel on behalf of the SQ thread from here on */
			task = ctx->sq_data->thread;
			atomic_inc(&task->io_uring->in_idle);
		}
	}

	io_cancel_defer_files(ctx, task, files);

	io_uring_cancel_files(ctx, task, files);
	/* no files filter: also run the general cancelation pass */
	if (!files)
		io_uring_try_cancel_requests(ctx, task, NULL);

	/* undo the in_idle bump and the park taken above */
	if (did_park) {
		atomic_dec(&task->io_uring->in_idle);
		io_sq_thread_unpark(ctx->sq_data);
	}
}
8733
8734 /*
8735  * Note that this task has used io_uring. We use it for cancelation purposes.
8736  */
8737 static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
8738 {
8739         struct io_uring_task *tctx = current->io_uring;
8740         int ret;
8741
8742         if (unlikely(!tctx)) {
8743                 ret = io_uring_alloc_task_context(current, ctx);
8744                 if (unlikely(ret))
8745                         return ret;
8746                 tctx = current->io_uring;
8747         }
8748         if (tctx->last != file) {
8749                 void *old = xa_load(&tctx->xa, (unsigned long)file);
8750
8751                 if (!old) {
8752                         get_file(file);
8753                         ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
8754                                                 file, GFP_KERNEL));
8755                         if (ret) {
8756                                 fput(file);
8757                                 return ret;
8758                         }
8759                 }
8760                 tctx->last = file;
8761         }
8762
8763         /*
8764          * This is race safe in that the task itself is doing this, hence it
8765          * cannot be going through the exit/cancel paths at the same time.
8766          * This cannot be modified while exit/cancel is running.
8767          */
8768         if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
8769                 tctx->sqpoll = true;
8770
8771         return 0;
8772 }
8773
8774 /*
8775  * Remove this io_uring_file -> task mapping.
8776  */
8777 static void io_uring_del_task_file(struct file *file)
8778 {
8779         struct io_uring_task *tctx = current->io_uring;
8780
8781         if (tctx->last == file)
8782                 tctx->last = NULL;
8783         file = xa_erase(&tctx->xa, (unsigned long)file);
8784         if (file)
8785                 fput(file);
8786 }
8787
8788 static void io_uring_clean_tctx(struct io_uring_task *tctx)
8789 {
8790         struct file *file;
8791         unsigned long index;
8792
8793         xa_for_each(&tctx->xa, index, file)
8794                 io_uring_del_task_file(file);
8795         if (tctx->io_wq) {
8796                 io_wq_put_and_exit(tctx->io_wq);
8797                 tctx->io_wq = NULL;
8798         }
8799 }
8800
8801 void __io_uring_files_cancel(struct files_struct *files)
8802 {
8803         struct io_uring_task *tctx = current->io_uring;
8804         struct file *file;
8805         unsigned long index;
8806
8807         /* make sure overflow events are dropped */
8808         atomic_inc(&tctx->in_idle);
8809         xa_for_each(&tctx->xa, index, file)
8810                 io_uring_cancel_task_requests(file->private_data, files);
8811         atomic_dec(&tctx->in_idle);
8812
8813         if (files)
8814                 io_uring_clean_tctx(tctx);
8815 }
8816
8817 static s64 tctx_inflight(struct io_uring_task *tctx)
8818 {
8819         return percpu_counter_sum(&tctx->inflight);
8820 }
8821
/*
 * Cancel everything the SQ poll thread of @ctx has inflight and wait until
 * its inflight counter reaches zero.
 *
 * Does nothing if the ring has no sq_data, if the SQ thread cannot be
 * parked, or if the SQ thread has no io_uring task context.
 */
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
{
	struct io_sq_data *sqd = ctx->sq_data;
	struct io_uring_task *tctx;
	s64 inflight;
	DEFINE_WAIT(wait);

	if (!sqd)
		return;
	if (!io_sq_thread_park(sqd))
		return;
	tctx = ctx->sq_data->thread->io_uring;
	/* can happen on fork/alloc failure, just ignore that state */
	if (!tctx) {
		io_sq_thread_unpark(sqd);
		return;
	}

	/* in_idle stays raised for the whole cancel/wait loop */
	atomic_inc(&tctx->in_idle);
	do {
		/* read completions before cancelations */
		inflight = tctx_inflight(tctx);
		if (!inflight)
			break;
		io_uring_cancel_task_requests(ctx, NULL);

		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
		/*
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
		 */
		if (inflight == tctx_inflight(tctx))
			schedule();
		finish_wait(&tctx->wait, &wait);
	} while (1);
	atomic_dec(&tctx->in_idle);
	io_sq_thread_unpark(sqd);
}
8861
/*
 * Find any io_uring fd that this task has registered or done IO on, and cancel
 * requests.
 *
 * Waits until the task's inflight counter drops to zero, then releases the
 * task's ring mappings and io-wq and frees its io_uring task context.
 */
void __io_uring_task_cancel(void)
{
	struct io_uring_task *tctx = current->io_uring;
	DEFINE_WAIT(wait);
	s64 inflight;

	/* make sure overflow events are dropped */
	atomic_inc(&tctx->in_idle);

	/* the task used at least one SQPOLL ring: cancel via the SQ threads */
	if (tctx->sqpoll) {
		struct file *file;
		unsigned long index;

		xa_for_each(&tctx->xa, index, file)
			io_uring_cancel_sqpoll(file->private_data);
	}

	do {
		/* read completions before cancelations */
		inflight = tctx_inflight(tctx);
		if (!inflight)
			break;
		__io_uring_files_cancel(NULL);

		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);

		/*
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
		 */
		if (inflight == tctx_inflight(tctx))
			schedule();
		finish_wait(&tctx->wait, &wait);
	} while (1);

	atomic_dec(&tctx->in_idle);

	io_uring_clean_tctx(tctx);
	/* all current's requests should be gone, we can kill tctx */
	__io_uring_free(current);
}
8908
8909 static void *io_uring_validate_mmap_request(struct file *file,
8910                                             loff_t pgoff, size_t sz)
8911 {
8912         struct io_ring_ctx *ctx = file->private_data;
8913         loff_t offset = pgoff << PAGE_SHIFT;
8914         struct page *page;
8915         void *ptr;
8916
8917         switch (offset) {
8918         case IORING_OFF_SQ_RING:
8919         case IORING_OFF_CQ_RING:
8920                 ptr = ctx->rings;
8921                 break;
8922         case IORING_OFF_SQES:
8923                 ptr = ctx->sq_sqes;
8924                 break;
8925         default:
8926                 return ERR_PTR(-EINVAL);
8927         }
8928
8929         page = virt_to_head_page(ptr);
8930         if (sz > page_size(page))
8931                 return ERR_PTR(-EINVAL);
8932
8933         return ptr;
8934 }
8935
8936 #ifdef CONFIG_MMU
8937
8938 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8939 {
8940         size_t sz = vma->vm_end - vma->vm_start;
8941         unsigned long pfn;
8942         void *ptr;
8943
8944         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
8945         if (IS_ERR(ptr))
8946                 return PTR_ERR(ptr);
8947
8948         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
8949         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
8950 }
8951
8952 #else /* !CONFIG_MMU */
8953
8954 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8955 {
8956         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
8957 }
8958
8959 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
8960 {
8961         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
8962 }
8963
/*
 * ->get_unmapped_area() (!MMU): return the kernel address of the ring region
 * selected by @pgoff/@len, or the (negative) validation error cast to
 * unsigned long. @addr and @flags are not used.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *area = io_uring_validate_mmap_request(file, pgoff, len);

	if (!IS_ERR(area))
		return (unsigned long) area;

	return PTR_ERR(area);
}
8976
8977 #endif /* !CONFIG_MMU */
8978
8979 static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
8980 {
8981         int ret = 0;
8982         DEFINE_WAIT(wait);
8983
8984         do {
8985                 if (!io_sqring_full(ctx))
8986                         break;
8987                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
8988
8989                 if (!io_sqring_full(ctx))
8990                         break;
8991                 schedule();
8992         } while (!signal_pending(current));
8993
8994         finish_wait(&ctx->sqo_sq_wait, &wait);
8995         return ret;
8996 }
8997
8998 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
8999                           struct __kernel_timespec __user **ts,
9000                           const sigset_t __user **sig)
9001 {
9002         struct io_uring_getevents_arg arg;
9003
9004         /*
9005          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9006          * is just a pointer to the sigset_t.
9007          */
9008         if (!(flags & IORING_ENTER_EXT_ARG)) {
9009                 *sig = (const sigset_t __user *) argp;
9010                 *ts = NULL;
9011                 return 0;
9012         }
9013
9014         /*
9015          * EXT_ARG is set - ensure we agree on the size of it and copy in our
9016          * timespec and sigset_t pointers if good.
9017          */
9018         if (*argsz != sizeof(arg))
9019                 return -EINVAL;
9020         if (copy_from_user(&arg, argp, sizeof(arg)))
9021                 return -EFAULT;
9022         *sig = u64_to_user_ptr(arg.sigmask);
9023         *argsz = arg.sigmask_sz;
9024         *ts = u64_to_user_ptr(arg.ts);
9025         return 0;
9026 }
9027
9028 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
9029                 u32, min_complete, u32, flags, const void __user *, argp,
9030                 size_t, argsz)
9031 {
9032         struct io_ring_ctx *ctx;
9033         long ret = -EBADF;
9034         int submitted = 0;
9035         struct fd f;
9036
9037         io_run_task_work();
9038
9039         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
9040                         IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
9041                 return -EINVAL;
9042
9043         f = fdget(fd);
9044         if (!f.file)
9045                 return -EBADF;
9046
9047         ret = -EOPNOTSUPP;
9048         if (f.file->f_op != &io_uring_fops)
9049                 goto out_fput;
9050
9051         ret = -ENXIO;
9052         ctx = f.file->private_data;
9053         if (!percpu_ref_tryget(&ctx->refs))
9054                 goto out_fput;
9055
9056         ret = -EBADFD;
9057         if (ctx->flags & IORING_SETUP_R_DISABLED)
9058                 goto out;
9059
9060         /*
9061          * For SQ polling, the thread will do all submissions and completions.
9062          * Just return the requested submit count, and wake the thread if
9063          * we were asked to.
9064          */
9065         ret = 0;
9066         if (ctx->flags & IORING_SETUP_SQPOLL) {
9067                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
9068
9069                 if (unlikely(ctx->sqo_exec)) {
9070                         ret = io_sq_thread_fork(ctx->sq_data, ctx);
9071                         if (ret)
9072                                 goto out;
9073                         ctx->sqo_exec = 0;
9074                 }
9075                 ret = -EOWNERDEAD;
9076                 if (flags & IORING_ENTER_SQ_WAKEUP)
9077                         wake_up(&ctx->sq_data->wait);
9078                 if (flags & IORING_ENTER_SQ_WAIT) {
9079                         ret = io_sqpoll_wait_sq(ctx);
9080                         if (ret)
9081                                 goto out;
9082                 }
9083                 submitted = to_submit;
9084         } else if (to_submit) {
9085                 ret = io_uring_add_task_file(ctx, f.file);
9086                 if (unlikely(ret))
9087                         goto out;
9088                 mutex_lock(&ctx->uring_lock);
9089                 submitted = io_submit_sqes(ctx, to_submit);
9090                 mutex_unlock(&ctx->uring_lock);
9091
9092                 if (submitted != to_submit)
9093                         goto out;
9094         }
9095         if (flags & IORING_ENTER_GETEVENTS) {
9096                 const sigset_t __user *sig;
9097                 struct __kernel_timespec __user *ts;
9098
9099                 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9100                 if (unlikely(ret))
9101                         goto out;
9102
9103                 min_complete = min(min_complete, ctx->cq_entries);
9104
9105                 /*
9106                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9107                  * space applications don't need to do io completion events
9108                  * polling again, they can rely on io_sq_thread to do polling
9109                  * work, which can reduce cpu usage and uring_lock contention.
9110                  */
9111                 if (ctx->flags & IORING_SETUP_IOPOLL &&
9112                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
9113                         ret = io_iopoll_check(ctx, min_complete);
9114                 } else {
9115                         ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
9116                 }
9117         }
9118
9119 out:
9120         percpu_ref_put(&ctx->refs);
9121 out_fput:
9122         fdput(f);
9123         return submitted ? submitted : ret;
9124 }
9125
9126 #ifdef CONFIG_PROC_FS
9127 static int io_uring_show_cred(int id, void *p, void *data)
9128 {
9129         const struct cred *cred = p;
9130         struct seq_file *m = data;
9131         struct user_namespace *uns = seq_user_ns(m);
9132         struct group_info *gi;
9133         kernel_cap_t cap;
9134         unsigned __capi;
9135         int g;
9136
9137         seq_printf(m, "%5d\n", id);
9138         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9139         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9140         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9141         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9142         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9143         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9144         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9145         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9146         seq_puts(m, "\n\tGroups:\t");
9147         gi = cred->group_info;
9148         for (g = 0; g < gi->ngroups; g++) {
9149                 seq_put_decimal_ull(m, g ? " " : "",
9150                                         from_kgid_munged(uns, gi->gid[g]));
9151         }
9152         seq_puts(m, "\n\tCapEff:\t");
9153         cap = cred->cap_effective;
9154         CAP_FOR_EACH_U32(__capi)
9155                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9156         seq_putc(m, '\n');
9157         return 0;
9158 }
9159
/*
 * Emit the fdinfo text for ring @ctx into @m: SQ thread pid/cpu, registered
 * files and buffers, personalities, and the requests currently hashed in
 * cancel_hash.
 *
 * uring_lock is only try-locked; when that fails, the counts are still
 * printed but the per-entry listings that depend on the lock are skipped.
 */
static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
	struct io_sq_data *sq = NULL;
	bool has_lock;
	int i;

	/*
	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
	 * since fdinfo case grabs it in the opposite direction of normal use
	 * cases. If we fail to get the lock, we just don't iterate any
	 * structures that could be going away outside the io_uring mutex.
	 */
	has_lock = mutex_trylock(&ctx->uring_lock);

	/* only report the SQ thread if we hold the lock and it exists */
	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
		sq = ctx->sq_data;
		if (!sq->thread)
			sq = NULL;
	}

	/* -1 stands for "no SQ thread to report" */
	seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
		struct file *f = *io_fixed_file_slot(ctx->file_data, i);

		if (f)
			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
		else
			seq_printf(m, "%5u: <none>\n", i);
	}
	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
		struct io_mapped_ubuf *buf = &ctx->user_bufs[i];

		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
						(unsigned int) buf->len);
	}
	if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
		seq_printf(m, "Personalities:\n");
		idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
	}
	seq_printf(m, "PollList:\n");
	/* the hash walk runs under completion_lock, with or without uring_lock */
	spin_lock_irq(&ctx->completion_lock);
	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
		struct hlist_head *list = &ctx->cancel_hash[i];
		struct io_kiocb *req;

		hlist_for_each_entry(req, list, hash_node)
			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
					req->task->task_works != NULL);
	}
	spin_unlock_irq(&ctx->completion_lock);
	if (has_lock)
		mutex_unlock(&ctx->uring_lock);
}
9216
9217 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9218 {
9219         struct io_ring_ctx *ctx = f->private_data;
9220
9221         if (percpu_ref_tryget(&ctx->refs)) {
9222                 __io_uring_show_fdinfo(ctx, m);
9223                 percpu_ref_put(&ctx->refs);
9224         }
9225 }
9226 #endif
9227
/*
 * File operations of the anonymous "[io_uring]" file created in
 * io_uring_get_file(). io_uring_enter() compares f_op against this table to
 * recognise ring file descriptors.
 */
static const struct file_operations io_uring_fops = {
	.release	= io_uring_release,
	.mmap		= io_uring_mmap,
#ifndef CONFIG_MMU
	/* !MMU: the ring memory is handed out directly */
	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
	.poll		= io_uring_poll,
	.fasync		= io_uring_fasync,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= io_uring_show_fdinfo,
#endif
};
9241
9242 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9243                                   struct io_uring_params *p)
9244 {
9245         struct io_rings *rings;
9246         size_t size, sq_array_offset;
9247
9248         /* make sure these are sane, as we already accounted them */
9249         ctx->sq_entries = p->sq_entries;
9250         ctx->cq_entries = p->cq_entries;
9251
9252         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9253         if (size == SIZE_MAX)
9254                 return -EOVERFLOW;
9255
9256         rings = io_mem_alloc(size);
9257         if (!rings)
9258                 return -ENOMEM;
9259
9260         ctx->rings = rings;
9261         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9262         rings->sq_ring_mask = p->sq_entries - 1;
9263         rings->cq_ring_mask = p->cq_entries - 1;
9264         rings->sq_ring_entries = p->sq_entries;
9265         rings->cq_ring_entries = p->cq_entries;
9266         ctx->sq_mask = rings->sq_ring_mask;
9267         ctx->cq_mask = rings->cq_ring_mask;
9268
9269         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9270         if (size == SIZE_MAX) {
9271                 io_mem_free(ctx->rings);
9272                 ctx->rings = NULL;
9273                 return -EOVERFLOW;
9274         }
9275
9276         ctx->sq_sqes = io_mem_alloc(size);
9277         if (!ctx->sq_sqes) {
9278                 io_mem_free(ctx->rings);
9279                 ctx->rings = NULL;
9280                 return -ENOMEM;
9281         }
9282
9283         return 0;
9284 }
9285
9286 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9287 {
9288         int ret, fd;
9289
9290         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9291         if (fd < 0)
9292                 return fd;
9293
9294         ret = io_uring_add_task_file(ctx, file);
9295         if (ret) {
9296                 put_unused_fd(fd);
9297                 return ret;
9298         }
9299         fd_install(fd, file);
9300         return fd;
9301 }
9302
9303 /*
9304  * Allocate an anonymous fd, this is what constitutes the application
9305  * visible backing of an io_uring instance. The application mmaps this
9306  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9307  * we have to tie this fd to a socket for file garbage collection purposes.
9308  */
9309 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
9310 {
9311         struct file *file;
9312 #if defined(CONFIG_UNIX)
9313         int ret;
9314
9315         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9316                                 &ctx->ring_sock);
9317         if (ret)
9318                 return ERR_PTR(ret);
9319 #endif
9320
9321         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9322                                         O_RDWR | O_CLOEXEC);
9323 #if defined(CONFIG_UNIX)
9324         if (IS_ERR(file)) {
9325                 sock_release(ctx->ring_sock);
9326                 ctx->ring_sock = NULL;
9327         } else {
9328                 ctx->ring_sock->file = file;
9329         }
9330 #endif
9331         return file;
9332 }
9333
9334 static int io_uring_create(unsigned entries, struct io_uring_params *p,
9335                            struct io_uring_params __user *params)
9336 {
9337         struct io_ring_ctx *ctx;
9338         struct file *file;
9339         int ret;
9340
9341         if (!entries)
9342                 return -EINVAL;
9343         if (entries > IORING_MAX_ENTRIES) {
9344                 if (!(p->flags & IORING_SETUP_CLAMP))
9345                         return -EINVAL;
9346                 entries = IORING_MAX_ENTRIES;
9347         }
9348
9349         /*
9350          * Use twice as many entries for the CQ ring. It's possible for the
9351          * application to drive a higher depth than the size of the SQ ring,
9352          * since the sqes are only used at submission time. This allows for
9353          * some flexibility in overcommitting a bit. If the application has
9354          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9355          * of CQ ring entries manually.
9356          */
9357         p->sq_entries = roundup_pow_of_two(entries);
9358         if (p->flags & IORING_SETUP_CQSIZE) {
9359                 /*
9360                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
9361                  * to a power-of-two, if it isn't already. We do NOT impose
9362                  * any cq vs sq ring sizing.
9363                  */
9364                 if (!p->cq_entries)
9365                         return -EINVAL;
9366                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9367                         if (!(p->flags & IORING_SETUP_CLAMP))
9368                                 return -EINVAL;
9369                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
9370                 }
9371                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
9372                 if (p->cq_entries < p->sq_entries)
9373                         return -EINVAL;
9374         } else {
9375                 p->cq_entries = 2 * p->sq_entries;
9376         }
9377
9378         ctx = io_ring_ctx_alloc(p);
9379         if (!ctx)
9380                 return -ENOMEM;
9381         ctx->compat = in_compat_syscall();
9382         if (!capable(CAP_IPC_LOCK))
9383                 ctx->user = get_uid(current_user());
9384
9385         /*
9386          * This is just grabbed for accounting purposes. When a process exits,
9387          * the mm is exited and dropped before the files, hence we need to hang
9388          * on to this mm purely for the purposes of being able to unaccount
9389          * memory (locked/pinned vm). It's not used for anything else.
9390          */
9391         mmgrab(current->mm);
9392         ctx->mm_account = current->mm;
9393
9394         ret = io_allocate_scq_urings(ctx, p);
9395         if (ret)
9396                 goto err;
9397
9398         ret = io_sq_offload_create(ctx, p);
9399         if (ret)
9400                 goto err;
9401
9402         if (!(p->flags & IORING_SETUP_R_DISABLED))
9403                 io_sq_offload_start(ctx);
9404
9405         memset(&p->sq_off, 0, sizeof(p->sq_off));
9406         p->sq_off.head = offsetof(struct io_rings, sq.head);
9407         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9408         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9409         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9410         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9411         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9412         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
9413
9414         memset(&p->cq_off, 0, sizeof(p->cq_off));
9415         p->cq_off.head = offsetof(struct io_rings, cq.head);
9416         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9417         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9418         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9419         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9420         p->cq_off.cqes = offsetof(struct io_rings, cqes);
9421         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
9422
9423         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9424                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
9425                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
9426                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9427                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
9428
9429         if (copy_to_user(params, p, sizeof(*p))) {
9430                 ret = -EFAULT;
9431                 goto err;
9432         }
9433
9434         file = io_uring_get_file(ctx);
9435         if (IS_ERR(file)) {
9436                 ret = PTR_ERR(file);
9437                 goto err;
9438         }
9439
9440         /*
9441          * Install ring fd as the very last thing, so we don't risk someone
9442          * having closed it before we finish setup
9443          */
9444         ret = io_uring_install_fd(ctx, file);
9445         if (ret < 0) {
9446                 /* fput will clean it up */
9447                 fput(file);
9448                 return ret;
9449         }
9450
9451         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
9452         return ret;
9453 err:
9454         io_ring_ctx_wait_and_kill(ctx);
9455         return ret;
9456 }
9457
9458 /*
9459  * Sets up an aio uring context, and returns the fd. Applications asks for a
9460  * ring size, we return the actual sq/cq ring sizes (among other things) in the
9461  * params structure passed in.
9462  */
9463 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9464 {
9465         struct io_uring_params p;
9466         int i;
9467
9468         if (copy_from_user(&p, params, sizeof(p)))
9469                 return -EFAULT;
9470         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9471                 if (p.resv[i])
9472                         return -EINVAL;
9473         }
9474
9475         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
9476                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9477                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9478                         IORING_SETUP_R_DISABLED))
9479                 return -EINVAL;
9480
9481         return  io_uring_create(entries, &p, params);
9482 }
9483
9484 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9485                 struct io_uring_params __user *, params)
9486 {
9487         return io_uring_setup(entries, params);
9488 }
9489
9490 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9491 {
9492         struct io_uring_probe *p;
9493         size_t size;
9494         int i, ret;
9495
9496         size = struct_size(p, ops, nr_args);
9497         if (size == SIZE_MAX)
9498                 return -EOVERFLOW;
9499         p = kzalloc(size, GFP_KERNEL);
9500         if (!p)
9501                 return -ENOMEM;
9502
9503         ret = -EFAULT;
9504         if (copy_from_user(p, arg, size))
9505                 goto out;
9506         ret = -EINVAL;
9507         if (memchr_inv(p, 0, size))
9508                 goto out;
9509
9510         p->last_op = IORING_OP_LAST - 1;
9511         if (nr_args > IORING_OP_LAST)
9512                 nr_args = IORING_OP_LAST;
9513
9514         for (i = 0; i < nr_args; i++) {
9515                 p->ops[i].op = i;
9516                 if (!io_op_defs[i].not_supported)
9517                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
9518         }
9519         p->ops_len = i;
9520
9521         ret = 0;
9522         if (copy_to_user(arg, p, size))
9523                 ret = -EFAULT;
9524 out:
9525         kfree(p);
9526         return ret;
9527 }
9528
9529 static int io_register_personality(struct io_ring_ctx *ctx)
9530 {
9531         const struct cred *creds;
9532         int ret;
9533
9534         creds = get_current_cred();
9535
9536         ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
9537                                 USHRT_MAX, GFP_KERNEL);
9538         if (ret < 0)
9539                 put_cred(creds);
9540         return ret;
9541 }
9542
9543 static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9544                                     unsigned int nr_args)
9545 {
9546         struct io_uring_restriction *res;
9547         size_t size;
9548         int i, ret;
9549
9550         /* Restrictions allowed only if rings started disabled */
9551         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9552                 return -EBADFD;
9553
9554         /* We allow only a single restrictions registration */
9555         if (ctx->restrictions.registered)
9556                 return -EBUSY;
9557
9558         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9559                 return -EINVAL;
9560
9561         size = array_size(nr_args, sizeof(*res));
9562         if (size == SIZE_MAX)
9563                 return -EOVERFLOW;
9564
9565         res = memdup_user(arg, size);
9566         if (IS_ERR(res))
9567                 return PTR_ERR(res);
9568
9569         ret = 0;
9570
9571         for (i = 0; i < nr_args; i++) {
9572                 switch (res[i].opcode) {
9573                 case IORING_RESTRICTION_REGISTER_OP:
9574                         if (res[i].register_op >= IORING_REGISTER_LAST) {
9575                                 ret = -EINVAL;
9576                                 goto out;
9577                         }
9578
9579                         __set_bit(res[i].register_op,
9580                                   ctx->restrictions.register_op);
9581                         break;
9582                 case IORING_RESTRICTION_SQE_OP:
9583                         if (res[i].sqe_op >= IORING_OP_LAST) {
9584                                 ret = -EINVAL;
9585                                 goto out;
9586                         }
9587
9588                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
9589                         break;
9590                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
9591                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
9592                         break;
9593                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
9594                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
9595                         break;
9596                 default:
9597                         ret = -EINVAL;
9598                         goto out;
9599                 }
9600         }
9601
9602 out:
9603         /* Reset all restrictions if an error happened */
9604         if (ret != 0)
9605                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
9606         else
9607                 ctx->restrictions.registered = true;
9608
9609         kfree(res);
9610         return ret;
9611 }
9612
9613 static int io_register_enable_rings(struct io_ring_ctx *ctx)
9614 {
9615         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9616                 return -EBADFD;
9617
9618         if (ctx->restrictions.registered)
9619                 ctx->restricted = 1;
9620
9621         io_sq_offload_start(ctx);
9622         return 0;
9623 }
9624
9625 static bool io_register_op_must_quiesce(int op)
9626 {
9627         switch (op) {
9628         case IORING_UNREGISTER_FILES:
9629         case IORING_REGISTER_FILES_UPDATE:
9630         case IORING_REGISTER_PROBE:
9631         case IORING_REGISTER_PERSONALITY:
9632         case IORING_UNREGISTER_PERSONALITY:
9633                 return false;
9634         default:
9635                 return true;
9636         }
9637 }
9638
9639 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
9640                                void __user *arg, unsigned nr_args)
9641         __releases(ctx->uring_lock)
9642         __acquires(ctx->uring_lock)
9643 {
9644         int ret;
9645
9646         /*
9647          * We're inside the ring mutex, if the ref is already dying, then
9648          * someone else killed the ctx or is already going through
9649          * io_uring_register().
9650          */
9651         if (percpu_ref_is_dying(&ctx->refs))
9652                 return -ENXIO;
9653
9654         if (io_register_op_must_quiesce(opcode)) {
9655                 percpu_ref_kill(&ctx->refs);
9656
9657                 /*
9658                  * Drop uring mutex before waiting for references to exit. If
9659                  * another thread is currently inside io_uring_enter() it might
9660                  * need to grab the uring_lock to make progress. If we hold it
9661                  * here across the drain wait, then we can deadlock. It's safe
9662                  * to drop the mutex here, since no new references will come in
9663                  * after we've killed the percpu ref.
9664                  */
9665                 mutex_unlock(&ctx->uring_lock);
9666                 do {
9667                         ret = wait_for_completion_interruptible(&ctx->ref_comp);
9668                         if (!ret)
9669                                 break;
9670                         ret = io_run_task_work_sig();
9671                         if (ret < 0)
9672                                 break;
9673                 } while (1);
9674
9675                 mutex_lock(&ctx->uring_lock);
9676
9677                 if (ret) {
9678                         percpu_ref_resurrect(&ctx->refs);
9679                         goto out_quiesce;
9680                 }
9681         }
9682
9683         if (ctx->restricted) {
9684                 if (opcode >= IORING_REGISTER_LAST) {
9685                         ret = -EINVAL;
9686                         goto out;
9687                 }
9688
9689                 if (!test_bit(opcode, ctx->restrictions.register_op)) {
9690                         ret = -EACCES;
9691                         goto out;
9692                 }
9693         }
9694
9695         switch (opcode) {
9696         case IORING_REGISTER_BUFFERS:
9697                 ret = io_sqe_buffers_register(ctx, arg, nr_args);
9698                 break;
9699         case IORING_UNREGISTER_BUFFERS:
9700                 ret = -EINVAL;
9701                 if (arg || nr_args)
9702                         break;
9703                 ret = io_sqe_buffers_unregister(ctx);
9704                 break;
9705         case IORING_REGISTER_FILES:
9706                 ret = io_sqe_files_register(ctx, arg, nr_args);
9707                 break;
9708         case IORING_UNREGISTER_FILES:
9709                 ret = -EINVAL;
9710                 if (arg || nr_args)
9711                         break;
9712                 ret = io_sqe_files_unregister(ctx);
9713                 break;
9714         case IORING_REGISTER_FILES_UPDATE:
9715                 ret = io_sqe_files_update(ctx, arg, nr_args);
9716                 break;
9717         case IORING_REGISTER_EVENTFD:
9718         case IORING_REGISTER_EVENTFD_ASYNC:
9719                 ret = -EINVAL;
9720                 if (nr_args != 1)
9721                         break;
9722                 ret = io_eventfd_register(ctx, arg);
9723                 if (ret)
9724                         break;
9725                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
9726                         ctx->eventfd_async = 1;
9727                 else
9728                         ctx->eventfd_async = 0;
9729                 break;
9730         case IORING_UNREGISTER_EVENTFD:
9731                 ret = -EINVAL;
9732                 if (arg || nr_args)
9733                         break;
9734                 ret = io_eventfd_unregister(ctx);
9735                 break;
9736         case IORING_REGISTER_PROBE:
9737                 ret = -EINVAL;
9738                 if (!arg || nr_args > 256)
9739                         break;
9740                 ret = io_probe(ctx, arg, nr_args);
9741                 break;
9742         case IORING_REGISTER_PERSONALITY:
9743                 ret = -EINVAL;
9744                 if (arg || nr_args)
9745                         break;
9746                 ret = io_register_personality(ctx);
9747                 break;
9748         case IORING_UNREGISTER_PERSONALITY:
9749                 ret = -EINVAL;
9750                 if (arg)
9751                         break;
9752                 ret = io_unregister_personality(ctx, nr_args);
9753                 break;
9754         case IORING_REGISTER_ENABLE_RINGS:
9755                 ret = -EINVAL;
9756                 if (arg || nr_args)
9757                         break;
9758                 ret = io_register_enable_rings(ctx);
9759                 break;
9760         case IORING_REGISTER_RESTRICTIONS:
9761                 ret = io_register_restrictions(ctx, arg, nr_args);
9762                 break;
9763         default:
9764                 ret = -EINVAL;
9765                 break;
9766         }
9767
9768 out:
9769         if (io_register_op_must_quiesce(opcode)) {
9770                 /* bring the ctx back to life */
9771                 percpu_ref_reinit(&ctx->refs);
9772 out_quiesce:
9773                 reinit_completion(&ctx->ref_comp);
9774         }
9775         return ret;
9776 }
9777
9778 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
9779                 void __user *, arg, unsigned int, nr_args)
9780 {
9781         struct io_ring_ctx *ctx;
9782         long ret = -EBADF;
9783         struct fd f;
9784
9785         f = fdget(fd);
9786         if (!f.file)
9787                 return -EBADF;
9788
9789         ret = -EOPNOTSUPP;
9790         if (f.file->f_op != &io_uring_fops)
9791                 goto out_fput;
9792
9793         ctx = f.file->private_data;
9794
9795         io_run_task_work();
9796
9797         mutex_lock(&ctx->uring_lock);
9798         ret = __io_uring_register(ctx, opcode, arg, nr_args);
9799         mutex_unlock(&ctx->uring_lock);
9800         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
9801                                                         ctx->cq_ev_fd != NULL, ret);
9802 out_fput:
9803         fdput(f);
9804         return ret;
9805 }
9806
9807 static int __init io_uring_init(void)
9808 {
9809 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
9810         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
9811         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
9812 } while (0)
9813
9814 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
9815         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
9816         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
9817         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
9818         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
9819         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
9820         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
9821         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
9822         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
9823         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
9824         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
9825         BUILD_BUG_SQE_ELEM(24, __u32,  len);
9826         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
9827         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
9828         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
9829         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
9830         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
9831         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
9832         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
9833         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
9834         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
9835         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
9836         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
9837         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
9838         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
9839         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
9840         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
9841         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
9842         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
9843         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
9844         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
9845
9846         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
9847         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
9848         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
9849                                 SLAB_ACCOUNT);
9850         return 0;
9851 };
9852 __initcall(io_uring_init);