fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store to or load from data
  35  * shared between the kernel and application. This is done both for
  36  * ordering purposes and to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73 #include <linux/namei.h>
  74 #include <linux/fsnotify.h>
  75 #include <linux/fadvise.h>
  76 #include <linux/eventpoll.h>
  77 #include <linux/splice.h>
  78 #include <linux/task_work.h>
  79 #include <linux/pagemap.h>
  80 #include <linux/io_uring.h>
  81 #include <linux/freezer.h>
  82
  83 #define CREATE_TRACE_POINTS
  84 #include <trace/events/io_uring.h>
  85
  86 #include <uapi/linux/io_uring.h>
  87
  88 #include "internal.h"
  89 #include "io-wq.h"
  90
  91 #define IORING_MAX_ENTRIES      32768
  92 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  93
  94 /*
  95  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  96  */
  97 #define IORING_FILE_TABLE_SHIFT 9
  98 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
  99 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
 100 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
 101 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
 102                                  IORING_REGISTER_LAST + IORING_OP_LAST)
 103
 104 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
 105                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
 106                                 IOSQE_BUFFER_SELECT)
 107
 108 struct io_uring {
 109         u32 head ____cacheline_aligned_in_smp;
 110         u32 tail ____cacheline_aligned_in_smp;
 111 };
 112
 113 /*
 114  * This data is shared with the application through the mmap at offsets
 115  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 116  *
 117  * The offsets to the member fields are published through struct
 118  * io_sqring_offsets when calling io_uring_setup.
 119  */
 120 struct io_rings {
 121         /*
 122          * Head and tail offsets into the ring; the offsets need to be
 123          * masked to get valid indices.
 124          *
 125          * The kernel controls head of the sq ring and the tail of the cq ring,
 126          * and the application controls tail of the sq ring and the head of the
 127          * cq ring.
 128          */
 129         struct io_uring         sq, cq;
 130         /*
 131          * Bitmasks to apply to head and tail offsets (constant, equals
 132          * ring_entries - 1)
 133          */
 134         u32                     sq_ring_mask, cq_ring_mask;
 135         /* Ring sizes (constant, power of 2) */
 136         u32                     sq_ring_entries, cq_ring_entries;
 137         /*
 138          * Number of invalid entries dropped by the kernel due to
 139          * invalid index stored in array
 140          *
 141          * Written by the kernel, shouldn't be modified by the
 142          * application (i.e. get number of "new events" by comparing to
 143          * cached value).
 144          *
 145          * After a new SQ head value was read by the application this
 146          * counter includes all submissions that were dropped reaching
 147          * the new SQ head (and possibly more).
 148          */
 149         u32                     sq_dropped;
 150         /*
 151          * Runtime SQ flags
 152          *
 153          * Written by the kernel, shouldn't be modified by the
 154          * application.
 155          *
 156          * The application needs a full memory barrier before checking
 157          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 158          */
 159         u32                     sq_flags;
 160         /*
 161          * Runtime CQ flags
 162          *
 163          * Written by the application, shouldn't be modified by the
 164          * kernel.
 165          */
 166         u32                     cq_flags;
 167         /*
 168          * Number of completion events lost because the queue was full;
 169          * this should be avoided by the application by making sure
 170          * there are not more requests pending than there is space in
 171          * the completion queue.
 172          *
 173          * Written by the kernel, shouldn't be modified by the
 174          * application (i.e. get number of "new events" by comparing to
 175          * cached value).
 176          *
 177          * As completion events come in out of order this counter is not
 178          * ordered with any other data.
 179          */
 180         u32                     cq_overflow;
 181         /*
 182          * Ring buffer of completion events.
 183          *
 184          * The kernel writes completion events fresh every time they are
 185          * produced, so the application is allowed to modify pending
 186          * entries.
 187          */
 188         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 189 };
 190
 191 enum io_uring_cmd_flags {
 192         IO_URING_F_NONBLOCK             = 1,
 193         IO_URING_F_COMPLETE_DEFER       = 2,
 194 };
 195
 196 struct io_mapped_ubuf {
 197         u64             ubuf;
 198         size_t          len;
 199         struct          bio_vec *bvec;
 200         unsigned int    nr_bvecs;
 201         unsigned long   acct_pages;
 202 };
 203
 204 struct io_ring_ctx;
 205
 206 struct io_rsrc_put {
 207         struct list_head list;
 208         union {
 209                 void *rsrc;
 210                 struct file *file;
 211         };
 212 };
 213
 214 struct fixed_rsrc_table {
 215         struct file             **files;
 216 };
 217
 218 struct fixed_rsrc_ref_node {
 219         struct percpu_ref               refs;
 220         struct list_head                node;
 221         struct list_head                rsrc_list;
 222         struct fixed_rsrc_data          *rsrc_data;
 223         void                            (*rsrc_put)(struct io_ring_ctx *ctx,
 224                                                     struct io_rsrc_put *prsrc);
 225         struct llist_node               llist;
 226         bool                            done;
 227 };
 228
 229 struct fixed_rsrc_data {
 230         struct fixed_rsrc_table         *table;
 231         struct io_ring_ctx              *ctx;
 232
 233         struct fixed_rsrc_ref_node      *node;
 234         struct percpu_ref               refs;
 235         struct completion               done;
 236         bool                            quiesce;
 237 };
 238
 239 struct io_buffer {
 240         struct list_head list;
 241         __u64 addr;
 242         __s32 len;
 243         __u16 bid;
 244 };
 245
 246 struct io_restriction {
 247         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 248         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 249         u8 sqe_flags_allowed;
 250         u8 sqe_flags_required;
 251         bool registered;
 252 };
 253
 254 enum {
 255         IO_SQ_THREAD_SHOULD_STOP = 0,
 256         IO_SQ_THREAD_SHOULD_PARK,
 257 };
 258
 259 struct io_sq_data {
 260         refcount_t              refs;
 261         struct mutex            lock;
 262
 263         /* ctx's that are using this sqd */
 264         struct list_head        ctx_list;
 265         struct list_head        ctx_new_list;
 266         struct mutex            ctx_lock;
 267
 268         struct task_struct      *thread;
 269         struct wait_queue_head  wait;
 270
 271         unsigned                sq_thread_idle;
 272         int                     sq_cpu;
 273         pid_t                   task_pid;
 274
 275         unsigned long           state;
 276         struct completion       startup;
 277         struct completion       parked;
 278         struct completion       exited;
 279 };
 280
 281 #define IO_IOPOLL_BATCH                 8
 282 #define IO_COMPL_BATCH                  32
 283 #define IO_REQ_CACHE_SIZE               32
 284 #define IO_REQ_ALLOC_BATCH              8
 285
 286 struct io_comp_state {
 287         struct io_kiocb         *reqs[IO_COMPL_BATCH];
 288         unsigned int            nr;
 289         unsigned int            locked_free_nr;
 290         /* inline/task_work completion list, under ->uring_lock */
 291         struct list_head        free_list;
 292         /* IRQ completion list, under ->completion_lock */
 293         struct list_head        locked_free_list;
 294 };
 295
 296 struct io_submit_link {
 297         struct io_kiocb         *head;
 298         struct io_kiocb         *last;
 299 };
 300
 301 struct io_submit_state {
 302         struct blk_plug         plug;
 303         struct io_submit_link   link;
 304
 305         /*
 306          * io_kiocb alloc cache
 307          */
 308         void                    *reqs[IO_REQ_CACHE_SIZE];
 309         unsigned int            free_reqs;
 310
 311         bool                    plug_started;
 312
 313         /*
 314          * Batch completion logic
 315          */
 316         struct io_comp_state    comp;
 317
 318         /*
 319          * File reference cache
 320          */
 321         struct file             *file;
 322         unsigned int            fd;
 323         unsigned int            file_refs;
 324         unsigned int            ios_left;
 325 };
 326
 327 struct io_ring_ctx {
 328         struct {
 329                 struct percpu_ref       refs;
 330         } ____cacheline_aligned_in_smp;
 331
 332         struct {
 333                 unsigned int            flags;
 334                 unsigned int            compat: 1;
 335                 unsigned int            cq_overflow_flushed: 1;
 336                 unsigned int            drain_next: 1;
 337                 unsigned int            eventfd_async: 1;
 338                 unsigned int            restricted: 1;
 339                 unsigned int            sqo_exec: 1;
 340
 341                 /*
 342                  * Ring buffer of indices into array of io_uring_sqe, which is
 343                  * mmapped by the application using the IORING_OFF_SQES offset.
 344                  *
 345                  * This indirection could e.g. be used to assign fixed
 346                  * io_uring_sqe entries to operations and only submit them to
 347                  * the queue when needed.
 348                  *
 349                  * The kernel modifies neither the indices array nor the entries
 350                  * array.
 351                  */
 352                 u32                     *sq_array;
 353                 unsigned                cached_sq_head;
 354                 unsigned                sq_entries;
 355                 unsigned                sq_mask;
 356                 unsigned                sq_thread_idle;
 357                 unsigned                cached_sq_dropped;
 358                 unsigned                cached_cq_overflow;
 359                 unsigned long           sq_check_overflow;
 360
 361                 /* hashed buffered write serialization */
 362                 struct io_wq_hash       *hash_map;
 363
 364                 struct list_head        defer_list;
 365                 struct list_head        timeout_list;
 366                 struct list_head        cq_overflow_list;
 367
 368                 struct io_uring_sqe     *sq_sqes;
 369         } ____cacheline_aligned_in_smp;
 370
 371         struct {
 372                 struct mutex            uring_lock;
 373                 wait_queue_head_t       wait;
 374         } ____cacheline_aligned_in_smp;
 375
 376         struct io_submit_state          submit_state;
 377
 378         struct io_rings *rings;
 379
 380         /* Only used for accounting purposes */
 381         struct mm_struct        *mm_account;
 382
 383         struct io_sq_data       *sq_data;       /* if using sq thread polling */
 384
 385         struct wait_queue_head  sqo_sq_wait;
 386         struct list_head        sqd_list;
 387
 388         /*
 389          * If used, fixed file set. Writers must ensure that ->refs is dead,
 390          * readers must ensure that ->refs is alive as long as the file* is
 391          * used. Only updated through io_uring_register(2).
 392          */
 393         struct fixed_rsrc_data  *file_data;
 394         unsigned                nr_user_files;
 395
 396         /* if used, fixed mapped user buffers */
 397         unsigned                nr_user_bufs;
 398         struct io_mapped_ubuf   *user_bufs;
 399
 400         struct user_struct      *user;
 401
 402         struct completion       ref_comp;
 403         struct completion       sq_thread_comp;
 404
 405 #if defined(CONFIG_UNIX)
 406         struct socket           *ring_sock;
 407 #endif
 408
 409         struct idr              io_buffer_idr;
 410
 411         struct idr              personality_idr;
 412
 413         struct {
 414                 unsigned                cached_cq_tail;
 415                 unsigned                cq_entries;
 416                 unsigned                cq_mask;
 417                 atomic_t                cq_timeouts;
 418                 unsigned                cq_last_tm_flush;
 419                 unsigned long           cq_check_overflow;
 420                 struct wait_queue_head  cq_wait;
 421                 struct fasync_struct    *cq_fasync;
 422                 struct eventfd_ctx      *cq_ev_fd;
 423         } ____cacheline_aligned_in_smp;
 424
 425         struct {
 426                 spinlock_t              completion_lock;
 427
 428                 /*
 429                  * ->iopoll_list is protected by the ctx->uring_lock for
 430                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 431                  * For SQPOLL, only the single threaded io_sq_thread() will
 432                  * manipulate the list, hence no extra locking is needed there.
 433                  */
 434                 struct list_head        iopoll_list;
 435                 struct hlist_head       *cancel_hash;
 436                 unsigned                cancel_hash_bits;
 437                 bool                    poll_multi_file;
 438
 439                 spinlock_t              inflight_lock;
 440                 struct list_head        inflight_list;
 441         } ____cacheline_aligned_in_smp;
 442
 443         struct delayed_work             rsrc_put_work;
 444         struct llist_head               rsrc_put_llist;
 445         struct list_head                rsrc_ref_list;
 446         spinlock_t                      rsrc_ref_lock;
 447
 448         struct io_restriction           restrictions;
 449
 450         /* exit task_work */
 451         struct callback_head            *exit_task_work;
 452
 453         struct wait_queue_head          hash_wait;
 454
 455         /* Keep this last, we don't need it for the fast path */
 456         struct work_struct              exit_work;
 457 };
 458
 459 /*
 460  * First field must be the file pointer in all the
 461  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 462  */
 463 struct io_poll_iocb {
 464         struct file                     *file;
 465         struct wait_queue_head          *head;
 466         __poll_t                        events;
 467         bool                            done;
 468         bool                            canceled;
 469         struct wait_queue_entry         wait;
 470 };
 471
 472 struct io_poll_remove {
 473         struct file                     *file;
 474         u64                             addr;
 475 };
 476
 477 struct io_close {
 478         struct file                     *file;
 479         int                             fd;
 480 };
 481
 482 struct io_timeout_data {
 483         struct io_kiocb                 *req;
 484         struct hrtimer                  timer;
 485         struct timespec64               ts;
 486         enum hrtimer_mode               mode;
 487 };
 488
 489 struct io_accept {
 490         struct file                     *file;
 491         struct sockaddr __user          *addr;
 492         int __user                      *addr_len;
 493         int                             flags;
 494         unsigned long                   nofile;
 495 };
 496
 497 struct io_sync {
 498         struct file                     *file;
 499         loff_t                          len;
 500         loff_t                          off;
 501         int                             flags;
 502         int                             mode;
 503 };
 504
 505 struct io_cancel {
 506         struct file                     *file;
 507         u64                             addr;
 508 };
 509
 510 struct io_timeout {
 511         struct file                     *file;
 512         u32                             off;
 513         u32                             target_seq;
 514         struct list_head                list;
 515         /* head of the link, used by linked timeouts only */
 516         struct io_kiocb                 *head;
 517 };
 518
 519 struct io_timeout_rem {
 520         struct file                     *file;
 521         u64                             addr;
 522
 523         /* timeout update */
 524         struct timespec64               ts;
 525         u32                             flags;
 526 };
 527
 528 struct io_rw {
 529         /* NOTE: kiocb has the file as the first member, so don't do it here */
 530         struct kiocb                    kiocb;
 531         u64                             addr;
 532         u64                             len;
 533 };
 534
 535 struct io_connect {
 536         struct file                     *file;
 537         struct sockaddr __user          *addr;
 538         int                             addr_len;
 539 };
 540
 541 struct io_sr_msg {
 542         struct file                     *file;
 543         union {
 544                 struct user_msghdr __user *umsg;
 545                 void __user             *buf;
 546         };
 547         int                             msg_flags;
 548         int                             bgid;
 549         size_t                          len;
 550         struct io_buffer                *kbuf;
 551 };
 552
 553 struct io_open {
 554         struct file                     *file;
 555         int                             dfd;
 556         struct filename                 *filename;
 557         struct open_how                 how;
 558         unsigned long                   nofile;
 559 };
 560
 561 struct io_rsrc_update {
 562         struct file                     *file;
 563         u64                             arg;
 564         u32                             nr_args;
 565         u32                             offset;
 566 };
 567
 568 struct io_fadvise {
 569         struct file                     *file;
 570         u64                             offset;
 571         u32                             len;
 572         u32                             advice;
 573 };
 574
 575 struct io_madvise {
 576         struct file                     *file;
 577         u64                             addr;
 578         u32                             len;
 579         u32                             advice;
 580 };
 581
 582 struct io_epoll {
 583         struct file                     *file;
 584         int                             epfd;
 585         int                             op;
 586         int                             fd;
 587         struct epoll_event              event;
 588 };
 589
 590 struct io_splice {
 591         struct file                     *file_out;
 592         struct file                     *file_in;
 593         loff_t                          off_out;
 594         loff_t                          off_in;
 595         u64                             len;
 596         unsigned int                    flags;
 597 };
 598
 599 struct io_provide_buf {
 600         struct file                     *file;
 601         __u64                           addr;
 602         __s32                           len;
 603         __u32                           bgid;
 604         __u16                           nbufs;
 605         __u16                           bid;
 606 };
 607
 608 struct io_statx {
 609         struct file                     *file;
 610         int                             dfd;
 611         unsigned int                    mask;
 612         unsigned int                    flags;
 613         const char __user               *filename;
 614         struct statx __user             *buffer;
 615 };
 616
 617 struct io_shutdown {
 618         struct file                     *file;
 619         int                             how;
 620 };
 621
 622 struct io_rename {
 623         struct file                     *file;
 624         int                             old_dfd;
 625         int                             new_dfd;
 626         struct filename                 *oldpath;
 627         struct filename                 *newpath;
 628         int                             flags;
 629 };
 630
 631 struct io_unlink {
 632         struct file                     *file;
 633         int                             dfd;
 634         int                             flags;
 635         struct filename                 *filename;
 636 };
 637
 638 struct io_completion {
 639         struct file                     *file;
 640         struct list_head                list;
 641         int                             cflags;
 642 };
 643
 644 struct io_async_connect {
 645         struct sockaddr_storage         address;
 646 };
 647
 648 struct io_async_msghdr {
 649         struct iovec                    fast_iov[UIO_FASTIOV];
 650         /* points to an allocated iov, if NULL we use fast_iov instead */
 651         struct iovec                    *free_iov;
 652         struct sockaddr __user          *uaddr;
 653         struct msghdr                   msg;
 654         struct sockaddr_storage         addr;
 655 };
 656
 657 struct io_async_rw {
 658         struct iovec                    fast_iov[UIO_FASTIOV];
 659         const struct iovec              *free_iovec;
 660         struct iov_iter                 iter;
 661         size_t                          bytes_done;
 662         struct wait_page_queue          wpq;
 663 };
 664
 665 enum {
 666         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 667         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 668         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 669         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 670         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 671         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 672
 673         REQ_F_FAIL_LINK_BIT,
 674         REQ_F_INFLIGHT_BIT,
 675         REQ_F_CUR_POS_BIT,
 676         REQ_F_NOWAIT_BIT,
 677         REQ_F_LINK_TIMEOUT_BIT,
 678         REQ_F_ISREG_BIT,
 679         REQ_F_NEED_CLEANUP_BIT,
 680         REQ_F_POLLED_BIT,
 681         REQ_F_BUFFER_SELECTED_BIT,
 682         REQ_F_NO_FILE_TABLE_BIT,
 683         REQ_F_LTIMEOUT_ACTIVE_BIT,
 684         REQ_F_COMPLETE_INLINE_BIT,
 685
 686         /* not a real bit, just to check we're not overflowing the space */
 687         __REQ_F_LAST_BIT,
 688 };
 689
 690 enum {
 691         /* ctx owns file */
 692         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 693         /* drain existing IO first */
 694         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 695         /* linked sqes */
 696         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 697         /* doesn't sever on completion < 0 */
 698         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 699         /* IOSQE_ASYNC */
 700         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 701         /* IOSQE_BUFFER_SELECT */
 702         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 703
 704         /* fail rest of links */
 705         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 706         /* on inflight list, should be cancelled and waited on exit reliably */
 707         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 708         /* read/write uses file position */
 709         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 710         /* must not punt to workers */
 711         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 712         /* has or had linked timeout */
 713         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 714         /* regular file */
 715         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 716         /* needs cleanup */
 717         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 718         /* already went through poll handler */
 719         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 720         /* buffer already selected */
 721         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 722         /* doesn't need file table for this request */
 723         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 724         /* linked timeout is active, i.e. prepared by link's head */
 725         REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
 726         /* completion is deferred through io_comp_state */
 727         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 728 };
 729
 730 struct async_poll {
 731         struct io_poll_iocb     poll;
 732         struct io_poll_iocb     *double_poll;
 733 };
 734
 735 struct io_task_work {
 736         struct io_wq_work_node  node;
 737         task_work_func_t        func;
 738 };
 739
 740 /*
 741  * NOTE! Each of the iocb union members has the file pointer
 742  * as the first entry in their struct definition. So you can
 743  * access the file pointer through any of the sub-structs,
 744  * or directly as just 'ki_filp' in this struct.
 745  */
 746 struct io_kiocb {
 747         union {
 748                 struct file             *file;
 749                 struct io_rw            rw;
 750                 struct io_poll_iocb     poll;
 751                 struct io_poll_remove   poll_remove;
 752                 struct io_accept        accept;
 753                 struct io_sync          sync;
 754                 struct io_cancel        cancel;
 755                 struct io_timeout       timeout;
 756                 struct io_timeout_rem   timeout_rem;
 757                 struct io_connect       connect;
 758                 struct io_sr_msg        sr_msg;
 759                 struct io_open          open;
 760                 struct io_close         close;
 761                 struct io_rsrc_update   rsrc_update;
 762                 struct io_fadvise       fadvise;
 763                 struct io_madvise       madvise;
 764                 struct io_epoll         epoll;
 765                 struct io_splice        splice;
 766                 struct io_provide_buf   pbuf;
 767                 struct io_statx         statx;
 768                 struct io_shutdown      shutdown;
 769                 struct io_rename        rename;
 770                 struct io_unlink        unlink;
 771                 /* use only after cleaning per-op data, see io_clean_op() */
 772                 struct io_completion    compl;
 773         };
 774
 775         /* opcode allocated if it needs to store data for async defer */
 776         void                            *async_data;
 777         u8                              opcode;
 778         /* polled IO has completed */
 779         u8                              iopoll_completed;
 780
 781         u16                             buf_index;
 782         u32                             result;
 783
 784         struct io_ring_ctx              *ctx;
 785         unsigned int                    flags;
 786         refcount_t                      refs;
 787         struct task_struct              *task;
 788         u64                             user_data;
 789
 790         struct io_kiocb                 *link;
 791         struct percpu_ref               *fixed_rsrc_refs;
 792
 793         /*
 794          * 1. used with ctx->iopoll_list with reads/writes
 795          * 2. to track reqs with ->files (see io_op_def::file_table)
 796          */
 797         struct list_head                inflight_entry;
 798         union {
 799                 struct io_task_work     io_task_work;
 800                 struct callback_head    task_work;
 801         };
 802         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 803         struct hlist_node               hash_node;
 804         struct async_poll               *apoll;
 805         struct io_wq_work               work;
 806 };
 807
 808 struct io_defer_entry {
 809         struct list_head        list;
 810         struct io_kiocb         *req;
 811         u32                     seq;
 812 };
 813
/*
 * Static, per-opcode properties. One instance per IORING_OP_* value lives in
 * the io_op_defs[] table and is looked up by req->opcode.
 */
struct io_op_def {
        /* needs req->file assigned */
        unsigned                needs_file : 1;
        /* hash wq insertion if file is a regular file */
        unsigned                hash_reg_file : 1;
        /* unbound wq insertion if file is a non-regular file */
        unsigned                unbound_nonreg_file : 1;
        /* opcode is not supported by this kernel */
        unsigned                not_supported : 1;
        /* set if opcode supports polled "wait" (pollin and pollout below) */
        unsigned                pollin : 1;
        unsigned                pollout : 1;
        /* op supports buffer selection */
        unsigned                buffer_select : 1;
        /* must always have async data allocated */
        unsigned                needs_async_data : 1;
        /* should block plug */
        unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
};
 835
 836 static const struct io_op_def io_op_defs[] = {
 837         [IORING_OP_NOP] = {},
 838         [IORING_OP_READV] = {
 839                 .needs_file             = 1,
 840                 .unbound_nonreg_file    = 1,
 841                 .pollin                 = 1,
 842                 .buffer_select          = 1,
 843                 .needs_async_data       = 1,
 844                 .plug                   = 1,
 845                 .async_size             = sizeof(struct io_async_rw),
 846         },
 847         [IORING_OP_WRITEV] = {
 848                 .needs_file             = 1,
 849                 .hash_reg_file          = 1,
 850                 .unbound_nonreg_file    = 1,
 851                 .pollout                = 1,
 852                 .needs_async_data       = 1,
 853                 .plug                   = 1,
 854                 .async_size             = sizeof(struct io_async_rw),
 855         },
 856         [IORING_OP_FSYNC] = {
 857                 .needs_file             = 1,
 858         },
 859         [IORING_OP_READ_FIXED] = {
 860                 .needs_file             = 1,
 861                 .unbound_nonreg_file    = 1,
 862                 .pollin                 = 1,
 863                 .plug                   = 1,
 864                 .async_size             = sizeof(struct io_async_rw),
 865         },
 866         [IORING_OP_WRITE_FIXED] = {
 867                 .needs_file             = 1,
 868                 .hash_reg_file          = 1,
 869                 .unbound_nonreg_file    = 1,
 870                 .pollout                = 1,
 871                 .plug                   = 1,
 872                 .async_size             = sizeof(struct io_async_rw),
 873         },
 874         [IORING_OP_POLL_ADD] = {
 875                 .needs_file             = 1,
 876                 .unbound_nonreg_file    = 1,
 877         },
 878         [IORING_OP_POLL_REMOVE] = {},
 879         [IORING_OP_SYNC_FILE_RANGE] = {
 880                 .needs_file             = 1,
 881         },
 882         [IORING_OP_SENDMSG] = {
 883                 .needs_file             = 1,
 884                 .unbound_nonreg_file    = 1,
 885                 .pollout                = 1,
 886                 .needs_async_data       = 1,
 887                 .async_size             = sizeof(struct io_async_msghdr),
 888         },
 889         [IORING_OP_RECVMSG] = {
 890                 .needs_file             = 1,
 891                 .unbound_nonreg_file    = 1,
 892                 .pollin                 = 1,
 893                 .buffer_select          = 1,
 894                 .needs_async_data       = 1,
 895                 .async_size             = sizeof(struct io_async_msghdr),
 896         },
 897         [IORING_OP_TIMEOUT] = {
 898                 .needs_async_data       = 1,
 899                 .async_size             = sizeof(struct io_timeout_data),
 900         },
 901         [IORING_OP_TIMEOUT_REMOVE] = {
 902                 /* used by timeout updates' prep() */
 903         },
 904         [IORING_OP_ACCEPT] = {
 905                 .needs_file             = 1,
 906                 .unbound_nonreg_file    = 1,
 907                 .pollin                 = 1,
 908         },
 909         [IORING_OP_ASYNC_CANCEL] = {},
 910         [IORING_OP_LINK_TIMEOUT] = {
 911                 .needs_async_data       = 1,
 912                 .async_size             = sizeof(struct io_timeout_data),
 913         },
 914         [IORING_OP_CONNECT] = {
 915                 .needs_file             = 1,
 916                 .unbound_nonreg_file    = 1,
 917                 .pollout                = 1,
 918                 .needs_async_data       = 1,
 919                 .async_size             = sizeof(struct io_async_connect),
 920         },
 921         [IORING_OP_FALLOCATE] = {
 922                 .needs_file             = 1,
 923         },
 924         [IORING_OP_OPENAT] = {},
 925         [IORING_OP_CLOSE] = {},
 926         [IORING_OP_FILES_UPDATE] = {},
 927         [IORING_OP_STATX] = {},
 928         [IORING_OP_READ] = {
 929                 .needs_file             = 1,
 930                 .unbound_nonreg_file    = 1,
 931                 .pollin                 = 1,
 932                 .buffer_select          = 1,
 933                 .plug                   = 1,
 934                 .async_size             = sizeof(struct io_async_rw),
 935         },
 936         [IORING_OP_WRITE] = {
 937                 .needs_file             = 1,
 938                 .unbound_nonreg_file    = 1,
 939                 .pollout                = 1,
 940                 .plug                   = 1,
 941                 .async_size             = sizeof(struct io_async_rw),
 942         },
 943         [IORING_OP_FADVISE] = {
 944                 .needs_file             = 1,
 945         },
 946         [IORING_OP_MADVISE] = {},
 947         [IORING_OP_SEND] = {
 948                 .needs_file             = 1,
 949                 .unbound_nonreg_file    = 1,
 950                 .pollout                = 1,
 951         },
 952         [IORING_OP_RECV] = {
 953                 .needs_file             = 1,
 954                 .unbound_nonreg_file    = 1,
 955                 .pollin                 = 1,
 956                 .buffer_select          = 1,
 957         },
 958         [IORING_OP_OPENAT2] = {
 959         },
 960         [IORING_OP_EPOLL_CTL] = {
 961                 .unbound_nonreg_file    = 1,
 962         },
 963         [IORING_OP_SPLICE] = {
 964                 .needs_file             = 1,
 965                 .hash_reg_file          = 1,
 966                 .unbound_nonreg_file    = 1,
 967         },
 968         [IORING_OP_PROVIDE_BUFFERS] = {},
 969         [IORING_OP_REMOVE_BUFFERS] = {},
 970         [IORING_OP_TEE] = {
 971                 .needs_file             = 1,
 972                 .hash_reg_file          = 1,
 973                 .unbound_nonreg_file    = 1,
 974         },
 975         [IORING_OP_SHUTDOWN] = {
 976                 .needs_file             = 1,
 977         },
 978         [IORING_OP_RENAMEAT] = {},
 979         [IORING_OP_UNLINKAT] = {},
 980 };
 981
/*
 * Forward declarations of static helpers that are referenced before their
 * definitions appear in this file.
 */
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         struct files_struct *files);
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
                        struct io_ring_ctx *ctx);
static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

static bool io_rw_reissue(struct io_kiocb *req);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_double_put_req(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_put_task(struct task_struct *task, int nr);
static void io_queue_next(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                                 struct io_uring_rsrc_update *ip,
                                 unsigned nr_args);
static void __io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
                                struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
                           struct iov_iter *iter, bool needs_lock);
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
                             const struct iovec *fast_iov,
                             struct iov_iter *iter, bool force);
static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_comp_state *cs,
                                        struct io_ring_ctx *ctx);

/* slab cache that io_alloc_req() draws struct io_kiocb requests from */
static struct kmem_cache *req_cachep;

/* forward declaration; io_uring_get_socket() compares file->f_op against it */
static const struct file_operations io_uring_fops;
1023
1024 struct sock *io_uring_get_socket(struct file *file)
1025 {
1026 #if defined(CONFIG_UNIX)
1027         if (file->f_op == &io_uring_fops) {
1028                 struct io_ring_ctx *ctx = file->private_data;
1029
1030                 return ctx->ring_sock->sk;
1031         }
1032 #endif
1033         return NULL;
1034 }
1035 EXPORT_SYMBOL(io_uring_get_socket);
1036
/*
 * Walk a chain of linked requests: @pos starts at @head and follows ->link
 * until it becomes NULL. @pos must be an assignable struct io_kiocb pointer.
 */
#define io_for_each_link(pos, head) \
        for (pos = (head); pos; pos = pos->link)
1039
1040 static inline void io_clean_op(struct io_kiocb *req)
1041 {
1042         if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
1043                 __io_clean_op(req);
1044 }
1045
1046 static inline void io_set_resource_node(struct io_kiocb *req)
1047 {
1048         struct io_ring_ctx *ctx = req->ctx;
1049
1050         if (!req->fixed_rsrc_refs) {
1051                 req->fixed_rsrc_refs = &ctx->file_data->node->refs;
1052                 percpu_ref_get(req->fixed_rsrc_refs);
1053         }
1054 }
1055
1056 static bool io_match_task(struct io_kiocb *head,
1057                           struct task_struct *task,
1058                           struct files_struct *files)
1059 {
1060         struct io_kiocb *req;
1061
1062         if (task && head->task != task) {
1063                 /* in terms of cancelation, always match if req task is dead */
1064                 if (head->task->flags & PF_EXITING)
1065                         return true;
1066                 return false;
1067         }
1068         if (!files)
1069                 return true;
1070
1071         io_for_each_link(req, head) {
1072                 if (req->flags & REQ_F_INFLIGHT)
1073                         return true;
1074                 if (req->task->files == files)
1075                         return true;
1076         }
1077         return false;
1078 }
1079
1080 static inline void req_set_fail_links(struct io_kiocb *req)
1081 {
1082         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1083                 req->flags |= REQ_F_FAIL_LINK;
1084 }
1085
1086 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1087 {
1088         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1089
1090         complete(&ctx->ref_comp);
1091 }
1092
1093 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1094 {
1095         return !req->timeout.off;
1096 }
1097
1098 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1099 {
1100         struct io_ring_ctx *ctx;
1101         int hash_bits;
1102
1103         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1104         if (!ctx)
1105                 return NULL;
1106
1107         /*
1108          * Use 5 bits less than the max cq entries, that should give us around
1109          * 32 entries per hash list if totally full and uniformly spread.
1110          */
1111         hash_bits = ilog2(p->cq_entries);
1112         hash_bits -= 5;
1113         if (hash_bits <= 0)
1114                 hash_bits = 1;
1115         ctx->cancel_hash_bits = hash_bits;
1116         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1117                                         GFP_KERNEL);
1118         if (!ctx->cancel_hash)
1119                 goto err;
1120         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1121
1122         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1123                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1124                 goto err;
1125
1126         ctx->flags = p->flags;
1127         init_waitqueue_head(&ctx->sqo_sq_wait);
1128         INIT_LIST_HEAD(&ctx->sqd_list);
1129         init_waitqueue_head(&ctx->cq_wait);
1130         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1131         init_completion(&ctx->ref_comp);
1132         init_completion(&ctx->sq_thread_comp);
1133         idr_init(&ctx->io_buffer_idr);
1134         idr_init(&ctx->personality_idr);
1135         mutex_init(&ctx->uring_lock);
1136         init_waitqueue_head(&ctx->wait);
1137         spin_lock_init(&ctx->completion_lock);
1138         INIT_LIST_HEAD(&ctx->iopoll_list);
1139         INIT_LIST_HEAD(&ctx->defer_list);
1140         INIT_LIST_HEAD(&ctx->timeout_list);
1141         spin_lock_init(&ctx->inflight_lock);
1142         INIT_LIST_HEAD(&ctx->inflight_list);
1143         spin_lock_init(&ctx->rsrc_ref_lock);
1144         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1145         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1146         init_llist_head(&ctx->rsrc_put_llist);
1147         INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
1148         INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
1149         return ctx;
1150 err:
1151         kfree(ctx->cancel_hash);
1152         kfree(ctx);
1153         return NULL;
1154 }
1155
1156 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1157 {
1158         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1159                 struct io_ring_ctx *ctx = req->ctx;
1160
1161                 return seq != ctx->cached_cq_tail
1162                                 + READ_ONCE(ctx->cached_cq_overflow);
1163         }
1164
1165         return false;
1166 }
1167
1168 static void io_req_track_inflight(struct io_kiocb *req)
1169 {
1170         struct io_ring_ctx *ctx = req->ctx;
1171
1172         if (!(req->flags & REQ_F_INFLIGHT)) {
1173                 req->flags |= REQ_F_INFLIGHT;
1174
1175                 spin_lock_irq(&ctx->inflight_lock);
1176                 list_add(&req->inflight_entry, &ctx->inflight_list);
1177                 spin_unlock_irq(&ctx->inflight_lock);
1178         }
1179 }
1180
1181 static void io_prep_async_work(struct io_kiocb *req)
1182 {
1183         const struct io_op_def *def = &io_op_defs[req->opcode];
1184         struct io_ring_ctx *ctx = req->ctx;
1185
1186         if (req->flags & REQ_F_FORCE_ASYNC)
1187                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1188
1189         if (req->flags & REQ_F_ISREG) {
1190                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1191                         io_wq_hash_work(&req->work, file_inode(req->file));
1192         } else {
1193                 if (def->unbound_nonreg_file)
1194                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1195         }
1196 }
1197
1198 static void io_prep_async_link(struct io_kiocb *req)
1199 {
1200         struct io_kiocb *cur;
1201
1202         io_for_each_link(cur, req)
1203                 io_prep_async_work(cur);
1204 }
1205
/*
 * Punt a request (and its link chain) to the submitting task's io-wq.
 *
 * The linked timeout, if io_prep_linked_timeout() returns one, is queued only
 * after the request itself has been enqueued. The task is required to have
 * an io_uring context with an io_wq; either being absent is a BUG.
 */
static void io_queue_async_work(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link = io_prep_linked_timeout(req);
        struct io_uring_task *tctx = req->task->io_uring;

        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);

        trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
                                        &req->work, req->flags);
        /* init ->work of the whole link before punting */
        io_prep_async_link(req);
        io_wq_enqueue(tctx->io_wq, &req->work);
        if (link)
                io_queue_linked_timeout(link);
}
1223
1224 static void io_kill_timeout(struct io_kiocb *req)
1225 {
1226         struct io_timeout_data *io = req->async_data;
1227         int ret;
1228
1229         ret = hrtimer_try_to_cancel(&io->timer);
1230         if (ret != -1) {
1231                 atomic_set(&req->ctx->cq_timeouts,
1232                         atomic_read(&req->ctx->cq_timeouts) + 1);
1233                 list_del_init(&req->timeout.list);
1234                 io_cqring_fill_event(req, 0);
1235                 io_put_req_deferred(req, 1);
1236         }
1237 }
1238
1239 /*
1240  * Returns true if we found and killed one or more timeouts
1241  */
1242 static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
1243                              struct files_struct *files)
1244 {
1245         struct io_kiocb *req, *tmp;
1246         int canceled = 0;
1247
1248         spin_lock_irq(&ctx->completion_lock);
1249         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1250                 if (io_match_task(req, tsk, files)) {
1251                         io_kill_timeout(req);
1252                         canceled++;
1253                 }
1254         }
1255         spin_unlock_irq(&ctx->completion_lock);
1256         return canceled != 0;
1257 }
1258
/*
 * Queue deferred requests from the front of ctx->defer_list for as long as
 * req_need_defer() lets them through, freeing each entry once its request has
 * been handed to io_req_task_queue().
 *
 * The do/while reads the first entry before testing for emptiness, so the
 * caller must guarantee the list is non-empty (io_commit_cqring() checks).
 */
static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
        do {
                struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
                                                struct io_defer_entry, list);

                /* entries are processed in order: stop at the first blocked one */
                if (req_need_defer(de->req, de->seq))
                        break;
                list_del_init(&de->list);
                io_req_task_queue(de->req);
                kfree(de);
        } while (!list_empty(&ctx->defer_list));
}
1272
/*
 * Complete sequence-based timeouts whose target has been reached.
 *
 * The current sequence is cached_cq_tail minus the number of completed
 * timeouts (cq_timeouts). Entries are taken from the head of
 * ctx->timeout_list; processing stops at the first entry that has no
 * sequence (io_is_timeout_noseq()) or whose target is still ahead. The
 * sequence used is recorded in ctx->cq_last_tm_flush for the next call.
 */
static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
        u32 seq;

        if (list_empty(&ctx->timeout_list))
                return;

        seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);

        do {
                u32 events_needed, events_got;
                struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
                                                struct io_kiocb, timeout.list);

                if (io_is_timeout_noseq(req))
                        break;

                /*
                 * Since seq can easily wrap around over time, subtract
                 * the last seq at which timeouts were flushed before comparing.
                 * Assuming not more than 2^31-1 events have happened since,
                 * these subtractions won't have wrapped, so we can check if
                 * target is in [last_seq, current_seq] by comparing the two.
                 */
                events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
                events_got = seq - ctx->cq_last_tm_flush;
                if (events_got < events_needed)
                        break;

                /* unlink first; io_kill_timeout() repeats list_del_init() itself */
                list_del_init(&req->timeout.list);
                io_kill_timeout(req);
        } while (!list_empty(&ctx->timeout_list));

        ctx->cq_last_tm_flush = seq;
}
1308
/*
 * Publish the cached CQ tail to the shared ring.
 *
 * Timeouts are flushed first, then the tail is stored with release
 * semantics, and finally deferred requests are re-examined if
 * ctx->defer_list is non-empty.
 */
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
        io_flush_timeouts(ctx);

        /* order cqe stores with ring update */
        smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);

        if (unlikely(!list_empty(&ctx->defer_list)))
                __io_queue_deferred(ctx);
}
1319
1320 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1321 {
1322         struct io_rings *r = ctx->rings;
1323
1324         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1325 }
1326
1327 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1328 {
1329         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1330 }
1331
/*
 * Reserve the next CQ slot.
 *
 * Returns NULL when the number of pending events equals cq_ring_entries
 * (ring full). Otherwise advances ctx->cached_cq_tail and returns the CQE at
 * the previous tail, masked with ctx->cq_mask. The new tail is not visible in
 * the shared ring until io_commit_cqring() stores it.
 */
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
        struct io_rings *rings = ctx->rings;
        unsigned tail;

        /*
         * writes to the cq entry need to come after reading head; the
         * control dependency is enough as we're using WRITE_ONCE to
         * fill the cq entry
         */
        if (__io_cqring_events(ctx) == rings->cq_ring_entries)
                return NULL;

        tail = ctx->cached_cq_tail++;
        return &rings->cqes[tail & ctx->cq_mask];
}
1348
1349 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1350 {
1351         if (!ctx->cq_ev_fd)
1352                 return false;
1353         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1354                 return false;
1355         if (!ctx->eventfd_async)
1356                 return true;
1357         return io_wq_current_is_worker();
1358 }
1359
/*
 * Notify everyone who may be waiting for completions: waiters on ctx->wait,
 * the SQ-data wait queue when ctx->sq_data is set, the eventfd when
 * io_should_trigger_evfd() allows it, and ctx->cq_wait, whose wakeup is
 * paired with a SIGIO through kill_fasync().
 */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
        /* see waitqueue_active() comment */
        smp_mb();

        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);
        if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        if (io_should_trigger_evfd(ctx))
                eventfd_signal(ctx->cq_ev_fd, 1);
        if (waitqueue_active(&ctx->cq_wait)) {
                wake_up_interruptible(&ctx->cq_wait);
                kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
        }
}
1376
1377 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1378 {
1379         /* see waitqueue_active() comment */
1380         smp_mb();
1381
1382         if (ctx->flags & IORING_SETUP_SQPOLL) {
1383                 if (waitqueue_active(&ctx->wait))
1384                         wake_up(&ctx->wait);
1385         }
1386         if (io_should_trigger_evfd(ctx))
1387                 eventfd_signal(ctx->cq_ev_fd, 1);
1388         if (waitqueue_active(&ctx->cq_wait)) {
1389                 wake_up_interruptible(&ctx->cq_wait);
1390                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1391         }
1392 }
1393
/*
 * Move backlogged completions from ctx->cq_overflow_list into the CQ ring.
 *
 * @force: when set, entries that cannot get a CQ slot are dropped and only
 *         accounted in the overflow counter instead of stopping the flush.
 * @tsk, @files: restrict the flush to entries matched by io_match_task().
 *
 * Without @force the function bails out early if the CQ ring is full.
 * Flushed requests are collected on a local list and their references are
 * put after completion_lock has been released.
 *
 * Returns true if there are no backlogged entries after the flush
 */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
                                       struct task_struct *tsk,
                                       struct files_struct *files)
{
        struct io_rings *rings = ctx->rings;
        struct io_kiocb *req, *tmp;
        struct io_uring_cqe *cqe;
        unsigned long flags;
        bool all_flushed, posted;
        LIST_HEAD(list);

        if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
                return false;

        posted = false;
        spin_lock_irqsave(&ctx->completion_lock, flags);
        list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
                if (!io_match_task(req, tsk, files))
                        continue;

                cqe = io_get_cqring(ctx);
                /* ring filled up again: keep the rest backlogged unless forced */
                if (!cqe && !force)
                        break;

                list_move(&req->compl.list, &list);
                if (cqe) {
                        WRITE_ONCE(cqe->user_data, req->user_data);
                        WRITE_ONCE(cqe->res, req->result);
                        WRITE_ONCE(cqe->flags, req->compl.cflags);
                } else {
                        /* forced flush without a free slot: count it as lost */
                        ctx->cached_cq_overflow++;
                        WRITE_ONCE(ctx->rings->cq_overflow,
                                   ctx->cached_cq_overflow);
                }
                posted = true;
        }

        all_flushed = list_empty(&ctx->cq_overflow_list);
        if (all_flushed) {
                /* undo what __io_cqring_fill_event() set when the backlog started */
                clear_bit(0, &ctx->sq_check_overflow);
                clear_bit(0, &ctx->cq_check_overflow);
                ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
        }

        if (posted)
                io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);
        if (posted)
                io_cqring_ev_posted(ctx);

        /* drop the reference each backlogged request was holding */
        while (!list_empty(&list)) {
                req = list_first_entry(&list, struct io_kiocb, compl.list);
                list_del(&req->compl.list);
                io_put_req(req);
        }

        return all_flushed;
}
1453
1454 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1455                                      struct task_struct *tsk,
1456                                      struct files_struct *files)
1457 {
1458         bool ret = true;
1459
1460         if (test_bit(0, &ctx->cq_check_overflow)) {
1461                 /* iopoll syncs against uring_lock, not completion_lock */
1462                 if (ctx->flags & IORING_SETUP_IOPOLL)
1463                         mutex_lock(&ctx->uring_lock);
1464                 ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
1465                 if (ctx->flags & IORING_SETUP_IOPOLL)
1466                         mutex_unlock(&ctx->uring_lock);
1467         }
1468
1469         return ret;
1470 }
1471
/*
 * Record a completion for @req with result @res and CQE flags @cflags.
 *
 * Three outcomes, in order of preference:
 *  1. a CQ slot is available: the CQE is written in place;
 *  2. no slot and the ring is in overflow-flushed mode or the task is idling
 *     for cancelation: the event is dropped and the overflow counter bumped;
 *  3. otherwise the request is parked on ctx->cq_overflow_list, holding an
 *     extra reference, to be flushed later.
 *
 * The tail is not published here; callers follow up with io_commit_cqring().
 */
static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_uring_cqe *cqe;

        trace_io_uring_complete(ctx, req->user_data, res);

        /*
         * If we can't get a cq entry, userspace overflowed the
         * submission (by quite a lot). Increment the overflow count in
         * the ring.
         */
        cqe = io_get_cqring(ctx);
        if (likely(cqe)) {
                WRITE_ONCE(cqe->user_data, req->user_data);
                WRITE_ONCE(cqe->res, res);
                WRITE_ONCE(cqe->flags, cflags);
        } else if (ctx->cq_overflow_flushed ||
                   atomic_read(&req->task->io_uring->in_idle)) {
                /*
                 * If we're in ring overflow flush mode, or in task cancel mode,
                 * then we cannot store the request for later flushing, we need
                 * to drop it on the floor.
                 */
                ctx->cached_cq_overflow++;
                WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
        } else {
                /* first backlogged entry: raise the overflow indicators */
                if (list_empty(&ctx->cq_overflow_list)) {
                        set_bit(0, &ctx->sq_check_overflow);
                        set_bit(0, &ctx->cq_check_overflow);
                        ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
                }
                /* req->compl may only be used once per-op data is cleaned */
                io_clean_op(req);
                req->result = res;
                req->compl.cflags = cflags;
                refcount_inc(&req->refs);
                list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
        }
}
1511
/* Record a completion for @req with result @res and no CQE flags. */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
        __io_cqring_fill_event(req, res, 0);
}
1516
/*
 * Complete @req immediately: fill and commit its CQE under completion_lock,
 * drop one request reference and notify waiters.
 *
 * If that was the last reference, the request is dismantled and cached on
 * the locked free list while the lock is still held; io_queue_next() and the
 * ctx reference put happen after the lock is released.
 */
static inline void io_req_complete_post(struct io_kiocb *req, long res,
                                        unsigned int cflags)
{
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;

        spin_lock_irqsave(&ctx->completion_lock, flags);
        __io_cqring_fill_event(req, res, cflags);
        io_commit_cqring(ctx);
        /*
         * If we're the last reference to this request, add to our locked
         * free_list cache.
         */
        if (refcount_dec_and_test(&req->refs)) {
                struct io_comp_state *cs = &ctx->submit_state.comp;

                io_dismantle_req(req);
                io_put_task(req->task, 1);
                list_add(&req->compl.list, &cs->locked_free_list);
                cs->locked_free_nr++;
        } else
                req = NULL;     /* someone else still holds the request */
        spin_unlock_irqrestore(&ctx->completion_lock, flags);

        io_cqring_ev_posted(ctx);
        if (req) {
                io_queue_next(req);
                percpu_ref_put(&ctx->refs);
        }
}
1547
1548 static void io_req_complete_state(struct io_kiocb *req, long res,
1549                                   unsigned int cflags)
1550 {
1551         io_clean_op(req);
1552         req->result = res;
1553         req->compl.cflags = cflags;
1554         req->flags |= REQ_F_COMPLETE_INLINE;
1555 }
1556
1557 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1558                                      long res, unsigned cflags)
1559 {
1560         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1561                 io_req_complete_state(req, res, cflags);
1562         else
1563                 io_req_complete_post(req, res, cflags);
1564 }
1565
/*
 * Complete @req with result @res, no issue flags and no CQE flags; with
 * issue flags of 0 this always takes the immediate-post path.
 */
static inline void io_req_complete(struct io_kiocb *req, long res)
{
        __io_req_complete(req, 0, res, 0);
}
1570
/*
 * Refill the submission-side request array (state->reqs) from the cached
 * free lists.
 *
 * Returns true if at least one request was moved into state->reqs.
 */
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
        struct io_submit_state *state = &ctx->submit_state;
        struct io_comp_state *cs = &state->comp;
        struct io_kiocb *req = NULL;

        /*
         * If we have more than a batch's worth of requests in our IRQ side
         * locked cache, grab the lock and move them over to our submission
         * side cache.
         */
        if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
                spin_lock_irq(&ctx->completion_lock);
                list_splice_init(&cs->locked_free_list, &cs->free_list);
                cs->locked_free_nr = 0;
                spin_unlock_irq(&ctx->completion_lock);
        }

        /* pull from free_list until it is empty or state->reqs is full */
        while (!list_empty(&cs->free_list)) {
                req = list_first_entry(&cs->free_list, struct io_kiocb,
                                        compl.list);
                list_del(&req->compl.list);
                state->reqs[state->free_reqs++] = req;
                if (state->free_reqs == ARRAY_SIZE(state->reqs))
                        break;
        }

        /* req stays NULL only if the loop body never ran */
        return req != NULL;
}
1600
/*
 * Hand out one request from the submission state's array, refilling the
 * array when it is empty.
 *
 * Refill order: recycled requests via io_flush_cached_reqs(), then a bulk
 * slab allocation of IO_REQ_ALLOC_BATCH, then a single slab allocation.
 *
 * Returns the request, or NULL if even the single allocation failed.
 */
static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
{
        struct io_submit_state *state = &ctx->submit_state;

        BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));

        if (!state->free_reqs) {
                gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
                int ret;

                if (io_flush_cached_reqs(ctx))
                        goto got_req;

                ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
                                            state->reqs);

                /*
                 * Bulk alloc is all-or-nothing. If we fail to get a batch,
                 * retry single alloc to be on the safe side.
                 */
                if (unlikely(ret <= 0)) {
                        state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
                        if (!state->reqs[0])
                                return NULL;
                        ret = 1;
                }
                state->free_reqs = ret;
        }
got_req:
        /* requests are handed out from the end of the filled part */
        state->free_reqs--;
        return state->reqs[state->free_reqs];
}
1633
1634 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1635                           bool fixed)
1636 {
1637         if (!fixed)
1638                 fput(file);
1639 }
1640
1641 static void io_dismantle_req(struct io_kiocb *req)
1642 {
1643         io_clean_op(req);
1644
1645         if (req->async_data)
1646                 kfree(req->async_data);
1647         if (req->file)
1648                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1649         if (req->fixed_rsrc_refs)
1650                 percpu_ref_put(req->fixed_rsrc_refs);
1651
1652         if (req->flags & REQ_F_INFLIGHT) {
1653                 struct io_ring_ctx *ctx = req->ctx;
1654                 unsigned long flags;
1655
1656                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1657                 list_del(&req->inflight_entry);
1658                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1659                 req->flags &= ~REQ_F_INFLIGHT;
1660         }
1661 }
1662
1663 /* must to be called somewhat shortly after putting a request */
1664 static inline void io_put_task(struct task_struct *task, int nr)
1665 {
1666         struct io_uring_task *tctx = task->io_uring;
1667
1668         percpu_counter_sub(&tctx->inflight, nr);
1669         if (unlikely(atomic_read(&tctx->in_idle)))
1670                 wake_up(&tctx->wait);
1671         put_task_struct_many(task, nr);
1672 }
1673
1674 static void __io_free_req(struct io_kiocb *req)
1675 {
1676         struct io_ring_ctx *ctx = req->ctx;
1677
1678         io_dismantle_req(req);
1679         io_put_task(req->task, 1);
1680
1681         kmem_cache_free(req_cachep, req);
1682         percpu_ref_put(&ctx->refs);
1683 }
1684
1685 static inline void io_remove_next_linked(struct io_kiocb *req)
1686 {
1687         struct io_kiocb *nxt = req->link;
1688
1689         req->link = nxt->link;
1690         nxt->link = NULL;
1691 }
1692
1693 static void io_kill_linked_timeout(struct io_kiocb *req)
1694 {
1695         struct io_ring_ctx *ctx = req->ctx;
1696         struct io_kiocb *link;
1697         bool cancelled = false;
1698         unsigned long flags;
1699
1700         spin_lock_irqsave(&ctx->completion_lock, flags);
1701         link = req->link;
1702
1703         /*
1704          * Can happen if a linked timeout fired and link had been like
1705          * req -> link t-out -> link t-out [-> ...]
1706          */
1707         if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1708                 struct io_timeout_data *io = link->async_data;
1709                 int ret;
1710
1711                 io_remove_next_linked(req);
1712                 link->timeout.head = NULL;
1713                 ret = hrtimer_try_to_cancel(&io->timer);
1714                 if (ret != -1) {
1715                         io_cqring_fill_event(link, -ECANCELED);
1716                         io_commit_cqring(ctx);
1717                         cancelled = true;
1718                 }
1719         }
1720         req->flags &= ~REQ_F_LINK_TIMEOUT;
1721         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1722
1723         if (cancelled) {
1724                 io_cqring_ev_posted(ctx);
1725                 io_put_req(link);
1726         }
1727 }
1728
1729
1730 static void io_fail_links(struct io_kiocb *req)
1731 {
1732         struct io_kiocb *link, *nxt;
1733         struct io_ring_ctx *ctx = req->ctx;
1734         unsigned long flags;
1735
1736         spin_lock_irqsave(&ctx->completion_lock, flags);
1737         link = req->link;
1738         req->link = NULL;
1739
1740         while (link) {
1741                 nxt = link->link;
1742                 link->link = NULL;
1743
1744                 trace_io_uring_fail_link(req, link);
1745                 io_cqring_fill_event(link, -ECANCELED);
1746
1747                 io_put_req_deferred(link, 2);
1748                 link = nxt;
1749         }
1750         io_commit_cqring(ctx);
1751         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1752
1753         io_cqring_ev_posted(ctx);
1754 }
1755
1756 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
1757 {
1758         if (req->flags & REQ_F_LINK_TIMEOUT)
1759                 io_kill_linked_timeout(req);
1760
1761         /*
1762          * If LINK is set, we have dependent requests in this chain. If we
1763          * didn't fail this request, queue the first one up, moving any other
1764          * dependencies to the next request. In case of failure, fail the rest
1765          * of the chain.
1766          */
1767         if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
1768                 struct io_kiocb *nxt = req->link;
1769
1770                 req->link = NULL;
1771                 return nxt;
1772         }
1773         io_fail_links(req);
1774         return NULL;
1775 }
1776
1777 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1778 {
1779         if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
1780                 return NULL;
1781         return __io_req_find_next(req);
1782 }
1783
1784 static void ctx_flush_and_put(struct io_ring_ctx *ctx)
1785 {
1786         if (!ctx)
1787                 return;
1788         if (ctx->submit_state.comp.nr) {
1789                 mutex_lock(&ctx->uring_lock);
1790                 io_submit_flush_completions(&ctx->submit_state.comp, ctx);
1791                 mutex_unlock(&ctx->uring_lock);
1792         }
1793         percpu_ref_put(&ctx->refs);
1794 }
1795
1796 static bool __tctx_task_work(struct io_uring_task *tctx)
1797 {
1798         struct io_ring_ctx *ctx = NULL;
1799         struct io_wq_work_list list;
1800         struct io_wq_work_node *node;
1801
1802         if (wq_list_empty(&tctx->task_list))
1803                 return false;
1804
1805         spin_lock_irq(&tctx->task_lock);
1806         list = tctx->task_list;
1807         INIT_WQ_LIST(&tctx->task_list);
1808         spin_unlock_irq(&tctx->task_lock);
1809
1810         node = list.first;
1811         while (node) {
1812                 struct io_wq_work_node *next = node->next;
1813                 struct io_kiocb *req;
1814
1815                 req = container_of(node, struct io_kiocb, io_task_work.node);
1816                 if (req->ctx != ctx) {
1817                         ctx_flush_and_put(ctx);
1818                         ctx = req->ctx;
1819                         percpu_ref_get(&ctx->refs);
1820                 }
1821
1822                 req->task_work.func(&req->task_work);
1823                 node = next;
1824         }
1825
1826         ctx_flush_and_put(ctx);
1827         return list.first != NULL;
1828 }
1829
1830 static void tctx_task_work(struct callback_head *cb)
1831 {
1832         struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
1833
1834         clear_bit(0, &tctx->task_state);
1835
1836         while (__tctx_task_work(tctx))
1837                 cond_resched();
1838 }
1839
1840 static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
1841                             enum task_work_notify_mode notify)
1842 {
1843         struct io_uring_task *tctx = tsk->io_uring;
1844         struct io_wq_work_node *node, *prev;
1845         unsigned long flags;
1846         int ret;
1847
1848         WARN_ON_ONCE(!tctx);
1849
1850         spin_lock_irqsave(&tctx->task_lock, flags);
1851         wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
1852         spin_unlock_irqrestore(&tctx->task_lock, flags);
1853
1854         /* task_work already pending, we're done */
1855         if (test_bit(0, &tctx->task_state) ||
1856             test_and_set_bit(0, &tctx->task_state))
1857                 return 0;
1858
1859         if (!task_work_add(tsk, &tctx->task_work, notify))
1860                 return 0;
1861
1862         /*
1863          * Slow path - we failed, find and delete work. if the work is not
1864          * in the list, it got run and we're fine.
1865          */
1866         ret = 0;
1867         spin_lock_irqsave(&tctx->task_lock, flags);
1868         wq_list_for_each(node, prev, &tctx->task_list) {
1869                 if (&req->io_task_work.node == node) {
1870                         wq_list_del(&tctx->task_list, node, prev);
1871                         ret = 1;
1872                         break;
1873                 }
1874         }
1875         spin_unlock_irqrestore(&tctx->task_lock, flags);
1876         clear_bit(0, &tctx->task_state);
1877         return ret;
1878 }
1879
1880 static int io_req_task_work_add(struct io_kiocb *req)
1881 {
1882         struct task_struct *tsk = req->task;
1883         struct io_ring_ctx *ctx = req->ctx;
1884         enum task_work_notify_mode notify;
1885         int ret;
1886
1887         if (tsk->flags & PF_EXITING)
1888                 return -ESRCH;
1889
1890         /*
1891          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
1892          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
1893          * processing task_work. There's no reliable way to tell if TWA_RESUME
1894          * will do the job.
1895          */
1896         notify = TWA_NONE;
1897         if (!(ctx->flags & IORING_SETUP_SQPOLL))
1898                 notify = TWA_SIGNAL;
1899
1900         ret = io_task_work_add(tsk, req, notify);
1901         if (!ret)
1902                 wake_up_process(tsk);
1903
1904         return ret;
1905 }
1906
1907 static void io_req_task_work_add_fallback(struct io_kiocb *req,
1908                                           task_work_func_t cb)
1909 {
1910         struct io_ring_ctx *ctx = req->ctx;
1911         struct callback_head *head;
1912
1913         init_task_work(&req->task_work, cb);
1914         do {
1915                 head = READ_ONCE(ctx->exit_task_work);
1916                 req->task_work.next = head;
1917         } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
1918 }
1919
1920 static void __io_req_task_cancel(struct io_kiocb *req, int error)
1921 {
1922         struct io_ring_ctx *ctx = req->ctx;
1923
1924         spin_lock_irq(&ctx->completion_lock);
1925         io_cqring_fill_event(req, error);
1926         io_commit_cqring(ctx);
1927         spin_unlock_irq(&ctx->completion_lock);
1928
1929         io_cqring_ev_posted(ctx);
1930         req_set_fail_links(req);
1931         io_double_put_req(req);
1932 }
1933
1934 static void io_req_task_cancel(struct callback_head *cb)
1935 {
1936         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1937         struct io_ring_ctx *ctx = req->ctx;
1938
1939         mutex_lock(&ctx->uring_lock);
1940         __io_req_task_cancel(req, req->result);
1941         mutex_unlock(&ctx->uring_lock);
1942         percpu_ref_put(&ctx->refs);
1943 }
1944
1945 static void __io_req_task_submit(struct io_kiocb *req)
1946 {
1947         struct io_ring_ctx *ctx = req->ctx;
1948
1949         /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
1950         mutex_lock(&ctx->uring_lock);
1951         if (!(current->flags & PF_EXITING) && !current->in_execve)
1952                 __io_queue_sqe(req);
1953         else
1954                 __io_req_task_cancel(req, -EFAULT);
1955         mutex_unlock(&ctx->uring_lock);
1956 }
1957
1958 static void io_req_task_submit(struct callback_head *cb)
1959 {
1960         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1961
1962         __io_req_task_submit(req);
1963 }
1964
1965 static void io_req_task_queue(struct io_kiocb *req)
1966 {
1967         int ret;
1968
1969         req->task_work.func = io_req_task_submit;
1970         ret = io_req_task_work_add(req);
1971         if (unlikely(ret)) {
1972                 req->result = -ECANCELED;
1973                 percpu_ref_get(&req->ctx->refs);
1974                 io_req_task_work_add_fallback(req, io_req_task_cancel);
1975         }
1976 }
1977
1978 static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
1979 {
1980         percpu_ref_get(&req->ctx->refs);
1981         req->result = ret;
1982         req->task_work.func = io_req_task_cancel;
1983
1984         if (unlikely(io_req_task_work_add(req)))
1985                 io_req_task_work_add_fallback(req, io_req_task_cancel);
1986 }
1987
/* If @req has a follow-up request in its link chain, queue it for submission. */
static inline void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *next = io_req_find_next(req);

	if (!next)
		return;

	io_req_task_queue(next);
}
1995
/*
 * Free @req. Any linked follow-up request is queued first, since the link
 * information lives in @req itself.
 */
static void io_free_req(struct io_kiocb *req)
{
	/* must run before the request memory is released */
	io_queue_next(req);

	__io_free_req(req);
}
2001
/*
 * Accumulator used while freeing requests in bulk, so that task and ring
 * references can be dropped in one go instead of once per request.
 */
struct req_batch {
	/* task the pending task_refs belong to, NULL if none yet */
	struct task_struct	*task;
	/* number of references on @task still to be dropped */
	int			task_refs;
	/* number of ring context references still to be dropped */
	int			ctx_refs;
};

/* Reset @rb to the empty state: no task and no pending references. */
static inline void io_init_req_batch(struct req_batch *rb)
{
	*rb = (struct req_batch) {
		.task		= NULL,
		.task_refs	= 0,
		.ctx_refs	= 0,
	};
}
2014
2015 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2016                                      struct req_batch *rb)
2017 {
2018         if (rb->task)
2019                 io_put_task(rb->task, rb->task_refs);
2020         if (rb->ctx_refs)
2021                 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2022 }
2023
2024 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2025                               struct io_submit_state *state)
2026 {
2027         io_queue_next(req);
2028
2029         if (req->task != rb->task) {
2030                 if (rb->task)
2031                         io_put_task(rb->task, rb->task_refs);
2032                 rb->task = req->task;
2033                 rb->task_refs = 0;
2034         }
2035         rb->task_refs++;
2036         rb->ctx_refs++;
2037
2038         io_dismantle_req(req);
2039         if (state->free_reqs != ARRAY_SIZE(state->reqs))
2040                 state->reqs[state->free_reqs++] = req;
2041         else
2042                 list_add(&req->compl.list, &state->comp.free_list);
2043 }
2044
2045 static void io_submit_flush_completions(struct io_comp_state *cs,
2046                                         struct io_ring_ctx *ctx)
2047 {
2048         int i, nr = cs->nr;
2049         struct io_kiocb *req;
2050         struct req_batch rb;
2051
2052         io_init_req_batch(&rb);
2053         spin_lock_irq(&ctx->completion_lock);
2054         for (i = 0; i < nr; i++) {
2055                 req = cs->reqs[i];
2056                 __io_cqring_fill_event(req, req->result, req->compl.cflags);
2057         }
2058         io_commit_cqring(ctx);
2059         spin_unlock_irq(&ctx->completion_lock);
2060
2061         io_cqring_ev_posted(ctx);
2062         for (i = 0; i < nr; i++) {
2063                 req = cs->reqs[i];
2064
2065                 /* submission and completion refs */
2066                 if (refcount_sub_and_test(2, &req->refs))
2067                         io_req_free_batch(&rb, req, &ctx->submit_state);
2068         }
2069
2070         io_req_free_batch_finish(ctx, &rb);
2071         cs->nr = 0;
2072 }
2073
2074 /*
2075  * Drop reference to request, return next in chain (if there is one) if this
2076  * was the last reference to this request.
2077  */
2078 static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2079 {
2080         struct io_kiocb *nxt = NULL;
2081
2082         if (refcount_dec_and_test(&req->refs)) {
2083                 nxt = io_req_find_next(req);
2084                 __io_free_req(req);
2085         }
2086         return nxt;
2087 }
2088
2089 static void io_put_req(struct io_kiocb *req)
2090 {
2091         if (refcount_dec_and_test(&req->refs))
2092                 io_free_req(req);
2093 }
2094
2095 static void io_put_req_deferred_cb(struct callback_head *cb)
2096 {
2097         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2098
2099         io_free_req(req);
2100 }
2101
2102 static void io_free_req_deferred(struct io_kiocb *req)
2103 {
2104         int ret;
2105
2106         req->task_work.func = io_put_req_deferred_cb;
2107         ret = io_req_task_work_add(req);
2108         if (unlikely(ret))
2109                 io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
2110 }
2111
2112 static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2113 {
2114         if (refcount_sub_and_test(refs, &req->refs))
2115                 io_free_req_deferred(req);
2116 }
2117
2118 static void io_double_put_req(struct io_kiocb *req)
2119 {
2120         /* drop both submit and complete references */
2121         if (refcount_sub_and_test(2, &req->refs))
2122                 io_free_req(req);
2123 }
2124
/*
 * Return the value of __io_cqring_events() for @ctx, issuing a read barrier
 * first. See the barrier notes in the comment at the top of this file.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	unsigned events;

	smp_rmb();
	events = __io_cqring_events(ctx);

	return events;
}
2131
2132 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2133 {
2134         struct io_rings *rings = ctx->rings;
2135
2136         /* make sure SQ entry isn't read before tail */
2137         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2138 }
2139
2140 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2141 {
2142         unsigned int cflags;
2143
2144         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2145         cflags |= IORING_CQE_F_BUFFER;
2146         req->flags &= ~REQ_F_BUFFER_SELECTED;
2147         kfree(kbuf);
2148         return cflags;
2149 }
2150
2151 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2152 {
2153         struct io_buffer *kbuf;
2154
2155         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2156         return io_put_kbuf(req, kbuf);
2157 }
2158
2159 static inline bool io_run_task_work(void)
2160 {
2161         /*
2162          * Not safe to run on exiting task, and the task_work handling will
2163          * not add work to such a task.
2164          */
2165         if (unlikely(current->flags & PF_EXITING))
2166                 return false;
2167         if (current->task_works) {
2168                 __set_current_state(TASK_RUNNING);
2169                 task_work_run();
2170                 return true;
2171         }
2172
2173         return false;
2174 }
2175
2176 /*
2177  * Find and free completed poll iocbs
2178  */
2179 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2180                                struct list_head *done)
2181 {
2182         struct req_batch rb;
2183         struct io_kiocb *req;
2184
2185         /* order with ->result store in io_complete_rw_iopoll() */
2186         smp_rmb();
2187
2188         io_init_req_batch(&rb);
2189         while (!list_empty(done)) {
2190                 int cflags = 0;
2191
2192                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
2193                 list_del(&req->inflight_entry);
2194
2195                 if (READ_ONCE(req->result) == -EAGAIN) {
2196                         req->iopoll_completed = 0;
2197                         if (io_rw_reissue(req))
2198                                 continue;
2199                 }
2200
2201                 if (req->flags & REQ_F_BUFFER_SELECTED)
2202                         cflags = io_put_rw_kbuf(req);
2203
2204                 __io_cqring_fill_event(req, req->result, cflags);
2205                 (*nr_events)++;
2206
2207                 if (refcount_dec_and_test(&req->refs))
2208                         io_req_free_batch(&rb, req, &ctx->submit_state);
2209         }
2210
2211         io_commit_cqring(ctx);
2212         io_cqring_ev_posted_iopoll(ctx);
2213         io_req_free_batch_finish(ctx, &rb);
2214 }
2215
2216 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2217                         long min)
2218 {
2219         struct io_kiocb *req, *tmp;
2220         LIST_HEAD(done);
2221         bool spin;
2222         int ret;
2223
2224         /*
2225          * Only spin for completions if we don't have multiple devices hanging
2226          * off our complete list, and we're under the requested amount.
2227          */
2228         spin = !ctx->poll_multi_file && *nr_events < min;
2229
2230         ret = 0;
2231         list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2232                 struct kiocb *kiocb = &req->rw.kiocb;
2233
2234                 /*
2235                  * Move completed and retryable entries to our local lists.
2236                  * If we find a request that requires polling, break out
2237                  * and complete those lists first, if we have entries there.
2238                  */
2239                 if (READ_ONCE(req->iopoll_completed)) {
2240                         list_move_tail(&req->inflight_entry, &done);
2241                         continue;
2242                 }
2243                 if (!list_empty(&done))
2244                         break;
2245
2246                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2247                 if (ret < 0)
2248                         break;
2249
2250                 /* iopoll may have completed current req */
2251                 if (READ_ONCE(req->iopoll_completed))
2252                         list_move_tail(&req->inflight_entry, &done);
2253
2254                 if (ret && spin)
2255                         spin = false;
2256                 ret = 0;
2257         }
2258
2259         if (!list_empty(&done))
2260                 io_iopoll_complete(ctx, nr_events, &done);
2261
2262         return ret;
2263 }
2264
2265 /*
2266  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
2267  * non-spinning poll check - we'll still enter the driver poll loop, but only
2268  * as a non-spinning completion check.
2269  */
2270 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2271                                 long min)
2272 {
2273         while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
2274                 int ret;
2275
2276                 ret = io_do_iopoll(ctx, nr_events, min);
2277                 if (ret < 0)
2278                         return ret;
2279                 if (*nr_events >= min)
2280                         return 0;
2281         }
2282
2283         return 1;
2284 }
2285
2286 /*
2287  * We can't just wait for polled events to come to us, we have to actively
2288  * find and complete them.
2289  */
2290 static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2291 {
2292         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2293                 return;
2294
2295         mutex_lock(&ctx->uring_lock);
2296         while (!list_empty(&ctx->iopoll_list)) {
2297                 unsigned int nr_events = 0;
2298
2299                 io_do_iopoll(ctx, &nr_events, 0);
2300
2301                 /* let it sleep and repeat later if can't complete a request */
2302                 if (nr_events == 0)
2303                         break;
2304                 /*
2305                  * Ensure we allow local-to-the-cpu processing to take place,
2306                  * in this case we need to ensure that we reap all events.
2307                  * Also let task_work, etc. to progress by releasing the mutex
2308                  */
2309                 if (need_resched()) {
2310                         mutex_unlock(&ctx->uring_lock);
2311                         cond_resched();
2312                         mutex_lock(&ctx->uring_lock);
2313                 }
2314         }
2315         mutex_unlock(&ctx->uring_lock);
2316 }
2317
2318 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2319 {
2320         unsigned int nr_events = 0;
2321         int iters = 0, ret = 0;
2322
2323         /*
2324          * We disallow the app entering submit/complete with polling, but we
2325          * still need to lock the ring to prevent racing with polled issue
2326          * that got punted to a workqueue.
2327          */
2328         mutex_lock(&ctx->uring_lock);
2329         do {
2330                 /*
2331                  * Don't enter poll loop if we already have events pending.
2332                  * If we do, we can potentially be spinning for commands that
2333                  * already triggered a CQE (eg in error).
2334                  */
2335                 if (test_bit(0, &ctx->cq_check_overflow))
2336                         __io_cqring_overflow_flush(ctx, false, NULL, NULL);
2337                 if (io_cqring_events(ctx))
2338                         break;
2339
2340                 /*
2341                  * If a submit got punted to a workqueue, we can have the
2342                  * application entering polling for a command before it gets
2343                  * issued. That app will hold the uring_lock for the duration
2344                  * of the poll right here, so we need to take a breather every
2345                  * now and then to ensure that the issue has a chance to add
2346                  * the poll to the issued list. Otherwise we can spin here
2347                  * forever, while the workqueue is stuck trying to acquire the
2348                  * very same mutex.
2349                  */
2350                 if (!(++iters & 7)) {
2351                         mutex_unlock(&ctx->uring_lock);
2352                         io_run_task_work();
2353                         mutex_lock(&ctx->uring_lock);
2354                 }
2355
2356                 ret = io_iopoll_getevents(ctx, &nr_events, min);
2357                 if (ret <= 0)
2358                         break;
2359                 ret = 0;
2360         } while (min && !nr_events && !need_resched());
2361
2362         mutex_unlock(&ctx->uring_lock);
2363         return ret;
2364 }
2365
2366 static void kiocb_end_write(struct io_kiocb *req)
2367 {
2368         /*
2369          * Tell lockdep we inherited freeze protection from submission
2370          * thread.
2371          */
2372         if (req->flags & REQ_F_ISREG) {
2373                 struct inode *inode = file_inode(req->file);
2374
2375                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2376         }
2377         file_end_write(req->file);
2378 }
2379
2380 #ifdef CONFIG_BLOCK
2381 static bool io_resubmit_prep(struct io_kiocb *req)
2382 {
2383         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2384         int rw, ret;
2385         struct iov_iter iter;
2386
2387         /* already prepared */
2388         if (req->async_data)
2389                 return true;
2390
2391         switch (req->opcode) {
2392         case IORING_OP_READV:
2393         case IORING_OP_READ_FIXED:
2394         case IORING_OP_READ:
2395                 rw = READ;
2396                 break;
2397         case IORING_OP_WRITEV:
2398         case IORING_OP_WRITE_FIXED:
2399         case IORING_OP_WRITE:
2400                 rw = WRITE;
2401                 break;
2402         default:
2403                 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2404                                 req->opcode);
2405                 return false;
2406         }
2407
2408         ret = io_import_iovec(rw, req, &iovec, &iter, false);
2409         if (ret < 0)
2410                 return false;
2411         return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
2412 }
2413
2414 static bool io_rw_should_reissue(struct io_kiocb *req)
2415 {
2416         umode_t mode = file_inode(req->file)->i_mode;
2417         struct io_ring_ctx *ctx = req->ctx;
2418
2419         if (!S_ISBLK(mode) && !S_ISREG(mode))
2420                 return false;
2421         if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2422             !(ctx->flags & IORING_SETUP_IOPOLL)))
2423                 return false;
2424         /*
2425          * If ref is dying, we might be running poll reap from the exit work.
2426          * Don't attempt to reissue from that path, just let it fail with
2427          * -EAGAIN.
2428          */
2429         if (percpu_ref_is_dying(&ctx->refs))
2430                 return false;
2431         return true;
2432 }
2433 #endif
2434
2435 static bool io_rw_reissue(struct io_kiocb *req)
2436 {
2437 #ifdef CONFIG_BLOCK
2438         if (!io_rw_should_reissue(req))
2439                 return false;
2440
2441         lockdep_assert_held(&req->ctx->uring_lock);
2442
2443         if (io_resubmit_prep(req)) {
2444                 refcount_inc(&req->refs);
2445                 io_queue_async_work(req);
2446                 return true;
2447         }
2448         req_set_fail_links(req);
2449 #endif
2450         return false;
2451 }
2452
2453 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2454                              unsigned int issue_flags)
2455 {
2456         int cflags = 0;
2457
2458         if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
2459                 return;
2460         if (res != req->result)
2461                 req_set_fail_links(req);
2462
2463         if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2464                 kiocb_end_write(req);
2465         if (req->flags & REQ_F_BUFFER_SELECTED)
2466                 cflags = io_put_rw_kbuf(req);
2467         __io_req_complete(req, issue_flags, res, cflags);
2468 }
2469
2470 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2471 {
2472         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2473
2474         __io_complete_rw(req, res, res2, 0);
2475 }
2476
2477 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2478 {
2479         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2480
2481 #ifdef CONFIG_BLOCK
2482         /* Rewind iter, if we have one. iopoll path resubmits as usual */
2483         if (res == -EAGAIN && io_rw_should_reissue(req)) {
2484                 struct io_async_rw *rw = req->async_data;
2485
2486                 if (rw)
2487                         iov_iter_revert(&rw->iter,
2488                                         req->result - iov_iter_count(&rw->iter));
2489                 else if (!io_resubmit_prep(req))
2490                         res = -EIO;
2491         }
2492 #endif
2493
2494         if (kiocb->ki_flags & IOCB_WRITE)
2495                 kiocb_end_write(req);
2496
2497         if (res != -EAGAIN && res != req->result)
2498                 req_set_fail_links(req);
2499
2500         WRITE_ONCE(req->result, res);
2501         /* order with io_poll_complete() checking ->result */
2502         smp_wmb();
2503         WRITE_ONCE(req->iopoll_completed, 1);
2504 }
2505
2506 /*
2507  * After the iocb has been issued, it's safe to be found on the poll list.
2508  * Adding the kiocb to the list AFTER submission ensures that we don't
2509  * find it from a io_iopoll_getevents() thread before the issuer is done
2510  * accessing the kiocb cookie.
2511  */
2512 static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
2513 {
2514         struct io_ring_ctx *ctx = req->ctx;
2515
2516         /*
2517          * Track whether we have multiple files in our lists. This will impact
2518          * how we do polling eventually, not spinning if we're on potentially
2519          * different devices.
2520          */
2521         if (list_empty(&ctx->iopoll_list)) {
2522                 ctx->poll_multi_file = false;
2523         } else if (!ctx->poll_multi_file) {
2524                 struct io_kiocb *list_req;
2525
2526                 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2527                                                 inflight_entry);
2528                 if (list_req->file != req->file)
2529                         ctx->poll_multi_file = true;
2530         }
2531
2532         /*
2533          * For fast devices, IO may have already completed. If it has, add
2534          * it to the front so we find it first.
2535          */
2536         if (READ_ONCE(req->iopoll_completed))
2537                 list_add(&req->inflight_entry, &ctx->iopoll_list);
2538         else
2539                 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2540
2541         /*
2542          * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
2543          * task context or in io worker task context. If current task context is
2544          * sq thread, we don't need to check whether should wake up sq thread.
2545          */
2546         if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
2547             wq_has_sleeper(&ctx->sq_data->wait))
2548                 wake_up(&ctx->sq_data->wait);
2549 }
2550
2551 static inline void io_state_file_put(struct io_submit_state *state)
2552 {
2553         if (state->file_refs) {
2554                 fput_many(state->file, state->file_refs);
2555                 state->file_refs = 0;
2556         }
2557 }
2558
2559 /*
2560  * Get as many references to a file as we have IOs left in this submission,
2561  * assuming most submissions are for one file, or at least that each file
2562  * has more than one submission.
2563  */
2564 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2565 {
2566         if (!state)
2567                 return fget(fd);
2568
2569         if (state->file_refs) {
2570                 if (state->fd == fd) {
2571                         state->file_refs--;
2572                         return state->file;
2573                 }
2574                 io_state_file_put(state);
2575         }
2576         state->file = fget_many(fd, state->ios_left);
2577         if (unlikely(!state->file))
2578                 return NULL;
2579
2580         state->fd = fd;
2581         state->file_refs = state->ios_left - 1;
2582         return state->file;
2583 }
2584
2585 static bool io_bdev_nowait(struct block_device *bdev)
2586 {
2587         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2588 }
2589
2590 /*
2591  * If we tracked the file through the SCM inflight mechanism, we could support
2592  * any file. For now, just ensure that anything potentially problematic is done
2593  * inline.
2594  */
2595 static bool io_file_supports_async(struct file *file, int rw)
2596 {
2597         umode_t mode = file_inode(file)->i_mode;
2598
2599         if (S_ISBLK(mode)) {
2600                 if (IS_ENABLED(CONFIG_BLOCK) &&
2601                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2602                         return true;
2603                 return false;
2604         }
2605         if (S_ISCHR(mode) || S_ISSOCK(mode))
2606                 return true;
2607         if (S_ISREG(mode)) {
2608                 if (IS_ENABLED(CONFIG_BLOCK) &&
2609                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2610                     file->f_op != &io_uring_fops)
2611                         return true;
2612                 return false;
2613         }
2614
2615         /* any ->read/write should understand O_NONBLOCK */
2616         if (file->f_flags & O_NONBLOCK)
2617                 return true;
2618
2619         if (!(file->f_mode & FMODE_NOWAIT))
2620                 return false;
2621
2622         if (rw == READ)
2623                 return file->f_op->read_iter != NULL;
2624
2625         return file->f_op->write_iter != NULL;
2626 }
2627
2628 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2629 {
2630         struct io_ring_ctx *ctx = req->ctx;
2631         struct kiocb *kiocb = &req->rw.kiocb;
2632         struct file *file = req->file;
2633         unsigned ioprio;
2634         int ret;
2635
2636         if (S_ISREG(file_inode(file)->i_mode))
2637                 req->flags |= REQ_F_ISREG;
2638
2639         kiocb->ki_pos = READ_ONCE(sqe->off);
2640         if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
2641                 req->flags |= REQ_F_CUR_POS;
2642                 kiocb->ki_pos = file->f_pos;
2643         }
2644         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2645         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2646         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2647         if (unlikely(ret))
2648                 return ret;
2649
2650         /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
2651         if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
2652                 req->flags |= REQ_F_NOWAIT;
2653
2654         ioprio = READ_ONCE(sqe->ioprio);
2655         if (ioprio) {
2656                 ret = ioprio_check_cap(ioprio);
2657                 if (ret)
2658                         return ret;
2659
2660                 kiocb->ki_ioprio = ioprio;
2661         } else
2662                 kiocb->ki_ioprio = get_current_ioprio();
2663
2664         if (ctx->flags & IORING_SETUP_IOPOLL) {
2665                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2666                     !kiocb->ki_filp->f_op->iopoll)
2667                         return -EOPNOTSUPP;
2668
2669                 kiocb->ki_flags |= IOCB_HIPRI;
2670                 kiocb->ki_complete = io_complete_rw_iopoll;
2671                 req->iopoll_completed = 0;
2672         } else {
2673                 if (kiocb->ki_flags & IOCB_HIPRI)
2674                         return -EINVAL;
2675                 kiocb->ki_complete = io_complete_rw;
2676         }
2677
2678         req->rw.addr = READ_ONCE(sqe->addr);
2679         req->rw.len = READ_ONCE(sqe->len);
2680         req->buf_index = READ_ONCE(sqe->buf_index);
2681         return 0;
2682 }
2683
2684 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2685 {
2686         switch (ret) {
2687         case -EIOCBQUEUED:
2688                 break;
2689         case -ERESTARTSYS:
2690         case -ERESTARTNOINTR:
2691         case -ERESTARTNOHAND:
2692         case -ERESTART_RESTARTBLOCK:
2693                 /*
2694                  * We can't just restart the syscall, since previously
2695                  * submitted sqes may already be in progress. Just fail this
2696                  * IO with EINTR.
2697                  */
2698                 ret = -EINTR;
2699                 fallthrough;
2700         default:
2701                 kiocb->ki_complete(kiocb, ret, 0);
2702         }
2703 }
2704
2705 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2706                        unsigned int issue_flags)
2707 {
2708         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2709         struct io_async_rw *io = req->async_data;
2710
2711         /* add previously done IO, if any */
2712         if (io && io->bytes_done > 0) {
2713                 if (ret < 0)
2714                         ret = io->bytes_done;
2715                 else
2716                         ret += io->bytes_done;
2717         }
2718
2719         if (req->flags & REQ_F_CUR_POS)
2720                 req->file->f_pos = kiocb->ki_pos;
2721         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2722                 __io_complete_rw(req, ret, 0, issue_flags);
2723         else
2724                 io_rw_done(kiocb, ret);
2725 }
2726
2727 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
2728 {
2729         struct io_ring_ctx *ctx = req->ctx;
2730         size_t len = req->rw.len;
2731         struct io_mapped_ubuf *imu;
2732         u16 index, buf_index = req->buf_index;
2733         size_t offset;
2734         u64 buf_addr;
2735
2736         if (unlikely(buf_index >= ctx->nr_user_bufs))
2737                 return -EFAULT;
2738         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2739         imu = &ctx->user_bufs[index];
2740         buf_addr = req->rw.addr;
2741
2742         /* overflow */
2743         if (buf_addr + len < buf_addr)
2744                 return -EFAULT;
2745         /* not inside the mapped region */
2746         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2747                 return -EFAULT;
2748
2749         /*
2750          * May not be a start of buffer, set size appropriately
2751          * and advance us to the beginning.
2752          */
2753         offset = buf_addr - imu->ubuf;
2754         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2755
2756         if (offset) {
2757                 /*
2758                  * Don't use iov_iter_advance() here, as it's really slow for
2759                  * using the latter parts of a big fixed buffer - it iterates
2760                  * over each segment manually. We can cheat a bit here, because
2761                  * we know that:
2762                  *
2763                  * 1) it's a BVEC iter, we set it up
2764                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
2765                  *    first and last bvec
2766                  *
2767                  * So just find our index, and adjust the iterator afterwards.
2768                  * If the offset is within the first bvec (or the whole first
2769                  * bvec, just use iov_iter_advance(). This makes it easier
2770                  * since we can just skip the first segment, which may not
2771                  * be PAGE_SIZE aligned.
2772                  */
2773                 const struct bio_vec *bvec = imu->bvec;
2774
2775                 if (offset <= bvec->bv_len) {
2776                         iov_iter_advance(iter, offset);
2777                 } else {
2778                         unsigned long seg_skip;
2779
2780                         /* skip first vec */
2781                         offset -= bvec->bv_len;
2782                         seg_skip = 1 + (offset >> PAGE_SHIFT);
2783
2784                         iter->bvec = bvec + seg_skip;
2785                         iter->nr_segs -= seg_skip;
2786                         iter->count -= bvec->bv_len + offset;
2787                         iter->iov_offset = offset & ~PAGE_MASK;
2788                 }
2789         }
2790
2791         return 0;
2792 }
2793
2794 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2795 {
2796         if (needs_lock)
2797                 mutex_unlock(&ctx->uring_lock);
2798 }
2799
2800 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2801 {
2802         /*
2803          * "Normal" inline submissions always hold the uring_lock, since we
2804          * grab it from the system call. Same is true for the SQPOLL offload.
2805          * The only exception is when we've detached the request and issue it
2806          * from an async worker thread, grab the lock for that case.
2807          */
2808         if (needs_lock)
2809                 mutex_lock(&ctx->uring_lock);
2810 }
2811
2812 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2813                                           int bgid, struct io_buffer *kbuf,
2814                                           bool needs_lock)
2815 {
2816         struct io_buffer *head;
2817
2818         if (req->flags & REQ_F_BUFFER_SELECTED)
2819                 return kbuf;
2820
2821         io_ring_submit_lock(req->ctx, needs_lock);
2822
2823         lockdep_assert_held(&req->ctx->uring_lock);
2824
2825         head = idr_find(&req->ctx->io_buffer_idr, bgid);
2826         if (head) {
2827                 if (!list_empty(&head->list)) {
2828                         kbuf = list_last_entry(&head->list, struct io_buffer,
2829                                                         list);
2830                         list_del(&kbuf->list);
2831                 } else {
2832                         kbuf = head;
2833                         idr_remove(&req->ctx->io_buffer_idr, bgid);
2834                 }
2835                 if (*len > kbuf->len)
2836                         *len = kbuf->len;
2837         } else {
2838                 kbuf = ERR_PTR(-ENOBUFS);
2839         }
2840
2841         io_ring_submit_unlock(req->ctx, needs_lock);
2842
2843         return kbuf;
2844 }
2845
2846 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2847                                         bool needs_lock)
2848 {
2849         struct io_buffer *kbuf;
2850         u16 bgid;
2851
2852         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2853         bgid = req->buf_index;
2854         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2855         if (IS_ERR(kbuf))
2856                 return kbuf;
2857         req->rw.addr = (u64) (unsigned long) kbuf;
2858         req->flags |= REQ_F_BUFFER_SELECTED;
2859         return u64_to_user_ptr(kbuf->addr);
2860 }
2861
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of the single-iovec buffer-select import: read the length
 * from the user's compat_iovec, select a provided buffer of at most that
 * size and describe it in iov[0].
 *
 * Returns 0 on success or a negative error code.
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
				bool needs_lock)
{
	struct compat_iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
	compat_ssize_t clen;
	void __user *buf;
	ssize_t len;

	if (!access_ok(uiov, sizeof(*uiov)) ||
	    __get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	len = clen;
	buf = io_rw_buffer_select(req, &len, needs_lock);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	iov[0].iov_base = buf;
	iov[0].iov_len = (compat_size_t) len;
	return 0;
}
#endif
2888
2889 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2890                                       bool needs_lock)
2891 {
2892         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2893         void __user *buf;
2894         ssize_t len;
2895
2896         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2897                 return -EFAULT;
2898
2899         len = iov[0].iov_len;
2900         if (len < 0)
2901                 return -EINVAL;
2902         buf = io_rw_buffer_select(req, &len, needs_lock);
2903         if (IS_ERR(buf))
2904                 return PTR_ERR(buf);
2905         iov[0].iov_base = buf;
2906         iov[0].iov_len = len;
2907         return 0;
2908 }
2909
2910 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2911                                     bool needs_lock)
2912 {
2913         if (req->flags & REQ_F_BUFFER_SELECTED) {
2914                 struct io_buffer *kbuf;
2915
2916                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2917                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2918                 iov[0].iov_len = kbuf->len;
2919                 return 0;
2920         }
2921         if (req->rw.len != 1)
2922                 return -EINVAL;
2923
2924 #ifdef CONFIG_COMPAT
2925         if (req->ctx->compat)
2926                 return io_compat_import(req, iov, needs_lock);
2927 #endif
2928
2929         return __io_iov_buffer_select(req, iov, needs_lock);
2930 }
2931
2932 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
2933                            struct iov_iter *iter, bool needs_lock)
2934 {
2935         void __user *buf = u64_to_user_ptr(req->rw.addr);
2936         size_t sqe_len = req->rw.len;
2937         u8 opcode = req->opcode;
2938         ssize_t ret;
2939
2940         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2941                 *iovec = NULL;
2942                 return io_import_fixed(req, rw, iter);
2943         }
2944
2945         /* buffer index only valid with fixed read/write, or buffer select  */
2946         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2947                 return -EINVAL;
2948
2949         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2950                 if (req->flags & REQ_F_BUFFER_SELECT) {
2951                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2952                         if (IS_ERR(buf))
2953                                 return PTR_ERR(buf);
2954                         req->rw.len = sqe_len;
2955                 }
2956
2957                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2958                 *iovec = NULL;
2959                 return ret;
2960         }
2961
2962         if (req->flags & REQ_F_BUFFER_SELECT) {
2963                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2964                 if (!ret)
2965                         iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
2966                 *iovec = NULL;
2967                 return ret;
2968         }
2969
2970         return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
2971                               req->ctx->compat);
2972 }
2973
2974 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
2975 {
2976         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
2977 }
2978
2979 /*
2980  * For files that don't have ->read_iter() and ->write_iter(), handle them
2981  * by looping over ->read() or ->write() manually.
2982  */
2983 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
2984 {
2985         struct kiocb *kiocb = &req->rw.kiocb;
2986         struct file *file = req->file;
2987         ssize_t ret = 0;
2988
2989         /*
2990          * Don't support polled IO through this interface, and we can't
2991          * support non-blocking either. For the latter, this just causes
2992          * the kiocb to be handled from an async context.
2993          */
2994         if (kiocb->ki_flags & IOCB_HIPRI)
2995                 return -EOPNOTSUPP;
2996         if (kiocb->ki_flags & IOCB_NOWAIT)
2997                 return -EAGAIN;
2998
2999         while (iov_iter_count(iter)) {
3000                 struct iovec iovec;
3001                 ssize_t nr;
3002
3003                 if (!iov_iter_is_bvec(iter)) {
3004                         iovec = iov_iter_iovec(iter);
3005                 } else {
3006                         iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3007                         iovec.iov_len = req->rw.len;
3008                 }
3009
3010                 if (rw == READ) {
3011                         nr = file->f_op->read(file, iovec.iov_base,
3012                                               iovec.iov_len, io_kiocb_ppos(kiocb));
3013                 } else {
3014                         nr = file->f_op->write(file, iovec.iov_base,
3015                                                iovec.iov_len, io_kiocb_ppos(kiocb));
3016                 }
3017
3018                 if (nr < 0) {
3019                         if (!ret)
3020                                 ret = nr;
3021                         break;
3022                 }
3023                 ret += nr;
3024                 if (nr != iovec.iov_len)
3025                         break;
3026                 req->rw.len -= nr;
3027                 req->rw.addr += nr;
3028                 iov_iter_advance(iter, nr);
3029         }
3030
3031         return ret;
3032 }
3033
3034 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3035                           const struct iovec *fast_iov, struct iov_iter *iter)
3036 {
3037         struct io_async_rw *rw = req->async_data;
3038
3039         memcpy(&rw->iter, iter, sizeof(*iter));
3040         rw->free_iovec = iovec;
3041         rw->bytes_done = 0;
3042         /* can only be fixed buffers, no need to do anything */
3043         if (iov_iter_is_bvec(iter))
3044                 return;
3045         if (!iovec) {
3046                 unsigned iov_off = 0;
3047
3048                 rw->iter.iov = rw->fast_iov;
3049                 if (iter->iov != fast_iov) {
3050                         iov_off = iter->iov - fast_iov;
3051                         rw->iter.iov += iov_off;
3052                 }
3053                 if (rw->fast_iov != fast_iov)
3054                         memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3055                                sizeof(struct iovec) * iter->nr_segs);
3056         } else {
3057                 req->flags |= REQ_F_NEED_CLEANUP;
3058         }
3059 }
3060
3061 static inline int __io_alloc_async_data(struct io_kiocb *req)
3062 {
3063         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3064         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3065         return req->async_data == NULL;
3066 }
3067
3068 static int io_alloc_async_data(struct io_kiocb *req)
3069 {
3070         if (!io_op_defs[req->opcode].needs_async_data)
3071                 return 0;
3072
3073         return  __io_alloc_async_data(req);
3074 }
3075
3076 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3077                              const struct iovec *fast_iov,
3078                              struct iov_iter *iter, bool force)
3079 {
3080         if (!force && !io_op_defs[req->opcode].needs_async_data)
3081                 return 0;
3082         if (!req->async_data) {
3083                 if (__io_alloc_async_data(req)) {
3084                         kfree(iovec);
3085                         return -ENOMEM;
3086                 }
3087
3088                 io_req_map_rw(req, iovec, fast_iov, iter);
3089         }
3090         return 0;
3091 }
3092
3093 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3094 {
3095         struct io_async_rw *iorw = req->async_data;
3096         struct iovec *iov = iorw->fast_iov;
3097         int ret;
3098
3099         ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3100         if (unlikely(ret < 0))
3101                 return ret;
3102
3103         iorw->bytes_done = 0;
3104         iorw->free_iovec = iov;
3105         if (iov)
3106                 req->flags |= REQ_F_NEED_CLEANUP;
3107         return 0;
3108 }
3109
3110 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3111 {
3112         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3113                 return -EBADF;
3114         return io_prep_rw(req, sqe);
3115 }
3116
3117 /*
3118  * This is our waitqueue callback handler, registered through lock_page_async()
3119  * when we initially tried to do the IO with the iocb armed our waitqueue.
3120  * This gets called when the page is unlocked, and we generally expect that to
3121  * happen when the page IO is completed and the page is now uptodate. This will
3122  * queue a task_work based retry of the operation, attempting to copy the data
3123  * again. If the latter fails because the page was NOT uptodate, then we will
3124  * do a thread based blocking retry of the operation. That's the unexpected
3125  * slow path.
3126  */
3127 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3128                              int sync, void *arg)
3129 {
3130         struct wait_page_queue *wpq;
3131         struct io_kiocb *req = wait->private;
3132         struct wait_page_key *key = arg;
3133
3134         wpq = container_of(wait, struct wait_page_queue, wait);
3135
3136         if (!wake_page_match(wpq, key))
3137                 return 0;
3138
3139         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3140         list_del_init(&wait->entry);
3141
3142         /* submit ref gets dropped, acquire a new one */
3143         refcount_inc(&req->refs);
3144         io_req_task_queue(req);
3145         return 1;
3146 }
3147
3148 /*
3149  * This controls whether a given IO request should be armed for async page
3150  * based retry. If we return false here, the request is handed to the async
3151  * worker threads for retry. If we're doing buffered reads on a regular file,
3152  * we prepare a private wait_page_queue entry and retry the operation. This
3153  * will either succeed because the page is now uptodate and unlocked, or it
3154  * will register a callback when the page is unlocked at IO completion. Through
3155  * that callback, io_uring uses task_work to setup a retry of the operation.
3156  * That retry will attempt the buffered read again. The retry will generally
3157  * succeed, or in rare cases where it fails, we then fall back to using the
3158  * async worker threads for a blocking retry.
3159  */
3160 static bool io_rw_should_retry(struct io_kiocb *req)
3161 {
3162         struct io_async_rw *rw = req->async_data;
3163         struct wait_page_queue *wait = &rw->wpq;
3164         struct kiocb *kiocb = &req->rw.kiocb;
3165
3166         /* never retry for NOWAIT, we just complete with -EAGAIN */
3167         if (req->flags & REQ_F_NOWAIT)
3168                 return false;
3169
3170         /* Only for buffered IO */
3171         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3172                 return false;
3173
3174         /*
3175          * just use poll if we can, and don't attempt if the fs doesn't
3176          * support callback based unlocks
3177          */
3178         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3179                 return false;
3180
3181         wait->wait.func = io_async_buf_func;
3182         wait->wait.private = req;
3183         wait->wait.flags = 0;
3184         INIT_LIST_HEAD(&wait->wait.entry);
3185         kiocb->ki_flags |= IOCB_WAITQ;
3186         kiocb->ki_flags &= ~IOCB_NOWAIT;
3187         kiocb->ki_waitq = wait;
3188         return true;
3189 }
3190
3191 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3192 {
3193         if (req->file->f_op->read_iter)
3194                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3195         else if (req->file->f_op->read)
3196                 return loop_rw_iter(READ, req, iter);
3197         else
3198                 return -EINVAL;
3199 }
3200
3201 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3202 {
3203         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3204         struct kiocb *kiocb = &req->rw.kiocb;
3205         struct iov_iter __iter, *iter = &__iter;
3206         struct io_async_rw *rw = req->async_data;
3207         ssize_t io_size, ret, ret2;
3208         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3209
3210         if (rw) {
3211                 iter = &rw->iter;
3212                 iovec = NULL;
3213         } else {
3214                 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3215                 if (ret < 0)
3216                         return ret;
3217         }
3218         io_size = iov_iter_count(iter);
3219         req->result = io_size;
3220
3221         /* Ensure we clear previously set non-block flag */
3222         if (!force_nonblock)
3223                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3224         else
3225                 kiocb->ki_flags |= IOCB_NOWAIT;
3226
3227         /* If the file doesn't support async, just async punt */
3228         if (force_nonblock && !io_file_supports_async(req->file, READ)) {
3229                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3230                 return ret ?: -EAGAIN;
3231         }
3232
3233         ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
3234         if (unlikely(ret)) {
3235                 kfree(iovec);
3236                 return ret;
3237         }
3238
3239         ret = io_iter_do_read(req, iter);
3240
3241         if (ret == -EIOCBQUEUED) {
3242                 if (req->async_data)
3243                         iov_iter_revert(iter, io_size - iov_iter_count(iter));
3244                 goto out_free;
3245         } else if (ret == -EAGAIN) {
3246                 /* IOPOLL retry should happen for io-wq threads */
3247                 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3248                         goto done;
3249                 /* no retry on NONBLOCK nor RWF_NOWAIT */
3250                 if (req->flags & REQ_F_NOWAIT)
3251                         goto done;
3252                 /* some cases will consume bytes even on error returns */
3253                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3254                 ret = 0;
3255         } else if (ret <= 0 || ret == io_size || !force_nonblock ||
3256                    (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
3257                 /* read all, failed, already did sync or don't want to retry */
3258                 goto done;
3259         }
3260
3261         ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3262         if (ret2)
3263                 return ret2;
3264
3265         iovec = NULL;
3266         rw = req->async_data;
3267         /* now use our persistent iterator, if we aren't already */
3268         iter = &rw->iter;
3269
3270         do {
3271                 io_size -= ret;
3272                 rw->bytes_done += ret;
3273                 /* if we can retry, do so with the callbacks armed */
3274                 if (!io_rw_should_retry(req)) {
3275                         kiocb->ki_flags &= ~IOCB_WAITQ;
3276                         return -EAGAIN;
3277                 }
3278
3279                 /*
3280                  * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3281                  * we get -EIOCBQUEUED, then we'll get a notification when the
3282                  * desired page gets unlocked. We can also get a partial read
3283                  * here, and if we do, then just retry at the new offset.
3284                  */
3285                 ret = io_iter_do_read(req, iter);
3286                 if (ret == -EIOCBQUEUED)
3287                         return 0;
3288                 /* we got some bytes, but not all. retry. */
3289                 kiocb->ki_flags &= ~IOCB_WAITQ;
3290         } while (ret > 0 && ret < io_size);
3291 done:
3292         kiocb_done(kiocb, ret, issue_flags);
3293 out_free:
3294         /* it's faster to check here then delegate to kfree */
3295         if (iovec)
3296                 kfree(iovec);
3297         return 0;
3298 }
3299
3300 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3301 {
3302         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3303                 return -EBADF;
3304         return io_prep_rw(req, sqe);
3305 }
3306
3307 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3308 {
3309         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3310         struct kiocb *kiocb = &req->rw.kiocb;
3311         struct iov_iter __iter, *iter = &__iter;
3312         struct io_async_rw *rw = req->async_data;
3313         ssize_t ret, ret2, io_size;
3314         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3315
3316         if (rw) {
3317                 iter = &rw->iter;
3318                 iovec = NULL;
3319         } else {
3320                 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3321                 if (ret < 0)
3322                         return ret;
3323         }
3324         io_size = iov_iter_count(iter);
3325         req->result = io_size;
3326
3327         /* Ensure we clear previously set non-block flag */
3328         if (!force_nonblock)
3329                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3330         else
3331                 kiocb->ki_flags |= IOCB_NOWAIT;
3332
3333         /* If the file doesn't support async, just async punt */
3334         if (force_nonblock && !io_file_supports_async(req->file, WRITE))
3335                 goto copy_iov;
3336
3337         /* file path doesn't support NOWAIT for non-direct_IO */
3338         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3339             (req->flags & REQ_F_ISREG))
3340                 goto copy_iov;
3341
3342         ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
3343         if (unlikely(ret))
3344                 goto out_free;
3345
3346         /*
3347          * Open-code file_start_write here to grab freeze protection,
3348          * which will be released by another thread in
3349          * io_complete_rw().  Fool lockdep by telling it the lock got
3350          * released so that it doesn't complain about the held lock when
3351          * we return to userspace.
3352          */
3353         if (req->flags & REQ_F_ISREG) {
3354                 sb_start_write(file_inode(req->file)->i_sb);
3355                 __sb_writers_release(file_inode(req->file)->i_sb,
3356                                         SB_FREEZE_WRITE);
3357         }
3358         kiocb->ki_flags |= IOCB_WRITE;
3359
3360         if (req->file->f_op->write_iter)
3361                 ret2 = call_write_iter(req->file, kiocb, iter);
3362         else if (req->file->f_op->write)
3363                 ret2 = loop_rw_iter(WRITE, req, iter);
3364         else
3365                 ret2 = -EINVAL;
3366
3367         /*
3368          * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3369          * retry them without IOCB_NOWAIT.
3370          */
3371         if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3372                 ret2 = -EAGAIN;
3373         /* no retry on NONBLOCK nor RWF_NOWAIT */
3374         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3375                 goto done;
3376         if (ret2 == -EIOCBQUEUED && req->async_data)
3377                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3378         if (!force_nonblock || ret2 != -EAGAIN) {
3379                 /* IOPOLL retry should happen for io-wq threads */
3380                 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3381                         goto copy_iov;
3382 done:
3383                 kiocb_done(kiocb, ret2, issue_flags);
3384         } else {
3385 copy_iov:
3386                 /* some cases will consume bytes even on error returns */
3387                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3388                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3389                 return ret ?: -EAGAIN;
3390         }
3391 out_free:
3392         /* it's reportedly faster than delegating the null check to kfree() */
3393         if (iovec)
3394                 kfree(iovec);
3395         return ret;
3396 }
3397
3398 static int io_renameat_prep(struct io_kiocb *req,
3399                             const struct io_uring_sqe *sqe)
3400 {
3401         struct io_rename *ren = &req->rename;
3402         const char __user *oldf, *newf;
3403
3404         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3405                 return -EBADF;
3406
3407         ren->old_dfd = READ_ONCE(sqe->fd);
3408         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3409         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3410         ren->new_dfd = READ_ONCE(sqe->len);
3411         ren->flags = READ_ONCE(sqe->rename_flags);
3412
3413         ren->oldpath = getname(oldf);
3414         if (IS_ERR(ren->oldpath))
3415                 return PTR_ERR(ren->oldpath);
3416
3417         ren->newpath = getname(newf);
3418         if (IS_ERR(ren->newpath)) {
3419                 putname(ren->oldpath);
3420                 return PTR_ERR(ren->newpath);
3421         }
3422
3423         req->flags |= REQ_F_NEED_CLEANUP;
3424         return 0;
3425 }
3426
3427 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3428 {
3429         struct io_rename *ren = &req->rename;
3430         int ret;
3431
3432         if (issue_flags & IO_URING_F_NONBLOCK)
3433                 return -EAGAIN;
3434
3435         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3436                                 ren->newpath, ren->flags);
3437
3438         req->flags &= ~REQ_F_NEED_CLEANUP;
3439         if (ret < 0)
3440                 req_set_fail_links(req);
3441         io_req_complete(req, ret);
3442         return 0;
3443 }
3444
3445 static int io_unlinkat_prep(struct io_kiocb *req,
3446                             const struct io_uring_sqe *sqe)
3447 {
3448         struct io_unlink *un = &req->unlink;
3449         const char __user *fname;
3450
3451         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3452                 return -EBADF;
3453
3454         un->dfd = READ_ONCE(sqe->fd);
3455
3456         un->flags = READ_ONCE(sqe->unlink_flags);
3457         if (un->flags & ~AT_REMOVEDIR)
3458                 return -EINVAL;
3459
3460         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3461         un->filename = getname(fname);
3462         if (IS_ERR(un->filename))
3463                 return PTR_ERR(un->filename);
3464
3465         req->flags |= REQ_F_NEED_CLEANUP;
3466         return 0;
3467 }
3468
3469 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3470 {
3471         struct io_unlink *un = &req->unlink;
3472         int ret;
3473
3474         if (issue_flags & IO_URING_F_NONBLOCK)
3475                 return -EAGAIN;
3476
3477         if (un->flags & AT_REMOVEDIR)
3478                 ret = do_rmdir(un->dfd, un->filename);
3479         else
3480                 ret = do_unlinkat(un->dfd, un->filename);
3481
3482         req->flags &= ~REQ_F_NEED_CLEANUP;
3483         if (ret < 0)
3484                 req_set_fail_links(req);
3485         io_req_complete(req, ret);
3486         return 0;
3487 }
3488
3489 static int io_shutdown_prep(struct io_kiocb *req,
3490                             const struct io_uring_sqe *sqe)
3491 {
3492 #if defined(CONFIG_NET)
3493         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3494                 return -EINVAL;
3495         if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3496             sqe->buf_index)
3497                 return -EINVAL;
3498
3499         req->shutdown.how = READ_ONCE(sqe->len);
3500         return 0;
3501 #else
3502         return -EOPNOTSUPP;
3503 #endif
3504 }
3505
3506 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3507 {
3508 #if defined(CONFIG_NET)
3509         struct socket *sock;
3510         int ret;
3511
3512         if (issue_flags & IO_URING_F_NONBLOCK)
3513                 return -EAGAIN;
3514
3515         sock = sock_from_file(req->file);
3516         if (unlikely(!sock))
3517                 return -ENOTSOCK;
3518
3519         ret = __sys_shutdown_sock(sock, req->shutdown.how);
3520         if (ret < 0)
3521                 req_set_fail_links(req);
3522         io_req_complete(req, ret);
3523         return 0;
3524 #else
3525         return -EOPNOTSUPP;
3526 #endif
3527 }
3528
3529 static int __io_splice_prep(struct io_kiocb *req,
3530                             const struct io_uring_sqe *sqe)
3531 {
3532         struct io_splice* sp = &req->splice;
3533         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3534
3535         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3536                 return -EINVAL;
3537
3538         sp->file_in = NULL;
3539         sp->len = READ_ONCE(sqe->len);
3540         sp->flags = READ_ONCE(sqe->splice_flags);
3541
3542         if (unlikely(sp->flags & ~valid_flags))
3543                 return -EINVAL;
3544
3545         sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3546                                   (sp->flags & SPLICE_F_FD_IN_FIXED));
3547         if (!sp->file_in)
3548                 return -EBADF;
3549         req->flags |= REQ_F_NEED_CLEANUP;
3550
3551         if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3552                 /*
3553                  * Splice operation will be punted aync, and here need to
3554                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
3555                  */
3556                 req->work.flags |= IO_WQ_WORK_UNBOUND;
3557         }
3558
3559         return 0;
3560 }
3561
3562 static int io_tee_prep(struct io_kiocb *req,
3563                        const struct io_uring_sqe *sqe)
3564 {
3565         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3566                 return -EINVAL;
3567         return __io_splice_prep(req, sqe);
3568 }
3569
3570 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3571 {
3572         struct io_splice *sp = &req->splice;
3573         struct file *in = sp->file_in;
3574         struct file *out = sp->file_out;
3575         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3576         long ret = 0;
3577
3578         if (issue_flags & IO_URING_F_NONBLOCK)
3579                 return -EAGAIN;
3580         if (sp->len)
3581                 ret = do_tee(in, out, sp->len, flags);
3582
3583         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3584         req->flags &= ~REQ_F_NEED_CLEANUP;
3585
3586         if (ret != sp->len)
3587                 req_set_fail_links(req);
3588         io_req_complete(req, ret);
3589         return 0;
3590 }
3591
3592 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3593 {
3594         struct io_splice* sp = &req->splice;
3595
3596         sp->off_in = READ_ONCE(sqe->splice_off_in);
3597         sp->off_out = READ_ONCE(sqe->off);
3598         return __io_splice_prep(req, sqe);
3599 }
3600
3601 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
3602 {
3603         struct io_splice *sp = &req->splice;
3604         struct file *in = sp->file_in;
3605         struct file *out = sp->file_out;
3606         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3607         loff_t *poff_in, *poff_out;
3608         long ret = 0;
3609
3610         if (issue_flags & IO_URING_F_NONBLOCK)
3611                 return -EAGAIN;
3612
3613         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3614         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3615
3616         if (sp->len)
3617                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3618
3619         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3620         req->flags &= ~REQ_F_NEED_CLEANUP;
3621
3622         if (ret != sp->len)
3623                 req_set_fail_links(req);
3624         io_req_complete(req, ret);
3625         return 0;
3626 }
3627
3628 /*
3629  * IORING_OP_NOP just posts a completion event, nothing else.
3630  */
3631 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
3632 {
3633         struct io_ring_ctx *ctx = req->ctx;
3634
3635         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3636                 return -EINVAL;
3637
3638         __io_req_complete(req, issue_flags, 0, 0);
3639         return 0;
3640 }
3641
3642 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3643 {
3644         struct io_ring_ctx *ctx = req->ctx;
3645
3646         if (!req->file)
3647                 return -EBADF;
3648
3649         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3650                 return -EINVAL;
3651         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3652                 return -EINVAL;
3653
3654         req->sync.flags = READ_ONCE(sqe->fsync_flags);
3655         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3656                 return -EINVAL;
3657
3658         req->sync.off = READ_ONCE(sqe->off);
3659         req->sync.len = READ_ONCE(sqe->len);
3660         return 0;
3661 }
3662
3663 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
3664 {
3665         loff_t end = req->sync.off + req->sync.len;
3666         int ret;
3667
3668         /* fsync always requires a blocking context */
3669         if (issue_flags & IO_URING_F_NONBLOCK)
3670                 return -EAGAIN;
3671
3672         ret = vfs_fsync_range(req->file, req->sync.off,
3673                                 end > 0 ? end : LLONG_MAX,
3674                                 req->sync.flags & IORING_FSYNC_DATASYNC);
3675         if (ret < 0)
3676                 req_set_fail_links(req);
3677         io_req_complete(req, ret);
3678         return 0;
3679 }
3680
3681 static int io_fallocate_prep(struct io_kiocb *req,
3682                              const struct io_uring_sqe *sqe)
3683 {
3684         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3685                 return -EINVAL;
3686         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3687                 return -EINVAL;
3688
3689         req->sync.off = READ_ONCE(sqe->off);
3690         req->sync.len = READ_ONCE(sqe->addr);
3691         req->sync.mode = READ_ONCE(sqe->len);
3692         return 0;
3693 }
3694
3695 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
3696 {
3697         int ret;
3698
3699         /* fallocate always requiring blocking context */
3700         if (issue_flags & IO_URING_F_NONBLOCK)
3701                 return -EAGAIN;
3702         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3703                                 req->sync.len);
3704         if (ret < 0)
3705                 req_set_fail_links(req);
3706         io_req_complete(req, ret);
3707         return 0;
3708 }
3709
3710 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3711 {
3712         const char __user *fname;
3713         int ret;
3714
3715         if (unlikely(sqe->ioprio || sqe->buf_index))
3716                 return -EINVAL;
3717         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3718                 return -EBADF;
3719
3720         /* open.how should be already initialised */
3721         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3722                 req->open.how.flags |= O_LARGEFILE;
3723
3724         req->open.dfd = READ_ONCE(sqe->fd);
3725         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3726         req->open.filename = getname(fname);
3727         if (IS_ERR(req->open.filename)) {
3728                 ret = PTR_ERR(req->open.filename);
3729                 req->open.filename = NULL;
3730                 return ret;
3731         }
3732         req->open.nofile = rlimit(RLIMIT_NOFILE);
3733         req->flags |= REQ_F_NEED_CLEANUP;
3734         return 0;
3735 }
3736
3737 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3738 {
3739         u64 flags, mode;
3740
3741         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3742                 return -EINVAL;
3743         mode = READ_ONCE(sqe->len);
3744         flags = READ_ONCE(sqe->open_flags);
3745         req->open.how = build_open_how(flags, mode);
3746         return __io_openat_prep(req, sqe);
3747 }
3748
3749 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3750 {
3751         struct open_how __user *how;
3752         size_t len;
3753         int ret;
3754
3755         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3756                 return -EINVAL;
3757         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3758         len = READ_ONCE(sqe->len);
3759         if (len < OPEN_HOW_SIZE_VER0)
3760                 return -EINVAL;
3761
3762         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3763                                         len);
3764         if (ret)
3765                 return ret;
3766
3767         return __io_openat_prep(req, sqe);
3768 }
3769
3770 static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
3771 {
3772         struct open_flags op;
3773         struct file *file;
3774         bool nonblock_set;
3775         bool resolve_nonblock;
3776         int ret;
3777
3778         ret = build_open_flags(&req->open.how, &op);
3779         if (ret)
3780                 goto err;
3781         nonblock_set = op.open_flag & O_NONBLOCK;
3782         resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
3783         if (issue_flags & IO_URING_F_NONBLOCK) {
3784                 /*
3785                  * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
3786                  * it'll always -EAGAIN
3787                  */
3788                 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
3789                         return -EAGAIN;
3790                 op.lookup_flags |= LOOKUP_CACHED;
3791                 op.open_flag |= O_NONBLOCK;
3792         }
3793
3794         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3795         if (ret < 0)
3796                 goto err;
3797
3798         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3799         /* only retry if RESOLVE_CACHED wasn't already set by application */
3800         if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
3801             file == ERR_PTR(-EAGAIN)) {
3802                 /*
3803                  * We could hang on to this 'fd', but seems like marginal
3804                  * gain for something that is now known to be a slower path.
3805                  * So just put it, and we'll get a new one when we retry.
3806                  */
3807                 put_unused_fd(ret);
3808                 return -EAGAIN;
3809         }
3810
3811         if (IS_ERR(file)) {
3812                 put_unused_fd(ret);
3813                 ret = PTR_ERR(file);
3814         } else {
3815                 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
3816                         file->f_flags &= ~O_NONBLOCK;
3817                 fsnotify_open(file);
3818                 fd_install(ret, file);
3819         }
3820 err:
3821         putname(req->open.filename);
3822         req->flags &= ~REQ_F_NEED_CLEANUP;
3823         if (ret < 0)
3824                 req_set_fail_links(req);
3825         io_req_complete(req, ret);
3826         return 0;
3827 }
3828
/* Issue handler for openat; the work is shared with openat2. */
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret = io_openat2(req, issue_flags);

	return ret;
}
3833
3834 static int io_remove_buffers_prep(struct io_kiocb *req,
3835                                   const struct io_uring_sqe *sqe)
3836 {
3837         struct io_provide_buf *p = &req->pbuf;
3838         u64 tmp;
3839
3840         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3841                 return -EINVAL;
3842
3843         tmp = READ_ONCE(sqe->fd);
3844         if (!tmp || tmp > USHRT_MAX)
3845                 return -EINVAL;
3846
3847         memset(p, 0, sizeof(*p));
3848         p->nbufs = tmp;
3849         p->bgid = READ_ONCE(sqe->buf_group);
3850         return 0;
3851 }
3852
3853 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3854                                int bgid, unsigned nbufs)
3855 {
3856         unsigned i = 0;
3857
3858         /* shouldn't happen */
3859         if (!nbufs)
3860                 return 0;
3861
3862         /* the head kbuf is the list itself */
3863         while (!list_empty(&buf->list)) {
3864                 struct io_buffer *nxt;
3865
3866                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3867                 list_del(&nxt->list);
3868                 kfree(nxt);
3869                 if (++i == nbufs)
3870                         return i;
3871         }
3872         i++;
3873         kfree(buf);
3874         idr_remove(&ctx->io_buffer_idr, bgid);
3875
3876         return i;
3877 }
3878
3879 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
3880 {
3881         struct io_provide_buf *p = &req->pbuf;
3882         struct io_ring_ctx *ctx = req->ctx;
3883         struct io_buffer *head;
3884         int ret = 0;
3885         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3886
3887         io_ring_submit_lock(ctx, !force_nonblock);
3888
3889         lockdep_assert_held(&ctx->uring_lock);
3890
3891         ret = -ENOENT;
3892         head = idr_find(&ctx->io_buffer_idr, p->bgid);
3893         if (head)
3894                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3895         if (ret < 0)
3896                 req_set_fail_links(req);
3897
3898         /* need to hold the lock to complete IOPOLL requests */
3899         if (ctx->flags & IORING_SETUP_IOPOLL) {
3900                 __io_req_complete(req, issue_flags, ret, 0);
3901                 io_ring_submit_unlock(ctx, !force_nonblock);
3902         } else {
3903                 io_ring_submit_unlock(ctx, !force_nonblock);
3904                 __io_req_complete(req, issue_flags, ret, 0);
3905         }
3906         return 0;
3907 }
3908
3909 static int io_provide_buffers_prep(struct io_kiocb *req,
3910                                    const struct io_uring_sqe *sqe)
3911 {
3912         struct io_provide_buf *p = &req->pbuf;
3913         u64 tmp;
3914
3915         if (sqe->ioprio || sqe->rw_flags)
3916                 return -EINVAL;
3917
3918         tmp = READ_ONCE(sqe->fd);
3919         if (!tmp || tmp > USHRT_MAX)
3920                 return -E2BIG;
3921         p->nbufs = tmp;
3922         p->addr = READ_ONCE(sqe->addr);
3923         p->len = READ_ONCE(sqe->len);
3924
3925         if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3926                 return -EFAULT;
3927
3928         p->bgid = READ_ONCE(sqe->buf_group);
3929         tmp = READ_ONCE(sqe->off);
3930         if (tmp > USHRT_MAX)
3931                 return -E2BIG;
3932         p->bid = tmp;
3933         return 0;
3934 }
3935
3936 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3937 {
3938         struct io_buffer *buf;
3939         u64 addr = pbuf->addr;
3940         int i, bid = pbuf->bid;
3941
3942         for (i = 0; i < pbuf->nbufs; i++) {
3943                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3944                 if (!buf)
3945                         break;
3946
3947                 buf->addr = addr;
3948                 buf->len = pbuf->len;
3949                 buf->bid = bid;
3950                 addr += pbuf->len;
3951                 bid++;
3952                 if (!*head) {
3953                         INIT_LIST_HEAD(&buf->list);
3954                         *head = buf;
3955                 } else {
3956                         list_add_tail(&buf->list, &(*head)->list);
3957                 }
3958         }
3959
3960         return i ? i : -ENOMEM;
3961 }
3962
3963 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
3964 {
3965         struct io_provide_buf *p = &req->pbuf;
3966         struct io_ring_ctx *ctx = req->ctx;
3967         struct io_buffer *head, *list;
3968         int ret = 0;
3969         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3970
3971         io_ring_submit_lock(ctx, !force_nonblock);
3972
3973         lockdep_assert_held(&ctx->uring_lock);
3974
3975         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3976
3977         ret = io_add_buffers(p, &head);
3978         if (ret < 0)
3979                 goto out;
3980
3981         if (!list) {
3982                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3983                                         GFP_KERNEL);
3984                 if (ret < 0) {
3985                         __io_remove_buffers(ctx, head, p->bgid, -1U);
3986                         goto out;
3987                 }
3988         }
3989 out:
3990         if (ret < 0)
3991                 req_set_fail_links(req);
3992
3993         /* need to hold the lock to complete IOPOLL requests */
3994         if (ctx->flags & IORING_SETUP_IOPOLL) {
3995                 __io_req_complete(req, issue_flags, ret, 0);
3996                 io_ring_submit_unlock(ctx, !force_nonblock);
3997         } else {
3998                 io_ring_submit_unlock(ctx, !force_nonblock);
3999                 __io_req_complete(req, issue_flags, ret, 0);
4000         }
4001         return 0;
4002 }
4003
4004 static int io_epoll_ctl_prep(struct io_kiocb *req,
4005                              const struct io_uring_sqe *sqe)
4006 {
4007 #if defined(CONFIG_EPOLL)
4008         if (sqe->ioprio || sqe->buf_index)
4009                 return -EINVAL;
4010         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4011                 return -EINVAL;
4012
4013         req->epoll.epfd = READ_ONCE(sqe->fd);
4014         req->epoll.op = READ_ONCE(sqe->len);
4015         req->epoll.fd = READ_ONCE(sqe->off);
4016
4017         if (ep_op_has_event(req->epoll.op)) {
4018                 struct epoll_event __user *ev;
4019
4020                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4021                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4022                         return -EFAULT;
4023         }
4024
4025         return 0;
4026 #else
4027         return -EOPNOTSUPP;
4028 #endif
4029 }
4030
4031 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4032 {
4033 #if defined(CONFIG_EPOLL)
4034         struct io_epoll *ie = &req->epoll;
4035         int ret;
4036         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4037
4038         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4039         if (force_nonblock && ret == -EAGAIN)
4040                 return -EAGAIN;
4041
4042         if (ret < 0)
4043                 req_set_fail_links(req);
4044         __io_req_complete(req, issue_flags, ret, 0);
4045         return 0;
4046 #else
4047         return -EOPNOTSUPP;
4048 #endif
4049 }
4050
4051 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4052 {
4053 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4054         if (sqe->ioprio || sqe->buf_index || sqe->off)
4055                 return -EINVAL;
4056         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4057                 return -EINVAL;
4058
4059         req->madvise.addr = READ_ONCE(sqe->addr);
4060         req->madvise.len = READ_ONCE(sqe->len);
4061         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4062         return 0;
4063 #else
4064         return -EOPNOTSUPP;
4065 #endif
4066 }
4067
4068 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4069 {
4070 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4071         struct io_madvise *ma = &req->madvise;
4072         int ret;
4073
4074         if (issue_flags & IO_URING_F_NONBLOCK)
4075                 return -EAGAIN;
4076
4077         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4078         if (ret < 0)
4079                 req_set_fail_links(req);
4080         io_req_complete(req, ret);
4081         return 0;
4082 #else
4083         return -EOPNOTSUPP;
4084 #endif
4085 }
4086
4087 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4088 {
4089         if (sqe->ioprio || sqe->buf_index || sqe->addr)
4090                 return -EINVAL;
4091         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4092                 return -EINVAL;
4093
4094         req->fadvise.offset = READ_ONCE(sqe->off);
4095         req->fadvise.len = READ_ONCE(sqe->len);
4096         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4097         return 0;
4098 }
4099
4100 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4101 {
4102         struct io_fadvise *fa = &req->fadvise;
4103         int ret;
4104
4105         if (issue_flags & IO_URING_F_NONBLOCK) {
4106                 switch (fa->advice) {
4107                 case POSIX_FADV_NORMAL:
4108                 case POSIX_FADV_RANDOM:
4109                 case POSIX_FADV_SEQUENTIAL:
4110                         break;
4111                 default:
4112                         return -EAGAIN;
4113                 }
4114         }
4115
4116         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4117         if (ret < 0)
4118                 req_set_fail_links(req);
4119         io_req_complete(req, ret);
4120         return 0;
4121 }
4122
4123 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4124 {
4125         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4126                 return -EINVAL;
4127         if (sqe->ioprio || sqe->buf_index)
4128                 return -EINVAL;
4129         if (req->flags & REQ_F_FIXED_FILE)
4130                 return -EBADF;
4131
4132         req->statx.dfd = READ_ONCE(sqe->fd);
4133         req->statx.mask = READ_ONCE(sqe->len);
4134         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4135         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4136         req->statx.flags = READ_ONCE(sqe->statx_flags);
4137
4138         return 0;
4139 }
4140
4141 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4142 {
4143         struct io_statx *ctx = &req->statx;
4144         int ret;
4145
4146         if (issue_flags & IO_URING_F_NONBLOCK) {
4147                 /* only need file table for an actual valid fd */
4148                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
4149                         req->flags |= REQ_F_NO_FILE_TABLE;
4150                 return -EAGAIN;
4151         }
4152
4153         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4154                        ctx->buffer);
4155
4156         if (ret < 0)
4157                 req_set_fail_links(req);
4158         io_req_complete(req, ret);
4159         return 0;
4160 }
4161
4162 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4163 {
4164         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4165                 return -EINVAL;
4166         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4167             sqe->rw_flags || sqe->buf_index)
4168                 return -EINVAL;
4169         if (req->flags & REQ_F_FIXED_FILE)
4170                 return -EBADF;
4171
4172         req->close.fd = READ_ONCE(sqe->fd);
4173         return 0;
4174 }
4175
4176 static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4177 {
4178         struct files_struct *files = current->files;
4179         struct io_close *close = &req->close;
4180         struct fdtable *fdt;
4181         struct file *file;
4182         int ret;
4183
4184         file = NULL;
4185         ret = -EBADF;
4186         spin_lock(&files->file_lock);
4187         fdt = files_fdtable(files);
4188         if (close->fd >= fdt->max_fds) {
4189                 spin_unlock(&files->file_lock);
4190                 goto err;
4191         }
4192         file = fdt->fd[close->fd];
4193         if (!file) {
4194                 spin_unlock(&files->file_lock);
4195                 goto err;
4196         }
4197
4198         if (file->f_op == &io_uring_fops) {
4199                 spin_unlock(&files->file_lock);
4200                 file = NULL;
4201                 goto err;
4202         }
4203
4204         /* if the file has a flush method, be safe and punt to async */
4205         if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4206                 spin_unlock(&files->file_lock);
4207                 return -EAGAIN;
4208         }
4209
4210         ret = __close_fd_get_file(close->fd, &file);
4211         spin_unlock(&files->file_lock);
4212         if (ret < 0) {
4213                 if (ret == -ENOENT)
4214                         ret = -EBADF;
4215                 goto err;
4216         }
4217
4218         /* No ->flush() or already async, safely close from here */
4219         ret = filp_close(file, current->files);
4220 err:
4221         if (ret < 0)
4222                 req_set_fail_links(req);
4223         if (file)
4224                 fput(file);
4225         __io_req_complete(req, issue_flags, ret, 0);
4226         return 0;
4227 }
4228
4229 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4230 {
4231         struct io_ring_ctx *ctx = req->ctx;
4232
4233         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4234                 return -EINVAL;
4235         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4236                 return -EINVAL;
4237
4238         req->sync.off = READ_ONCE(sqe->off);
4239         req->sync.len = READ_ONCE(sqe->len);
4240         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4241         return 0;
4242 }
4243
4244 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4245 {
4246         int ret;
4247
4248         /* sync_file_range always requires a blocking context */
4249         if (issue_flags & IO_URING_F_NONBLOCK)
4250                 return -EAGAIN;
4251
4252         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4253                                 req->sync.flags);
4254         if (ret < 0)
4255                 req_set_fail_links(req);
4256         io_req_complete(req, ret);
4257         return 0;
4258 }
4259
4260 #if defined(CONFIG_NET)
4261 static int io_setup_async_msg(struct io_kiocb *req,
4262                               struct io_async_msghdr *kmsg)
4263 {
4264         struct io_async_msghdr *async_msg = req->async_data;
4265
4266         if (async_msg)
4267                 return -EAGAIN;
4268         if (io_alloc_async_data(req)) {
4269                 kfree(kmsg->free_iov);
4270                 return -ENOMEM;
4271         }
4272         async_msg = req->async_data;
4273         req->flags |= REQ_F_NEED_CLEANUP;
4274         memcpy(async_msg, kmsg, sizeof(*kmsg));
4275         async_msg->msg.msg_name = &async_msg->addr;
4276         /* if were using fast_iov, set it to the new one */
4277         if (!async_msg->free_iov)
4278                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4279
4280         return -EAGAIN;
4281 }
4282
4283 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4284                                struct io_async_msghdr *iomsg)
4285 {
4286         iomsg->msg.msg_name = &iomsg->addr;
4287         iomsg->free_iov = iomsg->fast_iov;
4288         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4289                                    req->sr_msg.msg_flags, &iomsg->free_iov);
4290 }
4291
4292 static int io_sendmsg_prep_async(struct io_kiocb *req)
4293 {
4294         int ret;
4295
4296         if (!io_op_defs[req->opcode].needs_async_data)
4297                 return 0;
4298         ret = io_sendmsg_copy_hdr(req, req->async_data);
4299         if (!ret)
4300                 req->flags |= REQ_F_NEED_CLEANUP;
4301         return ret;
4302 }
4303
4304 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4305 {
4306         struct io_sr_msg *sr = &req->sr_msg;
4307
4308         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4309                 return -EINVAL;
4310
4311         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4312         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4313         sr->len = READ_ONCE(sqe->len);
4314
4315 #ifdef CONFIG_COMPAT
4316         if (req->ctx->compat)
4317                 sr->msg_flags |= MSG_CMSG_COMPAT;
4318 #endif
4319         return 0;
4320 }
4321
4322 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4323 {
4324         struct io_async_msghdr iomsg, *kmsg;
4325         struct socket *sock;
4326         unsigned flags;
4327         int ret;
4328
4329         sock = sock_from_file(req->file);
4330         if (unlikely(!sock))
4331                 return -ENOTSOCK;
4332
4333         kmsg = req->async_data;
4334         if (!kmsg) {
4335                 ret = io_sendmsg_copy_hdr(req, &iomsg);
4336                 if (ret)
4337                         return ret;
4338                 kmsg = &iomsg;
4339         }
4340
4341         flags = req->sr_msg.msg_flags;
4342         if (flags & MSG_DONTWAIT)
4343                 req->flags |= REQ_F_NOWAIT;
4344         else if (issue_flags & IO_URING_F_NONBLOCK)
4345                 flags |= MSG_DONTWAIT;
4346
4347         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4348         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4349                 return io_setup_async_msg(req, kmsg);
4350         if (ret == -ERESTARTSYS)
4351                 ret = -EINTR;
4352
4353         /* fast path, check for non-NULL to avoid function call */
4354         if (kmsg->free_iov)
4355                 kfree(kmsg->free_iov);
4356         req->flags &= ~REQ_F_NEED_CLEANUP;
4357         if (ret < 0)
4358                 req_set_fail_links(req);
4359         __io_req_complete(req, issue_flags, ret, 0);
4360         return 0;
4361 }
4362
4363 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4364 {
4365         struct io_sr_msg *sr = &req->sr_msg;
4366         struct msghdr msg;
4367         struct iovec iov;
4368         struct socket *sock;
4369         unsigned flags;
4370         int ret;
4371
4372         sock = sock_from_file(req->file);
4373         if (unlikely(!sock))
4374                 return -ENOTSOCK;
4375
4376         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4377         if (unlikely(ret))
4378                 return ret;
4379
4380         msg.msg_name = NULL;
4381         msg.msg_control = NULL;
4382         msg.msg_controllen = 0;
4383         msg.msg_namelen = 0;
4384
4385         flags = req->sr_msg.msg_flags;
4386         if (flags & MSG_DONTWAIT)
4387                 req->flags |= REQ_F_NOWAIT;
4388         else if (issue_flags & IO_URING_F_NONBLOCK)
4389                 flags |= MSG_DONTWAIT;
4390
4391         msg.msg_flags = flags;
4392         ret = sock_sendmsg(sock, &msg);
4393         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4394                 return -EAGAIN;
4395         if (ret == -ERESTARTSYS)
4396                 ret = -EINTR;
4397
4398         if (ret < 0)
4399                 req_set_fail_links(req);
4400         __io_req_complete(req, issue_flags, ret, 0);
4401         return 0;
4402 }
4403
4404 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4405                                  struct io_async_msghdr *iomsg)
4406 {
4407         struct io_sr_msg *sr = &req->sr_msg;
4408         struct iovec __user *uiov;
4409         size_t iov_len;
4410         int ret;
4411
4412         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4413                                         &iomsg->uaddr, &uiov, &iov_len);
4414         if (ret)
4415                 return ret;
4416
4417         if (req->flags & REQ_F_BUFFER_SELECT) {
4418                 if (iov_len > 1)
4419                         return -EINVAL;
4420                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4421                         return -EFAULT;
4422                 sr->len = iomsg->fast_iov[0].iov_len;
4423                 iomsg->free_iov = NULL;
4424         } else {
4425                 iomsg->free_iov = iomsg->fast_iov;
4426                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4427                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
4428                                      false);
4429                 if (ret > 0)
4430                         ret = 0;
4431         }
4432
4433         return ret;
4434 }
4435
4436 #ifdef CONFIG_COMPAT
4437 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4438                                         struct io_async_msghdr *iomsg)
4439 {
4440         struct compat_msghdr __user *msg_compat;
4441         struct io_sr_msg *sr = &req->sr_msg;
4442         struct compat_iovec __user *uiov;
4443         compat_uptr_t ptr;
4444         compat_size_t len;
4445         int ret;
4446
4447         msg_compat = (struct compat_msghdr __user *) sr->umsg;
4448         ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
4449                                         &ptr, &len);
4450         if (ret)
4451                 return ret;
4452
4453         uiov = compat_ptr(ptr);
4454         if (req->flags & REQ_F_BUFFER_SELECT) {
4455                 compat_ssize_t clen;
4456
4457                 if (len > 1)
4458                         return -EINVAL;
4459                 if (!access_ok(uiov, sizeof(*uiov)))
4460                         return -EFAULT;
4461                 if (__get_user(clen, &uiov->iov_len))
4462                         return -EFAULT;
4463                 if (clen < 0)
4464                         return -EINVAL;
4465                 sr->len = clen;
4466                 iomsg->free_iov = NULL;
4467         } else {
4468                 iomsg->free_iov = iomsg->fast_iov;
4469                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4470                                    UIO_FASTIOV, &iomsg->free_iov,
4471                                    &iomsg->msg.msg_iter, true);
4472                 if (ret < 0)
4473                         return ret;
4474         }
4475
4476         return 0;
4477 }
4478 #endif
4479
4480 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4481                                struct io_async_msghdr *iomsg)
4482 {
4483         iomsg->msg.msg_name = &iomsg->addr;
4484
4485 #ifdef CONFIG_COMPAT
4486         if (req->ctx->compat)
4487                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
4488 #endif
4489
4490         return __io_recvmsg_copy_hdr(req, iomsg);
4491 }
4492
4493 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4494                                                bool needs_lock)
4495 {
4496         struct io_sr_msg *sr = &req->sr_msg;
4497         struct io_buffer *kbuf;
4498
4499         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4500         if (IS_ERR(kbuf))
4501                 return kbuf;
4502
4503         sr->kbuf = kbuf;
4504         req->flags |= REQ_F_BUFFER_SELECTED;
4505         return kbuf;
4506 }
4507
4508 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4509 {
4510         return io_put_kbuf(req, req->sr_msg.kbuf);
4511 }
4512
4513 static int io_recvmsg_prep_async(struct io_kiocb *req)
4514 {
4515         int ret;
4516
4517         if (!io_op_defs[req->opcode].needs_async_data)
4518                 return 0;
4519         ret = io_recvmsg_copy_hdr(req, req->async_data);
4520         if (!ret)
4521                 req->flags |= REQ_F_NEED_CLEANUP;
4522         return ret;
4523 }
4524
4525 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4526 {
4527         struct io_sr_msg *sr = &req->sr_msg;
4528
4529         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4530                 return -EINVAL;
4531
4532         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4533         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4534         sr->len = READ_ONCE(sqe->len);
4535         sr->bgid = READ_ONCE(sqe->buf_group);
4536
4537 #ifdef CONFIG_COMPAT
4538         if (req->ctx->compat)
4539                 sr->msg_flags |= MSG_CMSG_COMPAT;
4540 #endif
4541         return 0;
4542 }
4543
4544 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4545 {
4546         struct io_async_msghdr iomsg, *kmsg;
4547         struct socket *sock;
4548         struct io_buffer *kbuf;
4549         unsigned flags;
4550         int ret, cflags = 0;
4551         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4552
4553         sock = sock_from_file(req->file);
4554         if (unlikely(!sock))
4555                 return -ENOTSOCK;
4556
4557         kmsg = req->async_data;
4558         if (!kmsg) {
4559                 ret = io_recvmsg_copy_hdr(req, &iomsg);
4560                 if (ret)
4561                         return ret;
4562                 kmsg = &iomsg;
4563         }
4564
4565         if (req->flags & REQ_F_BUFFER_SELECT) {
4566                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4567                 if (IS_ERR(kbuf))
4568                         return PTR_ERR(kbuf);
4569                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4570                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4571                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4572                                 1, req->sr_msg.len);
4573         }
4574
4575         flags = req->sr_msg.msg_flags;
4576         if (flags & MSG_DONTWAIT)
4577                 req->flags |= REQ_F_NOWAIT;
4578         else if (force_nonblock)
4579                 flags |= MSG_DONTWAIT;
4580
4581         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4582                                         kmsg->uaddr, flags);
4583         if (force_nonblock && ret == -EAGAIN)
4584                 return io_setup_async_msg(req, kmsg);
4585         if (ret == -ERESTARTSYS)
4586                 ret = -EINTR;
4587
4588         if (req->flags & REQ_F_BUFFER_SELECTED)
4589                 cflags = io_put_recv_kbuf(req);
4590         /* fast path, check for non-NULL to avoid function call */
4591         if (kmsg->free_iov)
4592                 kfree(kmsg->free_iov);
4593         req->flags &= ~REQ_F_NEED_CLEANUP;
4594         if (ret < 0)
4595                 req_set_fail_links(req);
4596         __io_req_complete(req, issue_flags, ret, cflags);
4597         return 0;
4598 }
4599
4600 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
4601 {
4602         struct io_buffer *kbuf;
4603         struct io_sr_msg *sr = &req->sr_msg;
4604         struct msghdr msg;
4605         void __user *buf = sr->buf;
4606         struct socket *sock;
4607         struct iovec iov;
4608         unsigned flags;
4609         int ret, cflags = 0;
4610         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4611
4612         sock = sock_from_file(req->file);
4613         if (unlikely(!sock))
4614                 return -ENOTSOCK;
4615
4616         if (req->flags & REQ_F_BUFFER_SELECT) {
4617                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4618                 if (IS_ERR(kbuf))
4619                         return PTR_ERR(kbuf);
4620                 buf = u64_to_user_ptr(kbuf->addr);
4621         }
4622
4623         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4624         if (unlikely(ret))
4625                 goto out_free;
4626
4627         msg.msg_name = NULL;
4628         msg.msg_control = NULL;
4629         msg.msg_controllen = 0;
4630         msg.msg_namelen = 0;
4631         msg.msg_iocb = NULL;
4632         msg.msg_flags = 0;
4633
4634         flags = req->sr_msg.msg_flags;
4635         if (flags & MSG_DONTWAIT)
4636                 req->flags |= REQ_F_NOWAIT;
4637         else if (force_nonblock)
4638                 flags |= MSG_DONTWAIT;
4639
4640         ret = sock_recvmsg(sock, &msg, flags);
4641         if (force_nonblock && ret == -EAGAIN)
4642                 return -EAGAIN;
4643         if (ret == -ERESTARTSYS)
4644                 ret = -EINTR;
4645 out_free:
4646         if (req->flags & REQ_F_BUFFER_SELECTED)
4647                 cflags = io_put_recv_kbuf(req);
4648         if (ret < 0)
4649                 req_set_fail_links(req);
4650         __io_req_complete(req, issue_flags, ret, cflags);
4651         return 0;
4652 }
4653
4654 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4655 {
4656         struct io_accept *accept = &req->accept;
4657
4658         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4659                 return -EINVAL;
4660         if (sqe->ioprio || sqe->len || sqe->buf_index)
4661                 return -EINVAL;
4662
4663         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4664         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4665         accept->flags = READ_ONCE(sqe->accept_flags);
4666         accept->nofile = rlimit(RLIMIT_NOFILE);
4667         return 0;
4668 }
4669
4670 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
4671 {
4672         struct io_accept *accept = &req->accept;
4673         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4674         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4675         int ret;
4676
4677         if (req->file->f_flags & O_NONBLOCK)
4678                 req->flags |= REQ_F_NOWAIT;
4679
4680         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4681                                         accept->addr_len, accept->flags,
4682                                         accept->nofile);
4683         if (ret == -EAGAIN && force_nonblock)
4684                 return -EAGAIN;
4685         if (ret < 0) {
4686                 if (ret == -ERESTARTSYS)
4687                         ret = -EINTR;
4688                 req_set_fail_links(req);
4689         }
4690         __io_req_complete(req, issue_flags, ret, 0);
4691         return 0;
4692 }
4693
4694 static int io_connect_prep_async(struct io_kiocb *req)
4695 {
4696         struct io_async_connect *io = req->async_data;
4697         struct io_connect *conn = &req->connect;
4698
4699         return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4700 }
4701
4702 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4703 {
4704         struct io_connect *conn = &req->connect;
4705
4706         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4707                 return -EINVAL;
4708         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4709                 return -EINVAL;
4710
4711         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4712         conn->addr_len =  READ_ONCE(sqe->addr2);
4713         return 0;
4714 }
4715
4716 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
4717 {
4718         struct io_async_connect __io, *io;
4719         unsigned file_flags;
4720         int ret;
4721         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4722
4723         if (req->async_data) {
4724                 io = req->async_data;
4725         } else {
4726                 ret = move_addr_to_kernel(req->connect.addr,
4727                                                 req->connect.addr_len,
4728                                                 &__io.address);
4729                 if (ret)
4730                         goto out;
4731                 io = &__io;
4732         }
4733
4734         file_flags = force_nonblock ? O_NONBLOCK : 0;
4735
4736         ret = __sys_connect_file(req->file, &io->address,
4737                                         req->connect.addr_len, file_flags);
4738         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4739                 if (req->async_data)
4740                         return -EAGAIN;
4741                 if (io_alloc_async_data(req)) {
4742                         ret = -ENOMEM;
4743                         goto out;
4744                 }
4745                 io = req->async_data;
4746                 memcpy(req->async_data, &__io, sizeof(__io));
4747                 return -EAGAIN;
4748         }
4749         if (ret == -ERESTARTSYS)
4750                 ret = -EINTR;
4751 out:
4752         if (ret < 0)
4753                 req_set_fail_links(req);
4754         __io_req_complete(req, issue_flags, ret, 0);
4755         return 0;
4756 }
4757 #else /* !CONFIG_NET */
4758 #define IO_NETOP_FN(op)                                                 \
4759 static int io_##op(struct io_kiocb *req, unsigned int issue_flags)      \
4760 {                                                                       \
4761         return -EOPNOTSUPP;                                             \
4762 }
4763
4764 #define IO_NETOP_PREP(op)                                               \
4765 IO_NETOP_FN(op)                                                         \
4766 static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
4767 {                                                                       \
4768         return -EOPNOTSUPP;                                             \
4769 }                                                                       \
4770
4771 #define IO_NETOP_PREP_ASYNC(op)                                         \
4772 IO_NETOP_PREP(op)                                                       \
4773 static int io_##op##_prep_async(struct io_kiocb *req)                   \
4774 {                                                                       \
4775         return -EOPNOTSUPP;                                             \
4776 }
4777
4778 IO_NETOP_PREP_ASYNC(sendmsg);
4779 IO_NETOP_PREP_ASYNC(recvmsg);
4780 IO_NETOP_PREP_ASYNC(connect);
4781 IO_NETOP_PREP(accept);
4782 IO_NETOP_FN(send);
4783 IO_NETOP_FN(recv);
4784 #endif /* CONFIG_NET */
4785
4786 struct io_poll_table {
4787         struct poll_table_struct pt;
4788         struct io_kiocb *req;
4789         int error;
4790 };
4791
4792 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4793                            __poll_t mask, task_work_func_t func)
4794 {
4795         int ret;
4796
4797         /* for instances that support it check for an event match first: */
4798         if (mask && !(mask & poll->events))
4799                 return 0;
4800
4801         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4802
4803         list_del_init(&poll->wait.entry);
4804
4805         req->result = mask;
4806         req->task_work.func = func;
4807         percpu_ref_get(&req->ctx->refs);
4808
4809         /*
4810          * If this fails, then the task is exiting. When a task exits, the
4811          * work gets canceled, so just cancel this request as well instead
4812          * of executing it. We can't safely execute it anyway, as we may not
4813          * have the needed state needed for it anyway.
4814          */
4815         ret = io_req_task_work_add(req);
4816         if (unlikely(ret)) {
4817                 WRITE_ONCE(poll->canceled, true);
4818                 io_req_task_work_add_fallback(req, func);
4819         }
4820         return 1;
4821 }
4822
4823 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4824         __acquires(&req->ctx->completion_lock)
4825 {
4826         struct io_ring_ctx *ctx = req->ctx;
4827
4828         if (!req->result && !READ_ONCE(poll->canceled)) {
4829                 struct poll_table_struct pt = { ._key = poll->events };
4830
4831                 req->result = vfs_poll(req->file, &pt) & poll->events;
4832         }
4833
4834         spin_lock_irq(&ctx->completion_lock);
4835         if (!req->result && !READ_ONCE(poll->canceled)) {
4836                 add_wait_queue(poll->head, &poll->wait);
4837                 return true;
4838         }
4839
4840         return false;
4841 }
4842
4843 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
4844 {
4845         /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
4846         if (req->opcode == IORING_OP_POLL_ADD)
4847                 return req->async_data;
4848         return req->apoll->double_poll;
4849 }
4850
4851 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4852 {
4853         if (req->opcode == IORING_OP_POLL_ADD)
4854                 return &req->poll;
4855         return &req->apoll->poll;
4856 }
4857
4858 static void io_poll_remove_double(struct io_kiocb *req)
4859 {
4860         struct io_poll_iocb *poll = io_poll_get_double(req);
4861
4862         lockdep_assert_held(&req->ctx->completion_lock);
4863
4864         if (poll && poll->head) {
4865                 struct wait_queue_head *head = poll->head;
4866
4867                 spin_lock(&head->lock);
4868                 list_del_init(&poll->wait.entry);
4869                 if (poll->wait.private)
4870                         refcount_dec(&req->refs);
4871                 poll->head = NULL;
4872                 spin_unlock(&head->lock);
4873         }
4874 }
4875
4876 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4877 {
4878         struct io_ring_ctx *ctx = req->ctx;
4879
4880         io_poll_remove_double(req);
4881         req->poll.done = true;
4882         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4883         io_commit_cqring(ctx);
4884 }
4885
4886 static void io_poll_task_func(struct callback_head *cb)
4887 {
4888         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4889         struct io_ring_ctx *ctx = req->ctx;
4890         struct io_kiocb *nxt;
4891
4892         if (io_poll_rewait(req, &req->poll)) {
4893                 spin_unlock_irq(&ctx->completion_lock);
4894         } else {
4895                 hash_del(&req->hash_node);
4896                 io_poll_complete(req, req->result, 0);
4897                 spin_unlock_irq(&ctx->completion_lock);
4898
4899                 nxt = io_put_req_find_next(req);
4900                 io_cqring_ev_posted(ctx);
4901                 if (nxt)
4902                         __io_req_task_submit(nxt);
4903         }
4904
4905         percpu_ref_put(&ctx->refs);
4906 }
4907
4908 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4909                                int sync, void *key)
4910 {
4911         struct io_kiocb *req = wait->private;
4912         struct io_poll_iocb *poll = io_poll_get_single(req);
4913         __poll_t mask = key_to_poll(key);
4914
4915         /* for instances that support it check for an event match first: */
4916         if (mask && !(mask & poll->events))
4917                 return 0;
4918
4919         list_del_init(&wait->entry);
4920
4921         if (poll && poll->head) {
4922                 bool done;
4923
4924                 spin_lock(&poll->head->lock);
4925                 done = list_empty(&poll->wait.entry);
4926                 if (!done)
4927                         list_del_init(&poll->wait.entry);
4928                 /* make sure double remove sees this as being gone */
4929                 wait->private = NULL;
4930                 spin_unlock(&poll->head->lock);
4931                 if (!done) {
4932                         /* use wait func handler, so it matches the rq type */
4933                         poll->wait.func(&poll->wait, mode, sync, key);
4934                 }
4935         }
4936         refcount_dec(&req->refs);
4937         return 1;
4938 }
4939
4940 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4941                               wait_queue_func_t wake_func)
4942 {
4943         poll->head = NULL;
4944         poll->done = false;
4945         poll->canceled = false;
4946         poll->events = events;
4947         INIT_LIST_HEAD(&poll->wait.entry);
4948         init_waitqueue_func_entry(&poll->wait, wake_func);
4949 }
4950
4951 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4952                             struct wait_queue_head *head,
4953                             struct io_poll_iocb **poll_ptr)
4954 {
4955         struct io_kiocb *req = pt->req;
4956
4957         /*
4958          * If poll->head is already set, it's because the file being polled
4959          * uses multiple waitqueues for poll handling (eg one for read, one
4960          * for write). Setup a separate io_poll_iocb if this happens.
4961          */
4962         if (unlikely(poll->head)) {
4963                 struct io_poll_iocb *poll_one = poll;
4964
4965                 /* already have a 2nd entry, fail a third attempt */
4966                 if (*poll_ptr) {
4967                         pt->error = -EINVAL;
4968                         return;
4969                 }
4970                 /* double add on the same waitqueue head, ignore */
4971                 if (poll->head == head)
4972                         return;
4973                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4974                 if (!poll) {
4975                         pt->error = -ENOMEM;
4976                         return;
4977                 }
4978                 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
4979                 refcount_inc(&req->refs);
4980                 poll->wait.private = req;
4981                 *poll_ptr = poll;
4982         }
4983
4984         pt->error = 0;
4985         poll->head = head;
4986
4987         if (poll->events & EPOLLEXCLUSIVE)
4988                 add_wait_queue_exclusive(head, &poll->wait);
4989         else
4990                 add_wait_queue(head, &poll->wait);
4991 }
4992
4993 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4994                                struct poll_table_struct *p)
4995 {
4996         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4997         struct async_poll *apoll = pt->req->apoll;
4998
4999         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5000 }
5001
5002 static void io_async_task_func(struct callback_head *cb)
5003 {
5004         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
5005         struct async_poll *apoll = req->apoll;
5006         struct io_ring_ctx *ctx = req->ctx;
5007
5008         trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
5009
5010         if (io_poll_rewait(req, &apoll->poll)) {
5011                 spin_unlock_irq(&ctx->completion_lock);
5012                 percpu_ref_put(&ctx->refs);
5013                 return;
5014         }
5015
5016         /* If req is still hashed, it cannot have been canceled. Don't check. */
5017         if (hash_hashed(&req->hash_node))
5018                 hash_del(&req->hash_node);
5019
5020         io_poll_remove_double(req);
5021         spin_unlock_irq(&ctx->completion_lock);
5022
5023         if (!READ_ONCE(apoll->poll.canceled))
5024                 __io_req_task_submit(req);
5025         else
5026                 __io_req_task_cancel(req, -ECANCELED);
5027
5028         percpu_ref_put(&ctx->refs);
5029         kfree(apoll->double_poll);
5030         kfree(apoll);
5031 }
5032
5033 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5034                         void *key)
5035 {
5036         struct io_kiocb *req = wait->private;
5037         struct io_poll_iocb *poll = &req->apoll->poll;
5038
5039         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5040                                         key_to_poll(key));
5041
5042         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5043 }
5044
5045 static void io_poll_req_insert(struct io_kiocb *req)
5046 {
5047         struct io_ring_ctx *ctx = req->ctx;
5048         struct hlist_head *list;
5049
5050         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5051         hlist_add_head(&req->hash_node, list);
5052 }
5053
5054 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5055                                       struct io_poll_iocb *poll,
5056                                       struct io_poll_table *ipt, __poll_t mask,
5057                                       wait_queue_func_t wake_func)
5058         __acquires(&ctx->completion_lock)
5059 {
5060         struct io_ring_ctx *ctx = req->ctx;
5061         bool cancel = false;
5062
5063         INIT_HLIST_NODE(&req->hash_node);
5064         io_init_poll_iocb(poll, mask, wake_func);
5065         poll->file = req->file;
5066         poll->wait.private = req;
5067
5068         ipt->pt._key = mask;
5069         ipt->req = req;
5070         ipt->error = -EINVAL;
5071
5072         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5073
5074         spin_lock_irq(&ctx->completion_lock);
5075         if (likely(poll->head)) {
5076                 spin_lock(&poll->head->lock);
5077                 if (unlikely(list_empty(&poll->wait.entry))) {
5078                         if (ipt->error)
5079                                 cancel = true;
5080                         ipt->error = 0;
5081                         mask = 0;
5082                 }
5083                 if (mask || ipt->error)
5084                         list_del_init(&poll->wait.entry);
5085                 else if (cancel)
5086                         WRITE_ONCE(poll->canceled, true);
5087                 else if (!poll->done) /* actually waiting for an event */
5088                         io_poll_req_insert(req);
5089                 spin_unlock(&poll->head->lock);
5090         }
5091
5092         return mask;
5093 }
5094
5095 static bool io_arm_poll_handler(struct io_kiocb *req)
5096 {
5097         const struct io_op_def *def = &io_op_defs[req->opcode];
5098         struct io_ring_ctx *ctx = req->ctx;
5099         struct async_poll *apoll;
5100         struct io_poll_table ipt;
5101         __poll_t mask, ret;
5102         int rw;
5103
5104         if (!req->file || !file_can_poll(req->file))
5105                 return false;
5106         if (req->flags & REQ_F_POLLED)
5107                 return false;
5108         if (def->pollin)
5109                 rw = READ;
5110         else if (def->pollout)
5111                 rw = WRITE;
5112         else
5113                 return false;
5114         /* if we can't nonblock try, then no point in arming a poll handler */
5115         if (!io_file_supports_async(req->file, rw))
5116                 return false;
5117
5118         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5119         if (unlikely(!apoll))
5120                 return false;
5121         apoll->double_poll = NULL;
5122
5123         req->flags |= REQ_F_POLLED;
5124         req->apoll = apoll;
5125
5126         mask = 0;
5127         if (def->pollin)
5128                 mask |= POLLIN | POLLRDNORM;
5129         if (def->pollout)
5130                 mask |= POLLOUT | POLLWRNORM;
5131
5132         /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5133         if ((req->opcode == IORING_OP_RECVMSG) &&
5134             (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5135                 mask &= ~POLLIN;
5136
5137         mask |= POLLERR | POLLPRI;
5138
5139         ipt.pt._qproc = io_async_queue_proc;
5140
5141         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5142                                         io_async_wake);
5143         if (ret || ipt.error) {
5144                 io_poll_remove_double(req);
5145                 spin_unlock_irq(&ctx->completion_lock);
5146                 kfree(apoll->double_poll);
5147                 kfree(apoll);
5148                 return false;
5149         }
5150         spin_unlock_irq(&ctx->completion_lock);
5151         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5152                                         apoll->poll.events);
5153         return true;
5154 }
5155
5156 static bool __io_poll_remove_one(struct io_kiocb *req,
5157                                  struct io_poll_iocb *poll)
5158 {
5159         bool do_complete = false;
5160
5161         spin_lock(&poll->head->lock);
5162         WRITE_ONCE(poll->canceled, true);
5163         if (!list_empty(&poll->wait.entry)) {
5164                 list_del_init(&poll->wait.entry);
5165                 do_complete = true;
5166         }
5167         spin_unlock(&poll->head->lock);
5168         hash_del(&req->hash_node);
5169         return do_complete;
5170 }
5171
5172 static bool io_poll_remove_one(struct io_kiocb *req)
5173 {
5174         bool do_complete;
5175
5176         io_poll_remove_double(req);
5177
5178         if (req->opcode == IORING_OP_POLL_ADD) {
5179                 do_complete = __io_poll_remove_one(req, &req->poll);
5180         } else {
5181                 struct async_poll *apoll = req->apoll;
5182
5183                 /* non-poll requests have submit ref still */
5184                 do_complete = __io_poll_remove_one(req, &apoll->poll);
5185                 if (do_complete) {
5186                         io_put_req(req);
5187                         kfree(apoll->double_poll);
5188                         kfree(apoll);
5189                 }
5190         }
5191
5192         if (do_complete) {
5193                 io_cqring_fill_event(req, -ECANCELED);
5194                 io_commit_cqring(req->ctx);
5195                 req_set_fail_links(req);
5196                 io_put_req_deferred(req, 1);
5197         }
5198
5199         return do_complete;
5200 }
5201
5202 /*
5203  * Returns true if we found and killed one or more poll requests
5204  */
5205 static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5206                                struct files_struct *files)
5207 {
5208         struct hlist_node *tmp;
5209         struct io_kiocb *req;
5210         int posted = 0, i;
5211
5212         spin_lock_irq(&ctx->completion_lock);
5213         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5214                 struct hlist_head *list;
5215
5216                 list = &ctx->cancel_hash[i];
5217                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5218                         if (io_match_task(req, tsk, files))
5219                                 posted += io_poll_remove_one(req);
5220                 }
5221         }
5222         spin_unlock_irq(&ctx->completion_lock);
5223
5224         if (posted)
5225                 io_cqring_ev_posted(ctx);
5226
5227         return posted != 0;
5228 }
5229
5230 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5231 {
5232         struct hlist_head *list;
5233         struct io_kiocb *req;
5234
5235         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5236         hlist_for_each_entry(req, list, hash_node) {
5237                 if (sqe_addr != req->user_data)
5238                         continue;
5239                 if (io_poll_remove_one(req))
5240                         return 0;
5241                 return -EALREADY;
5242         }
5243
5244         return -ENOENT;
5245 }
5246
5247 static int io_poll_remove_prep(struct io_kiocb *req,
5248                                const struct io_uring_sqe *sqe)
5249 {
5250         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5251                 return -EINVAL;
5252         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5253             sqe->poll_events)
5254                 return -EINVAL;
5255
5256         req->poll_remove.addr = READ_ONCE(sqe->addr);
5257         return 0;
5258 }
5259
5260 /*
5261  * Find a running poll command that matches one specified in sqe->addr,
5262  * and remove it if found.
5263  */
5264 static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
5265 {
5266         struct io_ring_ctx *ctx = req->ctx;
5267         int ret;
5268
5269         spin_lock_irq(&ctx->completion_lock);
5270         ret = io_poll_cancel(ctx, req->poll_remove.addr);
5271         spin_unlock_irq(&ctx->completion_lock);
5272
5273         if (ret < 0)
5274                 req_set_fail_links(req);
5275         io_req_complete(req, ret);
5276         return 0;
5277 }
5278
5279 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5280                         void *key)
5281 {
5282         struct io_kiocb *req = wait->private;
5283         struct io_poll_iocb *poll = &req->poll;
5284
5285         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5286 }
5287
5288 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5289                                struct poll_table_struct *p)
5290 {
5291         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5292
5293         __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5294 }
5295
5296 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5297 {
5298         struct io_poll_iocb *poll = &req->poll;
5299         u32 events;
5300
5301         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5302                 return -EINVAL;
5303         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5304                 return -EINVAL;
5305
5306         events = READ_ONCE(sqe->poll32_events);
5307 #ifdef __BIG_ENDIAN
5308         events = swahw32(events);
5309 #endif
5310         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5311                        (events & EPOLLEXCLUSIVE);
5312         return 0;
5313 }
5314
5315 static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5316 {
5317         struct io_poll_iocb *poll = &req->poll;
5318         struct io_ring_ctx *ctx = req->ctx;
5319         struct io_poll_table ipt;
5320         __poll_t mask;
5321
5322         ipt.pt._qproc = io_poll_queue_proc;
5323
5324         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5325                                         io_poll_wake);
5326
5327         if (mask) { /* no async, we'd stolen it */
5328                 ipt.error = 0;
5329                 io_poll_complete(req, mask, 0);
5330         }
5331         spin_unlock_irq(&ctx->completion_lock);
5332
5333         if (mask) {
5334                 io_cqring_ev_posted(ctx);
5335                 io_put_req(req);
5336         }
5337         return ipt.error;
5338 }
5339
5340 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5341 {
5342         struct io_timeout_data *data = container_of(timer,
5343                                                 struct io_timeout_data, timer);
5344         struct io_kiocb *req = data->req;
5345         struct io_ring_ctx *ctx = req->ctx;
5346         unsigned long flags;
5347
5348         spin_lock_irqsave(&ctx->completion_lock, flags);
5349         list_del_init(&req->timeout.list);
5350         atomic_set(&req->ctx->cq_timeouts,
5351                 atomic_read(&req->ctx->cq_timeouts) + 1);
5352
5353         io_cqring_fill_event(req, -ETIME);
5354         io_commit_cqring(ctx);
5355         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5356
5357         io_cqring_ev_posted(ctx);
5358         req_set_fail_links(req);
5359         io_put_req(req);
5360         return HRTIMER_NORESTART;
5361 }
5362
5363 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5364                                            __u64 user_data)
5365 {
5366         struct io_timeout_data *io;
5367         struct io_kiocb *req;
5368         int ret = -ENOENT;
5369
5370         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5371                 if (user_data == req->user_data) {
5372                         ret = 0;
5373                         break;
5374                 }
5375         }
5376
5377         if (ret == -ENOENT)
5378                 return ERR_PTR(ret);
5379
5380         io = req->async_data;
5381         ret = hrtimer_try_to_cancel(&io->timer);
5382         if (ret == -1)
5383                 return ERR_PTR(-EALREADY);
5384         list_del_init(&req->timeout.list);
5385         return req;
5386 }
5387
5388 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5389 {
5390         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5391
5392         if (IS_ERR(req))
5393                 return PTR_ERR(req);
5394
5395         req_set_fail_links(req);
5396         io_cqring_fill_event(req, -ECANCELED);
5397         io_put_req_deferred(req, 1);
5398         return 0;
5399 }
5400
5401 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5402                              struct timespec64 *ts, enum hrtimer_mode mode)
5403 {
5404         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5405         struct io_timeout_data *data;
5406
5407         if (IS_ERR(req))
5408                 return PTR_ERR(req);
5409
5410         req->timeout.off = 0; /* noseq */
5411         data = req->async_data;
5412         list_add_tail(&req->timeout.list, &ctx->timeout_list);
5413         hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5414         data->timer.function = io_timeout_fn;
5415         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5416         return 0;
5417 }
5418
5419 static int io_timeout_remove_prep(struct io_kiocb *req,
5420                                   const struct io_uring_sqe *sqe)
5421 {
5422         struct io_timeout_rem *tr = &req->timeout_rem;
5423
5424         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5425                 return -EINVAL;
5426         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5427                 return -EINVAL;
5428         if (sqe->ioprio || sqe->buf_index || sqe->len)
5429                 return -EINVAL;
5430
5431         tr->addr = READ_ONCE(sqe->addr);
5432         tr->flags = READ_ONCE(sqe->timeout_flags);
5433         if (tr->flags & IORING_TIMEOUT_UPDATE) {
5434                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5435                         return -EINVAL;
5436                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5437                         return -EFAULT;
5438         } else if (tr->flags) {
5439                 /* timeout removal doesn't support flags */
5440                 return -EINVAL;
5441         }
5442
5443         return 0;
5444 }
5445
5446 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5447 {
5448         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5449                                             : HRTIMER_MODE_REL;
5450 }
5451
5452 /*
5453  * Remove or update an existing timeout command
5454  */
5455 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
5456 {
5457         struct io_timeout_rem *tr = &req->timeout_rem;
5458         struct io_ring_ctx *ctx = req->ctx;
5459         int ret;
5460
5461         spin_lock_irq(&ctx->completion_lock);
5462         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
5463                 ret = io_timeout_cancel(ctx, tr->addr);
5464         else
5465                 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5466                                         io_translate_timeout_mode(tr->flags));
5467
5468         io_cqring_fill_event(req, ret);
5469         io_commit_cqring(ctx);
5470         spin_unlock_irq(&ctx->completion_lock);
5471         io_cqring_ev_posted(ctx);
5472         if (ret < 0)
5473                 req_set_fail_links(req);
5474         io_put_req(req);
5475         return 0;
5476 }
5477
5478 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5479                            bool is_timeout_link)
5480 {
5481         struct io_timeout_data *data;
5482         unsigned flags;
5483         u32 off = READ_ONCE(sqe->off);
5484
5485         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5486                 return -EINVAL;
5487         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5488                 return -EINVAL;
5489         if (off && is_timeout_link)
5490                 return -EINVAL;
5491         flags = READ_ONCE(sqe->timeout_flags);
5492         if (flags & ~IORING_TIMEOUT_ABS)
5493                 return -EINVAL;
5494
5495         req->timeout.off = off;
5496
5497         if (!req->async_data && io_alloc_async_data(req))
5498                 return -ENOMEM;
5499
5500         data = req->async_data;
5501         data->req = req;
5502
5503         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5504                 return -EFAULT;
5505
5506         data->mode = io_translate_timeout_mode(flags);
5507         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5508         io_req_track_inflight(req);
5509         return 0;
5510 }
5511
5512 static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
5513 {
5514         struct io_ring_ctx *ctx = req->ctx;
5515         struct io_timeout_data *data = req->async_data;
5516         struct list_head *entry;
5517         u32 tail, off = req->timeout.off;
5518
5519         spin_lock_irq(&ctx->completion_lock);
5520
5521         /*
5522          * sqe->off holds how many events that need to occur for this
5523          * timeout event to be satisfied. If it isn't set, then this is
5524          * a pure timeout request, sequence isn't used.
5525          */
5526         if (io_is_timeout_noseq(req)) {
5527                 entry = ctx->timeout_list.prev;
5528                 goto add;
5529         }
5530
5531         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5532         req->timeout.target_seq = tail + off;
5533
5534         /* Update the last seq here in case io_flush_timeouts() hasn't.
5535          * This is safe because ->completion_lock is held, and submissions
5536          * and completions are never mixed in the same ->completion_lock section.
5537          */
5538         ctx->cq_last_tm_flush = tail;
5539
5540         /*
5541          * Insertion sort, ensuring the first entry in the list is always
5542          * the one we need first.
5543          */
5544         list_for_each_prev(entry, &ctx->timeout_list) {
5545                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5546                                                   timeout.list);
5547
5548                 if (io_is_timeout_noseq(nxt))
5549                         continue;
5550                 /* nxt.seq is behind @tail, otherwise would've been completed */
5551                 if (off >= nxt->timeout.target_seq - tail)
5552                         break;
5553         }
5554 add:
5555         list_add(&req->timeout.list, entry);
5556         data->timer.function = io_timeout_fn;
5557         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5558         spin_unlock_irq(&ctx->completion_lock);
5559         return 0;
5560 }
5561
5562 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5563 {
5564         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5565
5566         return req->user_data == (unsigned long) data;
5567 }
5568
5569 static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
5570 {
5571         enum io_wq_cancel cancel_ret;
5572         int ret = 0;
5573
5574         if (!tctx->io_wq)
5575                 return -ENOENT;
5576
5577         cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
5578         switch (cancel_ret) {
5579         case IO_WQ_CANCEL_OK:
5580                 ret = 0;
5581                 break;
5582         case IO_WQ_CANCEL_RUNNING:
5583                 ret = -EALREADY;
5584                 break;
5585         case IO_WQ_CANCEL_NOTFOUND:
5586                 ret = -ENOENT;
5587                 break;
5588         }
5589
5590         return ret;
5591 }
5592
5593 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5594                                      struct io_kiocb *req, __u64 sqe_addr,
5595                                      int success_ret)
5596 {
5597         unsigned long flags;
5598         int ret;
5599
5600         ret = io_async_cancel_one(req->task->io_uring,
5601                                         (void *) (unsigned long) sqe_addr);
5602         if (ret != -ENOENT) {
5603                 spin_lock_irqsave(&ctx->completion_lock, flags);
5604                 goto done;
5605         }
5606
5607         spin_lock_irqsave(&ctx->completion_lock, flags);
5608         ret = io_timeout_cancel(ctx, sqe_addr);
5609         if (ret != -ENOENT)
5610                 goto done;
5611         ret = io_poll_cancel(ctx, sqe_addr);
5612 done:
5613         if (!ret)
5614                 ret = success_ret;
5615         io_cqring_fill_event(req, ret);
5616         io_commit_cqring(ctx);
5617         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5618         io_cqring_ev_posted(ctx);
5619
5620         if (ret < 0)
5621                 req_set_fail_links(req);
5622         io_put_req(req);
5623 }
5624
5625 static int io_async_cancel_prep(struct io_kiocb *req,
5626                                 const struct io_uring_sqe *sqe)
5627 {
5628         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5629                 return -EINVAL;
5630         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5631                 return -EINVAL;
5632         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
5633                 return -EINVAL;
5634
5635         req->cancel.addr = READ_ONCE(sqe->addr);
5636         return 0;
5637 }
5638
5639 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
5640 {
5641         struct io_ring_ctx *ctx = req->ctx;
5642
5643         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5644         return 0;
5645 }
5646
5647 static int io_rsrc_update_prep(struct io_kiocb *req,
5648                                 const struct io_uring_sqe *sqe)
5649 {
5650         if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
5651                 return -EINVAL;
5652         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5653                 return -EINVAL;
5654         if (sqe->ioprio || sqe->rw_flags)
5655                 return -EINVAL;
5656
5657         req->rsrc_update.offset = READ_ONCE(sqe->off);
5658         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
5659         if (!req->rsrc_update.nr_args)
5660                 return -EINVAL;
5661         req->rsrc_update.arg = READ_ONCE(sqe->addr);
5662         return 0;
5663 }
5664
5665 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
5666 {
5667         struct io_ring_ctx *ctx = req->ctx;
5668         struct io_uring_rsrc_update up;
5669         int ret;
5670
5671         if (issue_flags & IO_URING_F_NONBLOCK)
5672                 return -EAGAIN;
5673
5674         up.offset = req->rsrc_update.offset;
5675         up.data = req->rsrc_update.arg;
5676
5677         mutex_lock(&ctx->uring_lock);
5678         ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
5679         mutex_unlock(&ctx->uring_lock);
5680
5681         if (ret < 0)
5682                 req_set_fail_links(req);
5683         __io_req_complete(req, issue_flags, ret, 0);
5684         return 0;
5685 }
5686
5687 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5688 {
5689         switch (req->opcode) {
5690         case IORING_OP_NOP:
5691                 return 0;
5692         case IORING_OP_READV:
5693         case IORING_OP_READ_FIXED:
5694         case IORING_OP_READ:
5695                 return io_read_prep(req, sqe);
5696         case IORING_OP_WRITEV:
5697         case IORING_OP_WRITE_FIXED:
5698         case IORING_OP_WRITE:
5699                 return io_write_prep(req, sqe);
5700         case IORING_OP_POLL_ADD:
5701                 return io_poll_add_prep(req, sqe);
5702         case IORING_OP_POLL_REMOVE:
5703                 return io_poll_remove_prep(req, sqe);
5704         case IORING_OP_FSYNC:
5705                 return io_fsync_prep(req, sqe);
5706         case IORING_OP_SYNC_FILE_RANGE:
5707                 return io_sfr_prep(req, sqe);
5708         case IORING_OP_SENDMSG:
5709         case IORING_OP_SEND:
5710                 return io_sendmsg_prep(req, sqe);
5711         case IORING_OP_RECVMSG:
5712         case IORING_OP_RECV:
5713                 return io_recvmsg_prep(req, sqe);
5714         case IORING_OP_CONNECT:
5715                 return io_connect_prep(req, sqe);
5716         case IORING_OP_TIMEOUT:
5717                 return io_timeout_prep(req, sqe, false);
5718         case IORING_OP_TIMEOUT_REMOVE:
5719                 return io_timeout_remove_prep(req, sqe);
5720         case IORING_OP_ASYNC_CANCEL:
5721                 return io_async_cancel_prep(req, sqe);
5722         case IORING_OP_LINK_TIMEOUT:
5723                 return io_timeout_prep(req, sqe, true);
5724         case IORING_OP_ACCEPT:
5725                 return io_accept_prep(req, sqe);
5726         case IORING_OP_FALLOCATE:
5727                 return io_fallocate_prep(req, sqe);
5728         case IORING_OP_OPENAT:
5729                 return io_openat_prep(req, sqe);
5730         case IORING_OP_CLOSE:
5731                 return io_close_prep(req, sqe);
5732         case IORING_OP_FILES_UPDATE:
5733                 return io_rsrc_update_prep(req, sqe);
5734         case IORING_OP_STATX:
5735                 return io_statx_prep(req, sqe);
5736         case IORING_OP_FADVISE:
5737                 return io_fadvise_prep(req, sqe);
5738         case IORING_OP_MADVISE:
5739                 return io_madvise_prep(req, sqe);
5740         case IORING_OP_OPENAT2:
5741                 return io_openat2_prep(req, sqe);
5742         case IORING_OP_EPOLL_CTL:
5743                 return io_epoll_ctl_prep(req, sqe);
5744         case IORING_OP_SPLICE:
5745                 return io_splice_prep(req, sqe);
5746         case IORING_OP_PROVIDE_BUFFERS:
5747                 return io_provide_buffers_prep(req, sqe);
5748         case IORING_OP_REMOVE_BUFFERS:
5749                 return io_remove_buffers_prep(req, sqe);
5750         case IORING_OP_TEE:
5751                 return io_tee_prep(req, sqe);
5752         case IORING_OP_SHUTDOWN:
5753                 return io_shutdown_prep(req, sqe);
5754         case IORING_OP_RENAMEAT:
5755                 return io_renameat_prep(req, sqe);
5756         case IORING_OP_UNLINKAT:
5757                 return io_unlinkat_prep(req, sqe);
5758         }
5759
5760         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5761                         req->opcode);
5762         return-EINVAL;
5763 }
5764
5765 static int io_req_prep_async(struct io_kiocb *req)
5766 {
5767         switch (req->opcode) {
5768         case IORING_OP_READV:
5769         case IORING_OP_READ_FIXED:
5770         case IORING_OP_READ:
5771                 return io_rw_prep_async(req, READ);
5772         case IORING_OP_WRITEV:
5773         case IORING_OP_WRITE_FIXED:
5774         case IORING_OP_WRITE:
5775                 return io_rw_prep_async(req, WRITE);
5776         case IORING_OP_SENDMSG:
5777         case IORING_OP_SEND:
5778                 return io_sendmsg_prep_async(req);
5779         case IORING_OP_RECVMSG:
5780         case IORING_OP_RECV:
5781                 return io_recvmsg_prep_async(req);
5782         case IORING_OP_CONNECT:
5783                 return io_connect_prep_async(req);
5784         }
5785         return 0;
5786 }
5787
5788 static int io_req_defer_prep(struct io_kiocb *req)
5789 {
5790         if (!io_op_defs[req->opcode].needs_async_data)
5791                 return 0;
5792         /* some opcodes init it during the inital prep */
5793         if (req->async_data)
5794                 return 0;
5795         if (__io_alloc_async_data(req))
5796                 return -EAGAIN;
5797         return io_req_prep_async(req);
5798 }
5799
5800 static u32 io_get_sequence(struct io_kiocb *req)
5801 {
5802         struct io_kiocb *pos;
5803         struct io_ring_ctx *ctx = req->ctx;
5804         u32 total_submitted, nr_reqs = 0;
5805
5806         io_for_each_link(pos, req)
5807                 nr_reqs++;
5808
5809         total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
5810         return total_submitted - nr_reqs;
5811 }
5812
5813 static int io_req_defer(struct io_kiocb *req)
5814 {
5815         struct io_ring_ctx *ctx = req->ctx;
5816         struct io_defer_entry *de;
5817         int ret;
5818         u32 seq;
5819
5820         /* Still need defer if there is pending req in defer list. */
5821         if (likely(list_empty_careful(&ctx->defer_list) &&
5822                 !(req->flags & REQ_F_IO_DRAIN)))
5823                 return 0;
5824
5825         seq = io_get_sequence(req);
5826         /* Still a chance to pass the sequence check */
5827         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
5828                 return 0;
5829
5830         ret = io_req_defer_prep(req);
5831         if (ret)
5832                 return ret;
5833         io_prep_async_link(req);
5834         de = kmalloc(sizeof(*de), GFP_KERNEL);
5835         if (!de)
5836                 return -ENOMEM;
5837
5838         spin_lock_irq(&ctx->completion_lock);
5839         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
5840                 spin_unlock_irq(&ctx->completion_lock);
5841                 kfree(de);
5842                 io_queue_async_work(req);
5843                 return -EIOCBQUEUED;
5844         }
5845
5846         trace_io_uring_defer(ctx, req, req->user_data);
5847         de->req = req;
5848         de->seq = seq;
5849         list_add_tail(&de->list, &ctx->defer_list);
5850         spin_unlock_irq(&ctx->completion_lock);
5851         return -EIOCBQUEUED;
5852 }
5853
5854 static void __io_clean_op(struct io_kiocb *req)
5855 {
5856         if (req->flags & REQ_F_BUFFER_SELECTED) {
5857                 switch (req->opcode) {
5858                 case IORING_OP_READV:
5859                 case IORING_OP_READ_FIXED:
5860                 case IORING_OP_READ:
5861                         kfree((void *)(unsigned long)req->rw.addr);
5862                         break;
5863                 case IORING_OP_RECVMSG:
5864                 case IORING_OP_RECV:
5865                         kfree(req->sr_msg.kbuf);
5866                         break;
5867                 }
5868                 req->flags &= ~REQ_F_BUFFER_SELECTED;
5869         }
5870
5871         if (req->flags & REQ_F_NEED_CLEANUP) {
5872                 switch (req->opcode) {
5873                 case IORING_OP_READV:
5874                 case IORING_OP_READ_FIXED:
5875                 case IORING_OP_READ:
5876                 case IORING_OP_WRITEV:
5877                 case IORING_OP_WRITE_FIXED:
5878                 case IORING_OP_WRITE: {
5879                         struct io_async_rw *io = req->async_data;
5880                         if (io->free_iovec)
5881                                 kfree(io->free_iovec);
5882                         break;
5883                         }
5884                 case IORING_OP_RECVMSG:
5885                 case IORING_OP_SENDMSG: {
5886                         struct io_async_msghdr *io = req->async_data;
5887
5888                         kfree(io->free_iov);
5889                         break;
5890                         }
5891                 case IORING_OP_SPLICE:
5892                 case IORING_OP_TEE:
5893                         io_put_file(req, req->splice.file_in,
5894                                     (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5895                         break;
5896                 case IORING_OP_OPENAT:
5897                 case IORING_OP_OPENAT2:
5898                         if (req->open.filename)
5899                                 putname(req->open.filename);
5900                         break;
5901                 case IORING_OP_RENAMEAT:
5902                         putname(req->rename.oldpath);
5903                         putname(req->rename.newpath);
5904                         break;
5905                 case IORING_OP_UNLINKAT:
5906                         putname(req->unlink.filename);
5907                         break;
5908                 }
5909                 req->flags &= ~REQ_F_NEED_CLEANUP;
5910         }
5911 }
5912
5913 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
5914 {
             /*
              * Issue a single request: run the handler that matches req->opcode
              * and hand its return value back to the caller.
              *
              * If req->work.personality is non-zero, the credentials found under
              * that id in ctx->personality_idr are installed with override_creds()
              * around the handler call and reverted afterwards; an id with no
              * entry fails the issue with -EINVAL. An opcode without a case in
              * the switch below yields -EINVAL as well.
              *
              * Returns 0 on success, otherwise the non-zero handler result.
              */
5915         struct io_ring_ctx *ctx = req->ctx;
5916         const struct cred *creds = NULL;
5917         int ret;
5918
5919         if (req->work.personality) {
5920                 const struct cred *new_creds;
5921
                     /*
                      * The idr lookup is bracketed by uring_lock only when
                      * IO_URING_F_NONBLOCK is clear.
                      * NOTE(review): presumably a nonblocking issue runs with
                      * uring_lock already held by the submitter - confirm
                      * against the call sites.
                      */
5922                 if (!(issue_flags & IO_URING_F_NONBLOCK))
5923                         mutex_lock(&ctx->uring_lock);
5924                 new_creds = idr_find(&ctx->personality_idr, req->work.personality);
5925                 if (!(issue_flags & IO_URING_F_NONBLOCK))
5926                         mutex_unlock(&ctx->uring_lock);
5927                 if (!new_creds)
5928                         return -EINVAL;
                     /* keep what override_creds() returns so it can be reverted */
5929                 creds = override_creds(new_creds);
5930         }
5931
             /* one handler per opcode; several read/write variants share one */
5932         switch (req->opcode) {
5933         case IORING_OP_NOP:
5934                 ret = io_nop(req, issue_flags);
5935                 break;
5936         case IORING_OP_READV:
5937         case IORING_OP_READ_FIXED:
5938         case IORING_OP_READ:
5939                 ret = io_read(req, issue_flags);
5940                 break;
5941         case IORING_OP_WRITEV:
5942         case IORING_OP_WRITE_FIXED:
5943         case IORING_OP_WRITE:
5944                 ret = io_write(req, issue_flags);
5945                 break;
5946         case IORING_OP_FSYNC:
5947                 ret = io_fsync(req, issue_flags);
5948                 break;
5949         case IORING_OP_POLL_ADD:
5950                 ret = io_poll_add(req, issue_flags);
5951                 break;
5952         case IORING_OP_POLL_REMOVE:
5953                 ret = io_poll_remove(req, issue_flags);
5954                 break;
5955         case IORING_OP_SYNC_FILE_RANGE:
5956                 ret = io_sync_file_range(req, issue_flags);
5957                 break;
5958         case IORING_OP_SENDMSG:
5959                 ret = io_sendmsg(req, issue_flags);
5960                 break;
5961         case IORING_OP_SEND:
5962                 ret = io_send(req, issue_flags);
5963                 break;
5964         case IORING_OP_RECVMSG:
5965                 ret = io_recvmsg(req, issue_flags);
5966                 break;
5967         case IORING_OP_RECV:
5968                 ret = io_recv(req, issue_flags);
5969                 break;
5970         case IORING_OP_TIMEOUT:
5971                 ret = io_timeout(req, issue_flags);
5972                 break;
5973         case IORING_OP_TIMEOUT_REMOVE:
5974                 ret = io_timeout_remove(req, issue_flags);
5975                 break;
5976         case IORING_OP_ACCEPT:
5977                 ret = io_accept(req, issue_flags);
5978                 break;
5979         case IORING_OP_CONNECT:
5980                 ret = io_connect(req, issue_flags);
5981                 break;
5982         case IORING_OP_ASYNC_CANCEL:
5983                 ret = io_async_cancel(req, issue_flags);
5984                 break;
5985         case IORING_OP_FALLOCATE:
5986                 ret = io_fallocate(req, issue_flags);
5987                 break;
5988         case IORING_OP_OPENAT:
5989                 ret = io_openat(req, issue_flags);
5990                 break;
5991         case IORING_OP_CLOSE:
5992                 ret = io_close(req, issue_flags);
5993                 break;
5994         case IORING_OP_FILES_UPDATE:
5995                 ret = io_files_update(req, issue_flags);
5996                 break;
5997         case IORING_OP_STATX:
5998                 ret = io_statx(req, issue_flags);
5999                 break;
6000         case IORING_OP_FADVISE:
6001                 ret = io_fadvise(req, issue_flags);
6002                 break;
6003         case IORING_OP_MADVISE:
6004                 ret = io_madvise(req, issue_flags);
6005                 break;
6006         case IORING_OP_OPENAT2:
6007                 ret = io_openat2(req, issue_flags);
6008                 break;
6009         case IORING_OP_EPOLL_CTL:
6010                 ret = io_epoll_ctl(req, issue_flags);
6011                 break;
6012         case IORING_OP_SPLICE:
6013                 ret = io_splice(req, issue_flags);
6014                 break;
6015         case IORING_OP_PROVIDE_BUFFERS:
6016                 ret = io_provide_buffers(req, issue_flags);
6017                 break;
6018         case IORING_OP_REMOVE_BUFFERS:
6019                 ret = io_remove_buffers(req, issue_flags);
6020                 break;
6021         case IORING_OP_TEE:
6022                 ret = io_tee(req, issue_flags);
6023                 break;
6024         case IORING_OP_SHUTDOWN:
6025                 ret = io_shutdown(req, issue_flags);
6026                 break;
6027         case IORING_OP_RENAMEAT:
6028                 ret = io_renameat(req, issue_flags);
6029                 break;
6030         case IORING_OP_UNLINKAT:
6031                 ret = io_unlinkat(req, issue_flags);
6032                 break;
6033         default:
6034                 ret = -EINVAL;
6035                 break;
6036         }
6037
             /* undo the personality override, whatever the handler returned */
6038         if (creds)
6039                 revert_creds(creds);
6040
             /* the iopoll bookkeeping below is skipped for any non-zero result */
6041         if (ret)
6042                 return ret;
6043
6044         /* If the op doesn't have a file, we're not polling for it */
6045         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
6046                 const bool in_async = io_wq_current_is_worker();
6047
6048                 /* workqueue context doesn't hold uring_lock, grab it now */
6049                 if (in_async)
6050                         mutex_lock(&ctx->uring_lock);
6051
6052                 io_iopoll_req_issued(req, in_async);
6053
6054                 if (in_async)
6055                         mutex_unlock(&ctx->uring_lock);
6056         }
6057
6058         return 0;
6059 }
6060
6061 static void io_wq_submit_work(struct io_wq_work *work)
6062 {
             /*
              * io-wq work handler. Issues the request that embeds @work,
              * retrying for as long as the issue returns -EAGAIN, and queues the
              * request for failure handling if the final result is non-zero.
              * A linked timeout following the request, if any, is armed first.
              */
6063         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6064         struct io_kiocb *timeout;
6065         int ret = 0;
6066
6067         timeout = io_prep_linked_timeout(req);
6068         if (timeout)
6069                 io_queue_linked_timeout(timeout);
6070
             /* work was flagged as cancelled: skip the issue and fail it below */
6071         if (work->flags & IO_WQ_WORK_CANCEL)
6072                 ret = -ECANCELED;
6073
6074         if (!ret) {
6075                 do {
                             /* issue_flags of 0: IO_URING_F_NONBLOCK is not set */
6076                         ret = io_issue_sqe(req, 0);
6077                         /*
6078                          * We can get EAGAIN for polled IO even though we're
6079                          * forcing a sync submission from here, since we can't
6080                          * wait for request slots on the block side.
6081                          */
6082                         if (ret != -EAGAIN)
6083                                 break;
6084                         cond_resched();
6085                 } while (1);
6086         }
6087
6088         /* avoid locking problems by failing it from a clean context */
6089         if (ret) {
6090                 /* io-wq is going to take one down */
6091                 refcount_inc(&req->refs);
6092                 io_req_task_queue_fail(req, ret);
6093         }
6094 }
6095
6096 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6097                                               int index)
6098 {
             /*
              * Registered files live in a two-level array: the high bits of
              * @index pick the table, the low bits pick the slot inside it.
              * No range check is done here.
              */
6099         const int tbl = index >> IORING_FILE_TABLE_SHIFT;
6100         const int slot = index & IORING_FILE_TABLE_MASK;
6101
6102         return ctx->file_data->table[tbl].files[slot];
6103 }
6104
6105 static struct file *io_file_get(struct io_submit_state *state,
6106                                 struct io_kiocb *req, int fd, bool fixed)
6107 {
             /*
              * Resolve @fd to a struct file on behalf of @req.
              *
              * With @fixed set, @fd is an index into the ring's registered file
              * table; an index outside [0, nr_user_files) returns NULL. Otherwise
              * the lookup goes through __io_file_get(). Returns the file, or NULL
              * if none was found.
              */
6108         struct io_ring_ctx *ctx = req->ctx;
6109         struct file *file;
6110
6111         if (fixed) {
                     /* the unsigned cast makes a negative fd fail the check too */
6112                 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6113                         return NULL;
                     /* clamp the validated index against speculative execution */
6114                 fd = array_index_nospec(fd, ctx->nr_user_files);
6115                 file = io_file_from_index(ctx, fd);
6116                 io_set_resource_node(req);
6117         } else {
6118                 trace_io_uring_file_get(ctx, fd);
6119                 file = __io_file_get(state, fd);
6120         }
6121
             /* a request whose target file is itself an io_uring gets tracked */
6122         if (file && unlikely(file->f_op == &io_uring_fops))
6123                 io_req_track_inflight(req);
6124         return file;
6125 }
6126
6127 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6128 {
             /*
              * hrtimer callback of a linked timeout request.
              *
              * If the request this timeout was attached to (timeout.head) is
              * still set and a reference on it can be taken, that request is
              * marked as failing its links and a cancel with -ETIME is attempted
              * for it. Otherwise the timeout request itself is completed with
              * -ETIME. The timer is never restarted.
              */
6129         struct io_timeout_data *data = container_of(timer,
6130                                                 struct io_timeout_data, timer);
6131         struct io_kiocb *prev, *req = data->req;
6132         struct io_ring_ctx *ctx = req->ctx;
6133         unsigned long flags;
6134
             /* the back reference is read and cleared under completion_lock */
6135         spin_lock_irqsave(&ctx->completion_lock, flags);
6136         prev = req->timeout.head;
6137         req->timeout.head = NULL;
6138
6139         /*
6140          * We don't expect the list to be empty, that will only happen if we
6141          * race with the completion of the linked work.
6142          */
             /*
              * A refcount that already dropped to zero is handled like a
              * missing back reference.
              * NOTE(review): io_remove_next_linked() presumably unlinks this
              * timeout from prev's chain - confirm.
              */
6143         if (prev && refcount_inc_not_zero(&prev->refs))
6144                 io_remove_next_linked(prev);
6145         else
6146                 prev = NULL;
6147         spin_unlock_irqrestore(&ctx->completion_lock, flags);
6148
6149         if (prev) {
6150                 req_set_fail_links(prev);
6151                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
                     /* drops the reference taken on prev above */
6152                 io_put_req_deferred(prev, 1);
6153         } else {
6154                 io_req_complete_post(req, -ETIME, 0);
6155                 io_put_req_deferred(req, 1);
6156         }
6157         return HRTIMER_NORESTART;
6158 }
6159
6160 static void __io_queue_linked_timeout(struct io_kiocb *req)
6161 {
6162         struct io_timeout_data *tdata;
6163
             /*
              * A cleared back reference means the request guarded by this
              * timeout finished before the timer could be set up, so the
              * timer is left alone.
              */
6164         if (!req->timeout.head)
6165                 return;
6166
             /* install the linked-timeout callback and start the timer */
6167         tdata = req->async_data;
6168         tdata->timer.function = io_link_timeout_fn;
6169         hrtimer_start(&tdata->timer, timespec64_to_ktime(tdata->ts),
6170                       tdata->mode);
6171 }
6174
6175 static void io_queue_linked_timeout(struct io_kiocb *req)
6176 {
             /*
              * Arm the linked timeout @req while holding ctx->completion_lock
              * (interrupts disabled), then drop one reference on it.
              */
6177         struct io_ring_ctx *ctx = req->ctx;
6178
6179         spin_lock_irq(&ctx->completion_lock);
6180         __io_queue_linked_timeout(req);
6181         spin_unlock_irq(&ctx->completion_lock);
6182
6183         /* drop submission reference */
6184         io_put_req(req);
6185 }
6186
6187 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
6188 {
             /*
              * If the request linked directly after @req is an
              * IORING_OP_LINK_TIMEOUT, bind it to @req and return it; return
              * NULL when there is nothing to arm.
              */
6189         struct io_kiocb *lt = req->link;
6190
             /* nothing follows req, or REQ_F_LINK_TIMEOUT was already set on it */
6191         if (!lt || (req->flags & REQ_F_LINK_TIMEOUT))
6192                 return NULL;
             /* the request that follows is not a linked timeout */
6193         if (lt->opcode != IORING_OP_LINK_TIMEOUT)
6194                 return NULL;
             /* point the timeout back at req and flag both requests */
6195         lt->timeout.head = req;
6196         lt->flags |= REQ_F_LTIMEOUT_ACTIVE;
6197         req->flags |= REQ_F_LINK_TIMEOUT;
6198         return lt;
6199 }
6200
6201 static void __io_queue_sqe(struct io_kiocb *req)
6202 {
             /*
              * Try to issue @req inline, in nonblocking mode with deferred
              * completion, and act on the outcome:
              *  - -EAGAIN without REQ_F_NOWAIT: arm a poll handler, or failing
              *    that queue the request for async execution;
              *  - 0: drop the submission reference, or batch the request in the
              *    submit state when it was completed inline;
              *  - anything else: fail the request (and its links) with that
              *    error.
              * The linked timeout is looked up before the issue and armed only
              * after the outcome has been handled.
              */
6203         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6204         int ret;
6205
6206         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
6207
6208         /*
6209          * We async punt it if the file wasn't marked NOWAIT, or if the file
6210          * doesn't support non-blocking read/write attempts
6211          */
6212         if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
6213                 if (!io_arm_poll_handler(req)) {
6214                         /*
6215                          * Queued up for async execution, worker will release
6216                          * submit reference when the iocb is actually submitted.
6217                          */
6218                         io_queue_async_work(req);
6219                 }
6220         } else if (likely(!ret)) {
6221                 /* drop submission reference */
6222                 if (req->flags & REQ_F_COMPLETE_INLINE) {
6223                         struct io_ring_ctx *ctx = req->ctx;
6224                         struct io_comp_state *cs = &ctx->submit_state.comp;
6225
                             /* stash the request; flush once the array is full */
6226                         cs->reqs[cs->nr++] = req;
6227                         if (cs->nr == ARRAY_SIZE(cs->reqs))
6228                                 io_submit_flush_completions(cs, ctx);
6229                 } else {
6230                         io_put_req(req);
6231                 }
6232         } else {
                     /* also taken for -EAGAIN when REQ_F_NOWAIT is set */
6233                 req_set_fail_links(req);
6234                 io_put_req(req);
6235                 io_req_complete(req, ret);
6236         }
6237         if (linked_timeout)
6238                 io_queue_linked_timeout(linked_timeout);
6239 }
6240
6241 static void io_queue_sqe(struct io_kiocb *req)
6242 {
             /*
              * Queue @req for execution.
              *
              * io_req_defer() is consulted first: -EIOCBQUEUED from it means
              * nothing more is done here, any other non-zero value fails the
              * request. Otherwise a request flagged REQ_F_FORCE_ASYNC is prepared
              * and handed to the async workers, and everything else is issued
              * through __io_queue_sqe().
              */
6243         int ret;
6244
6245         ret = io_req_defer(req);
6246         if (ret) {
6247                 if (ret != -EIOCBQUEUED) {
                             /*
                              * Shared failure path: also entered by goto from
                              * the REQ_F_FORCE_ASYNC branch below.
                              */
6248 fail_req:
6249                         req_set_fail_links(req);
6250                         io_put_req(req);
6251                         io_req_complete(req, ret);
6252                 }
6253         } else if (req->flags & REQ_F_FORCE_ASYNC) {
6254                 ret = io_req_defer_prep(req);
6255                 if (unlikely(ret))
6256                         goto fail_req;
6257                 io_queue_async_work(req);
6258         } else {
6259                 __io_queue_sqe(req);
6260         }
6261 }
6262
6263 /*
6264  * Validate an SQE's opcode and flags against the ring's restrictions.
6265  *
6266  * Returns 'true' when the SQE is acceptable, 'false' when it is not.
6267  */
6268 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6269                                         struct io_kiocb *req,
6270                                         unsigned int sqe_flags)
6271 {
             /* a ring that is not restricted accepts everything */
6272         if (!ctx->restricted)
6273                 return true;
6274
             /*
              * All three must hold: the opcode's bit is set in sqe_op, every
              * required flag is present, and no flag outside the union of the
              * allowed and required sets is present.
              */
6275         return test_bit(req->opcode, ctx->restrictions.sqe_op) &&
6276                (sqe_flags & ctx->restrictions.sqe_flags_required) ==
6277                         ctx->restrictions.sqe_flags_required &&
6278                !(sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6279                                ctx->restrictions.sqe_flags_required));
6280 }
6288
6289 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6290                        const struct io_uring_sqe *sqe)
6291 {
             /*
              * Initialise @req from @sqe and run the checks that apply to every
              * opcode.
              *
              * Returns 0 on success, or:
              *   -EINVAL      unknown SQE flag bits, or opcode >= IORING_OP_LAST
              *   -EACCES      rejected by the ring's restrictions
              *   -EOPNOTSUPP  IOSQE_BUFFER_SELECT on an opcode without support
              *   -EBADF       the opcode needs a file and none could be obtained
              */
6292         struct io_submit_state *state;
6293         unsigned int sqe_flags;
6294         int ret = 0;
6295
             /* each SQE field used here is loaded once through READ_ONCE() */
6296         req->opcode = READ_ONCE(sqe->opcode);
6297         /* same numerical values with corresponding REQ_F_*, safe to copy */
6298         req->flags = sqe_flags = READ_ONCE(sqe->flags);
6299         req->user_data = READ_ONCE(sqe->user_data);
6300         req->async_data = NULL;
6301         req->file = NULL;
6302         req->ctx = ctx;
6303         req->link = NULL;
6304         req->fixed_rsrc_refs = NULL;
6305         /* one is dropped after submission, the other at completion */
6306         refcount_set(&req->refs, 2);
6307         req->task = current;
6308         req->result = 0;
6309
6310         /* enforce forwards compatibility on users */
6311         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
                     /* do not leave unvalidated bits behind in req->flags */
6312                 req->flags = 0;
6313                 return -EINVAL;
6314         }
6315
             /* must precede the io_op_defs[] lookups indexed by opcode below */
6316         if (unlikely(req->opcode >= IORING_OP_LAST))
6317                 return -EINVAL;
6318
6319         if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6320                 return -EACCES;
6321
6322         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6323             !io_op_defs[req->opcode].buffer_select)
6324                 return -EOPNOTSUPP;
6325
             /* note: the early returns above leave req->work uninitialised */
6326         req->work.list.next = NULL;
6327         req->work.flags = 0;
6328         req->work.personality = READ_ONCE(sqe->personality);
6329         state = &ctx->submit_state;
6330
6331         /*
6332          * Plug now if we have more than 1 IO left after this, and the target
6333          * is potentially a read/write to block based storage.
6334          */
6335         if (!state->plug_started && state->ios_left > 1 &&
6336             io_op_defs[req->opcode].plug) {
6337                 blk_start_plug(&state->plug);
6338                 state->plug_started = true;
6339         }
6340
6341         if (io_op_defs[req->opcode].needs_file) {
6342                 bool fixed = req->flags & REQ_F_FIXED_FILE;
6343
6344                 req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
6345                 if (unlikely(!req->file))
6346                         ret = -EBADF;
6347         }
6348
             /* decremented even for -EBADF, but not on the early returns above */
6349         state->ios_left--;
6350         return ret;
6351 }
6352
6353 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
6354                          const struct io_uring_sqe *sqe)
6355 {
             /*
              * Initialise and prepare @req from @sqe, then either queue it or
              * add it to the link chain being assembled in ctx->submit_state.
              *
              * Returns 0 on success. On an init or prep error the request is
              * completed with that error, a link chain under construction is
              * failed with -ECANCELED, and the error is returned.
              */
6356         struct io_submit_link *link = &ctx->submit_state.link;
6357         int ret;
6358
6359         ret = io_init_req(ctx, req, sqe);
6360         if (unlikely(ret)) {
                     /* shared error exit, also reached by goto from below */
6361 fail_req:
6362                 io_put_req(req);
6363                 io_req_complete(req, ret);
6364                 if (link->head) {
6365                         /* fail even hard links since we don't submit */
6366                         link->head->flags |= REQ_F_FAIL_LINK;
6367                         io_put_req(link->head);
6368                         io_req_complete(link->head, -ECANCELED);
6369                         link->head = NULL;
6370                 }
6371                 return ret;
6372         }
6373         ret = io_req_prep(req, sqe);
6374         if (unlikely(ret))
6375                 goto fail_req;
6376
6377         /* don't need @sqe from now on */
6378         trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6379                                 true, ctx->flags & IORING_SETUP_SQPOLL);
6380
6381         /*
6382          * If we already have a head request, queue this one for async
6383          * submittal once the head completes. If we don't have a head but
6384          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6385          * submitted sync once the chain is complete. If none of those
6386          * conditions are true (normal request), then just queue it.
6387          */
6388         if (link->head) {
6389                 struct io_kiocb *head = link->head;
6390
6391                 /*
6392                  * Taking sequential execution of a link, draining both sides
6393                  * of the link also fulfils IOSQE_IO_DRAIN semantics for all
6394                  * requests in the link. So, it drains the head and the
6395                  * next after the link request. The last one is done via
6396                  * drain_next flag to persist the effect across calls.
6397                  */
6398                 if (req->flags & REQ_F_IO_DRAIN) {
6399                         head->flags |= REQ_F_IO_DRAIN;
6400                         ctx->drain_next = 1;
6401                 }
6402                 ret = io_req_defer_prep(req);
6403                 if (unlikely(ret))
6404                         goto fail_req;
6405                 trace_io_uring_link(ctx, req, head);
                     /* append req to the tail of the chain */
6406                 link->last->link = req;
6407                 link->last = req;
6408
6409                 /* last request of a link, enqueue the link */
6410                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6411                         io_queue_sqe(head);
6412                         link->head = NULL;
6413                 }
6414         } else {
                     /* a drain requested inside the previous link lands here */
6415                 if (unlikely(ctx->drain_next)) {
6416                         req->flags |= REQ_F_IO_DRAIN;
6417                         ctx->drain_next = 0;
6418                 }
6419                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
                             /* req starts a new chain: it is both head and tail */
6420                         link->head = req;
6421                         link->last = req;
6422                 } else {
6423                         io_queue_sqe(req);
6424                 }
6425         }
6426
6427         return 0;
6428 }
6429
6430 /*
6431  * Batched submission is done, ensure local IO is flushed out.
6432  */
6433 static void io_submit_state_end(struct io_submit_state *state,
6434                                 struct io_ring_ctx *ctx)
6435 {
             /* a link chain that was still open when the batch ended */
6436         if (state->link.head)
6437                 io_queue_sqe(state->link.head);
             /* completions that were batched in state->comp */
6438         if (state->comp.nr)
6439                 io_submit_flush_completions(&state->comp, ctx);
             /* pairs with the blk_start_plug() done when plug_started was set */
6440         if (state->plug_started)
6441                 blk_finish_plug(&state->plug);
6442         io_state_file_put(state);
6443 }
6444
6445 /*
6446  * Start submission side cache.
     *
     * Resets the plug state and the link head, and records @max_ios as the
     * number of IOs left in this batch.
6447  */
6448 static void io_submit_state_start(struct io_submit_state *state,
6449                                   unsigned int max_ios)
6450 {
6451         state->plug_started = false;
6452         state->ios_left = max_ios;
6453         /* set only head, no need to init link_last in advance */
6454         state->link.head = NULL;
6455 }
6456
6457 static void io_commit_sqring(struct io_ring_ctx *ctx)
6458 {
             /*
              * Publish the kernel's cached SQ head (ctx->cached_sq_head) to the
              * head field of the rings structure, using a release store.
              */
6459         struct io_rings *rings = ctx->rings;
6460
6461         /*
6462          * Ensure any loads from the SQEs are done at this point,
6463          * since once we write the new head, the application could
6464          * write new data to them.
6465          */
6466         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6467 }
6468
6469 /*
6470  * Return the next sqe to process, or NULL when the ring slot names an index
6471  * outside the sqe array. The returned pointer refers to memory that is also
6472  * mapped by userspace, which may rewrite it at any time and cannot be
6473  * trusted to behave. Any sqe member that is validated and then used later
6474  * must therefore be loaded through READ_ONCE(), so that a re-load further
6475  * down cannot observe a different value.
6476  */
6477 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6478 {
6479         unsigned slot, sqe_idx;
6480
             /*
              * The kernel keeps its own cached copy of the sq head. That lets
              * the user visible head be updated in batches, and lets the kernel
              * track the head by itself even though the application is the one
              * updating the shared value. The slot is consumed whether or not
              * the index stored in it turns out to be valid.
              */
6481         slot = ctx->cached_sq_head & ctx->sq_mask;
6482         ctx->cached_sq_head++;
6483         sqe_idx = READ_ONCE(ctx->sq_array[slot]);
6484
6485         if (unlikely(sqe_idx >= ctx->sq_entries)) {
                     /* invalid entry: count it and expose the new drop count */
6486                 ctx->cached_sq_dropped++;
6487                 WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6488                 return NULL;
6489         }
6490         return &ctx->sq_sqes[sqe_idx];
6491 }
6499
6500 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6501 {
             /*
              * Consume and submit up to @nr entries from the SQ ring.
              *
              * Returns the number of entries counted as submitted, which may be
              * fewer than @nr, or:
              *   -EBUSY   a CQ overflow backlog exists and could not be flushed
              *   -EAGAIN  no ctx reference could be taken, or not even the first
              *            request could be allocated
              */
6502         int submitted = 0;
6503
6504         /* if we have a backlog and couldn't flush it all, return BUSY */
6505         if (test_bit(0, &ctx->sq_check_overflow)) {
6506                 if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
6507                         return -EBUSY;
6508         }
6509
6510         /* make sure SQ entry isn't read before tail */
6511         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6512
             /*
              * References and accounting for all @nr requests are taken up
              * front; whatever is not used is handed back after the loop.
              */
6513         if (!percpu_ref_tryget_many(&ctx->refs, nr))
6514                 return -EAGAIN;
6515
6516         percpu_counter_add(&current->io_uring->inflight, nr);
6517         refcount_add(nr, &current->usage);
6518         io_submit_state_start(&ctx->submit_state, nr);
6519
6520         while (submitted < nr) {
6521                 const struct io_uring_sqe *sqe;
6522                 struct io_kiocb *req;
6523
6524                 req = io_alloc_req(ctx);
6525                 if (unlikely(!req)) {
                             /* report -EAGAIN only if nothing was submitted yet */
6526                         if (!submitted)
6527                                 submitted = -EAGAIN;
6528                         break;
6529                 }
6530                 sqe = io_get_sqe(ctx);
6531                 if (unlikely(!sqe)) {
                             /* invalid ring entry: give the request back and stop */
6532                         kmem_cache_free(req_cachep, req);
6533                         break;
6534                 }
6535                 /* will complete beyond this point, count as submitted */
6536                 submitted++;
6537                 if (io_submit_sqe(ctx, req, sqe))
6538                         break;
6539         }
6540
6541         if (unlikely(submitted != nr)) {
                     /* submitted may hold -EAGAIN here, meaning nothing was used */
6542                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6543                 struct io_uring_task *tctx = current->io_uring;
6544                 int unused = nr - ref_used;
6545
6546                 percpu_ref_put_many(&ctx->refs, unused);
6547                 percpu_counter_sub(&tctx->inflight, unused);
6548                 put_task_struct_many(current, unused);
6549         }
6550
6551         io_submit_state_end(&ctx->submit_state, ctx);
6552         /* Commit SQ ring head once we've consumed and submitted all SQEs */
6553         io_commit_sqring(ctx);
6554
6555         return submitted;
6556 }
6557
6558 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6559 {
6560         /* Tell userspace we may need a wakeup call */
             /* the sq_flags read-modify-write is done under completion_lock */
6561         spin_lock_irq(&ctx->completion_lock);
6562         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6563         spin_unlock_irq(&ctx->completion_lock);
6564 }
6565
6566 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6567 {
             /*
              * Clear IORING_SQ_NEED_WAKEUP in the ring's sq_flags, under
              * completion_lock like the matching set operation.
              */
6568         spin_lock_irq(&ctx->completion_lock);
6569         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6570         spin_unlock_irq(&ctx->completion_lock);
6571 }
6572
6573 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6574 {
             /*
              * One round of SQ thread work for @ctx: run io_do_iopoll() if the
              * iopoll list is not empty and submit pending sqes, both under
              * uring_lock, then wake any waiter on sqo_sq_wait once the SQ ring
              * is no longer full.
              *
              * Returns the result of io_submit_sqes(), or 0 if it was not called.
              */
6575         unsigned int to_submit;
6576         int ret = 0;
6577
6578         to_submit = io_sqring_entries(ctx);
6579         /* if we're handling multiple rings, cap submit size for fairness */
6580         if (cap_entries && to_submit > 8)
6581                 to_submit = 8;
6582
6583         if (!list_empty(&ctx->iopoll_list) || to_submit) {
6584                 unsigned nr_events = 0;
6585
6586                 mutex_lock(&ctx->uring_lock);
                     /* checked again now that uring_lock is held */
6587                 if (!list_empty(&ctx->iopoll_list))
6588                         io_do_iopoll(ctx, &nr_events, 0);
6589
                     /* nothing new is submitted once the ctx refs are dying */
6590                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)))
6591                         ret = io_submit_sqes(ctx, to_submit);
6592                 mutex_unlock(&ctx->uring_lock);
6593         }
6594
6595         if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
6596                 wake_up(&ctx->sqo_sq_wait);
6597
6598         return ret;
6599 }
6600
6601 static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
6602 {
             /*
              * Recompute the idle time of the shared SQ thread as the largest
              * sq_thread_idle of any ring on sqd->ctx_list (0 for an empty list).
              */
6603         unsigned longest = 0;
6604         struct io_ring_ctx *ring;
6605
6606         list_for_each_entry(ring, &sqd->ctx_list, sqd_list) {
6607                 if (ring->sq_thread_idle > longest)
6608                         longest = ring->sq_thread_idle;
6609         }
6610
6611         sqd->sq_thread_idle = longest;
6612 }
6613
6614 static void io_sqd_init_new(struct io_sq_data *sqd)
6615 {
             /*
              * Move every ring waiting on sqd->ctx_new_list to the tail of
              * sqd->ctx_list, signalling its sq_thread_comp completion as it is
              * moved, then recompute the thread's idle time.
              */
6616         struct io_ring_ctx *ctx;
6617
6618         while (!list_empty(&sqd->ctx_new_list)) {
6619                 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
6620                 list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
6621                 complete(&ctx->sq_thread_comp);
6622         }
6623
6624         io_sqd_update_thread_idle(sqd);
6625 }
6626
6627 static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
6628 {
             /* true while the IO_SQ_THREAD_SHOULD_STOP bit is set in sqd->state */
6629         return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
6630 }
6631
6632 static bool io_sq_thread_should_park(struct io_sq_data *sqd)
6633 {
             /* true while the IO_SQ_THREAD_SHOULD_PARK bit is set in sqd->state */
6634         return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
6635 }
6636
6637 static void io_sq_thread_parkme(struct io_sq_data *sqd)
6638 {
             /*
              * Park the calling thread for as long as IO_SQ_THREAD_SHOULD_PARK is
              * set in sqd->state. Each pass sets the task state to TASK_PARKED,
              * signals sqd->parked and schedules away; the task state is put back
              * to TASK_RUNNING before returning.
              */
6639         for (;;) {
6640                 /*
6641                  * TASK_PARKED is a special state; we must serialize against
6642                  * possible pending wakeups to avoid store-store collisions on
6643                  * task->state.
6644                  *
6645                  * Such a collision might possibly result in the task state
6646                  * changing from TASK_PARKED and us failing the
6647                  * wait_task_inactive() in kthread_park().
6648                  */
6649                 set_special_state(TASK_PARKED);
                     /* the park bit is tested only after the state was set */
6650                 if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
6651                         break;
6652
6653                 /*
6654                  * Thread is going to call schedule(), do not preempt it,
6655                  * or the caller of kthread_park() may spend more time in
6656                  * wait_task_inactive().
6657                  */
6658                 preempt_disable();
6659                 complete(&sqd->parked);
6660                 schedule_preempt_disabled();
6661                 preempt_enable();
6662         }
6663         __set_current_state(TASK_RUNNING);
6664 }
6665
6666 static int io_sq_thread(void *data)
6667 {
6668         struct io_sq_data *sqd = data;
6669         struct io_ring_ctx *ctx;
6670         unsigned long timeout = 0;
6671         char buf[TASK_COMM_LEN];
6672         DEFINE_WAIT(wait);
6673
6674         sprintf(buf, "iou-sqp-%d", sqd->task_pid);
6675         set_task_comm(current, buf);
6676         current->pf_io_worker = NULL;
6677
6678         if (sqd->sq_cpu != -1)
6679                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
6680         else
6681                 set_cpus_allowed_ptr(current, cpu_online_mask);
6682         current->flags |= PF_NO_SETAFFINITY;
6683
6684         wait_for_completion(&sqd->startup);
6685
6686         while (!io_sq_thread_should_stop(sqd)) {
6687                 int ret;
6688                 bool cap_entries, sqt_spin, needs_sched;
6689
6690                 /*
6691                  * Any changes to the sqd lists are synchronized through the
6692                  * thread parking. This synchronizes the thread vs users,
6693                  * the users are synchronized on the sqd->ctx_lock.
6694                  */
6695                 if (io_sq_thread_should_park(sqd)) {
6696                         io_sq_thread_parkme(sqd);
6697                         continue;
6698                 }
6699                 if (unlikely(!list_empty(&sqd->ctx_new_list))) {
6700                         io_sqd_init_new(sqd);
6701                         timeout = jiffies + sqd->sq_thread_idle;
6702                 }
6703                 if (fatal_signal_pending(current))
6704                         break;
6705                 sqt_spin = false;
6706                 cap_entries = !list_is_singular(&sqd->ctx_list);
6707                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6708                         ret = __io_sq_thread(ctx, cap_entries);
6709                         if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
6710                                 sqt_spin = true;
6711                 }
6712
6713                 if (sqt_spin || !time_after(jiffies, timeout)) {
6714                         io_run_task_work();
6715                         cond_resched();
6716                         if (sqt_spin)
6717                                 timeout = jiffies + sqd->sq_thread_idle;
6718                         continue;
6719                 }
6720
6721                 needs_sched = true;
6722                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
6723                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6724                         if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6725                             !list_empty_careful(&ctx->iopoll_list)) {
6726                                 needs_sched = false;
6727                                 break;
6728                         }
6729                         if (io_sqring_entries(ctx)) {
6730                                 needs_sched = false;
6731                                 break;
6732                         }
6733                 }
6734
6735                 if (needs_sched && !io_sq_thread_should_park(sqd)) {
6736                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6737                                 io_ring_set_wakeup_flag(ctx);
6738
6739                         schedule();
6740                         try_to_freeze();
6741                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6742                                 io_ring_clear_wakeup_flag(ctx);
6743                 }
6744
6745                 finish_wait(&sqd->wait, &wait);
6746                 timeout = jiffies + sqd->sq_thread_idle;
6747         }
6748
6749         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6750                 io_uring_cancel_sqpoll(ctx);
6751
6752         io_run_task_work();
6753
6754         /*
6755          * Ensure that we park properly if racing with someone trying to park
6756          * while we're exiting. If we fail to grab the lock, check park and
6757          * park if necessary. The ordering with the park bit and the lock
6758          * ensures that we catch this reliably.
6759          */
6760         if (!mutex_trylock(&sqd->lock)) {
6761                 if (io_sq_thread_should_park(sqd))
6762                         io_sq_thread_parkme(sqd);
6763                 mutex_lock(&sqd->lock);
6764         }
6765
6766         sqd->thread = NULL;
6767         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6768                 ctx->sqo_exec = 1;
6769                 io_ring_set_wakeup_flag(ctx);
6770         }
6771
6772         complete(&sqd->exited);
6773         mutex_unlock(&sqd->lock);
6774         do_exit(0);
6775 }
6776
6777 struct io_wait_queue {
6778         struct wait_queue_entry wq;
6779         struct io_ring_ctx *ctx;
6780         unsigned to_wait;
6781         unsigned nr_timeouts;
6782 };
6783
6784 static inline bool io_should_wake(struct io_wait_queue *iowq)
6785 {
6786         struct io_ring_ctx *ctx = iowq->ctx;
6787
6788         /*
6789          * Wake up if we have enough events, or if a timeout occurred since we
6790          * started waiting. For timeouts, we always want to return to userspace,
6791          * regardless of event count.
6792          */
6793         return io_cqring_events(ctx) >= iowq->to_wait ||
6794                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6795 }
6796
6797 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6798                             int wake_flags, void *key)
6799 {
6800         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6801                                                         wq);
6802
6803         /*
6804          * Cannot safely flush overflowed CQEs from here, ensure we wake up
6805          * the task, and the next invocation will do it.
6806          */
6807         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
6808                 return autoremove_wake_function(curr, mode, wake_flags, key);
6809         return -1;
6810 }
6811
6812 static int io_run_task_work_sig(void)
6813 {
6814         if (io_run_task_work())
6815                 return 1;
6816         if (!signal_pending(current))
6817                 return 0;
6818         if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
6819                 return -ERESTARTSYS;
6820         return -EINTR;
6821 }
6822
6823 /* when returns >0, the caller should retry */
6824 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
6825                                           struct io_wait_queue *iowq,
6826                                           signed long *timeout)
6827 {
6828         int ret;
6829
6830         /* make sure we run task_work before checking for signals */
6831         ret = io_run_task_work_sig();
6832         if (ret || io_should_wake(iowq))
6833                 return ret;
6834         /* let the caller flush overflows, retry */
6835         if (test_bit(0, &ctx->cq_check_overflow))
6836                 return 1;
6837
6838         *timeout = schedule_timeout(*timeout);
6839         return !*timeout ? -ETIME : 1;
6840 }
6841
6842 /*
6843  * Wait until events become available, if we don't already have some. The
6844  * application must reap them itself, as they reside on the shared cq ring.
6845  */
6846 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6847                           const sigset_t __user *sig, size_t sigsz,
6848                           struct __kernel_timespec __user *uts)
6849 {
6850         struct io_wait_queue iowq = {
6851                 .wq = {
6852                         .private        = current,
6853                         .func           = io_wake_function,
6854                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
6855                 },
6856                 .ctx            = ctx,
6857                 .to_wait        = min_events,
6858         };
6859         struct io_rings *rings = ctx->rings;
6860         signed long timeout = MAX_SCHEDULE_TIMEOUT;
6861         int ret;
6862
6863         do {
6864                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
6865                 if (io_cqring_events(ctx) >= min_events)
6866                         return 0;
6867                 if (!io_run_task_work())
6868                         break;
6869         } while (1);
6870
6871         if (sig) {
6872 #ifdef CONFIG_COMPAT
6873                 if (in_compat_syscall())
6874                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
6875                                                       sigsz);
6876                 else
6877 #endif
6878                         ret = set_user_sigmask(sig, sigsz);
6879
6880                 if (ret)
6881                         return ret;
6882         }
6883
6884         if (uts) {
6885                 struct timespec64 ts;
6886
6887                 if (get_timespec64(&ts, uts))
6888                         return -EFAULT;
6889                 timeout = timespec64_to_jiffies(&ts);
6890         }
6891
6892         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
6893         trace_io_uring_cqring_wait(ctx, min_events);
6894         do {
6895                 /* if we can't even flush overflow, don't wait for more */
6896                 if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
6897                         ret = -EBUSY;
6898                         break;
6899                 }
6900                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6901                                                 TASK_INTERRUPTIBLE);
6902                 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
6903                 finish_wait(&ctx->wait, &iowq.wq);
6904                 cond_resched();
6905         } while (ret > 0);
6906
6907         restore_saved_sigmask_unless(ret == -EINTR);
6908
6909         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
6910 }
6911
6912 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6913 {
6914 #if defined(CONFIG_UNIX)
6915         if (ctx->ring_sock) {
6916                 struct sock *sock = ctx->ring_sock->sk;
6917                 struct sk_buff *skb;
6918
6919                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6920                         kfree_skb(skb);
6921         }
6922 #else
6923         int i;
6924
6925         for (i = 0; i < ctx->nr_user_files; i++) {
6926                 struct file *file;
6927
6928                 file = io_file_from_index(ctx, i);
6929                 if (file)
6930                         fput(file);
6931         }
6932 #endif
6933 }
6934
6935 static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
6936 {
6937         struct fixed_rsrc_data *data;
6938
6939         data = container_of(ref, struct fixed_rsrc_data, refs);
6940         complete(&data->done);
6941 }
6942
6943 static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
6944 {
6945         spin_lock_bh(&ctx->rsrc_ref_lock);
6946 }
6947
6948 static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
6949 {
6950         spin_unlock_bh(&ctx->rsrc_ref_lock);
6951 }
6952
6953 static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
6954                                  struct fixed_rsrc_data *rsrc_data,
6955                                  struct fixed_rsrc_ref_node *ref_node)
6956 {
6957         io_rsrc_ref_lock(ctx);
6958         rsrc_data->node = ref_node;
6959         list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
6960         io_rsrc_ref_unlock(ctx);
6961         percpu_ref_get(&rsrc_data->refs);
6962 }
6963
6964 static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
6965 {
6966         struct fixed_rsrc_ref_node *ref_node = NULL;
6967
6968         io_rsrc_ref_lock(ctx);
6969         ref_node = data->node;
6970         data->node = NULL;
6971         io_rsrc_ref_unlock(ctx);
6972         if (ref_node)
6973                 percpu_ref_kill(&ref_node->refs);
6974 }
6975
6976 static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
6977                                struct io_ring_ctx *ctx,
6978                                void (*rsrc_put)(struct io_ring_ctx *ctx,
6979                                                 struct io_rsrc_put *prsrc))
6980 {
6981         struct fixed_rsrc_ref_node *backup_node;
6982         int ret;
6983
6984         if (data->quiesce)
6985                 return -ENXIO;
6986
6987         data->quiesce = true;
6988         do {
6989                 ret = -ENOMEM;
6990                 backup_node = alloc_fixed_rsrc_ref_node(ctx);
6991                 if (!backup_node)
6992                         break;
6993                 backup_node->rsrc_data = data;
6994                 backup_node->rsrc_put = rsrc_put;
6995
6996                 io_sqe_rsrc_kill_node(ctx, data);
6997                 percpu_ref_kill(&data->refs);
6998                 flush_delayed_work(&ctx->rsrc_put_work);
6999
7000                 ret = wait_for_completion_interruptible(&data->done);
7001                 if (!ret)
7002                         break;
7003
7004                 percpu_ref_resurrect(&data->refs);
7005                 io_sqe_rsrc_set_node(ctx, data, backup_node);
7006                 backup_node = NULL;
7007                 reinit_completion(&data->done);
7008                 mutex_unlock(&ctx->uring_lock);
7009                 ret = io_run_task_work_sig();
7010                 mutex_lock(&ctx->uring_lock);
7011         } while (ret >= 0);
7012         data->quiesce = false;
7013
7014         if (backup_node)
7015                 destroy_fixed_rsrc_ref_node(backup_node);
7016         return ret;
7017 }
7018
7019 static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
7020 {
7021         struct fixed_rsrc_data *data;
7022
7023         data = kzalloc(sizeof(*data), GFP_KERNEL);
7024         if (!data)
7025                 return NULL;
7026
7027         if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
7028                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
7029                 kfree(data);
7030                 return NULL;
7031         }
7032         data->ctx = ctx;
7033         init_completion(&data->done);
7034         return data;
7035 }
7036
7037 static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
7038 {
7039         percpu_ref_exit(&data->refs);
7040         kfree(data->table);
7041         kfree(data);
7042 }
7043
7044 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7045 {
7046         struct fixed_rsrc_data *data = ctx->file_data;
7047         unsigned nr_tables, i;
7048         int ret;
7049
7050         /*
7051          * percpu_ref_is_dying() is to stop parallel files unregister
7052          * Since we possibly drop uring lock later in this function to
7053          * run task work.
7054          */
7055         if (!data || percpu_ref_is_dying(&data->refs))
7056                 return -ENXIO;
7057         ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
7058         if (ret)
7059                 return ret;
7060
7061         __io_sqe_files_unregister(ctx);
7062         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
7063         for (i = 0; i < nr_tables; i++)
7064                 kfree(data->table[i].files);
7065         free_fixed_rsrc_data(data);
7066         ctx->file_data = NULL;
7067         ctx->nr_user_files = 0;
7068         return 0;
7069 }
7070
7071 static void io_sq_thread_unpark(struct io_sq_data *sqd)
7072         __releases(&sqd->lock)
7073 {
7074         if (sqd->thread == current)
7075                 return;
7076         clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7077         if (sqd->thread)
7078                 wake_up_state(sqd->thread, TASK_PARKED);
7079         mutex_unlock(&sqd->lock);
7080 }
7081
7082 static void io_sq_thread_park(struct io_sq_data *sqd)
7083         __acquires(&sqd->lock)
7084 {
7085         if (sqd->thread == current)
7086                 return;
7087         set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7088         mutex_lock(&sqd->lock);
7089         if (sqd->thread) {
7090                 wake_up_process(sqd->thread);
7091                 wait_for_completion(&sqd->parked);
7092         }
7093 }
7094
7095 static void io_sq_thread_stop(struct io_sq_data *sqd)
7096 {
7097         if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state))
7098                 return;
7099         mutex_lock(&sqd->lock);
7100         if (sqd->thread) {
7101                 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7102                 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
7103                 wake_up_process(sqd->thread);
7104                 mutex_unlock(&sqd->lock);
7105                 wait_for_completion(&sqd->exited);
7106                 WARN_ON_ONCE(sqd->thread);
7107         } else {
7108                 mutex_unlock(&sqd->lock);
7109         }
7110 }
7111
7112 static void io_put_sq_data(struct io_sq_data *sqd)
7113 {
7114         if (refcount_dec_and_test(&sqd->refs)) {
7115                 io_sq_thread_stop(sqd);
7116                 kfree(sqd);
7117         }
7118 }
7119
7120 static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7121 {
7122         struct io_sq_data *sqd = ctx->sq_data;
7123
7124         if (sqd) {
7125                 complete(&sqd->startup);
7126                 if (sqd->thread) {
7127                         wait_for_completion(&ctx->sq_thread_comp);
7128                         io_sq_thread_park(sqd);
7129                 }
7130
7131                 mutex_lock(&sqd->ctx_lock);
7132                 list_del(&ctx->sqd_list);
7133                 io_sqd_update_thread_idle(sqd);
7134                 mutex_unlock(&sqd->ctx_lock);
7135
7136                 if (sqd->thread)
7137                         io_sq_thread_unpark(sqd);
7138
7139                 io_put_sq_data(sqd);
7140                 ctx->sq_data = NULL;
7141         }
7142 }
7143
7144 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7145 {
7146         struct io_ring_ctx *ctx_attach;
7147         struct io_sq_data *sqd;
7148         struct fd f;
7149
7150         f = fdget(p->wq_fd);
7151         if (!f.file)
7152                 return ERR_PTR(-ENXIO);
7153         if (f.file->f_op != &io_uring_fops) {
7154                 fdput(f);
7155                 return ERR_PTR(-EINVAL);
7156         }
7157
7158         ctx_attach = f.file->private_data;
7159         sqd = ctx_attach->sq_data;
7160         if (!sqd) {
7161                 fdput(f);
7162                 return ERR_PTR(-EINVAL);
7163         }
7164
7165         refcount_inc(&sqd->refs);
7166         fdput(f);
7167         return sqd;
7168 }
7169
7170 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
7171 {
7172         struct io_sq_data *sqd;
7173
7174         if (p->flags & IORING_SETUP_ATTACH_WQ)
7175                 return io_attach_sq_data(p);
7176
7177         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7178         if (!sqd)
7179                 return ERR_PTR(-ENOMEM);
7180
7181         refcount_set(&sqd->refs, 1);
7182         INIT_LIST_HEAD(&sqd->ctx_list);
7183         INIT_LIST_HEAD(&sqd->ctx_new_list);
7184         mutex_init(&sqd->ctx_lock);
7185         mutex_init(&sqd->lock);
7186         init_waitqueue_head(&sqd->wait);
7187         init_completion(&sqd->startup);
7188         init_completion(&sqd->parked);
7189         init_completion(&sqd->exited);
7190         return sqd;
7191 }
7192
7193 #if defined(CONFIG_UNIX)
7194 /*
7195  * Ensure the UNIX gc is aware of our file set, so we are certain that
7196  * the io_uring can be safely unregistered on process exit, even if we have
7197  * loops in the file referencing.
7198  */
7199 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7200 {
7201         struct sock *sk = ctx->ring_sock->sk;
7202         struct scm_fp_list *fpl;
7203         struct sk_buff *skb;
7204         int i, nr_files;
7205
7206         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7207         if (!fpl)
7208                 return -ENOMEM;
7209
7210         skb = alloc_skb(0, GFP_KERNEL);
7211         if (!skb) {
7212                 kfree(fpl);
7213                 return -ENOMEM;
7214         }
7215
7216         skb->sk = sk;
7217
7218         nr_files = 0;
7219         fpl->user = get_uid(current_user());
7220         for (i = 0; i < nr; i++) {
7221                 struct file *file = io_file_from_index(ctx, i + offset);
7222
7223                 if (!file)
7224                         continue;
7225                 fpl->fp[nr_files] = get_file(file);
7226                 unix_inflight(fpl->user, fpl->fp[nr_files]);
7227                 nr_files++;
7228         }
7229
7230         if (nr_files) {
7231                 fpl->max = SCM_MAX_FD;
7232                 fpl->count = nr_files;
7233                 UNIXCB(skb).fp = fpl;
7234                 skb->destructor = unix_destruct_scm;
7235                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7236                 skb_queue_head(&sk->sk_receive_queue, skb);
7237
7238                 for (i = 0; i < nr_files; i++)
7239                         fput(fpl->fp[i]);
7240         } else {
7241                 kfree_skb(skb);
7242                 kfree(fpl);
7243         }
7244
7245         return 0;
7246 }
7247
7248 /*
7249  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7250  * causes regular reference counting to break down. We rely on the UNIX
7251  * garbage collection to take care of this problem for us.
7252  */
7253 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7254 {
7255         unsigned left, total;
7256         int ret = 0;
7257
7258         total = 0;
7259         left = ctx->nr_user_files;
7260         while (left) {
7261                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
7262
7263                 ret = __io_sqe_files_scm(ctx, this_files, total);
7264                 if (ret)
7265                         break;
7266                 left -= this_files;
7267                 total += this_files;
7268         }
7269
7270         if (!ret)
7271                 return 0;
7272
7273         while (total < ctx->nr_user_files) {
7274                 struct file *file = io_file_from_index(ctx, total);
7275
7276                 if (file)
7277                         fput(file);
7278                 total++;
7279         }
7280
7281         return ret;
7282 }
7283 #else
/* Without CONFIG_UNIX there is nothing to hand over; always succeeds. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
        return 0;
}
7288 #endif
7289
7290 static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
7291                                     unsigned nr_tables, unsigned nr_files)
7292 {
7293         int i;
7294
7295         for (i = 0; i < nr_tables; i++) {
7296                 struct fixed_rsrc_table *table = &file_data->table[i];
7297                 unsigned this_files;
7298
7299                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7300                 table->files = kcalloc(this_files, sizeof(struct file *),
7301                                         GFP_KERNEL);
7302                 if (!table->files)
7303                         break;
7304                 nr_files -= this_files;
7305         }
7306
7307         if (i == nr_tables)
7308                 return 0;
7309
7310         for (i = 0; i < nr_tables; i++) {
7311                 struct fixed_rsrc_table *table = &file_data->table[i];
7312                 kfree(table->files);
7313         }
7314         return 1;
7315 }
7316
7317 static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
7318 {
7319         struct file *file = prsrc->file;
7320 #if defined(CONFIG_UNIX)
7321         struct sock *sock = ctx->ring_sock->sk;
7322         struct sk_buff_head list, *head = &sock->sk_receive_queue;
7323         struct sk_buff *skb;
7324         int i;
7325
7326         __skb_queue_head_init(&list);
7327
7328         /*
7329          * Find the skb that holds this file in its SCM_RIGHTS. When found,
7330          * remove this entry and rearrange the file array.
7331          */
7332         skb = skb_dequeue(head);
7333         while (skb) {
7334                 struct scm_fp_list *fp;
7335
7336                 fp = UNIXCB(skb).fp;
7337                 for (i = 0; i < fp->count; i++) {
7338                         int left;
7339
7340                         if (fp->fp[i] != file)
7341                                 continue;
7342
7343                         unix_notinflight(fp->user, fp->fp[i]);
7344                         left = fp->count - 1 - i;
7345                         if (left) {
7346                                 memmove(&fp->fp[i], &fp->fp[i + 1],
7347                                                 left * sizeof(struct file *));
7348                         }
7349                         fp->count--;
7350                         if (!fp->count) {
7351                                 kfree_skb(skb);
7352                                 skb = NULL;
7353                         } else {
7354                                 __skb_queue_tail(&list, skb);
7355                         }
7356                         fput(file);
7357                         file = NULL;
7358                         break;
7359                 }
7360
7361                 if (!file)
7362                         break;
7363
7364                 __skb_queue_tail(&list, skb);
7365
7366                 skb = skb_dequeue(head);
7367         }
7368
7369         if (skb_peek(&list)) {
7370                 spin_lock_irq(&head->lock);
7371                 while ((skb = __skb_dequeue(&list)) != NULL)
7372                         __skb_queue_tail(head, skb);
7373                 spin_unlock_irq(&head->lock);
7374         }
7375 #else
7376         fput(file);
7377 #endif
7378 }
7379
7380 static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
7381 {
7382         struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
7383         struct io_ring_ctx *ctx = rsrc_data->ctx;
7384         struct io_rsrc_put *prsrc, *tmp;
7385
7386         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
7387                 list_del(&prsrc->list);
7388                 ref_node->rsrc_put(ctx, prsrc);
7389                 kfree(prsrc);
7390         }
7391
7392         percpu_ref_exit(&ref_node->refs);
7393         kfree(ref_node);
7394         percpu_ref_put(&rsrc_data->refs);
7395 }
7396
7397 static void io_rsrc_put_work(struct work_struct *work)
7398 {
7399         struct io_ring_ctx *ctx;
7400         struct llist_node *node;
7401
7402         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7403         node = llist_del_all(&ctx->rsrc_put_llist);
7404
7405         while (node) {
7406                 struct fixed_rsrc_ref_node *ref_node;
7407                 struct llist_node *next = node->next;
7408
7409                 ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
7410                 __io_rsrc_put_work(ref_node);
7411                 node = next;
7412         }
7413 }
7414
7415 static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
7416                                         unsigned i)
7417 {
7418         struct fixed_rsrc_table *table;
7419
7420         table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7421         return &table->files[i & IORING_FILE_TABLE_MASK];
7422 }
7423
7424 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7425 {
7426         struct fixed_rsrc_ref_node *ref_node;
7427         struct fixed_rsrc_data *data;
7428         struct io_ring_ctx *ctx;
7429         bool first_add = false;
7430         int delay = HZ;
7431
7432         ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
7433         data = ref_node->rsrc_data;
7434         ctx = data->ctx;
7435
7436         io_rsrc_ref_lock(ctx);
7437         ref_node->done = true;
7438
7439         while (!list_empty(&ctx->rsrc_ref_list)) {
7440                 ref_node = list_first_entry(&ctx->rsrc_ref_list,
7441                                         struct fixed_rsrc_ref_node, node);
7442                 /* recycle ref nodes in order */
7443                 if (!ref_node->done)
7444                         break;
7445                 list_del(&ref_node->node);
7446                 first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
7447         }
7448         io_rsrc_ref_unlock(ctx);
7449
7450         if (percpu_ref_is_dying(&data->refs))
7451                 delay = 0;
7452
7453         if (!delay)
7454                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
7455         else if (first_add)
7456                 queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
7457 }
7458
7459 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
7460                         struct io_ring_ctx *ctx)
7461 {
7462         struct fixed_rsrc_ref_node *ref_node;
7463
7464         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7465         if (!ref_node)
7466                 return NULL;
7467
7468         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7469                             0, GFP_KERNEL)) {
7470                 kfree(ref_node);
7471                 return NULL;
7472         }
7473         INIT_LIST_HEAD(&ref_node->node);
7474         INIT_LIST_HEAD(&ref_node->rsrc_list);
7475         ref_node->done = false;
7476         return ref_node;
7477 }
7478
7479 static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
7480                                      struct fixed_rsrc_ref_node *ref_node)
7481 {
7482         ref_node->rsrc_data = ctx->file_data;
7483         ref_node->rsrc_put = io_ring_file_put;
7484 }
7485
7486 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
7487 {
7488         percpu_ref_exit(&ref_node->refs);
7489         kfree(ref_node);
7490 }
7491
7492
7493 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
7494                                  unsigned nr_args)
7495 {
7496         __s32 __user *fds = (__s32 __user *) arg;
7497         unsigned nr_tables, i;
7498         struct file *file;
7499         int fd, ret = -ENOMEM;
7500         struct fixed_rsrc_ref_node *ref_node;
7501         struct fixed_rsrc_data *file_data;
7502
7503         if (ctx->file_data)
7504                 return -EBUSY;
7505         if (!nr_args)
7506                 return -EINVAL;
7507         if (nr_args > IORING_MAX_FIXED_FILES)
7508                 return -EMFILE;
7509
7510         file_data = alloc_fixed_rsrc_data(ctx);
7511         if (!file_data)
7512                 return -ENOMEM;
7513         ctx->file_data = file_data;
7514
7515         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
7516         file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
7517                                    GFP_KERNEL);
7518         if (!file_data->table)
7519                 goto out_free;
7520
7521         if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
7522                 goto out_free;
7523
7524         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
7525                 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
7526                         ret = -EFAULT;
7527                         goto out_fput;
7528                 }
7529                 /* allow sparse sets */
7530                 if (fd == -1)
7531                         continue;
7532
7533                 file = fget(fd);
7534                 ret = -EBADF;
7535                 if (!file)
7536                         goto out_fput;
7537
7538                 /*
7539                  * Don't allow io_uring instances to be registered. If UNIX
7540                  * isn't enabled, then this causes a reference cycle and this
7541                  * instance can never get freed. If UNIX is enabled we'll
7542                  * handle it just fine, but there's still no point in allowing
7543                  * a ring fd as it doesn't support regular read/write anyway.
7544                  */
7545                 if (file->f_op == &io_uring_fops) {
7546                         fput(file);
7547                         goto out_fput;
7548                 }
7549                 *io_fixed_file_slot(file_data, i) = file;
7550         }
7551
7552         ret = io_sqe_files_scm(ctx);
7553         if (ret) {
7554                 io_sqe_files_unregister(ctx);
7555                 return ret;
7556         }
7557
7558         ref_node = alloc_fixed_rsrc_ref_node(ctx);
7559         if (!ref_node) {
7560                 io_sqe_files_unregister(ctx);
7561                 return -ENOMEM;
7562         }
7563         init_fixed_file_ref_node(ctx, ref_node);
7564
7565         io_sqe_rsrc_set_node(ctx, file_data, ref_node);
7566         return ret;
7567 out_fput:
7568         for (i = 0; i < ctx->nr_user_files; i++) {
7569                 file = io_file_from_index(ctx, i);
7570                 if (file)
7571                         fput(file);
7572         }
7573         for (i = 0; i < nr_tables; i++)
7574                 kfree(file_data->table[i].files);
7575         ctx->nr_user_files = 0;
7576 out_free:
7577         free_fixed_rsrc_data(ctx->file_data);
7578         ctx->file_data = NULL;
7579         return ret;
7580 }
7581
/*
 * Account a single newly installed fixed file at @index.
 *
 * With CONFIG_UNIX the file is merged into the SCM_RIGHTS set of the skb
 * at the head of the ring socket's receive queue when that set has room;
 * otherwise a new skb is built through __io_sqe_files_scm(). Without
 * CONFIG_UNIX nothing needs doing. Returns 0 or the error from
 * __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
                                int index)
{
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head *head = &sock->sk_receive_queue;
        struct sk_buff *skb;
        bool merged = false;

        spin_lock_irq(&head->lock);
        skb = skb_peek(head);
        if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) {
                struct scm_fp_list *fpl = UNIXCB(skb).fp;

                /* take the skb off the queue while its fp list is edited */
                __skb_unlink(skb, head);
                spin_unlock_irq(&head->lock);

                fpl->fp[fpl->count] = get_file(file);
                unix_inflight(fpl->user, fpl->fp[fpl->count]);
                fpl->count++;

                spin_lock_irq(&head->lock);
                __skb_queue_head(head, skb);
                merged = true;
        }
        spin_unlock_irq(&head->lock);

        /* no skb with room: fall back to allocating and filling a new one */
        if (!merged)
                return __io_sqe_files_scm(ctx, 1, index);

        fput(file);
        return 0;
#else
        return 0;
#endif
}
7624
7625 static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
7626 {
7627         struct io_rsrc_put *prsrc;
7628         struct fixed_rsrc_ref_node *ref_node = data->node;
7629
7630         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
7631         if (!prsrc)
7632                 return -ENOMEM;
7633
7634         prsrc->rsrc = rsrc;
7635         list_add(&prsrc->list, &ref_node->rsrc_list);
7636
7637         return 0;
7638 }
7639
/* Typed wrapper: queue a struct file for removal via io_queue_rsrc_removal() */
static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
					struct file *file)
{
	void *rsrc = file;

	return io_queue_rsrc_removal(data, rsrc);
}
7645
7646 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7647                                  struct io_uring_rsrc_update *up,
7648                                  unsigned nr_args)
7649 {
7650         struct fixed_rsrc_data *data = ctx->file_data;
7651         struct fixed_rsrc_ref_node *ref_node;
7652         struct file *file, **file_slot;
7653         __s32 __user *fds;
7654         int fd, i, err;
7655         __u32 done;
7656         bool needs_switch = false;
7657
7658         if (check_add_overflow(up->offset, nr_args, &done))
7659                 return -EOVERFLOW;
7660         if (done > ctx->nr_user_files)
7661                 return -EINVAL;
7662
7663         ref_node = alloc_fixed_rsrc_ref_node(ctx);
7664         if (!ref_node)
7665                 return -ENOMEM;
7666         init_fixed_file_ref_node(ctx, ref_node);
7667
7668         fds = u64_to_user_ptr(up->data);
7669         for (done = 0; done < nr_args; done++) {
7670                 err = 0;
7671                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7672                         err = -EFAULT;
7673                         break;
7674                 }
7675                 if (fd == IORING_REGISTER_FILES_SKIP)
7676                         continue;
7677
7678                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
7679                 file_slot = io_fixed_file_slot(ctx->file_data, i);
7680
7681                 if (*file_slot) {
7682                         err = io_queue_file_removal(data, *file_slot);
7683                         if (err)
7684                                 break;
7685                         *file_slot = NULL;
7686                         needs_switch = true;
7687                 }
7688                 if (fd != -1) {
7689                         file = fget(fd);
7690                         if (!file) {
7691                                 err = -EBADF;
7692                                 break;
7693                         }
7694                         /*
7695                          * Don't allow io_uring instances to be registered. If
7696                          * UNIX isn't enabled, then this causes a reference
7697                          * cycle and this instance can never get freed. If UNIX
7698                          * is enabled we'll handle it just fine, but there's
7699                          * still no point in allowing a ring fd as it doesn't
7700                          * support regular read/write anyway.
7701                          */
7702                         if (file->f_op == &io_uring_fops) {
7703                                 fput(file);
7704                                 err = -EBADF;
7705                                 break;
7706                         }
7707                         *file_slot = file;
7708                         err = io_sqe_file_register(ctx, file, i);
7709                         if (err) {
7710                                 *file_slot = NULL;
7711                                 fput(file);
7712                                 break;
7713                         }
7714                 }
7715         }
7716
7717         if (needs_switch) {
7718                 percpu_ref_kill(&data->node->refs);
7719                 io_sqe_rsrc_set_node(ctx, data, ref_node);
7720         } else
7721                 destroy_fixed_rsrc_ref_node(ref_node);
7722
7723         return done ? done : err;
7724 }
7725
7726 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
7727                                unsigned nr_args)
7728 {
7729         struct io_uring_rsrc_update up;
7730
7731         if (!ctx->file_data)
7732                 return -ENXIO;
7733         if (!nr_args)
7734                 return -EINVAL;
7735         if (copy_from_user(&up, arg, sizeof(up)))
7736                 return -EFAULT;
7737         if (up.resv)
7738                 return -EINVAL;
7739
7740         return __io_sqe_files_update(ctx, &up, nr_args);
7741 }
7742
7743 static struct io_wq_work *io_free_work(struct io_wq_work *work)
7744 {
7745         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7746
7747         req = io_put_req_find_next(req);
7748         return req ? &req->work : NULL;
7749 }
7750
7751 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
7752 {
7753         struct io_wq_hash *hash;
7754         struct io_wq_data data;
7755         unsigned int concurrency;
7756
7757         hash = ctx->hash_map;
7758         if (!hash) {
7759                 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
7760                 if (!hash)
7761                         return ERR_PTR(-ENOMEM);
7762                 refcount_set(&hash->refs, 1);
7763                 init_waitqueue_head(&hash->wait);
7764                 ctx->hash_map = hash;
7765         }
7766
7767         data.hash = hash;
7768         data.free_work = io_free_work;
7769         data.do_work = io_wq_submit_work;
7770
7771         /* Do QD, or 4 * CPUS, whatever is smallest */
7772         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7773
7774         return io_wq_create(concurrency, &data);
7775 }
7776
7777 static int io_uring_alloc_task_context(struct task_struct *task,
7778                                        struct io_ring_ctx *ctx)
7779 {
7780         struct io_uring_task *tctx;
7781         int ret;
7782
7783         tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
7784         if (unlikely(!tctx))
7785                 return -ENOMEM;
7786
7787         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
7788         if (unlikely(ret)) {
7789                 kfree(tctx);
7790                 return ret;
7791         }
7792
7793         tctx->io_wq = io_init_wq_offload(ctx);
7794         if (IS_ERR(tctx->io_wq)) {
7795                 ret = PTR_ERR(tctx->io_wq);
7796                 percpu_counter_destroy(&tctx->inflight);
7797                 kfree(tctx);
7798                 return ret;
7799         }
7800
7801         xa_init(&tctx->xa);
7802         init_waitqueue_head(&tctx->wait);
7803         tctx->last = NULL;
7804         atomic_set(&tctx->in_idle, 0);
7805         tctx->sqpoll = false;
7806         task->io_uring = tctx;
7807         spin_lock_init(&tctx->task_lock);
7808         INIT_WQ_LIST(&tctx->task_list);
7809         tctx->task_state = 0;
7810         init_task_work(&tctx->task_work, tctx_task_work);
7811         return 0;
7812 }
7813
7814 void __io_uring_free(struct task_struct *tsk)
7815 {
7816         struct io_uring_task *tctx = tsk->io_uring;
7817
7818         WARN_ON_ONCE(!xa_empty(&tctx->xa));
7819         WARN_ON_ONCE(tctx->io_wq);
7820
7821         percpu_counter_destroy(&tctx->inflight);
7822         kfree(tctx);
7823         tsk->io_uring = NULL;
7824 }
7825
7826 static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
7827 {
7828         struct task_struct *tsk;
7829         int ret;
7830
7831         clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7832         reinit_completion(&sqd->parked);
7833         ctx->sqo_exec = 0;
7834         sqd->task_pid = current->pid;
7835         tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
7836         if (IS_ERR(tsk))
7837                 return PTR_ERR(tsk);
7838         ret = io_uring_alloc_task_context(tsk, ctx);
7839         if (ret)
7840                 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7841         sqd->thread = tsk;
7842         wake_up_new_task(tsk);
7843         return ret;
7844 }
7845
7846 static int io_sq_offload_create(struct io_ring_ctx *ctx,
7847                                 struct io_uring_params *p)
7848 {
7849         int ret;
7850
7851         /* Retain compatibility with failing for an invalid attach attempt */
7852         if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
7853                                 IORING_SETUP_ATTACH_WQ) {
7854                 struct fd f;
7855
7856                 f = fdget(p->wq_fd);
7857                 if (!f.file)
7858                         return -ENXIO;
7859                 if (f.file->f_op != &io_uring_fops) {
7860                         fdput(f);
7861                         return -EINVAL;
7862                 }
7863                 fdput(f);
7864         }
7865         if (ctx->flags & IORING_SETUP_SQPOLL) {
7866                 struct task_struct *tsk;
7867                 struct io_sq_data *sqd;
7868
7869                 ret = -EPERM;
7870                 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
7871                         goto err;
7872
7873                 sqd = io_get_sq_data(p);
7874                 if (IS_ERR(sqd)) {
7875                         ret = PTR_ERR(sqd);
7876                         goto err;
7877                 }
7878
7879                 ctx->sq_data = sqd;
7880                 io_sq_thread_park(sqd);
7881                 mutex_lock(&sqd->ctx_lock);
7882                 list_add(&ctx->sqd_list, &sqd->ctx_new_list);
7883                 mutex_unlock(&sqd->ctx_lock);
7884                 io_sq_thread_unpark(sqd);
7885
7886                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
7887                 if (!ctx->sq_thread_idle)
7888                         ctx->sq_thread_idle = HZ;
7889
7890                 if (sqd->thread)
7891                         return 0;
7892
7893                 if (p->flags & IORING_SETUP_SQ_AFF) {
7894                         int cpu = p->sq_thread_cpu;
7895
7896                         ret = -EINVAL;
7897                         if (cpu >= nr_cpu_ids)
7898                                 goto err;
7899                         if (!cpu_online(cpu))
7900                                 goto err;
7901
7902                         sqd->sq_cpu = cpu;
7903                 } else {
7904                         sqd->sq_cpu = -1;
7905                 }
7906
7907                 sqd->task_pid = current->pid;
7908                 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
7909                 if (IS_ERR(tsk)) {
7910                         ret = PTR_ERR(tsk);
7911                         goto err;
7912                 }
7913                 ret = io_uring_alloc_task_context(tsk, ctx);
7914                 if (ret)
7915                         set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7916                 sqd->thread = tsk;
7917                 wake_up_new_task(tsk);
7918                 if (ret)
7919                         goto err;
7920         } else if (p->flags & IORING_SETUP_SQ_AFF) {
7921                 /* Can't have SQ_AFF without SQPOLL */
7922                 ret = -EINVAL;
7923                 goto err;
7924         }
7925
7926         return 0;
7927 err:
7928         io_sq_thread_finish(ctx);
7929         return ret;
7930 }
7931
7932 static void io_sq_offload_start(struct io_ring_ctx *ctx)
7933 {
7934         struct io_sq_data *sqd = ctx->sq_data;
7935
7936         ctx->flags &= ~IORING_SETUP_R_DISABLED;
7937         if (ctx->flags & IORING_SETUP_SQPOLL)
7938                 complete(&sqd->startup);
7939 }
7940
7941 static inline void __io_unaccount_mem(struct user_struct *user,
7942                                       unsigned long nr_pages)
7943 {
7944         atomic_long_sub(nr_pages, &user->locked_vm);
7945 }
7946
7947 static inline int __io_account_mem(struct user_struct *user,
7948                                    unsigned long nr_pages)
7949 {
7950         unsigned long page_limit, cur_pages, new_pages;
7951
7952         /* Don't allow more pages than we can safely lock */
7953         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7954
7955         do {
7956                 cur_pages = atomic_long_read(&user->locked_vm);
7957                 new_pages = cur_pages + nr_pages;
7958                 if (new_pages > page_limit)
7959                         return -ENOMEM;
7960         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7961                                         new_pages) != cur_pages);
7962
7963         return 0;
7964 }
7965
7966 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
7967 {
7968         if (ctx->user)
7969                 __io_unaccount_mem(ctx->user, nr_pages);
7970
7971         if (ctx->mm_account)
7972                 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
7973 }
7974
7975 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
7976 {
7977         int ret;
7978
7979         if (ctx->user) {
7980                 ret = __io_account_mem(ctx->user, nr_pages);
7981                 if (ret)
7982                         return ret;
7983         }
7984
7985         if (ctx->mm_account)
7986                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
7987
7988         return 0;
7989 }
7990
/*
 * Drop a reference on the head page backing @ptr and free the compound page
 * once the count hits zero. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	if (ptr) {
		struct page *head = virt_to_head_page(ptr);

		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
8002
8003 static void *io_mem_alloc(size_t size)
8004 {
8005         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
8006                                 __GFP_NORETRY | __GFP_ACCOUNT;
8007
8008         return (void *) __get_free_pages(gfp_flags, get_order(size));
8009 }
8010
8011 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8012                                 size_t *sq_offset)
8013 {
8014         struct io_rings *rings;
8015         size_t off, sq_array_size;
8016
8017         off = struct_size(rings, cqes, cq_entries);
8018         if (off == SIZE_MAX)
8019                 return SIZE_MAX;
8020
8021 #ifdef CONFIG_SMP
8022         off = ALIGN(off, SMP_CACHE_BYTES);
8023         if (off == 0)
8024                 return SIZE_MAX;
8025 #endif
8026
8027         if (sq_offset)
8028                 *sq_offset = off;
8029
8030         sq_array_size = array_size(sizeof(u32), sq_entries);
8031         if (sq_array_size == SIZE_MAX)
8032                 return SIZE_MAX;
8033
8034         if (check_add_overflow(off, sq_array_size, &off))
8035                 return SIZE_MAX;
8036
8037         return off;
8038 }
8039
8040 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8041 {
8042         int i, j;
8043
8044         if (!ctx->user_bufs)
8045                 return -ENXIO;
8046
8047         for (i = 0; i < ctx->nr_user_bufs; i++) {
8048                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8049
8050                 for (j = 0; j < imu->nr_bvecs; j++)
8051                         unpin_user_page(imu->bvec[j].bv_page);
8052
8053                 if (imu->acct_pages)
8054                         io_unaccount_mem(ctx, imu->acct_pages);
8055                 kvfree(imu->bvec);
8056                 imu->nr_bvecs = 0;
8057         }
8058
8059         kfree(ctx->user_bufs);
8060         ctx->user_bufs = NULL;
8061         ctx->nr_user_bufs = 0;
8062         return 0;
8063 }
8064
8065 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8066                        void __user *arg, unsigned index)
8067 {
8068         struct iovec __user *src;
8069
8070 #ifdef CONFIG_COMPAT
8071         if (ctx->compat) {
8072                 struct compat_iovec __user *ciovs;
8073                 struct compat_iovec ciov;
8074
8075                 ciovs = (struct compat_iovec __user *) arg;
8076                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8077                         return -EFAULT;
8078
8079                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8080                 dst->iov_len = ciov.iov_len;
8081                 return 0;
8082         }
8083 #endif
8084         src = (struct iovec __user *) arg;
8085         if (copy_from_user(dst, &src[index], sizeof(*dst)))
8086                 return -EFAULT;
8087         return 0;
8088 }
8089
8090 /*
8091  * Not super efficient, but this is just a registration time. And we do cache
8092  * the last compound head, so generally we'll only do a full search if we don't
8093  * match that one.
8094  *
8095  * We check if the given compound head page has already been accounted, to
8096  * avoid double accounting it. This allows us to account the full size of the
8097  * page, not just the constituent pages of a huge page.
8098  */
8099 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8100                                   int nr_pages, struct page *hpage)
8101 {
8102         int i, j;
8103
8104         /* check current page array */
8105         for (i = 0; i < nr_pages; i++) {
8106                 if (!PageCompound(pages[i]))
8107                         continue;
8108                 if (compound_head(pages[i]) == hpage)
8109                         return true;
8110         }
8111
8112         /* check previously registered pages */
8113         for (i = 0; i < ctx->nr_user_bufs; i++) {
8114                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8115
8116                 for (j = 0; j < imu->nr_bvecs; j++) {
8117                         if (!PageCompound(imu->bvec[j].bv_page))
8118                                 continue;
8119                         if (compound_head(imu->bvec[j].bv_page) == hpage)
8120                                 return true;
8121                 }
8122         }
8123
8124         return false;
8125 }
8126
8127 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8128                                  int nr_pages, struct io_mapped_ubuf *imu,
8129                                  struct page **last_hpage)
8130 {
8131         int i, ret;
8132
8133         for (i = 0; i < nr_pages; i++) {
8134                 if (!PageCompound(pages[i])) {
8135                         imu->acct_pages++;
8136                 } else {
8137                         struct page *hpage;
8138
8139                         hpage = compound_head(pages[i]);
8140                         if (hpage == *last_hpage)
8141                                 continue;
8142                         *last_hpage = hpage;
8143                         if (headpage_already_acct(ctx, pages, i, hpage))
8144                                 continue;
8145                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8146                 }
8147         }
8148
8149         if (!imu->acct_pages)
8150                 return 0;
8151
8152         ret = io_account_mem(ctx, imu->acct_pages);
8153         if (ret)
8154                 imu->acct_pages = 0;
8155         return ret;
8156 }
8157
8158 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8159                                   struct io_mapped_ubuf *imu,
8160                                   struct page **last_hpage)
8161 {
8162         struct vm_area_struct **vmas = NULL;
8163         struct page **pages = NULL;
8164         unsigned long off, start, end, ubuf;
8165         size_t size;
8166         int ret, pret, nr_pages, i;
8167
8168         ubuf = (unsigned long) iov->iov_base;
8169         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8170         start = ubuf >> PAGE_SHIFT;
8171         nr_pages = end - start;
8172
8173         ret = -ENOMEM;
8174
8175         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8176         if (!pages)
8177                 goto done;
8178
8179         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8180                               GFP_KERNEL);
8181         if (!vmas)
8182                 goto done;
8183
8184         imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
8185                                    GFP_KERNEL);
8186         if (!imu->bvec)
8187                 goto done;
8188
8189         ret = 0;
8190         mmap_read_lock(current->mm);
8191         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8192                               pages, vmas);
8193         if (pret == nr_pages) {
8194                 /* don't support file backed memory */
8195                 for (i = 0; i < nr_pages; i++) {
8196                         struct vm_area_struct *vma = vmas[i];
8197
8198                         if (vma->vm_file &&
8199                             !is_file_hugepages(vma->vm_file)) {
8200                                 ret = -EOPNOTSUPP;
8201                                 break;
8202                         }
8203                 }
8204         } else {
8205                 ret = pret < 0 ? pret : -EFAULT;
8206         }
8207         mmap_read_unlock(current->mm);
8208         if (ret) {
8209                 /*
8210                  * if we did partial map, or found file backed vmas,
8211                  * release any pages we did get
8212                  */
8213                 if (pret > 0)
8214                         unpin_user_pages(pages, pret);
8215                 kvfree(imu->bvec);
8216                 goto done;
8217         }
8218
8219         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8220         if (ret) {
8221                 unpin_user_pages(pages, pret);
8222                 kvfree(imu->bvec);
8223                 goto done;
8224         }
8225
8226         off = ubuf & ~PAGE_MASK;
8227         size = iov->iov_len;
8228         for (i = 0; i < nr_pages; i++) {
8229                 size_t vec_len;
8230
8231                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8232                 imu->bvec[i].bv_page = pages[i];
8233                 imu->bvec[i].bv_len = vec_len;
8234                 imu->bvec[i].bv_offset = off;
8235                 off = 0;
8236                 size -= vec_len;
8237         }
8238         /* store original address for later verification */
8239         imu->ubuf = ubuf;
8240         imu->len = iov->iov_len;
8241         imu->nr_bvecs = nr_pages;
8242         ret = 0;
8243 done:
8244         kvfree(pages);
8245         kvfree(vmas);
8246         return ret;
8247 }
8248
8249 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
8250 {
8251         if (ctx->user_bufs)
8252                 return -EBUSY;
8253         if (!nr_args || nr_args > UIO_MAXIOV)
8254                 return -EINVAL;
8255
8256         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
8257                                         GFP_KERNEL);
8258         if (!ctx->user_bufs)
8259                 return -ENOMEM;
8260
8261         return 0;
8262 }
8263
8264 static int io_buffer_validate(struct iovec *iov)
8265 {
8266         /*
8267          * Don't impose further limits on the size and buffer
8268          * constraints here, we'll -EINVAL later when IO is
8269          * submitted if they are wrong.
8270          */
8271         if (!iov->iov_base || !iov->iov_len)
8272                 return -EFAULT;
8273
8274         /* arbitrary limit, but we need something */
8275         if (iov->iov_len > SZ_1G)
8276                 return -EFAULT;
8277
8278         return 0;
8279 }
8280
8281 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
8282                                    unsigned int nr_args)
8283 {
8284         int i, ret;
8285         struct iovec iov;
8286         struct page *last_hpage = NULL;
8287
8288         ret = io_buffers_map_alloc(ctx, nr_args);
8289         if (ret)
8290                 return ret;
8291
8292         for (i = 0; i < nr_args; i++) {
8293                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8294
8295                 ret = io_copy_iov(ctx, &iov, arg, i);
8296                 if (ret)
8297                         break;
8298
8299                 ret = io_buffer_validate(&iov);
8300                 if (ret)
8301                         break;
8302
8303                 ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
8304                 if (ret)
8305                         break;
8306
8307                 ctx->nr_user_bufs++;
8308         }
8309
8310         if (ret)
8311                 io_sqe_buffers_unregister(ctx);
8312
8313         return ret;
8314 }
8315
8316 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8317 {
8318         __s32 __user *fds = arg;
8319         int fd;
8320
8321         if (ctx->cq_ev_fd)
8322                 return -EBUSY;
8323
8324         if (copy_from_user(&fd, fds, sizeof(*fds)))
8325                 return -EFAULT;
8326
8327         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8328         if (IS_ERR(ctx->cq_ev_fd)) {
8329                 int ret = PTR_ERR(ctx->cq_ev_fd);
8330                 ctx->cq_ev_fd = NULL;
8331                 return ret;
8332         }
8333
8334         return 0;
8335 }
8336
8337 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8338 {
8339         if (ctx->cq_ev_fd) {
8340                 eventfd_ctx_put(ctx->cq_ev_fd);
8341                 ctx->cq_ev_fd = NULL;
8342                 return 0;
8343         }
8344
8345         return -ENXIO;
8346 }
8347
/*
 * idr_for_each() callback: @p is the io_buffer stored under @id and @data is
 * the owning ring. Removes buffers via __io_remove_buffers() with a count of
 * -1U; always returns 0 so the iteration carries on.
 */
static int __io_destroy_buffers(int id, void *p, void *data)
{
	struct io_buffer *head = p;
	struct io_ring_ctx *ring = data;

	__io_remove_buffers(ring, head, id, -1U);
	return 0;
}
8356
8357 static void io_destroy_buffers(struct io_ring_ctx *ctx)
8358 {
8359         idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
8360         idr_destroy(&ctx->io_buffer_idr);
8361 }
8362
8363 static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
8364 {
8365         struct io_kiocb *req, *nxt;
8366
8367         list_for_each_entry_safe(req, nxt, list, compl.list) {
8368                 if (tsk && req->task != tsk)
8369                         continue;
8370                 list_del(&req->compl.list);
8371                 kmem_cache_free(req_cachep, req);
8372         }
8373 }
8374
8375 static void io_req_caches_free(struct io_ring_ctx *ctx)
8376 {
8377         struct io_submit_state *submit_state = &ctx->submit_state;
8378         struct io_comp_state *cs = &ctx->submit_state.comp;
8379
8380         mutex_lock(&ctx->uring_lock);
8381
8382         if (submit_state->free_reqs) {
8383                 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
8384                                      submit_state->reqs);
8385                 submit_state->free_reqs = 0;
8386         }
8387
8388         spin_lock_irq(&ctx->completion_lock);
8389         list_splice_init(&cs->locked_free_list, &cs->free_list);
8390         cs->locked_free_nr = 0;
8391         spin_unlock_irq(&ctx->completion_lock);
8392
8393         io_req_cache_free(&cs->free_list, NULL);
8394
8395         mutex_unlock(&ctx->uring_lock);
8396 }
8397
8398 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8399 {
8400         /*
8401          * Some may use context even when all refs and requests have been put,
8402          * and they are free to do so while still holding uring_lock, see
8403          * __io_req_task_submit(). Wait for them to finish.
8404          */
8405         mutex_lock(&ctx->uring_lock);
8406         mutex_unlock(&ctx->uring_lock);
8407
8408         io_sq_thread_finish(ctx);
8409         io_sqe_buffers_unregister(ctx);
8410
8411         if (ctx->mm_account) {
8412                 mmdrop(ctx->mm_account);
8413                 ctx->mm_account = NULL;
8414         }
8415
8416         mutex_lock(&ctx->uring_lock);
8417         io_sqe_files_unregister(ctx);
8418         mutex_unlock(&ctx->uring_lock);
8419         io_eventfd_unregister(ctx);
8420         io_destroy_buffers(ctx);
8421         idr_destroy(&ctx->personality_idr);
8422
8423 #if defined(CONFIG_UNIX)
8424         if (ctx->ring_sock) {
8425                 ctx->ring_sock->file = NULL; /* so that iput() is called */
8426                 sock_release(ctx->ring_sock);
8427         }
8428 #endif
8429
8430         io_mem_free(ctx->rings);
8431         io_mem_free(ctx->sq_sqes);
8432
8433         percpu_ref_exit(&ctx->refs);
8434         free_uid(ctx->user);
8435         io_req_caches_free(ctx);
8436         if (ctx->hash_map)
8437                 io_wq_put_hash(ctx->hash_map);
8438         kfree(ctx->cancel_hash);
8439         kfree(ctx);
8440 }
8441
8442 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8443 {
8444         struct io_ring_ctx *ctx = file->private_data;
8445         __poll_t mask = 0;
8446
8447         poll_wait(file, &ctx->cq_wait, wait);
8448         /*
8449          * synchronizes with barrier from wq_has_sleeper call in
8450          * io_commit_cqring
8451          */
8452         smp_rmb();
8453         if (!io_sqring_full(ctx))
8454                 mask |= EPOLLOUT | EPOLLWRNORM;
8455
8456         /*
8457          * Don't flush cqring overflow list here, just do a simple check.
8458          * Otherwise there could possible be ABBA deadlock:
8459          *      CPU0                    CPU1
8460          *      ----                    ----
8461          * lock(&ctx->uring_lock);
8462          *                              lock(&ep->mtx);
8463          *                              lock(&ctx->uring_lock);
8464          * lock(&ep->mtx);
8465          *
8466          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
8467          * pushs them to do the flush.
8468          */
8469         if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
8470                 mask |= EPOLLIN | EPOLLRDNORM;
8471
8472         return mask;
8473 }
8474
8475 static int io_uring_fasync(int fd, struct file *file, int on)
8476 {
8477         struct io_ring_ctx *ctx = file->private_data;
8478
8479         return fasync_helper(fd, file, on, &ctx->cq_fasync);
8480 }
8481
8482 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8483 {
8484         const struct cred *creds;
8485
8486         creds = idr_remove(&ctx->personality_idr, id);
8487         if (creds) {
8488                 put_cred(creds);
8489                 return 0;
8490         }
8491
8492         return -EINVAL;
8493 }
8494
/*
 * idr_for_each() callback: unregister personality @id from the ring passed
 * in @data. @p is unused. Always returns 0 so the iteration carries on.
 */
static int io_remove_personalities(int id, void *p, void *data)
{
	struct io_ring_ctx *ring = data;

	io_unregister_personality(ring, id);
	return 0;
}
8502
8503 static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
8504 {
8505         struct callback_head *work, *next;
8506         bool executed = false;
8507
8508         do {
8509                 work = xchg(&ctx->exit_task_work, NULL);
8510                 if (!work)
8511                         break;
8512
8513                 do {
8514                         next = work->next;
8515                         work->func(work);
8516                         work = next;
8517                         cond_resched();
8518                 } while (work);
8519                 executed = true;
8520         } while (1);
8521
8522         return executed;
8523 }
8524
8525 static void io_ring_exit_work(struct work_struct *work)
8526 {
8527         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
8528                                                exit_work);
8529
8530         /*
8531          * If we're doing polled IO and end up having requests being
8532          * submitted async (out-of-line), then completions can come in while
8533          * we're waiting for refs to drop. We need to reap these manually,
8534          * as nobody else will be looking for them.
8535          */
8536         do {
8537                 io_uring_try_cancel_requests(ctx, NULL, NULL);
8538         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
8539         io_ring_ctx_free(ctx);
8540 }
8541
8542 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8543 {
8544         mutex_lock(&ctx->uring_lock);
8545         percpu_ref_kill(&ctx->refs);
8546         /* if force is set, the ring is going away. always drop after that */
8547         ctx->cq_overflow_flushed = 1;
8548         if (ctx->rings)
8549                 __io_cqring_overflow_flush(ctx, true, NULL, NULL);
8550         idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
8551         mutex_unlock(&ctx->uring_lock);
8552
8553         io_kill_timeouts(ctx, NULL, NULL);
8554         io_poll_remove_all(ctx, NULL, NULL);
8555
8556         /* if we failed setting up the ctx, we might not have any rings */
8557         io_iopoll_try_reap_events(ctx);
8558
8559         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
8560         /*
8561          * Use system_unbound_wq to avoid spawning tons of event kworkers
8562          * if we're exiting a ton of rings at the same time. It just adds
8563          * noise and overhead, there's no discernable change in runtime
8564          * over using system_wq.
8565          */
8566         queue_work(system_unbound_wq, &ctx->exit_work);
8567 }
8568
8569 static int io_uring_release(struct inode *inode, struct file *file)
8570 {
8571         struct io_ring_ctx *ctx = file->private_data;
8572
8573         file->private_data = NULL;
8574         io_ring_ctx_wait_and_kill(ctx);
8575         return 0;
8576 }
8577
/*
 * Match criteria handed to io_cancel_task_cb() through io_wq_cancel_cb():
 * both fields are passed on to io_match_task() for each candidate request.
 */
struct io_task_cancel {
	/* task argument for io_match_task() */
	struct task_struct *task;
	/* files argument for io_match_task(); may be NULL */
	struct files_struct *files;
};
8582
8583 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
8584 {
8585         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8586         struct io_task_cancel *cancel = data;
8587         bool ret;
8588
8589         if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
8590                 unsigned long flags;
8591                 struct io_ring_ctx *ctx = req->ctx;
8592
8593                 /* protect against races with linked timeouts */
8594                 spin_lock_irqsave(&ctx->completion_lock, flags);
8595                 ret = io_match_task(req, cancel->task, cancel->files);
8596                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8597         } else {
8598                 ret = io_match_task(req, cancel->task, cancel->files);
8599         }
8600         return ret;
8601 }
8602
8603 static void io_cancel_defer_files(struct io_ring_ctx *ctx,
8604                                   struct task_struct *task,
8605                                   struct files_struct *files)
8606 {
8607         struct io_defer_entry *de = NULL;
8608         LIST_HEAD(list);
8609
8610         spin_lock_irq(&ctx->completion_lock);
8611         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
8612                 if (io_match_task(de->req, task, files)) {
8613                         list_cut_position(&list, &ctx->defer_list, &de->list);
8614                         break;
8615                 }
8616         }
8617         spin_unlock_irq(&ctx->completion_lock);
8618
8619         while (!list_empty(&list)) {
8620                 de = list_first_entry(&list, struct io_defer_entry, list);
8621                 list_del_init(&de->list);
8622                 req_set_fail_links(de->req);
8623                 io_put_req(de->req);
8624                 io_req_complete(de->req, -ECANCELED);
8625                 kfree(de);
8626         }
8627 }
8628
8629 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
8630                                          struct task_struct *task,
8631                                          struct files_struct *files)
8632 {
8633         struct io_task_cancel cancel = { .task = task, .files = files, };
8634         struct task_struct *tctx_task = task ?: current;
8635         struct io_uring_task *tctx = tctx_task->io_uring;
8636
8637         while (1) {
8638                 enum io_wq_cancel cret;
8639                 bool ret = false;
8640
8641                 if (tctx && tctx->io_wq) {
8642                         cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
8643                                                &cancel, true);
8644                         ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8645                 }
8646
8647                 /* SQPOLL thread does its own polling */
8648                 if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
8649                         while (!list_empty_careful(&ctx->iopoll_list)) {
8650                                 io_iopoll_try_reap_events(ctx);
8651                                 ret = true;
8652                         }
8653                 }
8654
8655                 ret |= io_poll_remove_all(ctx, task, files);
8656                 ret |= io_kill_timeouts(ctx, task, files);
8657                 ret |= io_run_task_work();
8658                 ret |= io_run_ctx_fallback(ctx);
8659                 io_cqring_overflow_flush(ctx, true, task, files);
8660                 if (!ret)
8661                         break;
8662                 cond_resched();
8663         }
8664 }
8665
8666 static int io_uring_count_inflight(struct io_ring_ctx *ctx,
8667                                    struct task_struct *task,
8668                                    struct files_struct *files)
8669 {
8670         struct io_kiocb *req;
8671         int cnt = 0;
8672
8673         spin_lock_irq(&ctx->inflight_lock);
8674         list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
8675                 cnt += io_match_task(req, task, files);
8676         spin_unlock_irq(&ctx->inflight_lock);
8677         return cnt;
8678 }
8679
8680 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
8681                                   struct task_struct *task,
8682                                   struct files_struct *files)
8683 {
8684         while (!list_empty_careful(&ctx->inflight_list)) {
8685                 DEFINE_WAIT(wait);
8686                 int inflight;
8687
8688                 inflight = io_uring_count_inflight(ctx, task, files);
8689                 if (!inflight)
8690                         break;
8691
8692                 io_uring_try_cancel_requests(ctx, task, files);
8693
8694                 if (ctx->sq_data)
8695                         io_sq_thread_unpark(ctx->sq_data);
8696                 prepare_to_wait(&task->io_uring->wait, &wait,
8697                                 TASK_UNINTERRUPTIBLE);
8698                 if (inflight == io_uring_count_inflight(ctx, task, files))
8699                         schedule();
8700                 finish_wait(&task->io_uring->wait, &wait);
8701                 if (ctx->sq_data)
8702                         io_sq_thread_park(ctx->sq_data);
8703         }
8704 }
8705
8706 /*
8707  * We need to iteratively cancel requests, in case a request has dependent
8708  * hard links. These persist even for failure of cancelations, hence keep
8709  * looping until none are found.
8710  */
8711 static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
8712                                           struct files_struct *files)
8713 {
8714         struct task_struct *task = current;
8715
8716         if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
8717                 /* never started, nothing to cancel */
8718                 if (ctx->flags & IORING_SETUP_R_DISABLED) {
8719                         io_sq_offload_start(ctx);
8720                         return;
8721                 }
8722                 io_sq_thread_park(ctx->sq_data);
8723                 task = ctx->sq_data->thread;
8724                 if (task)
8725                         atomic_inc(&task->io_uring->in_idle);
8726         }
8727
8728         io_cancel_defer_files(ctx, task, files);
8729
8730         io_uring_cancel_files(ctx, task, files);
8731         if (!files)
8732                 io_uring_try_cancel_requests(ctx, task, NULL);
8733
8734         if (task)
8735                 atomic_dec(&task->io_uring->in_idle);
8736         if (ctx->sq_data)
8737                 io_sq_thread_unpark(ctx->sq_data);
8738 }
8739
8740 /*
8741  * Note that this task has used io_uring. We use it for cancelation purposes.
8742  */
8743 static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
8744 {
8745         struct io_uring_task *tctx = current->io_uring;
8746         int ret;
8747
8748         if (unlikely(!tctx)) {
8749                 ret = io_uring_alloc_task_context(current, ctx);
8750                 if (unlikely(ret))
8751                         return ret;
8752                 tctx = current->io_uring;
8753         }
8754         if (tctx->last != file) {
8755                 void *old = xa_load(&tctx->xa, (unsigned long)file);
8756
8757                 if (!old) {
8758                         get_file(file);
8759                         ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
8760                                                 file, GFP_KERNEL));
8761                         if (ret) {
8762                                 fput(file);
8763                                 return ret;
8764                         }
8765                 }
8766                 tctx->last = file;
8767         }
8768
8769         /*
8770          * This is race safe in that the task itself is doing this, hence it
8771          * cannot be going through the exit/cancel paths at the same time.
8772          * This cannot be modified while exit/cancel is running.
8773          */
8774         if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
8775                 tctx->sqpoll = true;
8776
8777         return 0;
8778 }
8779
8780 /*
8781  * Remove this io_uring_file -> task mapping.
8782  */
8783 static void io_uring_del_task_file(struct file *file)
8784 {
8785         struct io_uring_task *tctx = current->io_uring;
8786
8787         if (tctx->last == file)
8788                 tctx->last = NULL;
8789         file = xa_erase(&tctx->xa, (unsigned long)file);
8790         if (file)
8791                 fput(file);
8792 }
8793
8794 static void io_uring_clean_tctx(struct io_uring_task *tctx)
8795 {
8796         struct file *file;
8797         unsigned long index;
8798
8799         xa_for_each(&tctx->xa, index, file)
8800                 io_uring_del_task_file(file);
8801         if (tctx->io_wq) {
8802                 io_wq_put_and_exit(tctx->io_wq);
8803                 tctx->io_wq = NULL;
8804         }
8805 }
8806
8807 void __io_uring_files_cancel(struct files_struct *files)
8808 {
8809         struct io_uring_task *tctx = current->io_uring;
8810         struct file *file;
8811         unsigned long index;
8812
8813         /* make sure overflow events are dropped */
8814         atomic_inc(&tctx->in_idle);
8815         xa_for_each(&tctx->xa, index, file)
8816                 io_uring_cancel_task_requests(file->private_data, files);
8817         atomic_dec(&tctx->in_idle);
8818
8819         if (files)
8820                 io_uring_clean_tctx(tctx);
8821 }
8822
8823 static s64 tctx_inflight(struct io_uring_task *tctx)
8824 {
8825         return percpu_counter_sum(&tctx->inflight);
8826 }
8827
8828 static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
8829 {
8830         struct io_sq_data *sqd = ctx->sq_data;
8831         struct io_uring_task *tctx;
8832         s64 inflight;
8833         DEFINE_WAIT(wait);
8834
8835         if (!sqd)
8836                 return;
8837         io_sq_thread_park(sqd);
8838         if (!sqd->thread || !sqd->thread->io_uring) {
8839                 io_sq_thread_unpark(sqd);
8840                 return;
8841         }
8842         tctx = ctx->sq_data->thread->io_uring;
8843         atomic_inc(&tctx->in_idle);
8844         do {
8845                 /* read completions before cancelations */
8846                 inflight = tctx_inflight(tctx);
8847                 if (!inflight)
8848                         break;
8849                 io_uring_cancel_task_requests(ctx, NULL);
8850
8851                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8852                 /*
8853                  * If we've seen completions, retry without waiting. This
8854                  * avoids a race where a completion comes in before we did
8855                  * prepare_to_wait().
8856                  */
8857                 if (inflight == tctx_inflight(tctx))
8858                         schedule();
8859                 finish_wait(&tctx->wait, &wait);
8860         } while (1);
8861         atomic_dec(&tctx->in_idle);
8862         io_sq_thread_unpark(sqd);
8863 }
8864
8865 /*
8866  * Find any io_uring fd that this task has registered or done IO on, and cancel
8867  * requests.
8868  */
8869 void __io_uring_task_cancel(void)
8870 {
8871         struct io_uring_task *tctx = current->io_uring;
8872         DEFINE_WAIT(wait);
8873         s64 inflight;
8874
8875         /* make sure overflow events are dropped */
8876         atomic_inc(&tctx->in_idle);
8877
8878         if (tctx->sqpoll) {
8879                 struct file *file;
8880                 unsigned long index;
8881
8882                 xa_for_each(&tctx->xa, index, file)
8883                         io_uring_cancel_sqpoll(file->private_data);
8884         }
8885
8886         do {
8887                 /* read completions before cancelations */
8888                 inflight = tctx_inflight(tctx);
8889                 if (!inflight)
8890                         break;
8891                 __io_uring_files_cancel(NULL);
8892
8893                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8894
8895                 /*
8896                  * If we've seen completions, retry without waiting. This
8897                  * avoids a race where a completion comes in before we did
8898                  * prepare_to_wait().
8899                  */
8900                 if (inflight == tctx_inflight(tctx))
8901                         schedule();
8902                 finish_wait(&tctx->wait, &wait);
8903         } while (1);
8904
8905         atomic_dec(&tctx->in_idle);
8906
8907         io_uring_clean_tctx(tctx);
8908         /* all current's requests should be gone, we can kill tctx */
8909         __io_uring_free(current);
8910 }
8911
8912 static void *io_uring_validate_mmap_request(struct file *file,
8913                                             loff_t pgoff, size_t sz)
8914 {
8915         struct io_ring_ctx *ctx = file->private_data;
8916         loff_t offset = pgoff << PAGE_SHIFT;
8917         struct page *page;
8918         void *ptr;
8919
8920         switch (offset) {
8921         case IORING_OFF_SQ_RING:
8922         case IORING_OFF_CQ_RING:
8923                 ptr = ctx->rings;
8924                 break;
8925         case IORING_OFF_SQES:
8926                 ptr = ctx->sq_sqes;
8927                 break;
8928         default:
8929                 return ERR_PTR(-EINVAL);
8930         }
8931
8932         page = virt_to_head_page(ptr);
8933         if (sz > page_size(page))
8934                 return ERR_PTR(-EINVAL);
8935
8936         return ptr;
8937 }
8938
8939 #ifdef CONFIG_MMU
8940
8941 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8942 {
8943         size_t sz = vma->vm_end - vma->vm_start;
8944         unsigned long pfn;
8945         void *ptr;
8946
8947         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
8948         if (IS_ERR(ptr))
8949                 return PTR_ERR(ptr);
8950
8951         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
8952         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
8953 }
8954
8955 #else /* !CONFIG_MMU */
8956
8957 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8958 {
8959         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
8960 }
8961
8962 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
8963 {
8964         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
8965 }
8966
/*
 * ->get_unmapped_area() file operation (no-MMU build): the "mapping" is the
 * kernel address of the selected ring region itself.
 *
 * Returns that address, or the negative error from validation cast to
 * unsigned long.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *region = io_uring_validate_mmap_request(file, pgoff, len);

	if (IS_ERR(region))
		return PTR_ERR(region);

	return (unsigned long) region;
}
8979
8980 #endif /* !CONFIG_MMU */
8981
8982 static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
8983 {
8984         int ret = 0;
8985         DEFINE_WAIT(wait);
8986
8987         do {
8988                 if (!io_sqring_full(ctx))
8989                         break;
8990                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
8991
8992                 if (!io_sqring_full(ctx))
8993                         break;
8994                 schedule();
8995         } while (!signal_pending(current));
8996
8997         finish_wait(&ctx->sqo_sq_wait, &wait);
8998         return ret;
8999 }
9000
9001 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9002                           struct __kernel_timespec __user **ts,
9003                           const sigset_t __user **sig)
9004 {
9005         struct io_uring_getevents_arg arg;
9006
9007         /*
9008          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9009          * is just a pointer to the sigset_t.
9010          */
9011         if (!(flags & IORING_ENTER_EXT_ARG)) {
9012                 *sig = (const sigset_t __user *) argp;
9013                 *ts = NULL;
9014                 return 0;
9015         }
9016
9017         /*
9018          * EXT_ARG is set - ensure we agree on the size of it and copy in our
9019          * timespec and sigset_t pointers if good.
9020          */
9021         if (*argsz != sizeof(arg))
9022                 return -EINVAL;
9023         if (copy_from_user(&arg, argp, sizeof(arg)))
9024                 return -EFAULT;
9025         *sig = u64_to_user_ptr(arg.sigmask);
9026         *argsz = arg.sigmask_sz;
9027         *ts = u64_to_user_ptr(arg.ts);
9028         return 0;
9029 }
9030
9031 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
9032                 u32, min_complete, u32, flags, const void __user *, argp,
9033                 size_t, argsz)
9034 {
9035         struct io_ring_ctx *ctx;
9036         long ret = -EBADF;
9037         int submitted = 0;
9038         struct fd f;
9039
9040         io_run_task_work();
9041
9042         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
9043                         IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
9044                 return -EINVAL;
9045
9046         f = fdget(fd);
9047         if (!f.file)
9048                 return -EBADF;
9049
9050         ret = -EOPNOTSUPP;
9051         if (f.file->f_op != &io_uring_fops)
9052                 goto out_fput;
9053
9054         ret = -ENXIO;
9055         ctx = f.file->private_data;
9056         if (!percpu_ref_tryget(&ctx->refs))
9057                 goto out_fput;
9058
9059         ret = -EBADFD;
9060         if (ctx->flags & IORING_SETUP_R_DISABLED)
9061                 goto out;
9062
9063         /*
9064          * For SQ polling, the thread will do all submissions and completions.
9065          * Just return the requested submit count, and wake the thread if
9066          * we were asked to.
9067          */
9068         ret = 0;
9069         if (ctx->flags & IORING_SETUP_SQPOLL) {
9070                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
9071
9072                 if (unlikely(ctx->sqo_exec)) {
9073                         ret = io_sq_thread_fork(ctx->sq_data, ctx);
9074                         if (ret)
9075                                 goto out;
9076                         ctx->sqo_exec = 0;
9077                 }
9078                 ret = -EOWNERDEAD;
9079                 if (flags & IORING_ENTER_SQ_WAKEUP)
9080                         wake_up(&ctx->sq_data->wait);
9081                 if (flags & IORING_ENTER_SQ_WAIT) {
9082                         ret = io_sqpoll_wait_sq(ctx);
9083                         if (ret)
9084                                 goto out;
9085                 }
9086                 submitted = to_submit;
9087         } else if (to_submit) {
9088                 ret = io_uring_add_task_file(ctx, f.file);
9089                 if (unlikely(ret))
9090                         goto out;
9091                 mutex_lock(&ctx->uring_lock);
9092                 submitted = io_submit_sqes(ctx, to_submit);
9093                 mutex_unlock(&ctx->uring_lock);
9094
9095                 if (submitted != to_submit)
9096                         goto out;
9097         }
9098         if (flags & IORING_ENTER_GETEVENTS) {
9099                 const sigset_t __user *sig;
9100                 struct __kernel_timespec __user *ts;
9101
9102                 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9103                 if (unlikely(ret))
9104                         goto out;
9105
9106                 min_complete = min(min_complete, ctx->cq_entries);
9107
9108                 /*
9109                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9110                  * space applications don't need to do io completion events
9111                  * polling again, they can rely on io_sq_thread to do polling
9112                  * work, which can reduce cpu usage and uring_lock contention.
9113                  */
9114                 if (ctx->flags & IORING_SETUP_IOPOLL &&
9115                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
9116                         ret = io_iopoll_check(ctx, min_complete);
9117                 } else {
9118                         ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
9119                 }
9120         }
9121
9122 out:
9123         percpu_ref_put(&ctx->refs);
9124 out_fput:
9125         fdput(f);
9126         return submitted ? submitted : ret;
9127 }
9128
9129 #ifdef CONFIG_PROC_FS
9130 static int io_uring_show_cred(int id, void *p, void *data)
9131 {
9132         const struct cred *cred = p;
9133         struct seq_file *m = data;
9134         struct user_namespace *uns = seq_user_ns(m);
9135         struct group_info *gi;
9136         kernel_cap_t cap;
9137         unsigned __capi;
9138         int g;
9139
9140         seq_printf(m, "%5d\n", id);
9141         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9142         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9143         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9144         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9145         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9146         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9147         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9148         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9149         seq_puts(m, "\n\tGroups:\t");
9150         gi = cred->group_info;
9151         for (g = 0; g < gi->ngroups; g++) {
9152                 seq_put_decimal_ull(m, g ? " " : "",
9153                                         from_kgid_munged(uns, gi->gid[g]));
9154         }
9155         seq_puts(m, "\n\tCapEff:\t");
9156         cap = cred->cap_effective;
9157         CAP_FOR_EACH_U32(__capi)
9158                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9159         seq_putc(m, '\n');
9160         return 0;
9161 }
9162
9163 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9164 {
9165         struct io_sq_data *sq = NULL;
9166         bool has_lock;
9167         int i;
9168
9169         /*
9170          * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9171          * since fdinfo case grabs it in the opposite direction of normal use
9172          * cases. If we fail to get the lock, we just don't iterate any
9173          * structures that could be going away outside the io_uring mutex.
9174          */
9175         has_lock = mutex_trylock(&ctx->uring_lock);
9176
9177         if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
9178                 sq = ctx->sq_data;
9179                 if (!sq->thread)
9180                         sq = NULL;
9181         }
9182
9183         seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9184         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
9185         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
9186         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
9187                 struct file *f = *io_fixed_file_slot(ctx->file_data, i);
9188
9189                 if (f)
9190                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9191                 else
9192                         seq_printf(m, "%5u: <none>\n", i);
9193         }
9194         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
9195         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
9196                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
9197
9198                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
9199                                                 (unsigned int) buf->len);
9200         }
9201         if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
9202                 seq_printf(m, "Personalities:\n");
9203                 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
9204         }
9205         seq_printf(m, "PollList:\n");
9206         spin_lock_irq(&ctx->completion_lock);
9207         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9208                 struct hlist_head *list = &ctx->cancel_hash[i];
9209                 struct io_kiocb *req;
9210
9211                 hlist_for_each_entry(req, list, hash_node)
9212                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
9213                                         req->task->task_works != NULL);
9214         }
9215         spin_unlock_irq(&ctx->completion_lock);
9216         if (has_lock)
9217                 mutex_unlock(&ctx->uring_lock);
9218 }
9219
9220 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9221 {
9222         struct io_ring_ctx *ctx = f->private_data;
9223
9224         if (percpu_ref_tryget(&ctx->refs)) {
9225                 __io_uring_show_fdinfo(ctx, m);
9226                 percpu_ref_put(&ctx->refs);
9227         }
9228 }
9229 #endif
9230
9231 static const struct file_operations io_uring_fops = {
9232         .release        = io_uring_release,
9233         .mmap           = io_uring_mmap,
9234 #ifndef CONFIG_MMU
9235         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9236         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9237 #endif
9238         .poll           = io_uring_poll,
9239         .fasync         = io_uring_fasync,
9240 #ifdef CONFIG_PROC_FS
9241         .show_fdinfo    = io_uring_show_fdinfo,
9242 #endif
9243 };
9244
9245 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9246                                   struct io_uring_params *p)
9247 {
9248         struct io_rings *rings;
9249         size_t size, sq_array_offset;
9250
9251         /* make sure these are sane, as we already accounted them */
9252         ctx->sq_entries = p->sq_entries;
9253         ctx->cq_entries = p->cq_entries;
9254
9255         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9256         if (size == SIZE_MAX)
9257                 return -EOVERFLOW;
9258
9259         rings = io_mem_alloc(size);
9260         if (!rings)
9261                 return -ENOMEM;
9262
9263         ctx->rings = rings;
9264         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9265         rings->sq_ring_mask = p->sq_entries - 1;
9266         rings->cq_ring_mask = p->cq_entries - 1;
9267         rings->sq_ring_entries = p->sq_entries;
9268         rings->cq_ring_entries = p->cq_entries;
9269         ctx->sq_mask = rings->sq_ring_mask;
9270         ctx->cq_mask = rings->cq_ring_mask;
9271
9272         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9273         if (size == SIZE_MAX) {
9274                 io_mem_free(ctx->rings);
9275                 ctx->rings = NULL;
9276                 return -EOVERFLOW;
9277         }
9278
9279         ctx->sq_sqes = io_mem_alloc(size);
9280         if (!ctx->sq_sqes) {
9281                 io_mem_free(ctx->rings);
9282                 ctx->rings = NULL;
9283                 return -ENOMEM;
9284         }
9285
9286         return 0;
9287 }
9288
9289 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9290 {
9291         int ret, fd;
9292
9293         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9294         if (fd < 0)
9295                 return fd;
9296
9297         ret = io_uring_add_task_file(ctx, file);
9298         if (ret) {
9299                 put_unused_fd(fd);
9300                 return ret;
9301         }
9302         fd_install(fd, file);
9303         return fd;
9304 }
9305
9306 /*
9307  * Allocate an anonymous fd, this is what constitutes the application
9308  * visible backing of an io_uring instance. The application mmaps this
9309  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9310  * we have to tie this fd to a socket for file garbage collection purposes.
9311  */
9312 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
9313 {
9314         struct file *file;
9315 #if defined(CONFIG_UNIX)
9316         int ret;
9317
9318         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9319                                 &ctx->ring_sock);
9320         if (ret)
9321                 return ERR_PTR(ret);
9322 #endif
9323
9324         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9325                                         O_RDWR | O_CLOEXEC);
9326 #if defined(CONFIG_UNIX)
9327         if (IS_ERR(file)) {
9328                 sock_release(ctx->ring_sock);
9329                 ctx->ring_sock = NULL;
9330         } else {
9331                 ctx->ring_sock->file = file;
9332         }
9333 #endif
9334         return file;
9335 }
9336
9337 static int io_uring_create(unsigned entries, struct io_uring_params *p,
9338                            struct io_uring_params __user *params)
9339 {
9340         struct io_ring_ctx *ctx;
9341         struct file *file;
9342         int ret;
9343
9344         if (!entries)
9345                 return -EINVAL;
9346         if (entries > IORING_MAX_ENTRIES) {
9347                 if (!(p->flags & IORING_SETUP_CLAMP))
9348                         return -EINVAL;
9349                 entries = IORING_MAX_ENTRIES;
9350         }
9351
9352         /*
9353          * Use twice as many entries for the CQ ring. It's possible for the
9354          * application to drive a higher depth than the size of the SQ ring,
9355          * since the sqes are only used at submission time. This allows for
9356          * some flexibility in overcommitting a bit. If the application has
9357          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9358          * of CQ ring entries manually.
9359          */
9360         p->sq_entries = roundup_pow_of_two(entries);
9361         if (p->flags & IORING_SETUP_CQSIZE) {
9362                 /*
9363                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
9364                  * to a power-of-two, if it isn't already. We do NOT impose
9365                  * any cq vs sq ring sizing.
9366                  */
9367                 if (!p->cq_entries)
9368                         return -EINVAL;
9369                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9370                         if (!(p->flags & IORING_SETUP_CLAMP))
9371                                 return -EINVAL;
9372                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
9373                 }
9374                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
9375                 if (p->cq_entries < p->sq_entries)
9376                         return -EINVAL;
9377         } else {
9378                 p->cq_entries = 2 * p->sq_entries;
9379         }
9380
9381         ctx = io_ring_ctx_alloc(p);
9382         if (!ctx)
9383                 return -ENOMEM;
9384         ctx->compat = in_compat_syscall();
9385         if (!capable(CAP_IPC_LOCK))
9386                 ctx->user = get_uid(current_user());
9387
9388         /*
9389          * This is just grabbed for accounting purposes. When a process exits,
9390          * the mm is exited and dropped before the files, hence we need to hang
9391          * on to this mm purely for the purposes of being able to unaccount
9392          * memory (locked/pinned vm). It's not used for anything else.
9393          */
9394         mmgrab(current->mm);
9395         ctx->mm_account = current->mm;
9396
9397         ret = io_allocate_scq_urings(ctx, p);
9398         if (ret)
9399                 goto err;
9400
9401         ret = io_sq_offload_create(ctx, p);
9402         if (ret)
9403                 goto err;
9404
9405         if (!(p->flags & IORING_SETUP_R_DISABLED))
9406                 io_sq_offload_start(ctx);
9407
9408         memset(&p->sq_off, 0, sizeof(p->sq_off));
9409         p->sq_off.head = offsetof(struct io_rings, sq.head);
9410         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9411         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9412         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9413         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9414         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9415         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
9416
9417         memset(&p->cq_off, 0, sizeof(p->cq_off));
9418         p->cq_off.head = offsetof(struct io_rings, cq.head);
9419         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9420         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9421         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9422         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9423         p->cq_off.cqes = offsetof(struct io_rings, cqes);
9424         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
9425
9426         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9427                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
9428                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
9429                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9430                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
9431
9432         if (copy_to_user(params, p, sizeof(*p))) {
9433                 ret = -EFAULT;
9434                 goto err;
9435         }
9436
9437         file = io_uring_get_file(ctx);
9438         if (IS_ERR(file)) {
9439                 ret = PTR_ERR(file);
9440                 goto err;
9441         }
9442
9443         /*
9444          * Install ring fd as the very last thing, so we don't risk someone
9445          * having closed it before we finish setup
9446          */
9447         ret = io_uring_install_fd(ctx, file);
9448         if (ret < 0) {
9449                 /* fput will clean it up */
9450                 fput(file);
9451                 return ret;
9452         }
9453
9454         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
9455         return ret;
9456 err:
9457         io_ring_ctx_wait_and_kill(ctx);
9458         return ret;
9459 }
9460
9461 /*
9462  * Sets up an aio uring context, and returns the fd. Applications asks for a
9463  * ring size, we return the actual sq/cq ring sizes (among other things) in the
9464  * params structure passed in.
9465  */
9466 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9467 {
9468         struct io_uring_params p;
9469         int i;
9470
9471         if (copy_from_user(&p, params, sizeof(p)))
9472                 return -EFAULT;
9473         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9474                 if (p.resv[i])
9475                         return -EINVAL;
9476         }
9477
9478         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
9479                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9480                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9481                         IORING_SETUP_R_DISABLED))
9482                 return -EINVAL;
9483
9484         return  io_uring_create(entries, &p, params);
9485 }
9486
9487 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9488                 struct io_uring_params __user *, params)
9489 {
9490         return io_uring_setup(entries, params);
9491 }
9492
9493 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9494 {
9495         struct io_uring_probe *p;
9496         size_t size;
9497         int i, ret;
9498
9499         size = struct_size(p, ops, nr_args);
9500         if (size == SIZE_MAX)
9501                 return -EOVERFLOW;
9502         p = kzalloc(size, GFP_KERNEL);
9503         if (!p)
9504                 return -ENOMEM;
9505
9506         ret = -EFAULT;
9507         if (copy_from_user(p, arg, size))
9508                 goto out;
9509         ret = -EINVAL;
9510         if (memchr_inv(p, 0, size))
9511                 goto out;
9512
9513         p->last_op = IORING_OP_LAST - 1;
9514         if (nr_args > IORING_OP_LAST)
9515                 nr_args = IORING_OP_LAST;
9516
9517         for (i = 0; i < nr_args; i++) {
9518                 p->ops[i].op = i;
9519                 if (!io_op_defs[i].not_supported)
9520                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
9521         }
9522         p->ops_len = i;
9523
9524         ret = 0;
9525         if (copy_to_user(arg, p, size))
9526                 ret = -EFAULT;
9527 out:
9528         kfree(p);
9529         return ret;
9530 }
9531
9532 static int io_register_personality(struct io_ring_ctx *ctx)
9533 {
9534         const struct cred *creds;
9535         int ret;
9536
9537         creds = get_current_cred();
9538
9539         ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
9540                                 USHRT_MAX, GFP_KERNEL);
9541         if (ret < 0)
9542                 put_cred(creds);
9543         return ret;
9544 }
9545
9546 static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9547                                     unsigned int nr_args)
9548 {
9549         struct io_uring_restriction *res;
9550         size_t size;
9551         int i, ret;
9552
9553         /* Restrictions allowed only if rings started disabled */
9554         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9555                 return -EBADFD;
9556
9557         /* We allow only a single restrictions registration */
9558         if (ctx->restrictions.registered)
9559                 return -EBUSY;
9560
9561         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9562                 return -EINVAL;
9563
9564         size = array_size(nr_args, sizeof(*res));
9565         if (size == SIZE_MAX)
9566                 return -EOVERFLOW;
9567
9568         res = memdup_user(arg, size);
9569         if (IS_ERR(res))
9570                 return PTR_ERR(res);
9571
9572         ret = 0;
9573
9574         for (i = 0; i < nr_args; i++) {
9575                 switch (res[i].opcode) {
9576                 case IORING_RESTRICTION_REGISTER_OP:
9577                         if (res[i].register_op >= IORING_REGISTER_LAST) {
9578                                 ret = -EINVAL;
9579                                 goto out;
9580                         }
9581
9582                         __set_bit(res[i].register_op,
9583                                   ctx->restrictions.register_op);
9584                         break;
9585                 case IORING_RESTRICTION_SQE_OP:
9586                         if (res[i].sqe_op >= IORING_OP_LAST) {
9587                                 ret = -EINVAL;
9588                                 goto out;
9589                         }
9590
9591                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
9592                         break;
9593                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
9594                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
9595                         break;
9596                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
9597                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
9598                         break;
9599                 default:
9600                         ret = -EINVAL;
9601                         goto out;
9602                 }
9603         }
9604
9605 out:
9606         /* Reset all restrictions if an error happened */
9607         if (ret != 0)
9608                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
9609         else
9610                 ctx->restrictions.registered = true;
9611
9612         kfree(res);
9613         return ret;
9614 }
9615
9616 static int io_register_enable_rings(struct io_ring_ctx *ctx)
9617 {
9618         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9619                 return -EBADFD;
9620
9621         if (ctx->restrictions.registered)
9622                 ctx->restricted = 1;
9623
9624         io_sq_offload_start(ctx);
9625         return 0;
9626 }
9627
9628 static bool io_register_op_must_quiesce(int op)
9629 {
9630         switch (op) {
9631         case IORING_UNREGISTER_FILES:
9632         case IORING_REGISTER_FILES_UPDATE:
9633         case IORING_REGISTER_PROBE:
9634         case IORING_REGISTER_PERSONALITY:
9635         case IORING_UNREGISTER_PERSONALITY:
9636                 return false;
9637         default:
9638                 return true;
9639         }
9640 }
9641
9642 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
9643                                void __user *arg, unsigned nr_args)
9644         __releases(ctx->uring_lock)
9645         __acquires(ctx->uring_lock)
9646 {
9647         int ret;
9648
9649         /*
9650          * We're inside the ring mutex, if the ref is already dying, then
9651          * someone else killed the ctx or is already going through
9652          * io_uring_register().
9653          */
9654         if (percpu_ref_is_dying(&ctx->refs))
9655                 return -ENXIO;
9656
9657         if (io_register_op_must_quiesce(opcode)) {
9658                 percpu_ref_kill(&ctx->refs);
9659
9660                 /*
9661                  * Drop uring mutex before waiting for references to exit. If
9662                  * another thread is currently inside io_uring_enter() it might
9663                  * need to grab the uring_lock to make progress. If we hold it
9664                  * here across the drain wait, then we can deadlock. It's safe
9665                  * to drop the mutex here, since no new references will come in
9666                  * after we've killed the percpu ref.
9667                  */
9668                 mutex_unlock(&ctx->uring_lock);
9669                 do {
9670                         ret = wait_for_completion_interruptible(&ctx->ref_comp);
9671                         if (!ret)
9672                                 break;
9673                         ret = io_run_task_work_sig();
9674                         if (ret < 0)
9675                                 break;
9676                 } while (1);
9677
9678                 mutex_lock(&ctx->uring_lock);
9679
9680                 if (ret) {
9681                         percpu_ref_resurrect(&ctx->refs);
9682                         goto out_quiesce;
9683                 }
9684         }
9685
9686         if (ctx->restricted) {
9687                 if (opcode >= IORING_REGISTER_LAST) {
9688                         ret = -EINVAL;
9689                         goto out;
9690                 }
9691
9692                 if (!test_bit(opcode, ctx->restrictions.register_op)) {
9693                         ret = -EACCES;
9694                         goto out;
9695                 }
9696         }
9697
9698         switch (opcode) {
9699         case IORING_REGISTER_BUFFERS:
9700                 ret = io_sqe_buffers_register(ctx, arg, nr_args);
9701                 break;
9702         case IORING_UNREGISTER_BUFFERS:
9703                 ret = -EINVAL;
9704                 if (arg || nr_args)
9705                         break;
9706                 ret = io_sqe_buffers_unregister(ctx);
9707                 break;
9708         case IORING_REGISTER_FILES:
9709                 ret = io_sqe_files_register(ctx, arg, nr_args);
9710                 break;
9711         case IORING_UNREGISTER_FILES:
9712                 ret = -EINVAL;
9713                 if (arg || nr_args)
9714                         break;
9715                 ret = io_sqe_files_unregister(ctx);
9716                 break;
9717         case IORING_REGISTER_FILES_UPDATE:
9718                 ret = io_sqe_files_update(ctx, arg, nr_args);
9719                 break;
9720         case IORING_REGISTER_EVENTFD:
9721         case IORING_REGISTER_EVENTFD_ASYNC:
9722                 ret = -EINVAL;
9723                 if (nr_args != 1)
9724                         break;
9725                 ret = io_eventfd_register(ctx, arg);
9726                 if (ret)
9727                         break;
9728                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
9729                         ctx->eventfd_async = 1;
9730                 else
9731                         ctx->eventfd_async = 0;
9732                 break;
9733         case IORING_UNREGISTER_EVENTFD:
9734                 ret = -EINVAL;
9735                 if (arg || nr_args)
9736                         break;
9737                 ret = io_eventfd_unregister(ctx);
9738                 break;
9739         case IORING_REGISTER_PROBE:
9740                 ret = -EINVAL;
9741                 if (!arg || nr_args > 256)
9742                         break;
9743                 ret = io_probe(ctx, arg, nr_args);
9744                 break;
9745         case IORING_REGISTER_PERSONALITY:
9746                 ret = -EINVAL;
9747                 if (arg || nr_args)
9748                         break;
9749                 ret = io_register_personality(ctx);
9750                 break;
9751         case IORING_UNREGISTER_PERSONALITY:
9752                 ret = -EINVAL;
9753                 if (arg)
9754                         break;
9755                 ret = io_unregister_personality(ctx, nr_args);
9756                 break;
9757         case IORING_REGISTER_ENABLE_RINGS:
9758                 ret = -EINVAL;
9759                 if (arg || nr_args)
9760                         break;
9761                 ret = io_register_enable_rings(ctx);
9762                 break;
9763         case IORING_REGISTER_RESTRICTIONS:
9764                 ret = io_register_restrictions(ctx, arg, nr_args);
9765                 break;
9766         default:
9767                 ret = -EINVAL;
9768                 break;
9769         }
9770
9771 out:
9772         if (io_register_op_must_quiesce(opcode)) {
9773                 /* bring the ctx back to life */
9774                 percpu_ref_reinit(&ctx->refs);
9775 out_quiesce:
9776                 reinit_completion(&ctx->ref_comp);
9777         }
9778         return ret;
9779 }
9780
9781 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
9782                 void __user *, arg, unsigned int, nr_args)
9783 {
9784         struct io_ring_ctx *ctx;
9785         long ret = -EBADF;
9786         struct fd f;
9787
9788         f = fdget(fd);
9789         if (!f.file)
9790                 return -EBADF;
9791
9792         ret = -EOPNOTSUPP;
9793         if (f.file->f_op != &io_uring_fops)
9794                 goto out_fput;
9795
9796         ctx = f.file->private_data;
9797
9798         io_run_task_work();
9799
9800         mutex_lock(&ctx->uring_lock);
9801         ret = __io_uring_register(ctx, opcode, arg, nr_args);
9802         mutex_unlock(&ctx->uring_lock);
9803         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
9804                                                         ctx->cq_ev_fd != NULL, ret);
9805 out_fput:
9806         fdput(f);
9807         return ret;
9808 }
9809
9810 static int __init io_uring_init(void)
9811 {
9812 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
9813         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
9814         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
9815 } while (0)
9816
9817 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
9818         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
9819         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
9820         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
9821         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
9822         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
9823         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
9824         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
9825         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
9826         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
9827         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
9828         BUILD_BUG_SQE_ELEM(24, __u32,  len);
9829         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
9830         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
9831         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
9832         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
9833         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
9834         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
9835         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
9836         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
9837         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
9838         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
9839         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
9840         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
9841         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
9842         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
9843         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
9844         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
9845         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
9846         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
9847         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
9848
9849         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
9850         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
9851         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
9852                                 SLAB_ACCOUNT);
9853         return 0;
9854 };
9855 __initcall(io_uring_init);