fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
   34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
   35  * on data shared between the kernel and application. This is done both
   36  * for ordering purposes and to ensure that once a value is loaded from
   37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73 #include <linux/namei.h>
  74 #include <linux/fsnotify.h>
  75 #include <linux/fadvise.h>
  76 #include <linux/eventpoll.h>
  77 #include <linux/splice.h>
  78 #include <linux/task_work.h>
  79 #include <linux/pagemap.h>
  80 #include <linux/io_uring.h>
  81 #include <linux/freezer.h>
  82
  83 #define CREATE_TRACE_POINTS
  84 #include <trace/events/io_uring.h>
  85
  86 #include <uapi/linux/io_uring.h>
  87
  88 #include "internal.h"
  89 #include "io-wq.h"
  90
   91 #define IORING_MAX_ENTRIES      32768   /* ring-size cap; presumably for SQ entries -- users are not in this part of the file */
   92 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)        /* twice the cap above, i.e. 65536 */
  93
   94 /*
   95  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
       * (NOTE(review): 512 * 8 bytes = 4096, so this assumes 8-byte entries
       * and 4K pages -- confirm for other page sizes).
   96  */
   97 #define IORING_FILE_TABLE_SHIFT 9
   98 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)  /* 512 */
   99 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)     /* 511 (0x1ff) */
  100 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)    /* 32768 */
      /* Sum of the three *_LAST bounds; those are not defined in this part of the file. */
  101 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
  102                                  IORING_REGISTER_LAST + IORING_OP_LAST)
 103
  104 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
  105                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
  106                                 IOSQE_BUFFER_SELECT)
      /* ^ OR of six IOSQE_* flags; presumably the set accepted in an sqe's flags -- confirm at the use site. */
 107
  108 struct io_uring {
              /* One head/tail index pair; each index is cacheline-aligned on SMP builds. */
  109         u32 head ____cacheline_aligned_in_smp;
  110         u32 tail ____cacheline_aligned_in_smp;
  111 };
 112
  113 /*
  114  * This data is shared with the application through the mmap at offsets
  115  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
  116  *
  117  * The offsets to the member fields are published through struct
  118  * io_sqring_offsets when calling io_uring_setup.
       *
       * cqes[] is a flexible array member, so the completion entries are laid
       * out directly after the fixed-size fields of this struct.
  119  */
  120 struct io_rings {
  121         /*
  122          * Head and tail offsets into the ring; the offsets need to be
  123          * masked to get valid indices.
  124          *
  125          * The kernel controls head of the sq ring and the tail of the cq ring,
  126          * and the application controls tail of the sq ring and the head of the
  127          * cq ring.
  128          */
  129         struct io_uring         sq, cq;
  130         /*
  131          * Bitmasks to apply to head and tail offsets (constant, equals
  132          * ring_entries - 1)
  133          */
  134         u32                     sq_ring_mask, cq_ring_mask;
  135         /* Ring sizes (constant, power of 2) */
  136         u32                     sq_ring_entries, cq_ring_entries;
  137         /*
  138          * Number of invalid entries dropped by the kernel due to an
  139          * invalid index stored in the array
  140          *
  141          * Written by the kernel, shouldn't be modified by the
  142          * application (i.e. get number of "new events" by comparing to
  143          * cached value).
  144          *
  145          * After a new SQ head value was read by the application this
  146          * counter includes all submissions that were dropped reaching
  147          * the new SQ head (and possibly more).
  148          */
  149         u32                     sq_dropped;
  150         /*
  151          * Runtime SQ flags
  152          *
  153          * Written by the kernel, shouldn't be modified by the
  154          * application.
  155          *
  156          * The application needs a full memory barrier before checking
  157          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
  158          */
  159         u32                     sq_flags;
  160         /*
  161          * Runtime CQ flags
  162          *
  163          * Written by the application, shouldn't be modified by the
  164          * kernel.
  165          */
  166         u32                     cq_flags;
  167         /*
  168          * Number of completion events lost because the queue was full;
  169          * this should be avoided by the application by making sure
  170          * there are not more requests pending than there is space in
  171          * the completion queue.
  172          *
  173          * Written by the kernel, shouldn't be modified by the
  174          * application (i.e. get number of "new events" by comparing to
  175          * cached value).
  176          *
  177          * As completion events come in out of order this counter is not
  178          * ordered with any other data.
  179          */
  180         u32                     cq_overflow;
  181         /*
  182          * Ring buffer of completion events.
  183          *
  184          * The kernel writes completion events fresh every time they are
  185          * produced, so the application is allowed to modify pending
  186          * entries.
  187          */
  188         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
  189 };
 190
  191 enum io_uring_cmd_flags {
              /* Two distinct single-bit values (1 and 2); their users are not in this part of the file. */
  192         IO_URING_F_NONBLOCK             = 1,
  193         IO_URING_F_COMPLETE_DEFER       = 2,
  194 };
 195
  196 struct io_mapped_ubuf {
              /*
               * Descriptor type behind io_ring_ctx->user_bufs, which that struct
               * labels "fixed mapped user buffers".
               */
  197         u64             ubuf;
  198         size_t          len;
  199         struct          bio_vec *bvec;
  200         unsigned int    nr_bvecs;       /* presumably the number of entries in ->bvec -- confirm */
  201         unsigned long   acct_pages;     /* presumably pages charged for accounting -- confirm */
  202 };
 203
  204 struct io_ring_ctx;     /* forward declaration; the full definition appears later in this file */
 205
  206 struct io_rsrc_put {
              /*
               * One resource handed to a fixed_rsrc_ref_node's ->rsrc_put() callback
               * (that callback takes a struct io_rsrc_put *).
               */
  207         struct list_head list;  /* presumably linked on fixed_rsrc_ref_node.rsrc_list -- confirm */
  208         union {
                      /* the same pointer, viewed generically or as a file */
  209                 void *rsrc;
  210                 struct file *file;
  211         };
  212 };
 213
  214 struct fixed_rsrc_table {
              /* Holds a pointer to struct file pointers; presumably one IORING_MAX_FILES_TABLE-sized chunk -- confirm. */
  215         struct file             **files;
  216 };
 217
  218 struct fixed_rsrc_ref_node {
              /*
               * Reference-counted node tied to a fixed_rsrc_data (->rsrc_data); it
               * carries a list of resources and the callback used to put them.
               */
  219         struct percpu_ref               refs;
  220         struct list_head                node;   /* presumably linked on io_ring_ctx.rsrc_ref_list -- confirm */
  221         struct list_head                rsrc_list;
  222         struct fixed_rsrc_data          *rsrc_data;
              /* release callback; receives the owning ctx and one io_rsrc_put */
  223         void                            (*rsrc_put)(struct io_ring_ctx *ctx,
  224                                                     struct io_rsrc_put *prsrc);
  225         struct llist_node               llist;  /* presumably queued on io_ring_ctx.rsrc_put_llist -- confirm */
  226         bool                            done;
  227 };
 228
  229 struct fixed_rsrc_data {
              /*
               * Bookkeeping for a set of fixed resources; io_ring_ctx points at one
               * through ->file_data for its fixed file set.
               */
  230         struct fixed_rsrc_table         *table;
  231         struct io_ring_ctx              *ctx;   /* back-pointer to the owning ring context */
  232
  233         struct fixed_rsrc_ref_node      *node;  /* presumably the currently active ref node -- confirm */
  234         struct percpu_ref               refs;
  235         struct completion               done;
  236         bool                            quiesce;
  237 };
 238
  239 struct io_buffer {
              /*
               * A list-linked buffer record (address, length, buffer id). The fields
               * mirror addr/len/bid in struct io_provide_buf; presumably these are the
               * entries kept in io_ring_ctx.io_buffers -- confirm.
               */
  240         struct list_head list;
  241         __u64 addr;
  242         __s32 len;
  243         __u16 bid;
  244 };
 245
  246 struct io_restriction {
              /* Restriction state embedded in io_ring_ctx as ->restrictions. */
              /* one bit per register opcode / per sqe opcode (bitmaps sized by the *_LAST bounds) */
  247         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
  248         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
  249         u8 sqe_flags_allowed;
  250         u8 sqe_flags_required;
  251         bool registered;
  252 };
 253
  254 enum {
              /* Values 0 and 1; presumably bit numbers within io_sq_data.state -- confirm at the use sites. */
  255         IO_SQ_THREAD_SHOULD_STOP = 0,
  256         IO_SQ_THREAD_SHOULD_PARK,
  257 };
 258
  259 struct io_sq_data {
              /*
               * State for SQ thread polling; io_ring_ctx.sq_data points here "if
               * using sq thread polling". It is reference counted, and ->ctx_list
               * tracks the ctx's using it.
               */
  260         refcount_t              refs;
  261         atomic_t                park_pending;
  262         struct mutex            lock;
  263
  264         /* ctx's that are using this sqd */
  265         struct list_head        ctx_list;
  266
  267         struct task_struct      *thread;
  268         struct wait_queue_head  wait;
  269
  270         unsigned                sq_thread_idle;
  271         int                     sq_cpu;
  272         pid_t                   task_pid;
  273         pid_t                   task_tgid;
  274
  275         unsigned long           state;  /* presumably holds the IO_SQ_THREAD_SHOULD_* bits -- confirm */
  276         struct completion       exited;
  277 };
 278
  279 #define IO_IOPOLL_BATCH                 8
  280 #define IO_COMPL_BATCH                  32      /* capacity of io_comp_state.reqs[] */
  281 #define IO_REQ_CACHE_SIZE               32      /* capacity of io_submit_state.reqs[] */
  282 #define IO_REQ_ALLOC_BATCH              8
 283
  284 struct io_comp_state {
              /* Batched-completion state, embedded in io_submit_state as ->comp. */
  285         struct io_kiocb         *reqs[IO_COMPL_BATCH];
  286         unsigned int            nr;             /* presumably the number of valid entries in reqs[] -- confirm */
  287         unsigned int            locked_free_nr;
  288         /* inline/task_work completion list, under ->uring_lock */
  289         struct list_head        free_list;
  290         /* IRQ completion list, under ->completion_lock */
  291         struct list_head        locked_free_list;
  292 };
 293
  294 struct io_submit_link {
              /* Head and last request of a link chain; embedded in io_submit_state as ->link. */
  295         struct io_kiocb         *head;
  296         struct io_kiocb         *last;
  297 };
 298
  299 struct io_submit_state {
              /* Submission-time state; io_ring_ctx embeds one instance as ->submit_state. */
  300         struct blk_plug         plug;
  301         struct io_submit_link   link;
  302
  303         /*
  304          * io_kiocb alloc cache
  305          */
  306         void                    *reqs[IO_REQ_CACHE_SIZE];
  307         unsigned int            free_reqs;      /* presumably how many cached entries in reqs[] are available -- confirm */
  308
  309         bool                    plug_started;
  310
  311         /*
  312          * Batch completion logic
  313          */
  314         struct io_comp_state    comp;
  315
  316         /*
  317          * File reference cache
  318          */
  319         struct file             *file;
  320         unsigned int            fd;
  321         unsigned int            file_refs;
  322         unsigned int            ios_left;
  323 };
 324
  325 struct io_ring_ctx {
              /*
               * Context for one io_uring instance: the shared rings pointer, locks,
               * registered files/buffers and completion state. Related fields are
               * grouped in anonymous structs, each cacheline-aligned on SMP builds.
               */
  326         struct {
  327                 struct percpu_ref       refs;
  328         } ____cacheline_aligned_in_smp;
  329
  330         struct {
  331                 unsigned int            flags;
  332                 unsigned int            compat: 1;
  333                 unsigned int            cq_overflow_flushed: 1;
  334                 unsigned int            drain_next: 1;
  335                 unsigned int            eventfd_async: 1;
  336                 unsigned int            restricted: 1;
  337
  338                 /*
  339                  * Ring buffer of indices into array of io_uring_sqe, which is
  340                  * mmapped by the application using the IORING_OFF_SQES offset.
  341                  *
  342                  * This indirection could e.g. be used to assign fixed
  343                  * io_uring_sqe entries to operations and only submit them to
  344                  * the queue when needed.
  345                  *
  346                  * The kernel modifies neither the indices array nor the entries
  347                  * array.
  348                  */
  349                 u32                     *sq_array;
  350                 unsigned                cached_sq_head;
  351                 unsigned                sq_entries;
  352                 unsigned                sq_mask;
  353                 unsigned                sq_thread_idle;
  354                 unsigned                cached_sq_dropped;
  355                 unsigned                cached_cq_overflow;
  356                 unsigned long           sq_check_overflow;
  357
  358                 /* hashed buffered write serialization */
  359                 struct io_wq_hash       *hash_map;
  360
  361                 struct list_head        defer_list;
  362                 struct list_head        timeout_list;
  363                 struct list_head        cq_overflow_list;
  364
  365                 struct io_uring_sqe     *sq_sqes;
  366         } ____cacheline_aligned_in_smp;
  367
  368         struct {
  369                 struct mutex            uring_lock;
  370                 wait_queue_head_t       wait;
  371         } ____cacheline_aligned_in_smp;
  372
  373         struct io_submit_state          submit_state;
  374
  375         struct io_rings *rings;         /* ring data shared with the application, see struct io_rings */
  376
  377         /* Only used for accounting purposes */
  378         struct mm_struct        *mm_account;
  379
  380         const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
  381         struct io_sq_data       *sq_data;       /* if using sq thread polling */
  382
  383         struct wait_queue_head  sqo_sq_wait;
  384         struct list_head        sqd_list;
  385
  386         /*
  387          * If used, fixed file set. Writers must ensure that ->refs is dead,
  388          * readers must ensure that ->refs is alive as long as the file* is
  389          * used. Only updated through io_uring_register(2).
  390          */
  391         struct fixed_rsrc_data  *file_data;
  392         unsigned                nr_user_files;
  393
  394         /* if used, fixed mapped user buffers */
  395         unsigned                nr_user_bufs;
  396         struct io_mapped_ubuf   *user_bufs;
  397
  398         struct user_struct      *user;
  399
  400         struct completion       ref_comp;
  401
  402 #if defined(CONFIG_UNIX)
  403         struct socket           *ring_sock;
  404 #endif
  405
  406         struct xarray           io_buffers;
  407
  408         struct xarray           personalities;
  409         u32                     pers_next;
  410
  411         struct {
  412                 unsigned                cached_cq_tail;
  413                 unsigned                cq_entries;
  414                 unsigned                cq_mask;
  415                 atomic_t                cq_timeouts;
  416                 unsigned                cq_last_tm_flush;
  417                 unsigned long           cq_check_overflow;
  418                 struct wait_queue_head  cq_wait;
  419                 struct fasync_struct    *cq_fasync;
  420                 struct eventfd_ctx      *cq_ev_fd;
  421         } ____cacheline_aligned_in_smp;
  422
  423         struct {
  424                 spinlock_t              completion_lock;
  425
  426                 /*
  427                  * ->iopoll_list is protected by the ctx->uring_lock for
  428                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
  429                  * For SQPOLL, only the single threaded io_sq_thread() will
  430                  * manipulate the list, hence no extra locking is needed there.
  431                  */
  432                 struct list_head        iopoll_list;
  433                 struct hlist_head       *cancel_hash;
  434                 unsigned                cancel_hash_bits;
  435                 bool                    poll_multi_file;
  436
  437                 spinlock_t              inflight_lock;
  438                 struct list_head        inflight_list;
  439         } ____cacheline_aligned_in_smp;
  440
  441         struct delayed_work             rsrc_put_work;
  442         struct llist_head               rsrc_put_llist;
  443         struct list_head                rsrc_ref_list;
  444         spinlock_t                      rsrc_ref_lock;
  445
  446         struct io_restriction           restrictions;
  447
  448         /* exit task_work */
  449         struct callback_head            *exit_task_work;
  450
  451         struct wait_queue_head          hash_wait;
  452
  453         /* Keep this last, we don't need it for the fast path */
              /* NOTE(review): tctx_list is declared after exit_work, so exit_work is no longer literally last. */
  454         struct work_struct              exit_work;
  455         struct list_head                tctx_list;
  456 };
 457
  458 /*
  459  * First field must be the file pointer in all the
  460  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
       *
       * In this file, struct io_rw places a struct kiocb first instead, and
       * struct io_splice names its first pointer file_out.
  461  */
  462 struct io_poll_iocb {
              /* Poll state; used as io_kiocb's ->poll union member and inside struct async_poll. */
  463         struct file                     *file;
  464         struct wait_queue_head          *head;
  465         __poll_t                        events;
  466         bool                            done;
  467         bool                            canceled;
  468         struct wait_queue_entry         wait;
  469 };
 470
  471 struct io_poll_remove {
              /* io_kiocb union member ->poll_remove; file pointer first, as for every iocb union member. */
  472         struct file                     *file;
  473         u64                             addr;
  474 };
 475
  476 struct io_close {
              /* io_kiocb union member ->close; file pointer first, then the descriptor number. */
  477         struct file                     *file;
  478         int                             fd;
  479 };
 480
  481 struct io_timeout_data {
              /*
               * Timer state for a request: a back-pointer to the request, an hrtimer,
               * a timespec and the hrtimer mode. It is not an io_kiocb union member;
               * presumably reached through io_kiocb.async_data -- confirm.
               */
  482         struct io_kiocb                 *req;
  483         struct hrtimer                  timer;
  484         struct timespec64               ts;
  485         enum hrtimer_mode               mode;
  486 };
 487
  488 struct io_accept {
              /* io_kiocb union member ->accept; addr and addr_len are user-space pointers (__user). */
  489         struct file                     *file;
  490         struct sockaddr __user          *addr;
  491         int __user                      *addr_len;
  492         int                             flags;
  493         unsigned long                   nofile;         /* presumably an open-file limit captured for this request -- confirm */
  494 };
 495
  496 struct io_sync {
              /* io_kiocb union member ->sync: file, a length/offset pair, plus flags and mode. */
  497         struct file                     *file;
  498         loff_t                          len;
  499         loff_t                          off;
  500         int                             flags;
  501         int                             mode;
  502 };
 503
  504 struct io_cancel {
              /* io_kiocb union member ->cancel; file pointer first, as for every iocb union member. */
  505         struct file                     *file;
  506         u64                             addr;
  507 };
 508
  509 struct io_timeout {
              /* io_kiocb union member ->timeout. */
  510         struct file                     *file;
  511         u32                             off;
  512         u32                             target_seq;
  513         struct list_head                list;           /* presumably linked on io_ring_ctx.timeout_list -- confirm */
  514         /* head of the link, used by linked timeouts only */
  515         struct io_kiocb                 *head;
  516 };
 517
  518 struct io_timeout_rem {
              /* io_kiocb union member ->timeout_rem; ts and flags serve the "timeout update" case noted below. */
  519         struct file                     *file;
  520         u64                             addr;
  521
  522         /* timeout update */
  523         struct timespec64               ts;
  524         u32                             flags;
  525 };
 526
  527 struct io_rw {
              /* io_kiocb union member ->rw: an embedded kiocb followed by an address and a length. */
  528         /* NOTE: kiocb has the file as the first member, so don't do it here */
  529         struct kiocb                    kiocb;
  530         u64                             addr;
  531         u64                             len;
  532 };
 533
  534 struct io_connect {
              /* io_kiocb union member ->connect; addr is a user-space pointer (__user). */
  535         struct file                     *file;
  536         struct sockaddr __user          *addr;
  537         int                             addr_len;
  538 };
 539
  540 struct io_sr_msg {
              /* io_kiocb union member ->sr_msg. */
  541         struct file                     *file;
  542         union {
                      /* one user pointer, read either as a msghdr or as a plain buffer */
  543                 struct user_msghdr __user *umsg;
  544                 void __user             *buf;
  545         };
  546         int                             msg_flags;
  547         int                             bgid;           /* presumably a buffer group id -- confirm */
  548         size_t                          len;
  549         struct io_buffer                *kbuf;
  550 };
 551
  552 struct io_open {
              /* io_kiocb union member ->open: directory fd, filename and an embedded struct open_how. */
  553         struct file                     *file;
  554         int                             dfd;
  555         struct filename                 *filename;
  556         struct open_how                 how;
  557         unsigned long                   nofile;         /* presumably an open-file limit captured for this request -- confirm */
  558 };
 559
  560 struct io_rsrc_update {
              /* io_kiocb union member ->rsrc_update: an argument value, its count and an offset. */
  561         struct file                     *file;
  562         u64                             arg;
  563         u32                             nr_args;
  564         u32                             offset;
  565 };
 566
  567 struct io_fadvise {
              /* io_kiocb union member ->fadvise: offset, length and advice value. */
  568         struct file                     *file;
  569         u64                             offset;
  570         u32                             len;
  571         u32                             advice;
  572 };
 573
  574 struct io_madvise {
              /* io_kiocb union member ->madvise: address, length and advice value. */
  575         struct file                     *file;
  576         u64                             addr;
  577         u32                             len;
  578         u32                             advice;
  579 };
 580
  581 struct io_epoll {
              /* io_kiocb union member ->epoll: epoll fd, operation, target fd and an embedded epoll_event. */
  582         struct file                     *file;
  583         int                             epfd;
  584         int                             op;
  585         int                             fd;
  586         struct epoll_event              event;
  587 };
 588
  589 struct io_splice {
              /* io_kiocb union member ->splice; the leading file pointer is named file_out here. */
  590         struct file                     *file_out;
  591         struct file                     *file_in;
  592         loff_t                          off_out;
  593         loff_t                          off_in;
  594         u64                             len;
  595         unsigned int                    flags;
  596 };
 597
  598 struct io_provide_buf {
              /* io_kiocb union member ->pbuf; addr/len/bid have the same types as the fields of struct io_buffer. */
  599         struct file                     *file;
  600         __u64                           addr;
  601         __s32                           len;
  602         __u32                           bgid;           /* presumably a buffer group id -- confirm */
  603         __u16                           nbufs;          /* presumably the number of buffers -- confirm */
  604         __u16                           bid;
  605 };
 606
  607 struct io_statx {
              /* io_kiocb union member ->statx; filename and buffer are user-space pointers (__user). */
  608         struct file                     *file;
  609         int                             dfd;
  610         unsigned int                    mask;
  611         unsigned int                    flags;
  612         const char __user               *filename;
  613         struct statx __user             *buffer;
  614 };
 615
  616 struct io_shutdown {
              /* io_kiocb union member ->shutdown; file pointer first, then the 'how' value. */
  617         struct file                     *file;
  618         int                             how;
  619 };
 620
  621 struct io_rename {
              /* io_kiocb union member ->rename: old/new directory fds, old/new paths and flags. */
  622         struct file                     *file;
  623         int                             old_dfd;
  624         int                             new_dfd;
  625         struct filename                 *oldpath;
  626         struct filename                 *newpath;
  627         int                             flags;
  628 };
 629
  630 struct io_unlink {
              /* io_kiocb union member ->unlink: directory fd, flags and filename. */
  631         struct file                     *file;
  632         int                             dfd;
  633         int                             flags;
  634         struct filename                 *filename;
  635 };
 636
  637 struct io_completion {
              /*
               * io_kiocb union member ->compl. It overlays the per-op data, and the
               * union notes it may be used only after that data has been cleaned.
               */
  638         struct file                     *file;
  639         struct list_head                list;
  640         int                             cflags;
  641 };
 642
  643 struct io_async_connect {
              /* Holds one sockaddr_storage; presumably the async data for connect requests -- confirm. */
  644         struct sockaddr_storage         address;
  645 };
 646
  647 struct io_async_msghdr {
              /* Message-header state with an inline iovec array; presumably async data for send/recv msg requests -- confirm. */
  648         struct iovec                    fast_iov[UIO_FASTIOV];
  649         /* points to an allocated iov, if NULL we use fast_iov instead */
  650         struct iovec                    *free_iov;
  651         struct sockaddr __user          *uaddr;
  652         struct msghdr                   msg;
  653         struct sockaddr_storage         addr;
  654 };
 655
  656 struct io_async_rw {
              /*
               * Async state for read/write requests; io_op_defs[] uses
               * sizeof(struct io_async_rw) as the async_size of READV and WRITEV.
               */
  657         struct iovec                    fast_iov[UIO_FASTIOV];
  658         const struct iovec              *free_iovec;
  659         struct iov_iter                 iter;
  660         size_t                          bytes_done;
  661         struct wait_page_queue          wpq;
  662 };
 663
  664 enum {
              /*
               * Bit numbers for io_kiocb flags. The first six take their values
               * from the IOSQE_*_BIT constants; the remaining ones continue
               * counting from REQ_F_BUFFER_SELECT_BIT + 1.
               */
  665         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
  666         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
  667         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
  668         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
  669         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
  670         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
  671
  672         REQ_F_FAIL_LINK_BIT,
  673         REQ_F_INFLIGHT_BIT,
  674         REQ_F_CUR_POS_BIT,
  675         REQ_F_NOWAIT_BIT,
  676         REQ_F_LINK_TIMEOUT_BIT,
  677         REQ_F_ISREG_BIT,
  678         REQ_F_NEED_CLEANUP_BIT,
  679         REQ_F_POLLED_BIT,
  680         REQ_F_BUFFER_SELECTED_BIT,
  681         REQ_F_NO_FILE_TABLE_BIT,
  682         REQ_F_LTIMEOUT_ACTIVE_BIT,
  683         REQ_F_COMPLETE_INLINE_BIT,
  684
  685         /* not a real bit, just to check we're not overflowing the space */
  686         __REQ_F_LAST_BIT,
  687 };
 688
  689 enum {
              /* Flag masks: each REQ_F_x is BIT() of the matching REQ_F_x_BIT number. */
  690         /* ctx owns file */
  691         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
  692         /* drain existing IO first */
  693         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
  694         /* linked sqes */
  695         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
  696         /* doesn't sever on completion < 0 */
  697         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
  698         /* IOSQE_ASYNC */
  699         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
  700         /* IOSQE_BUFFER_SELECT */
  701         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
  702
  703         /* fail rest of links */
  704         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
  705         /* on inflight list, should be cancelled and waited on exit reliably */
  706         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
  707         /* read/write uses file position */
  708         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
  709         /* must not punt to workers */
  710         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
  711         /* has or had linked timeout */
  712         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
  713         /* regular file */
  714         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
  715         /* needs cleanup */
  716         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
  717         /* already went through poll handler */
  718         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
  719         /* buffer already selected */
  720         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
  721         /* doesn't need file table for this request */
  722         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
  723         /* linked timeout is active, i.e. prepared by link's head */
  724         REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
  725         /* completion is deferred through io_comp_state */
  726         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
  727 };
 728
  729 struct async_poll {
              /* Poll state referenced from io_kiocb.apoll: one embedded io_poll_iocb plus a pointer to a second one. */
  730         struct io_poll_iocb     poll;
  731         struct io_poll_iocb     *double_poll;
  732 };
 733
  734 struct io_task_work {
              /* A work-list node paired with a task_work callback; io_kiocb overlays it with a plain callback_head. */
  735         struct io_wq_work_node  node;
  736         task_work_func_t        func;
  737 };
 738
  739 /*
  740  * NOTE! Each of the iocb union members has the file pointer
  741  * as the first entry in their struct definition. So you can
  742  * access the file pointer through any of the sub-structs,
  743  * or directly as just 'file' in this struct.
  744  */
  745 struct io_kiocb {
  746         union {
  747                 struct file             *file;
  748                 struct io_rw            rw;
  749                 struct io_poll_iocb     poll;
  750                 struct io_poll_remove   poll_remove;
  751                 struct io_accept        accept;
  752                 struct io_sync          sync;
  753                 struct io_cancel        cancel;
  754                 struct io_timeout       timeout;
  755                 struct io_timeout_rem   timeout_rem;
  756                 struct io_connect       connect;
  757                 struct io_sr_msg        sr_msg;
  758                 struct io_open          open;
  759                 struct io_close         close;
  760                 struct io_rsrc_update   rsrc_update;
  761                 struct io_fadvise       fadvise;
  762                 struct io_madvise       madvise;
  763                 struct io_epoll         epoll;
  764                 struct io_splice        splice;
  765                 struct io_provide_buf   pbuf;
  766                 struct io_statx         statx;
  767                 struct io_shutdown      shutdown;
  768                 struct io_rename        rename;
  769                 struct io_unlink        unlink;
  770                 /* use only after cleaning per-op data, see io_clean_op() */
  771                 struct io_completion    compl;
  772         };
  773
  774         /* opcode allocated if it needs to store data for async defer */
  775         void                            *async_data;
  776         u8                              opcode;
  777         /* polled IO has completed */
  778         u8                              iopoll_completed;
  779
  780         u16                             buf_index;
  781         u32                             result;
  782
  783         struct io_ring_ctx              *ctx;           /* ring context this request belongs to */
  784         unsigned int                    flags;          /* presumably a mask of the REQ_F_* bits -- confirm */
  785         refcount_t                      refs;
  786         struct task_struct              *task;
  787         u64                             user_data;
  788
  789         struct io_kiocb                 *link;
  790         struct percpu_ref               *fixed_rsrc_refs;
  791
  792         /*
  793          * 1. used with ctx->iopoll_list with reads/writes
  794          * 2. to track reqs with ->files (see io_op_def::file_table)
               *
               * NOTE(review): struct io_op_def in this file has no file_table
               * member, so the reference in item 2 looks stale.
  795          */
  796         struct list_head                inflight_entry;
  797         union {
                      /* two views of the same storage for deferred task work */
  798                 struct io_task_work     io_task_work;
  799                 struct callback_head    task_work;
  800         };
  801         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
  802         struct hlist_node               hash_node;
  803         struct async_poll               *apoll;
  804         struct io_wq_work               work;
  805 };
 806
  807 struct io_tctx_node {
              /* Pairs one task with one ring context; presumably ctx_node links it on io_ring_ctx.tctx_list -- confirm. */
  808         struct list_head        ctx_node;
  809         struct task_struct      *task;
  810         struct io_ring_ctx      *ctx;
  811 };
 812
  813 struct io_defer_entry {
              /* A list entry holding a request and a sequence number; presumably queued on io_ring_ctx.defer_list -- confirm. */
  814         struct list_head        list;
  815         struct io_kiocb         *req;
  816         u32                     seq;
  817 };
 818
  819 struct io_op_def {
              /*
               * Static per-opcode properties: nine one-bit flags plus the size of
               * the opcode's async data. io_op_defs[] holds one of these per
               * IORING_OP_* index.
               */
  820         /* needs req->file assigned */
  821         unsigned                needs_file : 1;
  822         /* hash wq insertion if file is a regular file */
  823         unsigned                hash_reg_file : 1;
  824         /* unbound wq insertion if file is a non-regular file */
  825         unsigned                unbound_nonreg_file : 1;
  826         /* opcode is not supported by this kernel */
  827         unsigned                not_supported : 1;
  828         /* set if opcode supports polled "wait" */
  829         unsigned                pollin : 1;
  830         unsigned                pollout : 1;
  831         /* op supports buffer selection */
  832         unsigned                buffer_select : 1;
  833         /* must always have async data allocated */
  834         unsigned                needs_async_data : 1;
  835         /* should block plug */
  836         unsigned                plug : 1;
  837         /* size of async data needed, if any */
  838         unsigned short          async_size;
  839 };
 840
 841 static const struct io_op_def io_op_defs[] = {
 842         [IORING_OP_NOP] = {},
 843         [IORING_OP_READV] = {
 844                 .needs_file             = 1,
 845                 .unbound_nonreg_file    = 1,
 846                 .pollin                 = 1,
 847                 .buffer_select          = 1,
 848                 .needs_async_data       = 1,
 849                 .plug                   = 1,
 850                 .async_size             = sizeof(struct io_async_rw),
 851         },
 852         [IORING_OP_WRITEV] = {
 853                 .needs_file             = 1,
 854                 .hash_reg_file          = 1,
 855                 .unbound_nonreg_file    = 1,
 856                 .pollout                = 1,
 857                 .needs_async_data       = 1,
 858                 .plug                   = 1,
 859                 .async_size             = sizeof(struct io_async_rw),
 860         },
 861         [IORING_OP_FSYNC] = {
 862                 .needs_file             = 1,
 863         },
 864         [IORING_OP_READ_FIXED] = {
 865                 .needs_file             = 1,
 866                 .unbound_nonreg_file    = 1,
 867                 .pollin                 = 1,
 868                 .plug                   = 1,
 869                 .async_size             = sizeof(struct io_async_rw),
 870         },
 871         [IORING_OP_WRITE_FIXED] = {
 872                 .needs_file             = 1,
 873                 .hash_reg_file          = 1,
 874                 .unbound_nonreg_file    = 1,
 875                 .pollout                = 1,
 876                 .plug                   = 1,
 877                 .async_size             = sizeof(struct io_async_rw),
 878         },
 879         [IORING_OP_POLL_ADD] = {
 880                 .needs_file             = 1,
 881                 .unbound_nonreg_file    = 1,
 882         },
 883         [IORING_OP_POLL_REMOVE] = {},
 884         [IORING_OP_SYNC_FILE_RANGE] = {
 885                 .needs_file             = 1,
 886         },
 887         [IORING_OP_SENDMSG] = {
 888                 .needs_file             = 1,
 889                 .unbound_nonreg_file    = 1,
 890                 .pollout                = 1,
 891                 .needs_async_data       = 1,
 892                 .async_size             = sizeof(struct io_async_msghdr),
 893         },
 894         [IORING_OP_RECVMSG] = {
 895                 .needs_file             = 1,
 896                 .unbound_nonreg_file    = 1,
 897                 .pollin                 = 1,
 898                 .buffer_select          = 1,
 899                 .needs_async_data       = 1,
 900                 .async_size             = sizeof(struct io_async_msghdr),
 901         },
 902         [IORING_OP_TIMEOUT] = {
 903                 .needs_async_data       = 1,
 904                 .async_size             = sizeof(struct io_timeout_data),
 905         },
 906         [IORING_OP_TIMEOUT_REMOVE] = {
 907                 /* used by timeout updates' prep() */
 908         },
 909         [IORING_OP_ACCEPT] = {
 910                 .needs_file             = 1,
 911                 .unbound_nonreg_file    = 1,
 912                 .pollin                 = 1,
 913         },
 914         [IORING_OP_ASYNC_CANCEL] = {},
 915         [IORING_OP_LINK_TIMEOUT] = {
 916                 .needs_async_data       = 1,
 917                 .async_size             = sizeof(struct io_timeout_data),
 918         },
 919         [IORING_OP_CONNECT] = {
 920                 .needs_file             = 1,
 921                 .unbound_nonreg_file    = 1,
 922                 .pollout                = 1,
 923                 .needs_async_data       = 1,
 924                 .async_size             = sizeof(struct io_async_connect),
 925         },
 926         [IORING_OP_FALLOCATE] = {
 927                 .needs_file             = 1,
 928         },
 929         [IORING_OP_OPENAT] = {},
 930         [IORING_OP_CLOSE] = {},
 931         [IORING_OP_FILES_UPDATE] = {},
 932         [IORING_OP_STATX] = {},
 933         [IORING_OP_READ] = {
 934                 .needs_file             = 1,
 935                 .unbound_nonreg_file    = 1,
 936                 .pollin                 = 1,
 937                 .buffer_select          = 1,
 938                 .plug                   = 1,
 939                 .async_size             = sizeof(struct io_async_rw),
 940         },
 941         [IORING_OP_WRITE] = {
 942                 .needs_file             = 1,
 943                 .unbound_nonreg_file    = 1,
 944                 .pollout                = 1,
 945                 .plug                   = 1,
 946                 .async_size             = sizeof(struct io_async_rw),
 947         },
 948         [IORING_OP_FADVISE] = {
 949                 .needs_file             = 1,
 950         },
 951         [IORING_OP_MADVISE] = {},
 952         [IORING_OP_SEND] = {
 953                 .needs_file             = 1,
 954                 .unbound_nonreg_file    = 1,
 955                 .pollout                = 1,
 956         },
 957         [IORING_OP_RECV] = {
 958                 .needs_file             = 1,
 959                 .unbound_nonreg_file    = 1,
 960                 .pollin                 = 1,
 961                 .buffer_select          = 1,
 962         },
 963         [IORING_OP_OPENAT2] = {
 964         },
 965         [IORING_OP_EPOLL_CTL] = {
 966                 .unbound_nonreg_file    = 1,
 967         },
 968         [IORING_OP_SPLICE] = {
 969                 .needs_file             = 1,
 970                 .hash_reg_file          = 1,
 971                 .unbound_nonreg_file    = 1,
 972         },
 973         [IORING_OP_PROVIDE_BUFFERS] = {},
 974         [IORING_OP_REMOVE_BUFFERS] = {},
 975         [IORING_OP_TEE] = {
 976                 .needs_file             = 1,
 977                 .hash_reg_file          = 1,
 978                 .unbound_nonreg_file    = 1,
 979         },
 980         [IORING_OP_SHUTDOWN] = {
 981                 .needs_file             = 1,
 982         },
 983         [IORING_OP_RENAMEAT] = {},
 984         [IORING_OP_UNLINKAT] = {},
 985 };
 986
 987 static bool io_disarm_next(struct io_kiocb *req);
 988 static void io_uring_del_task_file(unsigned long index);
 989 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 990                                          struct task_struct *task,
 991                                          struct files_struct *files);
 992 static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
 993 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
 994 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 995                         struct io_ring_ctx *ctx);
 996 static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
 997
 998 static bool io_rw_reissue(struct io_kiocb *req);
 999 static void io_cqring_fill_event(struct io_kiocb *req, long res);
1000 static void io_put_req(struct io_kiocb *req);
1001 static void io_put_req_deferred(struct io_kiocb *req, int nr);
1002 static void io_double_put_req(struct io_kiocb *req);
1003 static void io_dismantle_req(struct io_kiocb *req);
1004 static void io_put_task(struct task_struct *task, int nr);
1005 static void io_queue_next(struct io_kiocb *req);
1006 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
1007 static void __io_queue_linked_timeout(struct io_kiocb *req);
1008 static void io_queue_linked_timeout(struct io_kiocb *req);
1009 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
1010                                  struct io_uring_rsrc_update *ip,
1011                                  unsigned nr_args);
1012 static void __io_clean_op(struct io_kiocb *req);
1013 static struct file *io_file_get(struct io_submit_state *state,
1014                                 struct io_kiocb *req, int fd, bool fixed);
1015 static void __io_queue_sqe(struct io_kiocb *req);
1016 static void io_rsrc_put_work(struct work_struct *work);
1017
1018 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
1019                            struct iov_iter *iter, bool needs_lock);
1020 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
1021                              const struct iovec *fast_iov,
1022                              struct iov_iter *iter, bool force);
1023 static void io_req_task_queue(struct io_kiocb *req);
1024 static void io_submit_flush_completions(struct io_comp_state *cs,
1025                                         struct io_ring_ctx *ctx);
1026
1027 static struct kmem_cache *req_cachep;
1028
1029 static const struct file_operations io_uring_fops;
1030
1031 struct sock *io_uring_get_socket(struct file *file)
1032 {
1033 #if defined(CONFIG_UNIX)
1034         if (file->f_op == &io_uring_fops) {
1035                 struct io_ring_ctx *ctx = file->private_data;
1036
1037                 return ctx->ring_sock->sk;
1038         }
1039 #endif
1040         return NULL;
1041 }
1042 EXPORT_SYMBOL(io_uring_get_socket);
1043
/*
 * Walk a chain of requests: @pos starts at @head and follows ->link until
 * it becomes NULL.
 */
#define io_for_each_link(pos, head)                             \
        for ((pos) = (head); (pos) != NULL; (pos) = (pos)->link)
1046
1047 static inline void io_clean_op(struct io_kiocb *req)
1048 {
1049         if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
1050                 __io_clean_op(req);
1051 }
1052
1053 static inline void io_set_resource_node(struct io_kiocb *req)
1054 {
1055         struct io_ring_ctx *ctx = req->ctx;
1056
1057         if (!req->fixed_rsrc_refs) {
1058                 req->fixed_rsrc_refs = &ctx->file_data->node->refs;
1059                 percpu_ref_get(req->fixed_rsrc_refs);
1060         }
1061 }
1062
1063 static bool io_match_task(struct io_kiocb *head,
1064                           struct task_struct *task,
1065                           struct files_struct *files)
1066 {
1067         struct io_kiocb *req;
1068
1069         if (task && head->task != task) {
1070                 /* in terms of cancelation, always match if req task is dead */
1071                 if (head->task->flags & PF_EXITING)
1072                         return true;
1073                 return false;
1074         }
1075         if (!files)
1076                 return true;
1077
1078         io_for_each_link(req, head) {
1079                 if (req->flags & REQ_F_INFLIGHT)
1080                         return true;
1081                 if (req->task->files == files)
1082                         return true;
1083         }
1084         return false;
1085 }
1086
1087 static inline void req_set_fail_links(struct io_kiocb *req)
1088 {
1089         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1090                 req->flags |= REQ_F_FAIL_LINK;
1091 }
1092
1093 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1094 {
1095         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1096
1097         complete(&ctx->ref_comp);
1098 }
1099
1100 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1101 {
1102         return !req->timeout.off;
1103 }
1104
1105 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1106 {
1107         struct io_ring_ctx *ctx;
1108         int hash_bits;
1109
1110         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1111         if (!ctx)
1112                 return NULL;
1113
1114         /*
1115          * Use 5 bits less than the max cq entries, that should give us around
1116          * 32 entries per hash list if totally full and uniformly spread.
1117          */
1118         hash_bits = ilog2(p->cq_entries);
1119         hash_bits -= 5;
1120         if (hash_bits <= 0)
1121                 hash_bits = 1;
1122         ctx->cancel_hash_bits = hash_bits;
1123         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1124                                         GFP_KERNEL);
1125         if (!ctx->cancel_hash)
1126                 goto err;
1127         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1128
1129         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1130                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1131                 goto err;
1132
1133         ctx->flags = p->flags;
1134         init_waitqueue_head(&ctx->sqo_sq_wait);
1135         INIT_LIST_HEAD(&ctx->sqd_list);
1136         init_waitqueue_head(&ctx->cq_wait);
1137         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1138         init_completion(&ctx->ref_comp);
1139         xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1140         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1141         mutex_init(&ctx->uring_lock);
1142         init_waitqueue_head(&ctx->wait);
1143         spin_lock_init(&ctx->completion_lock);
1144         INIT_LIST_HEAD(&ctx->iopoll_list);
1145         INIT_LIST_HEAD(&ctx->defer_list);
1146         INIT_LIST_HEAD(&ctx->timeout_list);
1147         spin_lock_init(&ctx->inflight_lock);
1148         INIT_LIST_HEAD(&ctx->inflight_list);
1149         spin_lock_init(&ctx->rsrc_ref_lock);
1150         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1151         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1152         init_llist_head(&ctx->rsrc_put_llist);
1153         INIT_LIST_HEAD(&ctx->tctx_list);
1154         INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
1155         INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
1156         return ctx;
1157 err:
1158         kfree(ctx->cancel_hash);
1159         kfree(ctx);
1160         return NULL;
1161 }
1162
1163 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1164 {
1165         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1166                 struct io_ring_ctx *ctx = req->ctx;
1167
1168                 return seq != ctx->cached_cq_tail
1169                                 + READ_ONCE(ctx->cached_cq_overflow);
1170         }
1171
1172         return false;
1173 }
1174
1175 static void io_req_track_inflight(struct io_kiocb *req)
1176 {
1177         struct io_ring_ctx *ctx = req->ctx;
1178
1179         if (!(req->flags & REQ_F_INFLIGHT)) {
1180                 req->flags |= REQ_F_INFLIGHT;
1181
1182                 spin_lock_irq(&ctx->inflight_lock);
1183                 list_add(&req->inflight_entry, &ctx->inflight_list);
1184                 spin_unlock_irq(&ctx->inflight_lock);
1185         }
1186 }
1187
1188 static void io_prep_async_work(struct io_kiocb *req)
1189 {
1190         const struct io_op_def *def = &io_op_defs[req->opcode];
1191         struct io_ring_ctx *ctx = req->ctx;
1192
1193         if (!req->work.creds)
1194                 req->work.creds = get_current_cred();
1195
1196         if (req->flags & REQ_F_FORCE_ASYNC)
1197                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1198
1199         if (req->flags & REQ_F_ISREG) {
1200                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1201                         io_wq_hash_work(&req->work, file_inode(req->file));
1202         } else {
1203                 if (def->unbound_nonreg_file)
1204                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1205         }
1206 }
1207
1208 static void io_prep_async_link(struct io_kiocb *req)
1209 {
1210         struct io_kiocb *cur;
1211
1212         io_for_each_link(cur, req)
1213                 io_prep_async_work(cur);
1214 }
1215
1216 static void io_queue_async_work(struct io_kiocb *req)
1217 {
1218         struct io_ring_ctx *ctx = req->ctx;
1219         struct io_kiocb *link = io_prep_linked_timeout(req);
1220         struct io_uring_task *tctx = req->task->io_uring;
1221
1222         BUG_ON(!tctx);
1223         BUG_ON(!tctx->io_wq);
1224
1225         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1226                                         &req->work, req->flags);
1227         /* init ->work of the whole link before punting */
1228         io_prep_async_link(req);
1229         io_wq_enqueue(tctx->io_wq, &req->work);
1230         if (link)
1231                 io_queue_linked_timeout(link);
1232 }
1233
1234 static void io_kill_timeout(struct io_kiocb *req)
1235 {
1236         struct io_timeout_data *io = req->async_data;
1237         int ret;
1238
1239         ret = hrtimer_try_to_cancel(&io->timer);
1240         if (ret != -1) {
1241                 atomic_set(&req->ctx->cq_timeouts,
1242                         atomic_read(&req->ctx->cq_timeouts) + 1);
1243                 list_del_init(&req->timeout.list);
1244                 io_cqring_fill_event(req, 0);
1245                 io_put_req_deferred(req, 1);
1246         }
1247 }
1248
1249 /*
1250  * Returns true if we found and killed one or more timeouts
1251  */
1252 static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
1253                              struct files_struct *files)
1254 {
1255         struct io_kiocb *req, *tmp;
1256         int canceled = 0;
1257
1258         spin_lock_irq(&ctx->completion_lock);
1259         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1260                 if (io_match_task(req, tsk, files)) {
1261                         io_kill_timeout(req);
1262                         canceled++;
1263                 }
1264         }
1265         spin_unlock_irq(&ctx->completion_lock);
1266         return canceled != 0;
1267 }
1268
1269 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1270 {
1271         do {
1272                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1273                                                 struct io_defer_entry, list);
1274
1275                 if (req_need_defer(de->req, de->seq))
1276                         break;
1277                 list_del_init(&de->list);
1278                 io_req_task_queue(de->req);
1279                 kfree(de);
1280         } while (!list_empty(&ctx->defer_list));
1281 }
1282
1283 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1284 {
1285         u32 seq;
1286
1287         if (list_empty(&ctx->timeout_list))
1288                 return;
1289
1290         seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1291
1292         do {
1293                 u32 events_needed, events_got;
1294                 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1295                                                 struct io_kiocb, timeout.list);
1296
1297                 if (io_is_timeout_noseq(req))
1298                         break;
1299
1300                 /*
1301                  * Since seq can easily wrap around over time, subtract
1302                  * the last seq at which timeouts were flushed before comparing.
1303                  * Assuming not more than 2^31-1 events have happened since,
1304                  * these subtractions won't have wrapped, so we can check if
1305                  * target is in [last_seq, current_seq] by comparing the two.
1306                  */
1307                 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1308                 events_got = seq - ctx->cq_last_tm_flush;
1309                 if (events_got < events_needed)
1310                         break;
1311
1312                 list_del_init(&req->timeout.list);
1313                 io_kill_timeout(req);
1314         } while (!list_empty(&ctx->timeout_list));
1315
1316         ctx->cq_last_tm_flush = seq;
1317 }
1318
1319 static void io_commit_cqring(struct io_ring_ctx *ctx)
1320 {
1321         io_flush_timeouts(ctx);
1322
1323         /* order cqe stores with ring update */
1324         smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1325
1326         if (unlikely(!list_empty(&ctx->defer_list)))
1327                 __io_queue_deferred(ctx);
1328 }
1329
1330 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1331 {
1332         struct io_rings *r = ctx->rings;
1333
1334         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1335 }
1336
1337 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1338 {
1339         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1340 }
1341
1342 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1343 {
1344         struct io_rings *rings = ctx->rings;
1345         unsigned tail;
1346
1347         /*
1348          * writes to the cq entry need to come after reading head; the
1349          * control dependency is enough as we're using WRITE_ONCE to
1350          * fill the cq entry
1351          */
1352         if (__io_cqring_events(ctx) == rings->cq_ring_entries)
1353                 return NULL;
1354
1355         tail = ctx->cached_cq_tail++;
1356         return &rings->cqes[tail & ctx->cq_mask];
1357 }
1358
1359 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1360 {
1361         if (!ctx->cq_ev_fd)
1362                 return false;
1363         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1364                 return false;
1365         if (!ctx->eventfd_async)
1366                 return true;
1367         return io_wq_current_is_worker();
1368 }
1369
1370 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1371 {
1372         /* see waitqueue_active() comment */
1373         smp_mb();
1374
1375         if (waitqueue_active(&ctx->wait))
1376                 wake_up(&ctx->wait);
1377         if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1378                 wake_up(&ctx->sq_data->wait);
1379         if (io_should_trigger_evfd(ctx))
1380                 eventfd_signal(ctx->cq_ev_fd, 1);
1381         if (waitqueue_active(&ctx->cq_wait)) {
1382                 wake_up_interruptible(&ctx->cq_wait);
1383                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1384         }
1385 }
1386
1387 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1388 {
1389         /* see waitqueue_active() comment */
1390         smp_mb();
1391
1392         if (ctx->flags & IORING_SETUP_SQPOLL) {
1393                 if (waitqueue_active(&ctx->wait))
1394                         wake_up(&ctx->wait);
1395         }
1396         if (io_should_trigger_evfd(ctx))
1397                 eventfd_signal(ctx->cq_ev_fd, 1);
1398         if (waitqueue_active(&ctx->cq_wait)) {
1399                 wake_up_interruptible(&ctx->cq_wait);
1400                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1401         }
1402 }
1403
1404 /* Returns true if there are no backlogged entries after the flush */
1405 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1406                                        struct task_struct *tsk,
1407                                        struct files_struct *files)
1408 {
1409         struct io_rings *rings = ctx->rings;
1410         struct io_kiocb *req, *tmp;
1411         struct io_uring_cqe *cqe;
1412         unsigned long flags;
1413         bool all_flushed, posted;
1414         LIST_HEAD(list);
1415
1416         if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
1417                 return false;
1418
1419         posted = false;
1420         spin_lock_irqsave(&ctx->completion_lock, flags);
1421         list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
1422                 if (!io_match_task(req, tsk, files))
1423                         continue;
1424
1425                 cqe = io_get_cqring(ctx);
1426                 if (!cqe && !force)
1427                         break;
1428
1429                 list_move(&req->compl.list, &list);
1430                 if (cqe) {
1431                         WRITE_ONCE(cqe->user_data, req->user_data);
1432                         WRITE_ONCE(cqe->res, req->result);
1433                         WRITE_ONCE(cqe->flags, req->compl.cflags);
1434                 } else {
1435                         ctx->cached_cq_overflow++;
1436                         WRITE_ONCE(ctx->rings->cq_overflow,
1437                                    ctx->cached_cq_overflow);
1438                 }
1439                 posted = true;
1440         }
1441
1442         all_flushed = list_empty(&ctx->cq_overflow_list);
1443         if (all_flushed) {
1444                 clear_bit(0, &ctx->sq_check_overflow);
1445                 clear_bit(0, &ctx->cq_check_overflow);
1446                 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1447         }
1448
1449         if (posted)
1450                 io_commit_cqring(ctx);
1451         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1452         if (posted)
1453                 io_cqring_ev_posted(ctx);
1454
1455         while (!list_empty(&list)) {
1456                 req = list_first_entry(&list, struct io_kiocb, compl.list);
1457                 list_del(&req->compl.list);
1458                 io_put_req(req);
1459         }
1460
1461         return all_flushed;
1462 }
1463
1464 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1465                                      struct task_struct *tsk,
1466                                      struct files_struct *files)
1467 {
1468         bool ret = true;
1469
1470         if (test_bit(0, &ctx->cq_check_overflow)) {
1471                 /* iopoll syncs against uring_lock, not completion_lock */
1472                 if (ctx->flags & IORING_SETUP_IOPOLL)
1473                         mutex_lock(&ctx->uring_lock);
1474                 ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
1475                 if (ctx->flags & IORING_SETUP_IOPOLL)
1476                         mutex_unlock(&ctx->uring_lock);
1477         }
1478
1479         return ret;
1480 }
1481
1482 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1483 {
1484         struct io_ring_ctx *ctx = req->ctx;
1485         struct io_uring_cqe *cqe;
1486
1487         trace_io_uring_complete(ctx, req->user_data, res);
1488
1489         /*
1490          * If we can't get a cq entry, userspace overflowed the
1491          * submission (by quite a lot). Increment the overflow count in
1492          * the ring.
1493          */
1494         cqe = io_get_cqring(ctx);
1495         if (likely(cqe)) {
1496                 WRITE_ONCE(cqe->user_data, req->user_data);
1497                 WRITE_ONCE(cqe->res, res);
1498                 WRITE_ONCE(cqe->flags, cflags);
1499         } else if (ctx->cq_overflow_flushed ||
1500                    atomic_read(&req->task->io_uring->in_idle)) {
1501                 /*
1502                  * If we're in ring overflow flush mode, or in task cancel mode,
1503                  * then we cannot store the request for later flushing, we need
1504                  * to drop it on the floor.
1505                  */
1506                 ctx->cached_cq_overflow++;
1507                 WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
1508         } else {
1509                 if (list_empty(&ctx->cq_overflow_list)) {
1510                         set_bit(0, &ctx->sq_check_overflow);
1511                         set_bit(0, &ctx->cq_check_overflow);
1512                         ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
1513                 }
1514                 io_clean_op(req);
1515                 req->result = res;
1516                 req->compl.cflags = cflags;
1517                 refcount_inc(&req->refs);
1518                 list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
1519         }
1520 }
1521
/* Post a completion for @req with result @res and no CQE flags. */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
        const long no_cflags = 0;

        __io_cqring_fill_event(req, res, no_cflags);
}
1526
1527 static void io_req_complete_post(struct io_kiocb *req, long res,
1528                                  unsigned int cflags)
1529 {
1530         struct io_ring_ctx *ctx = req->ctx;
1531         unsigned long flags;
1532
1533         spin_lock_irqsave(&ctx->completion_lock, flags);
1534         __io_cqring_fill_event(req, res, cflags);
1535         /*
1536          * If we're the last reference to this request, add to our locked
1537          * free_list cache.
1538          */
1539         if (refcount_dec_and_test(&req->refs)) {
1540                 struct io_comp_state *cs = &ctx->submit_state.comp;
1541
1542                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
1543                         if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK))
1544                                 io_disarm_next(req);
1545                         if (req->link) {
1546                                 io_req_task_queue(req->link);
1547                                 req->link = NULL;
1548                         }
1549                 }
1550                 io_dismantle_req(req);
1551                 io_put_task(req->task, 1);
1552                 list_add(&req->compl.list, &cs->locked_free_list);
1553                 cs->locked_free_nr++;
1554         } else {
1555                 if (!percpu_ref_tryget(&ctx->refs))
1556                         req = NULL;
1557         }
1558         io_commit_cqring(ctx);
1559         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1560
1561         if (req) {
1562                 io_cqring_ev_posted(ctx);
1563                 percpu_ref_put(&ctx->refs);
1564         }
1565 }
1566
1567 static void io_req_complete_state(struct io_kiocb *req, long res,
1568                                   unsigned int cflags)
1569 {
1570         io_clean_op(req);
1571         req->result = res;
1572         req->compl.cflags = cflags;
1573         req->flags |= REQ_F_COMPLETE_INLINE;
1574 }
1575
1576 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1577                                      long res, unsigned cflags)
1578 {
1579         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1580                 io_req_complete_state(req, res, cflags);
1581         else
1582                 io_req_complete_post(req, res, cflags);
1583 }
1584
/* Complete @req with result @res: no issue flags and no CQE flags. */
static inline void io_req_complete(struct io_kiocb *req, long res)
{
        const unsigned int no_flags = 0;

        __io_req_complete(req, no_flags, res, no_flags);
}
1589
1590 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1591 {
1592         struct io_submit_state *state = &ctx->submit_state;
1593         struct io_comp_state *cs = &state->comp;
1594         struct io_kiocb *req = NULL;
1595
1596         /*
1597          * If we have more than a batch's worth of requests in our IRQ side
1598          * locked cache, grab the lock and move them over to our submission
1599          * side cache.
1600          */
1601         if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
1602                 spin_lock_irq(&ctx->completion_lock);
1603                 list_splice_init(&cs->locked_free_list, &cs->free_list);
1604                 cs->locked_free_nr = 0;
1605                 spin_unlock_irq(&ctx->completion_lock);
1606         }
1607
1608         while (!list_empty(&cs->free_list)) {
1609                 req = list_first_entry(&cs->free_list, struct io_kiocb,
1610                                         compl.list);
1611                 list_del(&req->compl.list);
1612                 state->reqs[state->free_reqs++] = req;
1613                 if (state->free_reqs == ARRAY_SIZE(state->reqs))
1614                         break;
1615         }
1616
1617         return req != NULL;
1618 }
1619
1620 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1621 {
1622         struct io_submit_state *state = &ctx->submit_state;
1623
1624         BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
1625
1626         if (!state->free_reqs) {
1627                 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1628                 int ret;
1629
1630                 if (io_flush_cached_reqs(ctx))
1631                         goto got_req;
1632
1633                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1634                                             state->reqs);
1635
1636                 /*
1637                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1638                  * retry single alloc to be on the safe side.
1639                  */
1640                 if (unlikely(ret <= 0)) {
1641                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1642                         if (!state->reqs[0])
1643                                 return NULL;
1644                         ret = 1;
1645                 }
1646                 state->free_reqs = ret;
1647         }
1648 got_req:
1649         state->free_reqs--;
1650         return state->reqs[state->free_reqs];
1651 }
1652
1653 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1654                           bool fixed)
1655 {
1656         if (!fixed)
1657                 fput(file);
1658 }
1659
1660 static void io_dismantle_req(struct io_kiocb *req)
1661 {
1662         io_clean_op(req);
1663
1664         if (req->async_data)
1665                 kfree(req->async_data);
1666         if (req->file)
1667                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1668         if (req->fixed_rsrc_refs)
1669                 percpu_ref_put(req->fixed_rsrc_refs);
1670         if (req->work.creds) {
1671                 put_cred(req->work.creds);
1672                 req->work.creds = NULL;
1673         }
1674
1675         if (req->flags & REQ_F_INFLIGHT) {
1676                 struct io_ring_ctx *ctx = req->ctx;
1677                 unsigned long flags;
1678
1679                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1680                 list_del(&req->inflight_entry);
1681                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1682                 req->flags &= ~REQ_F_INFLIGHT;
1683         }
1684 }
1685
1686 /* must to be called somewhat shortly after putting a request */
1687 static inline void io_put_task(struct task_struct *task, int nr)
1688 {
1689         struct io_uring_task *tctx = task->io_uring;
1690
1691         percpu_counter_sub(&tctx->inflight, nr);
1692         if (unlikely(atomic_read(&tctx->in_idle)))
1693                 wake_up(&tctx->wait);
1694         put_task_struct_many(task, nr);
1695 }
1696
1697 static void __io_free_req(struct io_kiocb *req)
1698 {
1699         struct io_ring_ctx *ctx = req->ctx;
1700
1701         io_dismantle_req(req);
1702         io_put_task(req->task, 1);
1703
1704         kmem_cache_free(req_cachep, req);
1705         percpu_ref_put(&ctx->refs);
1706 }
1707
1708 static inline void io_remove_next_linked(struct io_kiocb *req)
1709 {
1710         struct io_kiocb *nxt = req->link;
1711
1712         req->link = nxt->link;
1713         nxt->link = NULL;
1714 }
1715
1716 static bool io_kill_linked_timeout(struct io_kiocb *req)
1717         __must_hold(&req->ctx->completion_lock)
1718 {
1719         struct io_kiocb *link = req->link;
1720         bool cancelled = false;
1721
1722         /*
1723          * Can happen if a linked timeout fired and link had been like
1724          * req -> link t-out -> link t-out [-> ...]
1725          */
1726         if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1727                 struct io_timeout_data *io = link->async_data;
1728                 int ret;
1729
1730                 io_remove_next_linked(req);
1731                 link->timeout.head = NULL;
1732                 ret = hrtimer_try_to_cancel(&io->timer);
1733                 if (ret != -1) {
1734                         io_cqring_fill_event(link, -ECANCELED);
1735                         io_put_req_deferred(link, 1);
1736                         cancelled = true;
1737                 }
1738         }
1739         req->flags &= ~REQ_F_LINK_TIMEOUT;
1740         return cancelled;
1741 }
1742
1743 static void io_fail_links(struct io_kiocb *req)
1744         __must_hold(&req->ctx->completion_lock)
1745 {
1746         struct io_kiocb *nxt, *link = req->link;
1747
1748         req->link = NULL;
1749         while (link) {
1750                 nxt = link->link;
1751                 link->link = NULL;
1752
1753                 trace_io_uring_fail_link(req, link);
1754                 io_cqring_fill_event(link, -ECANCELED);
1755                 io_put_req_deferred(link, 2);
1756                 link = nxt;
1757         }
1758 }
1759
1760 static bool io_disarm_next(struct io_kiocb *req)
1761         __must_hold(&req->ctx->completion_lock)
1762 {
1763         bool posted = false;
1764
1765         if (likely(req->flags & REQ_F_LINK_TIMEOUT))
1766                 posted = io_kill_linked_timeout(req);
1767         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
1768                 posted |= (req->link != NULL);
1769                 io_fail_links(req);
1770         }
1771         return posted;
1772 }
1773
1774 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
1775 {
1776         struct io_kiocb *nxt;
1777
1778         /*
1779          * If LINK is set, we have dependent requests in this chain. If we
1780          * didn't fail this request, queue the first one up, moving any other
1781          * dependencies to the next request. In case of failure, fail the rest
1782          * of the chain.
1783          */
1784         if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) {
1785                 struct io_ring_ctx *ctx = req->ctx;
1786                 unsigned long flags;
1787                 bool posted;
1788
1789                 spin_lock_irqsave(&ctx->completion_lock, flags);
1790                 posted = io_disarm_next(req);
1791                 if (posted)
1792                         io_commit_cqring(req->ctx);
1793                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1794                 if (posted)
1795                         io_cqring_ev_posted(ctx);
1796         }
1797         nxt = req->link;
1798         req->link = NULL;
1799         return nxt;
1800 }
1801
1802 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1803 {
1804         if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
1805                 return NULL;
1806         return __io_req_find_next(req);
1807 }
1808
1809 static void ctx_flush_and_put(struct io_ring_ctx *ctx)
1810 {
1811         if (!ctx)
1812                 return;
1813         if (ctx->submit_state.comp.nr) {
1814                 mutex_lock(&ctx->uring_lock);
1815                 io_submit_flush_completions(&ctx->submit_state.comp, ctx);
1816                 mutex_unlock(&ctx->uring_lock);
1817         }
1818         percpu_ref_put(&ctx->refs);
1819 }
1820
1821 static bool __tctx_task_work(struct io_uring_task *tctx)
1822 {
1823         struct io_ring_ctx *ctx = NULL;
1824         struct io_wq_work_list list;
1825         struct io_wq_work_node *node;
1826
1827         if (wq_list_empty(&tctx->task_list))
1828                 return false;
1829
1830         spin_lock_irq(&tctx->task_lock);
1831         list = tctx->task_list;
1832         INIT_WQ_LIST(&tctx->task_list);
1833         spin_unlock_irq(&tctx->task_lock);
1834
1835         node = list.first;
1836         while (node) {
1837                 struct io_wq_work_node *next = node->next;
1838                 struct io_kiocb *req;
1839
1840                 req = container_of(node, struct io_kiocb, io_task_work.node);
1841                 if (req->ctx != ctx) {
1842                         ctx_flush_and_put(ctx);
1843                         ctx = req->ctx;
1844                         percpu_ref_get(&ctx->refs);
1845                 }
1846
1847                 req->task_work.func(&req->task_work);
1848                 node = next;
1849         }
1850
1851         ctx_flush_and_put(ctx);
1852         return list.first != NULL;
1853 }
1854
1855 static void tctx_task_work(struct callback_head *cb)
1856 {
1857         struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
1858
1859         clear_bit(0, &tctx->task_state);
1860
1861         while (__tctx_task_work(tctx))
1862                 cond_resched();
1863 }
1864
/*
 * Queue @req on the io_uring task work list of @tsk and make sure the
 * per-task callback (tctx->task_work) is scheduled via task_work_add().
 *
 * Returns 0 if the request was queued, including the case where the
 * callback was already pending or the request had already been run by the
 * time the failure path went looking for it. Returns 1 if task_work_add()
 * failed and the request was taken back off the list, in which case it is
 * still the caller's to handle.
 */
static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
			    enum task_work_notify_mode notify)
{
	struct io_uring_task *tctx = tsk->io_uring;
	struct io_wq_work_node *node, *prev;
	unsigned long flags;
	int ret;

	WARN_ON_ONCE(!tctx);

	/* publish the request on the list before looking at the pending bit */
	spin_lock_irqsave(&tctx->task_lock, flags);
	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	/* task_work already pending, we're done */
	if (test_bit(0, &tctx->task_state) ||
	    test_and_set_bit(0, &tctx->task_state))
		return 0;

	if (!task_work_add(tsk, &tctx->task_work, notify))
		return 0;

	/*
	 * Slow path - we failed, find and delete work. if the work is not
	 * in the list, it got run and we're fine.
	 */
	ret = 0;
	spin_lock_irqsave(&tctx->task_lock, flags);
	wq_list_for_each(node, prev, &tctx->task_list) {
		if (&req->io_task_work.node == node) {
			wq_list_del(&tctx->task_list, node, prev);
			ret = 1;
			break;
		}
	}
	spin_unlock_irqrestore(&tctx->task_lock, flags);
	/* we set the pending bit above but nothing got scheduled: undo it */
	clear_bit(0, &tctx->task_state);
	return ret;
}
1904
1905 static int io_req_task_work_add(struct io_kiocb *req)
1906 {
1907         struct task_struct *tsk = req->task;
1908         struct io_ring_ctx *ctx = req->ctx;
1909         enum task_work_notify_mode notify;
1910         int ret;
1911
1912         if (tsk->flags & PF_EXITING)
1913                 return -ESRCH;
1914
1915         /*
1916          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
1917          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
1918          * processing task_work. There's no reliable way to tell if TWA_RESUME
1919          * will do the job.
1920          */
1921         notify = TWA_NONE;
1922         if (!(ctx->flags & IORING_SETUP_SQPOLL))
1923                 notify = TWA_SIGNAL;
1924
1925         ret = io_task_work_add(tsk, req, notify);
1926         if (!ret)
1927                 wake_up_process(tsk);
1928
1929         return ret;
1930 }
1931
/*
 * Fallback for when @req could not be queued as task work on its task:
 * initialise the request's task_work with @cb and push it onto the
 * ctx->exit_task_work list.
 *
 * The push is lockless: the new entry's ->next is pointed at the current
 * head and the head is swapped with cmpxchg(), retrying if another pusher
 * changed it in between.
 */
static void io_req_task_work_add_fallback(struct io_kiocb *req,
					  task_work_func_t cb)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct callback_head *head;

	init_task_work(&req->task_work, cb);
	do {
		head = READ_ONCE(ctx->exit_task_work);
		req->task_work.next = head;
	} while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
}
1944
1945 static void __io_req_task_cancel(struct io_kiocb *req, int error)
1946 {
1947         struct io_ring_ctx *ctx = req->ctx;
1948
1949         spin_lock_irq(&ctx->completion_lock);
1950         io_cqring_fill_event(req, error);
1951         io_commit_cqring(ctx);
1952         spin_unlock_irq(&ctx->completion_lock);
1953
1954         io_cqring_ev_posted(ctx);
1955         req_set_fail_links(req);
1956         io_double_put_req(req);
1957 }
1958
1959 static void io_req_task_cancel(struct callback_head *cb)
1960 {
1961         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1962         struct io_ring_ctx *ctx = req->ctx;
1963
1964         mutex_lock(&ctx->uring_lock);
1965         __io_req_task_cancel(req, req->result);
1966         mutex_unlock(&ctx->uring_lock);
1967         percpu_ref_put(&ctx->refs);
1968 }
1969
1970 static void __io_req_task_submit(struct io_kiocb *req)
1971 {
1972         struct io_ring_ctx *ctx = req->ctx;
1973
1974         /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
1975         mutex_lock(&ctx->uring_lock);
1976         if (!(current->flags & PF_EXITING) && !current->in_execve)
1977                 __io_queue_sqe(req);
1978         else
1979                 __io_req_task_cancel(req, -EFAULT);
1980         mutex_unlock(&ctx->uring_lock);
1981 }
1982
1983 static void io_req_task_submit(struct callback_head *cb)
1984 {
1985         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1986
1987         __io_req_task_submit(req);
1988 }
1989
1990 static void io_req_task_queue(struct io_kiocb *req)
1991 {
1992         int ret;
1993
1994         req->task_work.func = io_req_task_submit;
1995         ret = io_req_task_work_add(req);
1996         if (unlikely(ret)) {
1997                 req->result = -ECANCELED;
1998                 percpu_ref_get(&req->ctx->refs);
1999                 io_req_task_work_add_fallback(req, io_req_task_cancel);
2000         }
2001 }
2002
2003 static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2004 {
2005         percpu_ref_get(&req->ctx->refs);
2006         req->result = ret;
2007         req->task_work.func = io_req_task_cancel;
2008
2009         if (unlikely(io_req_task_work_add(req)))
2010                 io_req_task_work_add_fallback(req, io_req_task_cancel);
2011 }
2012
/* If @req has a next request in its chain, queue it via task work. */
static inline void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt = io_req_find_next(req);

	if (!nxt)
		return;

	io_req_task_queue(nxt);
}
2020
/* Queue the next request in @req's chain, if any, then free @req. */
static void io_free_req(struct io_kiocb *req)
{
	struct io_kiocb *nxt = io_req_find_next(req);

	if (nxt)
		io_req_task_queue(nxt);

	__io_free_req(req);
}
2026
/*
 * Accumulator used when freeing requests in bulk, so that task and ctx
 * references can be dropped in batches rather than one by one.
 */
struct req_batch {
	/* task the accumulated task_refs belong to; NULL if none yet */
	struct task_struct	*task;
	/* number of refs to drop on ->task via io_put_task() */
	int			task_refs;
	/* number of ctx->refs to drop via percpu_ref_put_many() */
	int			ctx_refs;
};
2032
2033 static inline void io_init_req_batch(struct req_batch *rb)
2034 {
2035         rb->task_refs = 0;
2036         rb->ctx_refs = 0;
2037         rb->task = NULL;
2038 }
2039
2040 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2041                                      struct req_batch *rb)
2042 {
2043         if (rb->task)
2044                 io_put_task(rb->task, rb->task_refs);
2045         if (rb->ctx_refs)
2046                 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2047 }
2048
2049 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2050                               struct io_submit_state *state)
2051 {
2052         io_queue_next(req);
2053
2054         if (req->task != rb->task) {
2055                 if (rb->task)
2056                         io_put_task(rb->task, rb->task_refs);
2057                 rb->task = req->task;
2058                 rb->task_refs = 0;
2059         }
2060         rb->task_refs++;
2061         rb->ctx_refs++;
2062
2063         io_dismantle_req(req);
2064         if (state->free_reqs != ARRAY_SIZE(state->reqs))
2065                 state->reqs[state->free_reqs++] = req;
2066         else
2067                 list_add(&req->compl.list, &state->comp.free_list);
2068 }
2069
2070 static void io_submit_flush_completions(struct io_comp_state *cs,
2071                                         struct io_ring_ctx *ctx)
2072 {
2073         int i, nr = cs->nr;
2074         struct io_kiocb *req;
2075         struct req_batch rb;
2076
2077         io_init_req_batch(&rb);
2078         spin_lock_irq(&ctx->completion_lock);
2079         for (i = 0; i < nr; i++) {
2080                 req = cs->reqs[i];
2081                 __io_cqring_fill_event(req, req->result, req->compl.cflags);
2082         }
2083         io_commit_cqring(ctx);
2084         spin_unlock_irq(&ctx->completion_lock);
2085
2086         io_cqring_ev_posted(ctx);
2087         for (i = 0; i < nr; i++) {
2088                 req = cs->reqs[i];
2089
2090                 /* submission and completion refs */
2091                 if (refcount_sub_and_test(2, &req->refs))
2092                         io_req_free_batch(&rb, req, &ctx->submit_state);
2093         }
2094
2095         io_req_free_batch_finish(ctx, &rb);
2096         cs->nr = 0;
2097 }
2098
2099 /*
2100  * Drop reference to request, return next in chain (if there is one) if this
2101  * was the last reference to this request.
2102  */
2103 static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2104 {
2105         struct io_kiocb *nxt = NULL;
2106
2107         if (refcount_dec_and_test(&req->refs)) {
2108                 nxt = io_req_find_next(req);
2109                 __io_free_req(req);
2110         }
2111         return nxt;
2112 }
2113
2114 static void io_put_req(struct io_kiocb *req)
2115 {
2116         if (refcount_dec_and_test(&req->refs))
2117                 io_free_req(req);
2118 }
2119
2120 static void io_put_req_deferred_cb(struct callback_head *cb)
2121 {
2122         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2123
2124         io_free_req(req);
2125 }
2126
2127 static void io_free_req_deferred(struct io_kiocb *req)
2128 {
2129         int ret;
2130
2131         req->task_work.func = io_put_req_deferred_cb;
2132         ret = io_req_task_work_add(req);
2133         if (unlikely(ret))
2134                 io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
2135 }
2136
2137 static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2138 {
2139         if (refcount_sub_and_test(refs, &req->refs))
2140                 io_free_req_deferred(req);
2141 }
2142
2143 static void io_double_put_req(struct io_kiocb *req)
2144 {
2145         /* drop both submit and complete references */
2146         if (refcount_sub_and_test(2, &req->refs))
2147                 io_free_req(req);
2148 }
2149
/*
 * Return the CQ event count from __io_cqring_events(), issuing a read
 * barrier first. See comment at the top of this file.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	unsigned nr_events;

	smp_rmb();
	nr_events = __io_cqring_events(ctx);

	return nr_events;
}
2156
2157 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2158 {
2159         struct io_rings *rings = ctx->rings;
2160
2161         /* make sure SQ entry isn't read before tail */
2162         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2163 }
2164
2165 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2166 {
2167         unsigned int cflags;
2168
2169         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2170         cflags |= IORING_CQE_F_BUFFER;
2171         req->flags &= ~REQ_F_BUFFER_SELECTED;
2172         kfree(kbuf);
2173         return cflags;
2174 }
2175
2176 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2177 {
2178         struct io_buffer *kbuf;
2179
2180         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2181         return io_put_kbuf(req, kbuf);
2182 }
2183
2184 static inline bool io_run_task_work(void)
2185 {
2186         /*
2187          * Not safe to run on exiting task, and the task_work handling will
2188          * not add work to such a task.
2189          */
2190         if (unlikely(current->flags & PF_EXITING))
2191                 return false;
2192         if (current->task_works) {
2193                 __set_current_state(TASK_RUNNING);
2194                 task_work_run();
2195                 return true;
2196         }
2197
2198         return false;
2199 }
2200
2201 /*
2202  * Find and free completed poll iocbs
2203  */
2204 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2205                                struct list_head *done)
2206 {
2207         struct req_batch rb;
2208         struct io_kiocb *req;
2209
2210         /* order with ->result store in io_complete_rw_iopoll() */
2211         smp_rmb();
2212
2213         io_init_req_batch(&rb);
2214         while (!list_empty(done)) {
2215                 int cflags = 0;
2216
2217                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
2218                 list_del(&req->inflight_entry);
2219
2220                 if (READ_ONCE(req->result) == -EAGAIN) {
2221                         req->iopoll_completed = 0;
2222                         if (io_rw_reissue(req))
2223                                 continue;
2224                 }
2225
2226                 if (req->flags & REQ_F_BUFFER_SELECTED)
2227                         cflags = io_put_rw_kbuf(req);
2228
2229                 __io_cqring_fill_event(req, req->result, cflags);
2230                 (*nr_events)++;
2231
2232                 if (refcount_dec_and_test(&req->refs))
2233                         io_req_free_batch(&rb, req, &ctx->submit_state);
2234         }
2235
2236         io_commit_cqring(ctx);
2237         io_cqring_ev_posted_iopoll(ctx);
2238         io_req_free_batch_finish(ctx, &rb);
2239 }
2240
/*
 * Make one pass over ctx->iopoll_list: collect requests that are already
 * flagged completed, poll the file of those that are not, and hand the
 * completed ones to io_iopoll_complete().
 *
 * *@nr_events is updated by io_iopoll_complete(). Returns 0, or the
 * negative value returned by the file's ->iopoll() if polling failed.
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
			long min)
{
	struct io_kiocb *req, *tmp;
	LIST_HEAD(done);
	bool spin;
	int ret;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list, and we're under the requested amount.
	 */
	spin = !ctx->poll_multi_file && *nr_events < min;

	ret = 0;
	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
		struct kiocb *kiocb = &req->rw.kiocb;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed)) {
			list_move_tail(&req->inflight_entry, &done);
			continue;
		}
		if (!list_empty(&done))
			break;

		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
		if (ret < 0)
			break;

		/* iopoll may have completed current req */
		if (READ_ONCE(req->iopoll_completed))
			list_move_tail(&req->inflight_entry, &done);

		/* stop spinning once a poll call has returned non-zero */
		if (ret && spin)
			spin = false;
		ret = 0;
	}

	if (!list_empty(&done))
		io_iopoll_complete(ctx, nr_events, &done);

	return ret;
}
2289
2290 /*
2291  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
2292  * non-spinning poll check - we'll still enter the driver poll loop, but only
2293  * as a non-spinning completion check.
2294  */
2295 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2296                                 long min)
2297 {
2298         while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
2299                 int ret;
2300
2301                 ret = io_do_iopoll(ctx, nr_events, min);
2302                 if (ret < 0)
2303                         return ret;
2304                 if (*nr_events >= min)
2305                         return 0;
2306         }
2307
2308         return 1;
2309 }
2310
2311 /*
2312  * We can't just wait for polled events to come to us, we have to actively
2313  * find and complete them.
2314  */
2315 static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2316 {
2317         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2318                 return;
2319
2320         mutex_lock(&ctx->uring_lock);
2321         while (!list_empty(&ctx->iopoll_list)) {
2322                 unsigned int nr_events = 0;
2323
2324                 io_do_iopoll(ctx, &nr_events, 0);
2325
2326                 /* let it sleep and repeat later if can't complete a request */
2327                 if (nr_events == 0)
2328                         break;
2329                 /*
2330                  * Ensure we allow local-to-the-cpu processing to take place,
2331                  * in this case we need to ensure that we reap all events.
2332                  * Also let task_work, etc. to progress by releasing the mutex
2333                  */
2334                 if (need_resched()) {
2335                         mutex_unlock(&ctx->uring_lock);
2336                         cond_resched();
2337                         mutex_lock(&ctx->uring_lock);
2338                 }
2339         }
2340         mutex_unlock(&ctx->uring_lock);
2341 }
2342
/*
 * Poll for completions on an IOPOLL ring until events are available.
 *
 * The loop stops as soon as the CQ ring has events, io_iopoll_getevents()
 * returns <= 0, events were reaped, @min is 0, or a reschedule is needed.
 *
 * Returns 0, or a negative error propagated from io_iopoll_getevents().
 */
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
	unsigned int nr_events = 0;
	int iters = 0, ret = 0;

	/*
	 * We disallow the app entering submit/complete with polling, but we
	 * still need to lock the ring to prevent racing with polled issue
	 * that got punted to a workqueue.
	 */
	mutex_lock(&ctx->uring_lock);
	do {
		/*
		 * Don't enter poll loop if we already have events pending.
		 * If we do, we can potentially be spinning for commands that
		 * already triggered a CQE (eg in error).
		 */
		if (test_bit(0, &ctx->cq_check_overflow))
			__io_cqring_overflow_flush(ctx, false, NULL, NULL);
		if (io_cqring_events(ctx))
			break;

		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
		/* every 8th iteration: drop the lock and run task work */
		if (!(++iters & 7)) {
			mutex_unlock(&ctx->uring_lock);
			io_run_task_work();
			mutex_lock(&ctx->uring_lock);
		}

		ret = io_iopoll_getevents(ctx, &nr_events, min);
		if (ret <= 0)
			break;
		/* a positive return is not an error; do not leak it to the caller */
		ret = 0;
	} while (min && !nr_events && !need_resched());

	mutex_unlock(&ctx->uring_lock);
	return ret;
}
2390
2391 static void kiocb_end_write(struct io_kiocb *req)
2392 {
2393         /*
2394          * Tell lockdep we inherited freeze protection from submission
2395          * thread.
2396          */
2397         if (req->flags & REQ_F_ISREG) {
2398                 struct inode *inode = file_inode(req->file);
2399
2400                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2401         }
2402         file_end_write(req->file);
2403 }
2404
2405 #ifdef CONFIG_BLOCK
2406 static bool io_resubmit_prep(struct io_kiocb *req)
2407 {
2408         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2409         int rw, ret;
2410         struct iov_iter iter;
2411
2412         /* already prepared */
2413         if (req->async_data)
2414                 return true;
2415
2416         switch (req->opcode) {
2417         case IORING_OP_READV:
2418         case IORING_OP_READ_FIXED:
2419         case IORING_OP_READ:
2420                 rw = READ;
2421                 break;
2422         case IORING_OP_WRITEV:
2423         case IORING_OP_WRITE_FIXED:
2424         case IORING_OP_WRITE:
2425                 rw = WRITE;
2426                 break;
2427         default:
2428                 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2429                                 req->opcode);
2430                 return false;
2431         }
2432
2433         ret = io_import_iovec(rw, req, &iovec, &iter, false);
2434         if (ret < 0)
2435                 return false;
2436         return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
2437 }
2438
2439 static bool io_rw_should_reissue(struct io_kiocb *req)
2440 {
2441         umode_t mode = file_inode(req->file)->i_mode;
2442         struct io_ring_ctx *ctx = req->ctx;
2443
2444         if (!S_ISBLK(mode) && !S_ISREG(mode))
2445                 return false;
2446         if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2447             !(ctx->flags & IORING_SETUP_IOPOLL)))
2448                 return false;
2449         /*
2450          * If ref is dying, we might be running poll reap from the exit work.
2451          * Don't attempt to reissue from that path, just let it fail with
2452          * -EAGAIN.
2453          */
2454         if (percpu_ref_is_dying(&ctx->refs))
2455                 return false;
2456         return true;
2457 }
2458 #endif
2459
2460 static bool io_rw_reissue(struct io_kiocb *req)
2461 {
2462 #ifdef CONFIG_BLOCK
2463         if (!io_rw_should_reissue(req))
2464                 return false;
2465
2466         lockdep_assert_held(&req->ctx->uring_lock);
2467
2468         if (io_resubmit_prep(req)) {
2469                 refcount_inc(&req->refs);
2470                 io_queue_async_work(req);
2471                 return true;
2472         }
2473         req_set_fail_links(req);
2474 #endif
2475         return false;
2476 }
2477
2478 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2479                              unsigned int issue_flags)
2480 {
2481         int cflags = 0;
2482
2483         if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
2484                 return;
2485         if (res != req->result)
2486                 req_set_fail_links(req);
2487
2488         if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2489                 kiocb_end_write(req);
2490         if (req->flags & REQ_F_BUFFER_SELECTED)
2491                 cflags = io_put_rw_kbuf(req);
2492         __io_req_complete(req, issue_flags, res, cflags);
2493 }
2494
2495 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2496 {
2497         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2498
2499         __io_complete_rw(req, res, res2, 0);
2500 }
2501
/*
 * kiocb-style completion handler for polled requests. No CQE is posted
 * here: the result is stored in req->result and the request is flagged
 * via req->iopoll_completed, which io_do_iopoll() checks before handing
 * the request to io_iopoll_complete().
 *
 * @res2 is not used by this function.
 */
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

#ifdef CONFIG_BLOCK
	/* Rewind iter, if we have one. iopoll path resubmits as usual */
	if (res == -EAGAIN && io_rw_should_reissue(req)) {
		struct io_async_rw *rw = req->async_data;

		if (rw)
			iov_iter_revert(&rw->iter,
					req->result - iov_iter_count(&rw->iter));
		else if (!io_resubmit_prep(req))
			/* cannot set up a retry: report a hard error instead */
			res = -EIO;
	}
#endif

	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);

	/* -EAGAIN is left for the reaping side to retry, so it is not a link failure */
	if (res != -EAGAIN && res != req->result)
		req_set_fail_links(req);

	WRITE_ONCE(req->result, res);
	/* order with io_iopoll_complete() checking ->result */
	smp_wmb();
	WRITE_ONCE(req->iopoll_completed, 1);
}
2530
2531 /*
2532  * After the iocb has been issued, it's safe to be found on the poll list.
2533  * Adding the kiocb to the list AFTER submission ensures that we don't
2534  * find it from a io_iopoll_getevents() thread before the issuer is done
2535  * accessing the kiocb cookie.
2536  */
2537 static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
2538 {
2539         struct io_ring_ctx *ctx = req->ctx;
2540
2541         /*
2542          * Track whether we have multiple files in our lists. This will impact
2543          * how we do polling eventually, not spinning if we're on potentially
2544          * different devices.
2545          */
2546         if (list_empty(&ctx->iopoll_list)) {
2547                 ctx->poll_multi_file = false;
2548         } else if (!ctx->poll_multi_file) {
2549                 struct io_kiocb *list_req;
2550
2551                 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2552                                                 inflight_entry);
2553                 if (list_req->file != req->file)
2554                         ctx->poll_multi_file = true;
2555         }
2556
2557         /*
2558          * For fast devices, IO may have already completed. If it has, add
2559          * it to the front so we find it first.
2560          */
2561         if (READ_ONCE(req->iopoll_completed))
2562                 list_add(&req->inflight_entry, &ctx->iopoll_list);
2563         else
2564                 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2565
2566         /*
2567          * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
2568          * task context or in io worker task context. If current task context is
2569          * sq thread, we don't need to check whether should wake up sq thread.
2570          */
2571         if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
2572             wq_has_sleeper(&ctx->sq_data->wait))
2573                 wake_up(&ctx->sq_data->wait);
2574 }
2575
2576 static inline void io_state_file_put(struct io_submit_state *state)
2577 {
2578         if (state->file_refs) {
2579                 fput_many(state->file, state->file_refs);
2580                 state->file_refs = 0;
2581         }
2582 }
2583
2584 /*
2585  * Get as many references to a file as we have IOs left in this submission,
2586  * assuming most submissions are for one file, or at least that each file
2587  * has more than one submission.
2588  */
2589 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2590 {
2591         if (!state)
2592                 return fget(fd);
2593
2594         if (state->file_refs) {
2595                 if (state->fd == fd) {
2596                         state->file_refs--;
2597                         return state->file;
2598                 }
2599                 io_state_file_put(state);
2600         }
2601         state->file = fget_many(fd, state->ios_left);
2602         if (unlikely(!state->file))
2603                 return NULL;
2604
2605         state->fd = fd;
2606         state->file_refs = state->ios_left - 1;
2607         return state->file;
2608 }
2609
2610 static bool io_bdev_nowait(struct block_device *bdev)
2611 {
2612         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2613 }
2614
2615 /*
2616  * If we tracked the file through the SCM inflight mechanism, we could support
2617  * any file. For now, just ensure that anything potentially problematic is done
2618  * inline.
2619  */
2620 static bool io_file_supports_async(struct file *file, int rw)
2621 {
2622         umode_t mode = file_inode(file)->i_mode;
2623
2624         if (S_ISBLK(mode)) {
2625                 if (IS_ENABLED(CONFIG_BLOCK) &&
2626                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2627                         return true;
2628                 return false;
2629         }
2630         if (S_ISCHR(mode) || S_ISSOCK(mode))
2631                 return true;
2632         if (S_ISREG(mode)) {
2633                 if (IS_ENABLED(CONFIG_BLOCK) &&
2634                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2635                     file->f_op != &io_uring_fops)
2636                         return true;
2637                 return false;
2638         }
2639
2640         /* any ->read/write should understand O_NONBLOCK */
2641         if (file->f_flags & O_NONBLOCK)
2642                 return true;
2643
2644         if (!(file->f_mode & FMODE_NOWAIT))
2645                 return false;
2646
2647         if (rw == READ)
2648                 return file->f_op->read_iter != NULL;
2649
2650         return file->f_op->write_iter != NULL;
2651 }
2652
2653 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2654 {
2655         struct io_ring_ctx *ctx = req->ctx;
2656         struct kiocb *kiocb = &req->rw.kiocb;
2657         struct file *file = req->file;
2658         unsigned ioprio;
2659         int ret;
2660
2661         if (S_ISREG(file_inode(file)->i_mode))
2662                 req->flags |= REQ_F_ISREG;
2663
2664         kiocb->ki_pos = READ_ONCE(sqe->off);
2665         if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
2666                 req->flags |= REQ_F_CUR_POS;
2667                 kiocb->ki_pos = file->f_pos;
2668         }
2669         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2670         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2671         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2672         if (unlikely(ret))
2673                 return ret;
2674
2675         /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
2676         if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
2677                 req->flags |= REQ_F_NOWAIT;
2678
2679         ioprio = READ_ONCE(sqe->ioprio);
2680         if (ioprio) {
2681                 ret = ioprio_check_cap(ioprio);
2682                 if (ret)
2683                         return ret;
2684
2685                 kiocb->ki_ioprio = ioprio;
2686         } else
2687                 kiocb->ki_ioprio = get_current_ioprio();
2688
2689         if (ctx->flags & IORING_SETUP_IOPOLL) {
2690                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2691                     !kiocb->ki_filp->f_op->iopoll)
2692                         return -EOPNOTSUPP;
2693
2694                 kiocb->ki_flags |= IOCB_HIPRI;
2695                 kiocb->ki_complete = io_complete_rw_iopoll;
2696                 req->iopoll_completed = 0;
2697         } else {
2698                 if (kiocb->ki_flags & IOCB_HIPRI)
2699                         return -EINVAL;
2700                 kiocb->ki_complete = io_complete_rw;
2701         }
2702
2703         req->rw.addr = READ_ONCE(sqe->addr);
2704         req->rw.len = READ_ONCE(sqe->len);
2705         req->buf_index = READ_ONCE(sqe->buf_index);
2706         return 0;
2707 }
2708
2709 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2710 {
2711         switch (ret) {
2712         case -EIOCBQUEUED:
2713                 break;
2714         case -ERESTARTSYS:
2715         case -ERESTARTNOINTR:
2716         case -ERESTARTNOHAND:
2717         case -ERESTART_RESTARTBLOCK:
2718                 /*
2719                  * We can't just restart the syscall, since previously
2720                  * submitted sqes may already be in progress. Just fail this
2721                  * IO with EINTR.
2722                  */
2723                 ret = -EINTR;
2724                 fallthrough;
2725         default:
2726                 kiocb->ki_complete(kiocb, ret, 0);
2727         }
2728 }
2729
2730 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2731                        unsigned int issue_flags)
2732 {
2733         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2734         struct io_async_rw *io = req->async_data;
2735
2736         /* add previously done IO, if any */
2737         if (io && io->bytes_done > 0) {
2738                 if (ret < 0)
2739                         ret = io->bytes_done;
2740                 else
2741                         ret += io->bytes_done;
2742         }
2743
2744         if (req->flags & REQ_F_CUR_POS)
2745                 req->file->f_pos = kiocb->ki_pos;
2746         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2747                 __io_complete_rw(req, ret, 0, issue_flags);
2748         else
2749                 io_rw_done(kiocb, ret);
2750 }
2751
2752 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
2753 {
2754         struct io_ring_ctx *ctx = req->ctx;
2755         size_t len = req->rw.len;
2756         struct io_mapped_ubuf *imu;
2757         u16 index, buf_index = req->buf_index;
2758         size_t offset;
2759         u64 buf_addr;
2760
2761         if (unlikely(buf_index >= ctx->nr_user_bufs))
2762                 return -EFAULT;
2763         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2764         imu = &ctx->user_bufs[index];
2765         buf_addr = req->rw.addr;
2766
2767         /* overflow */
2768         if (buf_addr + len < buf_addr)
2769                 return -EFAULT;
2770         /* not inside the mapped region */
2771         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2772                 return -EFAULT;
2773
2774         /*
2775          * May not be a start of buffer, set size appropriately
2776          * and advance us to the beginning.
2777          */
2778         offset = buf_addr - imu->ubuf;
2779         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2780
2781         if (offset) {
2782                 /*
2783                  * Don't use iov_iter_advance() here, as it's really slow for
2784                  * using the latter parts of a big fixed buffer - it iterates
2785                  * over each segment manually. We can cheat a bit here, because
2786                  * we know that:
2787                  *
2788                  * 1) it's a BVEC iter, we set it up
2789                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
2790                  *    first and last bvec
2791                  *
2792                  * So just find our index, and adjust the iterator afterwards.
2793                  * If the offset is within the first bvec (or the whole first
2794                  * bvec, just use iov_iter_advance(). This makes it easier
2795                  * since we can just skip the first segment, which may not
2796                  * be PAGE_SIZE aligned.
2797                  */
2798                 const struct bio_vec *bvec = imu->bvec;
2799
2800                 if (offset <= bvec->bv_len) {
2801                         iov_iter_advance(iter, offset);
2802                 } else {
2803                         unsigned long seg_skip;
2804
2805                         /* skip first vec */
2806                         offset -= bvec->bv_len;
2807                         seg_skip = 1 + (offset >> PAGE_SHIFT);
2808
2809                         iter->bvec = bvec + seg_skip;
2810                         iter->nr_segs -= seg_skip;
2811                         iter->count -= bvec->bv_len + offset;
2812                         iter->iov_offset = offset & ~PAGE_MASK;
2813                 }
2814         }
2815
2816         return 0;
2817 }
2818
2819 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2820 {
2821         if (needs_lock)
2822                 mutex_unlock(&ctx->uring_lock);
2823 }
2824
2825 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2826 {
2827         /*
2828          * "Normal" inline submissions always hold the uring_lock, since we
2829          * grab it from the system call. Same is true for the SQPOLL offload.
2830          * The only exception is when we've detached the request and issue it
2831          * from an async worker thread, grab the lock for that case.
2832          */
2833         if (needs_lock)
2834                 mutex_lock(&ctx->uring_lock);
2835 }
2836
2837 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2838                                           int bgid, struct io_buffer *kbuf,
2839                                           bool needs_lock)
2840 {
2841         struct io_buffer *head;
2842
2843         if (req->flags & REQ_F_BUFFER_SELECTED)
2844                 return kbuf;
2845
2846         io_ring_submit_lock(req->ctx, needs_lock);
2847
2848         lockdep_assert_held(&req->ctx->uring_lock);
2849
2850         head = xa_load(&req->ctx->io_buffers, bgid);
2851         if (head) {
2852                 if (!list_empty(&head->list)) {
2853                         kbuf = list_last_entry(&head->list, struct io_buffer,
2854                                                         list);
2855                         list_del(&kbuf->list);
2856                 } else {
2857                         kbuf = head;
2858                         xa_erase(&req->ctx->io_buffers, bgid);
2859                 }
2860                 if (*len > kbuf->len)
2861                         *len = kbuf->len;
2862         } else {
2863                 kbuf = ERR_PTR(-ENOBUFS);
2864         }
2865
2866         io_ring_submit_unlock(req->ctx, needs_lock);
2867
2868         return kbuf;
2869 }
2870
2871 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2872                                         bool needs_lock)
2873 {
2874         struct io_buffer *kbuf;
2875         u16 bgid;
2876
2877         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2878         bgid = req->buf_index;
2879         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2880         if (IS_ERR(kbuf))
2881                 return kbuf;
2882         req->rw.addr = (u64) (unsigned long) kbuf;
2883         req->flags |= REQ_F_BUFFER_SELECTED;
2884         return u64_to_user_ptr(kbuf->addr);
2885 }
2886
#ifdef CONFIG_COMPAT
/*
 * Compat variant of the single-iovec buffer-select import: read iov_len from
 * the user's compat_iovec, select a provided buffer for that length, and
 * fill in iov[0].
 *
 * Returns 0 on success, -EFAULT if the user iovec can't be read, -EINVAL for
 * a negative length, or the error from buffer selection.
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
                                bool needs_lock)
{
        struct compat_iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
        compat_ssize_t clen;
        void __user *buf;
        ssize_t len;

        if (!access_ok(uiov, sizeof(*uiov)) ||
            __get_user(clen, &uiov->iov_len))
                return -EFAULT;
        if (clen < 0)
                return -EINVAL;

        len = clen;
        buf = io_rw_buffer_select(req, &len, needs_lock);
        if (IS_ERR(buf))
                return PTR_ERR(buf);

        iov[0].iov_len = (compat_size_t) len;
        iov[0].iov_base = buf;
        return 0;
}
#endif
2913
2914 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2915                                       bool needs_lock)
2916 {
2917         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2918         void __user *buf;
2919         ssize_t len;
2920
2921         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2922                 return -EFAULT;
2923
2924         len = iov[0].iov_len;
2925         if (len < 0)
2926                 return -EINVAL;
2927         buf = io_rw_buffer_select(req, &len, needs_lock);
2928         if (IS_ERR(buf))
2929                 return PTR_ERR(buf);
2930         iov[0].iov_base = buf;
2931         iov[0].iov_len = len;
2932         return 0;
2933 }
2934
2935 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2936                                     bool needs_lock)
2937 {
2938         if (req->flags & REQ_F_BUFFER_SELECTED) {
2939                 struct io_buffer *kbuf;
2940
2941                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2942                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2943                 iov[0].iov_len = kbuf->len;
2944                 return 0;
2945         }
2946         if (req->rw.len != 1)
2947                 return -EINVAL;
2948
2949 #ifdef CONFIG_COMPAT
2950         if (req->ctx->compat)
2951                 return io_compat_import(req, iov, needs_lock);
2952 #endif
2953
2954         return __io_iov_buffer_select(req, iov, needs_lock);
2955 }
2956
2957 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
2958                            struct iov_iter *iter, bool needs_lock)
2959 {
2960         void __user *buf = u64_to_user_ptr(req->rw.addr);
2961         size_t sqe_len = req->rw.len;
2962         u8 opcode = req->opcode;
2963         ssize_t ret;
2964
2965         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2966                 *iovec = NULL;
2967                 return io_import_fixed(req, rw, iter);
2968         }
2969
2970         /* buffer index only valid with fixed read/write, or buffer select  */
2971         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2972                 return -EINVAL;
2973
2974         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2975                 if (req->flags & REQ_F_BUFFER_SELECT) {
2976                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2977                         if (IS_ERR(buf))
2978                                 return PTR_ERR(buf);
2979                         req->rw.len = sqe_len;
2980                 }
2981
2982                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2983                 *iovec = NULL;
2984                 return ret;
2985         }
2986
2987         if (req->flags & REQ_F_BUFFER_SELECT) {
2988                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2989                 if (!ret)
2990                         iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
2991                 *iovec = NULL;
2992                 return ret;
2993         }
2994
2995         return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
2996                               req->ctx->compat);
2997 }
2998
2999 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3000 {
3001         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3002 }
3003
3004 /*
3005  * For files that don't have ->read_iter() and ->write_iter(), handle them
3006  * by looping over ->read() or ->write() manually.
3007  */
3008 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3009 {
3010         struct kiocb *kiocb = &req->rw.kiocb;
3011         struct file *file = req->file;
3012         ssize_t ret = 0;
3013
3014         /*
3015          * Don't support polled IO through this interface, and we can't
3016          * support non-blocking either. For the latter, this just causes
3017          * the kiocb to be handled from an async context.
3018          */
3019         if (kiocb->ki_flags & IOCB_HIPRI)
3020                 return -EOPNOTSUPP;
3021         if (kiocb->ki_flags & IOCB_NOWAIT)
3022                 return -EAGAIN;
3023
3024         while (iov_iter_count(iter)) {
3025                 struct iovec iovec;
3026                 ssize_t nr;
3027
3028                 if (!iov_iter_is_bvec(iter)) {
3029                         iovec = iov_iter_iovec(iter);
3030                 } else {
3031                         iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3032                         iovec.iov_len = req->rw.len;
3033                 }
3034
3035                 if (rw == READ) {
3036                         nr = file->f_op->read(file, iovec.iov_base,
3037                                               iovec.iov_len, io_kiocb_ppos(kiocb));
3038                 } else {
3039                         nr = file->f_op->write(file, iovec.iov_base,
3040                                                iovec.iov_len, io_kiocb_ppos(kiocb));
3041                 }
3042
3043                 if (nr < 0) {
3044                         if (!ret)
3045                                 ret = nr;
3046                         break;
3047                 }
3048                 ret += nr;
3049                 if (nr != iovec.iov_len)
3050                         break;
3051                 req->rw.len -= nr;
3052                 req->rw.addr += nr;
3053                 iov_iter_advance(iter, nr);
3054         }
3055
3056         return ret;
3057 }
3058
3059 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3060                           const struct iovec *fast_iov, struct iov_iter *iter)
3061 {
3062         struct io_async_rw *rw = req->async_data;
3063
3064         memcpy(&rw->iter, iter, sizeof(*iter));
3065         rw->free_iovec = iovec;
3066         rw->bytes_done = 0;
3067         /* can only be fixed buffers, no need to do anything */
3068         if (iov_iter_is_bvec(iter))
3069                 return;
3070         if (!iovec) {
3071                 unsigned iov_off = 0;
3072
3073                 rw->iter.iov = rw->fast_iov;
3074                 if (iter->iov != fast_iov) {
3075                         iov_off = iter->iov - fast_iov;
3076                         rw->iter.iov += iov_off;
3077                 }
3078                 if (rw->fast_iov != fast_iov)
3079                         memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3080                                sizeof(struct iovec) * iter->nr_segs);
3081         } else {
3082                 req->flags |= REQ_F_NEED_CLEANUP;
3083         }
3084 }
3085
3086 static inline int __io_alloc_async_data(struct io_kiocb *req)
3087 {
3088         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3089         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3090         return req->async_data == NULL;
3091 }
3092
3093 static int io_alloc_async_data(struct io_kiocb *req)
3094 {
3095         if (!io_op_defs[req->opcode].needs_async_data)
3096                 return 0;
3097
3098         return  __io_alloc_async_data(req);
3099 }
3100
3101 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3102                              const struct iovec *fast_iov,
3103                              struct iov_iter *iter, bool force)
3104 {
3105         if (!force && !io_op_defs[req->opcode].needs_async_data)
3106                 return 0;
3107         if (!req->async_data) {
3108                 if (__io_alloc_async_data(req)) {
3109                         kfree(iovec);
3110                         return -ENOMEM;
3111                 }
3112
3113                 io_req_map_rw(req, iovec, fast_iov, iter);
3114         }
3115         return 0;
3116 }
3117
3118 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3119 {
3120         struct io_async_rw *iorw = req->async_data;
3121         struct iovec *iov = iorw->fast_iov;
3122         int ret;
3123
3124         ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3125         if (unlikely(ret < 0))
3126                 return ret;
3127
3128         iorw->bytes_done = 0;
3129         iorw->free_iovec = iov;
3130         if (iov)
3131                 req->flags |= REQ_F_NEED_CLEANUP;
3132         return 0;
3133 }
3134
3135 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3136 {
3137         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3138                 return -EBADF;
3139         return io_prep_rw(req, sqe);
3140 }
3141
3142 /*
3143  * This is our waitqueue callback handler, registered through lock_page_async()
3144  * when we initially tried to do the IO with the iocb armed our waitqueue.
3145  * This gets called when the page is unlocked, and we generally expect that to
3146  * happen when the page IO is completed and the page is now uptodate. This will
3147  * queue a task_work based retry of the operation, attempting to copy the data
3148  * again. If the latter fails because the page was NOT uptodate, then we will
3149  * do a thread based blocking retry of the operation. That's the unexpected
3150  * slow path.
3151  */
3152 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3153                              int sync, void *arg)
3154 {
3155         struct wait_page_queue *wpq;
3156         struct io_kiocb *req = wait->private;
3157         struct wait_page_key *key = arg;
3158
3159         wpq = container_of(wait, struct wait_page_queue, wait);
3160
3161         if (!wake_page_match(wpq, key))
3162                 return 0;
3163
3164         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3165         list_del_init(&wait->entry);
3166
3167         /* submit ref gets dropped, acquire a new one */
3168         refcount_inc(&req->refs);
3169         io_req_task_queue(req);
3170         return 1;
3171 }
3172
3173 /*
3174  * This controls whether a given IO request should be armed for async page
3175  * based retry. If we return false here, the request is handed to the async
3176  * worker threads for retry. If we're doing buffered reads on a regular file,
3177  * we prepare a private wait_page_queue entry and retry the operation. This
3178  * will either succeed because the page is now uptodate and unlocked, or it
3179  * will register a callback when the page is unlocked at IO completion. Through
3180  * that callback, io_uring uses task_work to setup a retry of the operation.
3181  * That retry will attempt the buffered read again. The retry will generally
3182  * succeed, or in rare cases where it fails, we then fall back to using the
3183  * async worker threads for a blocking retry.
3184  */
3185 static bool io_rw_should_retry(struct io_kiocb *req)
3186 {
3187         struct io_async_rw *rw = req->async_data;
3188         struct wait_page_queue *wait = &rw->wpq;
3189         struct kiocb *kiocb = &req->rw.kiocb;
3190
3191         /* never retry for NOWAIT, we just complete with -EAGAIN */
3192         if (req->flags & REQ_F_NOWAIT)
3193                 return false;
3194
3195         /* Only for buffered IO */
3196         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3197                 return false;
3198
3199         /*
3200          * just use poll if we can, and don't attempt if the fs doesn't
3201          * support callback based unlocks
3202          */
3203         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3204                 return false;
3205
3206         wait->wait.func = io_async_buf_func;
3207         wait->wait.private = req;
3208         wait->wait.flags = 0;
3209         INIT_LIST_HEAD(&wait->wait.entry);
3210         kiocb->ki_flags |= IOCB_WAITQ;
3211         kiocb->ki_flags &= ~IOCB_NOWAIT;
3212         kiocb->ki_waitq = wait;
3213         return true;
3214 }
3215
3216 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3217 {
3218         if (req->file->f_op->read_iter)
3219                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3220         else if (req->file->f_op->read)
3221                 return loop_rw_iter(READ, req, iter);
3222         else
3223                 return -EINVAL;
3224 }
3225
/*
 * Issue a read for @req.
 *
 * @issue_flags: with IO_URING_F_NONBLOCK the attempt is made with IOCB_NOWAIT
 *               set on the kiocb, otherwise that flag is cleared.
 *
 * Returns 0 once the request was completed through kiocb_done() or left in
 * flight (the file returned -EIOCBQUEUED). Returns -EAGAIN, after
 * io_setup_async_rw() succeeded, when the read could not be done from here;
 * presumably the caller then retries from another context. Any other
 * negative value is an error from iovec import, rw_verify_area() or the
 * async data setup.
 */
3226 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3227 {
3228         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3229         struct kiocb *kiocb = &req->rw.kiocb;
3230         struct iov_iter __iter, *iter = &__iter;
3231         struct io_async_rw *rw = req->async_data;
3232         ssize_t io_size, ret, ret2;
3233         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3234
             /* reuse the iterator saved in async data if any, else import it */
3235         if (rw) {
3236                 iter = &rw->iter;
3237                 iovec = NULL;
3238         } else {
3239                 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3240                 if (ret < 0)
3241                         return ret;
3242         }
             /* req->result holds the expected byte count for the completion side */
3243         io_size = iov_iter_count(iter);
3244         req->result = io_size;
3245
3246         /* Ensure we clear previously set non-block flag */
3247         if (!force_nonblock)
3248                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3249         else
3250                 kiocb->ki_flags |= IOCB_NOWAIT;
3251
3252         /* If the file doesn't support async, just async punt */
3253         if (force_nonblock && !io_file_supports_async(req->file, READ)) {
3254                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3255                 return ret ?: -EAGAIN;
3256         }
3257
3258         ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
3259         if (unlikely(ret)) {
3260                 kfree(iovec);
3261                 return ret;
3262         }
3263
3264         ret = io_iter_do_read(req, iter);
3265
3266         if (ret == -EIOCBQUEUED) {
                     /* IO is in flight; restore a saved iter to its full size */
3267                 if (req->async_data)
3268                         iov_iter_revert(iter, io_size - iov_iter_count(iter));
3269                 goto out_free;
3270         } else if (ret == -EAGAIN) {
3271                 /* IOPOLL retry should happen for io-wq threads */
3272                 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3273                         goto done;
3274                 /* no retry on NONBLOCK nor RWF_NOWAIT */
3275                 if (req->flags & REQ_F_NOWAIT)
3276                         goto done;
3277                 /* some cases will consume bytes even on error returns */
3278                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
                     /* nothing counts as done yet, so the retry loop adds 0 bytes */
3279                 ret = 0;
3280         } else if (ret <= 0 || ret == io_size || !force_nonblock ||
3281                    (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
3282                 /* read all, failed, already did sync or don't want to retry */
3283                 goto done;
3284         }
3285
             /* -EAGAIN or short read: save iovec/iter in async data for retries */
3286         ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3287         if (ret2)
3288                 return ret2;
3289
             /* iovec was handed to (or freed by) io_setup_async_rw() above */
3290         iovec = NULL;
3291         rw = req->async_data;
3292         /* now use our persistent iterator, if we aren't already */
3293         iter = &rw->iter;
3294
3295         do {
                     /* account the bytes the previous attempt transferred */
3296                 io_size -= ret;
3297                 rw->bytes_done += ret;
3298                 /* if we can retry, do so with the callbacks armed */
3299                 if (!io_rw_should_retry(req)) {
3300                         kiocb->ki_flags &= ~IOCB_WAITQ;
3301                         return -EAGAIN;
3302                 }
3303
3304                 /*
3305                  * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3306                  * we get -EIOCBQUEUED, then we'll get a notification when the
3307                  * desired page gets unlocked. We can also get a partial read
3308                  * here, and if we do, then just retry at the new offset.
3309                  */
3310                 ret = io_iter_do_read(req, iter);
3311                 if (ret == -EIOCBQUEUED)
3312                         return 0;
3313                 /* we got some bytes, but not all. retry. */
3314                 kiocb->ki_flags &= ~IOCB_WAITQ;
3315         } while (ret > 0 && ret < io_size);
3316 done:
3317         kiocb_done(kiocb, ret, issue_flags);
3318 out_free:
3319         /* it's faster to check here then delegate to kfree */
3320         if (iovec)
3321                 kfree(iovec);
3322         return 0;
3323 }
3324
3325 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3326 {
3327         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3328                 return -EBADF;
3329         return io_prep_rw(req, sqe);
3330 }
3331
3332 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3333 {
3334         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3335         struct kiocb *kiocb = &req->rw.kiocb;
3336         struct iov_iter __iter, *iter = &__iter;
3337         struct io_async_rw *rw = req->async_data;
3338         ssize_t ret, ret2, io_size;
3339         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3340
3341         if (rw) {
3342                 iter = &rw->iter;
3343                 iovec = NULL;
3344         } else {
3345                 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3346                 if (ret < 0)
3347                         return ret;
3348         }
3349         io_size = iov_iter_count(iter);
3350         req->result = io_size;
3351
3352         /* Ensure we clear previously set non-block flag */
3353         if (!force_nonblock)
3354                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3355         else
3356                 kiocb->ki_flags |= IOCB_NOWAIT;
3357
3358         /* If the file doesn't support async, just async punt */
3359         if (force_nonblock && !io_file_supports_async(req->file, WRITE))
3360                 goto copy_iov;
3361
3362         /* file path doesn't support NOWAIT for non-direct_IO */
3363         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3364             (req->flags & REQ_F_ISREG))
3365                 goto copy_iov;
3366
3367         ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
3368         if (unlikely(ret))
3369                 goto out_free;
3370
3371         /*
3372          * Open-code file_start_write here to grab freeze protection,
3373          * which will be released by another thread in
3374          * io_complete_rw().  Fool lockdep by telling it the lock got
3375          * released so that it doesn't complain about the held lock when
3376          * we return to userspace.
3377          */
3378         if (req->flags & REQ_F_ISREG) {
3379                 sb_start_write(file_inode(req->file)->i_sb);
3380                 __sb_writers_release(file_inode(req->file)->i_sb,
3381                                         SB_FREEZE_WRITE);
3382         }
3383         kiocb->ki_flags |= IOCB_WRITE;
3384
3385         if (req->file->f_op->write_iter)
3386                 ret2 = call_write_iter(req->file, kiocb, iter);
3387         else if (req->file->f_op->write)
3388                 ret2 = loop_rw_iter(WRITE, req, iter);
3389         else
3390                 ret2 = -EINVAL;
3391
3392         /*
3393          * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3394          * retry them without IOCB_NOWAIT.
3395          */
3396         if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3397                 ret2 = -EAGAIN;
3398         /* no retry on NONBLOCK nor RWF_NOWAIT */
3399         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3400                 goto done;
3401         if (ret2 == -EIOCBQUEUED && req->async_data)
3402                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3403         if (!force_nonblock || ret2 != -EAGAIN) {
3404                 /* IOPOLL retry should happen for io-wq threads */
3405                 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3406                         goto copy_iov;
3407 done:
3408                 kiocb_done(kiocb, ret2, issue_flags);
3409         } else {
3410 copy_iov:
3411                 /* some cases will consume bytes even on error returns */
3412                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3413                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3414                 return ret ?: -EAGAIN;
3415         }
3416 out_free:
3417         /* it's reportedly faster than delegating the null check to kfree() */
3418         if (iovec)
3419                 kfree(iovec);
3420         return ret;
3421 }
3422
3423 static int io_renameat_prep(struct io_kiocb *req,
3424                             const struct io_uring_sqe *sqe)
3425 {
3426         struct io_rename *ren = &req->rename;
3427         const char __user *oldf, *newf;
3428
3429         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3430                 return -EBADF;
3431
3432         ren->old_dfd = READ_ONCE(sqe->fd);
3433         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3434         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3435         ren->new_dfd = READ_ONCE(sqe->len);
3436         ren->flags = READ_ONCE(sqe->rename_flags);
3437
3438         ren->oldpath = getname(oldf);
3439         if (IS_ERR(ren->oldpath))
3440                 return PTR_ERR(ren->oldpath);
3441
3442         ren->newpath = getname(newf);
3443         if (IS_ERR(ren->newpath)) {
3444                 putname(ren->oldpath);
3445                 return PTR_ERR(ren->newpath);
3446         }
3447
3448         req->flags |= REQ_F_NEED_CLEANUP;
3449         return 0;
3450 }
3451
3452 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3453 {
3454         struct io_rename *ren = &req->rename;
3455         int ret;
3456
3457         if (issue_flags & IO_URING_F_NONBLOCK)
3458                 return -EAGAIN;
3459
3460         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3461                                 ren->newpath, ren->flags);
3462
3463         req->flags &= ~REQ_F_NEED_CLEANUP;
3464         if (ret < 0)
3465                 req_set_fail_links(req);
3466         io_req_complete(req, ret);
3467         return 0;
3468 }
3469
3470 static int io_unlinkat_prep(struct io_kiocb *req,
3471                             const struct io_uring_sqe *sqe)
3472 {
3473         struct io_unlink *un = &req->unlink;
3474         const char __user *fname;
3475
3476         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3477                 return -EBADF;
3478
3479         un->dfd = READ_ONCE(sqe->fd);
3480
3481         un->flags = READ_ONCE(sqe->unlink_flags);
3482         if (un->flags & ~AT_REMOVEDIR)
3483                 return -EINVAL;
3484
3485         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3486         un->filename = getname(fname);
3487         if (IS_ERR(un->filename))
3488                 return PTR_ERR(un->filename);
3489
3490         req->flags |= REQ_F_NEED_CLEANUP;
3491         return 0;
3492 }
3493
3494 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3495 {
3496         struct io_unlink *un = &req->unlink;
3497         int ret;
3498
3499         if (issue_flags & IO_URING_F_NONBLOCK)
3500                 return -EAGAIN;
3501
3502         if (un->flags & AT_REMOVEDIR)
3503                 ret = do_rmdir(un->dfd, un->filename);
3504         else
3505                 ret = do_unlinkat(un->dfd, un->filename);
3506
3507         req->flags &= ~REQ_F_NEED_CLEANUP;
3508         if (ret < 0)
3509                 req_set_fail_links(req);
3510         io_req_complete(req, ret);
3511         return 0;
3512 }
3513
3514 static int io_shutdown_prep(struct io_kiocb *req,
3515                             const struct io_uring_sqe *sqe)
3516 {
3517 #if defined(CONFIG_NET)
3518         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3519                 return -EINVAL;
3520         if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3521             sqe->buf_index)
3522                 return -EINVAL;
3523
3524         req->shutdown.how = READ_ONCE(sqe->len);
3525         return 0;
3526 #else
3527         return -EOPNOTSUPP;
3528 #endif
3529 }
3530
3531 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3532 {
3533 #if defined(CONFIG_NET)
3534         struct socket *sock;
3535         int ret;
3536
3537         if (issue_flags & IO_URING_F_NONBLOCK)
3538                 return -EAGAIN;
3539
3540         sock = sock_from_file(req->file);
3541         if (unlikely(!sock))
3542                 return -ENOTSOCK;
3543
3544         ret = __sys_shutdown_sock(sock, req->shutdown.how);
3545         if (ret < 0)
3546                 req_set_fail_links(req);
3547         io_req_complete(req, ret);
3548         return 0;
3549 #else
3550         return -EOPNOTSUPP;
3551 #endif
3552 }
3553
3554 static int __io_splice_prep(struct io_kiocb *req,
3555                             const struct io_uring_sqe *sqe)
3556 {
3557         struct io_splice* sp = &req->splice;
3558         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3559
3560         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3561                 return -EINVAL;
3562
3563         sp->file_in = NULL;
3564         sp->len = READ_ONCE(sqe->len);
3565         sp->flags = READ_ONCE(sqe->splice_flags);
3566
3567         if (unlikely(sp->flags & ~valid_flags))
3568                 return -EINVAL;
3569
3570         sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3571                                   (sp->flags & SPLICE_F_FD_IN_FIXED));
3572         if (!sp->file_in)
3573                 return -EBADF;
3574         req->flags |= REQ_F_NEED_CLEANUP;
3575
3576         if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3577                 /*
3578                  * Splice operation will be punted aync, and here need to
3579                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
3580                  */
3581                 req->work.flags |= IO_WQ_WORK_UNBOUND;
3582         }
3583
3584         return 0;
3585 }
3586
3587 static int io_tee_prep(struct io_kiocb *req,
3588                        const struct io_uring_sqe *sqe)
3589 {
3590         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3591                 return -EINVAL;
3592         return __io_splice_prep(req, sqe);
3593 }
3594
3595 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3596 {
3597         struct io_splice *sp = &req->splice;
3598         struct file *in = sp->file_in;
3599         struct file *out = sp->file_out;
3600         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3601         long ret = 0;
3602
3603         if (issue_flags & IO_URING_F_NONBLOCK)
3604                 return -EAGAIN;
3605         if (sp->len)
3606                 ret = do_tee(in, out, sp->len, flags);
3607
3608         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3609         req->flags &= ~REQ_F_NEED_CLEANUP;
3610
3611         if (ret != sp->len)
3612                 req_set_fail_links(req);
3613         io_req_complete(req, ret);
3614         return 0;
3615 }
3616
3617 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3618 {
3619         struct io_splice* sp = &req->splice;
3620
3621         sp->off_in = READ_ONCE(sqe->splice_off_in);
3622         sp->off_out = READ_ONCE(sqe->off);
3623         return __io_splice_prep(req, sqe);
3624 }
3625
3626 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
3627 {
3628         struct io_splice *sp = &req->splice;
3629         struct file *in = sp->file_in;
3630         struct file *out = sp->file_out;
3631         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3632         loff_t *poff_in, *poff_out;
3633         long ret = 0;
3634
3635         if (issue_flags & IO_URING_F_NONBLOCK)
3636                 return -EAGAIN;
3637
3638         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3639         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3640
3641         if (sp->len)
3642                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3643
3644         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3645         req->flags &= ~REQ_F_NEED_CLEANUP;
3646
3647         if (ret != sp->len)
3648                 req_set_fail_links(req);
3649         io_req_complete(req, ret);
3650         return 0;
3651 }
3652
3653 /*
3654  * IORING_OP_NOP just posts a completion event, nothing else.
3655  */
3656 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
3657 {
3658         struct io_ring_ctx *ctx = req->ctx;
3659
3660         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3661                 return -EINVAL;
3662
3663         __io_req_complete(req, issue_flags, 0, 0);
3664         return 0;
3665 }
3666
3667 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3668 {
3669         struct io_ring_ctx *ctx = req->ctx;
3670
3671         if (!req->file)
3672                 return -EBADF;
3673
3674         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3675                 return -EINVAL;
3676         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3677                 return -EINVAL;
3678
3679         req->sync.flags = READ_ONCE(sqe->fsync_flags);
3680         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3681                 return -EINVAL;
3682
3683         req->sync.off = READ_ONCE(sqe->off);
3684         req->sync.len = READ_ONCE(sqe->len);
3685         return 0;
3686 }
3687
3688 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
3689 {
3690         loff_t end = req->sync.off + req->sync.len;
3691         int ret;
3692
3693         /* fsync always requires a blocking context */
3694         if (issue_flags & IO_URING_F_NONBLOCK)
3695                 return -EAGAIN;
3696
3697         ret = vfs_fsync_range(req->file, req->sync.off,
3698                                 end > 0 ? end : LLONG_MAX,
3699                                 req->sync.flags & IORING_FSYNC_DATASYNC);
3700         if (ret < 0)
3701                 req_set_fail_links(req);
3702         io_req_complete(req, ret);
3703         return 0;
3704 }
3705
3706 static int io_fallocate_prep(struct io_kiocb *req,
3707                              const struct io_uring_sqe *sqe)
3708 {
3709         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3710                 return -EINVAL;
3711         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3712                 return -EINVAL;
3713
3714         req->sync.off = READ_ONCE(sqe->off);
3715         req->sync.len = READ_ONCE(sqe->addr);
3716         req->sync.mode = READ_ONCE(sqe->len);
3717         return 0;
3718 }
3719
3720 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
3721 {
3722         int ret;
3723
3724         /* fallocate always requiring blocking context */
3725         if (issue_flags & IO_URING_F_NONBLOCK)
3726                 return -EAGAIN;
3727         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3728                                 req->sync.len);
3729         if (ret < 0)
3730                 req_set_fail_links(req);
3731         io_req_complete(req, ret);
3732         return 0;
3733 }
3734
3735 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3736 {
3737         const char __user *fname;
3738         int ret;
3739
3740         if (unlikely(sqe->ioprio || sqe->buf_index))
3741                 return -EINVAL;
3742         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3743                 return -EBADF;
3744
3745         /* open.how should be already initialised */
3746         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3747                 req->open.how.flags |= O_LARGEFILE;
3748
3749         req->open.dfd = READ_ONCE(sqe->fd);
3750         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3751         req->open.filename = getname(fname);
3752         if (IS_ERR(req->open.filename)) {
3753                 ret = PTR_ERR(req->open.filename);
3754                 req->open.filename = NULL;
3755                 return ret;
3756         }
3757         req->open.nofile = rlimit(RLIMIT_NOFILE);
3758         req->flags |= REQ_F_NEED_CLEANUP;
3759         return 0;
3760 }
3761
3762 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3763 {
3764         u64 flags, mode;
3765
3766         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3767                 return -EINVAL;
3768         mode = READ_ONCE(sqe->len);
3769         flags = READ_ONCE(sqe->open_flags);
3770         req->open.how = build_open_how(flags, mode);
3771         return __io_openat_prep(req, sqe);
3772 }
3773
3774 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3775 {
3776         struct open_how __user *how;
3777         size_t len;
3778         int ret;
3779
3780         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3781                 return -EINVAL;
3782         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3783         len = READ_ONCE(sqe->len);
3784         if (len < OPEN_HOW_SIZE_VER0)
3785                 return -EINVAL;
3786
3787         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3788                                         len);
3789         if (ret)
3790                 return ret;
3791
3792         return __io_openat_prep(req, sqe);
3793 }
3794
3795 static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
3796 {
3797         struct open_flags op;
3798         struct file *file;
3799         bool nonblock_set;
3800         bool resolve_nonblock;
3801         int ret;
3802
3803         ret = build_open_flags(&req->open.how, &op);
3804         if (ret)
3805                 goto err;
3806         nonblock_set = op.open_flag & O_NONBLOCK;
3807         resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
3808         if (issue_flags & IO_URING_F_NONBLOCK) {
3809                 /*
3810                  * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
3811                  * it'll always -EAGAIN
3812                  */
3813                 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
3814                         return -EAGAIN;
3815                 op.lookup_flags |= LOOKUP_CACHED;
3816                 op.open_flag |= O_NONBLOCK;
3817         }
3818
3819         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3820         if (ret < 0)
3821                 goto err;
3822
3823         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3824         /* only retry if RESOLVE_CACHED wasn't already set by application */
3825         if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
3826             file == ERR_PTR(-EAGAIN)) {
3827                 /*
3828                  * We could hang on to this 'fd', but seems like marginal
3829                  * gain for something that is now known to be a slower path.
3830                  * So just put it, and we'll get a new one when we retry.
3831                  */
3832                 put_unused_fd(ret);
3833                 return -EAGAIN;
3834         }
3835
3836         if (IS_ERR(file)) {
3837                 put_unused_fd(ret);
3838                 ret = PTR_ERR(file);
3839         } else {
3840                 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
3841                         file->f_flags &= ~O_NONBLOCK;
3842                 fsnotify_open(file);
3843                 fd_install(ret, file);
3844         }
3845 err:
3846         putname(req->open.filename);
3847         req->flags &= ~REQ_F_NEED_CLEANUP;
3848         if (ret < 0)
3849                 req_set_fail_links(req);
3850         io_req_complete(req, ret);
3851         return 0;
3852 }
3853
/* openat shares its issue path with openat2; only the prep step differs. */
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret = io_openat2(req, issue_flags);

	return ret;
}
3858
3859 static int io_remove_buffers_prep(struct io_kiocb *req,
3860                                   const struct io_uring_sqe *sqe)
3861 {
3862         struct io_provide_buf *p = &req->pbuf;
3863         u64 tmp;
3864
3865         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3866                 return -EINVAL;
3867
3868         tmp = READ_ONCE(sqe->fd);
3869         if (!tmp || tmp > USHRT_MAX)
3870                 return -EINVAL;
3871
3872         memset(p, 0, sizeof(*p));
3873         p->nbufs = tmp;
3874         p->bgid = READ_ONCE(sqe->buf_group);
3875         return 0;
3876 }
3877
3878 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3879                                int bgid, unsigned nbufs)
3880 {
3881         unsigned i = 0;
3882
3883         /* shouldn't happen */
3884         if (!nbufs)
3885                 return 0;
3886
3887         /* the head kbuf is the list itself */
3888         while (!list_empty(&buf->list)) {
3889                 struct io_buffer *nxt;
3890
3891                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3892                 list_del(&nxt->list);
3893                 kfree(nxt);
3894                 if (++i == nbufs)
3895                         return i;
3896         }
3897         i++;
3898         kfree(buf);
3899         xa_erase(&ctx->io_buffers, bgid);
3900
3901         return i;
3902 }
3903
3904 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
3905 {
3906         struct io_provide_buf *p = &req->pbuf;
3907         struct io_ring_ctx *ctx = req->ctx;
3908         struct io_buffer *head;
3909         int ret = 0;
3910         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3911
3912         io_ring_submit_lock(ctx, !force_nonblock);
3913
3914         lockdep_assert_held(&ctx->uring_lock);
3915
3916         ret = -ENOENT;
3917         head = xa_load(&ctx->io_buffers, p->bgid);
3918         if (head)
3919                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3920         if (ret < 0)
3921                 req_set_fail_links(req);
3922
3923         /* need to hold the lock to complete IOPOLL requests */
3924         if (ctx->flags & IORING_SETUP_IOPOLL) {
3925                 __io_req_complete(req, issue_flags, ret, 0);
3926                 io_ring_submit_unlock(ctx, !force_nonblock);
3927         } else {
3928                 io_ring_submit_unlock(ctx, !force_nonblock);
3929                 __io_req_complete(req, issue_flags, ret, 0);
3930         }
3931         return 0;
3932 }
3933
3934 static int io_provide_buffers_prep(struct io_kiocb *req,
3935                                    const struct io_uring_sqe *sqe)
3936 {
3937         struct io_provide_buf *p = &req->pbuf;
3938         u64 tmp;
3939
3940         if (sqe->ioprio || sqe->rw_flags)
3941                 return -EINVAL;
3942
3943         tmp = READ_ONCE(sqe->fd);
3944         if (!tmp || tmp > USHRT_MAX)
3945                 return -E2BIG;
3946         p->nbufs = tmp;
3947         p->addr = READ_ONCE(sqe->addr);
3948         p->len = READ_ONCE(sqe->len);
3949
3950         if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3951                 return -EFAULT;
3952
3953         p->bgid = READ_ONCE(sqe->buf_group);
3954         tmp = READ_ONCE(sqe->off);
3955         if (tmp > USHRT_MAX)
3956                 return -E2BIG;
3957         p->bid = tmp;
3958         return 0;
3959 }
3960
3961 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3962 {
3963         struct io_buffer *buf;
3964         u64 addr = pbuf->addr;
3965         int i, bid = pbuf->bid;
3966
3967         for (i = 0; i < pbuf->nbufs; i++) {
3968                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3969                 if (!buf)
3970                         break;
3971
3972                 buf->addr = addr;
3973                 buf->len = pbuf->len;
3974                 buf->bid = bid;
3975                 addr += pbuf->len;
3976                 bid++;
3977                 if (!*head) {
3978                         INIT_LIST_HEAD(&buf->list);
3979                         *head = buf;
3980                 } else {
3981                         list_add_tail(&buf->list, &(*head)->list);
3982                 }
3983         }
3984
3985         return i ? i : -ENOMEM;
3986 }
3987
3988 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
3989 {
3990         struct io_provide_buf *p = &req->pbuf;
3991         struct io_ring_ctx *ctx = req->ctx;
3992         struct io_buffer *head, *list;
3993         int ret = 0;
3994         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3995
3996         io_ring_submit_lock(ctx, !force_nonblock);
3997
3998         lockdep_assert_held(&ctx->uring_lock);
3999
4000         list = head = xa_load(&ctx->io_buffers, p->bgid);
4001
4002         ret = io_add_buffers(p, &head);
4003         if (ret >= 0 && !list) {
4004                 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4005                 if (ret < 0)
4006                         __io_remove_buffers(ctx, head, p->bgid, -1U);
4007         }
4008         if (ret < 0)
4009                 req_set_fail_links(req);
4010
4011         /* need to hold the lock to complete IOPOLL requests */
4012         if (ctx->flags & IORING_SETUP_IOPOLL) {
4013                 __io_req_complete(req, issue_flags, ret, 0);
4014                 io_ring_submit_unlock(ctx, !force_nonblock);
4015         } else {
4016                 io_ring_submit_unlock(ctx, !force_nonblock);
4017                 __io_req_complete(req, issue_flags, ret, 0);
4018         }
4019         return 0;
4020 }
4021
4022 static int io_epoll_ctl_prep(struct io_kiocb *req,
4023                              const struct io_uring_sqe *sqe)
4024 {
4025 #if defined(CONFIG_EPOLL)
4026         if (sqe->ioprio || sqe->buf_index)
4027                 return -EINVAL;
4028         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4029                 return -EINVAL;
4030
4031         req->epoll.epfd = READ_ONCE(sqe->fd);
4032         req->epoll.op = READ_ONCE(sqe->len);
4033         req->epoll.fd = READ_ONCE(sqe->off);
4034
4035         if (ep_op_has_event(req->epoll.op)) {
4036                 struct epoll_event __user *ev;
4037
4038                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4039                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4040                         return -EFAULT;
4041         }
4042
4043         return 0;
4044 #else
4045         return -EOPNOTSUPP;
4046 #endif
4047 }
4048
4049 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4050 {
4051 #if defined(CONFIG_EPOLL)
4052         struct io_epoll *ie = &req->epoll;
4053         int ret;
4054         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4055
4056         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4057         if (force_nonblock && ret == -EAGAIN)
4058                 return -EAGAIN;
4059
4060         if (ret < 0)
4061                 req_set_fail_links(req);
4062         __io_req_complete(req, issue_flags, ret, 0);
4063         return 0;
4064 #else
4065         return -EOPNOTSUPP;
4066 #endif
4067 }
4068
4069 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4070 {
4071 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4072         if (sqe->ioprio || sqe->buf_index || sqe->off)
4073                 return -EINVAL;
4074         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4075                 return -EINVAL;
4076
4077         req->madvise.addr = READ_ONCE(sqe->addr);
4078         req->madvise.len = READ_ONCE(sqe->len);
4079         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4080         return 0;
4081 #else
4082         return -EOPNOTSUPP;
4083 #endif
4084 }
4085
4086 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4087 {
4088 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4089         struct io_madvise *ma = &req->madvise;
4090         int ret;
4091
4092         if (issue_flags & IO_URING_F_NONBLOCK)
4093                 return -EAGAIN;
4094
4095         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4096         if (ret < 0)
4097                 req_set_fail_links(req);
4098         io_req_complete(req, ret);
4099         return 0;
4100 #else
4101         return -EOPNOTSUPP;
4102 #endif
4103 }
4104
4105 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4106 {
4107         if (sqe->ioprio || sqe->buf_index || sqe->addr)
4108                 return -EINVAL;
4109         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4110                 return -EINVAL;
4111
4112         req->fadvise.offset = READ_ONCE(sqe->off);
4113         req->fadvise.len = READ_ONCE(sqe->len);
4114         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4115         return 0;
4116 }
4117
4118 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4119 {
4120         struct io_fadvise *fa = &req->fadvise;
4121         int ret;
4122
4123         if (issue_flags & IO_URING_F_NONBLOCK) {
4124                 switch (fa->advice) {
4125                 case POSIX_FADV_NORMAL:
4126                 case POSIX_FADV_RANDOM:
4127                 case POSIX_FADV_SEQUENTIAL:
4128                         break;
4129                 default:
4130                         return -EAGAIN;
4131                 }
4132         }
4133
4134         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4135         if (ret < 0)
4136                 req_set_fail_links(req);
4137         io_req_complete(req, ret);
4138         return 0;
4139 }
4140
4141 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4142 {
4143         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4144                 return -EINVAL;
4145         if (sqe->ioprio || sqe->buf_index)
4146                 return -EINVAL;
4147         if (req->flags & REQ_F_FIXED_FILE)
4148                 return -EBADF;
4149
4150         req->statx.dfd = READ_ONCE(sqe->fd);
4151         req->statx.mask = READ_ONCE(sqe->len);
4152         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4153         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4154         req->statx.flags = READ_ONCE(sqe->statx_flags);
4155
4156         return 0;
4157 }
4158
4159 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4160 {
4161         struct io_statx *ctx = &req->statx;
4162         int ret;
4163
4164         if (issue_flags & IO_URING_F_NONBLOCK) {
4165                 /* only need file table for an actual valid fd */
4166                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
4167                         req->flags |= REQ_F_NO_FILE_TABLE;
4168                 return -EAGAIN;
4169         }
4170
4171         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4172                        ctx->buffer);
4173
4174         if (ret < 0)
4175                 req_set_fail_links(req);
4176         io_req_complete(req, ret);
4177         return 0;
4178 }
4179
4180 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4181 {
4182         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4183                 return -EINVAL;
4184         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4185             sqe->rw_flags || sqe->buf_index)
4186                 return -EINVAL;
4187         if (req->flags & REQ_F_FIXED_FILE)
4188                 return -EBADF;
4189
4190         req->close.fd = READ_ONCE(sqe->fd);
4191         return 0;
4192 }
4193
4194 static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4195 {
4196         struct files_struct *files = current->files;
4197         struct io_close *close = &req->close;
4198         struct fdtable *fdt;
4199         struct file *file;
4200         int ret;
4201
4202         file = NULL;
4203         ret = -EBADF;
4204         spin_lock(&files->file_lock);
4205         fdt = files_fdtable(files);
4206         if (close->fd >= fdt->max_fds) {
4207                 spin_unlock(&files->file_lock);
4208                 goto err;
4209         }
4210         file = fdt->fd[close->fd];
4211         if (!file) {
4212                 spin_unlock(&files->file_lock);
4213                 goto err;
4214         }
4215
4216         if (file->f_op == &io_uring_fops) {
4217                 spin_unlock(&files->file_lock);
4218                 file = NULL;
4219                 goto err;
4220         }
4221
4222         /* if the file has a flush method, be safe and punt to async */
4223         if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4224                 spin_unlock(&files->file_lock);
4225                 return -EAGAIN;
4226         }
4227
4228         ret = __close_fd_get_file(close->fd, &file);
4229         spin_unlock(&files->file_lock);
4230         if (ret < 0) {
4231                 if (ret == -ENOENT)
4232                         ret = -EBADF;
4233                 goto err;
4234         }
4235
4236         /* No ->flush() or already async, safely close from here */
4237         ret = filp_close(file, current->files);
4238 err:
4239         if (ret < 0)
4240                 req_set_fail_links(req);
4241         if (file)
4242                 fput(file);
4243         __io_req_complete(req, issue_flags, ret, 0);
4244         return 0;
4245 }
4246
4247 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4248 {
4249         struct io_ring_ctx *ctx = req->ctx;
4250
4251         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4252                 return -EINVAL;
4253         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4254                 return -EINVAL;
4255
4256         req->sync.off = READ_ONCE(sqe->off);
4257         req->sync.len = READ_ONCE(sqe->len);
4258         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4259         return 0;
4260 }
4261
4262 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4263 {
4264         int ret;
4265
4266         /* sync_file_range always requires a blocking context */
4267         if (issue_flags & IO_URING_F_NONBLOCK)
4268                 return -EAGAIN;
4269
4270         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4271                                 req->sync.flags);
4272         if (ret < 0)
4273                 req_set_fail_links(req);
4274         io_req_complete(req, ret);
4275         return 0;
4276 }
4277
4278 #if defined(CONFIG_NET)
4279 static int io_setup_async_msg(struct io_kiocb *req,
4280                               struct io_async_msghdr *kmsg)
4281 {
4282         struct io_async_msghdr *async_msg = req->async_data;
4283
4284         if (async_msg)
4285                 return -EAGAIN;
4286         if (io_alloc_async_data(req)) {
4287                 kfree(kmsg->free_iov);
4288                 return -ENOMEM;
4289         }
4290         async_msg = req->async_data;
4291         req->flags |= REQ_F_NEED_CLEANUP;
4292         memcpy(async_msg, kmsg, sizeof(*kmsg));
4293         async_msg->msg.msg_name = &async_msg->addr;
4294         /* if were using fast_iov, set it to the new one */
4295         if (!async_msg->free_iov)
4296                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4297
4298         return -EAGAIN;
4299 }
4300
4301 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4302                                struct io_async_msghdr *iomsg)
4303 {
4304         iomsg->msg.msg_name = &iomsg->addr;
4305         iomsg->free_iov = iomsg->fast_iov;
4306         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4307                                    req->sr_msg.msg_flags, &iomsg->free_iov);
4308 }
4309
4310 static int io_sendmsg_prep_async(struct io_kiocb *req)
4311 {
4312         int ret;
4313
4314         if (!io_op_defs[req->opcode].needs_async_data)
4315                 return 0;
4316         ret = io_sendmsg_copy_hdr(req, req->async_data);
4317         if (!ret)
4318                 req->flags |= REQ_F_NEED_CLEANUP;
4319         return ret;
4320 }
4321
4322 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4323 {
4324         struct io_sr_msg *sr = &req->sr_msg;
4325
4326         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4327                 return -EINVAL;
4328
4329         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4330         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4331         sr->len = READ_ONCE(sqe->len);
4332
4333 #ifdef CONFIG_COMPAT
4334         if (req->ctx->compat)
4335                 sr->msg_flags |= MSG_CMSG_COMPAT;
4336 #endif
4337         return 0;
4338 }
4339
4340 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4341 {
4342         struct io_async_msghdr iomsg, *kmsg;
4343         struct socket *sock;
4344         unsigned flags;
4345         int ret;
4346
4347         sock = sock_from_file(req->file);
4348         if (unlikely(!sock))
4349                 return -ENOTSOCK;
4350
4351         kmsg = req->async_data;
4352         if (!kmsg) {
4353                 ret = io_sendmsg_copy_hdr(req, &iomsg);
4354                 if (ret)
4355                         return ret;
4356                 kmsg = &iomsg;
4357         }
4358
4359         flags = req->sr_msg.msg_flags;
4360         if (flags & MSG_DONTWAIT)
4361                 req->flags |= REQ_F_NOWAIT;
4362         else if (issue_flags & IO_URING_F_NONBLOCK)
4363                 flags |= MSG_DONTWAIT;
4364
4365         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4366         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4367                 return io_setup_async_msg(req, kmsg);
4368         if (ret == -ERESTARTSYS)
4369                 ret = -EINTR;
4370
4371         /* fast path, check for non-NULL to avoid function call */
4372         if (kmsg->free_iov)
4373                 kfree(kmsg->free_iov);
4374         req->flags &= ~REQ_F_NEED_CLEANUP;
4375         if (ret < 0)
4376                 req_set_fail_links(req);
4377         __io_req_complete(req, issue_flags, ret, 0);
4378         return 0;
4379 }
4380
4381 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4382 {
4383         struct io_sr_msg *sr = &req->sr_msg;
4384         struct msghdr msg;
4385         struct iovec iov;
4386         struct socket *sock;
4387         unsigned flags;
4388         int ret;
4389
4390         sock = sock_from_file(req->file);
4391         if (unlikely(!sock))
4392                 return -ENOTSOCK;
4393
4394         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4395         if (unlikely(ret))
4396                 return ret;
4397
4398         msg.msg_name = NULL;
4399         msg.msg_control = NULL;
4400         msg.msg_controllen = 0;
4401         msg.msg_namelen = 0;
4402
4403         flags = req->sr_msg.msg_flags;
4404         if (flags & MSG_DONTWAIT)
4405                 req->flags |= REQ_F_NOWAIT;
4406         else if (issue_flags & IO_URING_F_NONBLOCK)
4407                 flags |= MSG_DONTWAIT;
4408
4409         msg.msg_flags = flags;
4410         ret = sock_sendmsg(sock, &msg);
4411         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4412                 return -EAGAIN;
4413         if (ret == -ERESTARTSYS)
4414                 ret = -EINTR;
4415
4416         if (ret < 0)
4417                 req_set_fail_links(req);
4418         __io_req_complete(req, issue_flags, ret, 0);
4419         return 0;
4420 }
4421
4422 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4423                                  struct io_async_msghdr *iomsg)
4424 {
4425         struct io_sr_msg *sr = &req->sr_msg;
4426         struct iovec __user *uiov;
4427         size_t iov_len;
4428         int ret;
4429
4430         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4431                                         &iomsg->uaddr, &uiov, &iov_len);
4432         if (ret)
4433                 return ret;
4434
4435         if (req->flags & REQ_F_BUFFER_SELECT) {
4436                 if (iov_len > 1)
4437                         return -EINVAL;
4438                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4439                         return -EFAULT;
4440                 sr->len = iomsg->fast_iov[0].iov_len;
4441                 iomsg->free_iov = NULL;
4442         } else {
4443                 iomsg->free_iov = iomsg->fast_iov;
4444                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4445                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
4446                                      false);
4447                 if (ret > 0)
4448                         ret = 0;
4449         }
4450
4451         return ret;
4452 }
4453
4454 #ifdef CONFIG_COMPAT
4455 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4456                                         struct io_async_msghdr *iomsg)
4457 {
4458         struct compat_msghdr __user *msg_compat;
4459         struct io_sr_msg *sr = &req->sr_msg;
4460         struct compat_iovec __user *uiov;
4461         compat_uptr_t ptr;
4462         compat_size_t len;
4463         int ret;
4464
4465         msg_compat = (struct compat_msghdr __user *) sr->umsg;
4466         ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
4467                                         &ptr, &len);
4468         if (ret)
4469                 return ret;
4470
4471         uiov = compat_ptr(ptr);
4472         if (req->flags & REQ_F_BUFFER_SELECT) {
4473                 compat_ssize_t clen;
4474
4475                 if (len > 1)
4476                         return -EINVAL;
4477                 if (!access_ok(uiov, sizeof(*uiov)))
4478                         return -EFAULT;
4479                 if (__get_user(clen, &uiov->iov_len))
4480                         return -EFAULT;
4481                 if (clen < 0)
4482                         return -EINVAL;
4483                 sr->len = clen;
4484                 iomsg->free_iov = NULL;
4485         } else {
4486                 iomsg->free_iov = iomsg->fast_iov;
4487                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4488                                    UIO_FASTIOV, &iomsg->free_iov,
4489                                    &iomsg->msg.msg_iter, true);
4490                 if (ret < 0)
4491                         return ret;
4492         }
4493
4494         return 0;
4495 }
4496 #endif
4497
4498 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4499                                struct io_async_msghdr *iomsg)
4500 {
4501         iomsg->msg.msg_name = &iomsg->addr;
4502
4503 #ifdef CONFIG_COMPAT
4504         if (req->ctx->compat)
4505                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
4506 #endif
4507
4508         return __io_recvmsg_copy_hdr(req, iomsg);
4509 }
4510
4511 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4512                                                bool needs_lock)
4513 {
4514         struct io_sr_msg *sr = &req->sr_msg;
4515         struct io_buffer *kbuf;
4516
4517         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4518         if (IS_ERR(kbuf))
4519                 return kbuf;
4520
4521         sr->kbuf = kbuf;
4522         req->flags |= REQ_F_BUFFER_SELECTED;
4523         return kbuf;
4524 }
4525
4526 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4527 {
4528         return io_put_kbuf(req, req->sr_msg.kbuf);
4529 }
4530
4531 static int io_recvmsg_prep_async(struct io_kiocb *req)
4532 {
4533         int ret;
4534
4535         if (!io_op_defs[req->opcode].needs_async_data)
4536                 return 0;
4537         ret = io_recvmsg_copy_hdr(req, req->async_data);
4538         if (!ret)
4539                 req->flags |= REQ_F_NEED_CLEANUP;
4540         return ret;
4541 }
4542
4543 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4544 {
4545         struct io_sr_msg *sr = &req->sr_msg;
4546
4547         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4548                 return -EINVAL;
4549
4550         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4551         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4552         sr->len = READ_ONCE(sqe->len);
4553         sr->bgid = READ_ONCE(sqe->buf_group);
4554
4555 #ifdef CONFIG_COMPAT
4556         if (req->ctx->compat)
4557                 sr->msg_flags |= MSG_CMSG_COMPAT;
4558 #endif
4559         return 0;
4560 }
4561
4562 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4563 {
4564         struct io_async_msghdr iomsg, *kmsg;
4565         struct socket *sock;
4566         struct io_buffer *kbuf;
4567         unsigned flags;
4568         int ret, cflags = 0;
4569         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4570
4571         sock = sock_from_file(req->file);
4572         if (unlikely(!sock))
4573                 return -ENOTSOCK;
4574
4575         kmsg = req->async_data;
4576         if (!kmsg) {
4577                 ret = io_recvmsg_copy_hdr(req, &iomsg);
4578                 if (ret)
4579                         return ret;
4580                 kmsg = &iomsg;
4581         }
4582
4583         if (req->flags & REQ_F_BUFFER_SELECT) {
4584                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4585                 if (IS_ERR(kbuf))
4586                         return PTR_ERR(kbuf);
4587                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4588                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4589                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4590                                 1, req->sr_msg.len);
4591         }
4592
4593         flags = req->sr_msg.msg_flags;
4594         if (flags & MSG_DONTWAIT)
4595                 req->flags |= REQ_F_NOWAIT;
4596         else if (force_nonblock)
4597                 flags |= MSG_DONTWAIT;
4598
4599         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4600                                         kmsg->uaddr, flags);
4601         if (force_nonblock && ret == -EAGAIN)
4602                 return io_setup_async_msg(req, kmsg);
4603         if (ret == -ERESTARTSYS)
4604                 ret = -EINTR;
4605
4606         if (req->flags & REQ_F_BUFFER_SELECTED)
4607                 cflags = io_put_recv_kbuf(req);
4608         /* fast path, check for non-NULL to avoid function call */
4609         if (kmsg->free_iov)
4610                 kfree(kmsg->free_iov);
4611         req->flags &= ~REQ_F_NEED_CLEANUP;
4612         if (ret < 0)
4613                 req_set_fail_links(req);
4614         __io_req_complete(req, issue_flags, ret, cflags);
4615         return 0;
4616 }
4617
4618 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
4619 {
4620         struct io_buffer *kbuf;
4621         struct io_sr_msg *sr = &req->sr_msg;
4622         struct msghdr msg;
4623         void __user *buf = sr->buf;
4624         struct socket *sock;
4625         struct iovec iov;
4626         unsigned flags;
4627         int ret, cflags = 0;
4628         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4629
4630         sock = sock_from_file(req->file);
4631         if (unlikely(!sock))
4632                 return -ENOTSOCK;
4633
4634         if (req->flags & REQ_F_BUFFER_SELECT) {
4635                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4636                 if (IS_ERR(kbuf))
4637                         return PTR_ERR(kbuf);
4638                 buf = u64_to_user_ptr(kbuf->addr);
4639         }
4640
4641         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4642         if (unlikely(ret))
4643                 goto out_free;
4644
4645         msg.msg_name = NULL;
4646         msg.msg_control = NULL;
4647         msg.msg_controllen = 0;
4648         msg.msg_namelen = 0;
4649         msg.msg_iocb = NULL;
4650         msg.msg_flags = 0;
4651
4652         flags = req->sr_msg.msg_flags;
4653         if (flags & MSG_DONTWAIT)
4654                 req->flags |= REQ_F_NOWAIT;
4655         else if (force_nonblock)
4656                 flags |= MSG_DONTWAIT;
4657
4658         ret = sock_recvmsg(sock, &msg, flags);
4659         if (force_nonblock && ret == -EAGAIN)
4660                 return -EAGAIN;
4661         if (ret == -ERESTARTSYS)
4662                 ret = -EINTR;
4663 out_free:
4664         if (req->flags & REQ_F_BUFFER_SELECTED)
4665                 cflags = io_put_recv_kbuf(req);
4666         if (ret < 0)
4667                 req_set_fail_links(req);
4668         __io_req_complete(req, issue_flags, ret, cflags);
4669         return 0;
4670 }
4671
4672 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4673 {
4674         struct io_accept *accept = &req->accept;
4675
4676         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4677                 return -EINVAL;
4678         if (sqe->ioprio || sqe->len || sqe->buf_index)
4679                 return -EINVAL;
4680
4681         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4682         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4683         accept->flags = READ_ONCE(sqe->accept_flags);
4684         accept->nofile = rlimit(RLIMIT_NOFILE);
4685         return 0;
4686 }
4687
4688 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
4689 {
4690         struct io_accept *accept = &req->accept;
4691         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4692         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4693         int ret;
4694
4695         if (req->file->f_flags & O_NONBLOCK)
4696                 req->flags |= REQ_F_NOWAIT;
4697
4698         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4699                                         accept->addr_len, accept->flags,
4700                                         accept->nofile);
4701         if (ret == -EAGAIN && force_nonblock)
4702                 return -EAGAIN;
4703         if (ret < 0) {
4704                 if (ret == -ERESTARTSYS)
4705                         ret = -EINTR;
4706                 req_set_fail_links(req);
4707         }
4708         __io_req_complete(req, issue_flags, ret, 0);
4709         return 0;
4710 }
4711
4712 static int io_connect_prep_async(struct io_kiocb *req)
4713 {
4714         struct io_async_connect *io = req->async_data;
4715         struct io_connect *conn = &req->connect;
4716
4717         return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4718 }
4719
4720 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4721 {
4722         struct io_connect *conn = &req->connect;
4723
4724         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4725                 return -EINVAL;
4726         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4727                 return -EINVAL;
4728
4729         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4730         conn->addr_len =  READ_ONCE(sqe->addr2);
4731         return 0;
4732 }
4733
4734 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
4735 {
4736         struct io_async_connect __io, *io;
4737         unsigned file_flags;
4738         int ret;
4739         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4740
4741         if (req->async_data) {
4742                 io = req->async_data;
4743         } else {
4744                 ret = move_addr_to_kernel(req->connect.addr,
4745                                                 req->connect.addr_len,
4746                                                 &__io.address);
4747                 if (ret)
4748                         goto out;
4749                 io = &__io;
4750         }
4751
4752         file_flags = force_nonblock ? O_NONBLOCK : 0;
4753
4754         ret = __sys_connect_file(req->file, &io->address,
4755                                         req->connect.addr_len, file_flags);
4756         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4757                 if (req->async_data)
4758                         return -EAGAIN;
4759                 if (io_alloc_async_data(req)) {
4760                         ret = -ENOMEM;
4761                         goto out;
4762                 }
4763                 io = req->async_data;
4764                 memcpy(req->async_data, &__io, sizeof(__io));
4765                 return -EAGAIN;
4766         }
4767         if (ret == -ERESTARTSYS)
4768                 ret = -EINTR;
4769 out:
4770         if (ret < 0)
4771                 req_set_fail_links(req);
4772         __io_req_complete(req, issue_flags, ret, 0);
4773         return 0;
4774 }
4775 #else /* !CONFIG_NET */
/*
 * Networking opcode stubs for builds without CONFIG_NET: every generated
 * handler simply fails with -EOPNOTSUPP.
 *
 * The macros expand to complete function definitions, so they are invoked
 * without a trailing semicolon (a lone ';' at file scope is an empty
 * declaration, which ISO C does not allow).
 */

/* io_<op>(): issue handler stub */
#define IO_NETOP_FN(op)							\
static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
{									\
	return -EOPNOTSUPP;						\
}

/* IO_NETOP_FN() plus io_<op>_prep(): SQE prep stub */
#define IO_NETOP_PREP(op)						\
IO_NETOP_FN(op)								\
static int io_##op##_prep(struct io_kiocb *req,			\
			  const struct io_uring_sqe *sqe)		\
{									\
	return -EOPNOTSUPP;						\
}

/* IO_NETOP_PREP() plus io_<op>_prep_async(): async prep stub */
#define IO_NETOP_PREP_ASYNC(op)						\
IO_NETOP_PREP(op)							\
static int io_##op##_prep_async(struct io_kiocb *req)			\
{									\
	return -EOPNOTSUPP;						\
}

IO_NETOP_PREP_ASYNC(sendmsg)
IO_NETOP_PREP_ASYNC(recvmsg)
IO_NETOP_PREP_ASYNC(connect)
IO_NETOP_PREP(accept)
IO_NETOP_FN(send)
IO_NETOP_FN(recv)
4802 #endif /* CONFIG_NET */
4803
4804 struct io_poll_table {
4805         struct poll_table_struct pt;
4806         struct io_kiocb *req;
4807         int error;
4808 };
4809
4810 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4811                            __poll_t mask, task_work_func_t func)
4812 {
4813         int ret;
4814
4815         /* for instances that support it check for an event match first: */
4816         if (mask && !(mask & poll->events))
4817                 return 0;
4818
4819         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4820
4821         list_del_init(&poll->wait.entry);
4822
4823         req->result = mask;
4824         req->task_work.func = func;
4825         percpu_ref_get(&req->ctx->refs);
4826
4827         /*
4828          * If this fails, then the task is exiting. When a task exits, the
4829          * work gets canceled, so just cancel this request as well instead
4830          * of executing it. We can't safely execute it anyway, as we may not
4831          * have the needed state needed for it anyway.
4832          */
4833         ret = io_req_task_work_add(req);
4834         if (unlikely(ret)) {
4835                 WRITE_ONCE(poll->canceled, true);
4836                 io_req_task_work_add_fallback(req, func);
4837         }
4838         return 1;
4839 }
4840
4841 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4842         __acquires(&req->ctx->completion_lock)
4843 {
4844         struct io_ring_ctx *ctx = req->ctx;
4845
4846         if (!req->result && !READ_ONCE(poll->canceled)) {
4847                 struct poll_table_struct pt = { ._key = poll->events };
4848
4849                 req->result = vfs_poll(req->file, &pt) & poll->events;
4850         }
4851
4852         spin_lock_irq(&ctx->completion_lock);
4853         if (!req->result && !READ_ONCE(poll->canceled)) {
4854                 add_wait_queue(poll->head, &poll->wait);
4855                 return true;
4856         }
4857
4858         return false;
4859 }
4860
4861 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
4862 {
4863         /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
4864         if (req->opcode == IORING_OP_POLL_ADD)
4865                 return req->async_data;
4866         return req->apoll->double_poll;
4867 }
4868
4869 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4870 {
4871         if (req->opcode == IORING_OP_POLL_ADD)
4872                 return &req->poll;
4873         return &req->apoll->poll;
4874 }
4875
4876 static void io_poll_remove_double(struct io_kiocb *req)
4877 {
4878         struct io_poll_iocb *poll = io_poll_get_double(req);
4879
4880         lockdep_assert_held(&req->ctx->completion_lock);
4881
4882         if (poll && poll->head) {
4883                 struct wait_queue_head *head = poll->head;
4884
4885                 spin_lock(&head->lock);
4886                 list_del_init(&poll->wait.entry);
4887                 if (poll->wait.private)
4888                         refcount_dec(&req->refs);
4889                 poll->head = NULL;
4890                 spin_unlock(&head->lock);
4891         }
4892 }
4893
4894 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4895 {
4896         struct io_ring_ctx *ctx = req->ctx;
4897
4898         io_poll_remove_double(req);
4899         req->poll.done = true;
4900         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4901         io_commit_cqring(ctx);
4902 }
4903
4904 static void io_poll_task_func(struct callback_head *cb)
4905 {
4906         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4907         struct io_ring_ctx *ctx = req->ctx;
4908         struct io_kiocb *nxt;
4909
4910         if (io_poll_rewait(req, &req->poll)) {
4911                 spin_unlock_irq(&ctx->completion_lock);
4912         } else {
4913                 hash_del(&req->hash_node);
4914                 io_poll_complete(req, req->result, 0);
4915                 spin_unlock_irq(&ctx->completion_lock);
4916
4917                 nxt = io_put_req_find_next(req);
4918                 io_cqring_ev_posted(ctx);
4919                 if (nxt)
4920                         __io_req_task_submit(nxt);
4921         }
4922
4923         percpu_ref_put(&ctx->refs);
4924 }
4925
4926 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4927                                int sync, void *key)
4928 {
4929         struct io_kiocb *req = wait->private;
4930         struct io_poll_iocb *poll = io_poll_get_single(req);
4931         __poll_t mask = key_to_poll(key);
4932
4933         /* for instances that support it check for an event match first: */
4934         if (mask && !(mask & poll->events))
4935                 return 0;
4936
4937         list_del_init(&wait->entry);
4938
4939         if (poll && poll->head) {
4940                 bool done;
4941
4942                 spin_lock(&poll->head->lock);
4943                 done = list_empty(&poll->wait.entry);
4944                 if (!done)
4945                         list_del_init(&poll->wait.entry);
4946                 /* make sure double remove sees this as being gone */
4947                 wait->private = NULL;
4948                 spin_unlock(&poll->head->lock);
4949                 if (!done) {
4950                         /* use wait func handler, so it matches the rq type */
4951                         poll->wait.func(&poll->wait, mode, sync, key);
4952                 }
4953         }
4954         refcount_dec(&req->refs);
4955         return 1;
4956 }
4957
4958 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4959                               wait_queue_func_t wake_func)
4960 {
4961         poll->head = NULL;
4962         poll->done = false;
4963         poll->canceled = false;
4964         poll->events = events;
4965         INIT_LIST_HEAD(&poll->wait.entry);
4966         init_waitqueue_func_entry(&poll->wait, wake_func);
4967 }
4968
4969 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4970                             struct wait_queue_head *head,
4971                             struct io_poll_iocb **poll_ptr)
4972 {
4973         struct io_kiocb *req = pt->req;
4974
4975         /*
4976          * If poll->head is already set, it's because the file being polled
4977          * uses multiple waitqueues for poll handling (eg one for read, one
4978          * for write). Setup a separate io_poll_iocb if this happens.
4979          */
4980         if (unlikely(poll->head)) {
4981                 struct io_poll_iocb *poll_one = poll;
4982
4983                 /* already have a 2nd entry, fail a third attempt */
4984                 if (*poll_ptr) {
4985                         pt->error = -EINVAL;
4986                         return;
4987                 }
4988                 /* double add on the same waitqueue head, ignore */
4989                 if (poll->head == head)
4990                         return;
4991                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4992                 if (!poll) {
4993                         pt->error = -ENOMEM;
4994                         return;
4995                 }
4996                 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
4997                 refcount_inc(&req->refs);
4998                 poll->wait.private = req;
4999                 *poll_ptr = poll;
5000         }
5001
5002         pt->error = 0;
5003         poll->head = head;
5004
5005         if (poll->events & EPOLLEXCLUSIVE)
5006                 add_wait_queue_exclusive(head, &poll->wait);
5007         else
5008                 add_wait_queue(head, &poll->wait);
5009 }
5010
5011 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5012                                struct poll_table_struct *p)
5013 {
5014         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5015         struct async_poll *apoll = pt->req->apoll;
5016
5017         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5018 }
5019
5020 static void io_async_task_func(struct callback_head *cb)
5021 {
5022         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
5023         struct async_poll *apoll = req->apoll;
5024         struct io_ring_ctx *ctx = req->ctx;
5025
5026         trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
5027
5028         if (io_poll_rewait(req, &apoll->poll)) {
5029                 spin_unlock_irq(&ctx->completion_lock);
5030                 percpu_ref_put(&ctx->refs);
5031                 return;
5032         }
5033
5034         /* If req is still hashed, it cannot have been canceled. Don't check. */
5035         if (hash_hashed(&req->hash_node))
5036                 hash_del(&req->hash_node);
5037
5038         io_poll_remove_double(req);
5039         spin_unlock_irq(&ctx->completion_lock);
5040
5041         if (!READ_ONCE(apoll->poll.canceled))
5042                 __io_req_task_submit(req);
5043         else
5044                 __io_req_task_cancel(req, -ECANCELED);
5045
5046         percpu_ref_put(&ctx->refs);
5047         kfree(apoll->double_poll);
5048         kfree(apoll);
5049 }
5050
5051 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5052                         void *key)
5053 {
5054         struct io_kiocb *req = wait->private;
5055         struct io_poll_iocb *poll = &req->apoll->poll;
5056
5057         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5058                                         key_to_poll(key));
5059
5060         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5061 }
5062
5063 static void io_poll_req_insert(struct io_kiocb *req)
5064 {
5065         struct io_ring_ctx *ctx = req->ctx;
5066         struct hlist_head *list;
5067
5068         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5069         hlist_add_head(&req->hash_node, list);
5070 }
5071
/*
 * Initialise @poll for @req and register it with the polled file by calling
 * vfs_poll() with @ipt's poll table.
 *
 * @req:       request being armed; req->file is the file that gets polled
 * @poll:      poll entry, initialised here with @mask and @wake_func
 * @ipt:       poll table wrapper; the callers in this file set ipt->pt._qproc
 *             before calling. ipt->error starts as -EINVAL and may be
 *             cleared below.
 * @mask:      requested event mask
 * @wake_func: wait queue callback installed on @poll
 *
 * Returns the events reported by vfs_poll() that are also set in
 * poll->events, or 0 when the wait entry is found to be no longer queued.
 * Always returns with ctx->completion_lock held (taken with spin_lock_irq());
 * the caller is responsible for unlocking.
 */
5072 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5073                                       struct io_poll_iocb *poll,
5074                                       struct io_poll_table *ipt, __poll_t mask,
5075                                       wait_queue_func_t wake_func)
5076         __acquires(&ctx->completion_lock)
5077 {
5078         struct io_ring_ctx *ctx = req->ctx;
5079         bool cancel = false;
5080
5081         INIT_HLIST_NODE(&req->hash_node);
5082         io_init_poll_iocb(poll, mask, wake_func);
5083         poll->file = req->file;
5084         poll->wait.private = req;
5085
5086         ipt->pt._key = mask;
5087         ipt->req = req;
         /* NOTE(review): presumably reset to 0 by the queue proc once a wait entry is added - confirm */
5088         ipt->error = -EINVAL;
5089
5090         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5091
5092         spin_lock_irq(&ctx->completion_lock);
5093         if (likely(poll->head)) {
5094                 spin_lock(&poll->head->lock);
                 /*
                  * The wait entry is already off the queue: report no events,
                  * clear the error, and remember a pending error as a cancel.
                  */
5095                 if (unlikely(list_empty(&poll->wait.entry))) {
5096                         if (ipt->error)
5097                                 cancel = true;
5098                         ipt->error = 0;
5099                         mask = 0;
5100                 }
                 /*
                  * Events are already available or an error remains: take the
                  * entry off the wait queue. Otherwise mark the poll canceled,
                  * or hash the request so it can be found for cancellation.
                  */
5101                 if (mask || ipt->error)
5102                         list_del_init(&poll->wait.entry);
5103                 else if (cancel)
5104                         WRITE_ONCE(poll->canceled, true);
5105                 else if (!poll->done) /* actually waiting for an event */
5106                         io_poll_req_insert(req);
5107                 spin_unlock(&poll->head->lock);
5108         }
5109
         /* completion_lock is intentionally still held here, see __acquires above. */
5110         return mask;
5111 }
5112
5113 static bool io_arm_poll_handler(struct io_kiocb *req)
5114 {
5115         const struct io_op_def *def = &io_op_defs[req->opcode];
5116         struct io_ring_ctx *ctx = req->ctx;
5117         struct async_poll *apoll;
5118         struct io_poll_table ipt;
5119         __poll_t mask, ret;
5120         int rw;
5121
5122         if (!req->file || !file_can_poll(req->file))
5123                 return false;
5124         if (req->flags & REQ_F_POLLED)
5125                 return false;
5126         if (def->pollin)
5127                 rw = READ;
5128         else if (def->pollout)
5129                 rw = WRITE;
5130         else
5131                 return false;
5132         /* if we can't nonblock try, then no point in arming a poll handler */
5133         if (!io_file_supports_async(req->file, rw))
5134                 return false;
5135
5136         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5137         if (unlikely(!apoll))
5138                 return false;
5139         apoll->double_poll = NULL;
5140
5141         req->flags |= REQ_F_POLLED;
5142         req->apoll = apoll;
5143
5144         mask = 0;
5145         if (def->pollin)
5146                 mask |= POLLIN | POLLRDNORM;
5147         if (def->pollout)
5148                 mask |= POLLOUT | POLLWRNORM;
5149
5150         /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5151         if ((req->opcode == IORING_OP_RECVMSG) &&
5152             (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5153                 mask &= ~POLLIN;
5154
5155         mask |= POLLERR | POLLPRI;
5156
5157         ipt.pt._qproc = io_async_queue_proc;
5158
5159         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5160                                         io_async_wake);
5161         if (ret || ipt.error) {
5162                 io_poll_remove_double(req);
5163                 spin_unlock_irq(&ctx->completion_lock);
5164                 kfree(apoll->double_poll);
5165                 kfree(apoll);
5166                 return false;
5167         }
5168         spin_unlock_irq(&ctx->completion_lock);
5169         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5170                                         apoll->poll.events);
5171         return true;
5172 }
5173
5174 static bool __io_poll_remove_one(struct io_kiocb *req,
5175                                  struct io_poll_iocb *poll)
5176 {
5177         bool do_complete = false;
5178
5179         spin_lock(&poll->head->lock);
5180         WRITE_ONCE(poll->canceled, true);
5181         if (!list_empty(&poll->wait.entry)) {
5182                 list_del_init(&poll->wait.entry);
5183                 do_complete = true;
5184         }
5185         spin_unlock(&poll->head->lock);
5186         hash_del(&req->hash_node);
5187         return do_complete;
5188 }
5189
/*
 * Cancel the poll attached to @req.
 *
 * Removes any double-poll entry, then cancels the primary poll entry:
 * req->poll for IORING_OP_POLL_ADD, req->apoll->poll for every other opcode.
 * If the wait entry was still queued, a -ECANCELED completion is filled and
 * committed, linked requests are marked failed, and one request reference is
 * dropped through io_put_req_deferred().
 *
 * The callers visible in this file invoke this with ctx->completion_lock
 * held.
 *
 * Returns true if a completion was posted for @req.
 */
5190 static bool io_poll_remove_one(struct io_kiocb *req)
5191 {
5192         bool do_complete;
5193
5194         io_poll_remove_double(req);
5195
5196         if (req->opcode == IORING_OP_POLL_ADD) {
5197                 do_complete = __io_poll_remove_one(req, &req->poll);
5198         } else {
5199                 struct async_poll *apoll = req->apoll;
5200
5201                 /* non-poll requests have submit ref still */
5202                 do_complete = __io_poll_remove_one(req, &apoll->poll);
5203                 if (do_complete) {
                         /* drop that extra ref, then free the async poll data */
5204                         io_put_req(req);
5205                         kfree(apoll->double_poll);
5206                         kfree(apoll);
5207                 }
5208         }
5209
5210         if (do_complete) {
5211                 io_cqring_fill_event(req, -ECANCELED);
5212                 io_commit_cqring(req->ctx);
5213                 req_set_fail_links(req);
5214                 io_put_req_deferred(req, 1);
5215         }
5216
5217         return do_complete;
5218 }
5219
5220 /*
5221  * Returns true if we found and killed one or more poll requests
5222  */
5223 static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5224                                struct files_struct *files)
5225 {
5226         struct hlist_node *tmp;
5227         struct io_kiocb *req;
5228         int posted = 0, i;
5229
5230         spin_lock_irq(&ctx->completion_lock);
5231         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5232                 struct hlist_head *list;
5233
5234                 list = &ctx->cancel_hash[i];
5235                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5236                         if (io_match_task(req, tsk, files))
5237                                 posted += io_poll_remove_one(req);
5238                 }
5239         }
5240         spin_unlock_irq(&ctx->completion_lock);
5241
5242         if (posted)
5243                 io_cqring_ev_posted(ctx);
5244
5245         return posted != 0;
5246 }
5247
5248 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5249 {
5250         struct hlist_head *list;
5251         struct io_kiocb *req;
5252
5253         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5254         hlist_for_each_entry(req, list, hash_node) {
5255                 if (sqe_addr != req->user_data)
5256                         continue;
5257                 if (io_poll_remove_one(req))
5258                         return 0;
5259                 return -EALREADY;
5260         }
5261
5262         return -ENOENT;
5263 }
5264
5265 static int io_poll_remove_prep(struct io_kiocb *req,
5266                                const struct io_uring_sqe *sqe)
5267 {
5268         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5269                 return -EINVAL;
5270         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5271             sqe->poll_events)
5272                 return -EINVAL;
5273
5274         req->poll_remove.addr = READ_ONCE(sqe->addr);
5275         return 0;
5276 }
5277
5278 /*
5279  * Find a running poll command that matches one specified in sqe->addr,
5280  * and remove it if found.
5281  */
5282 static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
5283 {
5284         struct io_ring_ctx *ctx = req->ctx;
5285         int ret;
5286
5287         spin_lock_irq(&ctx->completion_lock);
5288         ret = io_poll_cancel(ctx, req->poll_remove.addr);
5289         spin_unlock_irq(&ctx->completion_lock);
5290
5291         if (ret < 0)
5292                 req_set_fail_links(req);
5293         io_req_complete(req, ret);
5294         return 0;
5295 }
5296
5297 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5298                         void *key)
5299 {
5300         struct io_kiocb *req = wait->private;
5301         struct io_poll_iocb *poll = &req->poll;
5302
5303         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5304 }
5305
5306 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5307                                struct poll_table_struct *p)
5308 {
5309         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5310
5311         __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5312 }
5313
5314 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5315 {
5316         struct io_poll_iocb *poll = &req->poll;
5317         u32 events;
5318
5319         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5320                 return -EINVAL;
5321         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5322                 return -EINVAL;
5323
5324         events = READ_ONCE(sqe->poll32_events);
5325 #ifdef __BIG_ENDIAN
5326         events = swahw32(events);
5327 #endif
5328         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5329                        (events & EPOLLEXCLUSIVE);
5330         return 0;
5331 }
5332
5333 static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5334 {
5335         struct io_poll_iocb *poll = &req->poll;
5336         struct io_ring_ctx *ctx = req->ctx;
5337         struct io_poll_table ipt;
5338         __poll_t mask;
5339
5340         ipt.pt._qproc = io_poll_queue_proc;
5341
5342         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5343                                         io_poll_wake);
5344
5345         if (mask) { /* no async, we'd stolen it */
5346                 ipt.error = 0;
5347                 io_poll_complete(req, mask, 0);
5348         }
5349         spin_unlock_irq(&ctx->completion_lock);
5350
5351         if (mask) {
5352                 io_cqring_ev_posted(ctx);
5353                 io_put_req(req);
5354         }
5355         return ipt.error;
5356 }
5357
5358 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5359 {
5360         struct io_timeout_data *data = container_of(timer,
5361                                                 struct io_timeout_data, timer);
5362         struct io_kiocb *req = data->req;
5363         struct io_ring_ctx *ctx = req->ctx;
5364         unsigned long flags;
5365
5366         spin_lock_irqsave(&ctx->completion_lock, flags);
5367         list_del_init(&req->timeout.list);
5368         atomic_set(&req->ctx->cq_timeouts,
5369                 atomic_read(&req->ctx->cq_timeouts) + 1);
5370
5371         io_cqring_fill_event(req, -ETIME);
5372         io_commit_cqring(ctx);
5373         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5374
5375         io_cqring_ev_posted(ctx);
5376         req_set_fail_links(req);
5377         io_put_req(req);
5378         return HRTIMER_NORESTART;
5379 }
5380
5381 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5382                                            __u64 user_data)
5383 {
5384         struct io_timeout_data *io;
5385         struct io_kiocb *req;
5386         int ret = -ENOENT;
5387
5388         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5389                 if (user_data == req->user_data) {
5390                         ret = 0;
5391                         break;
5392                 }
5393         }
5394
5395         if (ret == -ENOENT)
5396                 return ERR_PTR(ret);
5397
5398         io = req->async_data;
5399         ret = hrtimer_try_to_cancel(&io->timer);
5400         if (ret == -1)
5401                 return ERR_PTR(-EALREADY);
5402         list_del_init(&req->timeout.list);
5403         return req;
5404 }
5405
5406 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5407 {
5408         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5409
5410         if (IS_ERR(req))
5411                 return PTR_ERR(req);
5412
5413         req_set_fail_links(req);
5414         io_cqring_fill_event(req, -ECANCELED);
5415         io_put_req_deferred(req, 1);
5416         return 0;
5417 }
5418
5419 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5420                              struct timespec64 *ts, enum hrtimer_mode mode)
5421 {
5422         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5423         struct io_timeout_data *data;
5424
5425         if (IS_ERR(req))
5426                 return PTR_ERR(req);
5427
5428         req->timeout.off = 0; /* noseq */
5429         data = req->async_data;
5430         list_add_tail(&req->timeout.list, &ctx->timeout_list);
5431         hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5432         data->timer.function = io_timeout_fn;
5433         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5434         return 0;
5435 }
5436
5437 static int io_timeout_remove_prep(struct io_kiocb *req,
5438                                   const struct io_uring_sqe *sqe)
5439 {
5440         struct io_timeout_rem *tr = &req->timeout_rem;
5441
5442         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5443                 return -EINVAL;
5444         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5445                 return -EINVAL;
5446         if (sqe->ioprio || sqe->buf_index || sqe->len)
5447                 return -EINVAL;
5448
5449         tr->addr = READ_ONCE(sqe->addr);
5450         tr->flags = READ_ONCE(sqe->timeout_flags);
5451         if (tr->flags & IORING_TIMEOUT_UPDATE) {
5452                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5453                         return -EINVAL;
5454                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5455                         return -EFAULT;
5456         } else if (tr->flags) {
5457                 /* timeout removal doesn't support flags */
5458                 return -EINVAL;
5459         }
5460
5461         return 0;
5462 }
5463
5464 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5465 {
5466         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5467                                             : HRTIMER_MODE_REL;
5468 }
5469
5470 /*
5471  * Remove or update an existing timeout command
5472  */
5473 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
5474 {
5475         struct io_timeout_rem *tr = &req->timeout_rem;
5476         struct io_ring_ctx *ctx = req->ctx;
5477         int ret;
5478
5479         spin_lock_irq(&ctx->completion_lock);
5480         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
5481                 ret = io_timeout_cancel(ctx, tr->addr);
5482         else
5483                 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5484                                         io_translate_timeout_mode(tr->flags));
5485
5486         io_cqring_fill_event(req, ret);
5487         io_commit_cqring(ctx);
5488         spin_unlock_irq(&ctx->completion_lock);
5489         io_cqring_ev_posted(ctx);
5490         if (ret < 0)
5491                 req_set_fail_links(req);
5492         io_put_req(req);
5493         return 0;
5494 }
5495
5496 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5497                            bool is_timeout_link)
5498 {
5499         struct io_timeout_data *data;
5500         unsigned flags;
5501         u32 off = READ_ONCE(sqe->off);
5502
5503         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5504                 return -EINVAL;
5505         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5506                 return -EINVAL;
5507         if (off && is_timeout_link)
5508                 return -EINVAL;
5509         flags = READ_ONCE(sqe->timeout_flags);
5510         if (flags & ~IORING_TIMEOUT_ABS)
5511                 return -EINVAL;
5512
5513         req->timeout.off = off;
5514
5515         if (!req->async_data && io_alloc_async_data(req))
5516                 return -ENOMEM;
5517
5518         data = req->async_data;
5519         data->req = req;
5520
5521         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5522                 return -EFAULT;
5523
5524         data->mode = io_translate_timeout_mode(flags);
5525         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5526         io_req_track_inflight(req);
5527         return 0;
5528 }
5529
/*
 * Issue handler for timeout requests: link @req into ctx->timeout_list and
 * start its hrtimer, all under ->completion_lock.
 *
 * Non-sequenced timeouts are appended at the tail of the list. Sequenced
 * timeouts get target_seq computed from the current CQ tail (excluding
 * timeout completions) plus the requested count, and are inserted by walking
 * the list backwards. Always returns 0.
 */
5530 static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
5531 {
5532         struct io_ring_ctx *ctx = req->ctx;
5533         struct io_timeout_data *data = req->async_data;
5534         struct list_head *entry;
5535         u32 tail, off = req->timeout.off;
5536
5537         spin_lock_irq(&ctx->completion_lock);
5538
5539         /*
5540          * sqe->off holds how many events that need to occur for this
5541          * timeout event to be satisfied. If it isn't set, then this is
5542          * a pure timeout request, sequence isn't used.
5543          */
5544         if (io_is_timeout_noseq(req)) {
                 /* inserting after the current last element appends at the tail */
5545                 entry = ctx->timeout_list.prev;
5546                 goto add;
5547         }
5548
         /* u32 arithmetic: completions posted so far, not counting timeouts */
5549         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5550         req->timeout.target_seq = tail + off;
5551
5552         /* Update the last seq here in case io_flush_timeouts() hasn't.
5553          * This is safe because ->completion_lock is held, and submissions
5554          * and completions are never mixed in the same ->completion_lock section.
5555          */
5556         ctx->cq_last_tm_flush = tail;
5557
5558         /*
5559          * Insertion sort, ensuring the first entry in the list is always
5560          * the one we need first.
5561          */
5562         list_for_each_prev(entry, &ctx->timeout_list) {
5563                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5564                                                   timeout.list);
5565
5566                 if (io_is_timeout_noseq(nxt))
5567                         continue;
5568                 /* nxt.seq is behind @tail, otherwise would've been completed */
5569                 if (off >= nxt->timeout.target_seq - tail)
5570                         break;
5571         }
         /*
          * If the walk above ran to completion, entry is the list head and the
          * request is inserted at the front.
          */
5572 add:
5573         list_add(&req->timeout.list, entry);
5574         data->timer.function = io_timeout_fn;
5575         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5576         spin_unlock_irq(&ctx->completion_lock);
5577         return 0;
5578 }
5579
/*
 * Match key handed to io_cancel_cb(): a work item matches when its request
 * belongs to @ctx and carries @user_data.
 */
5580 struct io_cancel_data {
         /* ring the request to cancel must belong to */
5581         struct io_ring_ctx *ctx;
         /* user_data value of the request to cancel */
5582         u64 user_data;
5583 };
5584
5585 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5586 {
5587         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5588         struct io_cancel_data *cd = data;
5589
5590         return req->ctx == cd->ctx && req->user_data == cd->user_data;
5591 }
5592
5593 static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
5594                                struct io_ring_ctx *ctx)
5595 {
5596         struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
5597         enum io_wq_cancel cancel_ret;
5598         int ret = 0;
5599
5600         if (!tctx || !tctx->io_wq)
5601                 return -ENOENT;
5602
5603         cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
5604         switch (cancel_ret) {
5605         case IO_WQ_CANCEL_OK:
5606                 ret = 0;
5607                 break;
5608         case IO_WQ_CANCEL_RUNNING:
5609                 ret = -EALREADY;
5610                 break;
5611         case IO_WQ_CANCEL_NOTFOUND:
5612                 ret = -ENOENT;
5613                 break;
5614         }
5615
5616         return ret;
5617 }
5618
5619 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5620                                      struct io_kiocb *req, __u64 sqe_addr,
5621                                      int success_ret)
5622 {
5623         unsigned long flags;
5624         int ret;
5625
5626         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
5627         if (ret != -ENOENT) {
5628                 spin_lock_irqsave(&ctx->completion_lock, flags);
5629                 goto done;
5630         }
5631
5632         spin_lock_irqsave(&ctx->completion_lock, flags);
5633         ret = io_timeout_cancel(ctx, sqe_addr);
5634         if (ret != -ENOENT)
5635                 goto done;
5636         ret = io_poll_cancel(ctx, sqe_addr);
5637 done:
5638         if (!ret)
5639                 ret = success_ret;
5640         io_cqring_fill_event(req, ret);
5641         io_commit_cqring(ctx);
5642         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5643         io_cqring_ev_posted(ctx);
5644
5645         if (ret < 0)
5646                 req_set_fail_links(req);
5647         io_put_req(req);
5648 }
5649
5650 static int io_async_cancel_prep(struct io_kiocb *req,
5651                                 const struct io_uring_sqe *sqe)
5652 {
5653         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5654                 return -EINVAL;
5655         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5656                 return -EINVAL;
5657         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
5658                 return -EINVAL;
5659
5660         req->cancel.addr = READ_ONCE(sqe->addr);
5661         return 0;
5662 }
5663
/*
 * Issue handler for async cancel: try to cancel the request whose user_data
 * equals req->cancel.addr.
 *
 * Lookup order, each stage tried only while the previous one returned
 * -ENOENT: the submitting task's io-wq, pending timeouts, armed polls, and
 * finally the io-wq of every task on ctx->tctx_list. The final result is
 * posted as this request's completion; a negative result also fails linked
 * requests. Always returns 0 to the caller.
 */
5664 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
5665 {
5666         struct io_ring_ctx *ctx = req->ctx;
5667         u64 sqe_addr = req->cancel.addr;
5668         struct io_tctx_node *node;
5669         int ret;
5670
5671         /* tasks should wait for their io-wq threads, so safe w/o sync */
5672         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
         /* every path to "done" below arrives with completion_lock held */
5673         spin_lock_irq(&ctx->completion_lock);
5674         if (ret != -ENOENT)
5675                 goto done;
5676         ret = io_timeout_cancel(ctx, sqe_addr);
5677         if (ret != -ENOENT)
5678                 goto done;
5679         ret = io_poll_cancel(ctx, sqe_addr);
5680         if (ret != -ENOENT)
5681                 goto done;
         /* the spinlock is dropped before the slow path, which runs under the ring submit lock helper */
5682         spin_unlock_irq(&ctx->completion_lock);
5683
5684         /* slow path, try all io-wq's */
5685         io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
5686         ret = -ENOENT;
5687         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
5688                 struct io_uring_task *tctx = node->task->io_uring;
5689
5690                 if (!tctx || !tctx->io_wq)
5691                         continue;
5692                 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
5693                 if (ret != -ENOENT)
5694                         break;
5695         }
5696         io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
5697
         /* retake the completion lock so "done" sees the same locking state */
5698         spin_lock_irq(&ctx->completion_lock);
5699 done:
5700         io_cqring_fill_event(req, ret);
5701         io_commit_cqring(ctx);
5702         spin_unlock_irq(&ctx->completion_lock);
5703         io_cqring_ev_posted(ctx);
5704
5705         if (ret < 0)
5706                 req_set_fail_links(req);
5707         io_put_req(req);
5708         return 0;
5709 }
5710
5711 static int io_rsrc_update_prep(struct io_kiocb *req,
5712                                 const struct io_uring_sqe *sqe)
5713 {
5714         if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
5715                 return -EINVAL;
5716         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5717                 return -EINVAL;
5718         if (sqe->ioprio || sqe->rw_flags)
5719                 return -EINVAL;
5720
5721         req->rsrc_update.offset = READ_ONCE(sqe->off);
5722         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
5723         if (!req->rsrc_update.nr_args)
5724                 return -EINVAL;
5725         req->rsrc_update.arg = READ_ONCE(sqe->addr);
5726         return 0;
5727 }
5728
5729 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
5730 {
5731         struct io_ring_ctx *ctx = req->ctx;
5732         struct io_uring_rsrc_update up;
5733         int ret;
5734
5735         if (issue_flags & IO_URING_F_NONBLOCK)
5736                 return -EAGAIN;
5737
5738         up.offset = req->rsrc_update.offset;
5739         up.data = req->rsrc_update.arg;
5740
5741         mutex_lock(&ctx->uring_lock);
5742         ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
5743         mutex_unlock(&ctx->uring_lock);
5744
5745         if (ret < 0)
5746                 req_set_fail_links(req);
5747         __io_req_complete(req, issue_flags, ret, 0);
5748         return 0;
5749 }
5750
5751 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5752 {
5753         switch (req->opcode) {
5754         case IORING_OP_NOP:
5755                 return 0;
5756         case IORING_OP_READV:
5757         case IORING_OP_READ_FIXED:
5758         case IORING_OP_READ:
5759                 return io_read_prep(req, sqe);
5760         case IORING_OP_WRITEV:
5761         case IORING_OP_WRITE_FIXED:
5762         case IORING_OP_WRITE:
5763                 return io_write_prep(req, sqe);
5764         case IORING_OP_POLL_ADD:
5765                 return io_poll_add_prep(req, sqe);
5766         case IORING_OP_POLL_REMOVE:
5767                 return io_poll_remove_prep(req, sqe);
5768         case IORING_OP_FSYNC:
5769                 return io_fsync_prep(req, sqe);
5770         case IORING_OP_SYNC_FILE_RANGE:
5771                 return io_sfr_prep(req, sqe);
5772         case IORING_OP_SENDMSG:
5773         case IORING_OP_SEND:
5774                 return io_sendmsg_prep(req, sqe);
5775         case IORING_OP_RECVMSG:
5776         case IORING_OP_RECV:
5777                 return io_recvmsg_prep(req, sqe);
5778         case IORING_OP_CONNECT:
5779                 return io_connect_prep(req, sqe);
5780         case IORING_OP_TIMEOUT:
5781                 return io_timeout_prep(req, sqe, false);
5782         case IORING_OP_TIMEOUT_REMOVE:
5783                 return io_timeout_remove_prep(req, sqe);
5784         case IORING_OP_ASYNC_CANCEL:
5785                 return io_async_cancel_prep(req, sqe);
5786         case IORING_OP_LINK_TIMEOUT:
5787                 return io_timeout_prep(req, sqe, true);
5788         case IORING_OP_ACCEPT:
5789                 return io_accept_prep(req, sqe);
5790         case IORING_OP_FALLOCATE:
5791                 return io_fallocate_prep(req, sqe);
5792         case IORING_OP_OPENAT:
5793                 return io_openat_prep(req, sqe);
5794         case IORING_OP_CLOSE:
5795                 return io_close_prep(req, sqe);
5796         case IORING_OP_FILES_UPDATE:
5797                 return io_rsrc_update_prep(req, sqe);
5798         case IORING_OP_STATX:
5799                 return io_statx_prep(req, sqe);
5800         case IORING_OP_FADVISE:
5801                 return io_fadvise_prep(req, sqe);
5802         case IORING_OP_MADVISE:
5803                 return io_madvise_prep(req, sqe);
5804         case IORING_OP_OPENAT2:
5805                 return io_openat2_prep(req, sqe);
5806         case IORING_OP_EPOLL_CTL:
5807                 return io_epoll_ctl_prep(req, sqe);
5808         case IORING_OP_SPLICE:
5809                 return io_splice_prep(req, sqe);
5810         case IORING_OP_PROVIDE_BUFFERS:
5811                 return io_provide_buffers_prep(req, sqe);
5812         case IORING_OP_REMOVE_BUFFERS:
5813                 return io_remove_buffers_prep(req, sqe);
5814         case IORING_OP_TEE:
5815                 return io_tee_prep(req, sqe);
5816         case IORING_OP_SHUTDOWN:
5817                 return io_shutdown_prep(req, sqe);
5818         case IORING_OP_RENAMEAT:
5819                 return io_renameat_prep(req, sqe);
5820         case IORING_OP_UNLINKAT:
5821                 return io_unlinkat_prep(req, sqe);
5822         }
5823
5824         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5825                         req->opcode);
5826         return-EINVAL;
5827 }
5828
5829 static int io_req_prep_async(struct io_kiocb *req)
5830 {
5831         switch (req->opcode) {
5832         case IORING_OP_READV:
5833         case IORING_OP_READ_FIXED:
5834         case IORING_OP_READ:
5835                 return io_rw_prep_async(req, READ);
5836         case IORING_OP_WRITEV:
5837         case IORING_OP_WRITE_FIXED:
5838         case IORING_OP_WRITE:
5839                 return io_rw_prep_async(req, WRITE);
5840         case IORING_OP_SENDMSG:
5841         case IORING_OP_SEND:
5842                 return io_sendmsg_prep_async(req);
5843         case IORING_OP_RECVMSG:
5844         case IORING_OP_RECV:
5845                 return io_recvmsg_prep_async(req);
5846         case IORING_OP_CONNECT:
5847                 return io_connect_prep_async(req);
5848         }
5849         return 0;
5850 }
5851
5852 static int io_req_defer_prep(struct io_kiocb *req)
5853 {
5854         if (!io_op_defs[req->opcode].needs_async_data)
5855                 return 0;
5856         /* some opcodes init it during the inital prep */
5857         if (req->async_data)
5858                 return 0;
5859         if (__io_alloc_async_data(req))
5860                 return -EAGAIN;
5861         return io_req_prep_async(req);
5862 }
5863
5864 static u32 io_get_sequence(struct io_kiocb *req)
5865 {
5866         struct io_kiocb *pos;
5867         struct io_ring_ctx *ctx = req->ctx;
5868         u32 total_submitted, nr_reqs = 0;
5869
5870         io_for_each_link(pos, req)
5871                 nr_reqs++;
5872
5873         total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
5874         return total_submitted - nr_reqs;
5875 }
5876
/*
 * Decide whether @req has to wait on ctx->defer_list before being issued.
 *
 * Returns 0 when no deferral is needed. Returns -EIOCBQUEUED when the
 * request was either added to the defer list or, if the recheck under the
 * lock shows deferral is no longer required, queued as async work. Returns
 * the error from io_req_defer_prep(), or -ENOMEM if the defer entry could
 * not be allocated.
 */
5877 static int io_req_defer(struct io_kiocb *req)
5878 {
5879         struct io_ring_ctx *ctx = req->ctx;
5880         struct io_defer_entry *de;
5881         int ret;
5882         u32 seq;
5883
5884         /* Still need defer if there is pending req in defer list. */
5885         if (likely(list_empty_careful(&ctx->defer_list) &&
5886                 !(req->flags & REQ_F_IO_DRAIN)))
5887                 return 0;
5888
5889         seq = io_get_sequence(req);
5890         /* Still a chance to pass the sequence check */
5891         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
5892                 return 0;
5893
5894         ret = io_req_defer_prep(req);
5895         if (ret)
5896                 return ret;
5897         io_prep_async_link(req);
         /* GFP_KERNEL allocation is done before the spinlock is taken */
5898         de = kmalloc(sizeof(*de), GFP_KERNEL);
5899         if (!de)
5900                 return -ENOMEM;
5901
5902         spin_lock_irq(&ctx->completion_lock);
         /* recheck under the lock; the earlier checks were made without it */
5903         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
5904                 spin_unlock_irq(&ctx->completion_lock);
5905                 kfree(de);
5906                 io_queue_async_work(req);
5907                 return -EIOCBQUEUED;
5908         }
5909
5910         trace_io_uring_defer(ctx, req, req->user_data);
5911         de->req = req;
5912         de->seq = seq;
5913         list_add_tail(&de->list, &ctx->defer_list);
5914         spin_unlock_irq(&ctx->completion_lock);
5915         return -EIOCBQUEUED;
5916 }
5917
5918 static void __io_clean_op(struct io_kiocb *req)
5919 {
5920         if (req->flags & REQ_F_BUFFER_SELECTED) {
5921                 switch (req->opcode) {
5922                 case IORING_OP_READV:
5923                 case IORING_OP_READ_FIXED:
5924                 case IORING_OP_READ:
5925                         kfree((void *)(unsigned long)req->rw.addr);
5926                         break;
5927                 case IORING_OP_RECVMSG:
5928                 case IORING_OP_RECV:
5929                         kfree(req->sr_msg.kbuf);
5930                         break;
5931                 }
5932                 req->flags &= ~REQ_F_BUFFER_SELECTED;
5933         }
5934
5935         if (req->flags & REQ_F_NEED_CLEANUP) {
5936                 switch (req->opcode) {
5937                 case IORING_OP_READV:
5938                 case IORING_OP_READ_FIXED:
5939                 case IORING_OP_READ:
5940                 case IORING_OP_WRITEV:
5941                 case IORING_OP_WRITE_FIXED:
5942                 case IORING_OP_WRITE: {
5943                         struct io_async_rw *io = req->async_data;
5944                         if (io->free_iovec)
5945                                 kfree(io->free_iovec);
5946                         break;
5947                         }
5948                 case IORING_OP_RECVMSG:
5949                 case IORING_OP_SENDMSG: {
5950                         struct io_async_msghdr *io = req->async_data;
5951
5952                         kfree(io->free_iov);
5953                         break;
5954                         }
5955                 case IORING_OP_SPLICE:
5956                 case IORING_OP_TEE:
5957                         io_put_file(req, req->splice.file_in,
5958                                     (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5959                         break;
5960                 case IORING_OP_OPENAT:
5961                 case IORING_OP_OPENAT2:
5962                         if (req->open.filename)
5963                                 putname(req->open.filename);
5964                         break;
5965                 case IORING_OP_RENAMEAT:
5966                         putname(req->rename.oldpath);
5967                         putname(req->rename.newpath);
5968                         break;
5969                 case IORING_OP_UNLINKAT:
5970                         putname(req->unlink.filename);
5971                         break;
5972                 }
5973                 req->flags &= ~REQ_F_NEED_CLEANUP;
5974         }
5975 }
5976
5977 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
5978 {
5979         struct io_ring_ctx *ctx = req->ctx;
5980         const struct cred *creds = NULL;
5981         int ret;
5982
5983         if (req->work.creds && req->work.creds != current_cred())
5984                 creds = override_creds(req->work.creds);
5985
5986         switch (req->opcode) {
5987         case IORING_OP_NOP:
5988                 ret = io_nop(req, issue_flags);
5989                 break;
5990         case IORING_OP_READV:
5991         case IORING_OP_READ_FIXED:
5992         case IORING_OP_READ:
5993                 ret = io_read(req, issue_flags);
5994                 break;
5995         case IORING_OP_WRITEV:
5996         case IORING_OP_WRITE_FIXED:
5997         case IORING_OP_WRITE:
5998                 ret = io_write(req, issue_flags);
5999                 break;
6000         case IORING_OP_FSYNC:
6001                 ret = io_fsync(req, issue_flags);
6002                 break;
6003         case IORING_OP_POLL_ADD:
6004                 ret = io_poll_add(req, issue_flags);
6005                 break;
6006         case IORING_OP_POLL_REMOVE:
6007                 ret = io_poll_remove(req, issue_flags);
6008                 break;
6009         case IORING_OP_SYNC_FILE_RANGE:
6010                 ret = io_sync_file_range(req, issue_flags);
6011                 break;
6012         case IORING_OP_SENDMSG:
6013                 ret = io_sendmsg(req, issue_flags);
6014                 break;
6015         case IORING_OP_SEND:
6016                 ret = io_send(req, issue_flags);
6017                 break;
6018         case IORING_OP_RECVMSG:
6019                 ret = io_recvmsg(req, issue_flags);
6020                 break;
6021         case IORING_OP_RECV:
6022                 ret = io_recv(req, issue_flags);
6023                 break;
6024         case IORING_OP_TIMEOUT:
6025                 ret = io_timeout(req, issue_flags);
6026                 break;
6027         case IORING_OP_TIMEOUT_REMOVE:
6028                 ret = io_timeout_remove(req, issue_flags);
6029                 break;
6030         case IORING_OP_ACCEPT:
6031                 ret = io_accept(req, issue_flags);
6032                 break;
6033         case IORING_OP_CONNECT:
6034                 ret = io_connect(req, issue_flags);
6035                 break;
6036         case IORING_OP_ASYNC_CANCEL:
6037                 ret = io_async_cancel(req, issue_flags);
6038                 break;
6039         case IORING_OP_FALLOCATE:
6040                 ret = io_fallocate(req, issue_flags);
6041                 break;
6042         case IORING_OP_OPENAT:
6043                 ret = io_openat(req, issue_flags);
6044                 break;
6045         case IORING_OP_CLOSE:
6046                 ret = io_close(req, issue_flags);
6047                 break;
6048         case IORING_OP_FILES_UPDATE:
6049                 ret = io_files_update(req, issue_flags);
6050                 break;
6051         case IORING_OP_STATX:
6052                 ret = io_statx(req, issue_flags);
6053                 break;
6054         case IORING_OP_FADVISE:
6055                 ret = io_fadvise(req, issue_flags);
6056                 break;
6057         case IORING_OP_MADVISE:
6058                 ret = io_madvise(req, issue_flags);
6059                 break;
6060         case IORING_OP_OPENAT2:
6061                 ret = io_openat2(req, issue_flags);
6062                 break;
6063         case IORING_OP_EPOLL_CTL:
6064                 ret = io_epoll_ctl(req, issue_flags);
6065                 break;
6066         case IORING_OP_SPLICE:
6067                 ret = io_splice(req, issue_flags);
6068                 break;
6069         case IORING_OP_PROVIDE_BUFFERS:
6070                 ret = io_provide_buffers(req, issue_flags);
6071                 break;
6072         case IORING_OP_REMOVE_BUFFERS:
6073                 ret = io_remove_buffers(req, issue_flags);
6074                 break;
6075         case IORING_OP_TEE:
6076                 ret = io_tee(req, issue_flags);
6077                 break;
6078         case IORING_OP_SHUTDOWN:
6079                 ret = io_shutdown(req, issue_flags);
6080                 break;
6081         case IORING_OP_RENAMEAT:
6082                 ret = io_renameat(req, issue_flags);
6083                 break;
6084         case IORING_OP_UNLINKAT:
6085                 ret = io_unlinkat(req, issue_flags);
6086                 break;
6087         default:
6088                 ret = -EINVAL;
6089                 break;
6090         }
6091
6092         if (creds)
6093                 revert_creds(creds);
6094
6095         if (ret)
6096                 return ret;
6097
6098         /* If the op doesn't have a file, we're not polling for it */
6099         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
6100                 const bool in_async = io_wq_current_is_worker();
6101
6102                 /* workqueue context doesn't hold uring_lock, grab it now */
6103                 if (in_async)
6104                         mutex_lock(&ctx->uring_lock);
6105
6106                 io_iopoll_req_issued(req, in_async);
6107
6108                 if (in_async)
6109                         mutex_unlock(&ctx->uring_lock);
6110         }
6111
6112         return 0;
6113 }
6114
6115 static void io_wq_submit_work(struct io_wq_work *work)
6116 {
6117         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6118         struct io_kiocb *timeout;
6119         int ret = 0;
6120
6121         timeout = io_prep_linked_timeout(req);
6122         if (timeout)
6123                 io_queue_linked_timeout(timeout);
6124
6125         if (work->flags & IO_WQ_WORK_CANCEL)
6126                 ret = -ECANCELED;
6127
6128         if (!ret) {
6129                 do {
6130                         ret = io_issue_sqe(req, 0);
6131                         /*
6132                          * We can get EAGAIN for polled IO even though we're
6133                          * forcing a sync submission from here, since we can't
6134                          * wait for request slots on the block side.
6135                          */
6136                         if (ret != -EAGAIN)
6137                                 break;
6138                         cond_resched();
6139                 } while (1);
6140         }
6141
6142         /* avoid locking problems by failing it from a clean context */
6143         if (ret) {
6144                 /* io-wq is going to take one down */
6145                 refcount_inc(&req->refs);
6146                 io_req_task_queue_fail(req, ret);
6147         }
6148 }
6149
6150 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6151                                               int index)
6152 {
6153         struct fixed_rsrc_table *table;
6154
6155         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
6156         return table->files[index & IORING_FILE_TABLE_MASK];
6157 }
6158
6159 static struct file *io_file_get(struct io_submit_state *state,
6160                                 struct io_kiocb *req, int fd, bool fixed)
6161 {
6162         struct io_ring_ctx *ctx = req->ctx;
6163         struct file *file;
6164
6165         if (fixed) {
6166                 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6167                         return NULL;
6168                 fd = array_index_nospec(fd, ctx->nr_user_files);
6169                 file = io_file_from_index(ctx, fd);
6170                 io_set_resource_node(req);
6171         } else {
6172                 trace_io_uring_file_get(ctx, fd);
6173                 file = __io_file_get(state, fd);
6174         }
6175
6176         if (file && unlikely(file->f_op == &io_uring_fops))
6177                 io_req_track_inflight(req);
6178         return file;
6179 }
6180
6181 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6182 {
6183         struct io_timeout_data *data = container_of(timer,
6184                                                 struct io_timeout_data, timer);
6185         struct io_kiocb *prev, *req = data->req;
6186         struct io_ring_ctx *ctx = req->ctx;
6187         unsigned long flags;
6188
6189         spin_lock_irqsave(&ctx->completion_lock, flags);
6190         prev = req->timeout.head;
6191         req->timeout.head = NULL;
6192
6193         /*
6194          * We don't expect the list to be empty, that will only happen if we
6195          * race with the completion of the linked work.
6196          */
6197         if (prev && refcount_inc_not_zero(&prev->refs))
6198                 io_remove_next_linked(prev);
6199         else
6200                 prev = NULL;
6201         spin_unlock_irqrestore(&ctx->completion_lock, flags);
6202
6203         if (prev) {
6204                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
6205                 io_put_req_deferred(prev, 1);
6206         } else {
6207                 io_req_complete_post(req, -ETIME, 0);
6208                 io_put_req_deferred(req, 1);
6209         }
6210         return HRTIMER_NORESTART;
6211 }
6212
6213 static void __io_queue_linked_timeout(struct io_kiocb *req)
6214 {
6215         /*
6216          * If the back reference is NULL, then our linked request finished
6217          * before we got a chance to setup the timer
6218          */
6219         if (req->timeout.head) {
6220                 struct io_timeout_data *data = req->async_data;
6221
6222                 data->timer.function = io_link_timeout_fn;
6223                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6224                                 data->mode);
6225         }
6226 }
6227
6228 static void io_queue_linked_timeout(struct io_kiocb *req)
6229 {
6230         struct io_ring_ctx *ctx = req->ctx;
6231
6232         spin_lock_irq(&ctx->completion_lock);
6233         __io_queue_linked_timeout(req);
6234         spin_unlock_irq(&ctx->completion_lock);
6235
6236         /* drop submission reference */
6237         io_put_req(req);
6238 }
6239
6240 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
6241 {
6242         struct io_kiocb *nxt = req->link;
6243
6244         if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
6245             nxt->opcode != IORING_OP_LINK_TIMEOUT)
6246                 return NULL;
6247
6248         nxt->timeout.head = req;
6249         nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
6250         req->flags |= REQ_F_LINK_TIMEOUT;
6251         return nxt;
6252 }
6253
6254 static void __io_queue_sqe(struct io_kiocb *req)
6255 {
6256         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6257         int ret;
6258
6259         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
6260
6261         /*
6262          * We async punt it if the file wasn't marked NOWAIT, or if the file
6263          * doesn't support non-blocking read/write attempts
6264          */
6265         if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
6266                 if (!io_arm_poll_handler(req)) {
6267                         /*
6268                          * Queued up for async execution, worker will release
6269                          * submit reference when the iocb is actually submitted.
6270                          */
6271                         io_queue_async_work(req);
6272                 }
6273         } else if (likely(!ret)) {
6274                 /* drop submission reference */
6275                 if (req->flags & REQ_F_COMPLETE_INLINE) {
6276                         struct io_ring_ctx *ctx = req->ctx;
6277                         struct io_comp_state *cs = &ctx->submit_state.comp;
6278
6279                         cs->reqs[cs->nr++] = req;
6280                         if (cs->nr == ARRAY_SIZE(cs->reqs))
6281                                 io_submit_flush_completions(cs, ctx);
6282                 } else {
6283                         io_put_req(req);
6284                 }
6285         } else {
6286                 req_set_fail_links(req);
6287                 io_put_req(req);
6288                 io_req_complete(req, ret);
6289         }
6290         if (linked_timeout)
6291                 io_queue_linked_timeout(linked_timeout);
6292 }
6293
6294 static void io_queue_sqe(struct io_kiocb *req)
6295 {
6296         int ret;
6297
6298         ret = io_req_defer(req);
6299         if (ret) {
6300                 if (ret != -EIOCBQUEUED) {
6301 fail_req:
6302                         req_set_fail_links(req);
6303                         io_put_req(req);
6304                         io_req_complete(req, ret);
6305                 }
6306         } else if (req->flags & REQ_F_FORCE_ASYNC) {
6307                 ret = io_req_defer_prep(req);
6308                 if (unlikely(ret))
6309                         goto fail_req;
6310                 io_queue_async_work(req);
6311         } else {
6312                 __io_queue_sqe(req);
6313         }
6314 }
6315
6316 /*
6317  * Check SQE restrictions (opcode and flags).
6318  *
6319  * Returns 'true' if SQE is allowed, 'false' otherwise.
6320  */
6321 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6322                                         struct io_kiocb *req,
6323                                         unsigned int sqe_flags)
6324 {
6325         if (!ctx->restricted)
6326                 return true;
6327
6328         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6329                 return false;
6330
6331         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6332             ctx->restrictions.sqe_flags_required)
6333                 return false;
6334
6335         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6336                           ctx->restrictions.sqe_flags_required))
6337                 return false;
6338
6339         return true;
6340 }
6341
6342 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6343                        const struct io_uring_sqe *sqe)
6344 {
6345         struct io_submit_state *state;
6346         unsigned int sqe_flags;
6347         int personality, ret = 0;
6348
6349         req->opcode = READ_ONCE(sqe->opcode);
6350         /* same numerical values with corresponding REQ_F_*, safe to copy */
6351         req->flags = sqe_flags = READ_ONCE(sqe->flags);
6352         req->user_data = READ_ONCE(sqe->user_data);
6353         req->async_data = NULL;
6354         req->file = NULL;
6355         req->ctx = ctx;
6356         req->link = NULL;
6357         req->fixed_rsrc_refs = NULL;
6358         /* one is dropped after submission, the other at completion */
6359         refcount_set(&req->refs, 2);
6360         req->task = current;
6361         req->result = 0;
6362         req->work.list.next = NULL;
6363         req->work.creds = NULL;
6364         req->work.flags = 0;
6365
6366         /* enforce forwards compatibility on users */
6367         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
6368                 req->flags = 0;
6369                 return -EINVAL;
6370         }
6371
6372         if (unlikely(req->opcode >= IORING_OP_LAST))
6373                 return -EINVAL;
6374
6375         if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6376                 return -EACCES;
6377
6378         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6379             !io_op_defs[req->opcode].buffer_select)
6380                 return -EOPNOTSUPP;
6381
6382         personality = READ_ONCE(sqe->personality);
6383         if (personality) {
6384                 req->work.creds = xa_load(&ctx->personalities, personality);
6385                 if (!req->work.creds)
6386                         return -EINVAL;
6387                 get_cred(req->work.creds);
6388         }
6389         state = &ctx->submit_state;
6390
6391         /*
6392          * Plug now if we have more than 1 IO left after this, and the target
6393          * is potentially a read/write to block based storage.
6394          */
6395         if (!state->plug_started && state->ios_left > 1 &&
6396             io_op_defs[req->opcode].plug) {
6397                 blk_start_plug(&state->plug);
6398                 state->plug_started = true;
6399         }
6400
6401         if (io_op_defs[req->opcode].needs_file) {
6402                 bool fixed = req->flags & REQ_F_FIXED_FILE;
6403
6404                 req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
6405                 if (unlikely(!req->file))
6406                         ret = -EBADF;
6407         }
6408
6409         state->ios_left--;
6410         return ret;
6411 }
6412
6413 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
6414                          const struct io_uring_sqe *sqe)
6415 {
6416         struct io_submit_link *link = &ctx->submit_state.link;
6417         int ret;
6418
6419         ret = io_init_req(ctx, req, sqe);
6420         if (unlikely(ret)) {
6421 fail_req:
6422                 io_put_req(req);
6423                 io_req_complete(req, ret);
6424                 if (link->head) {
6425                         /* fail even hard links since we don't submit */
6426                         link->head->flags |= REQ_F_FAIL_LINK;
6427                         io_put_req(link->head);
6428                         io_req_complete(link->head, -ECANCELED);
6429                         link->head = NULL;
6430                 }
6431                 return ret;
6432         }
6433         ret = io_req_prep(req, sqe);
6434         if (unlikely(ret))
6435                 goto fail_req;
6436
6437         /* don't need @sqe from now on */
6438         trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6439                                 true, ctx->flags & IORING_SETUP_SQPOLL);
6440
6441         /*
6442          * If we already have a head request, queue this one for async
6443          * submittal once the head completes. If we don't have a head but
6444          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6445          * submitted sync once the chain is complete. If none of those
6446          * conditions are true (normal request), then just queue it.
6447          */
6448         if (link->head) {
6449                 struct io_kiocb *head = link->head;
6450
6451                 /*
6452                  * Taking sequential execution of a link, draining both sides
6453                  * of the link also fullfils IOSQE_IO_DRAIN semantics for all
6454                  * requests in the link. So, it drains the head and the
6455                  * next after the link request. The last one is done via
6456                  * drain_next flag to persist the effect across calls.
6457                  */
6458                 if (req->flags & REQ_F_IO_DRAIN) {
6459                         head->flags |= REQ_F_IO_DRAIN;
6460                         ctx->drain_next = 1;
6461                 }
6462                 ret = io_req_defer_prep(req);
6463                 if (unlikely(ret))
6464                         goto fail_req;
6465                 trace_io_uring_link(ctx, req, head);
6466                 link->last->link = req;
6467                 link->last = req;
6468
6469                 /* last request of a link, enqueue the link */
6470                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6471                         io_queue_sqe(head);
6472                         link->head = NULL;
6473                 }
6474         } else {
6475                 if (unlikely(ctx->drain_next)) {
6476                         req->flags |= REQ_F_IO_DRAIN;
6477                         ctx->drain_next = 0;
6478                 }
6479                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6480                         link->head = req;
6481                         link->last = req;
6482                 } else {
6483                         io_queue_sqe(req);
6484                 }
6485         }
6486
6487         return 0;
6488 }
6489
6490 /*
6491  * Batched submission is done, ensure local IO is flushed out.
6492  */
6493 static void io_submit_state_end(struct io_submit_state *state,
6494                                 struct io_ring_ctx *ctx)
6495 {
6496         if (state->link.head)
6497                 io_queue_sqe(state->link.head);
6498         if (state->comp.nr)
6499                 io_submit_flush_completions(&state->comp, ctx);
6500         if (state->plug_started)
6501                 blk_finish_plug(&state->plug);
6502         io_state_file_put(state);
6503 }
6504
6505 /*
6506  * Start submission side cache.
6507  */
6508 static void io_submit_state_start(struct io_submit_state *state,
6509                                   unsigned int max_ios)
6510 {
6511         state->plug_started = false;
6512         state->ios_left = max_ios;
6513         /* set only head, no need to init link_last in advance */
6514         state->link.head = NULL;
6515 }
6516
6517 static void io_commit_sqring(struct io_ring_ctx *ctx)
6518 {
6519         struct io_rings *rings = ctx->rings;
6520
6521         /*
6522          * Ensure any loads from the SQEs are done at this point,
6523          * since once we write the new head, the application could
6524          * write new data to them.
6525          */
6526         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6527 }
6528
6529 /*
6530  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
6531  * that is mapped by userspace. This means that care needs to be taken to
6532  * ensure that reads are stable, as we cannot rely on userspace always
6533  * being a good citizen. If members of the sqe are validated and then later
6534  * used, it's important that those reads are done through READ_ONCE() to
6535  * prevent a re-load down the line.
6536  */
6537 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6538 {
6539         u32 *sq_array = ctx->sq_array;
6540         unsigned head;
6541
6542         /*
6543          * The cached sq head (or cq tail) serves two purposes:
6544          *
6545          * 1) allows us to batch the cost of updating the user visible
6546          *    head updates.
6547          * 2) allows the kernel side to track the head on its own, even
6548          *    though the application is the one updating it.
6549          */
6550         head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
6551         if (likely(head < ctx->sq_entries))
6552                 return &ctx->sq_sqes[head];
6553
6554         /* drop invalid entries */
6555         ctx->cached_sq_dropped++;
6556         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6557         return NULL;
6558 }
6559
6560 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6561 {
6562         int submitted = 0;
6563
6564         /* if we have a backlog and couldn't flush it all, return BUSY */
6565         if (test_bit(0, &ctx->sq_check_overflow)) {
6566                 if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
6567                         return -EBUSY;
6568         }
6569
6570         /* make sure SQ entry isn't read before tail */
6571         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6572
6573         if (!percpu_ref_tryget_many(&ctx->refs, nr))
6574                 return -EAGAIN;
6575
6576         percpu_counter_add(&current->io_uring->inflight, nr);
6577         refcount_add(nr, &current->usage);
6578         io_submit_state_start(&ctx->submit_state, nr);
6579
6580         while (submitted < nr) {
6581                 const struct io_uring_sqe *sqe;
6582                 struct io_kiocb *req;
6583
6584                 req = io_alloc_req(ctx);
6585                 if (unlikely(!req)) {
6586                         if (!submitted)
6587                                 submitted = -EAGAIN;
6588                         break;
6589                 }
6590                 sqe = io_get_sqe(ctx);
6591                 if (unlikely(!sqe)) {
6592                         kmem_cache_free(req_cachep, req);
6593                         break;
6594                 }
6595                 /* will complete beyond this point, count as submitted */
6596                 submitted++;
6597                 if (io_submit_sqe(ctx, req, sqe))
6598                         break;
6599         }
6600
6601         if (unlikely(submitted != nr)) {
6602                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6603                 struct io_uring_task *tctx = current->io_uring;
6604                 int unused = nr - ref_used;
6605
6606                 percpu_ref_put_many(&ctx->refs, unused);
6607                 percpu_counter_sub(&tctx->inflight, unused);
6608                 put_task_struct_many(current, unused);
6609         }
6610
6611         io_submit_state_end(&ctx->submit_state, ctx);
6612          /* Commit SQ ring head once we've consumed and submitted all SQEs */
6613         io_commit_sqring(ctx);
6614
6615         return submitted;
6616 }
6617
6618 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6619 {
6620         /* Tell userspace we may need a wakeup call */
6621         spin_lock_irq(&ctx->completion_lock);
6622         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6623         spin_unlock_irq(&ctx->completion_lock);
6624 }
6625
6626 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6627 {
6628         spin_lock_irq(&ctx->completion_lock);
6629         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6630         spin_unlock_irq(&ctx->completion_lock);
6631 }
6632
6633 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6634 {
6635         unsigned int to_submit;
6636         int ret = 0;
6637
6638         to_submit = io_sqring_entries(ctx);
6639         /* if we're handling multiple rings, cap submit size for fairness */
6640         if (cap_entries && to_submit > 8)
6641                 to_submit = 8;
6642
6643         if (!list_empty(&ctx->iopoll_list) || to_submit) {
6644                 unsigned nr_events = 0;
6645
6646                 mutex_lock(&ctx->uring_lock);
6647                 if (!list_empty(&ctx->iopoll_list))
6648                         io_do_iopoll(ctx, &nr_events, 0);
6649
6650                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
6651                     !(ctx->flags & IORING_SETUP_R_DISABLED))
6652                         ret = io_submit_sqes(ctx, to_submit);
6653                 mutex_unlock(&ctx->uring_lock);
6654         }
6655
6656         if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
6657                 wake_up(&ctx->sqo_sq_wait);
6658
6659         return ret;
6660 }
6661
6662 static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
6663 {
6664         struct io_ring_ctx *ctx;
6665         unsigned sq_thread_idle = 0;
6666
6667         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6668                 if (sq_thread_idle < ctx->sq_thread_idle)
6669                         sq_thread_idle = ctx->sq_thread_idle;
6670         }
6671
6672         sqd->sq_thread_idle = sq_thread_idle;
6673 }
6674
6675 static int io_sq_thread(void *data)
6676 {
6677         struct io_sq_data *sqd = data;
6678         struct io_ring_ctx *ctx;
6679         unsigned long timeout = 0;
6680         char buf[TASK_COMM_LEN];
6681         DEFINE_WAIT(wait);
6682
6683         sprintf(buf, "iou-sqp-%d", sqd->task_pid);
6684         set_task_comm(current, buf);
6685         current->pf_io_worker = NULL;
6686
6687         if (sqd->sq_cpu != -1)
6688                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
6689         else
6690                 set_cpus_allowed_ptr(current, cpu_online_mask);
6691         current->flags |= PF_NO_SETAFFINITY;
6692
6693         mutex_lock(&sqd->lock);
6694         while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) {
6695                 int ret;
6696                 bool cap_entries, sqt_spin, needs_sched;
6697
6698                 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
6699                         mutex_unlock(&sqd->lock);
6700                         cond_resched();
6701                         mutex_lock(&sqd->lock);
6702                         io_run_task_work();
6703                         timeout = jiffies + sqd->sq_thread_idle;
6704                         continue;
6705                 }
6706                 if (fatal_signal_pending(current))
6707                         break;
6708                 sqt_spin = false;
6709                 cap_entries = !list_is_singular(&sqd->ctx_list);
6710                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6711                         const struct cred *creds = NULL;
6712
6713                         if (ctx->sq_creds != current_cred())
6714                                 creds = override_creds(ctx->sq_creds);
6715                         ret = __io_sq_thread(ctx, cap_entries);
6716                         if (creds)
6717                                 revert_creds(creds);
6718                         if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
6719                                 sqt_spin = true;
6720                 }
6721
6722                 if (sqt_spin || !time_after(jiffies, timeout)) {
6723                         io_run_task_work();
6724                         cond_resched();
6725                         if (sqt_spin)
6726                                 timeout = jiffies + sqd->sq_thread_idle;
6727                         continue;
6728                 }
6729
6730                 needs_sched = true;
6731                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
6732                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6733                         if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6734                             !list_empty_careful(&ctx->iopoll_list)) {
6735                                 needs_sched = false;
6736                                 break;
6737                         }
6738                         if (io_sqring_entries(ctx)) {
6739                                 needs_sched = false;
6740                                 break;
6741                         }
6742                 }
6743
6744                 if (needs_sched && !test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) {
6745                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6746                                 io_ring_set_wakeup_flag(ctx);
6747
6748                         mutex_unlock(&sqd->lock);
6749                         schedule();
6750                         try_to_freeze();
6751                         mutex_lock(&sqd->lock);
6752                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6753                                 io_ring_clear_wakeup_flag(ctx);
6754                 }
6755
6756                 finish_wait(&sqd->wait, &wait);
6757                 timeout = jiffies + sqd->sq_thread_idle;
6758         }
6759
6760         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6761                 io_uring_cancel_sqpoll(ctx);
6762         sqd->thread = NULL;
6763         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6764                 io_ring_set_wakeup_flag(ctx);
6765         mutex_unlock(&sqd->lock);
6766
6767         io_run_task_work();
6768         complete(&sqd->exited);
6769         do_exit(0);
6770 }
6771
6772 struct io_wait_queue {
6773         struct wait_queue_entry wq;
6774         struct io_ring_ctx *ctx;
6775         unsigned to_wait;
6776         unsigned nr_timeouts;
6777 };
6778
6779 static inline bool io_should_wake(struct io_wait_queue *iowq)
6780 {
6781         struct io_ring_ctx *ctx = iowq->ctx;
6782
6783         /*
6784          * Wake up if we have enough events, or if a timeout occurred since we
6785          * started waiting. For timeouts, we always want to return to userspace,
6786          * regardless of event count.
6787          */
6788         return io_cqring_events(ctx) >= iowq->to_wait ||
6789                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6790 }
6791
6792 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6793                             int wake_flags, void *key)
6794 {
6795         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6796                                                         wq);
6797
6798         /*
6799          * Cannot safely flush overflowed CQEs from here, ensure we wake up
6800          * the task, and the next invocation will do it.
6801          */
6802         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
6803                 return autoremove_wake_function(curr, mode, wake_flags, key);
6804         return -1;
6805 }
6806
6807 static int io_run_task_work_sig(void)
6808 {
6809         if (io_run_task_work())
6810                 return 1;
6811         if (!signal_pending(current))
6812                 return 0;
6813         if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
6814                 return -ERESTARTSYS;
6815         return -EINTR;
6816 }
6817
6818 /* when returns >0, the caller should retry */
6819 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
6820                                           struct io_wait_queue *iowq,
6821                                           signed long *timeout)
6822 {
6823         int ret;
6824
6825         /* make sure we run task_work before checking for signals */
6826         ret = io_run_task_work_sig();
6827         if (ret || io_should_wake(iowq))
6828                 return ret;
6829         /* let the caller flush overflows, retry */
6830         if (test_bit(0, &ctx->cq_check_overflow))
6831                 return 1;
6832
6833         *timeout = schedule_timeout(*timeout);
6834         return !*timeout ? -ETIME : 1;
6835 }
6836
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 *
 * @ctx:        ring to wait on
 * @min_events: number of completion events to wait for
 * @sig/@sigsz: optional user signal mask installed for the duration of the
 *              wait (compat layout when in a compat syscall)
 * @uts:        optional user timespec bounding the wait
 *
 * Returns 0 if the CQ ring is non-empty on exit or enough events were
 * already present; otherwise the error that ended the wait (-EBUSY if the
 * overflow list could not be flushed, -ETIME on timeout, a signal error,
 * -EFAULT for an unreadable @uts, or the error from installing @sig).
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
                          const sigset_t __user *sig, size_t sigsz,
                          struct __kernel_timespec __user *uts)
{
        struct io_wait_queue iowq = {
                .wq = {
                        .private        = current,
                        .func           = io_wake_function,
                        .entry          = LIST_HEAD_INIT(iowq.wq.entry),
                },
                .ctx            = ctx,
                .to_wait        = min_events,
        };
        struct io_rings *rings = ctx->rings;
        signed long timeout = MAX_SCHEDULE_TIMEOUT;
        int ret;

        /*
         * Fast path: flush overflow and run task work until either enough
         * events are present or there is no more task work to run.
         */
        do {
                io_cqring_overflow_flush(ctx, false, NULL, NULL);
                if (io_cqring_events(ctx) >= min_events)
                        return 0;
                if (!io_run_task_work())
                        break;
        } while (1);

        if (sig) {
#ifdef CONFIG_COMPAT
                if (in_compat_syscall())
                        ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
                                                      sigsz);
                else
#endif
                        ret = set_user_sigmask(sig, sigsz);

                if (ret)
                        return ret;
        }

        if (uts) {
                struct timespec64 ts;

                if (get_timespec64(&ts, uts))
                        return -EFAULT;
                timeout = timespec64_to_jiffies(&ts);
        }

        /* snapshot the timeout counter so io_should_wake() can detect changes */
        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
        trace_io_uring_cqring_wait(ctx, min_events);
        do {
                /* if we can't even flush overflow, don't wait for more */
                if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
                        ret = -EBUSY;
                        break;
                }
                prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
                                                TASK_INTERRUPTIBLE);
                ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
                finish_wait(&ctx->wait, &iowq.wq);
                cond_resched();
        } while (ret > 0);

        restore_saved_sigmask_unless(ret == -EINTR);

        /* any available completion turns the result into success */
        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
6906
6907 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6908 {
6909 #if defined(CONFIG_UNIX)
6910         if (ctx->ring_sock) {
6911                 struct sock *sock = ctx->ring_sock->sk;
6912                 struct sk_buff *skb;
6913
6914                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6915                         kfree_skb(skb);
6916         }
6917 #else
6918         int i;
6919
6920         for (i = 0; i < ctx->nr_user_files; i++) {
6921                 struct file *file;
6922
6923                 file = io_file_from_index(ctx, i);
6924                 if (file)
6925                         fput(file);
6926         }
6927 #endif
6928 }
6929
6930 static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
6931 {
6932         struct fixed_rsrc_data *data;
6933
6934         data = container_of(ref, struct fixed_rsrc_data, refs);
6935         complete(&data->done);
6936 }
6937
/* Take ctx->rsrc_ref_lock with bottom halves disabled. */
static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
{
        spin_lock_bh(&ctx->rsrc_ref_lock);
}
6942
/* Release ctx->rsrc_ref_lock and re-enable bottom halves. */
static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
{
        spin_unlock_bh(&ctx->rsrc_ref_lock);
}
6947
/*
 * Make @ref_node the current node of @rsrc_data.
 *
 * Under the rsrc ref lock the node is published in rsrc_data->node and
 * appended to ctx->rsrc_ref_list; afterwards a reference is taken on
 * rsrc_data->refs (dropped again in __io_rsrc_put_work() when the node is
 * retired).
 */
static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
                                 struct fixed_rsrc_data *rsrc_data,
                                 struct fixed_rsrc_ref_node *ref_node)
{
        io_rsrc_ref_lock(ctx);
        rsrc_data->node = ref_node;
        list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
        io_rsrc_ref_unlock(ctx);
        percpu_ref_get(&rsrc_data->refs);
}
6958
6959 static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
6960 {
6961         struct fixed_rsrc_ref_node *ref_node = NULL;
6962
6963         io_rsrc_ref_lock(ctx);
6964         ref_node = data->node;
6965         data->node = NULL;
6966         io_rsrc_ref_unlock(ctx);
6967         if (ref_node)
6968                 percpu_ref_kill(&ref_node->refs);
6969 }
6970
/*
 * Wait for all references on @data to drain.
 *
 * @data:     resource data to quiesce
 * @ctx:      owning ring
 * @rsrc_put: put callback stored in the backup node allocated per attempt
 *
 * Returns 0 once data->done completes, -ENXIO if a quiesce is already in
 * progress, -ENOMEM if a backup node cannot be allocated, or the negative
 * value from io_run_task_work_sig() when a signal interrupts the wait.
 *
 * NOTE(review): the unlock/lock pair in the retry path implies the caller
 * holds ctx->uring_lock on entry - confirm against callers.
 */
static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
                               struct io_ring_ctx *ctx,
                               void (*rsrc_put)(struct io_ring_ctx *ctx,
                                                struct io_rsrc_put *prsrc))
{
        struct fixed_rsrc_ref_node *backup_node;
        int ret;

        if (data->quiesce)
                return -ENXIO;

        data->quiesce = true;
        do {
                /*
                 * Allocate the replacement node up front so the interrupted
                 * path below can restore a working node without allocating.
                 */
                ret = -ENOMEM;
                backup_node = alloc_fixed_rsrc_ref_node(ctx);
                if (!backup_node)
                        break;
                backup_node->rsrc_data = data;
                backup_node->rsrc_put = rsrc_put;

                io_sqe_rsrc_kill_node(ctx, data);
                percpu_ref_kill(&data->refs);
                flush_delayed_work(&ctx->rsrc_put_work);

                ret = wait_for_completion_interruptible(&data->done);
                if (!ret)
                        break;

                /*
                 * Interrupted: bring the refs back to life, install the
                 * backup node and re-arm the completion, then run task work
                 * with uring_lock dropped before trying again.
                 */
                percpu_ref_resurrect(&data->refs);
                io_sqe_rsrc_set_node(ctx, data, backup_node);
                backup_node = NULL;
                reinit_completion(&data->done);
                mutex_unlock(&ctx->uring_lock);
                ret = io_run_task_work_sig();
                mutex_lock(&ctx->uring_lock);
        } while (ret >= 0);
        data->quiesce = false;

        /* a backup node that was never installed is still ours to free */
        if (backup_node)
                destroy_fixed_rsrc_ref_node(backup_node);
        return ret;
}
7013
7014 static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
7015 {
7016         struct fixed_rsrc_data *data;
7017
7018         data = kzalloc(sizeof(*data), GFP_KERNEL);
7019         if (!data)
7020                 return NULL;
7021
7022         if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
7023                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
7024                 kfree(data);
7025                 return NULL;
7026         }
7027         data->ctx = ctx;
7028         init_completion(&data->done);
7029         return data;
7030 }
7031
/*
 * Tear down a fixed_rsrc_data: exit its percpu ref, then free the table
 * array and the structure itself. The per-table ->files arrays are not
 * freed here; callers release those separately.
 */
static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
{
        percpu_ref_exit(&data->refs);
        kfree(data->table);
        kfree(data);
}
7038
7039 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7040 {
7041         struct fixed_rsrc_data *data = ctx->file_data;
7042         unsigned nr_tables, i;
7043         int ret;
7044
7045         /*
7046          * percpu_ref_is_dying() is to stop parallel files unregister
7047          * Since we possibly drop uring lock later in this function to
7048          * run task work.
7049          */
7050         if (!data || percpu_ref_is_dying(&data->refs))
7051                 return -ENXIO;
7052         ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
7053         if (ret)
7054                 return ret;
7055
7056         __io_sqe_files_unregister(ctx);
7057         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
7058         for (i = 0; i < nr_tables; i++)
7059                 kfree(data->table[i].files);
7060         free_fixed_rsrc_data(data);
7061         ctx->file_data = NULL;
7062         ctx->nr_user_files = 0;
7063         return 0;
7064 }
7065
/*
 * Undo one io_sq_thread_park(): drop this caller's park request and release
 * sqd->lock. Must not be called from the SQ thread itself.
 */
static void io_sq_thread_unpark(struct io_sq_data *sqd)
        __releases(&sqd->lock)
{
        WARN_ON_ONCE(sqd->thread == current);

        /*
         * Do the dance but not conditional clear_bit() because it'd race with
         * other threads incrementing park_pending and setting the bit.
         * The bit is cleared unconditionally and set again if other park
         * requests are still outstanding.
         */
        clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        if (atomic_dec_return(&sqd->park_pending))
                set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        mutex_unlock(&sqd->lock);
}
7080
/*
 * Ask the SQ thread to park and take sqd->lock.
 *
 * The park request is counted in park_pending and flagged via
 * IO_SQ_THREAD_SHOULD_PARK before the lock is taken; io_sq_thread() drops
 * the lock when it sees the bit. Returns with sqd->lock held; pair with
 * io_sq_thread_unpark(). Must not be called from the SQ thread itself.
 */
static void io_sq_thread_park(struct io_sq_data *sqd)
        __acquires(&sqd->lock)
{
        WARN_ON_ONCE(sqd->thread == current);

        atomic_inc(&sqd->park_pending);
        set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
        mutex_lock(&sqd->lock);
        /* wake the thread, if it still exists, so it notices the request */
        if (sqd->thread)
                wake_up_process(sqd->thread);
}
7092
/*
 * Tell the SQ thread to exit and wait until it has done so.
 *
 * IO_SQ_THREAD_SHOULD_STOP is set under sqd->lock and the thread is woken if
 * it still exists; the function then blocks on sqd->exited, which
 * io_sq_thread() completes right before do_exit(). Must not be called from
 * the SQ thread itself.
 */
static void io_sq_thread_stop(struct io_sq_data *sqd)
{
        WARN_ON_ONCE(sqd->thread == current);

        mutex_lock(&sqd->lock);
        set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
        if (sqd->thread)
                wake_up_process(sqd->thread);
        mutex_unlock(&sqd->lock);
        wait_for_completion(&sqd->exited);
}
7104
7105 static void io_put_sq_data(struct io_sq_data *sqd)
7106 {
7107         if (refcount_dec_and_test(&sqd->refs)) {
7108                 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
7109
7110                 io_sq_thread_stop(sqd);
7111                 kfree(sqd);
7112         }
7113 }
7114
7115 static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7116 {
7117         struct io_sq_data *sqd = ctx->sq_data;
7118
7119         if (sqd) {
7120                 io_sq_thread_park(sqd);
7121                 list_del_init(&ctx->sqd_list);
7122                 io_sqd_update_thread_idle(sqd);
7123                 io_sq_thread_unpark(sqd);
7124
7125                 io_put_sq_data(sqd);
7126                 ctx->sq_data = NULL;
7127                 if (ctx->sq_creds)
7128                         put_cred(ctx->sq_creds);
7129         }
7130 }
7131
7132 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7133 {
7134         struct io_ring_ctx *ctx_attach;
7135         struct io_sq_data *sqd;
7136         struct fd f;
7137
7138         f = fdget(p->wq_fd);
7139         if (!f.file)
7140                 return ERR_PTR(-ENXIO);
7141         if (f.file->f_op != &io_uring_fops) {
7142                 fdput(f);
7143                 return ERR_PTR(-EINVAL);
7144         }
7145
7146         ctx_attach = f.file->private_data;
7147         sqd = ctx_attach->sq_data;
7148         if (!sqd) {
7149                 fdput(f);
7150                 return ERR_PTR(-EINVAL);
7151         }
7152         if (sqd->task_tgid != current->tgid) {
7153                 fdput(f);
7154                 return ERR_PTR(-EPERM);
7155         }
7156
7157         refcount_inc(&sqd->refs);
7158         fdput(f);
7159         return sqd;
7160 }
7161
7162 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
7163                                          bool *attached)
7164 {
7165         struct io_sq_data *sqd;
7166
7167         *attached = false;
7168         if (p->flags & IORING_SETUP_ATTACH_WQ) {
7169                 sqd = io_attach_sq_data(p);
7170                 if (!IS_ERR(sqd)) {
7171                         *attached = true;
7172                         return sqd;
7173                 }
7174                 /* fall through for EPERM case, setup new sqd/task */
7175                 if (PTR_ERR(sqd) != -EPERM)
7176                         return sqd;
7177         }
7178
7179         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7180         if (!sqd)
7181                 return ERR_PTR(-ENOMEM);
7182
7183         atomic_set(&sqd->park_pending, 0);
7184         refcount_set(&sqd->refs, 1);
7185         INIT_LIST_HEAD(&sqd->ctx_list);
7186         mutex_init(&sqd->lock);
7187         init_waitqueue_head(&sqd->wait);
7188         init_completion(&sqd->exited);
7189         return sqd;
7190 }
7191
7192 #if defined(CONFIG_UNIX)
7193 /*
7194  * Ensure the UNIX gc is aware of our file set, so we are certain that
7195  * the io_uring can be safely unregistered on process exit, even if we have
7196  * loops in the file referencing.
7197  */
7198 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7199 {
7200         struct sock *sk = ctx->ring_sock->sk;
7201         struct scm_fp_list *fpl;
7202         struct sk_buff *skb;
7203         int i, nr_files;
7204
7205         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7206         if (!fpl)
7207                 return -ENOMEM;
7208
7209         skb = alloc_skb(0, GFP_KERNEL);
7210         if (!skb) {
7211                 kfree(fpl);
7212                 return -ENOMEM;
7213         }
7214
7215         skb->sk = sk;
7216
7217         nr_files = 0;
7218         fpl->user = get_uid(current_user());
7219         for (i = 0; i < nr; i++) {
7220                 struct file *file = io_file_from_index(ctx, i + offset);
7221
7222                 if (!file)
7223                         continue;
7224                 fpl->fp[nr_files] = get_file(file);
7225                 unix_inflight(fpl->user, fpl->fp[nr_files]);
7226                 nr_files++;
7227         }
7228
7229         if (nr_files) {
7230                 fpl->max = SCM_MAX_FD;
7231                 fpl->count = nr_files;
7232                 UNIXCB(skb).fp = fpl;
7233                 skb->destructor = unix_destruct_scm;
7234                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7235                 skb_queue_head(&sk->sk_receive_queue, skb);
7236
7237                 for (i = 0; i < nr_files; i++)
7238                         fput(fpl->fp[i]);
7239         } else {
7240                 kfree_skb(skb);
7241                 kfree(fpl);
7242         }
7243
7244         return 0;
7245 }
7246
7247 /*
7248  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7249  * causes regular reference counting to break down. We rely on the UNIX
7250  * garbage collection to take care of this problem for us.
7251  */
7252 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7253 {
7254         unsigned left, total;
7255         int ret = 0;
7256
7257         total = 0;
7258         left = ctx->nr_user_files;
7259         while (left) {
7260                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
7261
7262                 ret = __io_sqe_files_scm(ctx, this_files, total);
7263                 if (ret)
7264                         break;
7265                 left -= this_files;
7266                 total += this_files;
7267         }
7268
7269         if (!ret)
7270                 return 0;
7271
7272         while (total < ctx->nr_user_files) {
7273                 struct file *file = io_file_from_index(ctx, total);
7274
7275                 if (file)
7276                         fput(file);
7277                 total++;
7278         }
7279
7280         return ret;
7281 }
7282 #else
/* Without CONFIG_UNIX there is nothing to account; always succeeds. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
        return 0;
}
7287 #endif
7288
7289 static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
7290                                     unsigned nr_tables, unsigned nr_files)
7291 {
7292         int i;
7293
7294         for (i = 0; i < nr_tables; i++) {
7295                 struct fixed_rsrc_table *table = &file_data->table[i];
7296                 unsigned this_files;
7297
7298                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7299                 table->files = kcalloc(this_files, sizeof(struct file *),
7300                                         GFP_KERNEL);
7301                 if (!table->files)
7302                         break;
7303                 nr_files -= this_files;
7304         }
7305
7306         if (i == nr_tables)
7307                 return 0;
7308
7309         for (i = 0; i < nr_tables; i++) {
7310                 struct fixed_rsrc_table *table = &file_data->table[i];
7311                 kfree(table->files);
7312         }
7313         return 1;
7314 }
7315
/*
 * rsrc_put callback for fixed files: drop the ring's reference on
 * prsrc->file.
 *
 * With CONFIG_UNIX the file also sits in the SCM_RIGHTS list of one of the
 * skbs on the ring socket's receive queue; that entry is located and removed
 * before the file is fput(). Without CONFIG_UNIX the file is simply fput().
 */
static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head list, *head = &sock->sk_receive_queue;
        struct sk_buff *skb;
        int i;

        /* private list collecting the skbs taken off the receive queue */
        __skb_queue_head_init(&list);

        /*
         * Find the skb that holds this file in its SCM_RIGHTS. When found,
         * remove this entry and rearrange the file array.
         */
        skb = skb_dequeue(head);
        while (skb) {
                struct scm_fp_list *fp;

                fp = UNIXCB(skb).fp;
                for (i = 0; i < fp->count; i++) {
                        int left;

                        if (fp->fp[i] != file)
                                continue;

                        unix_notinflight(fp->user, fp->fp[i]);
                        /* close the gap left by the removed entry */
                        left = fp->count - 1 - i;
                        if (left) {
                                memmove(&fp->fp[i], &fp->fp[i + 1],
                                                left * sizeof(struct file *));
                        }
                        fp->count--;
                        /* an skb with no files left is freed, not requeued */
                        if (!fp->count) {
                                kfree_skb(skb);
                                skb = NULL;
                        } else {
                                __skb_queue_tail(&list, skb);
                        }
                        fput(file);
                        /* file == NULL marks "found" for the check below */
                        file = NULL;
                        break;
                }

                if (!file)
                        break;

                /* not in this skb: park it and look at the next one */
                __skb_queue_tail(&list, skb);

                skb = skb_dequeue(head);
        }

        /* put the skbs we set aside back on the receive queue */
        if (skb_peek(&list)) {
                spin_lock_irq(&head->lock);
                while ((skb = __skb_dequeue(&list)) != NULL)
                        __skb_queue_tail(head, skb);
                spin_unlock_irq(&head->lock);
        }
#else
        fput(file);
#endif
}
7378
/*
 * Retire one ref node: run the node's rsrc_put callback on every queued
 * resource removal and free the entries, then free the node itself and drop
 * the reference it held on the owning rsrc data.
 */
static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
{
        struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
        struct io_ring_ctx *ctx = rsrc_data->ctx;
        struct io_rsrc_put *prsrc, *tmp;

        /* _safe iteration: each entry is unlinked and freed inside the loop */
        list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
                list_del(&prsrc->list);
                ref_node->rsrc_put(ctx, prsrc);
                kfree(prsrc);
        }

        percpu_ref_exit(&ref_node->refs);
        kfree(ref_node);
        /* rsrc_data was cached above, so it is still valid after the kfree */
        percpu_ref_put(&rsrc_data->refs);
}
7395
7396 static void io_rsrc_put_work(struct work_struct *work)
7397 {
7398         struct io_ring_ctx *ctx;
7399         struct llist_node *node;
7400
7401         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7402         node = llist_del_all(&ctx->rsrc_put_llist);
7403
7404         while (node) {
7405                 struct fixed_rsrc_ref_node *ref_node;
7406                 struct llist_node *next = node->next;
7407
7408                 ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
7409                 __io_rsrc_put_work(ref_node);
7410                 node = next;
7411         }
7412 }
7413
7414 static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
7415                                         unsigned i)
7416 {
7417         struct fixed_rsrc_table *table;
7418
7419         table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7420         return &table->files[i & IORING_FILE_TABLE_MASK];
7421 }
7422
7423 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7424 {
7425         struct fixed_rsrc_ref_node *ref_node;
7426         struct fixed_rsrc_data *data;
7427         struct io_ring_ctx *ctx;
7428         bool first_add = false;
7429         int delay = HZ;
7430
7431         ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
7432         data = ref_node->rsrc_data;
7433         ctx = data->ctx;
7434
7435         io_rsrc_ref_lock(ctx);
7436         ref_node->done = true;
7437
7438         while (!list_empty(&ctx->rsrc_ref_list)) {
7439                 ref_node = list_first_entry(&ctx->rsrc_ref_list,
7440                                         struct fixed_rsrc_ref_node, node);
7441                 /* recycle ref nodes in order */
7442                 if (!ref_node->done)
7443                         break;
7444                 list_del(&ref_node->node);
7445                 first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
7446         }
7447         io_rsrc_ref_unlock(ctx);
7448
7449         if (percpu_ref_is_dying(&data->refs))
7450                 delay = 0;
7451
7452         if (!delay)
7453                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
7454         else if (first_add)
7455                 queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
7456 }
7457
7458 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
7459                         struct io_ring_ctx *ctx)
7460 {
7461         struct fixed_rsrc_ref_node *ref_node;
7462
7463         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7464         if (!ref_node)
7465                 return NULL;
7466
7467         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7468                             0, GFP_KERNEL)) {
7469                 kfree(ref_node);
7470                 return NULL;
7471         }
7472         INIT_LIST_HEAD(&ref_node->node);
7473         INIT_LIST_HEAD(&ref_node->rsrc_list);
7474         ref_node->done = false;
7475         return ref_node;
7476 }
7477
/*
 * Bind @ref_node to the ring's fixed file data and make io_ring_file_put()
 * its put callback.
 */
static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
                                     struct fixed_rsrc_ref_node *ref_node)
{
        ref_node->rsrc_data = ctx->file_data;
        ref_node->rsrc_put = io_ring_file_put;
}
7484
/* Exit the node's percpu ref and free the node. */
static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
{
        percpu_ref_exit(&ref_node->refs);
        kfree(ref_node);
}
7490
7491
/*
 * Register a set of fixed files with the ring.
 *
 * @ctx:     ring to register the files with
 * @arg:     user pointer to an array of @nr_args file descriptors (__s32);
 *           an entry of -1 leaves that slot empty
 * @nr_args: number of entries in @arg
 *
 * Returns 0 on success, -EBUSY if a file set is already registered, -EINVAL
 * for an empty set, -EMFILE if @nr_args exceeds IORING_MAX_FIXED_FILES,
 * -ENOMEM on allocation failure, -EFAULT if @arg cannot be read, -EBADF for
 * an invalid fd or an fd that refers to an io_uring instance, or the error
 * from io_sqe_files_scm().
 */
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
                                 unsigned nr_args)
{
        __s32 __user *fds = (__s32 __user *) arg;
        unsigned nr_tables, i;
        struct file *file;
        int fd, ret = -ENOMEM;
        struct fixed_rsrc_ref_node *ref_node;
        struct fixed_rsrc_data *file_data;

        if (ctx->file_data)
                return -EBUSY;
        if (!nr_args)
                return -EINVAL;
        if (nr_args > IORING_MAX_FIXED_FILES)
                return -EMFILE;

        file_data = alloc_fixed_rsrc_data(ctx);
        if (!file_data)
                return -ENOMEM;
        ctx->file_data = file_data;

        nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
        file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
                                   GFP_KERNEL);
        if (!file_data->table)
                goto out_free;

        if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
                goto out_free;

        /*
         * ctx->nr_user_files advances together with i, so on failure it
         * counts exactly the slots processed so far for the cleanup loop.
         */
        for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
                if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
                        ret = -EFAULT;
                        goto out_fput;
                }
                /* allow sparse sets */
                if (fd == -1)
                        continue;

                file = fget(fd);
                ret = -EBADF;
                if (!file)
                        goto out_fput;

                /*
                 * Don't allow io_uring instances to be registered. If UNIX
                 * isn't enabled, then this causes a reference cycle and this
                 * instance can never get freed. If UNIX is enabled we'll
                 * handle it just fine, but there's still no point in allowing
                 * a ring fd as it doesn't support regular read/write anyway.
                 */
                if (file->f_op == &io_uring_fops) {
                        fput(file);
                        goto out_fput;
                }
                *io_fixed_file_slot(file_data, i) = file;
        }

        /* from here on, failures are unwound through the full unregister path */
        ret = io_sqe_files_scm(ctx);
        if (ret) {
                io_sqe_files_unregister(ctx);
                return ret;
        }

        ref_node = alloc_fixed_rsrc_ref_node(ctx);
        if (!ref_node) {
                io_sqe_files_unregister(ctx);
                return -ENOMEM;
        }
        init_fixed_file_ref_node(ctx, ref_node);

        io_sqe_rsrc_set_node(ctx, file_data, ref_node);
        return ret;
out_fput:
        /* drop the files grabbed so far, then the per-table arrays */
        for (i = 0; i < ctx->nr_user_files; i++) {
                file = io_file_from_index(ctx, i);
                if (file)
                        fput(file);
        }
        for (i = 0; i < nr_tables; i++)
                kfree(file_data->table[i].files);
        ctx->nr_user_files = 0;
out_free:
        free_fixed_rsrc_data(ctx->file_data);
        ctx->file_data = NULL;
        return ret;
}
7580
/*
 * Account a single newly installed fixed file.
 *
 * @ctx:   ring the file belongs to
 * @file:  the file being installed
 * @index: fixed file slot of @file
 *
 * With CONFIG_UNIX the file is added to the SCM_RIGHTS list of the first skb
 * on the ring socket's receive queue if that list has room; otherwise a new
 * skb is set up through __io_sqe_files_scm(). Without CONFIG_UNIX this is a
 * no-op. Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
                                int index)
{
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head *head = &sock->sk_receive_queue;
        struct sk_buff *skb;

        /*
         * See if we can merge this file into an existing skb SCM_RIGHTS
         * file set. If there's no room, fall back to allocating a new skb
         * and filling it in.
         */
        spin_lock_irq(&head->lock);
        skb = skb_peek(head);
        if (skb) {
                struct scm_fp_list *fpl = UNIXCB(skb).fp;

                if (fpl->count < SCM_MAX_FD) {
                        /*
                         * Take the skb off the queue so the fp list can be
                         * updated with the queue lock dropped, then requeue.
                         */
                        __skb_unlink(skb, head);
                        spin_unlock_irq(&head->lock);
                        fpl->fp[fpl->count] = get_file(file);
                        unix_inflight(fpl->user, fpl->fp[fpl->count]);
                        fpl->count++;
                        spin_lock_irq(&head->lock);
                        __skb_queue_head(head, skb);
                } else {
                        skb = NULL;
                }
        }
        spin_unlock_irq(&head->lock);

        /* merged: the fp list took its own reference via get_file() above */
        if (skb) {
                fput(file);
                return 0;
        }

        return __io_sqe_files_scm(ctx, 1, index);
#else
        return 0;
#endif
}
7623
7624 static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
7625 {
7626         struct io_rsrc_put *prsrc;
7627         struct fixed_rsrc_ref_node *ref_node = data->node;
7628
7629         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
7630         if (!prsrc)
7631                 return -ENOMEM;
7632
7633         prsrc->rsrc = rsrc;
7634         list_add(&prsrc->list, &ref_node->rsrc_list);
7635
7636         return 0;
7637 }
7638
/* Typed wrapper: queue a fixed file through io_queue_rsrc_removal(). */
static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
					struct file *file)
{
	void *rsrc = (void *)file;

	return io_queue_rsrc_removal(data, rsrc);
}
7644
7645 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7646                                  struct io_uring_rsrc_update *up,
7647                                  unsigned nr_args)
7648 {
7649         struct fixed_rsrc_data *data = ctx->file_data;
7650         struct fixed_rsrc_ref_node *ref_node;
7651         struct file *file, **file_slot;
7652         __s32 __user *fds;
7653         int fd, i, err;
7654         __u32 done;
7655         bool needs_switch = false;
7656
7657         if (check_add_overflow(up->offset, nr_args, &done))
7658                 return -EOVERFLOW;
7659         if (done > ctx->nr_user_files)
7660                 return -EINVAL;
7661
7662         ref_node = alloc_fixed_rsrc_ref_node(ctx);
7663         if (!ref_node)
7664                 return -ENOMEM;
7665         init_fixed_file_ref_node(ctx, ref_node);
7666
7667         fds = u64_to_user_ptr(up->data);
7668         for (done = 0; done < nr_args; done++) {
7669                 err = 0;
7670                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7671                         err = -EFAULT;
7672                         break;
7673                 }
7674                 if (fd == IORING_REGISTER_FILES_SKIP)
7675                         continue;
7676
7677                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
7678                 file_slot = io_fixed_file_slot(ctx->file_data, i);
7679
7680                 if (*file_slot) {
7681                         err = io_queue_file_removal(data, *file_slot);
7682                         if (err)
7683                                 break;
7684                         *file_slot = NULL;
7685                         needs_switch = true;
7686                 }
7687                 if (fd != -1) {
7688                         file = fget(fd);
7689                         if (!file) {
7690                                 err = -EBADF;
7691                                 break;
7692                         }
7693                         /*
7694                          * Don't allow io_uring instances to be registered. If
7695                          * UNIX isn't enabled, then this causes a reference
7696                          * cycle and this instance can never get freed. If UNIX
7697                          * is enabled we'll handle it just fine, but there's
7698                          * still no point in allowing a ring fd as it doesn't
7699                          * support regular read/write anyway.
7700                          */
7701                         if (file->f_op == &io_uring_fops) {
7702                                 fput(file);
7703                                 err = -EBADF;
7704                                 break;
7705                         }
7706                         *file_slot = file;
7707                         err = io_sqe_file_register(ctx, file, i);
7708                         if (err) {
7709                                 *file_slot = NULL;
7710                                 fput(file);
7711                                 break;
7712                         }
7713                 }
7714         }
7715
7716         if (needs_switch) {
7717                 percpu_ref_kill(&data->node->refs);
7718                 io_sqe_rsrc_set_node(ctx, data, ref_node);
7719         } else
7720                 destroy_fixed_rsrc_ref_node(ref_node);
7721
7722         return done ? done : err;
7723 }
7724
7725 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
7726                                unsigned nr_args)
7727 {
7728         struct io_uring_rsrc_update up;
7729
7730         if (!ctx->file_data)
7731                 return -ENXIO;
7732         if (!nr_args)
7733                 return -EINVAL;
7734         if (copy_from_user(&up, arg, sizeof(up)))
7735                 return -EFAULT;
7736         if (up.resv)
7737                 return -EINVAL;
7738
7739         return __io_sqe_files_update(ctx, &up, nr_args);
7740 }
7741
7742 static struct io_wq_work *io_free_work(struct io_wq_work *work)
7743 {
7744         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7745
7746         req = io_put_req_find_next(req);
7747         return req ? &req->work : NULL;
7748 }
7749
7750 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
7751 {
7752         struct io_wq_hash *hash;
7753         struct io_wq_data data;
7754         unsigned int concurrency;
7755
7756         hash = ctx->hash_map;
7757         if (!hash) {
7758                 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
7759                 if (!hash)
7760                         return ERR_PTR(-ENOMEM);
7761                 refcount_set(&hash->refs, 1);
7762                 init_waitqueue_head(&hash->wait);
7763                 ctx->hash_map = hash;
7764         }
7765
7766         data.hash = hash;
7767         data.free_work = io_free_work;
7768         data.do_work = io_wq_submit_work;
7769
7770         /* Do QD, or 4 * CPUS, whatever is smallest */
7771         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7772
7773         return io_wq_create(concurrency, &data);
7774 }
7775
7776 static int io_uring_alloc_task_context(struct task_struct *task,
7777                                        struct io_ring_ctx *ctx)
7778 {
7779         struct io_uring_task *tctx;
7780         int ret;
7781
7782         tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
7783         if (unlikely(!tctx))
7784                 return -ENOMEM;
7785
7786         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
7787         if (unlikely(ret)) {
7788                 kfree(tctx);
7789                 return ret;
7790         }
7791
7792         tctx->io_wq = io_init_wq_offload(ctx);
7793         if (IS_ERR(tctx->io_wq)) {
7794                 ret = PTR_ERR(tctx->io_wq);
7795                 percpu_counter_destroy(&tctx->inflight);
7796                 kfree(tctx);
7797                 return ret;
7798         }
7799
7800         xa_init(&tctx->xa);
7801         init_waitqueue_head(&tctx->wait);
7802         tctx->last = NULL;
7803         atomic_set(&tctx->in_idle, 0);
7804         task->io_uring = tctx;
7805         spin_lock_init(&tctx->task_lock);
7806         INIT_WQ_LIST(&tctx->task_list);
7807         tctx->task_state = 0;
7808         init_task_work(&tctx->task_work, tctx_task_work);
7809         return 0;
7810 }
7811
7812 void __io_uring_free(struct task_struct *tsk)
7813 {
7814         struct io_uring_task *tctx = tsk->io_uring;
7815
7816         WARN_ON_ONCE(!xa_empty(&tctx->xa));
7817         WARN_ON_ONCE(tctx->io_wq);
7818
7819         percpu_counter_destroy(&tctx->inflight);
7820         kfree(tctx);
7821         tsk->io_uring = NULL;
7822 }
7823
7824 static int io_sq_offload_create(struct io_ring_ctx *ctx,
7825                                 struct io_uring_params *p)
7826 {
7827         int ret;
7828
7829         /* Retain compatibility with failing for an invalid attach attempt */
7830         if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
7831                                 IORING_SETUP_ATTACH_WQ) {
7832                 struct fd f;
7833
7834                 f = fdget(p->wq_fd);
7835                 if (!f.file)
7836                         return -ENXIO;
7837                 if (f.file->f_op != &io_uring_fops) {
7838                         fdput(f);
7839                         return -EINVAL;
7840                 }
7841                 fdput(f);
7842         }
7843         if (ctx->flags & IORING_SETUP_SQPOLL) {
7844                 struct task_struct *tsk;
7845                 struct io_sq_data *sqd;
7846                 bool attached;
7847
7848                 ret = -EPERM;
7849                 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
7850                         goto err;
7851
7852                 sqd = io_get_sq_data(p, &attached);
7853                 if (IS_ERR(sqd)) {
7854                         ret = PTR_ERR(sqd);
7855                         goto err;
7856                 }
7857
7858                 ctx->sq_creds = get_current_cred();
7859                 ctx->sq_data = sqd;
7860                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
7861                 if (!ctx->sq_thread_idle)
7862                         ctx->sq_thread_idle = HZ;
7863
7864                 ret = 0;
7865                 io_sq_thread_park(sqd);
7866                 /* don't attach to a dying SQPOLL thread, would be racy */
7867                 if (attached && !sqd->thread) {
7868                         ret = -ENXIO;
7869                 } else {
7870                         list_add(&ctx->sqd_list, &sqd->ctx_list);
7871                         io_sqd_update_thread_idle(sqd);
7872                 }
7873                 io_sq_thread_unpark(sqd);
7874
7875                 if (ret < 0) {
7876                         io_put_sq_data(sqd);
7877                         ctx->sq_data = NULL;
7878                         return ret;
7879                 } else if (attached) {
7880                         return 0;
7881                 }
7882
7883                 if (p->flags & IORING_SETUP_SQ_AFF) {
7884                         int cpu = p->sq_thread_cpu;
7885
7886                         ret = -EINVAL;
7887                         if (cpu >= nr_cpu_ids)
7888                                 goto err_sqpoll;
7889                         if (!cpu_online(cpu))
7890                                 goto err_sqpoll;
7891
7892                         sqd->sq_cpu = cpu;
7893                 } else {
7894                         sqd->sq_cpu = -1;
7895                 }
7896
7897                 sqd->task_pid = current->pid;
7898                 sqd->task_tgid = current->tgid;
7899                 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
7900                 if (IS_ERR(tsk)) {
7901                         ret = PTR_ERR(tsk);
7902                         goto err_sqpoll;
7903                 }
7904
7905                 sqd->thread = tsk;
7906                 ret = io_uring_alloc_task_context(tsk, ctx);
7907                 wake_up_new_task(tsk);
7908                 if (ret)
7909                         goto err;
7910         } else if (p->flags & IORING_SETUP_SQ_AFF) {
7911                 /* Can't have SQ_AFF without SQPOLL */
7912                 ret = -EINVAL;
7913                 goto err;
7914         }
7915
7916         return 0;
7917 err:
7918         io_sq_thread_finish(ctx);
7919         return ret;
7920 err_sqpoll:
7921         complete(&ctx->sq_data->exited);
7922         goto err;
7923 }
7924
7925 static inline void __io_unaccount_mem(struct user_struct *user,
7926                                       unsigned long nr_pages)
7927 {
7928         atomic_long_sub(nr_pages, &user->locked_vm);
7929 }
7930
7931 static inline int __io_account_mem(struct user_struct *user,
7932                                    unsigned long nr_pages)
7933 {
7934         unsigned long page_limit, cur_pages, new_pages;
7935
7936         /* Don't allow more pages than we can safely lock */
7937         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7938
7939         do {
7940                 cur_pages = atomic_long_read(&user->locked_vm);
7941                 new_pages = cur_pages + nr_pages;
7942                 if (new_pages > page_limit)
7943                         return -ENOMEM;
7944         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7945                                         new_pages) != cur_pages);
7946
7947         return 0;
7948 }
7949
7950 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
7951 {
7952         if (ctx->user)
7953                 __io_unaccount_mem(ctx->user, nr_pages);
7954
7955         if (ctx->mm_account)
7956                 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
7957 }
7958
7959 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
7960 {
7961         int ret;
7962
7963         if (ctx->user) {
7964                 ret = __io_account_mem(ctx->user, nr_pages);
7965                 if (ret)
7966                         return ret;
7967         }
7968
7969         if (ctx->mm_account)
7970                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
7971
7972         return 0;
7973 }
7974
/*
 * Drop one reference on the head page backing @ptr and free the compound
 * page if that was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	struct page *head;

	if (ptr) {
		head = virt_to_head_page(ptr);
		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
7986
7987 static void *io_mem_alloc(size_t size)
7988 {
7989         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
7990                                 __GFP_NORETRY | __GFP_ACCOUNT;
7991
7992         return (void *) __get_free_pages(gfp_flags, get_order(size));
7993 }
7994
7995 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
7996                                 size_t *sq_offset)
7997 {
7998         struct io_rings *rings;
7999         size_t off, sq_array_size;
8000
8001         off = struct_size(rings, cqes, cq_entries);
8002         if (off == SIZE_MAX)
8003                 return SIZE_MAX;
8004
8005 #ifdef CONFIG_SMP
8006         off = ALIGN(off, SMP_CACHE_BYTES);
8007         if (off == 0)
8008                 return SIZE_MAX;
8009 #endif
8010
8011         if (sq_offset)
8012                 *sq_offset = off;
8013
8014         sq_array_size = array_size(sizeof(u32), sq_entries);
8015         if (sq_array_size == SIZE_MAX)
8016                 return SIZE_MAX;
8017
8018         if (check_add_overflow(off, sq_array_size, &off))
8019                 return SIZE_MAX;
8020
8021         return off;
8022 }
8023
8024 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8025 {
8026         int i, j;
8027
8028         if (!ctx->user_bufs)
8029                 return -ENXIO;
8030
8031         for (i = 0; i < ctx->nr_user_bufs; i++) {
8032                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8033
8034                 for (j = 0; j < imu->nr_bvecs; j++)
8035                         unpin_user_page(imu->bvec[j].bv_page);
8036
8037                 if (imu->acct_pages)
8038                         io_unaccount_mem(ctx, imu->acct_pages);
8039                 kvfree(imu->bvec);
8040                 imu->nr_bvecs = 0;
8041         }
8042
8043         kfree(ctx->user_bufs);
8044         ctx->user_bufs = NULL;
8045         ctx->nr_user_bufs = 0;
8046         return 0;
8047 }
8048
8049 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8050                        void __user *arg, unsigned index)
8051 {
8052         struct iovec __user *src;
8053
8054 #ifdef CONFIG_COMPAT
8055         if (ctx->compat) {
8056                 struct compat_iovec __user *ciovs;
8057                 struct compat_iovec ciov;
8058
8059                 ciovs = (struct compat_iovec __user *) arg;
8060                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8061                         return -EFAULT;
8062
8063                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8064                 dst->iov_len = ciov.iov_len;
8065                 return 0;
8066         }
8067 #endif
8068         src = (struct iovec __user *) arg;
8069         if (copy_from_user(dst, &src[index], sizeof(*dst)))
8070                 return -EFAULT;
8071         return 0;
8072 }
8073
8074 /*
8075  * Not super efficient, but this is just a registration time. And we do cache
8076  * the last compound head, so generally we'll only do a full search if we don't
8077  * match that one.
8078  *
8079  * We check if the given compound head page has already been accounted, to
8080  * avoid double accounting it. This allows us to account the full size of the
8081  * page, not just the constituent pages of a huge page.
8082  */
8083 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8084                                   int nr_pages, struct page *hpage)
8085 {
8086         int i, j;
8087
8088         /* check current page array */
8089         for (i = 0; i < nr_pages; i++) {
8090                 if (!PageCompound(pages[i]))
8091                         continue;
8092                 if (compound_head(pages[i]) == hpage)
8093                         return true;
8094         }
8095
8096         /* check previously registered pages */
8097         for (i = 0; i < ctx->nr_user_bufs; i++) {
8098                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8099
8100                 for (j = 0; j < imu->nr_bvecs; j++) {
8101                         if (!PageCompound(imu->bvec[j].bv_page))
8102                                 continue;
8103                         if (compound_head(imu->bvec[j].bv_page) == hpage)
8104                                 return true;
8105                 }
8106         }
8107
8108         return false;
8109 }
8110
8111 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8112                                  int nr_pages, struct io_mapped_ubuf *imu,
8113                                  struct page **last_hpage)
8114 {
8115         int i, ret;
8116
8117         for (i = 0; i < nr_pages; i++) {
8118                 if (!PageCompound(pages[i])) {
8119                         imu->acct_pages++;
8120                 } else {
8121                         struct page *hpage;
8122
8123                         hpage = compound_head(pages[i]);
8124                         if (hpage == *last_hpage)
8125                                 continue;
8126                         *last_hpage = hpage;
8127                         if (headpage_already_acct(ctx, pages, i, hpage))
8128                                 continue;
8129                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8130                 }
8131         }
8132
8133         if (!imu->acct_pages)
8134                 return 0;
8135
8136         ret = io_account_mem(ctx, imu->acct_pages);
8137         if (ret)
8138                 imu->acct_pages = 0;
8139         return ret;
8140 }
8141
8142 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8143                                   struct io_mapped_ubuf *imu,
8144                                   struct page **last_hpage)
8145 {
8146         struct vm_area_struct **vmas = NULL;
8147         struct page **pages = NULL;
8148         unsigned long off, start, end, ubuf;
8149         size_t size;
8150         int ret, pret, nr_pages, i;
8151
8152         ubuf = (unsigned long) iov->iov_base;
8153         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8154         start = ubuf >> PAGE_SHIFT;
8155         nr_pages = end - start;
8156
8157         ret = -ENOMEM;
8158
8159         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8160         if (!pages)
8161                 goto done;
8162
8163         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8164                               GFP_KERNEL);
8165         if (!vmas)
8166                 goto done;
8167
8168         imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
8169                                    GFP_KERNEL);
8170         if (!imu->bvec)
8171                 goto done;
8172
8173         ret = 0;
8174         mmap_read_lock(current->mm);
8175         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8176                               pages, vmas);
8177         if (pret == nr_pages) {
8178                 /* don't support file backed memory */
8179                 for (i = 0; i < nr_pages; i++) {
8180                         struct vm_area_struct *vma = vmas[i];
8181
8182                         if (vma->vm_file &&
8183                             !is_file_hugepages(vma->vm_file)) {
8184                                 ret = -EOPNOTSUPP;
8185                                 break;
8186                         }
8187                 }
8188         } else {
8189                 ret = pret < 0 ? pret : -EFAULT;
8190         }
8191         mmap_read_unlock(current->mm);
8192         if (ret) {
8193                 /*
8194                  * if we did partial map, or found file backed vmas,
8195                  * release any pages we did get
8196                  */
8197                 if (pret > 0)
8198                         unpin_user_pages(pages, pret);
8199                 kvfree(imu->bvec);
8200                 goto done;
8201         }
8202
8203         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8204         if (ret) {
8205                 unpin_user_pages(pages, pret);
8206                 kvfree(imu->bvec);
8207                 goto done;
8208         }
8209
8210         off = ubuf & ~PAGE_MASK;
8211         size = iov->iov_len;
8212         for (i = 0; i < nr_pages; i++) {
8213                 size_t vec_len;
8214
8215                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8216                 imu->bvec[i].bv_page = pages[i];
8217                 imu->bvec[i].bv_len = vec_len;
8218                 imu->bvec[i].bv_offset = off;
8219                 off = 0;
8220                 size -= vec_len;
8221         }
8222         /* store original address for later verification */
8223         imu->ubuf = ubuf;
8224         imu->len = iov->iov_len;
8225         imu->nr_bvecs = nr_pages;
8226         ret = 0;
8227 done:
8228         kvfree(pages);
8229         kvfree(vmas);
8230         return ret;
8231 }
8232
8233 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
8234 {
8235         if (ctx->user_bufs)
8236                 return -EBUSY;
8237         if (!nr_args || nr_args > UIO_MAXIOV)
8238                 return -EINVAL;
8239
8240         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
8241                                         GFP_KERNEL);
8242         if (!ctx->user_bufs)
8243                 return -ENOMEM;
8244
8245         return 0;
8246 }
8247
/*
 * Sanity check a user supplied iovec before it is registered as a buffer.
 *
 * Returns 0 if the iovec is acceptable, -EFAULT for a NULL base, a zero
 * length or a length above 1G, and -EOVERFLOW if the page-rounded end of
 * the range does not fit in an unsigned long.
 */
static int io_buffer_validate(struct iovec *iov)
{
	unsigned long acct_len, end;

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base || !iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	/*
	 * The registration code computes the last page from
	 * base + len + PAGE_SIZE - 1, so that sum must not wrap. iov_len is
	 * bounded above, which keeps acct_len itself from overflowing.
	 */
	acct_len = iov->iov_len + (PAGE_SIZE - 1);
	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &end))
		return -EOVERFLOW;

	return 0;
}
8264
8265 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
8266                                    unsigned int nr_args)
8267 {
8268         int i, ret;
8269         struct iovec iov;
8270         struct page *last_hpage = NULL;
8271
8272         ret = io_buffers_map_alloc(ctx, nr_args);
8273         if (ret)
8274                 return ret;
8275
8276         for (i = 0; i < nr_args; i++) {
8277                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8278
8279                 ret = io_copy_iov(ctx, &iov, arg, i);
8280                 if (ret)
8281                         break;
8282
8283                 ret = io_buffer_validate(&iov);
8284                 if (ret)
8285                         break;
8286
8287                 ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
8288                 if (ret)
8289                         break;
8290
8291                 ctx->nr_user_bufs++;
8292         }
8293
8294         if (ret)
8295                 io_sqe_buffers_unregister(ctx);
8296
8297         return ret;
8298 }
8299
8300 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8301 {
8302         __s32 __user *fds = arg;
8303         int fd;
8304
8305         if (ctx->cq_ev_fd)
8306                 return -EBUSY;
8307
8308         if (copy_from_user(&fd, fds, sizeof(*fds)))
8309                 return -EFAULT;
8310
8311         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8312         if (IS_ERR(ctx->cq_ev_fd)) {
8313                 int ret = PTR_ERR(ctx->cq_ev_fd);
8314                 ctx->cq_ev_fd = NULL;
8315                 return ret;
8316         }
8317
8318         return 0;
8319 }
8320
8321 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8322 {
8323         if (ctx->cq_ev_fd) {
8324                 eventfd_ctx_put(ctx->cq_ev_fd);
8325                 ctx->cq_ev_fd = NULL;
8326                 return 0;
8327         }
8328
8329         return -ENXIO;
8330 }
8331
8332 static void io_destroy_buffers(struct io_ring_ctx *ctx)
8333 {
8334         struct io_buffer *buf;
8335         unsigned long index;
8336
8337         xa_for_each(&ctx->io_buffers, index, buf)
8338                 __io_remove_buffers(ctx, buf, index, -1U);
8339 }
8340
8341 static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
8342 {
8343         struct io_kiocb *req, *nxt;
8344
8345         list_for_each_entry_safe(req, nxt, list, compl.list) {
8346                 if (tsk && req->task != tsk)
8347                         continue;
8348                 list_del(&req->compl.list);
8349                 kmem_cache_free(req_cachep, req);
8350         }
8351 }
8352
8353 static void io_req_caches_free(struct io_ring_ctx *ctx)
8354 {
8355         struct io_submit_state *submit_state = &ctx->submit_state;
8356         struct io_comp_state *cs = &ctx->submit_state.comp;
8357
8358         mutex_lock(&ctx->uring_lock);
8359
8360         if (submit_state->free_reqs) {
8361                 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
8362                                      submit_state->reqs);
8363                 submit_state->free_reqs = 0;
8364         }
8365
8366         spin_lock_irq(&ctx->completion_lock);
8367         list_splice_init(&cs->locked_free_list, &cs->free_list);
8368         cs->locked_free_nr = 0;
8369         spin_unlock_irq(&ctx->completion_lock);
8370
8371         io_req_cache_free(&cs->free_list, NULL);
8372
8373         mutex_unlock(&ctx->uring_lock);
8374 }
8375
8376 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8377 {
8378         /*
8379          * Some may use context even when all refs and requests have been put,
8380          * and they are free to do so while still holding uring_lock or
8381          * completion_lock, see __io_req_task_submit(). Wait for them to finish.
8382          */
8383         mutex_lock(&ctx->uring_lock);
8384         mutex_unlock(&ctx->uring_lock);
8385         spin_lock_irq(&ctx->completion_lock);
8386         spin_unlock_irq(&ctx->completion_lock);
8387
8388         io_sq_thread_finish(ctx);
8389         io_sqe_buffers_unregister(ctx);
8390
8391         if (ctx->mm_account) {
8392                 mmdrop(ctx->mm_account);
8393                 ctx->mm_account = NULL;
8394         }
8395
8396         mutex_lock(&ctx->uring_lock);
8397         io_sqe_files_unregister(ctx);
8398         mutex_unlock(&ctx->uring_lock);
8399         io_eventfd_unregister(ctx);
8400         io_destroy_buffers(ctx);
8401
8402 #if defined(CONFIG_UNIX)
8403         if (ctx->ring_sock) {
8404                 ctx->ring_sock->file = NULL; /* so that iput() is called */
8405                 sock_release(ctx->ring_sock);
8406         }
8407 #endif
8408
8409         io_mem_free(ctx->rings);
8410         io_mem_free(ctx->sq_sqes);
8411
8412         percpu_ref_exit(&ctx->refs);
8413         free_uid(ctx->user);
8414         io_req_caches_free(ctx);
8415         if (ctx->hash_map)
8416                 io_wq_put_hash(ctx->hash_map);
8417         kfree(ctx->cancel_hash);
8418         kfree(ctx);
8419 }
8420
8421 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8422 {
8423         struct io_ring_ctx *ctx = file->private_data;
8424         __poll_t mask = 0;
8425
8426         poll_wait(file, &ctx->cq_wait, wait);
8427         /*
8428          * synchronizes with barrier from wq_has_sleeper call in
8429          * io_commit_cqring
8430          */
8431         smp_rmb();
8432         if (!io_sqring_full(ctx))
8433                 mask |= EPOLLOUT | EPOLLWRNORM;
8434
8435         /*
8436          * Don't flush cqring overflow list here, just do a simple check.
8437          * Otherwise there could possible be ABBA deadlock:
8438          *      CPU0                    CPU1
8439          *      ----                    ----
8440          * lock(&ctx->uring_lock);
8441          *                              lock(&ep->mtx);
8442          *                              lock(&ctx->uring_lock);
8443          * lock(&ep->mtx);
8444          *
8445          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
8446          * pushs them to do the flush.
8447          */
8448         if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
8449                 mask |= EPOLLIN | EPOLLRDNORM;
8450
8451         return mask;
8452 }
8453
8454 static int io_uring_fasync(int fd, struct file *file, int on)
8455 {
8456         struct io_ring_ctx *ctx = file->private_data;
8457
8458         return fasync_helper(fd, file, on, &ctx->cq_fasync);
8459 }
8460
8461 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8462 {
8463         const struct cred *creds;
8464
8465         creds = xa_erase(&ctx->personalities, id);
8466         if (creds) {
8467                 put_cred(creds);
8468                 return 0;
8469         }
8470
8471         return -EINVAL;
8472 }
8473
8474 static bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
8475 {
8476         struct callback_head *work, *next;
8477         bool executed = false;
8478
8479         do {
8480                 work = xchg(&ctx->exit_task_work, NULL);
8481                 if (!work)
8482                         break;
8483
8484                 do {
8485                         next = work->next;
8486                         work->func(work);
8487                         work = next;
8488                         cond_resched();
8489                 } while (work);
8490                 executed = true;
8491         } while (1);
8492
8493         return executed;
8494 }
8495
8496 struct io_tctx_exit {
8497         struct callback_head            task_work;
8498         struct completion               completion;
8499         struct io_ring_ctx              *ctx;
8500 };
8501
8502 static void io_tctx_exit_cb(struct callback_head *cb)
8503 {
8504         struct io_uring_task *tctx = current->io_uring;
8505         struct io_tctx_exit *work;
8506
8507         work = container_of(cb, struct io_tctx_exit, task_work);
8508         /*
8509          * When @in_idle, we're in cancellation and it's racy to remove the
8510          * node. It'll be removed by the end of cancellation, just ignore it.
8511          */
8512         if (!atomic_read(&tctx->in_idle))
8513                 io_uring_del_task_file((unsigned long)work->ctx);
8514         complete(&work->completion);
8515 }
8516
8517 static void io_ring_exit_work(struct work_struct *work)
8518 {
8519         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
8520         unsigned long timeout = jiffies + HZ * 60 * 5;
8521         struct io_tctx_exit exit;
8522         struct io_tctx_node *node;
8523         int ret;
8524
8525         /*
8526          * If we're doing polled IO and end up having requests being
8527          * submitted async (out-of-line), then completions can come in while
8528          * we're waiting for refs to drop. We need to reap these manually,
8529          * as nobody else will be looking for them.
8530          */
8531         do {
8532                 io_uring_try_cancel_requests(ctx, NULL, NULL);
8533
8534                 WARN_ON_ONCE(time_after(jiffies, timeout));
8535         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
8536
8537         mutex_lock(&ctx->uring_lock);
8538         while (!list_empty(&ctx->tctx_list)) {
8539                 WARN_ON_ONCE(time_after(jiffies, timeout));
8540
8541                 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
8542                                         ctx_node);
8543                 exit.ctx = ctx;
8544                 init_completion(&exit.completion);
8545                 init_task_work(&exit.task_work, io_tctx_exit_cb);
8546                 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
8547                 if (WARN_ON_ONCE(ret))
8548                         continue;
8549                 wake_up_process(node->task);
8550
8551                 mutex_unlock(&ctx->uring_lock);
8552                 wait_for_completion(&exit.completion);
8553                 cond_resched();
8554                 mutex_lock(&ctx->uring_lock);
8555         }
8556         mutex_unlock(&ctx->uring_lock);
8557
8558         io_ring_ctx_free(ctx);
8559 }
8560
8561 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8562 {
8563         unsigned long index;
8564         struct creds *creds;
8565
8566         mutex_lock(&ctx->uring_lock);
8567         percpu_ref_kill(&ctx->refs);
8568         /* if force is set, the ring is going away. always drop after that */
8569         ctx->cq_overflow_flushed = 1;
8570         if (ctx->rings)
8571                 __io_cqring_overflow_flush(ctx, true, NULL, NULL);
8572         xa_for_each(&ctx->personalities, index, creds)
8573                 io_unregister_personality(ctx, index);
8574         mutex_unlock(&ctx->uring_lock);
8575
8576         /* prevent SQPOLL from submitting new requests */
8577         if (ctx->sq_data) {
8578                 io_sq_thread_park(ctx->sq_data);
8579                 list_del_init(&ctx->sqd_list);
8580                 io_sqd_update_thread_idle(ctx->sq_data);
8581                 io_sq_thread_unpark(ctx->sq_data);
8582         }
8583
8584         io_kill_timeouts(ctx, NULL, NULL);
8585         io_poll_remove_all(ctx, NULL, NULL);
8586
8587         /* if we failed setting up the ctx, we might not have any rings */
8588         io_iopoll_try_reap_events(ctx);
8589
8590         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
8591         /*
8592          * Use system_unbound_wq to avoid spawning tons of event kworkers
8593          * if we're exiting a ton of rings at the same time. It just adds
8594          * noise and overhead, there's no discernable change in runtime
8595          * over using system_wq.
8596          */
8597         queue_work(system_unbound_wq, &ctx->exit_work);
8598 }
8599
8600 static int io_uring_release(struct inode *inode, struct file *file)
8601 {
8602         struct io_ring_ctx *ctx = file->private_data;
8603
8604         file->private_data = NULL;
8605         io_ring_ctx_wait_and_kill(ctx);
8606         return 0;
8607 }
8608
/*
 * Match criteria handed to io_cancel_task_cb() through the io-wq cancel
 * callback's opaque data pointer.
 */
struct io_task_cancel {
	/* task passed on to io_match_task() */
	struct task_struct *task;
	/* files table passed on to io_match_task(); may be NULL */
	struct files_struct *files;
};
8613
8614 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
8615 {
8616         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8617         struct io_task_cancel *cancel = data;
8618         bool ret;
8619
8620         if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
8621                 unsigned long flags;
8622                 struct io_ring_ctx *ctx = req->ctx;
8623
8624                 /* protect against races with linked timeouts */
8625                 spin_lock_irqsave(&ctx->completion_lock, flags);
8626                 ret = io_match_task(req, cancel->task, cancel->files);
8627                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8628         } else {
8629                 ret = io_match_task(req, cancel->task, cancel->files);
8630         }
8631         return ret;
8632 }
8633
8634 static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
8635                                   struct task_struct *task,
8636                                   struct files_struct *files)
8637 {
8638         struct io_defer_entry *de;
8639         LIST_HEAD(list);
8640
8641         spin_lock_irq(&ctx->completion_lock);
8642         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
8643                 if (io_match_task(de->req, task, files)) {
8644                         list_cut_position(&list, &ctx->defer_list, &de->list);
8645                         break;
8646                 }
8647         }
8648         spin_unlock_irq(&ctx->completion_lock);
8649         if (list_empty(&list))
8650                 return false;
8651
8652         while (!list_empty(&list)) {
8653                 de = list_first_entry(&list, struct io_defer_entry, list);
8654                 list_del_init(&de->list);
8655                 req_set_fail_links(de->req);
8656                 io_put_req(de->req);
8657                 io_req_complete(de->req, -ECANCELED);
8658                 kfree(de);
8659         }
8660         return true;
8661 }
8662
8663 static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
8664 {
8665         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8666
8667         return req->ctx == data;
8668 }
8669
8670 static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
8671 {
8672         struct io_tctx_node *node;
8673         enum io_wq_cancel cret;
8674         bool ret = false;
8675
8676         mutex_lock(&ctx->uring_lock);
8677         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
8678                 struct io_uring_task *tctx = node->task->io_uring;
8679
8680                 /*
8681                  * io_wq will stay alive while we hold uring_lock, because it's
8682                  * killed after ctx nodes, which requires to take the lock.
8683                  */
8684                 if (!tctx || !tctx->io_wq)
8685                         continue;
8686                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
8687                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8688         }
8689         mutex_unlock(&ctx->uring_lock);
8690
8691         return ret;
8692 }
8693
8694 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
8695                                          struct task_struct *task,
8696                                          struct files_struct *files)
8697 {
8698         struct io_task_cancel cancel = { .task = task, .files = files, };
8699         struct io_uring_task *tctx = task ? task->io_uring : NULL;
8700
8701         while (1) {
8702                 enum io_wq_cancel cret;
8703                 bool ret = false;
8704
8705                 if (!task) {
8706                         ret |= io_uring_try_cancel_iowq(ctx);
8707                 } else if (tctx && tctx->io_wq) {
8708                         /*
8709                          * Cancels requests of all rings, not only @ctx, but
8710                          * it's fine as the task is in exit/exec.
8711                          */
8712                         cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
8713                                                &cancel, true);
8714                         ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8715                 }
8716
8717                 /* SQPOLL thread does its own polling */
8718                 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) ||
8719                     (ctx->sq_data && ctx->sq_data->thread == current)) {
8720                         while (!list_empty_careful(&ctx->iopoll_list)) {
8721                                 io_iopoll_try_reap_events(ctx);
8722                                 ret = true;
8723                         }
8724                 }
8725
8726                 ret |= io_cancel_defer_files(ctx, task, files);
8727                 ret |= io_poll_remove_all(ctx, task, files);
8728                 ret |= io_kill_timeouts(ctx, task, files);
8729                 ret |= io_run_task_work();
8730                 ret |= io_run_ctx_fallback(ctx);
8731                 io_cqring_overflow_flush(ctx, true, task, files);
8732                 if (!ret)
8733                         break;
8734                 cond_resched();
8735         }
8736 }
8737
8738 static int io_uring_count_inflight(struct io_ring_ctx *ctx,
8739                                    struct task_struct *task,
8740                                    struct files_struct *files)
8741 {
8742         struct io_kiocb *req;
8743         int cnt = 0;
8744
8745         spin_lock_irq(&ctx->inflight_lock);
8746         list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
8747                 cnt += io_match_task(req, task, files);
8748         spin_unlock_irq(&ctx->inflight_lock);
8749         return cnt;
8750 }
8751
8752 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
8753                                   struct task_struct *task,
8754                                   struct files_struct *files)
8755 {
8756         while (!list_empty_careful(&ctx->inflight_list)) {
8757                 DEFINE_WAIT(wait);
8758                 int inflight;
8759
8760                 inflight = io_uring_count_inflight(ctx, task, files);
8761                 if (!inflight)
8762                         break;
8763
8764                 io_uring_try_cancel_requests(ctx, task, files);
8765
8766                 prepare_to_wait(&task->io_uring->wait, &wait,
8767                                 TASK_UNINTERRUPTIBLE);
8768                 if (inflight == io_uring_count_inflight(ctx, task, files))
8769                         schedule();
8770                 finish_wait(&task->io_uring->wait, &wait);
8771         }
8772 }
8773
8774 /*
8775  * Note that this task has used io_uring. We use it for cancelation purposes.
8776  */
8777 static int io_uring_add_task_file(struct io_ring_ctx *ctx)
8778 {
8779         struct io_uring_task *tctx = current->io_uring;
8780         struct io_tctx_node *node;
8781         int ret;
8782
8783         if (unlikely(!tctx)) {
8784                 ret = io_uring_alloc_task_context(current, ctx);
8785                 if (unlikely(ret))
8786                         return ret;
8787                 tctx = current->io_uring;
8788         }
8789         if (tctx->last != ctx) {
8790                 void *old = xa_load(&tctx->xa, (unsigned long)ctx);
8791
8792                 if (!old) {
8793                         node = kmalloc(sizeof(*node), GFP_KERNEL);
8794                         if (!node)
8795                                 return -ENOMEM;
8796                         node->ctx = ctx;
8797                         node->task = current;
8798
8799                         ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
8800                                                 node, GFP_KERNEL));
8801                         if (ret) {
8802                                 kfree(node);
8803                                 return ret;
8804                         }
8805
8806                         mutex_lock(&ctx->uring_lock);
8807                         list_add(&node->ctx_node, &ctx->tctx_list);
8808                         mutex_unlock(&ctx->uring_lock);
8809                 }
8810                 tctx->last = ctx;
8811         }
8812         return 0;
8813 }
8814
8815 /*
8816  * Remove this io_uring_file -> task mapping.
8817  */
8818 static void io_uring_del_task_file(unsigned long index)
8819 {
8820         struct io_uring_task *tctx = current->io_uring;
8821         struct io_tctx_node *node;
8822
8823         if (!tctx)
8824                 return;
8825         node = xa_erase(&tctx->xa, index);
8826         if (!node)
8827                 return;
8828
8829         WARN_ON_ONCE(current != node->task);
8830         WARN_ON_ONCE(list_empty(&node->ctx_node));
8831
8832         mutex_lock(&node->ctx->uring_lock);
8833         list_del(&node->ctx_node);
8834         mutex_unlock(&node->ctx->uring_lock);
8835
8836         if (tctx->last == node->ctx)
8837                 tctx->last = NULL;
8838         kfree(node);
8839 }
8840
8841 static void io_uring_clean_tctx(struct io_uring_task *tctx)
8842 {
8843         struct io_tctx_node *node;
8844         unsigned long index;
8845
8846         xa_for_each(&tctx->xa, index, node)
8847                 io_uring_del_task_file(index);
8848         if (tctx->io_wq) {
8849                 io_wq_put_and_exit(tctx->io_wq);
8850                 tctx->io_wq = NULL;
8851         }
8852 }
8853
8854 static s64 tctx_inflight(struct io_uring_task *tctx)
8855 {
8856         return percpu_counter_sum(&tctx->inflight);
8857 }
8858
8859 static void io_sqpoll_cancel_cb(struct callback_head *cb)
8860 {
8861         struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);
8862         struct io_ring_ctx *ctx = work->ctx;
8863         struct io_sq_data *sqd = ctx->sq_data;
8864
8865         if (sqd->thread)
8866                 io_uring_cancel_sqpoll(ctx);
8867         complete(&work->completion);
8868 }
8869
8870 static void io_sqpoll_cancel_sync(struct io_ring_ctx *ctx)
8871 {
8872         struct io_sq_data *sqd = ctx->sq_data;
8873         struct io_tctx_exit work = { .ctx = ctx, };
8874         struct task_struct *task;
8875
8876         io_sq_thread_park(sqd);
8877         list_del_init(&ctx->sqd_list);
8878         io_sqd_update_thread_idle(sqd);
8879         task = sqd->thread;
8880         if (task) {
8881                 init_completion(&work.completion);
8882                 init_task_work(&work.task_work, io_sqpoll_cancel_cb);
8883                 WARN_ON_ONCE(task_work_add(task, &work.task_work, TWA_SIGNAL));
8884                 wake_up_process(task);
8885         }
8886         io_sq_thread_unpark(sqd);
8887
8888         if (task)
8889                 wait_for_completion(&work.completion);
8890 }
8891
8892 void __io_uring_files_cancel(struct files_struct *files)
8893 {
8894         struct io_uring_task *tctx = current->io_uring;
8895         struct io_tctx_node *node;
8896         unsigned long index;
8897
8898         /* make sure overflow events are dropped */
8899         atomic_inc(&tctx->in_idle);
8900         xa_for_each(&tctx->xa, index, node) {
8901                 struct io_ring_ctx *ctx = node->ctx;
8902
8903                 if (ctx->sq_data) {
8904                         io_sqpoll_cancel_sync(ctx);
8905                         continue;
8906                 }
8907                 io_uring_cancel_files(ctx, current, files);
8908                 if (!files)
8909                         io_uring_try_cancel_requests(ctx, current, NULL);
8910         }
8911         atomic_dec(&tctx->in_idle);
8912
8913         if (files)
8914                 io_uring_clean_tctx(tctx);
8915 }
8916
8917 /* should only be called by SQPOLL task */
8918 static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
8919 {
8920         struct io_sq_data *sqd = ctx->sq_data;
8921         struct io_uring_task *tctx = current->io_uring;
8922         s64 inflight;
8923         DEFINE_WAIT(wait);
8924
8925         WARN_ON_ONCE(!sqd || ctx->sq_data->thread != current);
8926
8927         atomic_inc(&tctx->in_idle);
8928         do {
8929                 /* read completions before cancelations */
8930                 inflight = tctx_inflight(tctx);
8931                 if (!inflight)
8932                         break;
8933                 io_uring_try_cancel_requests(ctx, current, NULL);
8934
8935                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8936                 /*
8937                  * If we've seen completions, retry without waiting. This
8938                  * avoids a race where a completion comes in before we did
8939                  * prepare_to_wait().
8940                  */
8941                 if (inflight == tctx_inflight(tctx))
8942                         schedule();
8943                 finish_wait(&tctx->wait, &wait);
8944         } while (1);
8945         atomic_dec(&tctx->in_idle);
8946 }
8947
8948 /*
8949  * Find any io_uring fd that this task has registered or done IO on, and cancel
8950  * requests.
8951  */
8952 void __io_uring_task_cancel(void)
8953 {
8954         struct io_uring_task *tctx = current->io_uring;
8955         DEFINE_WAIT(wait);
8956         s64 inflight;
8957
8958         /* make sure overflow events are dropped */
8959         atomic_inc(&tctx->in_idle);
8960         do {
8961                 /* read completions before cancelations */
8962                 inflight = tctx_inflight(tctx);
8963                 if (!inflight)
8964                         break;
8965                 __io_uring_files_cancel(NULL);
8966
8967                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8968
8969                 /*
8970                  * If we've seen completions, retry without waiting. This
8971                  * avoids a race where a completion comes in before we did
8972                  * prepare_to_wait().
8973                  */
8974                 if (inflight == tctx_inflight(tctx))
8975                         schedule();
8976                 finish_wait(&tctx->wait, &wait);
8977         } while (1);
8978
8979         atomic_dec(&tctx->in_idle);
8980
8981         io_uring_clean_tctx(tctx);
8982         /* all current's requests should be gone, we can kill tctx */
8983         __io_uring_free(current);
8984 }
8985
8986 static void *io_uring_validate_mmap_request(struct file *file,
8987                                             loff_t pgoff, size_t sz)
8988 {
8989         struct io_ring_ctx *ctx = file->private_data;
8990         loff_t offset = pgoff << PAGE_SHIFT;
8991         struct page *page;
8992         void *ptr;
8993
8994         switch (offset) {
8995         case IORING_OFF_SQ_RING:
8996         case IORING_OFF_CQ_RING:
8997                 ptr = ctx->rings;
8998                 break;
8999         case IORING_OFF_SQES:
9000                 ptr = ctx->sq_sqes;
9001                 break;
9002         default:
9003                 return ERR_PTR(-EINVAL);
9004         }
9005
9006         page = virt_to_head_page(ptr);
9007         if (sz > page_size(page))
9008                 return ERR_PTR(-EINVAL);
9009
9010         return ptr;
9011 }
9012
9013 #ifdef CONFIG_MMU
9014
9015 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9016 {
9017         size_t sz = vma->vm_end - vma->vm_start;
9018         unsigned long pfn;
9019         void *ptr;
9020
9021         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9022         if (IS_ERR(ptr))
9023                 return PTR_ERR(ptr);
9024
9025         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9026         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9027 }
9028
9029 #else /* !CONFIG_MMU */
9030
9031 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9032 {
9033         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9034 }
9035
9036 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9037 {
9038         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9039 }
9040
/*
 * ->get_unmapped_area() file operation (no-MMU builds): return the kernel
 * address of the region selected by @pgoff, or the validation error encoded
 * in the return value.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *region = io_uring_validate_mmap_request(file, pgoff, len);

	if (!IS_ERR(region))
		return (unsigned long) region;

	return PTR_ERR(region);
}
9053
9054 #endif /* !CONFIG_MMU */
9055
9056 static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
9057 {
9058         DEFINE_WAIT(wait);
9059
9060         do {
9061                 if (!io_sqring_full(ctx))
9062                         break;
9063                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9064
9065                 if (!io_sqring_full(ctx))
9066                         break;
9067                 schedule();
9068         } while (!signal_pending(current));
9069
9070         finish_wait(&ctx->sqo_sq_wait, &wait);
9071         return 0;
9072 }
9073
9074 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9075                           struct __kernel_timespec __user **ts,
9076                           const sigset_t __user **sig)
9077 {
9078         struct io_uring_getevents_arg arg;
9079
9080         /*
9081          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9082          * is just a pointer to the sigset_t.
9083          */
9084         if (!(flags & IORING_ENTER_EXT_ARG)) {
9085                 *sig = (const sigset_t __user *) argp;
9086                 *ts = NULL;
9087                 return 0;
9088         }
9089
9090         /*
9091          * EXT_ARG is set - ensure we agree on the size of it and copy in our
9092          * timespec and sigset_t pointers if good.
9093          */
9094         if (*argsz != sizeof(arg))
9095                 return -EINVAL;
9096         if (copy_from_user(&arg, argp, sizeof(arg)))
9097                 return -EFAULT;
9098         *sig = u64_to_user_ptr(arg.sigmask);
9099         *argsz = arg.sigmask_sz;
9100         *ts = u64_to_user_ptr(arg.ts);
9101         return 0;
9102 }
9103
9104 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
9105                 u32, min_complete, u32, flags, const void __user *, argp,
9106                 size_t, argsz)
9107 {
9108         struct io_ring_ctx *ctx;
9109         long ret = -EBADF;
9110         int submitted = 0;
9111         struct fd f;
9112
9113         io_run_task_work();
9114
9115         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
9116                         IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
9117                 return -EINVAL;
9118
9119         f = fdget(fd);
9120         if (!f.file)
9121                 return -EBADF;
9122
9123         ret = -EOPNOTSUPP;
9124         if (f.file->f_op != &io_uring_fops)
9125                 goto out_fput;
9126
9127         ret = -ENXIO;
9128         ctx = f.file->private_data;
9129         if (!percpu_ref_tryget(&ctx->refs))
9130                 goto out_fput;
9131
9132         ret = -EBADFD;
9133         if (ctx->flags & IORING_SETUP_R_DISABLED)
9134                 goto out;
9135
9136         /*
9137          * For SQ polling, the thread will do all submissions and completions.
9138          * Just return the requested submit count, and wake the thread if
9139          * we were asked to.
9140          */
9141         ret = 0;
9142         if (ctx->flags & IORING_SETUP_SQPOLL) {
9143                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
9144
9145                 ret = -EOWNERDEAD;
9146                 if (unlikely(ctx->sq_data->thread == NULL)) {
9147                         goto out;
9148                 }
9149                 if (flags & IORING_ENTER_SQ_WAKEUP)
9150                         wake_up(&ctx->sq_data->wait);
9151                 if (flags & IORING_ENTER_SQ_WAIT) {
9152                         ret = io_sqpoll_wait_sq(ctx);
9153                         if (ret)
9154                                 goto out;
9155                 }
9156                 submitted = to_submit;
9157         } else if (to_submit) {
9158                 ret = io_uring_add_task_file(ctx);
9159                 if (unlikely(ret))
9160                         goto out;
9161                 mutex_lock(&ctx->uring_lock);
9162                 submitted = io_submit_sqes(ctx, to_submit);
9163                 mutex_unlock(&ctx->uring_lock);
9164
9165                 if (submitted != to_submit)
9166                         goto out;
9167         }
9168         if (flags & IORING_ENTER_GETEVENTS) {
9169                 const sigset_t __user *sig;
9170                 struct __kernel_timespec __user *ts;
9171
9172                 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9173                 if (unlikely(ret))
9174                         goto out;
9175
9176                 min_complete = min(min_complete, ctx->cq_entries);
9177
9178                 /*
9179                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9180                  * space applications don't need to do io completion events
9181                  * polling again, they can rely on io_sq_thread to do polling
9182                  * work, which can reduce cpu usage and uring_lock contention.
9183                  */
9184                 if (ctx->flags & IORING_SETUP_IOPOLL &&
9185                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
9186                         ret = io_iopoll_check(ctx, min_complete);
9187                 } else {
9188                         ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
9189                 }
9190         }
9191
9192 out:
9193         percpu_ref_put(&ctx->refs);
9194 out_fput:
9195         fdput(f);
9196         return submitted ? submitted : ret;
9197 }
9198
9199 #ifdef CONFIG_PROC_FS
9200 static int io_uring_show_cred(struct seq_file *m, unsigned int id,
9201                 const struct cred *cred)
9202 {
9203         struct user_namespace *uns = seq_user_ns(m);
9204         struct group_info *gi;
9205         kernel_cap_t cap;
9206         unsigned __capi;
9207         int g;
9208
9209         seq_printf(m, "%5d\n", id);
9210         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9211         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9212         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9213         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9214         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9215         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9216         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9217         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9218         seq_puts(m, "\n\tGroups:\t");
9219         gi = cred->group_info;
9220         for (g = 0; g < gi->ngroups; g++) {
9221                 seq_put_decimal_ull(m, g ? " " : "",
9222                                         from_kgid_munged(uns, gi->gid[g]));
9223         }
9224         seq_puts(m, "\n\tCapEff:\t");
9225         cap = cred->cap_effective;
9226         CAP_FOR_EACH_U32(__capi)
9227                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9228         seq_putc(m, '\n');
9229         return 0;
9230 }
9231
9232 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9233 {
9234         struct io_sq_data *sq = NULL;
9235         bool has_lock;
9236         int i;
9237
9238         /*
9239          * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9240          * since fdinfo case grabs it in the opposite direction of normal use
9241          * cases. If we fail to get the lock, we just don't iterate any
9242          * structures that could be going away outside the io_uring mutex.
9243          */
9244         has_lock = mutex_trylock(&ctx->uring_lock);
9245
9246         if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
9247                 sq = ctx->sq_data;
9248                 if (!sq->thread)
9249                         sq = NULL;
9250         }
9251
9252         seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9253         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
9254         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
9255         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
9256                 struct file *f = *io_fixed_file_slot(ctx->file_data, i);
9257
9258                 if (f)
9259                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9260                 else
9261                         seq_printf(m, "%5u: <none>\n", i);
9262         }
9263         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
9264         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
9265                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
9266
9267                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
9268                                                 (unsigned int) buf->len);
9269         }
9270         if (has_lock && !xa_empty(&ctx->personalities)) {
9271                 unsigned long index;
9272                 const struct cred *cred;
9273
9274                 seq_printf(m, "Personalities:\n");
9275                 xa_for_each(&ctx->personalities, index, cred)
9276                         io_uring_show_cred(m, index, cred);
9277         }
9278         seq_printf(m, "PollList:\n");
9279         spin_lock_irq(&ctx->completion_lock);
9280         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9281                 struct hlist_head *list = &ctx->cancel_hash[i];
9282                 struct io_kiocb *req;
9283
9284                 hlist_for_each_entry(req, list, hash_node)
9285                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
9286                                         req->task->task_works != NULL);
9287         }
9288         spin_unlock_irq(&ctx->completion_lock);
9289         if (has_lock)
9290                 mutex_unlock(&ctx->uring_lock);
9291 }
9292
9293 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9294 {
9295         struct io_ring_ctx *ctx = f->private_data;
9296
9297         if (percpu_ref_tryget(&ctx->refs)) {
9298                 __io_uring_show_fdinfo(ctx, m);
9299                 percpu_ref_put(&ctx->refs);
9300         }
9301 }
9302 #endif
9303
9304 static const struct file_operations io_uring_fops = {
9305         .release        = io_uring_release,
9306         .mmap           = io_uring_mmap,
9307 #ifndef CONFIG_MMU
9308         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9309         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9310 #endif
9311         .poll           = io_uring_poll,
9312         .fasync         = io_uring_fasync,
9313 #ifdef CONFIG_PROC_FS
9314         .show_fdinfo    = io_uring_show_fdinfo,
9315 #endif
9316 };
9317
9318 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9319                                   struct io_uring_params *p)
9320 {
9321         struct io_rings *rings;
9322         size_t size, sq_array_offset;
9323
9324         /* make sure these are sane, as we already accounted them */
9325         ctx->sq_entries = p->sq_entries;
9326         ctx->cq_entries = p->cq_entries;
9327
9328         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9329         if (size == SIZE_MAX)
9330                 return -EOVERFLOW;
9331
9332         rings = io_mem_alloc(size);
9333         if (!rings)
9334                 return -ENOMEM;
9335
9336         ctx->rings = rings;
9337         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9338         rings->sq_ring_mask = p->sq_entries - 1;
9339         rings->cq_ring_mask = p->cq_entries - 1;
9340         rings->sq_ring_entries = p->sq_entries;
9341         rings->cq_ring_entries = p->cq_entries;
9342         ctx->sq_mask = rings->sq_ring_mask;
9343         ctx->cq_mask = rings->cq_ring_mask;
9344
9345         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9346         if (size == SIZE_MAX) {
9347                 io_mem_free(ctx->rings);
9348                 ctx->rings = NULL;
9349                 return -EOVERFLOW;
9350         }
9351
9352         ctx->sq_sqes = io_mem_alloc(size);
9353         if (!ctx->sq_sqes) {
9354                 io_mem_free(ctx->rings);
9355                 ctx->rings = NULL;
9356                 return -ENOMEM;
9357         }
9358
9359         return 0;
9360 }
9361
9362 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9363 {
9364         int ret, fd;
9365
9366         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9367         if (fd < 0)
9368                 return fd;
9369
9370         ret = io_uring_add_task_file(ctx);
9371         if (ret) {
9372                 put_unused_fd(fd);
9373                 return ret;
9374         }
9375         fd_install(fd, file);
9376         return fd;
9377 }
9378
9379 /*
9380  * Allocate an anonymous fd, this is what constitutes the application
9381  * visible backing of an io_uring instance. The application mmaps this
9382  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9383  * we have to tie this fd to a socket for file garbage collection purposes.
9384  */
9385 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
9386 {
9387         struct file *file;
9388 #if defined(CONFIG_UNIX)
9389         int ret;
9390
9391         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9392                                 &ctx->ring_sock);
9393         if (ret)
9394                 return ERR_PTR(ret);
9395 #endif
9396
9397         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9398                                         O_RDWR | O_CLOEXEC);
9399 #if defined(CONFIG_UNIX)
9400         if (IS_ERR(file)) {
9401                 sock_release(ctx->ring_sock);
9402                 ctx->ring_sock = NULL;
9403         } else {
9404                 ctx->ring_sock->file = file;
9405         }
9406 #endif
9407         return file;
9408 }
9409
9410 static int io_uring_create(unsigned entries, struct io_uring_params *p,
9411                            struct io_uring_params __user *params)
9412 {
9413         struct io_ring_ctx *ctx;
9414         struct file *file;
9415         int ret;
9416
9417         if (!entries)
9418                 return -EINVAL;
9419         if (entries > IORING_MAX_ENTRIES) {
9420                 if (!(p->flags & IORING_SETUP_CLAMP))
9421                         return -EINVAL;
9422                 entries = IORING_MAX_ENTRIES;
9423         }
9424
9425         /*
9426          * Use twice as many entries for the CQ ring. It's possible for the
9427          * application to drive a higher depth than the size of the SQ ring,
9428          * since the sqes are only used at submission time. This allows for
9429          * some flexibility in overcommitting a bit. If the application has
9430          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9431          * of CQ ring entries manually.
9432          */
9433         p->sq_entries = roundup_pow_of_two(entries);
9434         if (p->flags & IORING_SETUP_CQSIZE) {
9435                 /*
9436                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
9437                  * to a power-of-two, if it isn't already. We do NOT impose
9438                  * any cq vs sq ring sizing.
9439                  */
9440                 if (!p->cq_entries)
9441                         return -EINVAL;
9442                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9443                         if (!(p->flags & IORING_SETUP_CLAMP))
9444                                 return -EINVAL;
9445                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
9446                 }
9447                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
9448                 if (p->cq_entries < p->sq_entries)
9449                         return -EINVAL;
9450         } else {
9451                 p->cq_entries = 2 * p->sq_entries;
9452         }
9453
9454         ctx = io_ring_ctx_alloc(p);
9455         if (!ctx)
9456                 return -ENOMEM;
9457         ctx->compat = in_compat_syscall();
9458         if (!capable(CAP_IPC_LOCK))
9459                 ctx->user = get_uid(current_user());
9460
9461         /*
9462          * This is just grabbed for accounting purposes. When a process exits,
9463          * the mm is exited and dropped before the files, hence we need to hang
9464          * on to this mm purely for the purposes of being able to unaccount
9465          * memory (locked/pinned vm). It's not used for anything else.
9466          */
9467         mmgrab(current->mm);
9468         ctx->mm_account = current->mm;
9469
9470         ret = io_allocate_scq_urings(ctx, p);
9471         if (ret)
9472                 goto err;
9473
9474         ret = io_sq_offload_create(ctx, p);
9475         if (ret)
9476                 goto err;
9477
9478         memset(&p->sq_off, 0, sizeof(p->sq_off));
9479         p->sq_off.head = offsetof(struct io_rings, sq.head);
9480         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9481         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9482         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9483         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9484         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9485         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
9486
9487         memset(&p->cq_off, 0, sizeof(p->cq_off));
9488         p->cq_off.head = offsetof(struct io_rings, cq.head);
9489         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9490         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9491         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9492         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9493         p->cq_off.cqes = offsetof(struct io_rings, cqes);
9494         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
9495
9496         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9497                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
9498                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
9499                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9500                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
9501
9502         if (copy_to_user(params, p, sizeof(*p))) {
9503                 ret = -EFAULT;
9504                 goto err;
9505         }
9506
9507         file = io_uring_get_file(ctx);
9508         if (IS_ERR(file)) {
9509                 ret = PTR_ERR(file);
9510                 goto err;
9511         }
9512
9513         /*
9514          * Install ring fd as the very last thing, so we don't risk someone
9515          * having closed it before we finish setup
9516          */
9517         ret = io_uring_install_fd(ctx, file);
9518         if (ret < 0) {
9519                 /* fput will clean it up */
9520                 fput(file);
9521                 return ret;
9522         }
9523
9524         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
9525         return ret;
9526 err:
9527         io_ring_ctx_wait_and_kill(ctx);
9528         return ret;
9529 }
9530
9531 /*
9532  * Sets up an aio uring context, and returns the fd. Applications asks for a
9533  * ring size, we return the actual sq/cq ring sizes (among other things) in the
9534  * params structure passed in.
9535  */
9536 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9537 {
9538         struct io_uring_params p;
9539         int i;
9540
9541         if (copy_from_user(&p, params, sizeof(p)))
9542                 return -EFAULT;
9543         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9544                 if (p.resv[i])
9545                         return -EINVAL;
9546         }
9547
9548         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
9549                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9550                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9551                         IORING_SETUP_R_DISABLED))
9552                 return -EINVAL;
9553
9554         return  io_uring_create(entries, &p, params);
9555 }
9556
9557 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9558                 struct io_uring_params __user *, params)
9559 {
9560         return io_uring_setup(entries, params);
9561 }
9562
9563 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9564 {
9565         struct io_uring_probe *p;
9566         size_t size;
9567         int i, ret;
9568
9569         size = struct_size(p, ops, nr_args);
9570         if (size == SIZE_MAX)
9571                 return -EOVERFLOW;
9572         p = kzalloc(size, GFP_KERNEL);
9573         if (!p)
9574                 return -ENOMEM;
9575
9576         ret = -EFAULT;
9577         if (copy_from_user(p, arg, size))
9578                 goto out;
9579         ret = -EINVAL;
9580         if (memchr_inv(p, 0, size))
9581                 goto out;
9582
9583         p->last_op = IORING_OP_LAST - 1;
9584         if (nr_args > IORING_OP_LAST)
9585                 nr_args = IORING_OP_LAST;
9586
9587         for (i = 0; i < nr_args; i++) {
9588                 p->ops[i].op = i;
9589                 if (!io_op_defs[i].not_supported)
9590                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
9591         }
9592         p->ops_len = i;
9593
9594         ret = 0;
9595         if (copy_to_user(arg, p, size))
9596                 ret = -EFAULT;
9597 out:
9598         kfree(p);
9599         return ret;
9600 }
9601
9602 static int io_register_personality(struct io_ring_ctx *ctx)
9603 {
9604         const struct cred *creds;
9605         u32 id;
9606         int ret;
9607
9608         creds = get_current_cred();
9609
9610         ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
9611                         XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
9612         if (!ret)
9613                 return id;
9614         put_cred(creds);
9615         return ret;
9616 }
9617
9618 static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9619                                     unsigned int nr_args)
9620 {
9621         struct io_uring_restriction *res;
9622         size_t size;
9623         int i, ret;
9624
9625         /* Restrictions allowed only if rings started disabled */
9626         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9627                 return -EBADFD;
9628
9629         /* We allow only a single restrictions registration */
9630         if (ctx->restrictions.registered)
9631                 return -EBUSY;
9632
9633         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9634                 return -EINVAL;
9635
9636         size = array_size(nr_args, sizeof(*res));
9637         if (size == SIZE_MAX)
9638                 return -EOVERFLOW;
9639
9640         res = memdup_user(arg, size);
9641         if (IS_ERR(res))
9642                 return PTR_ERR(res);
9643
9644         ret = 0;
9645
9646         for (i = 0; i < nr_args; i++) {
9647                 switch (res[i].opcode) {
9648                 case IORING_RESTRICTION_REGISTER_OP:
9649                         if (res[i].register_op >= IORING_REGISTER_LAST) {
9650                                 ret = -EINVAL;
9651                                 goto out;
9652                         }
9653
9654                         __set_bit(res[i].register_op,
9655                                   ctx->restrictions.register_op);
9656                         break;
9657                 case IORING_RESTRICTION_SQE_OP:
9658                         if (res[i].sqe_op >= IORING_OP_LAST) {
9659                                 ret = -EINVAL;
9660                                 goto out;
9661                         }
9662
9663                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
9664                         break;
9665                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
9666                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
9667                         break;
9668                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
9669                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
9670                         break;
9671                 default:
9672                         ret = -EINVAL;
9673                         goto out;
9674                 }
9675         }
9676
9677 out:
9678         /* Reset all restrictions if an error happened */
9679         if (ret != 0)
9680                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
9681         else
9682                 ctx->restrictions.registered = true;
9683
9684         kfree(res);
9685         return ret;
9686 }
9687
9688 static int io_register_enable_rings(struct io_ring_ctx *ctx)
9689 {
9690         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9691                 return -EBADFD;
9692
9693         if (ctx->restrictions.registered)
9694                 ctx->restricted = 1;
9695
9696         ctx->flags &= ~IORING_SETUP_R_DISABLED;
9697         if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
9698                 wake_up(&ctx->sq_data->wait);
9699         return 0;
9700 }
9701
9702 static bool io_register_op_must_quiesce(int op)
9703 {
9704         switch (op) {
9705         case IORING_UNREGISTER_FILES:
9706         case IORING_REGISTER_FILES_UPDATE:
9707         case IORING_REGISTER_PROBE:
9708         case IORING_REGISTER_PERSONALITY:
9709         case IORING_UNREGISTER_PERSONALITY:
9710                 return false;
9711         default:
9712                 return true;
9713         }
9714 }
9715
9716 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
9717                                void __user *arg, unsigned nr_args)
9718         __releases(ctx->uring_lock)
9719         __acquires(ctx->uring_lock)
9720 {
9721         int ret;
9722
9723         /*
9724          * We're inside the ring mutex, if the ref is already dying, then
9725          * someone else killed the ctx or is already going through
9726          * io_uring_register().
9727          */
9728         if (percpu_ref_is_dying(&ctx->refs))
9729                 return -ENXIO;
9730
9731         if (io_register_op_must_quiesce(opcode)) {
9732                 percpu_ref_kill(&ctx->refs);
9733
9734                 /*
9735                  * Drop uring mutex before waiting for references to exit. If
9736                  * another thread is currently inside io_uring_enter() it might
9737                  * need to grab the uring_lock to make progress. If we hold it
9738                  * here across the drain wait, then we can deadlock. It's safe
9739                  * to drop the mutex here, since no new references will come in
9740                  * after we've killed the percpu ref.
9741                  */
9742                 mutex_unlock(&ctx->uring_lock);
9743                 do {
9744                         ret = wait_for_completion_interruptible(&ctx->ref_comp);
9745                         if (!ret)
9746                                 break;
9747                         ret = io_run_task_work_sig();
9748                         if (ret < 0)
9749                                 break;
9750                 } while (1);
9751
9752                 mutex_lock(&ctx->uring_lock);
9753
9754                 if (ret) {
9755                         percpu_ref_resurrect(&ctx->refs);
9756                         goto out_quiesce;
9757                 }
9758         }
9759
9760         if (ctx->restricted) {
9761                 if (opcode >= IORING_REGISTER_LAST) {
9762                         ret = -EINVAL;
9763                         goto out;
9764                 }
9765
9766                 if (!test_bit(opcode, ctx->restrictions.register_op)) {
9767                         ret = -EACCES;
9768                         goto out;
9769                 }
9770         }
9771
9772         switch (opcode) {
9773         case IORING_REGISTER_BUFFERS:
9774                 ret = io_sqe_buffers_register(ctx, arg, nr_args);
9775                 break;
9776         case IORING_UNREGISTER_BUFFERS:
9777                 ret = -EINVAL;
9778                 if (arg || nr_args)
9779                         break;
9780                 ret = io_sqe_buffers_unregister(ctx);
9781                 break;
9782         case IORING_REGISTER_FILES:
9783                 ret = io_sqe_files_register(ctx, arg, nr_args);
9784                 break;
9785         case IORING_UNREGISTER_FILES:
9786                 ret = -EINVAL;
9787                 if (arg || nr_args)
9788                         break;
9789                 ret = io_sqe_files_unregister(ctx);
9790                 break;
9791         case IORING_REGISTER_FILES_UPDATE:
9792                 ret = io_sqe_files_update(ctx, arg, nr_args);
9793                 break;
9794         case IORING_REGISTER_EVENTFD:
9795         case IORING_REGISTER_EVENTFD_ASYNC:
9796                 ret = -EINVAL;
9797                 if (nr_args != 1)
9798                         break;
9799                 ret = io_eventfd_register(ctx, arg);
9800                 if (ret)
9801                         break;
9802                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
9803                         ctx->eventfd_async = 1;
9804                 else
9805                         ctx->eventfd_async = 0;
9806                 break;
9807         case IORING_UNREGISTER_EVENTFD:
9808                 ret = -EINVAL;
9809                 if (arg || nr_args)
9810                         break;
9811                 ret = io_eventfd_unregister(ctx);
9812                 break;
9813         case IORING_REGISTER_PROBE:
9814                 ret = -EINVAL;
9815                 if (!arg || nr_args > 256)
9816                         break;
9817                 ret = io_probe(ctx, arg, nr_args);
9818                 break;
9819         case IORING_REGISTER_PERSONALITY:
9820                 ret = -EINVAL;
9821                 if (arg || nr_args)
9822                         break;
9823                 ret = io_register_personality(ctx);
9824                 break;
9825         case IORING_UNREGISTER_PERSONALITY:
9826                 ret = -EINVAL;
9827                 if (arg)
9828                         break;
9829                 ret = io_unregister_personality(ctx, nr_args);
9830                 break;
9831         case IORING_REGISTER_ENABLE_RINGS:
9832                 ret = -EINVAL;
9833                 if (arg || nr_args)
9834                         break;
9835                 ret = io_register_enable_rings(ctx);
9836                 break;
9837         case IORING_REGISTER_RESTRICTIONS:
9838                 ret = io_register_restrictions(ctx, arg, nr_args);
9839                 break;
9840         default:
9841                 ret = -EINVAL;
9842                 break;
9843         }
9844
9845 out:
9846         if (io_register_op_must_quiesce(opcode)) {
9847                 /* bring the ctx back to life */
9848                 percpu_ref_reinit(&ctx->refs);
9849 out_quiesce:
9850                 reinit_completion(&ctx->ref_comp);
9851         }
9852         return ret;
9853 }
9854
9855 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
9856                 void __user *, arg, unsigned int, nr_args)
9857 {
9858         struct io_ring_ctx *ctx;
9859         long ret = -EBADF;
9860         struct fd f;
9861
9862         f = fdget(fd);
9863         if (!f.file)
9864                 return -EBADF;
9865
9866         ret = -EOPNOTSUPP;
9867         if (f.file->f_op != &io_uring_fops)
9868                 goto out_fput;
9869
9870         ctx = f.file->private_data;
9871
9872         io_run_task_work();
9873
9874         mutex_lock(&ctx->uring_lock);
9875         ret = __io_uring_register(ctx, opcode, arg, nr_args);
9876         mutex_unlock(&ctx->uring_lock);
9877         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
9878                                                         ctx->cq_ev_fd != NULL, ret);
9879 out_fput:
9880         fdput(f);
9881         return ret;
9882 }
9883
9884 static int __init io_uring_init(void)
9885 {
9886 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
9887         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
9888         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
9889 } while (0)
9890
9891 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
9892         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
9893         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
9894         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
9895         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
9896         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
9897         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
9898         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
9899         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
9900         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
9901         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
9902         BUILD_BUG_SQE_ELEM(24, __u32,  len);
9903         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
9904         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
9905         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
9906         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
9907         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
9908         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
9909         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
9910         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
9911         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
9912         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
9913         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
9914         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
9915         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
9916         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
9917         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
9918         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
9919         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
9920         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
9921         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
9922
9923         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
9924         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
9925         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
9926                                 SLAB_ACCOUNT);
9927         return 0;
9928 };
9929 __initcall(io_uring_init);