fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqe (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * on data shared between the kernel and application. This is done both
  36  * for ordering purposes and to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73 #include <linux/namei.h>
  74 #include <linux/fsnotify.h>
  75 #include <linux/fadvise.h>
  76 #include <linux/eventpoll.h>
  77 #include <linux/splice.h>
  78 #include <linux/task_work.h>
  79 #include <linux/pagemap.h>
  80 #include <linux/io_uring.h>
  81 #include <linux/tracehook.h>
  82
  83 #define CREATE_TRACE_POINTS
  84 #include <trace/events/io_uring.h>
  85
  86 #include <uapi/linux/io_uring.h>
  87
  88 #include "internal.h"
  89 #include "io-wq.h"
  90
  91 #define IORING_MAX_ENTRIES      32768 /* == 1 << 15; presumably the SQ ring size cap -- confirm at setup */
  92 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES) /* twice IORING_MAX_ENTRIES */
     /* NOTE(review): name suggests an entries cap used with SQPOLL -- confirm at the user */
  93 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
  94
  95 /* upper bounds only: the macros below define maximum counts */
  96 #define IORING_MAX_FIXED_FILES  (1U << 15)
     /* sum of the three *_LAST enumerators, i.e. one slot per possible restriction target */
  97 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
  98                                  IORING_REGISTER_LAST + IORING_OP_LAST)
  99
     /*
      * Tag table geometry: MAX is 1 << SHIFT and MASK is MAX - 1. With
      * SHIFT == PAGE_SHIFT - 3, MAX equals PAGE_SIZE / 8 -- presumably the
      * number of 8-byte (u64) tags per page-sized table, see
      * io_rsrc_data.tags; confirm at the allocation site.
      */
 100 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
 101 #define IO_RSRC_TAG_TABLE_MAX   (1U << IO_RSRC_TAG_TABLE_SHIFT)
 102 #define IO_RSRC_TAG_TABLE_MASK  (IO_RSRC_TAG_TABLE_MAX - 1)
 103
 104 #define IORING_MAX_REG_BUFFERS  (1U << 14)
 105
     /* OR of the IOSQE_* flags listed here; presumably the flags accepted in an SQE -- confirm */
 106 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
 107                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
 108                                 IOSQE_BUFFER_SELECT)
     /* OR of REQ_F_* flags; NOTE(review): presumably those that need cleanup work -- confirm */
 109 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 110                                 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
 111
     /* 1024; NOTE(review): presumably the refill batch for io_uring_task.cached_refs -- confirm */
 112 #define IO_TCTX_REFS_CACHE_NR   (1U << 10)
 113
 114 struct io_uring {
             /*
              * Head/tail index pair for one ring. Each index carries its own
              * ____cacheline_aligned_in_smp annotation, so on SMP builds the
              * two do not share a cacheline.
              */
 115         u32 head ____cacheline_aligned_in_smp;
 116         u32 tail ____cacheline_aligned_in_smp;
 117 };
 118
 119 /*
 120  * This data is shared with the application through the mmap at offsets
 121  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 122  *
 123  * The offsets to the member fields are published through struct
 124  * io_sqring_offsets when calling io_uring_setup.
 125  */
 126 struct io_rings {
 127         /*
 128          * Head and tail offsets into the ring; the offsets need to be
 129          * masked to get valid indices.
 130          *
 131          * The kernel controls head of the sq ring and the tail of the cq ring,
 132          * and the application controls tail of the sq ring and the head of the
 133          * cq ring.
 134          */
 135         struct io_uring         sq, cq;
 136         /*
 137          * Bitmasks to apply to head and tail offsets (constant, equals
 138          * ring_entries - 1), i.e. index = offset & mask.
 139          */
 140         u32                     sq_ring_mask, cq_ring_mask;
 141         /* Ring sizes (constant, power of 2) */
 142         u32                     sq_ring_entries, cq_ring_entries;
 143         /*
 144          * Number of invalid entries dropped by the kernel due to
 145          * invalid index stored in array
 146          *
 147          * Written by the kernel, shouldn't be modified by the
 148          * application (i.e. get number of "new events" by comparing to
 149          * cached value).
 150          *
 151          * After a new SQ head value was read by the application this
 152          * counter includes all submissions that were dropped reaching
 153          * the new SQ head (and possibly more).
 154          */
 155         u32                     sq_dropped;
 156         /*
 157          * Runtime SQ flags
 158          *
 159          * Written by the kernel, shouldn't be modified by the
 160          * application.
 161          *
 162          * The application needs a full memory barrier before checking
 163          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 164          */
 165         u32                     sq_flags;
 166         /*
 167          * Runtime CQ flags
 168          *
 169          * Written by the application, shouldn't be modified by the
 170          * kernel.
 171          */
 172         u32                     cq_flags;
 173         /*
 174          * Number of completion events lost because the queue was full;
 175          * this should be avoided by the application by making sure
 176          * there are not more requests pending than there is space in
 177          * the completion queue.
 178          *
 179          * Written by the kernel, shouldn't be modified by the
 180          * application (i.e. get number of "new events" by comparing to
 181          * cached value).
 182          *
 183          * As completion events come in out of order this counter is not
 184          * ordered with any other data.
 185          */
 186         u32                     cq_overflow;
 187         /*
 188          * Ring buffer of completion events.
 189          *
 190          * The kernel writes completion events fresh every time they are
 191          * produced, so the application is allowed to modify pending
 192          * entries.
 193          *
 194          * Declared as a flexible array member, so the CQEs are laid out
 195          * directly after the fixed fields above; the array start is
 196          * cacheline-aligned on SMP builds.
 197          */
 194         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 195 };
 196
 197 enum io_uring_cmd_flags {
             /*
              * Values are distinct single bits (1 and 2), so they can be OR-ed
              * together. NOTE(review): names suggest "issue must not block" and
              * "completion may be deferred" -- confirm at the call sites.
              */
 198         IO_URING_F_NONBLOCK             = 1,
 199         IO_URING_F_COMPLETE_DEFER       = 2,
 200 };
 201
 202 struct io_mapped_ubuf {
             /*
              * Descriptor for one buffer referenced from ctx->user_bufs[] and
              * io_rsrc_put.buf. NOTE(review): ubuf/ubuf_end look like the start
              * and end user addresses of the buffer -- confirm at registration.
              */
 203         u64             ubuf;
 204         u64             ubuf_end;
 205         unsigned int    nr_bvecs;       /* presumably the number of entries in bvec[] -- confirm */
 206         unsigned long   acct_pages;     /* NOTE(review): likely pages charged for accounting -- confirm */
 207         struct bio_vec  bvec[];         /* flexible array member; must stay last */
 208 };
 209
 210 struct io_ring_ctx; /* forward declaration; the full definition appears later in this file */
 211
 212 struct io_overflow_cqe {
             /* A CQE plus list linkage; presumably queued on ctx->cq_overflow_list -- confirm */
 213         struct io_uring_cqe cqe;
 214         struct list_head list;
 215 };
 216
 217 struct io_fixed_file {
 218         /*
              * struct file pointer stored as an integer, with additional FFS_*
              * flags packed into the same word (presumably in the low bits --
              * confirm against the FFS_* definitions).
              */
 219         unsigned long file_ptr;
 220 };
 221
 222 struct io_rsrc_put {
             /*
              * One resource handed to a rsrc_put_fn callback. NOTE(review):
              * presumably linked on io_rsrc_node.rsrc_list -- confirm.
              */
 223         struct list_head list;
 224         u64 tag;
             /* the same pointer viewed untyped, as a file, or as a mapped buffer */
 225         union {
 226                 void *rsrc;
 227                 struct file *file;
 228                 struct io_mapped_ubuf *buf;
 229         };
 230 };
 231
 232 struct io_file_table {
             /*
              * Array of fixed-file slots. The length is not stored here;
              * presumably it is ctx->nr_user_files, which sits next to the
              * table in io_ring_ctx -- confirm.
              */
 233         struct io_fixed_file *files;
 234 };
 235
 236 struct io_rsrc_node {
             /*
              * Reference-counted node tying a list of resources (rsrc_list) to
              * the io_rsrc_data they belong to. It carries both a list_head and
              * an llist_node; NOTE(review): these presumably correspond to
              * ctx->rsrc_ref_list and ctx->rsrc_put_llist -- confirm.
              */
 237         struct percpu_ref               refs;
 238         struct list_head                node;
 239         struct list_head                rsrc_list;
 240         struct io_rsrc_data             *rsrc_data;
 241         struct llist_node               llist;
 242         bool                            done;
 243 };
 244
 245 typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); /* function type; io_rsrc_data.do_put points to one */
 246
 247 struct io_rsrc_data {
             /*
              * Bookkeeping for one class of registered resource; io_ring_ctx
              * holds one instance for files (file_data) and one for buffers
              * (buf_data).
              */
 248         struct io_ring_ctx              *ctx;
 249
             /* two-level table of u64 tags; see the IO_RSRC_TAG_TABLE_* macros */
 250         u64                             **tags;
 251         unsigned int                    nr;
 252         rsrc_put_fn                     *do_put;        /* callback of type rsrc_put_fn */
 253         atomic_t                        refs;
 254         struct completion               done;
 255         bool                            quiesce;
 256 };
 257
 258 struct io_buffer {
             /*
              * One buffer descriptor: address, length and 16-bit buffer id,
              * with list linkage. NOTE(review): presumably a provided buffer
              * (compare io_provide_buf and io_sr_msg.kbuf) -- confirm.
              */
 259         struct list_head list;
 260         __u64 addr;
 261         __u32 len;
 262         __u16 bid;
 263 };
 264
 265 struct io_restriction {
             /*
              * Restriction state embedded in io_ring_ctx. The two bitmaps hold
              * one bit per IORING_REGISTER_* opcode and per IORING_OP_* opcode
              * respectively (sized by the matching *_LAST enumerator).
              */
 266         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 267         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
             /* u8 masks; NOTE(review): presumably over the IOSQE_* flag byte -- confirm */
 268         u8 sqe_flags_allowed;
 269         u8 sqe_flags_required;
 270         bool registered;
 271 };
 272
 273 enum {
             /*
              * Consecutive values 0 and 1. NOTE(review): presumably bit
              * numbers within io_sq_data.state -- confirm at the users.
              */
 274         IO_SQ_THREAD_SHOULD_STOP = 0,
 275         IO_SQ_THREAD_SHOULD_PARK,
 276 };
 277
 278 struct io_sq_data {
             /*
              * State of one SQ poll thread; io_ring_ctx.sq_data points here
              * "if using sq thread polling", and ctx_list below links the
              * rings that use this instance.
              */
 279         refcount_t              refs;
 280         atomic_t                park_pending;
 281         struct mutex            lock;
 282
 283         /* ctx's that are using this sqd */
 284         struct list_head        ctx_list;
 285
 286         struct task_struct      *thread;
 287         struct wait_queue_head  wait;
 288
 289         unsigned                sq_thread_idle;
 290         int                     sq_cpu;
 291         pid_t                   task_pid;
 292         pid_t                   task_tgid;
 293
             /* NOTE(review): presumably holds the IO_SQ_THREAD_SHOULD_* bits -- confirm */
 294         unsigned long           state;
 295         struct completion       exited;
 296 };
 297
 298 #define IO_COMPL_BATCH                  32 /* size of io_submit_state.compl_reqs[] */
 299 #define IO_REQ_CACHE_SIZE               32 /* size of io_submit_state.reqs[] */
 300 #define IO_REQ_ALLOC_BATCH              8 /* NOTE(review): presumably requests allocated per refill -- confirm */
 301
 302 struct io_submit_link {
             /* first and last request of a chain; presumably the link being assembled during submission -- confirm */
 303         struct io_kiocb         *head;
 304         struct io_kiocb         *last;
 305 };
 306
 307 struct io_submit_state {
             /*
              * Submission-side scratch state embedded in io_ring_ctx as
              * submit_state: block plug, link tracking, a request allocation
              * cache and a completion batch.
              */
 308         struct blk_plug         plug;
 309         struct io_submit_link   link;
 310
 311         /*
 312          * io_kiocb alloc cache
 313          */
 314         void                    *reqs[IO_REQ_CACHE_SIZE];
 315         unsigned int            free_reqs;      /* presumably the number of usable entries in reqs[] -- confirm */
 316
 317         bool                    plug_started;
 318
 319         /*
 320          * Batch completion logic
 321          */
 322         struct io_kiocb         *compl_reqs[IO_COMPL_BATCH];
 323         unsigned int            compl_nr;       /* presumably the number of entries used in compl_reqs[] -- confirm */
 324         /* inline/task_work completion list, under ->uring_lock */
 325         struct list_head        free_list;
 326
 327         unsigned int            ios_left;
 328 };
 329
 330 struct io_ring_ctx {
             /*
              * Per-ring context. Fields are grouped into anonymous structs,
              * most of them ____cacheline_aligned_in_smp, so each group starts
              * on its own cacheline on SMP builds.
              */
 331         /* const or read-mostly hot data */
 332         struct {
 333                 struct percpu_ref       refs;
 334
 335                 struct io_rings         *rings;
 336                 unsigned int            flags;
                     /* single-bit state flags packed into one word */
 337                 unsigned int            compat: 1;
 338                 unsigned int            drain_next: 1;
 339                 unsigned int            eventfd_async: 1;
 340                 unsigned int            restricted: 1;
 341                 unsigned int            off_timeout_used: 1;
 342                 unsigned int            drain_active: 1;
 343         } ____cacheline_aligned_in_smp;
 344
 345         /* submission data */
 346         struct {
 347                 struct mutex            uring_lock;
 348
 349                 /*
 350                  * Ring buffer of indices into array of io_uring_sqe, which is
 351                  * mmapped by the application using the IORING_OFF_SQES offset.
 352                  *
 353                  * This indirection could e.g. be used to assign fixed
 354                  * io_uring_sqe entries to operations and only submit them to
 355                  * the queue when needed.
 356                  *
 357                  * The kernel modifies neither the indices array nor the entries
 358                  * array.
 359                  */
 360                 u32                     *sq_array;
 361                 struct io_uring_sqe     *sq_sqes;
 362                 unsigned                cached_sq_head;
 363                 unsigned                sq_entries;
 364                 struct list_head        defer_list;
 365
 366                 /*
 367                  * Fixed resources fast path, should be accessed only under
 368                  * uring_lock, and updated through io_uring_register(2)
 369                  */
 370                 struct io_rsrc_node     *rsrc_node;
 371                 struct io_file_table    file_table;
 372                 unsigned                nr_user_files;
 373                 unsigned                nr_user_bufs;
 374                 struct io_mapped_ubuf   **user_bufs;
 375
 376                 struct io_submit_state  submit_state;
 377                 struct list_head        timeout_list;
 378                 struct list_head        cq_overflow_list;
 379                 struct xarray           io_buffers;
 380                 struct xarray           personalities;
 381                 u32                     pers_next;
 382                 unsigned                sq_thread_idle;
 383         } ____cacheline_aligned_in_smp;
 384
 385         /* IRQ completion list, under ->completion_lock */
 386         struct list_head        locked_free_list;
 387         unsigned int            locked_free_nr;
 388
 389         const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
 390         struct io_sq_data       *sq_data;       /* if using sq thread polling */
 391
 392         struct wait_queue_head  sqo_sq_wait;
 393         struct list_head        sqd_list;
 394
 395         unsigned long           check_cq_overflow;
 396
             /* completion-queue side state */
 397         struct {
 398                 unsigned                cached_cq_tail;
 399                 unsigned                cq_entries;
 400                 struct eventfd_ctx      *cq_ev_fd;
 401                 struct wait_queue_head  poll_wait;
 402                 struct wait_queue_head  cq_wait;
 403                 unsigned                cq_extra;
 404                 atomic_t                cq_timeouts;
 405                 struct fasync_struct    *cq_fasync;
 406                 unsigned                cq_last_tm_flush;
 407         } ____cacheline_aligned_in_smp;
 408
             /* locks plus the iopoll and cancel-hash bookkeeping */
 409         struct {
 410                 spinlock_t              completion_lock;
 411
 412                 spinlock_t              timeout_lock;
 413
 414                 /*
 415                  * ->iopoll_list is protected by the ctx->uring_lock for
 416                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 417                  * For SQPOLL, only the single threaded io_sq_thread() will
 418                  * manipulate the list, hence no extra locking is needed there.
 419                  */
 420                 struct list_head        iopoll_list;
 421                 struct hlist_head       *cancel_hash;
 422                 unsigned                cancel_hash_bits;
 423                 bool                    poll_multi_queue;
 424         } ____cacheline_aligned_in_smp;
 425
 426         struct io_restriction           restrictions;
 427
 428         /* slow path rsrc auxiliary data, used by update/register */
 429         struct {
 430                 struct io_rsrc_node             *rsrc_backup_node;
 431                 struct io_mapped_ubuf           *dummy_ubuf;
 432                 struct io_rsrc_data             *file_data;
 433                 struct io_rsrc_data             *buf_data;
 434
 435                 struct delayed_work             rsrc_put_work;
 436                 struct llist_head               rsrc_put_llist;
 437                 struct list_head                rsrc_ref_list;
 438                 spinlock_t                      rsrc_ref_lock;
 439         };
 440
 441         /* Keep this last, we don't need it for the fast path */
 442         struct {
 443                 #if defined(CONFIG_UNIX)
 444                         struct socket           *ring_sock;
 445                 #endif
 446                 /* hashed buffered write serialization */
 447                 struct io_wq_hash               *hash_map;
 448
 449                 /* Only used for accounting purposes */
 450                 struct user_struct              *user;
 451                 struct mm_struct                *mm_account;
 452
 453                 /* ctx exit and cancellation */
 454                 struct llist_head               fallback_llist;
 455                 struct delayed_work             fallback_work;
 456                 struct work_struct              exit_work;
 457                 struct list_head                tctx_list;
 458                 struct completion               ref_comp;
 459         };
 460 };
 461
 462 struct io_uring_task {
             /*
              * io_uring state with task scope (as the name suggests; the
              * owner is not visible in this part of the file). The first group
              * is submission-side; the second group is a lock-protected list
              * of task work plus its callback_head.
              */
 463         /* submission side */
 464         int                     cached_refs;
 465         struct xarray           xa;
 466         struct wait_queue_head  wait;
 467         const struct io_ring_ctx *last;
 468         struct io_wq            *io_wq;
 469         struct percpu_counter   inflight;
 470         atomic_t                inflight_tracked;
 471         atomic_t                in_idle;
 472
             /* NOTE(review): task_lock presumably protects task_list/task_running -- confirm */
 473         spinlock_t              task_lock;
 474         struct io_wq_work_list  task_list;
 475         struct callback_head    task_work;
 476         bool                    task_running;
 477 };
 478
 479 /*
 480  * First field must be the file pointer in all the
 481  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 482  */
 483 struct io_poll_iocb {
             /*
              * Poll state; used as io_kiocb.poll and inside struct async_poll.
              * 'file' is first, as every io_kiocb union member requires.
              */
 484         struct file                     *file;
 485         struct wait_queue_head          *head;
 486         __poll_t                        events;
 487         bool                            done;
 488         bool                            canceled;
 489         struct wait_queue_entry         wait;
 490 };
 491
 492 struct io_poll_update {
             /*
              * Stored in the io_kiocb union as 'poll_update'. The two bools
              * say which of events / new_user_data carry a requested change
              * (per their names -- confirm at the prep/issue code).
              */
 493         struct file                     *file;
 494         u64                             old_user_data;
 495         u64                             new_user_data;
 496         __poll_t                        events;
 497         bool                            update_events;
 498         bool                            update_user_data;
 499 };
 500
 501 struct io_close {
             /* Stored in the io_kiocb union as 'close'; 'file' must stay first */
 502         struct file                     *file;
 503         int                             fd;
 504 };
 505
 506 struct io_timeout_data {
             /*
              * Timer state with a back-pointer to its request. Not a member
              * of the io_kiocb union; NOTE(review): presumably reached through
              * io_kiocb.async_data -- confirm.
              */
 507         struct io_kiocb                 *req;
 508         struct hrtimer                  timer;
 509         struct timespec64               ts;
 510         enum hrtimer_mode               mode;
 511         u32                             flags;
 512 };
 513
 514 struct io_accept {
             /* Stored in the io_kiocb union as 'accept'; 'file' must stay first */
 515         struct file                     *file;
 516         struct sockaddr __user          *addr;          /* user-space pointer */
 517         int __user                      *addr_len;      /* user-space pointer */
 518         int                             flags;
 519         u32                             file_slot;
 520         unsigned long                   nofile;         /* NOTE(review): looks like a saved RLIMIT_NOFILE value -- confirm */
 521 };
 522
 523 struct io_sync {
             /* Stored in the io_kiocb union as 'sync'; 'file' must stay first */
 524         struct file                     *file;
 525         loff_t                          len;
 526         loff_t                          off;
 527         int                             flags;
 528         int                             mode;
 529 };
 530
 531 struct io_cancel {
             /* Stored in the io_kiocb union as 'cancel'; 'file' must stay first */
 532         struct file                     *file;
 533         u64                             addr;
 534 };
 535
 536 struct io_timeout {
             /* Stored in the io_kiocb union as 'timeout'; 'file' must stay first */
 537         struct file                     *file;
 538         u32                             off;
 539         u32                             target_seq;
             /* NOTE(review): presumably linkage on ctx->timeout_list -- confirm */
 540         struct list_head                list;
 541         /* head of the link, used by linked timeouts only */
 542         struct io_kiocb                 *head;
 543         /* for linked completions */
 544         struct io_kiocb                 *prev;
 545 };
 546
 547 struct io_timeout_rem {
             /* Stored in the io_kiocb union as 'timeout_rem'; 'file' must stay first */
 548         struct file                     *file;
 549         u64                             addr;
 550
 551         /* timeout update */
 552         struct timespec64               ts;
 553         u32                             flags;
 554 };
 555
 556 struct io_rw {
             /* Stored in the io_kiocb union as 'rw' */
 557         /* NOTE: kiocb has the file as the first member, so don't do it here */
 558         struct kiocb                    kiocb;
 559         u64                             addr;
 560         u64                             len;
 561 };
 562
 563 struct io_connect {
             /* Stored in the io_kiocb union as 'connect'; 'file' must stay first */
 564         struct file                     *file;
 565         struct sockaddr __user          *addr;          /* user-space pointer */
 566         int                             addr_len;
 567 };
 568
 569 struct io_sr_msg {
             /* Stored in the io_kiocb union as 'sr_msg'; 'file' must stay first */
 570         struct file                     *file;
             /* one user-space pointer, typed as compat msghdr, native msghdr or raw buffer */
 571         union {
 572                 struct compat_msghdr __user     *umsg_compat;
 573                 struct user_msghdr __user       *umsg;
 574                 void __user                     *buf;
 575         };
 576         int                             msg_flags;
 577         int                             bgid;
 578         size_t                          len;
 579         struct io_buffer                *kbuf;
 580 };
 581
 582 struct io_open {
             /* Stored in the io_kiocb union as 'open'; 'file' must stay first */
 583         struct file                     *file;
 584         int                             dfd;
 585         u32                             file_slot;
 586         struct filename                 *filename;
 587         struct open_how                 how;
 588         unsigned long                   nofile;         /* NOTE(review): looks like a saved RLIMIT_NOFILE value -- confirm */
 589 };
 590
 591 struct io_rsrc_update {
             /* Stored in the io_kiocb union as 'rsrc_update'; 'file' must stay first */
 592         struct file                     *file;
 593         u64                             arg;
 594         u32                             nr_args;
 595         u32                             offset;
 596 };
 597
 598 struct io_fadvise {
             /* Stored in the io_kiocb union as 'fadvise'; 'file' must stay first */
 599         struct file                     *file;
 600         u64                             offset;
 601         u32                             len;            /* 32-bit length, narrower than the 64-bit offset */
 602         u32                             advice;
 603 };
 604
 605 struct io_madvise {
             /* Stored in the io_kiocb union as 'madvise'; 'file' must stay first */
 606         struct file                     *file;
 607         u64                             addr;
 608         u32                             len;            /* 32-bit length, narrower than the 64-bit address */
 609         u32                             advice;
 610 };
 611
 612 struct io_epoll {
             /* Stored in the io_kiocb union as 'epoll'; 'file' must stay first */
 613         struct file                     *file;
 614         int                             epfd;
 615         int                             op;
 616         int                             fd;
 617         struct epoll_event              event;
 618 };
 619
 620 struct io_splice {
             /*
              * Stored in the io_kiocb union as 'splice'. file_out occupies the
              * leading file-pointer slot that every union member shares.
              */
 621         struct file                     *file_out;
 622         struct file                     *file_in;
 623         loff_t                          off_out;
 624         loff_t                          off_in;
 625         u64                             len;
 626         unsigned int                    flags;
 627 };
 628
 629 struct io_provide_buf {
             /* Stored in the io_kiocb union as 'pbuf'; 'file' must stay first */
 630         struct file                     *file;
 631         __u64                           addr;
 632         __u32                           len;
 633         __u32                           bgid;
 634         __u16                           nbufs;
 635         __u16                           bid;            /* same width as io_buffer.bid */
 636 };
 637
 638 struct io_statx {
             /* Stored in the io_kiocb union as 'statx'; 'file' must stay first */
 639         struct file                     *file;
 640         int                             dfd;
 641         unsigned int                    mask;
 642         unsigned int                    flags;
 643         const char __user               *filename;      /* user-space pointer */
 644         struct statx __user             *buffer;        /* user-space pointer */
 645 };
 646
 647 struct io_shutdown {
             /* Stored in the io_kiocb union as 'shutdown'; 'file' must stay first */
 648         struct file                     *file;
 649         int                             how;
 650 };
 651
 652 struct io_rename {
             /* Stored in the io_kiocb union as 'rename'; 'file' must stay first */
 653         struct file                     *file;
 654         int                             old_dfd;
 655         int                             new_dfd;
 656         struct filename                 *oldpath;
 657         struct filename                 *newpath;
 658         int                             flags;
 659 };
 660
 661 struct io_unlink {
             /* Stored in the io_kiocb union as 'unlink'; 'file' must stay first */
 662         struct file                     *file;
 663         int                             dfd;
 664         int                             flags;
 665         struct filename                 *filename;
 666 };
 667
 668 struct io_completion {
             /*
              * Stored in the io_kiocb union as 'compl'. It overlays the
              * per-opcode data, which is why io_kiocb notes it may be used
              * only after that data has been cleaned (see io_clean_op()).
              */
 669         struct file                     *file;
 670         u32                             cflags;
 671 };
 672
 673 struct io_async_connect {
             /* Kernel-side sockaddr storage; NOTE(review): presumably reached via io_kiocb.async_data -- confirm */
 674         struct sockaddr_storage         address;
 675 };
 676
 677 struct io_async_msghdr {
             /*
              * Message header state with an inline iovec array of UIO_FASTIOV
              * entries. NOTE(review): presumably reached via
              * io_kiocb.async_data -- confirm.
              */
 678         struct iovec                    fast_iov[UIO_FASTIOV];
 679         /* points to an allocated iov, if NULL we use fast_iov instead */
 680         struct iovec                    *free_iov;
 681         struct sockaddr __user          *uaddr;         /* user-space pointer */
 682         struct msghdr                   msg;
 683         struct sockaddr_storage         addr;
 684 };
 685
 686 struct io_async_rw {
             /*
              * Read/write state: inline iovec array, iterator and a running
              * byte count. NOTE(review): presumably reached via
              * io_kiocb.async_data; free_iovec looks like the counterpart of
              * io_async_msghdr.free_iov -- confirm.
              */
 687         struct iovec                    fast_iov[UIO_FASTIOV];
 688         const struct iovec              *free_iovec;
 689         struct iov_iter                 iter;
 690         size_t                          bytes_done;
 691         struct wait_page_queue          wpq;
 692 };
 693
 694 enum {
             /*
              * Bit numbers for the REQ_F_* request flags. The first six reuse
              * the IOSQE_*_BIT values, so user SQE flags and the matching
              * request flags share bit positions.
              */
 695         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 696         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 697         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 698         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 699         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 700         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 701
 702         /*
              * first byte is taken by user flags, shift it to not overlap;
              * internal bits start at 8 and each following enumerator is one
              * higher
              */
 703         REQ_F_FAIL_BIT          = 8,
 704         REQ_F_INFLIGHT_BIT,
 705         REQ_F_CUR_POS_BIT,
 706         REQ_F_NOWAIT_BIT,
 707         REQ_F_LINK_TIMEOUT_BIT,
 708         REQ_F_NEED_CLEANUP_BIT,
 709         REQ_F_POLLED_BIT,
 710         REQ_F_BUFFER_SELECTED_BIT,
 711         REQ_F_COMPLETE_INLINE_BIT,
 712         REQ_F_REISSUE_BIT,
 713         REQ_F_DONT_REISSUE_BIT,
 714         REQ_F_CREDS_BIT,
 715         REQ_F_REFCOUNT_BIT,
 716         REQ_F_ARM_LTIMEOUT_BIT,
 717         /* keep async read/write and isreg together and in order */
 718         REQ_F_NOWAIT_READ_BIT,
 719         REQ_F_NOWAIT_WRITE_BIT,
 720         REQ_F_ISREG_BIT,
 721
 722         /*
              * not a real bit, just to check we're not overflowing the space;
              * as the final enumerator it equals the number of bit positions
              * in use (25 with the list above)
              */
 723         __REQ_F_LAST_BIT,
 724 };
 725
 726 enum {
             /*
              * Flag masks: each REQ_F_x is BIT(REQ_F_x_BIT). The order here
              * differs from the bit-number enum; the values come only from
              * the *_BIT constants, not from position in this list.
              */
 727         /* ctx owns file */
 728         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 729         /* drain existing IO first */
 730         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 731         /* linked sqes */
 732         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 733         /* doesn't sever on completion < 0 */
 734         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 735         /* IOSQE_ASYNC */
 736         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 737         /* IOSQE_BUFFER_SELECT */
 738         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 739
 740         /* fail rest of links */
 741         REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
 742         /* on inflight list, should be cancelled and waited on exit reliably */
 743         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 744         /* read/write uses file position */
 745         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 746         /* must not punt to workers */
 747         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 748         /* has or had linked timeout */
 749         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 750         /* needs cleanup */
 751         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 752         /* already went through poll handler */
 753         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 754         /* buffer already selected */
 755         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 756         /* completion is deferred through io_comp_state */
 757         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 758         /* caller should reissue async */
 759         REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
 760         /* don't attempt request reissue, see io_rw_reissue() */
 761         REQ_F_DONT_REISSUE      = BIT(REQ_F_DONT_REISSUE_BIT),
 762         /* supports async reads */
 763         REQ_F_NOWAIT_READ       = BIT(REQ_F_NOWAIT_READ_BIT),
 764         /* supports async writes */
 765         REQ_F_NOWAIT_WRITE      = BIT(REQ_F_NOWAIT_WRITE_BIT),
 766         /* regular file */
 767         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 768         /* has creds assigned */
 769         REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
 770         /* skip refcounting if not set */
 771         REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
 772         /* there is a linked timeout that has to be armed */
 773         REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
 774 };
 775
 776 struct async_poll {
             /* One embedded poll entry plus a pointer to an optional second one */
 777         struct io_poll_iocb     poll;
 778         struct io_poll_iocb     *double_poll;
 779 };
 780
 781 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); /* callback pointer type held in io_task_work.func */
 782
 783 struct io_task_work {
             /*
              * Queue linkage plus the callback to run. The two node types
              * share storage, so a request sits on at most one of the two
              * kinds of list at a time. NOTE(review): presumably
              * io_uring_task.task_list vs ctx->fallback_llist -- confirm.
              */
 784         union {
 785                 struct io_wq_work_node  node;
 786                 struct llist_node       fallback_node;
 787         };
 788         io_req_tw_func_t                func;
 789 };
 790
 791 enum {
             /* Resource type selectors: 0 for files, 1 for buffers */
 792         IORING_RSRC_FILE                = 0,
 793         IORING_RSRC_BUFFER              = 1,
 794 };
 795
 796 /*
 797  * NOTE! Each of the iocb union members has the file pointer
 798  * as the first entry in their struct definition. So you can
 799  * access the file pointer through any of the sub-structs,
 800  * or directly as just 'file' in this struct.
 801  */
 802 struct io_kiocb {
 803         union {
 804                 struct file             *file;
 805                 struct io_rw            rw;
 806                 struct io_poll_iocb     poll;
 807                 struct io_poll_update   poll_update;
 808                 struct io_accept        accept;
 809                 struct io_sync          sync;
 810                 struct io_cancel        cancel;
 811                 struct io_timeout       timeout;
 812                 struct io_timeout_rem   timeout_rem;
 813                 struct io_connect       connect;
 814                 struct io_sr_msg        sr_msg;
 815                 struct io_open          open;
 816                 struct io_close         close;
 817                 struct io_rsrc_update   rsrc_update;
 818                 struct io_fadvise       fadvise;
 819                 struct io_madvise       madvise;
 820                 struct io_epoll         epoll;
 821                 struct io_splice        splice;
 822                 struct io_provide_buf   pbuf;
 823                 struct io_statx         statx;
 824                 struct io_shutdown      shutdown;
 825                 struct io_rename        rename;
 826                 struct io_unlink        unlink;
 827                 /* use only after cleaning per-op data, see io_clean_op() */
 828                 struct io_completion    compl;
 829         };
 830
 831         /* opcode allocated if it needs to store data for async defer */
 832         void                            *async_data;
 833         u8                              opcode;
 834         /* polled IO has completed */
 835         u8                              iopoll_completed;
 836
 837         u16                             buf_index;
 838         u32                             result;
 839
 840         struct io_ring_ctx              *ctx;
 841         unsigned int                    flags;
 842         atomic_t                        refs;
 843         struct task_struct              *task;
 844         u64                             user_data;
 845
 846         struct io_kiocb                 *link;
 847         struct percpu_ref               *fixed_rsrc_refs;
 848
 849         /* used with ctx->iopoll_list with reads/writes */
 850         struct list_head                inflight_entry;
 851         struct io_task_work             io_task_work;
 852         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 853         struct hlist_node               hash_node;
 854         struct async_poll               *apoll;
 855         struct io_wq_work               work;
 856         const struct cred               *creds;
 857
 858         /* store used ubuf, so we can prevent reloading */
 859         struct io_mapped_ubuf           *imu;
 860 };
 861
/* Associates a task with a ring context. */
struct io_tctx_node {
        /* list linkage (NOTE(review): presumably on ctx->tctx_list — confirm) */
        struct list_head        ctx_node;
        struct task_struct      *task;
        struct io_ring_ctx      *ctx;
};
 867
/* A deferred request queued on ctx->defer_list, see io_queue_deferred(). */
struct io_defer_entry {
        /* linkage on ctx->defer_list */
        struct list_head        list;
        /* the deferred request */
        struct io_kiocb         *req;
        /* sequence value passed to req_need_defer() to decide when to run */
        u32                     seq;
};
 873
/* Static per-opcode properties; one entry per IORING_OP_* in io_op_defs[]. */
struct io_op_def {
        /* needs req->file assigned */
        unsigned                needs_file : 1;
        /* hash wq insertion if file is a regular file */
        unsigned                hash_reg_file : 1;
        /* unbound wq insertion if file is a non-regular file */
        unsigned                unbound_nonreg_file : 1;
        /* opcode is not supported by this kernel */
        unsigned                not_supported : 1;
        /* set if opcode supports polled "wait" (pollin: for input, pollout: for output) */
        unsigned                pollin : 1;
        unsigned                pollout : 1;
        /* op supports buffer selection */
        unsigned                buffer_select : 1;
        /* do prep async if is going to be punted */
        unsigned                needs_async_setup : 1;
        /* should block plug */
        unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
};
 895
 896 static const struct io_op_def io_op_defs[] = {
 897         [IORING_OP_NOP] = {},
 898         [IORING_OP_READV] = {
 899                 .needs_file             = 1,
 900                 .unbound_nonreg_file    = 1,
 901                 .pollin                 = 1,
 902                 .buffer_select          = 1,
 903                 .needs_async_setup      = 1,
 904                 .plug                   = 1,
 905                 .async_size             = sizeof(struct io_async_rw),
 906         },
 907         [IORING_OP_WRITEV] = {
 908                 .needs_file             = 1,
 909                 .hash_reg_file          = 1,
 910                 .unbound_nonreg_file    = 1,
 911                 .pollout                = 1,
 912                 .needs_async_setup      = 1,
 913                 .plug                   = 1,
 914                 .async_size             = sizeof(struct io_async_rw),
 915         },
 916         [IORING_OP_FSYNC] = {
 917                 .needs_file             = 1,
 918         },
 919         [IORING_OP_READ_FIXED] = {
 920                 .needs_file             = 1,
 921                 .unbound_nonreg_file    = 1,
 922                 .pollin                 = 1,
 923                 .plug                   = 1,
 924                 .async_size             = sizeof(struct io_async_rw),
 925         },
 926         [IORING_OP_WRITE_FIXED] = {
 927                 .needs_file             = 1,
 928                 .hash_reg_file          = 1,
 929                 .unbound_nonreg_file    = 1,
 930                 .pollout                = 1,
 931                 .plug                   = 1,
 932                 .async_size             = sizeof(struct io_async_rw),
 933         },
 934         [IORING_OP_POLL_ADD] = {
 935                 .needs_file             = 1,
 936                 .unbound_nonreg_file    = 1,
 937         },
 938         [IORING_OP_POLL_REMOVE] = {},
 939         [IORING_OP_SYNC_FILE_RANGE] = {
 940                 .needs_file             = 1,
 941         },
 942         [IORING_OP_SENDMSG] = {
 943                 .needs_file             = 1,
 944                 .unbound_nonreg_file    = 1,
 945                 .pollout                = 1,
 946                 .needs_async_setup      = 1,
 947                 .async_size             = sizeof(struct io_async_msghdr),
 948         },
 949         [IORING_OP_RECVMSG] = {
 950                 .needs_file             = 1,
 951                 .unbound_nonreg_file    = 1,
 952                 .pollin                 = 1,
 953                 .buffer_select          = 1,
 954                 .needs_async_setup      = 1,
 955                 .async_size             = sizeof(struct io_async_msghdr),
 956         },
 957         [IORING_OP_TIMEOUT] = {
 958                 .async_size             = sizeof(struct io_timeout_data),
 959         },
 960         [IORING_OP_TIMEOUT_REMOVE] = {
 961                 /* used by timeout updates' prep() */
 962         },
 963         [IORING_OP_ACCEPT] = {
 964                 .needs_file             = 1,
 965                 .unbound_nonreg_file    = 1,
 966                 .pollin                 = 1,
 967         },
 968         [IORING_OP_ASYNC_CANCEL] = {},
 969         [IORING_OP_LINK_TIMEOUT] = {
 970                 .async_size             = sizeof(struct io_timeout_data),
 971         },
 972         [IORING_OP_CONNECT] = {
 973                 .needs_file             = 1,
 974                 .unbound_nonreg_file    = 1,
 975                 .pollout                = 1,
 976                 .needs_async_setup      = 1,
 977                 .async_size             = sizeof(struct io_async_connect),
 978         },
 979         [IORING_OP_FALLOCATE] = {
 980                 .needs_file             = 1,
 981         },
 982         [IORING_OP_OPENAT] = {},
 983         [IORING_OP_CLOSE] = {},
 984         [IORING_OP_FILES_UPDATE] = {},
 985         [IORING_OP_STATX] = {},
 986         [IORING_OP_READ] = {
 987                 .needs_file             = 1,
 988                 .unbound_nonreg_file    = 1,
 989                 .pollin                 = 1,
 990                 .buffer_select          = 1,
 991                 .plug                   = 1,
 992                 .async_size             = sizeof(struct io_async_rw),
 993         },
 994         [IORING_OP_WRITE] = {
 995                 .needs_file             = 1,
 996                 .unbound_nonreg_file    = 1,
 997                 .pollout                = 1,
 998                 .plug                   = 1,
 999                 .async_size             = sizeof(struct io_async_rw),
1000         },
1001         [IORING_OP_FADVISE] = {
1002                 .needs_file             = 1,
1003         },
1004         [IORING_OP_MADVISE] = {},
1005         [IORING_OP_SEND] = {
1006                 .needs_file             = 1,
1007                 .unbound_nonreg_file    = 1,
1008                 .pollout                = 1,
1009         },
1010         [IORING_OP_RECV] = {
1011                 .needs_file             = 1,
1012                 .unbound_nonreg_file    = 1,
1013                 .pollin                 = 1,
1014                 .buffer_select          = 1,
1015         },
1016         [IORING_OP_OPENAT2] = {
1017         },
1018         [IORING_OP_EPOLL_CTL] = {
1019                 .unbound_nonreg_file    = 1,
1020         },
1021         [IORING_OP_SPLICE] = {
1022                 .needs_file             = 1,
1023                 .hash_reg_file          = 1,
1024                 .unbound_nonreg_file    = 1,
1025         },
1026         [IORING_OP_PROVIDE_BUFFERS] = {},
1027         [IORING_OP_REMOVE_BUFFERS] = {},
1028         [IORING_OP_TEE] = {
1029                 .needs_file             = 1,
1030                 .hash_reg_file          = 1,
1031                 .unbound_nonreg_file    = 1,
1032         },
1033         [IORING_OP_SHUTDOWN] = {
1034                 .needs_file             = 1,
1035         },
1036         [IORING_OP_RENAMEAT] = {},
1037         [IORING_OP_UNLINKAT] = {},
1038 };
1039
/* requests with any of these flags set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
1042
1043 static bool io_disarm_next(struct io_kiocb *req);
1044 static void io_uring_del_tctx_node(unsigned long index);
1045 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1046                                          struct task_struct *task,
1047                                          bool cancel_all);
1048 static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
1049
1050 static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1051                                  long res, unsigned int cflags);
1052 static void io_put_req(struct io_kiocb *req);
1053 static void io_put_req_deferred(struct io_kiocb *req);
1054 static void io_dismantle_req(struct io_kiocb *req);
1055 static void io_queue_linked_timeout(struct io_kiocb *req);
1056 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
1057                                      struct io_uring_rsrc_update2 *up,
1058                                      unsigned nr_args);
1059 static void io_clean_op(struct io_kiocb *req);
1060 static struct file *io_file_get(struct io_ring_ctx *ctx,
1061                                 struct io_kiocb *req, int fd, bool fixed);
1062 static void __io_queue_sqe(struct io_kiocb *req);
1063 static void io_rsrc_put_work(struct work_struct *work);
1064
1065 static void io_req_task_queue(struct io_kiocb *req);
1066 static void io_submit_flush_completions(struct io_ring_ctx *ctx);
1067 static int io_req_prep_async(struct io_kiocb *req);
1068
1069 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
1070                                  unsigned int issue_flags, u32 slot_index);
1071
1072 static struct kmem_cache *req_cachep;
1073
1074 static const struct file_operations io_uring_fops;
1075
1076 struct sock *io_uring_get_socket(struct file *file)
1077 {
1078 #if defined(CONFIG_UNIX)
1079         if (file->f_op == &io_uring_fops) {
1080                 struct io_ring_ctx *ctx = file->private_data;
1081
1082                 return ctx->ring_sock->sk;
1083         }
1084 #endif
1085         return NULL;
1086 }
1087 EXPORT_SYMBOL(io_uring_get_socket);
1088
1089 static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1090 {
1091         if (!*locked) {
1092                 mutex_lock(&ctx->uring_lock);
1093                 *locked = true;
1094         }
1095 }
1096
/* Iterate @pos over @head and every request chained behind it via ->link. */
#define io_for_each_link(pos, head) \
        for (pos = (head); pos; pos = pos->link)
1099
/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 *
 * Evaluates to true when the refcount, read as unsigned, lies in
 * [-127, 0], i.e. it is zero or has nearly wrapped around.
 */
#define req_ref_zero_or_close_to_overflow(req)  \
        ((unsigned int) atomic_read(&((req)->refs)) + 127u <= 127u)
1106
1107 static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1108 {
1109         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1110         return atomic_inc_not_zero(&req->refs);
1111 }
1112
1113 static inline bool req_ref_put_and_test(struct io_kiocb *req)
1114 {
1115         if (likely(!(req->flags & REQ_F_REFCOUNT)))
1116                 return true;
1117
1118         WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1119         return atomic_dec_and_test(&req->refs);
1120 }
1121
1122 static inline void req_ref_put(struct io_kiocb *req)
1123 {
1124         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1125         WARN_ON_ONCE(req_ref_put_and_test(req));
1126 }
1127
1128 static inline void req_ref_get(struct io_kiocb *req)
1129 {
1130         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1131         WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1132         atomic_inc(&req->refs);
1133 }
1134
1135 static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
1136 {
1137         if (!(req->flags & REQ_F_REFCOUNT)) {
1138                 req->flags |= REQ_F_REFCOUNT;
1139                 atomic_set(&req->refs, nr);
1140         }
1141 }
1142
/* Enable refcounting on @req with a single initial reference. */
static inline void io_req_set_refcount(struct io_kiocb *req)
{
        const int initial_refs = 1;

        __io_req_set_refcount(req, initial_refs);
}
1147
1148 static inline void io_req_set_rsrc_node(struct io_kiocb *req)
1149 {
1150         struct io_ring_ctx *ctx = req->ctx;
1151
1152         if (!req->fixed_rsrc_refs) {
1153                 req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
1154                 percpu_ref_get(req->fixed_rsrc_refs);
1155         }
1156 }
1157
/*
 * Resurrect a percpu ref. If a temporary reference can still be taken the
 * ref is resurrected right away and the temporary reference dropped;
 * otherwise the ref already hit zero and we wait on @compl first.
 */
static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
{
        if (percpu_ref_tryget(ref)) {
                percpu_ref_resurrect(ref);
                percpu_ref_put(ref);
                return;
        }

        /* already at zero, wait for ->release() */
        wait_for_completion(compl);
        percpu_ref_resurrect(ref);
}
1169
1170 static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1171                           bool cancel_all)
1172 {
1173         struct io_kiocb *req;
1174
1175         if (task && head->task != task)
1176                 return false;
1177         if (cancel_all)
1178                 return true;
1179
1180         io_for_each_link(req, head) {
1181                 if (req->flags & REQ_F_INFLIGHT)
1182                         return true;
1183         }
1184         return false;
1185 }
1186
1187 static inline void req_set_fail(struct io_kiocb *req)
1188 {
1189         req->flags |= REQ_F_FAIL;
1190 }
1191
1192 static inline void req_fail_link_node(struct io_kiocb *req, int res)
1193 {
1194         req_set_fail(req);
1195         req->result = res;
1196 }
1197
1198 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1199 {
1200         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1201
1202         complete(&ctx->ref_comp);
1203 }
1204
1205 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1206 {
1207         return !req->timeout.off;
1208 }
1209
1210 static void io_fallback_req_func(struct work_struct *work)
1211 {
1212         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1213                                                 fallback_work.work);
1214         struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1215         struct io_kiocb *req, *tmp;
1216         bool locked = false;
1217
1218         percpu_ref_get(&ctx->refs);
1219         llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
1220                 req->io_task_work.func(req, &locked);
1221
1222         if (locked) {
1223                 if (ctx->submit_state.compl_nr)
1224                         io_submit_flush_completions(ctx);
1225                 mutex_unlock(&ctx->uring_lock);
1226         }
1227         percpu_ref_put(&ctx->refs);
1228
1229 }
1230
1231 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1232 {
1233         struct io_ring_ctx *ctx;
1234         int hash_bits;
1235
1236         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1237         if (!ctx)
1238                 return NULL;
1239
1240         /*
1241          * Use 5 bits less than the max cq entries, that should give us around
1242          * 32 entries per hash list if totally full and uniformly spread.
1243          */
1244         hash_bits = ilog2(p->cq_entries);
1245         hash_bits -= 5;
1246         if (hash_bits <= 0)
1247                 hash_bits = 1;
1248         ctx->cancel_hash_bits = hash_bits;
1249         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1250                                         GFP_KERNEL);
1251         if (!ctx->cancel_hash)
1252                 goto err;
1253         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1254
1255         ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1256         if (!ctx->dummy_ubuf)
1257                 goto err;
1258         /* set invalid range, so io_import_fixed() fails meeting it */
1259         ctx->dummy_ubuf->ubuf = -1UL;
1260
1261         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1262                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1263                 goto err;
1264
1265         ctx->flags = p->flags;
1266         init_waitqueue_head(&ctx->sqo_sq_wait);
1267         INIT_LIST_HEAD(&ctx->sqd_list);
1268         init_waitqueue_head(&ctx->poll_wait);
1269         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1270         init_completion(&ctx->ref_comp);
1271         xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1272         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1273         mutex_init(&ctx->uring_lock);
1274         init_waitqueue_head(&ctx->cq_wait);
1275         spin_lock_init(&ctx->completion_lock);
1276         spin_lock_init(&ctx->timeout_lock);
1277         INIT_LIST_HEAD(&ctx->iopoll_list);
1278         INIT_LIST_HEAD(&ctx->defer_list);
1279         INIT_LIST_HEAD(&ctx->timeout_list);
1280         spin_lock_init(&ctx->rsrc_ref_lock);
1281         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1282         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1283         init_llist_head(&ctx->rsrc_put_llist);
1284         INIT_LIST_HEAD(&ctx->tctx_list);
1285         INIT_LIST_HEAD(&ctx->submit_state.free_list);
1286         INIT_LIST_HEAD(&ctx->locked_free_list);
1287         INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1288         return ctx;
1289 err:
1290         kfree(ctx->dummy_ubuf);
1291         kfree(ctx->cancel_hash);
1292         kfree(ctx);
1293         return NULL;
1294 }
1295
1296 static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1297 {
1298         struct io_rings *r = ctx->rings;
1299
1300         WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1301         ctx->cq_extra--;
1302 }
1303
1304 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1305 {
1306         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1307                 struct io_ring_ctx *ctx = req->ctx;
1308
1309                 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
1310         }
1311
1312         return false;
1313 }
1314
/*
 * Low-order flag bits and the mask that clears them. FFS_ISREG is only a
 * real bit on 64-bit builds; elsewhere it is defined as 0.
 * NOTE(review): presumably these are packed into fixed-file table
 * pointers — confirm against the users of FFS_MASK.
 */
#define FFS_ASYNC_READ          0x1UL
#define FFS_ASYNC_WRITE         0x2UL
#ifdef CONFIG_64BIT
#define FFS_ISREG               0x4UL
#else
#define FFS_ISREG               0x0UL
#endif
#define FFS_MASK                ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
1323
1324 static inline bool io_req_ffs_set(struct io_kiocb *req)
1325 {
1326         return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
1327 }
1328
1329 static void io_req_track_inflight(struct io_kiocb *req)
1330 {
1331         if (!(req->flags & REQ_F_INFLIGHT)) {
1332                 req->flags |= REQ_F_INFLIGHT;
1333                 atomic_inc(&current->io_uring->inflight_tracked);
1334         }
1335 }
1336
1337 static inline void io_unprep_linked_timeout(struct io_kiocb *req)
1338 {
1339         req->flags &= ~REQ_F_LINK_TIMEOUT;
1340 }
1341
1342 static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1343 {
1344         if (WARN_ON_ONCE(!req->link))
1345                 return NULL;
1346
1347         req->flags &= ~REQ_F_ARM_LTIMEOUT;
1348         req->flags |= REQ_F_LINK_TIMEOUT;
1349
1350         /* linked timeouts should have two refs once prep'ed */
1351         io_req_set_refcount(req);
1352         __io_req_set_refcount(req->link, 2);
1353         return req->link;
1354 }
1355
1356 static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1357 {
1358         if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
1359                 return NULL;
1360         return __io_prep_linked_timeout(req);
1361 }
1362
1363 static void io_prep_async_work(struct io_kiocb *req)
1364 {
1365         const struct io_op_def *def = &io_op_defs[req->opcode];
1366         struct io_ring_ctx *ctx = req->ctx;
1367
1368         if (!(req->flags & REQ_F_CREDS)) {
1369                 req->flags |= REQ_F_CREDS;
1370                 req->creds = get_current_cred();
1371         }
1372
1373         req->work.list.next = NULL;
1374         req->work.flags = 0;
1375         if (req->flags & REQ_F_FORCE_ASYNC)
1376                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1377
1378         if (req->flags & REQ_F_ISREG) {
1379                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1380                         io_wq_hash_work(&req->work, file_inode(req->file));
1381         } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1382                 if (def->unbound_nonreg_file)
1383                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1384         }
1385
1386         switch (req->opcode) {
1387         case IORING_OP_SPLICE:
1388         case IORING_OP_TEE:
1389                 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
1390                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1391                 break;
1392         }
1393 }
1394
1395 static void io_prep_async_link(struct io_kiocb *req)
1396 {
1397         struct io_kiocb *cur;
1398
1399         if (req->flags & REQ_F_LINK_TIMEOUT) {
1400                 struct io_ring_ctx *ctx = req->ctx;
1401
1402                 spin_lock(&ctx->completion_lock);
1403                 io_for_each_link(cur, req)
1404                         io_prep_async_work(cur);
1405                 spin_unlock(&ctx->completion_lock);
1406         } else {
1407                 io_for_each_link(cur, req)
1408                         io_prep_async_work(cur);
1409         }
1410 }
1411
/*
 * Punt @req (and its link chain) to the submitting task's io-wq.
 *
 * The linked timeout, if any, is prepared first and queued only after
 * the request itself has been enqueued. @locked is accepted for
 * signature compatibility with task-work callbacks but is never
 * dereferenced here.
 */
static void io_queue_async_work(struct io_kiocb *req, bool *locked)
{
        struct io_ring_ctx *ctx = req->ctx;
        /* prepares the linked timeout as a side effect; NULL if there is none */
        struct io_kiocb *link = io_prep_linked_timeout(req);
        struct io_uring_task *tctx = req->task->io_uring;

        /* must not take the lock, NULL it as a precaution */
        locked = NULL;

        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);

        /* init ->work of the whole link before punting */
        io_prep_async_link(req);

        /*
         * Not expected to happen, but if we do have a bug where this _can_
         * happen, catch it here and ensure the request is marked as
         * canceled. That will make io-wq go through the usual work cancel
         * procedure rather than attempt to run this request (or create a new
         * worker for it).
         */
        if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
                req->work.flags |= IO_WQ_WORK_CANCEL;

        trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
                                        &req->work, req->flags);
        io_wq_enqueue(tctx->io_wq, &req->work);
        /* arm the linked timeout only once the request has been enqueued */
        if (link)
                io_queue_linked_timeout(link);
}
1443
1444 static void io_kill_timeout(struct io_kiocb *req, int status)
1445         __must_hold(&req->ctx->completion_lock)
1446         __must_hold(&req->ctx->timeout_lock)
1447 {
1448         struct io_timeout_data *io = req->async_data;
1449
1450         if (hrtimer_try_to_cancel(&io->timer) != -1) {
1451                 atomic_set(&req->ctx->cq_timeouts,
1452                         atomic_read(&req->ctx->cq_timeouts) + 1);
1453                 list_del_init(&req->timeout.list);
1454                 io_cqring_fill_event(req->ctx, req->user_data, status, 0);
1455                 io_put_req_deferred(req);
1456         }
1457 }
1458
1459 static void io_queue_deferred(struct io_ring_ctx *ctx)
1460 {
1461         while (!list_empty(&ctx->defer_list)) {
1462                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1463                                                 struct io_defer_entry, list);
1464
1465                 if (req_need_defer(de->req, de->seq))
1466                         break;
1467                 list_del_init(&de->list);
1468                 io_req_task_queue(de->req);
1469                 kfree(de);
1470         }
1471 }
1472
/*
 * Complete sequence-based timeouts whose target CQ sequence has been
 * reached. Walks ctx->timeout_list from the head under ctx->timeout_lock
 * and stops at the first timeout that either has no sequence target or
 * has not yet seen enough completions.
 * NOTE(review): the early breaks presumably rely on the list being kept
 * ordered by target sequence with noseq timeouts last — confirm at the
 * insertion site.
 */
static void io_flush_timeouts(struct io_ring_ctx *ctx)
        __must_hold(&ctx->completion_lock)
{
        /* completions posted so far, not counting completed timeouts */
        u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);

        spin_lock_irq(&ctx->timeout_lock);
        while (!list_empty(&ctx->timeout_list)) {
                u32 events_needed, events_got;
                struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
                                                struct io_kiocb, timeout.list);

                if (io_is_timeout_noseq(req))
                        break;

                /*
                 * Since seq can easily wrap around over time, subtract
                 * the last seq at which timeouts were flushed before comparing.
                 * Assuming not more than 2^31-1 events have happened since,
                 * these subtractions won't have wrapped, so we can check if
                 * target is in [last_seq, current_seq] by comparing the two.
                 */
                events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
                events_got = seq - ctx->cq_last_tm_flush;
                if (events_got < events_needed)
                        break;

                list_del_init(&req->timeout.list);
                /* status 0: the timeout completed because its event count was reached */
                io_kill_timeout(req, 0);
        }
        /* remember the flush point for the next wraparound-safe comparison */
        ctx->cq_last_tm_flush = seq;
        spin_unlock_irq(&ctx->timeout_lock);
}
1505
1506 static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
1507 {
1508         if (ctx->off_timeout_used)
1509                 io_flush_timeouts(ctx);
1510         if (ctx->drain_active)
1511                 io_queue_deferred(ctx);
1512 }
1513
/*
 * Publish the cached CQ tail to the shared ring. Timeouts and deferred
 * requests are flushed first when the ring uses either feature.
 */
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
        if (unlikely(ctx->off_timeout_used || ctx->drain_active))
                __io_commit_cqring_flush(ctx);
        /* order cqe stores with ring update */
        smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
1521
1522 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1523 {
1524         struct io_rings *r = ctx->rings;
1525
1526         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
1527 }
1528
1529 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1530 {
1531         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1532 }
1533
/*
 * Reserve the next CQ ring slot.
 *
 * Returns a pointer to the slot and advances ctx->cached_cq_tail, or NULL
 * if the ring is full. The new tail is not visible to userspace until
 * io_commit_cqring() publishes it.
 */
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
        struct io_rings *rings = ctx->rings;
        /* the index is masked with cq_entries - 1, i.e. it wraps within the ring */
        unsigned tail, mask = ctx->cq_entries - 1;

        /*
         * writes to the cq entry need to come after reading head; the
         * control dependency is enough as we're using WRITE_ONCE to
         * fill the cq entry
         */
        if (__io_cqring_events(ctx) == ctx->cq_entries)
                return NULL;

        tail = ctx->cached_cq_tail++;
        return &rings->cqes[tail & mask];
}
1550
1551 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1552 {
1553         if (likely(!ctx->cq_ev_fd))
1554                 return false;
1555         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1556                 return false;
1557         return !ctx->eventfd_async || io_wq_current_is_worker();
1558 }
1559
/*
 * This should only get called when at least one event has been posted.
 * Some applications rely on the eventfd notification count only changing
 * IFF a new CQE has been added to the CQ ring. There's no dependency on
 * a 1:1 relationship between how many times this function is called (and
 * hence the eventfd count) and number of CQEs posted to the CQ ring.
 *
 * Wakes, in order: waiters on ctx->cq_wait, the SQ data wait queue (if
 * the ring has sq_data), the eventfd, and pollers/fasync listeners on
 * the ring fd.
 */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
        /*
         * wake_up_all() may seem excessive, but io_wake_function() and
         * io_should_wake() handle the termination of the loop and only
         * wake as many waiters as we need to.
         */
        if (wq_has_sleeper(&ctx->cq_wait))
                wake_up_all(&ctx->cq_wait);
        if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        if (io_should_trigger_evfd(ctx))
                eventfd_signal(ctx->cq_ev_fd, 1);
        if (waitqueue_active(&ctx->poll_wait)) {
                wake_up_interruptible(&ctx->poll_wait);
                kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
        }
}
1585
1586 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1587 {
1588         if (ctx->flags & IORING_SETUP_SQPOLL) {
1589                 if (wq_has_sleeper(&ctx->cq_wait))
1590                         wake_up_all(&ctx->cq_wait);
1591         }
1592         if (io_should_trigger_evfd(ctx))
1593                 eventfd_signal(ctx->cq_ev_fd, 1);
1594         if (waitqueue_active(&ctx->poll_wait)) {
1595                 wake_up_interruptible(&ctx->poll_wait);
1596                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1597         }
1598 }
1599
/*
 * Move backlogged completions from ctx->cq_overflow_list into the CQ ring.
 *
 * Without @force the flush stops as soon as the ring is full. With @force
 * every backlogged entry is consumed: entries that do not fit are dropped
 * and accounted via io_account_cq_overflow().
 *
 * Returns true if there are no backlogged entries after the flush.
 */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
        bool all_flushed, posted;

        /* lockless early exit: the ring is full and we may not drop entries */
        if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
                return false;

        posted = false;
        spin_lock(&ctx->completion_lock);
        while (!list_empty(&ctx->cq_overflow_list)) {
                struct io_uring_cqe *cqe = io_get_cqe(ctx);
                struct io_overflow_cqe *ocqe;

                if (!cqe && !force)
                        break;
                ocqe = list_first_entry(&ctx->cq_overflow_list,
                                        struct io_overflow_cqe, list);
                if (cqe)
                        memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
                else
                        /* forced flush with a full ring: the entry is lost */
                        io_account_cq_overflow(ctx);

                posted = true;
                list_del(&ocqe->list);
                kfree(ocqe);
        }

        all_flushed = list_empty(&ctx->cq_overflow_list);
        if (all_flushed) {
                /* backlog drained: clear the overflow indications */
                clear_bit(0, &ctx->check_cq_overflow);
                WRITE_ONCE(ctx->rings->sq_flags,
                           ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
        }

        if (posted)
                io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);
        /* wake waiters only after completion_lock has been dropped */
        if (posted)
                io_cqring_ev_posted(ctx);
        return all_flushed;
}
1642
1643 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
1644 {
1645         bool ret = true;
1646
1647         if (test_bit(0, &ctx->check_cq_overflow)) {
1648                 /* iopoll syncs against uring_lock, not completion_lock */
1649                 if (ctx->flags & IORING_SETUP_IOPOLL)
1650                         mutex_lock(&ctx->uring_lock);
1651                 ret = __io_cqring_overflow_flush(ctx, false);
1652                 if (ctx->flags & IORING_SETUP_IOPOLL)
1653                         mutex_unlock(&ctx->uring_lock);
1654         }
1655
1656         return ret;
1657 }
1658
1659 /* must to be called somewhat shortly after putting a request */
1660 static inline void io_put_task(struct task_struct *task, int nr)
1661 {
1662         struct io_uring_task *tctx = task->io_uring;
1663
1664         if (likely(task == current)) {
1665                 tctx->cached_refs += nr;
1666         } else {
1667                 percpu_counter_sub(&tctx->inflight, nr);
1668                 if (unlikely(atomic_read(&tctx->in_idle)))
1669                         wake_up(&tctx->wait);
1670                 put_task_struct_many(task, nr);
1671         }
1672 }
1673
1674 static void io_task_refs_refill(struct io_uring_task *tctx)
1675 {
1676         unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
1677
1678         percpu_counter_add(&tctx->inflight, refill);
1679         refcount_add(refill, &current->usage);
1680         tctx->cached_refs += refill;
1681 }
1682
1683 static inline void io_get_task_refs(int nr)
1684 {
1685         struct io_uring_task *tctx = current->io_uring;
1686
1687         tctx->cached_refs -= nr;
1688         if (unlikely(tctx->cached_refs < 0))
1689                 io_task_refs_refill(tctx);
1690 }
1691
1692 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
1693                                      long res, unsigned int cflags)
1694 {
1695         struct io_overflow_cqe *ocqe;
1696
1697         ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1698         if (!ocqe) {
1699                 /*
1700                  * If we're in ring overflow flush mode, or in task cancel mode,
1701                  * or cannot allocate an overflow entry, then we need to drop it
1702                  * on the floor.
1703                  */
1704                 io_account_cq_overflow(ctx);
1705                 return false;
1706         }
1707         if (list_empty(&ctx->cq_overflow_list)) {
1708                 set_bit(0, &ctx->check_cq_overflow);
1709                 WRITE_ONCE(ctx->rings->sq_flags,
1710                            ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
1711
1712         }
1713         ocqe->cqe.user_data = user_data;
1714         ocqe->cqe.res = res;
1715         ocqe->cqe.flags = cflags;
1716         list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1717         return true;
1718 }
1719
1720 static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1721                                           long res, unsigned int cflags)
1722 {
1723         struct io_uring_cqe *cqe;
1724
1725         trace_io_uring_complete(ctx, user_data, res, cflags);
1726
1727         /*
1728          * If we can't get a cq entry, userspace overflowed the
1729          * submission (by quite a lot). Increment the overflow count in
1730          * the ring.
1731          */
1732         cqe = io_get_cqe(ctx);
1733         if (likely(cqe)) {
1734                 WRITE_ONCE(cqe->user_data, user_data);
1735                 WRITE_ONCE(cqe->res, res);
1736                 WRITE_ONCE(cqe->flags, cflags);
1737                 return true;
1738         }
1739         return io_cqring_event_overflow(ctx, user_data, res, cflags);
1740 }
1741
/*
 * Out-of-line variant of __io_cqring_fill_event() for call sites that are
 * not as hot to bloat with inlining. Arguments and return value are passed
 * through unchanged.
 */
static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
                                          long res, unsigned int cflags)
{
        return __io_cqring_fill_event(ctx, user_data, res, cflags);
}
1748
/*
 * Post the completion (@res, @cflags) for @req and drop one reference to it,
 * all under ->completion_lock. If that was the last reference the request
 * is torn down and parked on ->locked_free_list for reuse.
 */
static void io_req_complete_post(struct io_kiocb *req, long res,
                                 unsigned int cflags)
{
        struct io_ring_ctx *ctx = req->ctx;

        spin_lock(&ctx->completion_lock);
        __io_cqring_fill_event(ctx, req->user_data, res, cflags);
        /*
         * If we're the last reference to this request, add to our locked
         * free_list cache.
         */
        if (req_ref_put_and_test(req)) {
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
                        if (req->flags & IO_DISARM_MASK)
                                io_disarm_next(req);
                        /* hand the next request of the chain over to task_work */
                        if (req->link) {
                                io_req_task_queue(req->link);
                                req->link = NULL;
                        }
                }
                io_dismantle_req(req);
                io_put_task(req->task, 1);
                list_add(&req->inflight_entry, &ctx->locked_free_list);
                ctx->locked_free_nr++;
        } else {
                /*
                 * Not the last reference: try to take a ctx reference of our
                 * own for the notification below. From here on @req only
                 * serves as a "notify and put" flag.
                 */
                if (!percpu_ref_tryget(&ctx->refs))
                        req = NULL;
        }
        io_commit_cqring(ctx);
        spin_unlock(&ctx->completion_lock);

        /* skipped entirely when the tryget above failed */
        if (req) {
                io_cqring_ev_posted(ctx);
                percpu_ref_put(&ctx->refs);
        }
}
1785
/* True if @req has any of the IO_REQ_CLEAN_FLAGS bits set in ->flags. */
static inline bool io_req_needs_clean(struct io_kiocb *req)
{
        return req->flags & IO_REQ_CLEAN_FLAGS;
}
1790
/*
 * Record a completion on the request itself instead of posting it: the
 * result and CQE flags are stored in @req and REQ_F_COMPLETE_INLINE is set.
 * No CQE is written here.
 */
static void io_req_complete_state(struct io_kiocb *req, long res,
                                  unsigned int cflags)
{
        if (io_req_needs_clean(req))
                io_clean_op(req);
        req->result = res;
        req->compl.cflags = cflags;
        req->flags |= REQ_F_COMPLETE_INLINE;
}
1800
1801 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1802                                      long res, unsigned cflags)
1803 {
1804         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1805                 io_req_complete_state(req, res, cflags);
1806         else
1807                 io_req_complete_post(req, res, cflags);
1808 }
1809
/* Complete @req with @res, using no issue flags and no CQE flags. */
static inline void io_req_complete(struct io_kiocb *req, long res)
{
        __io_req_complete(req, 0, res, 0);
}
1814
/* Mark @req as failed, then post its completion with @res and no CQE flags. */
static void io_req_complete_failed(struct io_kiocb *req, long res)
{
        req_set_fail(req);
        io_req_complete_post(req, res, 0);
}
1820
1821 /*
1822  * Don't initialise the fields below on every allocation, but do that in
1823  * advance and keep them valid across allocations.
1824  */
1825 static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1826 {
1827         req->ctx = ctx;
1828         req->link = NULL;
1829         req->async_data = NULL;
1830         /* not necessary, but safer to zero */
1831         req->result = 0;
1832 }
1833
/*
 * Move every request parked on ctx->locked_free_list over to
 * state->free_list and reset ->locked_free_nr, under ->completion_lock.
 */
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
                                        struct io_submit_state *state)
{
        spin_lock(&ctx->completion_lock);
        list_splice_init(&ctx->locked_free_list, &state->free_list);
        ctx->locked_free_nr = 0;
        spin_unlock(&ctx->completion_lock);
}
1842
1843 /* Returns true IFF there are requests in the cache */
1844 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1845 {
1846         struct io_submit_state *state = &ctx->submit_state;
1847         int nr;
1848
1849         /*
1850          * If we have more than a batch's worth of requests in our IRQ side
1851          * locked cache, grab the lock and move them over to our submission
1852          * side cache.
1853          */
1854         if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
1855                 io_flush_cached_locked_reqs(ctx, state);
1856
1857         nr = state->free_reqs;
1858         while (!list_empty(&state->free_list)) {
1859                 struct io_kiocb *req = list_first_entry(&state->free_list,
1860                                         struct io_kiocb, inflight_entry);
1861
1862                 list_del(&req->inflight_entry);
1863                 state->reqs[nr++] = req;
1864                 if (nr == ARRAY_SIZE(state->reqs))
1865                         break;
1866         }
1867
1868         state->free_reqs = nr;
1869         return nr != 0;
1870 }
1871
1872 /*
1873  * A request might get retired back into the request caches even before opcode
1874  * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
1875  * Because of that, io_alloc_req() should be called only under ->uring_lock
1876  * and with extra caution to not get a request that is still worked on.
1877  */
1878 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1879         __must_hold(&ctx->uring_lock)
1880 {
1881         struct io_submit_state *state = &ctx->submit_state;
1882         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1883         int ret, i;
1884
1885         BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
1886
1887         if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
1888                 goto got_req;
1889
1890         ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1891                                     state->reqs);
1892
1893         /*
1894          * Bulk alloc is all-or-nothing. If we fail to get a batch,
1895          * retry single alloc to be on the safe side.
1896          */
1897         if (unlikely(ret <= 0)) {
1898                 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1899                 if (!state->reqs[0])
1900                         return NULL;
1901                 ret = 1;
1902         }
1903
1904         for (i = 0; i < ret; i++)
1905                 io_preinit_req(state->reqs[i], ctx);
1906         state->free_reqs = ret;
1907 got_req:
1908         state->free_reqs--;
1909         return state->reqs[state->free_reqs];
1910 }
1911
/* fput() wrapper that tolerates a NULL @file. */
static inline void io_put_file(struct file *file)
{
        if (!file)
                return;

        fput(file);
}
1917
1918 static void io_dismantle_req(struct io_kiocb *req)
1919 {
1920         unsigned int flags = req->flags;
1921
1922         if (io_req_needs_clean(req))
1923                 io_clean_op(req);
1924         if (!(flags & REQ_F_FIXED_FILE))
1925                 io_put_file(req->file);
1926         if (req->fixed_rsrc_refs)
1927                 percpu_ref_put(req->fixed_rsrc_refs);
1928         if (req->async_data) {
1929                 kfree(req->async_data);
1930                 req->async_data = NULL;
1931         }
1932 }
1933
/*
 * Final teardown of @req: dismantle it, return its task reference, park it
 * on ctx->locked_free_list for reuse and drop one ctx reference.
 */
static void __io_free_req(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;

        io_dismantle_req(req);
        io_put_task(req->task, 1);

        /* the locked free list and its counter are guarded by completion_lock */
        spin_lock(&ctx->completion_lock);
        list_add(&req->inflight_entry, &ctx->locked_free_list);
        ctx->locked_free_nr++;
        spin_unlock(&ctx->completion_lock);

        percpu_ref_put(&ctx->refs);
}
1948
1949 static inline void io_remove_next_linked(struct io_kiocb *req)
1950 {
1951         struct io_kiocb *nxt = req->link;
1952
1953         req->link = nxt->link;
1954         nxt->link = NULL;
1955 }
1956
1957 static bool io_kill_linked_timeout(struct io_kiocb *req)
1958         __must_hold(&req->ctx->completion_lock)
1959         __must_hold(&req->ctx->timeout_lock)
1960 {
1961         struct io_kiocb *link = req->link;
1962
1963         if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
1964                 struct io_timeout_data *io = link->async_data;
1965
1966                 io_remove_next_linked(req);
1967                 link->timeout.head = NULL;
1968                 if (hrtimer_try_to_cancel(&io->timer) != -1) {
1969                         io_cqring_fill_event(link->ctx, link->user_data,
1970                                              -ECANCELED, 0);
1971                         io_put_req_deferred(link);
1972                         return true;
1973                 }
1974         }
1975         return false;
1976 }
1977
1978 static void io_fail_links(struct io_kiocb *req)
1979         __must_hold(&req->ctx->completion_lock)
1980 {
1981         struct io_kiocb *nxt, *link = req->link;
1982
1983         req->link = NULL;
1984         while (link) {
1985                 long res = -ECANCELED;
1986
1987                 if (link->flags & REQ_F_FAIL)
1988                         res = link->result;
1989
1990                 nxt = link->link;
1991                 link->link = NULL;
1992
1993                 trace_io_uring_fail_link(req, link);
1994                 io_cqring_fill_event(link->ctx, link->user_data, res, 0);
1995                 io_put_req_deferred(link);
1996                 link = nxt;
1997         }
1998 }
1999
/*
 * Deal with what hangs off @req's link chain when @req is going away: a
 * linked timeout is cancelled, and if @req failed without REQ_F_HARDLINK
 * the rest of the chain is failed too.
 *
 * Returns true if completions were filled in here (callers in this file
 * commit the CQ ring and notify waiters in that case).
 */
static bool io_disarm_next(struct io_kiocb *req)
        __must_hold(&req->ctx->completion_lock)
{
        bool posted = false;

        if (req->flags & REQ_F_ARM_LTIMEOUT) {
                struct io_kiocb *link = req->link;

                /* no hrtimer is touched on this path; just unlink and cancel */
                req->flags &= ~REQ_F_ARM_LTIMEOUT;
                if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
                        io_remove_next_linked(req);
                        io_cqring_fill_event(link->ctx, link->user_data,
                                             -ECANCELED, 0);
                        io_put_req_deferred(link);
                        posted = true;
                }
        } else if (req->flags & REQ_F_LINK_TIMEOUT) {
                struct io_ring_ctx *ctx = req->ctx;

                /* io_kill_linked_timeout() needs ->timeout_lock held */
                spin_lock_irq(&ctx->timeout_lock);
                posted = io_kill_linked_timeout(req);
                spin_unlock_irq(&ctx->timeout_lock);
        }
        if (unlikely((req->flags & REQ_F_FAIL) &&
                     !(req->flags & REQ_F_HARDLINK))) {
                /* sample ->link first: io_fail_links() clears it */
                posted |= (req->link != NULL);
                io_fail_links(req);
        }
        return posted;
}
2030
2031 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
2032 {
2033         struct io_kiocb *nxt;
2034
2035         /*
2036          * If LINK is set, we have dependent requests in this chain. If we
2037          * didn't fail this request, queue the first one up, moving any other
2038          * dependencies to the next request. In case of failure, fail the rest
2039          * of the chain.
2040          */
2041         if (req->flags & IO_DISARM_MASK) {
2042                 struct io_ring_ctx *ctx = req->ctx;
2043                 bool posted;
2044
2045                 spin_lock(&ctx->completion_lock);
2046                 posted = io_disarm_next(req);
2047                 if (posted)
2048                         io_commit_cqring(req->ctx);
2049                 spin_unlock(&ctx->completion_lock);
2050                 if (posted)
2051                         io_cqring_ev_posted(ctx);
2052         }
2053         nxt = req->link;
2054         req->link = NULL;
2055         return nxt;
2056 }
2057
2058 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
2059 {
2060         if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
2061                 return NULL;
2062         return __io_req_find_next(req);
2063 }
2064
/*
 * Finish a batch of task_work for @ctx: if ->uring_lock is held (*@locked),
 * flush pending deferred completions and release the lock, then drop one
 * ctx reference. A NULL @ctx is a no-op.
 */
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
        if (!ctx)
                return;
        if (*locked) {
                /* ->compl_nr is only looked at with uring_lock held */
                if (ctx->submit_state.compl_nr)
                        io_submit_flush_completions(ctx);
                mutex_unlock(&ctx->uring_lock);
                *locked = false;
        }
        percpu_ref_put(&ctx->refs);
}
2077
/*
 * task_work callback embedded in struct io_uring_task. Repeatedly detaches
 * the whole ->task_list under ->task_lock and runs every queued request's
 * ->io_task_work.func, until the list is found empty; ->task_running is
 * cleared at that point, under the same lock.
 */
static void tctx_task_work(struct callback_head *cb)
{
        bool locked = false;
        struct io_ring_ctx *ctx = NULL;
        struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
                                                  task_work);

        while (1) {
                struct io_wq_work_node *node;

                /* grab everything queued so far in one go */
                spin_lock_irq(&tctx->task_lock);
                node = tctx->task_list.first;
                INIT_WQ_LIST(&tctx->task_list);
                if (!node)
                        tctx->task_running = false;
                spin_unlock_irq(&tctx->task_lock);
                if (!node)
                        break;

                do {
                        /* read ->next first, the handler is called after this */
                        struct io_wq_work_node *next = node->next;
                        struct io_kiocb *req = container_of(node, struct io_kiocb,
                                                            io_task_work.node);

                        /* ctx changed: finish the previous one, pin the new one */
                        if (req->ctx != ctx) {
                                ctx_flush_and_put(ctx, &locked);
                                ctx = req->ctx;
                                /* if not contended, grab and improve batching */
                                locked = mutex_trylock(&ctx->uring_lock);
                                percpu_ref_get(&ctx->refs);
                        }
                        req->io_task_work.func(req, &locked);
                        node = next;
                } while (node);

                cond_resched();
        }

        /* flush/unlock/put for the last ctx we worked on (no-op if none) */
        ctx_flush_and_put(ctx, &locked);
}
2118
/*
 * Queue @req on its task's ->task_list so that ->io_task_work.func gets run
 * from task_work. If task_work cannot be added to the task, everything
 * queued so far is moved to the owning ctx's fallback list and the fallback
 * work is scheduled instead.
 */
static void io_req_task_work_add(struct io_kiocb *req)
{
        struct task_struct *tsk = req->task;
        struct io_uring_task *tctx = tsk->io_uring;
        enum task_work_notify_mode notify;
        struct io_wq_work_node *node;
        unsigned long flags;
        bool running;

        WARN_ON_ONCE(!tctx);

        /* whoever flips ->task_running to true has to add the task_work */
        spin_lock_irqsave(&tctx->task_lock, flags);
        wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
        running = tctx->task_running;
        if (!running)
                tctx->task_running = true;
        spin_unlock_irqrestore(&tctx->task_lock, flags);

        /* task_work already pending, we're done */
        if (running)
                return;

        /*
         * SQPOLL kernel thread doesn't need notification, just a wakeup. For
         * all other cases, use TWA_SIGNAL unconditionally to ensure we're
         * processing task_work. There's no reliable way to tell if TWA_RESUME
         * will do the job.
         */
        notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
        if (!task_work_add(tsk, &tctx->task_work, notify)) {
                wake_up_process(tsk);
                return;
        }

        /* task_work_add() failed: take back the list and undo ->task_running */
        spin_lock_irqsave(&tctx->task_lock, flags);
        tctx->task_running = false;
        node = tctx->task_list.first;
        INIT_WQ_LIST(&tctx->task_list);
        spin_unlock_irqrestore(&tctx->task_lock, flags);

        /* push each request to its own ctx's fallback list */
        while (node) {
                req = container_of(node, struct io_kiocb, io_task_work.node);
                node = node->next;
                if (llist_add(&req->io_task_work.fallback_node,
                              &req->ctx->fallback_llist))
                        schedule_delayed_work(&req->ctx->fallback_work, 1);
        }
}
2167
/*
 * task_work handler: fail @req with the error previously stored in
 * req->result (see io_req_task_queue_fail()).
 */
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
        struct io_ring_ctx *ctx = req->ctx;

        /* not needed for normal modes, but SQPOLL depends on it */
        io_tw_lock(ctx, locked);
        io_req_complete_failed(req, req->result);
}
2176
2177 static void io_req_task_submit(struct io_kiocb *req, bool *locked)
2178 {
2179         struct io_ring_ctx *ctx = req->ctx;
2180
2181         io_tw_lock(ctx, locked);
2182         /* req->task == current here, checking PF_EXITING is safe */
2183         if (likely(!(req->task->flags & PF_EXITING)))
2184                 __io_queue_sqe(req);
2185         else
2186                 io_req_complete_failed(req, -EFAULT);
2187 }
2188
/*
 * Arrange for @req to be failed from task_work: @ret is stashed in
 * req->result, which io_req_task_cancel() later uses as the completion
 * result.
 */
static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
        req->result = ret;
        req->io_task_work.func = io_req_task_cancel;
        io_req_task_work_add(req);
}
2195
/* Queue @req to be submitted from task_work via io_req_task_submit(). */
static void io_req_task_queue(struct io_kiocb *req)
{
        req->io_task_work.func = io_req_task_submit;
        io_req_task_work_add(req);
}
2201
/* Queue @req so that io_queue_async_work() is run on it from task_work. */
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
        req->io_task_work.func = io_queue_async_work;
        io_req_task_work_add(req);
}
2207
/* If @req has a successor in its link chain, queue it for submission. */
static inline void io_queue_next(struct io_kiocb *req)
{
        struct io_kiocb *successor = io_req_find_next(req);

        if (!successor)
                return;

        io_req_task_queue(successor);
}
2215
/* Queue the next linked request (if any), then free @req. */
static void io_free_req(struct io_kiocb *req)
{
        io_queue_next(req);
        __io_free_req(req);
}
2221
/*
 * task_work flavoured wrapper around io_free_req(); @locked is unused.
 * Installed as ->io_task_work.func by io_put_req_deferred().
 */
static void io_free_req_work(struct io_kiocb *req, bool *locked)
{
        io_free_req(req);
}
2226
/*
 * Accumulator used while freeing requests in batches, so that task and ctx
 * references can be dropped in bulk rather than one by one.
 */
struct req_batch {
        /* task the pending task_refs belong to, NULL if none yet */
        struct task_struct      *task;
        /* task references collected for ->task, not yet put */
        int                     task_refs;
        /* ctx references collected, not yet put */
        int                     ctx_refs;
};
2232
2233 static inline void io_init_req_batch(struct req_batch *rb)
2234 {
2235         rb->task_refs = 0;
2236         rb->ctx_refs = 0;
2237         rb->task = NULL;
2238 }
2239
/*
 * Drop the references accumulated in @rb: the ctx references in one
 * percpu_ref_put_many() call and the task references via io_put_task().
 */
static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
                                     struct req_batch *rb)
{
        if (rb->ctx_refs)
                percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
        if (rb->task)
                io_put_task(rb->task, rb->task_refs);
}
2248
2249 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2250                               struct io_submit_state *state)
2251 {
2252         io_queue_next(req);
2253         io_dismantle_req(req);
2254
2255         if (req->task != rb->task) {
2256                 if (rb->task)
2257                         io_put_task(rb->task, rb->task_refs);
2258                 rb->task = req->task;
2259                 rb->task_refs = 0;
2260         }
2261         rb->task_refs++;
2262         rb->ctx_refs++;
2263
2264         if (state->free_reqs != ARRAY_SIZE(state->reqs))
2265                 state->reqs[state->free_reqs++] = req;
2266         else
2267                 list_add(&req->inflight_entry, &state->free_list);
2268 }
2269
2270 static void io_submit_flush_completions(struct io_ring_ctx *ctx)
2271         __must_hold(&ctx->uring_lock)
2272 {
2273         struct io_submit_state *state = &ctx->submit_state;
2274         int i, nr = state->compl_nr;
2275         struct req_batch rb;
2276
2277         spin_lock(&ctx->completion_lock);
2278         for (i = 0; i < nr; i++) {
2279                 struct io_kiocb *req = state->compl_reqs[i];
2280
2281                 __io_cqring_fill_event(ctx, req->user_data, req->result,
2282                                         req->compl.cflags);
2283         }
2284         io_commit_cqring(ctx);
2285         spin_unlock(&ctx->completion_lock);
2286         io_cqring_ev_posted(ctx);
2287
2288         io_init_req_batch(&rb);
2289         for (i = 0; i < nr; i++) {
2290                 struct io_kiocb *req = state->compl_reqs[i];
2291
2292                 if (req_ref_put_and_test(req))
2293                         io_req_free_batch(&rb, req, &ctx->submit_state);
2294         }
2295
2296         io_req_free_batch_finish(ctx, &rb);
2297         state->compl_nr = 0;
2298 }
2299
2300 /*
2301  * Drop reference to request, return next in chain (if there is one) if this
2302  * was the last reference to this request.
2303  */
2304 static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2305 {
2306         struct io_kiocb *nxt = NULL;
2307
2308         if (req_ref_put_and_test(req)) {
2309                 nxt = io_req_find_next(req);
2310                 __io_free_req(req);
2311         }
2312         return nxt;
2313 }
2314
/* Drop one reference to @req, freeing it if that was the last one. */
static inline void io_put_req(struct io_kiocb *req)
{
        if (req_ref_put_and_test(req))
                io_free_req(req);
}
2320
2321 static inline void io_put_req_deferred(struct io_kiocb *req)
2322 {
2323         if (req_ref_put_and_test(req)) {
2324                 req->io_task_work.func = io_free_req_work;
2325                 io_req_task_work_add(req);
2326         }
2327 }
2328
/*
 * Same value as __io_cqring_events(), but with a read barrier issued
 * before the count is taken.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
        /* See comment at the top of this file */
        smp_rmb();
        return __io_cqring_events(ctx);
}
2335
/*
 * Number of SQ ring entries not consumed yet: the ring's sq.tail (read
 * with acquire semantics) minus our cached SQ head.
 */
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
        struct io_rings *rings = ctx->rings;

        /* make sure SQ entry isn't read before tail */
        return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
2343
2344 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2345 {
2346         unsigned int cflags;
2347
2348         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2349         cflags |= IORING_CQE_F_BUFFER;
2350         req->flags &= ~REQ_F_BUFFER_SELECTED;
2351         kfree(kbuf);
2352         return cflags;
2353 }
2354
2355 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2356 {
2357         struct io_buffer *kbuf;
2358
2359         if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
2360                 return 0;
2361         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2362         return io_put_kbuf(req, kbuf);
2363 }
2364
2365 static inline bool io_run_task_work(void)
2366 {
2367         if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
2368                 __set_current_state(TASK_RUNNING);
2369                 tracehook_notify_signal();
2370                 return true;
2371         }
2372
2373         return false;
2374 }
2375
/*
 * Find and free completed poll iocbs
 *
 * Every request on @done either gets re-queued for reissue (result -EAGAIN
 * without REQ_F_DONT_REISSUE) or has a CQE filled in and one reference
 * dropped. @nr_events is incremented once per CQE filled. The CQ ring is
 * committed and waiters are notified once, after the whole list was
 * processed.
 */
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
                               struct list_head *done)
{
        struct req_batch rb;
        struct io_kiocb *req;

        /* order with ->result store in io_complete_rw_iopoll() */
        smp_rmb();

        io_init_req_batch(&rb);
        while (!list_empty(done)) {
                req = list_first_entry(done, struct io_kiocb, inflight_entry);
                list_del(&req->inflight_entry);

                /* retryable: clear the completed marker and reissue, no CQE */
                if (READ_ONCE(req->result) == -EAGAIN &&
                    !(req->flags & REQ_F_DONT_REISSUE)) {
                        req->iopoll_completed = 0;
                        io_req_task_queue_reissue(req);
                        continue;
                }

                __io_cqring_fill_event(ctx, req->user_data, req->result,
                                        io_put_rw_kbuf(req));
                (*nr_events)++;

                if (req_ref_put_and_test(req))
                        io_req_free_batch(&rb, req, &ctx->submit_state);
        }

        io_commit_cqring(ctx);
        io_cqring_ev_posted_iopoll(ctx);
        io_req_free_batch_finish(ctx, &rb);
}
2412
/*
 * One polling pass over ctx->iopoll_list: collect requests that have
 * completed, poll the first one that has not, and post completions for
 * whatever was collected.
 *
 * @nr_events is advanced by io_iopoll_complete() for the completions posted;
 * @min is only used to decide whether ->iopoll() may spin.
 *
 * Returns 0, or the negative error returned by ->iopoll().
 */
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
                        long min)
{
        struct io_kiocb *req, *tmp;
        LIST_HEAD(done);
        bool spin;

        /*
         * Only spin for completions if we don't have multiple devices hanging
         * off our complete list, and we're under the requested amount.
         */
        spin = !ctx->poll_multi_queue && *nr_events < min;

        list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
                struct kiocb *kiocb = &req->rw.kiocb;
                int ret;

                /*
                 * Move completed and retryable entries to our local lists.
                 * If we find a request that requires polling, break out
                 * and complete those lists first, if we have entries there.
                 */
                if (READ_ONCE(req->iopoll_completed)) {
                        list_move_tail(&req->inflight_entry, &done);
                        continue;
                }
                if (!list_empty(&done))
                        break;

                /* only reached while "done" is empty, so erroring out drops nothing */
                ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
                if (unlikely(ret < 0))
                        return ret;
                else if (ret)
                        spin = false;

                /* iopoll may have completed current req */
                if (READ_ONCE(req->iopoll_completed))
                        list_move_tail(&req->inflight_entry, &done);
        }

        if (!list_empty(&done))
                io_iopoll_complete(ctx, nr_events, &done);

        return 0;
}
2458
2459 /*
2460  * We can't just wait for polled events to come to us, we have to actively
2461  * find and complete them.
2462  */
2463 static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2464 {
2465         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2466                 return;
2467
2468         mutex_lock(&ctx->uring_lock);
2469         while (!list_empty(&ctx->iopoll_list)) {
2470                 unsigned int nr_events = 0;
2471
2472                 io_do_iopoll(ctx, &nr_events, 0);
2473
2474                 /* let it sleep and repeat later if can't complete a request */
2475                 if (nr_events == 0)
2476                         break;
2477                 /*
2478                  * Ensure we allow local-to-the-cpu processing to take place,
2479                  * in this case we need to ensure that we reap all events.
2480                  * Also let task_work, etc. to progress by releasing the mutex
2481                  */
2482                 if (need_resched()) {
2483                         mutex_unlock(&ctx->uring_lock);
2484                         cond_resched();
2485                         mutex_lock(&ctx->uring_lock);
2486                 }
2487         }
2488         mutex_unlock(&ctx->uring_lock);
2489 }
2490
2491 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2492 {
2493         unsigned int nr_events = 0;
2494         int ret = 0;
2495
2496         /*
2497          * We disallow the app entering submit/complete with polling, but we
2498          * still need to lock the ring to prevent racing with polled issue
2499          * that got punted to a workqueue.
2500          */
2501         mutex_lock(&ctx->uring_lock);
2502         /*
2503          * Don't enter poll loop if we already have events pending.
2504          * If we do, we can potentially be spinning for commands that
2505          * already triggered a CQE (eg in error).
2506          */
2507         if (test_bit(0, &ctx->check_cq_overflow))
2508                 __io_cqring_overflow_flush(ctx, false);
2509         if (io_cqring_events(ctx))
2510                 goto out;
2511         do {
2512                 /*
2513                  * If a submit got punted to a workqueue, we can have the
2514                  * application entering polling for a command before it gets
2515                  * issued. That app will hold the uring_lock for the duration
2516                  * of the poll right here, so we need to take a breather every
2517                  * now and then to ensure that the issue has a chance to add
2518                  * the poll to the issued list. Otherwise we can spin here
2519                  * forever, while the workqueue is stuck trying to acquire the
2520                  * very same mutex.
2521                  */
2522                 if (list_empty(&ctx->iopoll_list)) {
2523                         u32 tail = ctx->cached_cq_tail;
2524
2525                         mutex_unlock(&ctx->uring_lock);
2526                         io_run_task_work();
2527                         mutex_lock(&ctx->uring_lock);
2528
2529                         /* some requests don't go through iopoll_list */
2530                         if (tail != ctx->cached_cq_tail ||
2531                             list_empty(&ctx->iopoll_list))
2532                                 break;
2533                 }
2534                 ret = io_do_iopoll(ctx, &nr_events, min);
2535         } while (!ret && nr_events < min && !need_resched());
2536 out:
2537         mutex_unlock(&ctx->uring_lock);
2538         return ret;
2539 }
2540
2541 static void kiocb_end_write(struct io_kiocb *req)
2542 {
2543         /*
2544          * Tell lockdep we inherited freeze protection from submission
2545          * thread.
2546          */
2547         if (req->flags & REQ_F_ISREG) {
2548                 struct super_block *sb = file_inode(req->file)->i_sb;
2549
2550                 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2551                 sb_end_write(sb);
2552         }
2553 }
2554
2555 #ifdef CONFIG_BLOCK
2556 static bool io_resubmit_prep(struct io_kiocb *req)
2557 {
2558         struct io_async_rw *rw = req->async_data;
2559
2560         if (!rw)
2561                 return !io_req_prep_async(req);
2562         /* may have left rw->iter inconsistent on -EIOCBQUEUED */
2563         iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
2564         return true;
2565 }
2566
2567 static bool io_rw_should_reissue(struct io_kiocb *req)
2568 {
2569         umode_t mode = file_inode(req->file)->i_mode;
2570         struct io_ring_ctx *ctx = req->ctx;
2571
2572         if (!S_ISBLK(mode) && !S_ISREG(mode))
2573                 return false;
2574         if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2575             !(ctx->flags & IORING_SETUP_IOPOLL)))
2576                 return false;
2577         /*
2578          * If ref is dying, we might be running poll reap from the exit work.
2579          * Don't attempt to reissue from that path, just let it fail with
2580          * -EAGAIN.
2581          */
2582         if (percpu_ref_is_dying(&ctx->refs))
2583                 return false;
2584         /*
2585          * Play it safe and assume not safe to re-import and reissue if we're
2586          * not in the original thread group (or in task context).
2587          */
2588         if (!same_thread_group(req->task, current) || !in_task())
2589                 return false;
2590         return true;
2591 }
2592 #else
2593 static bool io_resubmit_prep(struct io_kiocb *req)
2594 {
2595         return false;
2596 }
2597 static bool io_rw_should_reissue(struct io_kiocb *req)
2598 {
2599         return false;
2600 }
2601 #endif
2602
2603 static bool __io_complete_rw_common(struct io_kiocb *req, long res)
2604 {
2605         if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2606                 kiocb_end_write(req);
2607         if (res != req->result) {
2608                 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2609                     io_rw_should_reissue(req)) {
2610                         req->flags |= REQ_F_REISSUE;
2611                         return true;
2612                 }
2613                 req_set_fail(req);
2614                 req->result = res;
2615         }
2616         return false;
2617 }
2618
2619 static void io_req_task_complete(struct io_kiocb *req, bool *locked)
2620 {
2621         unsigned int cflags = io_put_rw_kbuf(req);
2622         long res = req->result;
2623
2624         if (*locked) {
2625                 struct io_ring_ctx *ctx = req->ctx;
2626                 struct io_submit_state *state = &ctx->submit_state;
2627
2628                 io_req_complete_state(req, res, cflags);
2629                 state->compl_reqs[state->compl_nr++] = req;
2630                 if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
2631                         io_submit_flush_completions(ctx);
2632         } else {
2633                 io_req_complete_post(req, res, cflags);
2634         }
2635 }
2636
2637 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2638                              unsigned int issue_flags)
2639 {
2640         if (__io_complete_rw_common(req, res))
2641                 return;
2642         __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req));
2643 }
2644
2645 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2646 {
2647         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2648
2649         if (__io_complete_rw_common(req, res))
2650                 return;
2651         req->result = res;
2652         req->io_task_work.func = io_req_task_complete;
2653         io_req_task_work_add(req);
2654 }
2655
2656 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2657 {
2658         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2659
2660         if (kiocb->ki_flags & IOCB_WRITE)
2661                 kiocb_end_write(req);
2662         if (unlikely(res != req->result)) {
2663                 if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
2664                     io_resubmit_prep(req))) {
2665                         req_set_fail(req);
2666                         req->flags |= REQ_F_DONT_REISSUE;
2667                 }
2668         }
2669
2670         WRITE_ONCE(req->result, res);
2671         /* order with io_iopoll_complete() checking ->result */
2672         smp_wmb();
2673         WRITE_ONCE(req->iopoll_completed, 1);
2674 }
2675
2676 /*
2677  * After the iocb has been issued, it's safe to be found on the poll list.
2678  * Adding the kiocb to the list AFTER submission ensures that we don't
2679  * find it from a io_do_iopoll() thread before the issuer is done
2680  * accessing the kiocb cookie.
2681  */
2682 static void io_iopoll_req_issued(struct io_kiocb *req)
2683 {
2684         struct io_ring_ctx *ctx = req->ctx;
2685         const bool in_async = io_wq_current_is_worker();
2686
2687         /* workqueue context doesn't hold uring_lock, grab it now */
2688         if (unlikely(in_async))
2689                 mutex_lock(&ctx->uring_lock);
2690
2691         /*
2692          * Track whether we have multiple files in our lists. This will impact
2693          * how we do polling eventually, not spinning if we're on potentially
2694          * different devices.
2695          */
2696         if (list_empty(&ctx->iopoll_list)) {
2697                 ctx->poll_multi_queue = false;
2698         } else if (!ctx->poll_multi_queue) {
2699                 struct io_kiocb *list_req;
2700                 unsigned int queue_num0, queue_num1;
2701
2702                 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2703                                                 inflight_entry);
2704
2705                 if (list_req->file != req->file) {
2706                         ctx->poll_multi_queue = true;
2707                 } else {
2708                         queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
2709                         queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
2710                         if (queue_num0 != queue_num1)
2711                                 ctx->poll_multi_queue = true;
2712                 }
2713         }
2714
2715         /*
2716          * For fast devices, IO may have already completed. If it has, add
2717          * it to the front so we find it first.
2718          */
2719         if (READ_ONCE(req->iopoll_completed))
2720                 list_add(&req->inflight_entry, &ctx->iopoll_list);
2721         else
2722                 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2723
2724         if (unlikely(in_async)) {
2725                 /*
2726                  * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
2727                  * in sq thread task context or in io worker task context. If
2728                  * current task context is sq thread, we don't need to check
2729                  * whether should wake up sq thread.
2730                  */
2731                 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2732                     wq_has_sleeper(&ctx->sq_data->wait))
2733                         wake_up(&ctx->sq_data->wait);
2734
2735                 mutex_unlock(&ctx->uring_lock);
2736         }
2737 }
2738
2739 static bool io_bdev_nowait(struct block_device *bdev)
2740 {
2741         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2742 }
2743
2744 /*
2745  * If we tracked the file through the SCM inflight mechanism, we could support
2746  * any file. For now, just ensure that anything potentially problematic is done
2747  * inline.
2748  */
2749 static bool __io_file_supports_nowait(struct file *file, int rw)
2750 {
2751         umode_t mode = file_inode(file)->i_mode;
2752
2753         if (S_ISBLK(mode)) {
2754                 if (IS_ENABLED(CONFIG_BLOCK) &&
2755                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2756                         return true;
2757                 return false;
2758         }
2759         if (S_ISSOCK(mode))
2760                 return true;
2761         if (S_ISREG(mode)) {
2762                 if (IS_ENABLED(CONFIG_BLOCK) &&
2763                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2764                     file->f_op != &io_uring_fops)
2765                         return true;
2766                 return false;
2767         }
2768
2769         /* any ->read/write should understand O_NONBLOCK */
2770         if (file->f_flags & O_NONBLOCK)
2771                 return true;
2772
2773         if (!(file->f_mode & FMODE_NOWAIT))
2774                 return false;
2775
2776         if (rw == READ)
2777                 return file->f_op->read_iter != NULL;
2778
2779         return file->f_op->write_iter != NULL;
2780 }
2781
2782 static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
2783 {
2784         if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
2785                 return true;
2786         else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
2787                 return true;
2788
2789         return __io_file_supports_nowait(req->file, rw);
2790 }
2791
2792 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2793 {
2794         struct io_ring_ctx *ctx = req->ctx;
2795         struct kiocb *kiocb = &req->rw.kiocb;
2796         struct file *file = req->file;
2797         unsigned ioprio;
2798         int ret;
2799
2800         if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
2801                 req->flags |= REQ_F_ISREG;
2802
2803         kiocb->ki_pos = READ_ONCE(sqe->off);
2804         if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
2805                 req->flags |= REQ_F_CUR_POS;
2806                 kiocb->ki_pos = file->f_pos;
2807         }
2808         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2809         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2810         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2811         if (unlikely(ret))
2812                 return ret;
2813
2814         /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
2815         if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
2816                 req->flags |= REQ_F_NOWAIT;
2817
2818         ioprio = READ_ONCE(sqe->ioprio);
2819         if (ioprio) {
2820                 ret = ioprio_check_cap(ioprio);
2821                 if (ret)
2822                         return ret;
2823
2824                 kiocb->ki_ioprio = ioprio;
2825         } else
2826                 kiocb->ki_ioprio = get_current_ioprio();
2827
2828         if (ctx->flags & IORING_SETUP_IOPOLL) {
2829                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2830                     !kiocb->ki_filp->f_op->iopoll)
2831                         return -EOPNOTSUPP;
2832
2833                 kiocb->ki_flags |= IOCB_HIPRI;
2834                 kiocb->ki_complete = io_complete_rw_iopoll;
2835                 req->iopoll_completed = 0;
2836         } else {
2837                 if (kiocb->ki_flags & IOCB_HIPRI)
2838                         return -EINVAL;
2839                 kiocb->ki_complete = io_complete_rw;
2840         }
2841
2842         if (req->opcode == IORING_OP_READ_FIXED ||
2843             req->opcode == IORING_OP_WRITE_FIXED) {
2844                 req->imu = NULL;
2845                 io_req_set_rsrc_node(req);
2846         }
2847
2848         req->rw.addr = READ_ONCE(sqe->addr);
2849         req->rw.len = READ_ONCE(sqe->len);
2850         req->buf_index = READ_ONCE(sqe->buf_index);
2851         return 0;
2852 }
2853
2854 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2855 {
2856         switch (ret) {
2857         case -EIOCBQUEUED:
2858                 break;
2859         case -ERESTARTSYS:
2860         case -ERESTARTNOINTR:
2861         case -ERESTARTNOHAND:
2862         case -ERESTART_RESTARTBLOCK:
2863                 /*
2864                  * We can't just restart the syscall, since previously
2865                  * submitted sqes may already be in progress. Just fail this
2866                  * IO with EINTR.
2867                  */
2868                 ret = -EINTR;
2869                 fallthrough;
2870         default:
2871                 kiocb->ki_complete(kiocb, ret, 0);
2872         }
2873 }
2874
2875 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2876                        unsigned int issue_flags)
2877 {
2878         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2879         struct io_async_rw *io = req->async_data;
2880         bool check_reissue = kiocb->ki_complete == io_complete_rw;
2881
2882         /* add previously done IO, if any */
2883         if (io && io->bytes_done > 0) {
2884                 if (ret < 0)
2885                         ret = io->bytes_done;
2886                 else
2887                         ret += io->bytes_done;
2888         }
2889
2890         if (req->flags & REQ_F_CUR_POS)
2891                 req->file->f_pos = kiocb->ki_pos;
2892         if (ret >= 0 && check_reissue)
2893                 __io_complete_rw(req, ret, 0, issue_flags);
2894         else
2895                 io_rw_done(kiocb, ret);
2896
2897         if (check_reissue && (req->flags & REQ_F_REISSUE)) {
2898                 req->flags &= ~REQ_F_REISSUE;
2899                 if (io_resubmit_prep(req)) {
2900                         io_req_task_queue_reissue(req);
2901                 } else {
2902                         req_set_fail(req);
2903                         __io_req_complete(req, issue_flags, ret,
2904                                           io_put_rw_kbuf(req));
2905                 }
2906         }
2907 }
2908
2909 static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
2910                              struct io_mapped_ubuf *imu)
2911 {
2912         size_t len = req->rw.len;
2913         u64 buf_end, buf_addr = req->rw.addr;
2914         size_t offset;
2915
2916         if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
2917                 return -EFAULT;
2918         /* not inside the mapped region */
2919         if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
2920                 return -EFAULT;
2921
2922         /*
2923          * May not be a start of buffer, set size appropriately
2924          * and advance us to the beginning.
2925          */
2926         offset = buf_addr - imu->ubuf;
2927         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2928
2929         if (offset) {
2930                 /*
2931                  * Don't use iov_iter_advance() here, as it's really slow for
2932                  * using the latter parts of a big fixed buffer - it iterates
2933                  * over each segment manually. We can cheat a bit here, because
2934                  * we know that:
2935                  *
2936                  * 1) it's a BVEC iter, we set it up
2937                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
2938                  *    first and last bvec
2939                  *
2940                  * So just find our index, and adjust the iterator afterwards.
2941                  * If the offset is within the first bvec (or the whole first
2942                  * bvec, just use iov_iter_advance(). This makes it easier
2943                  * since we can just skip the first segment, which may not
2944                  * be PAGE_SIZE aligned.
2945                  */
2946                 const struct bio_vec *bvec = imu->bvec;
2947
2948                 if (offset <= bvec->bv_len) {
2949                         iov_iter_advance(iter, offset);
2950                 } else {
2951                         unsigned long seg_skip;
2952
2953                         /* skip first vec */
2954                         offset -= bvec->bv_len;
2955                         seg_skip = 1 + (offset >> PAGE_SHIFT);
2956
2957                         iter->bvec = bvec + seg_skip;
2958                         iter->nr_segs -= seg_skip;
2959                         iter->count -= bvec->bv_len + offset;
2960                         iter->iov_offset = offset & ~PAGE_MASK;
2961                 }
2962         }
2963
2964         return 0;
2965 }
2966
2967 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
2968 {
2969         struct io_ring_ctx *ctx = req->ctx;
2970         struct io_mapped_ubuf *imu = req->imu;
2971         u16 index, buf_index = req->buf_index;
2972
2973         if (likely(!imu)) {
2974                 if (unlikely(buf_index >= ctx->nr_user_bufs))
2975                         return -EFAULT;
2976                 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2977                 imu = READ_ONCE(ctx->user_bufs[index]);
2978                 req->imu = imu;
2979         }
2980         return __io_import_fixed(req, rw, iter, imu);
2981 }
2982
2983 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2984 {
2985         if (needs_lock)
2986                 mutex_unlock(&ctx->uring_lock);
2987 }
2988
2989 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2990 {
2991         /*
2992          * "Normal" inline submissions always hold the uring_lock, since we
2993          * grab it from the system call. Same is true for the SQPOLL offload.
2994          * The only exception is when we've detached the request and issue it
2995          * from an async worker thread, grab the lock for that case.
2996          */
2997         if (needs_lock)
2998                 mutex_lock(&ctx->uring_lock);
2999 }
3000
3001 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
3002                                           int bgid, struct io_buffer *kbuf,
3003                                           bool needs_lock)
3004 {
3005         struct io_buffer *head;
3006
3007         if (req->flags & REQ_F_BUFFER_SELECTED)
3008                 return kbuf;
3009
3010         io_ring_submit_lock(req->ctx, needs_lock);
3011
3012         lockdep_assert_held(&req->ctx->uring_lock);
3013
3014         head = xa_load(&req->ctx->io_buffers, bgid);
3015         if (head) {
3016                 if (!list_empty(&head->list)) {
3017                         kbuf = list_last_entry(&head->list, struct io_buffer,
3018                                                         list);
3019                         list_del(&kbuf->list);
3020                 } else {
3021                         kbuf = head;
3022                         xa_erase(&req->ctx->io_buffers, bgid);
3023                 }
3024                 if (*len > kbuf->len)
3025                         *len = kbuf->len;
3026         } else {
3027                 kbuf = ERR_PTR(-ENOBUFS);
3028         }
3029
3030         io_ring_submit_unlock(req->ctx, needs_lock);
3031
3032         return kbuf;
3033 }
3034
3035 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3036                                         bool needs_lock)
3037 {
3038         struct io_buffer *kbuf;
3039         u16 bgid;
3040
3041         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3042         bgid = req->buf_index;
3043         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
3044         if (IS_ERR(kbuf))
3045                 return kbuf;
3046         req->rw.addr = (u64) (unsigned long) kbuf;
3047         req->flags |= REQ_F_BUFFER_SELECTED;
3048         return u64_to_user_ptr(kbuf->addr);
3049 }
3050
3051 #ifdef CONFIG_COMPAT
3052 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3053                                 bool needs_lock)
3054 {
3055         struct compat_iovec __user *uiov;
3056         compat_ssize_t clen;
3057         void __user *buf;
3058         ssize_t len;
3059
3060         uiov = u64_to_user_ptr(req->rw.addr);
3061         if (!access_ok(uiov, sizeof(*uiov)))
3062                 return -EFAULT;
3063         if (__get_user(clen, &uiov->iov_len))
3064                 return -EFAULT;
3065         if (clen < 0)
3066                 return -EINVAL;
3067
3068         len = clen;
3069         buf = io_rw_buffer_select(req, &len, needs_lock);
3070         if (IS_ERR(buf))
3071                 return PTR_ERR(buf);
3072         iov[0].iov_base = buf;
3073         iov[0].iov_len = (compat_size_t) len;
3074         return 0;
3075 }
3076 #endif
3077
3078 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3079                                       bool needs_lock)
3080 {
3081         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3082         void __user *buf;
3083         ssize_t len;
3084
3085         if (copy_from_user(iov, uiov, sizeof(*uiov)))
3086                 return -EFAULT;
3087
3088         len = iov[0].iov_len;
3089         if (len < 0)
3090                 return -EINVAL;
3091         buf = io_rw_buffer_select(req, &len, needs_lock);
3092         if (IS_ERR(buf))
3093                 return PTR_ERR(buf);
3094         iov[0].iov_base = buf;
3095         iov[0].iov_len = len;
3096         return 0;
3097 }
3098
3099 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3100                                     bool needs_lock)
3101 {
3102         if (req->flags & REQ_F_BUFFER_SELECTED) {
3103                 struct io_buffer *kbuf;
3104
3105                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3106                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3107                 iov[0].iov_len = kbuf->len;
3108                 return 0;
3109         }
3110         if (req->rw.len != 1)
3111                 return -EINVAL;
3112
3113 #ifdef CONFIG_COMPAT
3114         if (req->ctx->compat)
3115                 return io_compat_import(req, iov, needs_lock);
3116 #endif
3117
3118         return __io_iov_buffer_select(req, iov, needs_lock);
3119 }
3120
3121 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
3122                            struct iov_iter *iter, bool needs_lock)
3123 {
3124         void __user *buf = u64_to_user_ptr(req->rw.addr);
3125         size_t sqe_len = req->rw.len;
3126         u8 opcode = req->opcode;
3127         ssize_t ret;
3128
3129         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
3130                 *iovec = NULL;
3131                 return io_import_fixed(req, rw, iter);
3132         }
3133
3134         /* buffer index only valid with fixed read/write, or buffer select  */
3135         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
3136                 return -EINVAL;
3137
3138         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3139                 if (req->flags & REQ_F_BUFFER_SELECT) {
3140                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
3141                         if (IS_ERR(buf))
3142                                 return PTR_ERR(buf);
3143                         req->rw.len = sqe_len;
3144                 }
3145
3146                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3147                 *iovec = NULL;
3148                 return ret;
3149         }
3150
3151         if (req->flags & REQ_F_BUFFER_SELECT) {
3152                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
3153                 if (!ret)
3154                         iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
3155                 *iovec = NULL;
3156                 return ret;
3157         }
3158
3159         return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3160                               req->ctx->compat);
3161 }
3162
3163 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3164 {
3165         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3166 }
3167
3168 /*
3169  * For files that don't have ->read_iter() and ->write_iter(), handle them
3170  * by looping over ->read() or ->write() manually.
3171  */
3172 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3173 {
3174         struct kiocb *kiocb = &req->rw.kiocb;
3175         struct file *file = req->file;
3176         ssize_t ret = 0;
3177
3178         /*
3179          * Don't support polled IO through this interface, and we can't
3180          * support non-blocking either. For the latter, this just causes
3181          * the kiocb to be handled from an async context.
3182          */
3183         if (kiocb->ki_flags & IOCB_HIPRI)
3184                 return -EOPNOTSUPP;
3185         if (kiocb->ki_flags & IOCB_NOWAIT)
3186                 return -EAGAIN;
3187
3188         while (iov_iter_count(iter)) {
3189                 struct iovec iovec;
3190                 ssize_t nr;
3191
3192                 if (!iov_iter_is_bvec(iter)) {
3193                         iovec = iov_iter_iovec(iter);
3194                 } else {
3195                         iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3196                         iovec.iov_len = req->rw.len;
3197                 }
3198
3199                 if (rw == READ) {
3200                         nr = file->f_op->read(file, iovec.iov_base,
3201                                               iovec.iov_len, io_kiocb_ppos(kiocb));
3202                 } else {
3203                         nr = file->f_op->write(file, iovec.iov_base,
3204                                                iovec.iov_len, io_kiocb_ppos(kiocb));
3205                 }
3206
3207                 if (nr < 0) {
3208                         if (!ret)
3209                                 ret = nr;
3210                         break;
3211                 }
3212                 ret += nr;
3213                 if (nr != iovec.iov_len)
3214                         break;
3215                 req->rw.len -= nr;
3216                 req->rw.addr += nr;
3217                 iov_iter_advance(iter, nr);
3218         }
3219
3220         return ret;
3221 }
3222
3223 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3224                           const struct iovec *fast_iov, struct iov_iter *iter)
3225 {
3226         struct io_async_rw *rw = req->async_data;
3227
3228         memcpy(&rw->iter, iter, sizeof(*iter));
3229         rw->free_iovec = iovec;
3230         rw->bytes_done = 0;
3231         /* can only be fixed buffers, no need to do anything */
3232         if (iov_iter_is_bvec(iter))
3233                 return;
3234         if (!iovec) {
3235                 unsigned iov_off = 0;
3236
3237                 rw->iter.iov = rw->fast_iov;
3238                 if (iter->iov != fast_iov) {
3239                         iov_off = iter->iov - fast_iov;
3240                         rw->iter.iov += iov_off;
3241                 }
3242                 if (rw->fast_iov != fast_iov)
3243                         memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3244                                sizeof(struct iovec) * iter->nr_segs);
3245         } else {
3246                 req->flags |= REQ_F_NEED_CLEANUP;
3247         }
3248 }
3249
3250 static inline int io_alloc_async_data(struct io_kiocb *req)
3251 {
3252         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3253         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3254         return req->async_data == NULL;
3255 }
3256
3257 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3258                              const struct iovec *fast_iov,
3259                              struct iov_iter *iter, bool force)
3260 {
3261         if (!force && !io_op_defs[req->opcode].needs_async_setup)
3262                 return 0;
3263         if (!req->async_data) {
3264                 if (io_alloc_async_data(req)) {
3265                         kfree(iovec);
3266                         return -ENOMEM;
3267                 }
3268
3269                 io_req_map_rw(req, iovec, fast_iov, iter);
3270         }
3271         return 0;
3272 }
3273
3274 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3275 {
3276         struct io_async_rw *iorw = req->async_data;
3277         struct iovec *iov = iorw->fast_iov;
3278         int ret;
3279
3280         ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3281         if (unlikely(ret < 0))
3282                 return ret;
3283
3284         iorw->bytes_done = 0;
3285         iorw->free_iovec = iov;
3286         if (iov)
3287                 req->flags |= REQ_F_NEED_CLEANUP;
3288         return 0;
3289 }
3290
3291 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3292 {
3293         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3294                 return -EBADF;
3295         return io_prep_rw(req, sqe);
3296 }
3297
3298 /*
3299  * This is our waitqueue callback handler, registered through lock_page_async()
3300  * when we initially tried to do the IO with the iocb armed our waitqueue.
3301  * This gets called when the page is unlocked, and we generally expect that to
3302  * happen when the page IO is completed and the page is now uptodate. This will
3303  * queue a task_work based retry of the operation, attempting to copy the data
3304  * again. If the latter fails because the page was NOT uptodate, then we will
3305  * do a thread based blocking retry of the operation. That's the unexpected
3306  * slow path.
3307  */
3308 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3309                              int sync, void *arg)
3310 {
3311         struct wait_page_queue *wpq;
3312         struct io_kiocb *req = wait->private;
3313         struct wait_page_key *key = arg;
3314
3315         wpq = container_of(wait, struct wait_page_queue, wait);
3316
3317         if (!wake_page_match(wpq, key))
3318                 return 0;
3319
3320         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3321         list_del_init(&wait->entry);
3322         io_req_task_queue(req);
3323         return 1;
3324 }
3325
3326 /*
3327  * This controls whether a given IO request should be armed for async page
3328  * based retry. If we return false here, the request is handed to the async
3329  * worker threads for retry. If we're doing buffered reads on a regular file,
3330  * we prepare a private wait_page_queue entry and retry the operation. This
3331  * will either succeed because the page is now uptodate and unlocked, or it
3332  * will register a callback when the page is unlocked at IO completion. Through
3333  * that callback, io_uring uses task_work to setup a retry of the operation.
3334  * That retry will attempt the buffered read again. The retry will generally
3335  * succeed, or in rare cases where it fails, we then fall back to using the
3336  * async worker threads for a blocking retry.
3337  */
3338 static bool io_rw_should_retry(struct io_kiocb *req)
3339 {
3340         struct io_async_rw *rw = req->async_data;
3341         struct wait_page_queue *wait = &rw->wpq;
3342         struct kiocb *kiocb = &req->rw.kiocb;
3343
3344         /* never retry for NOWAIT, we just complete with -EAGAIN */
3345         if (req->flags & REQ_F_NOWAIT)
3346                 return false;
3347
3348         /* Only for buffered IO */
3349         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3350                 return false;
3351
3352         /*
3353          * just use poll if we can, and don't attempt if the fs doesn't
3354          * support callback based unlocks
3355          */
3356         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3357                 return false;
3358
3359         wait->wait.func = io_async_buf_func;
3360         wait->wait.private = req;
3361         wait->wait.flags = 0;
3362         INIT_LIST_HEAD(&wait->wait.entry);
3363         kiocb->ki_flags |= IOCB_WAITQ;
3364         kiocb->ki_flags &= ~IOCB_NOWAIT;
3365         kiocb->ki_waitq = wait;
3366         return true;
3367 }
3368
3369 static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3370 {
3371         if (req->file->f_op->read_iter)
3372                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3373         else if (req->file->f_op->read)
3374                 return loop_rw_iter(READ, req, iter);
3375         else
3376                 return -EINVAL;
3377 }
3378
3379 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3380 {
3381         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3382         struct kiocb *kiocb = &req->rw.kiocb;
3383         struct iov_iter __iter, *iter = &__iter;
3384         struct io_async_rw *rw = req->async_data;
3385         ssize_t io_size, ret, ret2;
3386         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3387
3388         if (rw) {
3389                 iter = &rw->iter;
3390                 iovec = NULL;
3391         } else {
3392                 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3393                 if (ret < 0)
3394                         return ret;
3395         }
3396         io_size = iov_iter_count(iter);
3397         req->result = io_size;
3398
3399         /* Ensure we clear previously set non-block flag */
3400         if (!force_nonblock)
3401                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3402         else
3403                 kiocb->ki_flags |= IOCB_NOWAIT;
3404
3405         /* If the file doesn't support async, just async punt */
3406         if (force_nonblock && !io_file_supports_nowait(req, READ)) {
3407                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3408                 return ret ?: -EAGAIN;
3409         }
3410
3411         ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
3412         if (unlikely(ret)) {
3413                 kfree(iovec);
3414                 return ret;
3415         }
3416
3417         ret = io_iter_do_read(req, iter);
3418
3419         if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
3420                 req->flags &= ~REQ_F_REISSUE;
3421                 /* IOPOLL retry should happen for io-wq threads */
3422                 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3423                         goto done;
3424                 /* no retry on NONBLOCK nor RWF_NOWAIT */
3425                 if (req->flags & REQ_F_NOWAIT)
3426                         goto done;
3427                 /* some cases will consume bytes even on error returns */
3428                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3429                 ret = 0;
3430         } else if (ret == -EIOCBQUEUED) {
3431                 goto out_free;
3432         } else if (ret <= 0 || ret == io_size || !force_nonblock ||
3433                    (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
3434                 /* read all, failed, already did sync or don't want to retry */
3435                 goto done;
3436         }
3437
3438         ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3439         if (ret2)
3440                 return ret2;
3441
3442         iovec = NULL;
3443         rw = req->async_data;
3444         /* now use our persistent iterator, if we aren't already */
3445         iter = &rw->iter;
3446
3447         do {
3448                 io_size -= ret;
3449                 rw->bytes_done += ret;
3450                 /* if we can retry, do so with the callbacks armed */
3451                 if (!io_rw_should_retry(req)) {
3452                         kiocb->ki_flags &= ~IOCB_WAITQ;
3453                         return -EAGAIN;
3454                 }
3455
3456                 /*
3457                  * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3458                  * we get -EIOCBQUEUED, then we'll get a notification when the
3459                  * desired page gets unlocked. We can also get a partial read
3460                  * here, and if we do, then just retry at the new offset.
3461                  */
3462                 ret = io_iter_do_read(req, iter);
3463                 if (ret == -EIOCBQUEUED)
3464                         return 0;
3465                 /* we got some bytes, but not all. retry. */
3466                 kiocb->ki_flags &= ~IOCB_WAITQ;
3467         } while (ret > 0 && ret < io_size);
3468 done:
3469         kiocb_done(kiocb, ret, issue_flags);
3470 out_free:
3471         /* it's faster to check here then delegate to kfree */
3472         if (iovec)
3473                 kfree(iovec);
3474         return 0;
3475 }
3476
3477 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3478 {
3479         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3480                 return -EBADF;
3481         return io_prep_rw(req, sqe);
3482 }
3483
3484 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3485 {
3486         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3487         struct kiocb *kiocb = &req->rw.kiocb;
3488         struct iov_iter __iter, *iter = &__iter;
3489         struct io_async_rw *rw = req->async_data;
3490         ssize_t ret, ret2, io_size;
3491         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3492
3493         if (rw) {
3494                 iter = &rw->iter;
3495                 iovec = NULL;
3496         } else {
3497                 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3498                 if (ret < 0)
3499                         return ret;
3500         }
3501         io_size = iov_iter_count(iter);
3502         req->result = io_size;
3503
3504         /* Ensure we clear previously set non-block flag */
3505         if (!force_nonblock)
3506                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3507         else
3508                 kiocb->ki_flags |= IOCB_NOWAIT;
3509
3510         /* If the file doesn't support async, just async punt */
3511         if (force_nonblock && !io_file_supports_nowait(req, WRITE))
3512                 goto copy_iov;
3513
3514         /* file path doesn't support NOWAIT for non-direct_IO */
3515         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3516             (req->flags & REQ_F_ISREG))
3517                 goto copy_iov;
3518
3519         ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
3520         if (unlikely(ret))
3521                 goto out_free;
3522
3523         /*
3524          * Open-code file_start_write here to grab freeze protection,
3525          * which will be released by another thread in
3526          * io_complete_rw().  Fool lockdep by telling it the lock got
3527          * released so that it doesn't complain about the held lock when
3528          * we return to userspace.
3529          */
3530         if (req->flags & REQ_F_ISREG) {
3531                 sb_start_write(file_inode(req->file)->i_sb);
3532                 __sb_writers_release(file_inode(req->file)->i_sb,
3533                                         SB_FREEZE_WRITE);
3534         }
3535         kiocb->ki_flags |= IOCB_WRITE;
3536
3537         if (req->file->f_op->write_iter)
3538                 ret2 = call_write_iter(req->file, kiocb, iter);
3539         else if (req->file->f_op->write)
3540                 ret2 = loop_rw_iter(WRITE, req, iter);
3541         else
3542                 ret2 = -EINVAL;
3543
3544         if (req->flags & REQ_F_REISSUE) {
3545                 req->flags &= ~REQ_F_REISSUE;
3546                 ret2 = -EAGAIN;
3547         }
3548
3549         /*
3550          * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3551          * retry them without IOCB_NOWAIT.
3552          */
3553         if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3554                 ret2 = -EAGAIN;
3555         /* no retry on NONBLOCK nor RWF_NOWAIT */
3556         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3557                 goto done;
3558         if (!force_nonblock || ret2 != -EAGAIN) {
3559                 /* IOPOLL retry should happen for io-wq threads */
3560                 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3561                         goto copy_iov;
3562 done:
3563                 kiocb_done(kiocb, ret2, issue_flags);
3564         } else {
3565 copy_iov:
3566                 /* some cases will consume bytes even on error returns */
3567                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3568                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3569                 return ret ?: -EAGAIN;
3570         }
3571 out_free:
3572         /* it's reportedly faster than delegating the null check to kfree() */
3573         if (iovec)
3574                 kfree(iovec);
3575         return ret;
3576 }
3577
3578 static int io_renameat_prep(struct io_kiocb *req,
3579                             const struct io_uring_sqe *sqe)
3580 {
3581         struct io_rename *ren = &req->rename;
3582         const char __user *oldf, *newf;
3583
3584         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3585                 return -EINVAL;
3586         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
3587                 return -EINVAL;
3588         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3589                 return -EBADF;
3590
3591         ren->old_dfd = READ_ONCE(sqe->fd);
3592         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3593         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3594         ren->new_dfd = READ_ONCE(sqe->len);
3595         ren->flags = READ_ONCE(sqe->rename_flags);
3596
3597         ren->oldpath = getname(oldf);
3598         if (IS_ERR(ren->oldpath))
3599                 return PTR_ERR(ren->oldpath);
3600
3601         ren->newpath = getname(newf);
3602         if (IS_ERR(ren->newpath)) {
3603                 putname(ren->oldpath);
3604                 return PTR_ERR(ren->newpath);
3605         }
3606
3607         req->flags |= REQ_F_NEED_CLEANUP;
3608         return 0;
3609 }
3610
3611 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3612 {
3613         struct io_rename *ren = &req->rename;
3614         int ret;
3615
3616         if (issue_flags & IO_URING_F_NONBLOCK)
3617                 return -EAGAIN;
3618
3619         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3620                                 ren->newpath, ren->flags);
3621
3622         req->flags &= ~REQ_F_NEED_CLEANUP;
3623         if (ret < 0)
3624                 req_set_fail(req);
3625         io_req_complete(req, ret);
3626         return 0;
3627 }
3628
3629 static int io_unlinkat_prep(struct io_kiocb *req,
3630                             const struct io_uring_sqe *sqe)
3631 {
3632         struct io_unlink *un = &req->unlink;
3633         const char __user *fname;
3634
3635         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3636                 return -EINVAL;
3637         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3638             sqe->splice_fd_in)
3639                 return -EINVAL;
3640         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3641                 return -EBADF;
3642
3643         un->dfd = READ_ONCE(sqe->fd);
3644
3645         un->flags = READ_ONCE(sqe->unlink_flags);
3646         if (un->flags & ~AT_REMOVEDIR)
3647                 return -EINVAL;
3648
3649         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3650         un->filename = getname(fname);
3651         if (IS_ERR(un->filename))
3652                 return PTR_ERR(un->filename);
3653
3654         req->flags |= REQ_F_NEED_CLEANUP;
3655         return 0;
3656 }
3657
3658 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3659 {
3660         struct io_unlink *un = &req->unlink;
3661         int ret;
3662
3663         if (issue_flags & IO_URING_F_NONBLOCK)
3664                 return -EAGAIN;
3665
3666         if (un->flags & AT_REMOVEDIR)
3667                 ret = do_rmdir(un->dfd, un->filename);
3668         else
3669                 ret = do_unlinkat(un->dfd, un->filename);
3670
3671         req->flags &= ~REQ_F_NEED_CLEANUP;
3672         if (ret < 0)
3673                 req_set_fail(req);
3674         io_req_complete(req, ret);
3675         return 0;
3676 }
3677
3678 static int io_shutdown_prep(struct io_kiocb *req,
3679                             const struct io_uring_sqe *sqe)
3680 {
3681 #if defined(CONFIG_NET)
3682         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3683                 return -EINVAL;
3684         if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3685                      sqe->buf_index || sqe->splice_fd_in))
3686                 return -EINVAL;
3687
3688         req->shutdown.how = READ_ONCE(sqe->len);
3689         return 0;
3690 #else
3691         return -EOPNOTSUPP;
3692 #endif
3693 }
3694
3695 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3696 {
3697 #if defined(CONFIG_NET)
3698         struct socket *sock;
3699         int ret;
3700
3701         if (issue_flags & IO_URING_F_NONBLOCK)
3702                 return -EAGAIN;
3703
3704         sock = sock_from_file(req->file);
3705         if (unlikely(!sock))
3706                 return -ENOTSOCK;
3707
3708         ret = __sys_shutdown_sock(sock, req->shutdown.how);
3709         if (ret < 0)
3710                 req_set_fail(req);
3711         io_req_complete(req, ret);
3712         return 0;
3713 #else
3714         return -EOPNOTSUPP;
3715 #endif
3716 }
3717
3718 static int __io_splice_prep(struct io_kiocb *req,
3719                             const struct io_uring_sqe *sqe)
3720 {
3721         struct io_splice *sp = &req->splice;
3722         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3723
3724         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3725                 return -EINVAL;
3726
3727         sp->file_in = NULL;
3728         sp->len = READ_ONCE(sqe->len);
3729         sp->flags = READ_ONCE(sqe->splice_flags);
3730
3731         if (unlikely(sp->flags & ~valid_flags))
3732                 return -EINVAL;
3733
3734         sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
3735                                   (sp->flags & SPLICE_F_FD_IN_FIXED));
3736         if (!sp->file_in)
3737                 return -EBADF;
3738         req->flags |= REQ_F_NEED_CLEANUP;
3739         return 0;
3740 }
3741
3742 static int io_tee_prep(struct io_kiocb *req,
3743                        const struct io_uring_sqe *sqe)
3744 {
3745         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3746                 return -EINVAL;
3747         return __io_splice_prep(req, sqe);
3748 }
3749
3750 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3751 {
3752         struct io_splice *sp = &req->splice;
3753         struct file *in = sp->file_in;
3754         struct file *out = sp->file_out;
3755         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3756         long ret = 0;
3757
3758         if (issue_flags & IO_URING_F_NONBLOCK)
3759                 return -EAGAIN;
3760         if (sp->len)
3761                 ret = do_tee(in, out, sp->len, flags);
3762
3763         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3764                 io_put_file(in);
3765         req->flags &= ~REQ_F_NEED_CLEANUP;
3766
3767         if (ret != sp->len)
3768                 req_set_fail(req);
3769         io_req_complete(req, ret);
3770         return 0;
3771 }
3772
3773 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3774 {
3775         struct io_splice *sp = &req->splice;
3776
3777         sp->off_in = READ_ONCE(sqe->splice_off_in);
3778         sp->off_out = READ_ONCE(sqe->off);
3779         return __io_splice_prep(req, sqe);
3780 }
3781
3782 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
3783 {
3784         struct io_splice *sp = &req->splice;
3785         struct file *in = sp->file_in;
3786         struct file *out = sp->file_out;
3787         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3788         loff_t *poff_in, *poff_out;
3789         long ret = 0;
3790
3791         if (issue_flags & IO_URING_F_NONBLOCK)
3792                 return -EAGAIN;
3793
3794         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3795         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3796
3797         if (sp->len)
3798                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3799
3800         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3801                 io_put_file(in);
3802         req->flags &= ~REQ_F_NEED_CLEANUP;
3803
3804         if (ret != sp->len)
3805                 req_set_fail(req);
3806         io_req_complete(req, ret);
3807         return 0;
3808 }
3809
3810 /*
3811  * IORING_OP_NOP just posts a completion event, nothing else.
3812  */
3813 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
3814 {
3815         struct io_ring_ctx *ctx = req->ctx;
3816
3817         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3818                 return -EINVAL;
3819
3820         __io_req_complete(req, issue_flags, 0, 0);
3821         return 0;
3822 }
3823
3824 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3825 {
3826         struct io_ring_ctx *ctx = req->ctx;
3827
3828         if (!req->file)
3829                 return -EBADF;
3830
3831         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3832                 return -EINVAL;
3833         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
3834                      sqe->splice_fd_in))
3835                 return -EINVAL;
3836
3837         req->sync.flags = READ_ONCE(sqe->fsync_flags);
3838         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3839                 return -EINVAL;
3840
3841         req->sync.off = READ_ONCE(sqe->off);
3842         req->sync.len = READ_ONCE(sqe->len);
3843         return 0;
3844 }
3845
3846 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
3847 {
3848         loff_t end = req->sync.off + req->sync.len;
3849         int ret;
3850
3851         /* fsync always requires a blocking context */
3852         if (issue_flags & IO_URING_F_NONBLOCK)
3853                 return -EAGAIN;
3854
3855         ret = vfs_fsync_range(req->file, req->sync.off,
3856                                 end > 0 ? end : LLONG_MAX,
3857                                 req->sync.flags & IORING_FSYNC_DATASYNC);
3858         if (ret < 0)
3859                 req_set_fail(req);
3860         io_req_complete(req, ret);
3861         return 0;
3862 }
3863
3864 static int io_fallocate_prep(struct io_kiocb *req,
3865                              const struct io_uring_sqe *sqe)
3866 {
3867         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
3868             sqe->splice_fd_in)
3869                 return -EINVAL;
3870         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3871                 return -EINVAL;
3872
3873         req->sync.off = READ_ONCE(sqe->off);
3874         req->sync.len = READ_ONCE(sqe->addr);
3875         req->sync.mode = READ_ONCE(sqe->len);
3876         return 0;
3877 }
3878
3879 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
3880 {
3881         int ret;
3882
3883         /* fallocate always requiring blocking context */
3884         if (issue_flags & IO_URING_F_NONBLOCK)
3885                 return -EAGAIN;
3886         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3887                                 req->sync.len);
3888         if (ret < 0)
3889                 req_set_fail(req);
3890         io_req_complete(req, ret);
3891         return 0;
3892 }
3893
3894 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3895 {
3896         const char __user *fname;
3897         int ret;
3898
3899         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3900                 return -EINVAL;
3901         if (unlikely(sqe->ioprio || sqe->buf_index))
3902                 return -EINVAL;
3903         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3904                 return -EBADF;
3905
3906         /* open.how should be already initialised */
3907         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3908                 req->open.how.flags |= O_LARGEFILE;
3909
3910         req->open.dfd = READ_ONCE(sqe->fd);
3911         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3912         req->open.filename = getname(fname);
3913         if (IS_ERR(req->open.filename)) {
3914                 ret = PTR_ERR(req->open.filename);
3915                 req->open.filename = NULL;
3916                 return ret;
3917         }
3918
3919         req->open.file_slot = READ_ONCE(sqe->file_index);
3920         if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
3921                 return -EINVAL;
3922
3923         req->open.nofile = rlimit(RLIMIT_NOFILE);
3924         req->flags |= REQ_F_NEED_CLEANUP;
3925         return 0;
3926 }
3927
3928 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3929 {
3930         u64 mode = READ_ONCE(sqe->len);
3931         u64 flags = READ_ONCE(sqe->open_flags);
3932
3933         req->open.how = build_open_how(flags, mode);
3934         return __io_openat_prep(req, sqe);
3935 }
3936
3937 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3938 {
3939         struct open_how __user *how;
3940         size_t len;
3941         int ret;
3942
3943         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3944         len = READ_ONCE(sqe->len);
3945         if (len < OPEN_HOW_SIZE_VER0)
3946                 return -EINVAL;
3947
3948         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3949                                         len);
3950         if (ret)
3951                 return ret;
3952
3953         return __io_openat_prep(req, sqe);
3954 }
3955
3956 static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
3957 {
3958         struct open_flags op;
3959         struct file *file;
3960         bool resolve_nonblock, nonblock_set;
3961         bool fixed = !!req->open.file_slot;
3962         int ret;
3963
3964         ret = build_open_flags(&req->open.how, &op);
3965         if (ret)
3966                 goto err;
3967         nonblock_set = op.open_flag & O_NONBLOCK;
3968         resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
3969         if (issue_flags & IO_URING_F_NONBLOCK) {
3970                 /*
3971                  * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
3972                  * it'll always -EAGAIN
3973                  */
3974                 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
3975                         return -EAGAIN;
3976                 op.lookup_flags |= LOOKUP_CACHED;
3977                 op.open_flag |= O_NONBLOCK;
3978         }
3979
3980         if (!fixed) {
3981                 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3982                 if (ret < 0)
3983                         goto err;
3984         }
3985
3986         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3987         if (IS_ERR(file)) {
3988                 /*
3989                  * We could hang on to this 'fd' on retrying, but seems like
3990                  * marginal gain for something that is now known to be a slower
3991                  * path. So just put it, and we'll get a new one when we retry.
3992                  */
3993                 if (!fixed)
3994                         put_unused_fd(ret);
3995
3996                 ret = PTR_ERR(file);
3997                 /* only retry if RESOLVE_CACHED wasn't already set by application */
3998                 if (ret == -EAGAIN &&
3999                     (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4000                         return -EAGAIN;
4001                 goto err;
4002         }
4003
4004         if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4005                 file->f_flags &= ~O_NONBLOCK;
4006         fsnotify_open(file);
4007
4008         if (!fixed)
4009                 fd_install(ret, file);
4010         else
4011                 ret = io_install_fixed_file(req, file, issue_flags,
4012                                             req->open.file_slot - 1);
4013 err:
4014         putname(req->open.filename);
4015         req->flags &= ~REQ_F_NEED_CLEANUP;
4016         if (ret < 0)
4017                 req_set_fail(req);
4018         __io_req_complete(req, issue_flags, ret, 0);
4019         return 0;
4020 }
4021
/*
 * openat shares its issue path with openat2; only the preparation differs.
 */
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret = io_openat2(req, issue_flags);

	return ret;
}
4026
4027 static int io_remove_buffers_prep(struct io_kiocb *req,
4028                                   const struct io_uring_sqe *sqe)
4029 {
4030         struct io_provide_buf *p = &req->pbuf;
4031         u64 tmp;
4032
4033         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4034             sqe->splice_fd_in)
4035                 return -EINVAL;
4036
4037         tmp = READ_ONCE(sqe->fd);
4038         if (!tmp || tmp > USHRT_MAX)
4039                 return -EINVAL;
4040
4041         memset(p, 0, sizeof(*p));
4042         p->nbufs = tmp;
4043         p->bgid = READ_ONCE(sqe->buf_group);
4044         return 0;
4045 }
4046
4047 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
4048                                int bgid, unsigned nbufs)
4049 {
4050         unsigned i = 0;
4051
4052         /* shouldn't happen */
4053         if (!nbufs)
4054                 return 0;
4055
4056         /* the head kbuf is the list itself */
4057         while (!list_empty(&buf->list)) {
4058                 struct io_buffer *nxt;
4059
4060                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
4061                 list_del(&nxt->list);
4062                 kfree(nxt);
4063                 if (++i == nbufs)
4064                         return i;
4065         }
4066         i++;
4067         kfree(buf);
4068         xa_erase(&ctx->io_buffers, bgid);
4069
4070         return i;
4071 }
4072
4073 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
4074 {
4075         struct io_provide_buf *p = &req->pbuf;
4076         struct io_ring_ctx *ctx = req->ctx;
4077         struct io_buffer *head;
4078         int ret = 0;
4079         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4080
4081         io_ring_submit_lock(ctx, !force_nonblock);
4082
4083         lockdep_assert_held(&ctx->uring_lock);
4084
4085         ret = -ENOENT;
4086         head = xa_load(&ctx->io_buffers, p->bgid);
4087         if (head)
4088                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
4089         if (ret < 0)
4090                 req_set_fail(req);
4091
4092         /* complete before unlock, IOPOLL may need the lock */
4093         __io_req_complete(req, issue_flags, ret, 0);
4094         io_ring_submit_unlock(ctx, !force_nonblock);
4095         return 0;
4096 }
4097
4098 static int io_provide_buffers_prep(struct io_kiocb *req,
4099                                    const struct io_uring_sqe *sqe)
4100 {
4101         unsigned long size, tmp_check;
4102         struct io_provide_buf *p = &req->pbuf;
4103         u64 tmp;
4104
4105         if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
4106                 return -EINVAL;
4107
4108         tmp = READ_ONCE(sqe->fd);
4109         if (!tmp || tmp > USHRT_MAX)
4110                 return -E2BIG;
4111         p->nbufs = tmp;
4112         p->addr = READ_ONCE(sqe->addr);
4113         p->len = READ_ONCE(sqe->len);
4114
4115         if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4116                                 &size))
4117                 return -EOVERFLOW;
4118         if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4119                 return -EOVERFLOW;
4120
4121         size = (unsigned long)p->len * p->nbufs;
4122         if (!access_ok(u64_to_user_ptr(p->addr), size))
4123                 return -EFAULT;
4124
4125         p->bgid = READ_ONCE(sqe->buf_group);
4126         tmp = READ_ONCE(sqe->off);
4127         if (tmp > USHRT_MAX)
4128                 return -E2BIG;
4129         p->bid = tmp;
4130         return 0;
4131 }
4132
4133 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4134 {
4135         struct io_buffer *buf;
4136         u64 addr = pbuf->addr;
4137         int i, bid = pbuf->bid;
4138
4139         for (i = 0; i < pbuf->nbufs; i++) {
4140                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
4141                 if (!buf)
4142                         break;
4143
4144                 buf->addr = addr;
4145                 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
4146                 buf->bid = bid;
4147                 addr += pbuf->len;
4148                 bid++;
4149                 if (!*head) {
4150                         INIT_LIST_HEAD(&buf->list);
4151                         *head = buf;
4152                 } else {
4153                         list_add_tail(&buf->list, &(*head)->list);
4154                 }
4155         }
4156
4157         return i ? i : -ENOMEM;
4158 }
4159
4160 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4161 {
4162         struct io_provide_buf *p = &req->pbuf;
4163         struct io_ring_ctx *ctx = req->ctx;
4164         struct io_buffer *head, *list;
4165         int ret = 0;
4166         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4167
4168         io_ring_submit_lock(ctx, !force_nonblock);
4169
4170         lockdep_assert_held(&ctx->uring_lock);
4171
4172         list = head = xa_load(&ctx->io_buffers, p->bgid);
4173
4174         ret = io_add_buffers(p, &head);
4175         if (ret >= 0 && !list) {
4176                 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4177                 if (ret < 0)
4178                         __io_remove_buffers(ctx, head, p->bgid, -1U);
4179         }
4180         if (ret < 0)
4181                 req_set_fail(req);
4182         /* complete before unlock, IOPOLL may need the lock */
4183         __io_req_complete(req, issue_flags, ret, 0);
4184         io_ring_submit_unlock(ctx, !force_nonblock);
4185         return 0;
4186 }
4187
4188 static int io_epoll_ctl_prep(struct io_kiocb *req,
4189                              const struct io_uring_sqe *sqe)
4190 {
4191 #if defined(CONFIG_EPOLL)
4192         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4193                 return -EINVAL;
4194         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4195                 return -EINVAL;
4196
4197         req->epoll.epfd = READ_ONCE(sqe->fd);
4198         req->epoll.op = READ_ONCE(sqe->len);
4199         req->epoll.fd = READ_ONCE(sqe->off);
4200
4201         if (ep_op_has_event(req->epoll.op)) {
4202                 struct epoll_event __user *ev;
4203
4204                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4205                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4206                         return -EFAULT;
4207         }
4208
4209         return 0;
4210 #else
4211         return -EOPNOTSUPP;
4212 #endif
4213 }
4214
4215 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4216 {
4217 #if defined(CONFIG_EPOLL)
4218         struct io_epoll *ie = &req->epoll;
4219         int ret;
4220         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4221
4222         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4223         if (force_nonblock && ret == -EAGAIN)
4224                 return -EAGAIN;
4225
4226         if (ret < 0)
4227                 req_set_fail(req);
4228         __io_req_complete(req, issue_flags, ret, 0);
4229         return 0;
4230 #else
4231         return -EOPNOTSUPP;
4232 #endif
4233 }
4234
4235 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4236 {
4237 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4238         if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
4239                 return -EINVAL;
4240         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4241                 return -EINVAL;
4242
4243         req->madvise.addr = READ_ONCE(sqe->addr);
4244         req->madvise.len = READ_ONCE(sqe->len);
4245         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4246         return 0;
4247 #else
4248         return -EOPNOTSUPP;
4249 #endif
4250 }
4251
4252 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4253 {
4254 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4255         struct io_madvise *ma = &req->madvise;
4256         int ret;
4257
4258         if (issue_flags & IO_URING_F_NONBLOCK)
4259                 return -EAGAIN;
4260
4261         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4262         if (ret < 0)
4263                 req_set_fail(req);
4264         io_req_complete(req, ret);
4265         return 0;
4266 #else
4267         return -EOPNOTSUPP;
4268 #endif
4269 }
4270
4271 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4272 {
4273         if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4274                 return -EINVAL;
4275         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4276                 return -EINVAL;
4277
4278         req->fadvise.offset = READ_ONCE(sqe->off);
4279         req->fadvise.len = READ_ONCE(sqe->len);
4280         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4281         return 0;
4282 }
4283
4284 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4285 {
4286         struct io_fadvise *fa = &req->fadvise;
4287         int ret;
4288
4289         if (issue_flags & IO_URING_F_NONBLOCK) {
4290                 switch (fa->advice) {
4291                 case POSIX_FADV_NORMAL:
4292                 case POSIX_FADV_RANDOM:
4293                 case POSIX_FADV_SEQUENTIAL:
4294                         break;
4295                 default:
4296                         return -EAGAIN;
4297                 }
4298         }
4299
4300         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4301         if (ret < 0)
4302                 req_set_fail(req);
4303         __io_req_complete(req, issue_flags, ret, 0);
4304         return 0;
4305 }
4306
4307 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4308 {
4309         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4310                 return -EINVAL;
4311         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4312                 return -EINVAL;
4313         if (req->flags & REQ_F_FIXED_FILE)
4314                 return -EBADF;
4315
4316         req->statx.dfd = READ_ONCE(sqe->fd);
4317         req->statx.mask = READ_ONCE(sqe->len);
4318         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4319         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4320         req->statx.flags = READ_ONCE(sqe->statx_flags);
4321
4322         return 0;
4323 }
4324
4325 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4326 {
4327         struct io_statx *ctx = &req->statx;
4328         int ret;
4329
4330         if (issue_flags & IO_URING_F_NONBLOCK)
4331                 return -EAGAIN;
4332
4333         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4334                        ctx->buffer);
4335
4336         if (ret < 0)
4337                 req_set_fail(req);
4338         io_req_complete(req, ret);
4339         return 0;
4340 }
4341
4342 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4343 {
4344         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4345                 return -EINVAL;
4346         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4347             sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
4348                 return -EINVAL;
4349         if (req->flags & REQ_F_FIXED_FILE)
4350                 return -EBADF;
4351
4352         req->close.fd = READ_ONCE(sqe->fd);
4353         return 0;
4354 }
4355
4356 static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4357 {
4358         struct files_struct *files = current->files;
4359         struct io_close *close = &req->close;
4360         struct fdtable *fdt;
4361         struct file *file = NULL;
4362         int ret = -EBADF;
4363
4364         spin_lock(&files->file_lock);
4365         fdt = files_fdtable(files);
4366         if (close->fd >= fdt->max_fds) {
4367                 spin_unlock(&files->file_lock);
4368                 goto err;
4369         }
4370         file = fdt->fd[close->fd];
4371         if (!file || file->f_op == &io_uring_fops) {
4372                 spin_unlock(&files->file_lock);
4373                 file = NULL;
4374                 goto err;
4375         }
4376
4377         /* if the file has a flush method, be safe and punt to async */
4378         if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4379                 spin_unlock(&files->file_lock);
4380                 return -EAGAIN;
4381         }
4382
4383         ret = __close_fd_get_file(close->fd, &file);
4384         spin_unlock(&files->file_lock);
4385         if (ret < 0) {
4386                 if (ret == -ENOENT)
4387                         ret = -EBADF;
4388                 goto err;
4389         }
4390
4391         /* No ->flush() or already async, safely close from here */
4392         ret = filp_close(file, current->files);
4393 err:
4394         if (ret < 0)
4395                 req_set_fail(req);
4396         if (file)
4397                 fput(file);
4398         __io_req_complete(req, issue_flags, ret, 0);
4399         return 0;
4400 }
4401
4402 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4403 {
4404         struct io_ring_ctx *ctx = req->ctx;
4405
4406         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4407                 return -EINVAL;
4408         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4409                      sqe->splice_fd_in))
4410                 return -EINVAL;
4411
4412         req->sync.off = READ_ONCE(sqe->off);
4413         req->sync.len = READ_ONCE(sqe->len);
4414         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4415         return 0;
4416 }
4417
4418 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4419 {
4420         int ret;
4421
4422         /* sync_file_range always requires a blocking context */
4423         if (issue_flags & IO_URING_F_NONBLOCK)
4424                 return -EAGAIN;
4425
4426         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4427                                 req->sync.flags);
4428         if (ret < 0)
4429                 req_set_fail(req);
4430         io_req_complete(req, ret);
4431         return 0;
4432 }
4433
4434 #if defined(CONFIG_NET)
4435 static int io_setup_async_msg(struct io_kiocb *req,
4436                               struct io_async_msghdr *kmsg)
4437 {
4438         struct io_async_msghdr *async_msg = req->async_data;
4439
4440         if (async_msg)
4441                 return -EAGAIN;
4442         if (io_alloc_async_data(req)) {
4443                 kfree(kmsg->free_iov);
4444                 return -ENOMEM;
4445         }
4446         async_msg = req->async_data;
4447         req->flags |= REQ_F_NEED_CLEANUP;
4448         memcpy(async_msg, kmsg, sizeof(*kmsg));
4449         async_msg->msg.msg_name = &async_msg->addr;
4450         /* if were using fast_iov, set it to the new one */
4451         if (!async_msg->free_iov)
4452                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4453
4454         return -EAGAIN;
4455 }
4456
4457 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4458                                struct io_async_msghdr *iomsg)
4459 {
4460         iomsg->msg.msg_name = &iomsg->addr;
4461         iomsg->free_iov = iomsg->fast_iov;
4462         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4463                                    req->sr_msg.msg_flags, &iomsg->free_iov);
4464 }
4465
4466 static int io_sendmsg_prep_async(struct io_kiocb *req)
4467 {
4468         int ret;
4469
4470         ret = io_sendmsg_copy_hdr(req, req->async_data);
4471         if (!ret)
4472                 req->flags |= REQ_F_NEED_CLEANUP;
4473         return ret;
4474 }
4475
4476 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4477 {
4478         struct io_sr_msg *sr = &req->sr_msg;
4479
4480         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4481                 return -EINVAL;
4482
4483         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4484         sr->len = READ_ONCE(sqe->len);
4485         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4486         if (sr->msg_flags & MSG_DONTWAIT)
4487                 req->flags |= REQ_F_NOWAIT;
4488
4489 #ifdef CONFIG_COMPAT
4490         if (req->ctx->compat)
4491                 sr->msg_flags |= MSG_CMSG_COMPAT;
4492 #endif
4493         return 0;
4494 }
4495
4496 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4497 {
4498         struct io_async_msghdr iomsg, *kmsg;
4499         struct socket *sock;
4500         unsigned flags;
4501         int min_ret = 0;
4502         int ret;
4503
4504         sock = sock_from_file(req->file);
4505         if (unlikely(!sock))
4506                 return -ENOTSOCK;
4507
4508         kmsg = req->async_data;
4509         if (!kmsg) {
4510                 ret = io_sendmsg_copy_hdr(req, &iomsg);
4511                 if (ret)
4512                         return ret;
4513                 kmsg = &iomsg;
4514         }
4515
4516         flags = req->sr_msg.msg_flags;
4517         if (issue_flags & IO_URING_F_NONBLOCK)
4518                 flags |= MSG_DONTWAIT;
4519         if (flags & MSG_WAITALL)
4520                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4521
4522         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4523         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4524                 return io_setup_async_msg(req, kmsg);
4525         if (ret == -ERESTARTSYS)
4526                 ret = -EINTR;
4527
4528         /* fast path, check for non-NULL to avoid function call */
4529         if (kmsg->free_iov)
4530                 kfree(kmsg->free_iov);
4531         req->flags &= ~REQ_F_NEED_CLEANUP;
4532         if (ret < min_ret)
4533                 req_set_fail(req);
4534         __io_req_complete(req, issue_flags, ret, 0);
4535         return 0;
4536 }
4537
4538 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4539 {
4540         struct io_sr_msg *sr = &req->sr_msg;
4541         struct msghdr msg;
4542         struct iovec iov;
4543         struct socket *sock;
4544         unsigned flags;
4545         int min_ret = 0;
4546         int ret;
4547
4548         sock = sock_from_file(req->file);
4549         if (unlikely(!sock))
4550                 return -ENOTSOCK;
4551
4552         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4553         if (unlikely(ret))
4554                 return ret;
4555
4556         msg.msg_name = NULL;
4557         msg.msg_control = NULL;
4558         msg.msg_controllen = 0;
4559         msg.msg_namelen = 0;
4560
4561         flags = req->sr_msg.msg_flags;
4562         if (issue_flags & IO_URING_F_NONBLOCK)
4563                 flags |= MSG_DONTWAIT;
4564         if (flags & MSG_WAITALL)
4565                 min_ret = iov_iter_count(&msg.msg_iter);
4566
4567         msg.msg_flags = flags;
4568         ret = sock_sendmsg(sock, &msg);
4569         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4570                 return -EAGAIN;
4571         if (ret == -ERESTARTSYS)
4572                 ret = -EINTR;
4573
4574         if (ret < min_ret)
4575                 req_set_fail(req);
4576         __io_req_complete(req, issue_flags, ret, 0);
4577         return 0;
4578 }
4579
4580 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4581                                  struct io_async_msghdr *iomsg)
4582 {
4583         struct io_sr_msg *sr = &req->sr_msg;
4584         struct iovec __user *uiov;
4585         size_t iov_len;
4586         int ret;
4587
4588         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4589                                         &iomsg->uaddr, &uiov, &iov_len);
4590         if (ret)
4591                 return ret;
4592
4593         if (req->flags & REQ_F_BUFFER_SELECT) {
4594                 if (iov_len > 1)
4595                         return -EINVAL;
4596                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4597                         return -EFAULT;
4598                 sr->len = iomsg->fast_iov[0].iov_len;
4599                 iomsg->free_iov = NULL;
4600         } else {
4601                 iomsg->free_iov = iomsg->fast_iov;
4602                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4603                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
4604                                      false);
4605                 if (ret > 0)
4606                         ret = 0;
4607         }
4608
4609         return ret;
4610 }
4611
4612 #ifdef CONFIG_COMPAT
4613 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4614                                         struct io_async_msghdr *iomsg)
4615 {
4616         struct io_sr_msg *sr = &req->sr_msg;
4617         struct compat_iovec __user *uiov;
4618         compat_uptr_t ptr;
4619         compat_size_t len;
4620         int ret;
4621
4622         ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4623                                   &ptr, &len);
4624         if (ret)
4625                 return ret;
4626
4627         uiov = compat_ptr(ptr);
4628         if (req->flags & REQ_F_BUFFER_SELECT) {
4629                 compat_ssize_t clen;
4630
4631                 if (len > 1)
4632                         return -EINVAL;
4633                 if (!access_ok(uiov, sizeof(*uiov)))
4634                         return -EFAULT;
4635                 if (__get_user(clen, &uiov->iov_len))
4636                         return -EFAULT;
4637                 if (clen < 0)
4638                         return -EINVAL;
4639                 sr->len = clen;
4640                 iomsg->free_iov = NULL;
4641         } else {
4642                 iomsg->free_iov = iomsg->fast_iov;
4643                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4644                                    UIO_FASTIOV, &iomsg->free_iov,
4645                                    &iomsg->msg.msg_iter, true);
4646                 if (ret < 0)
4647                         return ret;
4648         }
4649
4650         return 0;
4651 }
4652 #endif
4653
4654 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4655                                struct io_async_msghdr *iomsg)
4656 {
4657         iomsg->msg.msg_name = &iomsg->addr;
4658
4659 #ifdef CONFIG_COMPAT
4660         if (req->ctx->compat)
4661                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
4662 #endif
4663
4664         return __io_recvmsg_copy_hdr(req, iomsg);
4665 }
4666
4667 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4668                                                bool needs_lock)
4669 {
4670         struct io_sr_msg *sr = &req->sr_msg;
4671         struct io_buffer *kbuf;
4672
4673         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4674         if (IS_ERR(kbuf))
4675                 return kbuf;
4676
4677         sr->kbuf = kbuf;
4678         req->flags |= REQ_F_BUFFER_SELECTED;
4679         return kbuf;
4680 }
4681
4682 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4683 {
4684         return io_put_kbuf(req, req->sr_msg.kbuf);
4685 }
4686
4687 static int io_recvmsg_prep_async(struct io_kiocb *req)
4688 {
4689         int ret;
4690
4691         ret = io_recvmsg_copy_hdr(req, req->async_data);
4692         if (!ret)
4693                 req->flags |= REQ_F_NEED_CLEANUP;
4694         return ret;
4695 }
4696
4697 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4698 {
4699         struct io_sr_msg *sr = &req->sr_msg;
4700
4701         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4702                 return -EINVAL;
4703
4704         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4705         sr->len = READ_ONCE(sqe->len);
4706         sr->bgid = READ_ONCE(sqe->buf_group);
4707         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4708         if (sr->msg_flags & MSG_DONTWAIT)
4709                 req->flags |= REQ_F_NOWAIT;
4710
4711 #ifdef CONFIG_COMPAT
4712         if (req->ctx->compat)
4713                 sr->msg_flags |= MSG_CMSG_COMPAT;
4714 #endif
4715         return 0;
4716 }
4717
4718 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4719 {
4720         struct io_async_msghdr iomsg, *kmsg;
4721         struct socket *sock;
4722         struct io_buffer *kbuf;
4723         unsigned flags;
4724         int min_ret = 0;
4725         int ret, cflags = 0;
4726         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4727
4728         sock = sock_from_file(req->file);
4729         if (unlikely(!sock))
4730                 return -ENOTSOCK;
4731
4732         kmsg = req->async_data;
4733         if (!kmsg) {
4734                 ret = io_recvmsg_copy_hdr(req, &iomsg);
4735                 if (ret)
4736                         return ret;
4737                 kmsg = &iomsg;
4738         }
4739
4740         if (req->flags & REQ_F_BUFFER_SELECT) {
4741                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4742                 if (IS_ERR(kbuf))
4743                         return PTR_ERR(kbuf);
4744                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4745                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4746                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4747                                 1, req->sr_msg.len);
4748         }
4749
4750         flags = req->sr_msg.msg_flags;
4751         if (force_nonblock)
4752                 flags |= MSG_DONTWAIT;
4753         if (flags & MSG_WAITALL)
4754                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4755
4756         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4757                                         kmsg->uaddr, flags);
4758         if (force_nonblock && ret == -EAGAIN)
4759                 return io_setup_async_msg(req, kmsg);
4760         if (ret == -ERESTARTSYS)
4761                 ret = -EINTR;
4762
4763         if (req->flags & REQ_F_BUFFER_SELECTED)
4764                 cflags = io_put_recv_kbuf(req);
4765         /* fast path, check for non-NULL to avoid function call */
4766         if (kmsg->free_iov)
4767                 kfree(kmsg->free_iov);
4768         req->flags &= ~REQ_F_NEED_CLEANUP;
4769         if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4770                 req_set_fail(req);
4771         __io_req_complete(req, issue_flags, ret, cflags);
4772         return 0;
4773 }
4774
4775 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
4776 {
4777         struct io_buffer *kbuf;
4778         struct io_sr_msg *sr = &req->sr_msg;
4779         struct msghdr msg;
4780         void __user *buf = sr->buf;
4781         struct socket *sock;
4782         struct iovec iov;
4783         unsigned flags;
4784         int min_ret = 0;
4785         int ret, cflags = 0;
4786         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4787
4788         sock = sock_from_file(req->file);
4789         if (unlikely(!sock))
4790                 return -ENOTSOCK;
4791
4792         if (req->flags & REQ_F_BUFFER_SELECT) {
4793                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4794                 if (IS_ERR(kbuf))
4795                         return PTR_ERR(kbuf);
4796                 buf = u64_to_user_ptr(kbuf->addr);
4797         }
4798
4799         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4800         if (unlikely(ret))
4801                 goto out_free;
4802
4803         msg.msg_name = NULL;
4804         msg.msg_control = NULL;
4805         msg.msg_controllen = 0;
4806         msg.msg_namelen = 0;
4807         msg.msg_iocb = NULL;
4808         msg.msg_flags = 0;
4809
4810         flags = req->sr_msg.msg_flags;
4811         if (force_nonblock)
4812                 flags |= MSG_DONTWAIT;
4813         if (flags & MSG_WAITALL)
4814                 min_ret = iov_iter_count(&msg.msg_iter);
4815
4816         ret = sock_recvmsg(sock, &msg, flags);
4817         if (force_nonblock && ret == -EAGAIN)
4818                 return -EAGAIN;
4819         if (ret == -ERESTARTSYS)
4820                 ret = -EINTR;
4821 out_free:
4822         if (req->flags & REQ_F_BUFFER_SELECTED)
4823                 cflags = io_put_recv_kbuf(req);
4824         if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4825                 req_set_fail(req);
4826         __io_req_complete(req, issue_flags, ret, cflags);
4827         return 0;
4828 }
4829
4830 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4831 {
4832         struct io_accept *accept = &req->accept;
4833
4834         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4835                 return -EINVAL;
4836         if (sqe->ioprio || sqe->len || sqe->buf_index)
4837                 return -EINVAL;
4838
4839         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4840         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4841         accept->flags = READ_ONCE(sqe->accept_flags);
4842         accept->nofile = rlimit(RLIMIT_NOFILE);
4843
4844         accept->file_slot = READ_ONCE(sqe->file_index);
4845         if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
4846                                   (accept->flags & SOCK_CLOEXEC)))
4847                 return -EINVAL;
4848         if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
4849                 return -EINVAL;
4850         if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
4851                 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
4852         return 0;
4853 }
4854
4855 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
4856 {
4857         struct io_accept *accept = &req->accept;
4858         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4859         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4860         bool fixed = !!accept->file_slot;
4861         struct file *file;
4862         int ret, fd;
4863
4864         if (req->file->f_flags & O_NONBLOCK)
4865                 req->flags |= REQ_F_NOWAIT;
4866
4867         if (!fixed) {
4868                 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
4869                 if (unlikely(fd < 0))
4870                         return fd;
4871         }
4872         file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
4873                          accept->flags);
4874         if (IS_ERR(file)) {
4875                 if (!fixed)
4876                         put_unused_fd(fd);
4877                 ret = PTR_ERR(file);
4878                 if (ret == -EAGAIN && force_nonblock)
4879                         return -EAGAIN;
4880                 if (ret == -ERESTARTSYS)
4881                         ret = -EINTR;
4882                 req_set_fail(req);
4883         } else if (!fixed) {
4884                 fd_install(fd, file);
4885                 ret = fd;
4886         } else {
4887                 ret = io_install_fixed_file(req, file, issue_flags,
4888                                             accept->file_slot - 1);
4889         }
4890         __io_req_complete(req, issue_flags, ret, 0);
4891         return 0;
4892 }
4893
4894 static int io_connect_prep_async(struct io_kiocb *req)
4895 {
4896         struct io_async_connect *io = req->async_data;
4897         struct io_connect *conn = &req->connect;
4898
4899         return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4900 }
4901
4902 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4903 {
4904         struct io_connect *conn = &req->connect;
4905
4906         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4907                 return -EINVAL;
4908         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
4909             sqe->splice_fd_in)
4910                 return -EINVAL;
4911
4912         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4913         conn->addr_len =  READ_ONCE(sqe->addr2);
4914         return 0;
4915 }
4916
4917 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
4918 {
4919         struct io_async_connect __io, *io;
4920         unsigned file_flags;
4921         int ret;
4922         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4923
4924         if (req->async_data) {
4925                 io = req->async_data;
4926         } else {
4927                 ret = move_addr_to_kernel(req->connect.addr,
4928                                                 req->connect.addr_len,
4929                                                 &__io.address);
4930                 if (ret)
4931                         goto out;
4932                 io = &__io;
4933         }
4934
4935         file_flags = force_nonblock ? O_NONBLOCK : 0;
4936
4937         ret = __sys_connect_file(req->file, &io->address,
4938                                         req->connect.addr_len, file_flags);
4939         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4940                 if (req->async_data)
4941                         return -EAGAIN;
4942                 if (io_alloc_async_data(req)) {
4943                         ret = -ENOMEM;
4944                         goto out;
4945                 }
4946                 memcpy(req->async_data, &__io, sizeof(__io));
4947                 return -EAGAIN;
4948         }
4949         if (ret == -ERESTARTSYS)
4950                 ret = -EINTR;
4951 out:
4952         if (ret < 0)
4953                 req_set_fail(req);
4954         __io_req_complete(req, issue_flags, ret, 0);
4955         return 0;
4956 }
4957 #else /* !CONFIG_NET */
4958 #define IO_NETOP_FN(op)                                                 \
4959 static int io_##op(struct io_kiocb *req, unsigned int issue_flags)      \
4960 {                                                                       \
4961         return -EOPNOTSUPP;                                             \
4962 }
4963
4964 #define IO_NETOP_PREP(op)                                               \
4965 IO_NETOP_FN(op)                                                         \
4966 static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
4967 {                                                                       \
4968         return -EOPNOTSUPP;                                             \
4969 }                                                                       \
4970
4971 #define IO_NETOP_PREP_ASYNC(op)                                         \
4972 IO_NETOP_PREP(op)                                                       \
4973 static int io_##op##_prep_async(struct io_kiocb *req)                   \
4974 {                                                                       \
4975         return -EOPNOTSUPP;                                             \
4976 }
4977
4978 IO_NETOP_PREP_ASYNC(sendmsg);
4979 IO_NETOP_PREP_ASYNC(recvmsg);
4980 IO_NETOP_PREP_ASYNC(connect);
4981 IO_NETOP_PREP(accept);
4982 IO_NETOP_FN(send);
4983 IO_NETOP_FN(recv);
4984 #endif /* CONFIG_NET */
4985
4986 struct io_poll_table {
4987         struct poll_table_struct pt;
4988         struct io_kiocb *req;
4989         int nr_entries;
4990         int error;
4991 };
4992
/*
 * Shared wakeup helper for poll wait entries.
 *
 * Removes poll's wait entry from its list, stores @mask in req->result and
 * queues @func for the request through io_req_task_work_add().
 *
 * Returns 0 (leaving everything untouched) when @mask is non-zero but shares
 * no bit with poll->events; returns 1 once the task work has been queued.
 */
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
                           __poll_t mask, io_req_tw_func_t func)
{
        /* for instances that support it check for an event match first: */
        if (mask && !(mask & poll->events))
                return 0;

        trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);

        list_del_init(&poll->wait.entry);

        req->result = mask;
        req->io_task_work.func = func;

        /*
         * Queue @func as task work. No result is checked here, so handling
         * of an exiting task presumably happens inside
         * io_req_task_work_add() or in @func itself -- TODO confirm.
         */
        io_req_task_work_add(req);
        return 1;
}
5016
/*
 * Re-check a woken poll request and decide whether it has to wait again.
 *
 * A request whose task is exiting is marked canceled. If req->result is
 * empty and the poll is not canceled, the file is polled once more for
 * poll->events. ctx->completion_lock is then taken and, if there is still
 * neither a result nor a cancelation, the wait entry is put back on
 * poll->head.
 *
 * Returns true if the entry was re-queued, false if the caller should go on
 * and finish the request. Either way ctx->completion_lock is held on return
 * and has to be released by the caller.
 */
static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
        __acquires(&req->ctx->completion_lock)
{
        struct io_ring_ctx *ctx = req->ctx;

        /* req->task == current here, checking PF_EXITING is safe */
        if (unlikely(req->task->flags & PF_EXITING))
                WRITE_ONCE(poll->canceled, true);

        /* unlocked re-poll: the wakeup may have carried no usable mask */
        if (!req->result && !READ_ONCE(poll->canceled)) {
                struct poll_table_struct pt = { ._key = poll->events };

                req->result = vfs_poll(req->file, &pt) & poll->events;
        }

        /* re-evaluate under the lock; canceled may have been set meanwhile */
        spin_lock(&ctx->completion_lock);
        if (!req->result && !READ_ONCE(poll->canceled)) {
                add_wait_queue(poll->head, &poll->wait);
                return true;
        }

        return false;
}
5040
5041 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
5042 {
5043         /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
5044         if (req->opcode == IORING_OP_POLL_ADD)
5045                 return req->async_data;
5046         return req->apoll->double_poll;
5047 }
5048
5049 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5050 {
5051         if (req->opcode == IORING_OP_POLL_ADD)
5052                 return &req->poll;
5053         return &req->apoll->poll;
5054 }
5055
5056 static void io_poll_remove_double(struct io_kiocb *req)
5057         __must_hold(&req->ctx->completion_lock)
5058 {
5059         struct io_poll_iocb *poll = io_poll_get_double(req);
5060
5061         lockdep_assert_held(&req->ctx->completion_lock);
5062
5063         if (poll && poll->head) {
5064                 struct wait_queue_head *head = poll->head;
5065
5066                 spin_lock_irq(&head->lock);
5067                 list_del_init(&poll->wait.entry);
5068                 if (poll->wait.private)
5069                         req_ref_put(req);
5070                 poll->head = NULL;
5071                 spin_unlock_irq(&head->lock);
5072         }
5073 }
5074
5075 static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
5076         __must_hold(&req->ctx->completion_lock)
5077 {
5078         struct io_ring_ctx *ctx = req->ctx;
5079         unsigned flags = IORING_CQE_F_MORE;
5080         int error;
5081
5082         if (READ_ONCE(req->poll.canceled)) {
5083                 error = -ECANCELED;
5084                 req->poll.events |= EPOLLONESHOT;
5085         } else {
5086                 error = mangle_poll(mask);
5087         }
5088         if (req->poll.events & EPOLLONESHOT)
5089                 flags = 0;
5090         if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
5091                 req->poll.done = true;
5092                 flags = 0;
5093         }
5094         if (flags & IORING_CQE_F_MORE)
5095                 ctx->cq_extra++;
5096
5097         io_commit_cqring(ctx);
5098         return !(flags & IORING_CQE_F_MORE);
5099 }
5100
5101 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
5102 {
5103         struct io_ring_ctx *ctx = req->ctx;
5104         struct io_kiocb *nxt;
5105
5106         if (io_poll_rewait(req, &req->poll)) {
5107                 spin_unlock(&ctx->completion_lock);
5108         } else {
5109                 bool done;
5110
5111                 done = io_poll_complete(req, req->result);
5112                 if (done) {
5113                         io_poll_remove_double(req);
5114                         hash_del(&req->hash_node);
5115                 } else {
5116                         req->result = 0;
5117                         add_wait_queue(req->poll.head, &req->poll.wait);
5118                 }
5119                 spin_unlock(&ctx->completion_lock);
5120                 io_cqring_ev_posted(ctx);
5121
5122                 if (done) {
5123                         nxt = io_put_req_find_next(req);
5124                         if (nxt)
5125                                 io_req_task_submit(nxt, locked);
5126                 }
5127         }
5128 }
5129
/*
 * Wake handler installed on the second ("double") wait entry of a request.
 *
 * Returns 0 if @key carries a mask that does not intersect the primary
 * poll's events. For a primary poll without EPOLLONESHOT the call is simply
 * forwarded to the primary entry's wake function. Otherwise the double
 * entry is detached, the primary entry is detached too (if it is still
 * queued) and woken through its own handler, a request reference is dropped
 * and 1 is returned.
 */
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
                               int sync, void *key)
{
        struct io_kiocb *req = wait->private;
        struct io_poll_iocb *poll = io_poll_get_single(req);
        __poll_t mask = key_to_poll(key);
        unsigned long flags;

        /* for instances that support it check for an event match first: */
        if (mask && !(mask & poll->events))
                return 0;
        if (!(poll->events & EPOLLONESHOT))
                return poll->wait.func(&poll->wait, mode, sync, key);

        list_del_init(&wait->entry);

        if (poll->head) {
                bool done;

                spin_lock_irqsave(&poll->head->lock, flags);
                /* an empty entry means the primary poll is no longer queued */
                done = list_empty(&poll->wait.entry);
                if (!done)
                        list_del_init(&poll->wait.entry);
                /* make sure double remove sees this as being gone */
                wait->private = NULL;
                spin_unlock_irqrestore(&poll->head->lock, flags);
                if (!done) {
                        /* use wait func handler, so it matches the rq type */
                        poll->wait.func(&poll->wait, mode, sync, key);
                }
        }
        req_ref_put(req);
        return 1;
}
5164
5165 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5166                               wait_queue_func_t wake_func)
5167 {
5168         poll->head = NULL;
5169         poll->done = false;
5170         poll->canceled = false;
5171 #define IO_POLL_UNMASK  (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5172         /* mask in events that we always want/need */
5173         poll->events = events | IO_POLL_UNMASK;
5174         INIT_LIST_HEAD(&poll->wait.entry);
5175         init_waitqueue_func_entry(&poll->wait, wake_func);
5176 }
5177
/*
 * Queue a wait entry on @head on behalf of the request in @pt.
 *
 * The first call (pt->nr_entries == 0) queues @poll itself. A later call
 * for a different waitqueue head allocates a second io_poll_iocb, stores it
 * in *@poll_ptr, takes a request reference for it and queues that instead;
 * the primary poll is switched to one-shot mode in that case. Repeated
 * calls for a head that is already covered are ignored.
 *
 * Failures are reported through pt->error: -EINVAL for a third distinct
 * head, -ENOMEM if the second entry cannot be allocated.
 */
static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
                            struct wait_queue_head *head,
                            struct io_poll_iocb **poll_ptr)
{
        struct io_kiocb *req = pt->req;

        /*
         * The file being polled uses multiple waitqueues for poll handling
         * (e.g. one for read, one for write). Setup a separate io_poll_iocb
         * if this happens.
         */
        if (unlikely(pt->nr_entries)) {
                struct io_poll_iocb *poll_one = poll;

                /* double add on the same waitqueue head, ignore */
                if (poll_one->head == head)
                        return;
                /* already have a 2nd entry, fail a third attempt */
                if (*poll_ptr) {
                        if ((*poll_ptr)->head == head)
                                return;
                        pt->error = -EINVAL;
                        return;
                }
                /*
                 * Can't handle multishot for double wait for now, turn it
                 * into one-shot mode.
                 */
                if (!(poll_one->events & EPOLLONESHOT))
                        poll_one->events |= EPOLLONESHOT;
                poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
                if (!poll) {
                        pt->error = -ENOMEM;
                        return;
                }
                /* the second entry inherits the primary's event mask */
                io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
                /* reference owned by the double entry, see ->wait.private */
                req_ref_get(req);
                poll->wait.private = req;
                *poll_ptr = poll;
        }

        pt->nr_entries++;
        poll->head = head;

        if (poll->events & EPOLLEXCLUSIVE)
                add_wait_queue_exclusive(head, &poll->wait);
        else
                add_wait_queue(head, &poll->wait);
}
5227
5228 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5229                                struct poll_table_struct *p)
5230 {
5231         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5232         struct async_poll *apoll = pt->req->apoll;
5233
5234         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5235 }
5236
5237 static void io_async_task_func(struct io_kiocb *req, bool *locked)
5238 {
5239         struct async_poll *apoll = req->apoll;
5240         struct io_ring_ctx *ctx = req->ctx;
5241
5242         trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
5243
5244         if (io_poll_rewait(req, &apoll->poll)) {
5245                 spin_unlock(&ctx->completion_lock);
5246                 return;
5247         }
5248
5249         hash_del(&req->hash_node);
5250         io_poll_remove_double(req);
5251         spin_unlock(&ctx->completion_lock);
5252
5253         if (!READ_ONCE(apoll->poll.canceled))
5254                 io_req_task_submit(req, locked);
5255         else
5256                 io_req_complete_failed(req, -ECANCELED);
5257 }
5258
5259 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5260                         void *key)
5261 {
5262         struct io_kiocb *req = wait->private;
5263         struct io_poll_iocb *poll = &req->apoll->poll;
5264
5265         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5266                                         key_to_poll(key));
5267
5268         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5269 }
5270
5271 static void io_poll_req_insert(struct io_kiocb *req)
5272 {
5273         struct io_ring_ctx *ctx = req->ctx;
5274         struct hlist_head *list;
5275
5276         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5277         hlist_add_head(&req->hash_node, list);
5278 }
5279
/*
 * Arm @poll for @req: initialise it with @mask and @wake_func, then call
 * vfs_poll() with @ipt so the queue callback in ipt->pt can attach the wait
 * entries.
 *
 * Returns the events vfs_poll() reported, restricted to poll->events (0 if
 * the wait entry was found already removed from its waitqueue). An error
 * is left in ipt->error; -EINVAL is set there if no entry was queued and no
 * other error was recorded.
 *
 * ctx->completion_lock is taken here and is still held on return; the
 * caller has to release it.
 */
static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
                                      struct io_poll_iocb *poll,
                                      struct io_poll_table *ipt, __poll_t mask,
                                      wait_queue_func_t wake_func)
        __acquires(&ctx->completion_lock)
{
        struct io_ring_ctx *ctx = req->ctx;
        bool cancel = false;

        INIT_HLIST_NODE(&req->hash_node);
        io_init_poll_iocb(poll, mask, wake_func);
        poll->file = req->file;
        poll->wait.private = req;

        ipt->pt._key = mask;
        ipt->req = req;
        ipt->error = 0;
        ipt->nr_entries = 0;

        mask = vfs_poll(req->file, &ipt->pt) & poll->events;
        if (unlikely(!ipt->nr_entries) && !ipt->error)
                ipt->error = -EINVAL;

        spin_lock(&ctx->completion_lock);
        /* failed, or a one-shot poll that is already satisfied */
        if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
                io_poll_remove_double(req);
        if (likely(poll->head)) {
                spin_lock_irq(&poll->head->lock);
                /*
                 * The entry is already off its waitqueue: drop mask and
                 * error here, turning a pending error into a cancelation.
                 */
                if (unlikely(list_empty(&poll->wait.entry))) {
                        if (ipt->error)
                                cancel = true;
                        ipt->error = 0;
                        mask = 0;
                }
                if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
                        list_del_init(&poll->wait.entry);
                else if (cancel)
                        WRITE_ONCE(poll->canceled, true);
                else if (!poll->done) /* actually waiting for an event */
                        io_poll_req_insert(req);
                spin_unlock_irq(&poll->head->lock);
        }

        return mask;
}
5325
/* Result codes returned by io_arm_poll_handler(). */
enum {
        IO_APOLL_OK,            /* poll handler armed, no event pending yet */
        IO_APOLL_ABORTED,       /* poll not armed: unsupported, or arming failed */
        IO_APOLL_READY          /* a requested event was already reported */
};
5331
5332 static int io_arm_poll_handler(struct io_kiocb *req)
5333 {
5334         const struct io_op_def *def = &io_op_defs[req->opcode];
5335         struct io_ring_ctx *ctx = req->ctx;
5336         struct async_poll *apoll;
5337         struct io_poll_table ipt;
5338         __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
5339         int rw;
5340
5341         if (!req->file || !file_can_poll(req->file))
5342                 return IO_APOLL_ABORTED;
5343         if (req->flags & REQ_F_POLLED)
5344                 return IO_APOLL_ABORTED;
5345         if (!def->pollin && !def->pollout)
5346                 return IO_APOLL_ABORTED;
5347
5348         if (def->pollin) {
5349                 rw = READ;
5350                 mask |= POLLIN | POLLRDNORM;
5351
5352                 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5353                 if ((req->opcode == IORING_OP_RECVMSG) &&
5354                     (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5355                         mask &= ~POLLIN;
5356         } else {
5357                 rw = WRITE;
5358                 mask |= POLLOUT | POLLWRNORM;
5359         }
5360
5361         /* if we can't nonblock try, then no point in arming a poll handler */
5362         if (!io_file_supports_nowait(req, rw))
5363                 return IO_APOLL_ABORTED;
5364
5365         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5366         if (unlikely(!apoll))
5367                 return IO_APOLL_ABORTED;
5368         apoll->double_poll = NULL;
5369         req->apoll = apoll;
5370         req->flags |= REQ_F_POLLED;
5371         ipt.pt._qproc = io_async_queue_proc;
5372         io_req_set_refcount(req);
5373
5374         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5375                                         io_async_wake);
5376         spin_unlock(&ctx->completion_lock);
5377         if (ret || ipt.error)
5378                 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
5379
5380         trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
5381                                 mask, apoll->poll.events);
5382         return IO_APOLL_OK;
5383 }
5384
/*
 * Take @poll off its waitqueue and unhash @req. Must be called with
 * ctx->completion_lock held.
 *
 * With @do_cancel the poll is also marked canceled. Nothing at all happens
 * (and false is returned) if @poll has no waitqueue head.
 *
 * Returns true if the wait entry was still queued and has been removed by
 * this call, false otherwise.
 */
static bool __io_poll_remove_one(struct io_kiocb *req,
                                 struct io_poll_iocb *poll, bool do_cancel)
        __must_hold(&req->ctx->completion_lock)
{
        bool do_complete = false;

        if (!poll->head)
                return false;
        spin_lock_irq(&poll->head->lock);
        if (do_cancel)
                WRITE_ONCE(poll->canceled, true);
        /* only the caller that finds the entry queued gets to complete it */
        if (!list_empty(&poll->wait.entry)) {
                list_del_init(&poll->wait.entry);
                do_complete = true;
        }
        spin_unlock_irq(&poll->head->lock);
        hash_del(&req->hash_node);
        return do_complete;
}
5404
5405 static bool io_poll_remove_one(struct io_kiocb *req)
5406         __must_hold(&req->ctx->completion_lock)
5407 {
5408         bool do_complete;
5409
5410         io_poll_remove_double(req);
5411         do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
5412
5413         if (do_complete) {
5414                 io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
5415                 io_commit_cqring(req->ctx);
5416                 req_set_fail(req);
5417                 io_put_req_deferred(req);
5418         }
5419         return do_complete;
5420 }
5421
5422 /*
5423  * Returns true if we found and killed one or more poll requests
5424  */
5425 static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5426                                bool cancel_all)
5427 {
5428         struct hlist_node *tmp;
5429         struct io_kiocb *req;
5430         int posted = 0, i;
5431
5432         spin_lock(&ctx->completion_lock);
5433         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5434                 struct hlist_head *list;
5435
5436                 list = &ctx->cancel_hash[i];
5437                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5438                         if (io_match_task(req, tsk, cancel_all))
5439                                 posted += io_poll_remove_one(req);
5440                 }
5441         }
5442         spin_unlock(&ctx->completion_lock);
5443
5444         if (posted)
5445                 io_cqring_ev_posted(ctx);
5446
5447         return posted != 0;
5448 }
5449
5450 static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5451                                      bool poll_only)
5452         __must_hold(&ctx->completion_lock)
5453 {
5454         struct hlist_head *list;
5455         struct io_kiocb *req;
5456
5457         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5458         hlist_for_each_entry(req, list, hash_node) {
5459                 if (sqe_addr != req->user_data)
5460                         continue;
5461                 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
5462                         continue;
5463                 return req;
5464         }
5465         return NULL;
5466 }
5467
5468 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5469                           bool poll_only)
5470         __must_hold(&ctx->completion_lock)
5471 {
5472         struct io_kiocb *req;
5473
5474         req = io_poll_find(ctx, sqe_addr, poll_only);
5475         if (!req)
5476                 return -ENOENT;
5477         if (io_poll_remove_one(req))
5478                 return 0;
5479
5480         return -EALREADY;
5481 }
5482
5483 static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5484                                      unsigned int flags)
5485 {
5486         u32 events;
5487
5488         events = READ_ONCE(sqe->poll32_events);
5489 #ifdef __BIG_ENDIAN
5490         events = swahw32(events);
5491 #endif
5492         if (!(flags & IORING_POLL_ADD_MULTI))
5493                 events |= EPOLLONESHOT;
5494         return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
5495 }
5496
5497 static int io_poll_update_prep(struct io_kiocb *req,
5498                                const struct io_uring_sqe *sqe)
5499 {
5500         struct io_poll_update *upd = &req->poll_update;
5501         u32 flags;
5502
5503         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5504                 return -EINVAL;
5505         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
5506                 return -EINVAL;
5507         flags = READ_ONCE(sqe->len);
5508         if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5509                       IORING_POLL_ADD_MULTI))
5510                 return -EINVAL;
5511         /* meaningless without update */
5512         if (flags == IORING_POLL_ADD_MULTI)
5513                 return -EINVAL;
5514
5515         upd->old_user_data = READ_ONCE(sqe->addr);
5516         upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5517         upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
5518
5519         upd->new_user_data = READ_ONCE(sqe->off);
5520         if (!upd->update_user_data && upd->new_user_data)
5521                 return -EINVAL;
5522         if (upd->update_events)
5523                 upd->events = io_poll_parse_events(sqe, flags);
5524         else if (sqe->poll32_events)
5525                 return -EINVAL;
5526
5527         return 0;
5528 }
5529
5530 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5531                         void *key)
5532 {
5533         struct io_kiocb *req = wait->private;
5534         struct io_poll_iocb *poll = &req->poll;
5535
5536         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5537 }
5538
5539 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5540                                struct poll_table_struct *p)
5541 {
5542         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5543
5544         __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5545 }
5546
5547 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5548 {
5549         struct io_poll_iocb *poll = &req->poll;
5550         u32 flags;
5551
5552         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5553                 return -EINVAL;
5554         if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
5555                 return -EINVAL;
5556         flags = READ_ONCE(sqe->len);
5557         if (flags & ~IORING_POLL_ADD_MULTI)
5558                 return -EINVAL;
5559
5560         io_req_set_refcount(req);
5561         poll->events = io_poll_parse_events(sqe, flags);
5562         return 0;
5563 }
5564
/*
 * Issue an IORING_OP_POLL_ADD request.
 *
 * Arms the poll through __io_arm_poll_handler(), which returns with
 * ctx->completion_lock held. If an event was already pending, a CQE is
 * posted right away and any arming error is discarded; for a one-shot poll
 * the request is then put.
 *
 * Returns the error recorded in the poll table, or 0.
 */
static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_poll_iocb *poll = &req->poll;
        struct io_ring_ctx *ctx = req->ctx;
        struct io_poll_table ipt;
        __poll_t mask;

        ipt.pt._qproc = io_poll_queue_proc;

        mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
                                        io_poll_wake);

        if (mask) { /* no async, we'd stolen it */
                ipt.error = 0;
                io_poll_complete(req, mask);
        }
        /* pairs with the lock taken inside __io_arm_poll_handler() */
        spin_unlock(&ctx->completion_lock);

        if (mask) {
                io_cqring_ev_posted(ctx);
                if (poll->events & EPOLLONESHOT)
                        io_put_req(req);
        }
        return ipt.error;
}
5590
/*
 * Issue a poll update/remove request against the IORING_OP_POLL_ADD request
 * whose user_data equals req->poll_update.old_user_data.
 *
 * With neither update flag set the target is simply canceled. Otherwise
 * the target is taken off its waitqueue, its events and/or user_data are
 * rewritten, and it is re-issued through io_poll_add() unless it was found
 * to be completing concurrently.
 *
 * The outcome is reported through the CQE of @req: 0 on success, -ENOENT if
 * no target was found, -EALREADY if the target could not be removed (or is
 * a one-shot poll already completing). The function itself always
 * returns 0.
 */
static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *preq;
        bool completing;
        int ret;

        spin_lock(&ctx->completion_lock);
        preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
        if (!preq) {
                ret = -ENOENT;
                goto err;
        }

        /* plain removal: never re-issue the target afterwards */
        if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
                completing = true;
                ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
                goto err;
        }

        /*
         * Don't allow racy completion with singleshot, as we cannot safely
         * update those. For multishot, if we're racing with completion, just
         * let completion re-add it.
         */
        completing = !__io_poll_remove_one(preq, &preq->poll, false);
        if (completing && (preq->poll.events & EPOLLONESHOT)) {
                ret = -EALREADY;
                goto err;
        }
        /* we now have a detached poll request. reissue. */
        ret = 0;
err:
        /* reached on success too; only ret < 0 takes the failure exit */
        if (ret < 0) {
                spin_unlock(&ctx->completion_lock);
                req_set_fail(req);
                io_req_complete(req, ret);
                return 0;
        }
        /* only mask one event flags, keep behavior flags */
        if (req->poll_update.update_events) {
                preq->poll.events &= ~0xffff;
                preq->poll.events |= req->poll_update.events & 0xffff;
                preq->poll.events |= IO_POLL_UNMASK;
        }
        if (req->poll_update.update_user_data)
                preq->user_data = req->poll_update.new_user_data;
        spin_unlock(&ctx->completion_lock);

        /* complete update request, we're done with it */
        io_req_complete(req, ret);

        if (!completing) {
                ret = io_poll_add(preq, issue_flags);
                if (ret < 0) {
                        req_set_fail(preq);
                        io_req_complete(preq, ret);
                }
        }
        return 0;
}
5652
/*
 * Task-work callback installed by io_timeout_fn(): mark the request failed
 * and post its completion with -ETIME. @locked is not used.
 */
static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
{
        req_set_fail(req);
        io_req_complete_post(req, -ETIME, 0);
}
5658
5659 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5660 {
5661         struct io_timeout_data *data = container_of(timer,
5662                                                 struct io_timeout_data, timer);
5663         struct io_kiocb *req = data->req;
5664         struct io_ring_ctx *ctx = req->ctx;
5665         unsigned long flags;
5666
5667         spin_lock_irqsave(&ctx->timeout_lock, flags);
5668         list_del_init(&req->timeout.list);
5669         atomic_set(&req->ctx->cq_timeouts,
5670                 atomic_read(&req->ctx->cq_timeouts) + 1);
5671         spin_unlock_irqrestore(&ctx->timeout_lock, flags);
5672
5673         req->io_task_work.func = io_req_task_timeout;
5674         io_req_task_work_add(req);
5675         return HRTIMER_NORESTART;
5676 }
5677
5678 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5679                                            __u64 user_data)
5680         __must_hold(&ctx->timeout_lock)
5681 {
5682         struct io_timeout_data *io;
5683         struct io_kiocb *req;
5684         bool found = false;
5685
5686         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5687                 found = user_data == req->user_data;
5688                 if (found)
5689                         break;
5690         }
5691         if (!found)
5692                 return ERR_PTR(-ENOENT);
5693
5694         io = req->async_data;
5695         if (hrtimer_try_to_cancel(&io->timer) == -1)
5696                 return ERR_PTR(-EALREADY);
5697         list_del_init(&req->timeout.list);
5698         return req;
5699 }
5700
5701 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5702         __must_hold(&ctx->completion_lock)
5703         __must_hold(&ctx->timeout_lock)
5704 {
5705         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5706
5707         if (IS_ERR(req))
5708                 return PTR_ERR(req);
5709
5710         req_set_fail(req);
5711         io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
5712         io_put_req_deferred(req);
5713         return 0;
5714 }
5715
5716 static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
5717 {
5718         switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
5719         case IORING_TIMEOUT_BOOTTIME:
5720                 return CLOCK_BOOTTIME;
5721         case IORING_TIMEOUT_REALTIME:
5722                 return CLOCK_REALTIME;
5723         default:
5724                 /* can't happen, vetted at prep time */
5725                 WARN_ON_ONCE(1);
5726                 fallthrough;
5727         case 0:
5728                 return CLOCK_MONOTONIC;
5729         }
5730 }
5731
/*
 * Re-arm an existing timeout with a new expiry. Must be called with
 * ctx->timeout_lock held.
 *
 * The timeout is looked up by @user_data through io_timeout_extract(),
 * which also cancels its timer and unlinks it. It is then put back at the
 * tail of ctx->timeout_list with timeout.off cleared, and its hrtimer is
 * re-initialised on the clock chosen by io_timeout_get_clock() and started
 * with @ts and @mode.
 *
 * Returns 0 on success or the error from io_timeout_extract().
 */
static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
                             struct timespec64 *ts, enum hrtimer_mode mode)
        __must_hold(&ctx->timeout_lock)
{
        struct io_kiocb *req = io_timeout_extract(ctx, user_data);
        struct io_timeout_data *data;

        if (IS_ERR(req))
                return PTR_ERR(req);

        req->timeout.off = 0; /* noseq */
        data = req->async_data;
        list_add_tail(&req->timeout.list, &ctx->timeout_list);
        /* hrtimer_init() is followed by re-installing the callback */
        hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
        data->timer.function = io_timeout_fn;
        hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
        return 0;
}
5750
5751 static int io_timeout_remove_prep(struct io_kiocb *req,
5752                                   const struct io_uring_sqe *sqe)
5753 {
5754         struct io_timeout_rem *tr = &req->timeout_rem;
5755
5756         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5757                 return -EINVAL;
5758         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5759                 return -EINVAL;
5760         if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
5761                 return -EINVAL;
5762
5763         tr->addr = READ_ONCE(sqe->addr);
5764         tr->flags = READ_ONCE(sqe->timeout_flags);
5765         if (tr->flags & IORING_TIMEOUT_UPDATE) {
5766                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5767                         return -EINVAL;
5768                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5769                         return -EFAULT;
5770         } else if (tr->flags) {
5771                 /* timeout removal doesn't support flags */
5772                 return -EINVAL;
5773         }
5774
5775         return 0;
5776 }
5777
5778 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5779 {
5780         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5781                                             : HRTIMER_MODE_REL;
5782 }
5783
5784 /*
5785  * Remove or update an existing timeout command
5786  */
5787 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
5788 {
5789         struct io_timeout_rem *tr = &req->timeout_rem;
5790         struct io_ring_ctx *ctx = req->ctx;
5791         int ret;
5792
5793         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
5794                 spin_lock(&ctx->completion_lock);
5795                 spin_lock_irq(&ctx->timeout_lock);
5796                 ret = io_timeout_cancel(ctx, tr->addr);
5797                 spin_unlock_irq(&ctx->timeout_lock);
5798                 spin_unlock(&ctx->completion_lock);
5799         } else {
5800                 spin_lock_irq(&ctx->timeout_lock);
5801                 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5802                                         io_translate_timeout_mode(tr->flags));
5803                 spin_unlock_irq(&ctx->timeout_lock);
5804         }
5805
5806         if (ret < 0)
5807                 req_set_fail(req);
5808         io_req_complete_post(req, ret, 0);
5809         return 0;
5810 }
5811
5812 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5813                            bool is_timeout_link)
5814 {
5815         struct io_timeout_data *data;
5816         unsigned flags;
5817         u32 off = READ_ONCE(sqe->off);
5818
5819         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5820                 return -EINVAL;
5821         if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
5822             sqe->splice_fd_in)
5823                 return -EINVAL;
5824         if (off && is_timeout_link)
5825                 return -EINVAL;
5826         flags = READ_ONCE(sqe->timeout_flags);
5827         if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
5828                 return -EINVAL;
5829         /* more than one clock specified is invalid, obviously */
5830         if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
5831                 return -EINVAL;
5832
5833         req->timeout.off = off;
5834         if (unlikely(off && !req->ctx->off_timeout_used))
5835                 req->ctx->off_timeout_used = true;
5836
5837         if (!req->async_data && io_alloc_async_data(req))
5838                 return -ENOMEM;
5839
5840         data = req->async_data;
5841         data->req = req;
5842         data->flags = flags;
5843
5844         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5845                 return -EFAULT;
5846
5847         data->mode = io_translate_timeout_mode(flags);
5848         hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
5849
5850         if (is_timeout_link) {
5851                 struct io_submit_link *link = &req->ctx->submit_state.link;
5852
5853                 if (!link->head)
5854                         return -EINVAL;
5855                 if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
5856                         return -EINVAL;
5857                 req->timeout.head = link->last;
5858                 link->last->flags |= REQ_F_ARM_LTIMEOUT;
5859         }
5860         return 0;
5861 }
5862
5863 static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
5864 {
5865         struct io_ring_ctx *ctx = req->ctx;
5866         struct io_timeout_data *data = req->async_data;
5867         struct list_head *entry;
5868         u32 tail, off = req->timeout.off;
5869
5870         spin_lock_irq(&ctx->timeout_lock);
5871
5872         /*
5873          * sqe->off holds how many events that need to occur for this
5874          * timeout event to be satisfied. If it isn't set, then this is
5875          * a pure timeout request, sequence isn't used.
5876          */
5877         if (io_is_timeout_noseq(req)) {
5878                 entry = ctx->timeout_list.prev;
5879                 goto add;
5880         }
5881
5882         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5883         req->timeout.target_seq = tail + off;
5884
5885         /* Update the last seq here in case io_flush_timeouts() hasn't.
5886          * This is safe because ->completion_lock is held, and submissions
5887          * and completions are never mixed in the same ->completion_lock section.
5888          */
5889         ctx->cq_last_tm_flush = tail;
5890
5891         /*
5892          * Insertion sort, ensuring the first entry in the list is always
5893          * the one we need first.
5894          */
5895         list_for_each_prev(entry, &ctx->timeout_list) {
5896                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5897                                                   timeout.list);
5898
5899                 if (io_is_timeout_noseq(nxt))
5900                         continue;
5901                 /* nxt.seq is behind @tail, otherwise would've been completed */
5902                 if (off >= nxt->timeout.target_seq - tail)
5903                         break;
5904         }
5905 add:
5906         list_add(&req->timeout.list, entry);
5907         data->timer.function = io_timeout_fn;
5908         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5909         spin_unlock_irq(&ctx->timeout_lock);
5910         return 0;
5911 }
5912
5913 struct io_cancel_data {
5914         struct io_ring_ctx *ctx;
5915         u64 user_data;
5916 };
5917
5918 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5919 {
5920         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5921         struct io_cancel_data *cd = data;
5922
5923         return req->ctx == cd->ctx && req->user_data == cd->user_data;
5924 }
5925
5926 static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
5927                                struct io_ring_ctx *ctx)
5928 {
5929         struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
5930         enum io_wq_cancel cancel_ret;
5931         int ret = 0;
5932
5933         if (!tctx || !tctx->io_wq)
5934                 return -ENOENT;
5935
5936         cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
5937         switch (cancel_ret) {
5938         case IO_WQ_CANCEL_OK:
5939                 ret = 0;
5940                 break;
5941         case IO_WQ_CANCEL_RUNNING:
5942                 ret = -EALREADY;
5943                 break;
5944         case IO_WQ_CANCEL_NOTFOUND:
5945                 ret = -ENOENT;
5946                 break;
5947         }
5948
5949         return ret;
5950 }
5951
5952 static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
5953 {
5954         struct io_ring_ctx *ctx = req->ctx;
5955         int ret;
5956
5957         WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
5958
5959         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
5960         if (ret != -ENOENT)
5961                 return ret;
5962
5963         spin_lock(&ctx->completion_lock);
5964         spin_lock_irq(&ctx->timeout_lock);
5965         ret = io_timeout_cancel(ctx, sqe_addr);
5966         spin_unlock_irq(&ctx->timeout_lock);
5967         if (ret != -ENOENT)
5968                 goto out;
5969         ret = io_poll_cancel(ctx, sqe_addr, false);
5970 out:
5971         spin_unlock(&ctx->completion_lock);
5972         return ret;
5973 }
5974
5975 static int io_async_cancel_prep(struct io_kiocb *req,
5976                                 const struct io_uring_sqe *sqe)
5977 {
5978         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5979                 return -EINVAL;
5980         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5981                 return -EINVAL;
5982         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
5983             sqe->splice_fd_in)
5984                 return -EINVAL;
5985
5986         req->cancel.addr = READ_ONCE(sqe->addr);
5987         return 0;
5988 }
5989
5990 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
5991 {
5992         struct io_ring_ctx *ctx = req->ctx;
5993         u64 sqe_addr = req->cancel.addr;
5994         struct io_tctx_node *node;
5995         int ret;
5996
5997         ret = io_try_cancel_userdata(req, sqe_addr);
5998         if (ret != -ENOENT)
5999                 goto done;
6000
6001         /* slow path, try all io-wq's */
6002         io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6003         ret = -ENOENT;
6004         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6005                 struct io_uring_task *tctx = node->task->io_uring;
6006
6007                 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6008                 if (ret != -ENOENT)
6009                         break;
6010         }
6011         io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6012 done:
6013         if (ret < 0)
6014                 req_set_fail(req);
6015         io_req_complete_post(req, ret, 0);
6016         return 0;
6017 }
6018
6019 static int io_rsrc_update_prep(struct io_kiocb *req,
6020                                 const struct io_uring_sqe *sqe)
6021 {
6022         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6023                 return -EINVAL;
6024         if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
6025                 return -EINVAL;
6026
6027         req->rsrc_update.offset = READ_ONCE(sqe->off);
6028         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6029         if (!req->rsrc_update.nr_args)
6030                 return -EINVAL;
6031         req->rsrc_update.arg = READ_ONCE(sqe->addr);
6032         return 0;
6033 }
6034
6035 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
6036 {
6037         struct io_ring_ctx *ctx = req->ctx;
6038         struct io_uring_rsrc_update2 up;
6039         int ret;
6040
6041         if (issue_flags & IO_URING_F_NONBLOCK)
6042                 return -EAGAIN;
6043
6044         up.offset = req->rsrc_update.offset;
6045         up.data = req->rsrc_update.arg;
6046         up.nr = 0;
6047         up.tags = 0;
6048         up.resv = 0;
6049
6050         mutex_lock(&ctx->uring_lock);
6051         ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
6052                                         &up, req->rsrc_update.nr_args);
6053         mutex_unlock(&ctx->uring_lock);
6054
6055         if (ret < 0)
6056                 req_set_fail(req);
6057         __io_req_complete(req, issue_flags, ret, 0);
6058         return 0;
6059 }
6060
6061 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6062 {
6063         switch (req->opcode) {
6064         case IORING_OP_NOP:
6065                 return 0;
6066         case IORING_OP_READV:
6067         case IORING_OP_READ_FIXED:
6068         case IORING_OP_READ:
6069                 return io_read_prep(req, sqe);
6070         case IORING_OP_WRITEV:
6071         case IORING_OP_WRITE_FIXED:
6072         case IORING_OP_WRITE:
6073                 return io_write_prep(req, sqe);
6074         case IORING_OP_POLL_ADD:
6075                 return io_poll_add_prep(req, sqe);
6076         case IORING_OP_POLL_REMOVE:
6077                 return io_poll_update_prep(req, sqe);
6078         case IORING_OP_FSYNC:
6079                 return io_fsync_prep(req, sqe);
6080         case IORING_OP_SYNC_FILE_RANGE:
6081                 return io_sfr_prep(req, sqe);
6082         case IORING_OP_SENDMSG:
6083         case IORING_OP_SEND:
6084                 return io_sendmsg_prep(req, sqe);
6085         case IORING_OP_RECVMSG:
6086         case IORING_OP_RECV:
6087                 return io_recvmsg_prep(req, sqe);
6088         case IORING_OP_CONNECT:
6089                 return io_connect_prep(req, sqe);
6090         case IORING_OP_TIMEOUT:
6091                 return io_timeout_prep(req, sqe, false);
6092         case IORING_OP_TIMEOUT_REMOVE:
6093                 return io_timeout_remove_prep(req, sqe);
6094         case IORING_OP_ASYNC_CANCEL:
6095                 return io_async_cancel_prep(req, sqe);
6096         case IORING_OP_LINK_TIMEOUT:
6097                 return io_timeout_prep(req, sqe, true);
6098         case IORING_OP_ACCEPT:
6099                 return io_accept_prep(req, sqe);
6100         case IORING_OP_FALLOCATE:
6101                 return io_fallocate_prep(req, sqe);
6102         case IORING_OP_OPENAT:
6103                 return io_openat_prep(req, sqe);
6104         case IORING_OP_CLOSE:
6105                 return io_close_prep(req, sqe);
6106         case IORING_OP_FILES_UPDATE:
6107                 return io_rsrc_update_prep(req, sqe);
6108         case IORING_OP_STATX:
6109                 return io_statx_prep(req, sqe);
6110         case IORING_OP_FADVISE:
6111                 return io_fadvise_prep(req, sqe);
6112         case IORING_OP_MADVISE:
6113                 return io_madvise_prep(req, sqe);
6114         case IORING_OP_OPENAT2:
6115                 return io_openat2_prep(req, sqe);
6116         case IORING_OP_EPOLL_CTL:
6117                 return io_epoll_ctl_prep(req, sqe);
6118         case IORING_OP_SPLICE:
6119                 return io_splice_prep(req, sqe);
6120         case IORING_OP_PROVIDE_BUFFERS:
6121                 return io_provide_buffers_prep(req, sqe);
6122         case IORING_OP_REMOVE_BUFFERS:
6123                 return io_remove_buffers_prep(req, sqe);
6124         case IORING_OP_TEE:
6125                 return io_tee_prep(req, sqe);
6126         case IORING_OP_SHUTDOWN:
6127                 return io_shutdown_prep(req, sqe);
6128         case IORING_OP_RENAMEAT:
6129                 return io_renameat_prep(req, sqe);
6130         case IORING_OP_UNLINKAT:
6131                 return io_unlinkat_prep(req, sqe);
6132         }
6133
6134         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6135                         req->opcode);
6136         return -EINVAL;
6137 }
6138
6139 static int io_req_prep_async(struct io_kiocb *req)
6140 {
6141         if (!io_op_defs[req->opcode].needs_async_setup)
6142                 return 0;
6143         if (WARN_ON_ONCE(req->async_data))
6144                 return -EFAULT;
6145         if (io_alloc_async_data(req))
6146                 return -EAGAIN;
6147
6148         switch (req->opcode) {
6149         case IORING_OP_READV:
6150                 return io_rw_prep_async(req, READ);
6151         case IORING_OP_WRITEV:
6152                 return io_rw_prep_async(req, WRITE);
6153         case IORING_OP_SENDMSG:
6154                 return io_sendmsg_prep_async(req);
6155         case IORING_OP_RECVMSG:
6156                 return io_recvmsg_prep_async(req);
6157         case IORING_OP_CONNECT:
6158                 return io_connect_prep_async(req);
6159         }
6160         printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6161                     req->opcode);
6162         return -EFAULT;
6163 }
6164
6165 static u32 io_get_sequence(struct io_kiocb *req)
6166 {
6167         u32 seq = req->ctx->cached_sq_head;
6168
6169         /* need original cached_sq_head, but it was increased for each req */
6170         io_for_each_link(req, req)
6171                 seq--;
6172         return seq;
6173 }
6174
6175 static bool io_drain_req(struct io_kiocb *req)
6176 {
6177         struct io_kiocb *pos;
6178         struct io_ring_ctx *ctx = req->ctx;
6179         struct io_defer_entry *de;
6180         int ret;
6181         u32 seq;
6182
6183         /*
6184          * If we need to drain a request in the middle of a link, drain the
6185          * head request and the next request/link after the current link.
6186          * Considering sequential execution of links, IOSQE_IO_DRAIN will be
6187          * maintained for every request of our link.
6188          */
6189         if (ctx->drain_next) {
6190                 req->flags |= REQ_F_IO_DRAIN;
6191                 ctx->drain_next = false;
6192         }
6193         /* not interested in head, start from the first linked */
6194         io_for_each_link(pos, req->link) {
6195                 if (pos->flags & REQ_F_IO_DRAIN) {
6196                         ctx->drain_next = true;
6197                         req->flags |= REQ_F_IO_DRAIN;
6198                         break;
6199                 }
6200         }
6201
6202         /* Still need defer if there is pending req in defer list. */
6203         if (likely(list_empty_careful(&ctx->defer_list) &&
6204                 !(req->flags & REQ_F_IO_DRAIN))) {
6205                 ctx->drain_active = false;
6206                 return false;
6207         }
6208
6209         seq = io_get_sequence(req);
6210         /* Still a chance to pass the sequence check */
6211         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
6212                 return false;
6213
6214         ret = io_req_prep_async(req);
6215         if (ret)
6216                 goto fail;
6217         io_prep_async_link(req);
6218         de = kmalloc(sizeof(*de), GFP_KERNEL);
6219         if (!de) {
6220                 ret = -ENOMEM;
6221 fail:
6222                 io_req_complete_failed(req, ret);
6223                 return true;
6224         }
6225
6226         spin_lock(&ctx->completion_lock);
6227         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
6228                 spin_unlock(&ctx->completion_lock);
6229                 kfree(de);
6230                 io_queue_async_work(req, NULL);
6231                 return true;
6232         }
6233
6234         trace_io_uring_defer(ctx, req, req->user_data);
6235         de->req = req;
6236         de->seq = seq;
6237         list_add_tail(&de->list, &ctx->defer_list);
6238         spin_unlock(&ctx->completion_lock);
6239         return true;
6240 }
6241
6242 static void io_clean_op(struct io_kiocb *req)
6243 {
6244         if (req->flags & REQ_F_BUFFER_SELECTED) {
6245                 switch (req->opcode) {
6246                 case IORING_OP_READV:
6247                 case IORING_OP_READ_FIXED:
6248                 case IORING_OP_READ:
6249                         kfree((void *)(unsigned long)req->rw.addr);
6250                         break;
6251                 case IORING_OP_RECVMSG:
6252                 case IORING_OP_RECV:
6253                         kfree(req->sr_msg.kbuf);
6254                         break;
6255                 }
6256         }
6257
6258         if (req->flags & REQ_F_NEED_CLEANUP) {
6259                 switch (req->opcode) {
6260                 case IORING_OP_READV:
6261                 case IORING_OP_READ_FIXED:
6262                 case IORING_OP_READ:
6263                 case IORING_OP_WRITEV:
6264                 case IORING_OP_WRITE_FIXED:
6265                 case IORING_OP_WRITE: {
6266                         struct io_async_rw *io = req->async_data;
6267
6268                         kfree(io->free_iovec);
6269                         break;
6270                         }
6271                 case IORING_OP_RECVMSG:
6272                 case IORING_OP_SENDMSG: {
6273                         struct io_async_msghdr *io = req->async_data;
6274
6275                         kfree(io->free_iov);
6276                         break;
6277                         }
6278                 case IORING_OP_SPLICE:
6279                 case IORING_OP_TEE:
6280                         if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
6281                                 io_put_file(req->splice.file_in);
6282                         break;
6283                 case IORING_OP_OPENAT:
6284                 case IORING_OP_OPENAT2:
6285                         if (req->open.filename)
6286                                 putname(req->open.filename);
6287                         break;
6288                 case IORING_OP_RENAMEAT:
6289                         putname(req->rename.oldpath);
6290                         putname(req->rename.newpath);
6291                         break;
6292                 case IORING_OP_UNLINKAT:
6293                         putname(req->unlink.filename);
6294                         break;
6295                 }
6296         }
6297         if ((req->flags & REQ_F_POLLED) && req->apoll) {
6298                 kfree(req->apoll->double_poll);
6299                 kfree(req->apoll);
6300                 req->apoll = NULL;
6301         }
6302         if (req->flags & REQ_F_INFLIGHT) {
6303                 struct io_uring_task *tctx = req->task->io_uring;
6304
6305                 atomic_dec(&tctx->inflight_tracked);
6306         }
6307         if (req->flags & REQ_F_CREDS)
6308                 put_cred(req->creds);
6309
6310         req->flags &= ~IO_REQ_CLEAN_FLAGS;
6311 }
6312
6313 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
6314 {
6315         struct io_ring_ctx *ctx = req->ctx;
6316         const struct cred *creds = NULL;
6317         int ret;
6318
6319         if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
6320                 creds = override_creds(req->creds);
6321
6322         switch (req->opcode) {
6323         case IORING_OP_NOP:
6324                 ret = io_nop(req, issue_flags);
6325                 break;
6326         case IORING_OP_READV:
6327         case IORING_OP_READ_FIXED:
6328         case IORING_OP_READ:
6329                 ret = io_read(req, issue_flags);
6330                 break;
6331         case IORING_OP_WRITEV:
6332         case IORING_OP_WRITE_FIXED:
6333         case IORING_OP_WRITE:
6334                 ret = io_write(req, issue_flags);
6335                 break;
6336         case IORING_OP_FSYNC:
6337                 ret = io_fsync(req, issue_flags);
6338                 break;
6339         case IORING_OP_POLL_ADD:
6340                 ret = io_poll_add(req, issue_flags);
6341                 break;
6342         case IORING_OP_POLL_REMOVE:
6343                 ret = io_poll_update(req, issue_flags);
6344                 break;
6345         case IORING_OP_SYNC_FILE_RANGE:
6346                 ret = io_sync_file_range(req, issue_flags);
6347                 break;
6348         case IORING_OP_SENDMSG:
6349                 ret = io_sendmsg(req, issue_flags);
6350                 break;
6351         case IORING_OP_SEND:
6352                 ret = io_send(req, issue_flags);
6353                 break;
6354         case IORING_OP_RECVMSG:
6355                 ret = io_recvmsg(req, issue_flags);
6356                 break;
6357         case IORING_OP_RECV:
6358                 ret = io_recv(req, issue_flags);
6359                 break;
6360         case IORING_OP_TIMEOUT:
6361                 ret = io_timeout(req, issue_flags);
6362                 break;
6363         case IORING_OP_TIMEOUT_REMOVE:
6364                 ret = io_timeout_remove(req, issue_flags);
6365                 break;
6366         case IORING_OP_ACCEPT:
6367                 ret = io_accept(req, issue_flags);
6368                 break;
6369         case IORING_OP_CONNECT:
6370                 ret = io_connect(req, issue_flags);
6371                 break;
6372         case IORING_OP_ASYNC_CANCEL:
6373                 ret = io_async_cancel(req, issue_flags);
6374                 break;
6375         case IORING_OP_FALLOCATE:
6376                 ret = io_fallocate(req, issue_flags);
6377                 break;
6378         case IORING_OP_OPENAT:
6379                 ret = io_openat(req, issue_flags);
6380                 break;
6381         case IORING_OP_CLOSE:
6382                 ret = io_close(req, issue_flags);
6383                 break;
6384         case IORING_OP_FILES_UPDATE:
6385                 ret = io_files_update(req, issue_flags);
6386                 break;
6387         case IORING_OP_STATX:
6388                 ret = io_statx(req, issue_flags);
6389                 break;
6390         case IORING_OP_FADVISE:
6391                 ret = io_fadvise(req, issue_flags);
6392                 break;
6393         case IORING_OP_MADVISE:
6394                 ret = io_madvise(req, issue_flags);
6395                 break;
6396         case IORING_OP_OPENAT2:
6397                 ret = io_openat2(req, issue_flags);
6398                 break;
6399         case IORING_OP_EPOLL_CTL:
6400                 ret = io_epoll_ctl(req, issue_flags);
6401                 break;
6402         case IORING_OP_SPLICE:
6403                 ret = io_splice(req, issue_flags);
6404                 break;
6405         case IORING_OP_PROVIDE_BUFFERS:
6406                 ret = io_provide_buffers(req, issue_flags);
6407                 break;
6408         case IORING_OP_REMOVE_BUFFERS:
6409                 ret = io_remove_buffers(req, issue_flags);
6410                 break;
6411         case IORING_OP_TEE:
6412                 ret = io_tee(req, issue_flags);
6413                 break;
6414         case IORING_OP_SHUTDOWN:
6415                 ret = io_shutdown(req, issue_flags);
6416                 break;
6417         case IORING_OP_RENAMEAT:
6418                 ret = io_renameat(req, issue_flags);
6419                 break;
6420         case IORING_OP_UNLINKAT:
6421                 ret = io_unlinkat(req, issue_flags);
6422                 break;
6423         default:
6424                 ret = -EINVAL;
6425                 break;
6426         }
6427
6428         if (creds)
6429                 revert_creds(creds);
6430         if (ret)
6431                 return ret;
6432         /* If the op doesn't have a file, we're not polling for it */
6433         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
6434                 io_iopoll_req_issued(req);
6435
6436         return 0;
6437 }
6438
6439 static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
6440 {
6441         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6442
6443         req = io_put_req_find_next(req);
6444         return req ? &req->work : NULL;
6445 }
6446
6447 static void io_wq_submit_work(struct io_wq_work *work)
6448 {
6449         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6450         struct io_kiocb *timeout;
6451         int ret = 0;
6452
6453         /* one will be dropped by ->io_free_work() after returning to io-wq */
6454         if (!(req->flags & REQ_F_REFCOUNT))
6455                 __io_req_set_refcount(req, 2);
6456         else
6457                 req_ref_get(req);
6458
6459         timeout = io_prep_linked_timeout(req);
6460         if (timeout)
6461                 io_queue_linked_timeout(timeout);
6462
6463         /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
6464         if (work->flags & IO_WQ_WORK_CANCEL)
6465                 ret = -ECANCELED;
6466
6467         if (!ret) {
6468                 do {
6469                         ret = io_issue_sqe(req, 0);
6470                         /*
6471                          * We can get EAGAIN for polled IO even though we're
6472                          * forcing a sync submission from here, since we can't
6473                          * wait for request slots on the block side.
6474                          */
6475                         if (ret != -EAGAIN)
6476                                 break;
6477                         cond_resched();
6478                 } while (1);
6479         }
6480
6481         /* avoid locking problems by failing it from a clean context */
6482         if (ret)
6483                 io_req_task_queue_fail(req, ret);
6484 }
6485
6486 static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
6487                                                        unsigned i)
6488 {
6489         return &table->files[i];
6490 }
6491
6492 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6493                                               int index)
6494 {
6495         struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
6496
6497         return (struct file *) (slot->file_ptr & FFS_MASK);
6498 }
6499
6500 static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
6501 {
6502         unsigned long file_ptr = (unsigned long) file;
6503
6504         if (__io_file_supports_nowait(file, READ))
6505                 file_ptr |= FFS_ASYNC_READ;
6506         if (__io_file_supports_nowait(file, WRITE))
6507                 file_ptr |= FFS_ASYNC_WRITE;
6508         if (S_ISREG(file_inode(file)->i_mode))
6509                 file_ptr |= FFS_ISREG;
6510         file_slot->file_ptr = file_ptr;
6511 }
6512
6513 static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6514                                              struct io_kiocb *req, int fd)
6515 {
6516         struct file *file;
6517         unsigned long file_ptr;
6518
6519         if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6520                 return NULL;
6521         fd = array_index_nospec(fd, ctx->nr_user_files);
6522         file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6523         file = (struct file *) (file_ptr & FFS_MASK);
6524         file_ptr &= ~FFS_MASK;
6525         /* mask in overlapping REQ_F and FFS bits */
6526         req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
6527         io_req_set_rsrc_node(req);
6528         return file;
6529 }
6530
6531 static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
6532                                        struct io_kiocb *req, int fd)
6533 {
6534         struct file *file = fget(fd);
6535
6536         trace_io_uring_file_get(ctx, fd);
6537
6538         /* we don't allow fixed io_uring files */
6539         if (file && unlikely(file->f_op == &io_uring_fops))
6540                 io_req_track_inflight(req);
6541         return file;
6542 }
6543
6544 static inline struct file *io_file_get(struct io_ring_ctx *ctx,
6545                                        struct io_kiocb *req, int fd, bool fixed)
6546 {
6547         if (fixed)
6548                 return io_file_get_fixed(ctx, req, fd);
6549         else
6550                 return io_file_get_normal(ctx, req, fd);
6551 }
6552
6553 static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
6554 {
6555         struct io_kiocb *prev = req->timeout.prev;
6556         int ret;
6557
6558         if (prev) {
6559                 ret = io_try_cancel_userdata(req, prev->user_data);
6560                 io_req_complete_post(req, ret ?: -ETIME, 0);
6561                 io_put_req(prev);
6562         } else {
6563                 io_req_complete_post(req, -ETIME, 0);
6564         }
6565 }
6566
6567 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6568 {
6569         struct io_timeout_data *data = container_of(timer,
6570                                                 struct io_timeout_data, timer);
6571         struct io_kiocb *prev, *req = data->req;
6572         struct io_ring_ctx *ctx = req->ctx;
6573         unsigned long flags;
6574
6575         spin_lock_irqsave(&ctx->timeout_lock, flags);
6576         prev = req->timeout.head;
6577         req->timeout.head = NULL;
6578
6579         /*
6580          * We don't expect the list to be empty, that will only happen if we
6581          * race with the completion of the linked work.
6582          */
6583         if (prev) {
6584                 io_remove_next_linked(prev);
6585                 if (!req_ref_inc_not_zero(prev))
6586                         prev = NULL;
6587         }
6588         req->timeout.prev = prev;
6589         spin_unlock_irqrestore(&ctx->timeout_lock, flags);
6590
6591         req->io_task_work.func = io_req_task_link_timeout;
6592         io_req_task_work_add(req);
6593         return HRTIMER_NORESTART;
6594 }
6595
6596 static void io_queue_linked_timeout(struct io_kiocb *req)
6597 {
6598         struct io_ring_ctx *ctx = req->ctx;
6599
6600         spin_lock_irq(&ctx->timeout_lock);
6601         /*
6602          * If the back reference is NULL, then our linked request finished
6603          * before we got a chance to setup the timer
6604          */
6605         if (req->timeout.head) {
6606                 struct io_timeout_data *data = req->async_data;
6607
6608                 data->timer.function = io_link_timeout_fn;
6609                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6610                                 data->mode);
6611         }
6612         spin_unlock_irq(&ctx->timeout_lock);
6613         /* drop submission reference */
6614         io_put_req(req);
6615 }
6616
6617 static void __io_queue_sqe(struct io_kiocb *req)
6618         __must_hold(&req->ctx->uring_lock)
6619 {
6620         struct io_kiocb *linked_timeout;
6621         int ret;
6622
6623 issue_sqe:
6624         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
6625
6626         /*
6627          * We async punt it if the file wasn't marked NOWAIT, or if the file
6628          * doesn't support non-blocking read/write attempts
6629          */
6630         if (likely(!ret)) {
6631                 if (req->flags & REQ_F_COMPLETE_INLINE) {
6632                         struct io_ring_ctx *ctx = req->ctx;
6633                         struct io_submit_state *state = &ctx->submit_state;
6634
6635                         state->compl_reqs[state->compl_nr++] = req;
6636                         if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
6637                                 io_submit_flush_completions(ctx);
6638                         return;
6639                 }
6640
6641                 linked_timeout = io_prep_linked_timeout(req);
6642                 if (linked_timeout)
6643                         io_queue_linked_timeout(linked_timeout);
6644         } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
6645                 linked_timeout = io_prep_linked_timeout(req);
6646
6647                 switch (io_arm_poll_handler(req)) {
6648                 case IO_APOLL_READY:
6649                         if (linked_timeout)
6650                                 io_unprep_linked_timeout(req);
6651                         goto issue_sqe;
6652                 case IO_APOLL_ABORTED:
6653                         /*
6654                          * Queued up for async execution, worker will release
6655                          * submit reference when the iocb is actually submitted.
6656                          */
6657                         io_queue_async_work(req, NULL);
6658                         break;
6659                 }
6660
6661                 if (linked_timeout)
6662                         io_queue_linked_timeout(linked_timeout);
6663         } else {
6664                 io_req_complete_failed(req, ret);
6665         }
6666 }
6667
6668 static inline void io_queue_sqe(struct io_kiocb *req)
6669         __must_hold(&req->ctx->uring_lock)
6670 {
6671         if (unlikely(req->ctx->drain_active) && io_drain_req(req))
6672                 return;
6673
6674         if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
6675                 __io_queue_sqe(req);
6676         } else if (req->flags & REQ_F_FAIL) {
6677                 io_req_complete_failed(req, req->result);
6678         } else {
6679                 int ret = io_req_prep_async(req);
6680
6681                 if (unlikely(ret))
6682                         io_req_complete_failed(req, ret);
6683                 else
6684                         io_queue_async_work(req, NULL);
6685         }
6686 }
6687
6688 /*
6689  * Check SQE restrictions (opcode and flags).
6690  *
6691  * Returns 'true' if SQE is allowed, 'false' otherwise.
6692  */
6693 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6694                                         struct io_kiocb *req,
6695                                         unsigned int sqe_flags)
6696 {
6697         if (likely(!ctx->restricted))
6698                 return true;
6699
6700         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6701                 return false;
6702
6703         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6704             ctx->restrictions.sqe_flags_required)
6705                 return false;
6706
6707         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6708                           ctx->restrictions.sqe_flags_required))
6709                 return false;
6710
6711         return true;
6712 }
6713
6714 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6715                        const struct io_uring_sqe *sqe)
6716         __must_hold(&ctx->uring_lock)
6717 {
6718         struct io_submit_state *state;
6719         unsigned int sqe_flags;
6720         int personality, ret = 0;
6721
6722         /* req is partially pre-initialised, see io_preinit_req() */
6723         req->opcode = READ_ONCE(sqe->opcode);
6724         /* same numerical values with corresponding REQ_F_*, safe to copy */
6725         req->flags = sqe_flags = READ_ONCE(sqe->flags);
6726         req->user_data = READ_ONCE(sqe->user_data);
6727         req->file = NULL;
6728         req->fixed_rsrc_refs = NULL;
6729         req->task = current;
6730
6731         /* enforce forwards compatibility on users */
6732         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
6733                 return -EINVAL;
6734         if (unlikely(req->opcode >= IORING_OP_LAST))
6735                 return -EINVAL;
6736         if (!io_check_restriction(ctx, req, sqe_flags))
6737                 return -EACCES;
6738
6739         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6740             !io_op_defs[req->opcode].buffer_select)
6741                 return -EOPNOTSUPP;
6742         if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
6743                 ctx->drain_active = true;
6744
6745         personality = READ_ONCE(sqe->personality);
6746         if (personality) {
6747                 req->creds = xa_load(&ctx->personalities, personality);
6748                 if (!req->creds)
6749                         return -EINVAL;
6750                 get_cred(req->creds);
6751                 req->flags |= REQ_F_CREDS;
6752         }
6753         state = &ctx->submit_state;
6754
6755         /*
6756          * Plug now if we have more than 1 IO left after this, and the target
6757          * is potentially a read/write to block based storage.
6758          */
6759         if (!state->plug_started && state->ios_left > 1 &&
6760             io_op_defs[req->opcode].plug) {
6761                 blk_start_plug(&state->plug);
6762                 state->plug_started = true;
6763         }
6764
6765         if (io_op_defs[req->opcode].needs_file) {
6766                 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
6767                                         (sqe_flags & IOSQE_FIXED_FILE));
6768                 if (unlikely(!req->file))
6769                         ret = -EBADF;
6770         }
6771
6772         state->ios_left--;
6773         return ret;
6774 }
6775
6776 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
6777                          const struct io_uring_sqe *sqe)
6778         __must_hold(&ctx->uring_lock)
6779 {
6780         struct io_submit_link *link = &ctx->submit_state.link;
6781         int ret;
6782
6783         ret = io_init_req(ctx, req, sqe);
6784         if (unlikely(ret)) {
6785 fail_req:
6786                 /* fail even hard links since we don't submit */
6787                 if (link->head) {
6788                         /*
6789                          * we can judge a link req is failed or cancelled by if
6790                          * REQ_F_FAIL is set, but the head is an exception since
6791                          * it may be set REQ_F_FAIL because of other req's failure
6792                          * so let's leverage req->result to distinguish if a head
6793                          * is set REQ_F_FAIL because of its failure or other req's
6794                          * failure so that we can set the correct ret code for it.
6795                          * init result here to avoid affecting the normal path.
6796                          */
6797                         if (!(link->head->flags & REQ_F_FAIL))
6798                                 req_fail_link_node(link->head, -ECANCELED);
6799                 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6800                         /*
6801                          * the current req is a normal req, we should return
6802                          * error and thus break the submittion loop.
6803                          */
6804                         io_req_complete_failed(req, ret);
6805                         return ret;
6806                 }
6807                 req_fail_link_node(req, ret);
6808         } else {
6809                 ret = io_req_prep(req, sqe);
6810                 if (unlikely(ret))
6811                         goto fail_req;
6812         }
6813
6814         /* don't need @sqe from now on */
6815         trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
6816                                   req->flags, true,
6817                                   ctx->flags & IORING_SETUP_SQPOLL);
6818
6819         /*
6820          * If we already have a head request, queue this one for async
6821          * submittal once the head completes. If we don't have a head but
6822          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6823          * submitted sync once the chain is complete. If none of those
6824          * conditions are true (normal request), then just queue it.
6825          */
6826         if (link->head) {
6827                 struct io_kiocb *head = link->head;
6828
6829                 if (!(req->flags & REQ_F_FAIL)) {
6830                         ret = io_req_prep_async(req);
6831                         if (unlikely(ret)) {
6832                                 req_fail_link_node(req, ret);
6833                                 if (!(head->flags & REQ_F_FAIL))
6834                                         req_fail_link_node(head, -ECANCELED);
6835                         }
6836                 }
6837                 trace_io_uring_link(ctx, req, head);
6838                 link->last->link = req;
6839                 link->last = req;
6840
6841                 /* last request of a link, enqueue the link */
6842                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6843                         link->head = NULL;
6844                         io_queue_sqe(head);
6845                 }
6846         } else {
6847                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6848                         link->head = req;
6849                         link->last = req;
6850                 } else {
6851                         io_queue_sqe(req);
6852                 }
6853         }
6854
6855         return 0;
6856 }
6857
6858 /*
6859  * Batched submission is done, ensure local IO is flushed out.
6860  */
6861 static void io_submit_state_end(struct io_submit_state *state,
6862                                 struct io_ring_ctx *ctx)
6863 {
6864         if (state->link.head)
6865                 io_queue_sqe(state->link.head);
6866         if (state->compl_nr)
6867                 io_submit_flush_completions(ctx);
6868         if (state->plug_started)
6869                 blk_finish_plug(&state->plug);
6870 }
6871
6872 /*
6873  * Start submission side cache.
6874  */
6875 static void io_submit_state_start(struct io_submit_state *state,
6876                                   unsigned int max_ios)
6877 {
6878         state->plug_started = false;
6879         state->ios_left = max_ios;
6880         /* set only head, no need to init link_last in advance */
6881         state->link.head = NULL;
6882 }
6883
6884 static void io_commit_sqring(struct io_ring_ctx *ctx)
6885 {
6886         struct io_rings *rings = ctx->rings;
6887
6888         /*
6889          * Ensure any loads from the SQEs are done at this point,
6890          * since once we write the new head, the application could
6891          * write new data to them.
6892          */
6893         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6894 }
6895
6896 /*
6897  * Fetch an sqe, if one is available. Note this returns a pointer to memory
6898  * that is mapped by userspace. This means that care needs to be taken to
6899  * ensure that reads are stable, as we cannot rely on userspace always
6900  * being a good citizen. If members of the sqe are validated and then later
6901  * used, it's important that those reads are done through READ_ONCE() to
6902  * prevent a re-load down the line.
6903  */
6904 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6905 {
6906         unsigned head, mask = ctx->sq_entries - 1;
6907         unsigned sq_idx = ctx->cached_sq_head++ & mask;
6908
6909         /*
6910          * The cached sq head (or cq tail) serves two purposes:
6911          *
6912          * 1) allows us to batch the cost of updating the user visible
6913          *    head updates.
6914          * 2) allows the kernel side to track the head on its own, even
6915          *    though the application is the one updating it.
6916          */
6917         head = READ_ONCE(ctx->sq_array[sq_idx]);
6918         if (likely(head < ctx->sq_entries))
6919                 return &ctx->sq_sqes[head];
6920
6921         /* drop invalid entries */
6922         ctx->cq_extra--;
6923         WRITE_ONCE(ctx->rings->sq_dropped,
6924                    READ_ONCE(ctx->rings->sq_dropped) + 1);
6925         return NULL;
6926 }
6927
6928 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6929         __must_hold(&ctx->uring_lock)
6930 {
6931         int submitted = 0;
6932
6933         /* make sure SQ entry isn't read before tail */
6934         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6935         if (!percpu_ref_tryget_many(&ctx->refs, nr))
6936                 return -EAGAIN;
6937         io_get_task_refs(nr);
6938
6939         io_submit_state_start(&ctx->submit_state, nr);
6940         while (submitted < nr) {
6941                 const struct io_uring_sqe *sqe;
6942                 struct io_kiocb *req;
6943
6944                 req = io_alloc_req(ctx);
6945                 if (unlikely(!req)) {
6946                         if (!submitted)
6947                                 submitted = -EAGAIN;
6948                         break;
6949                 }
6950                 sqe = io_get_sqe(ctx);
6951                 if (unlikely(!sqe)) {
6952                         list_add(&req->inflight_entry, &ctx->submit_state.free_list);
6953                         break;
6954                 }
6955                 /* will complete beyond this point, count as submitted */
6956                 submitted++;
6957                 if (io_submit_sqe(ctx, req, sqe))
6958                         break;
6959         }
6960
6961         if (unlikely(submitted != nr)) {
6962                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6963                 int unused = nr - ref_used;
6964
6965                 current->io_uring->cached_refs += unused;
6966                 percpu_ref_put_many(&ctx->refs, unused);
6967         }
6968
6969         io_submit_state_end(&ctx->submit_state, ctx);
6970          /* Commit SQ ring head once we've consumed and submitted all SQEs */
6971         io_commit_sqring(ctx);
6972
6973         return submitted;
6974 }
6975
6976 static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
6977 {
6978         return READ_ONCE(sqd->state);
6979 }
6980
6981 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6982 {
6983         /* Tell userspace we may need a wakeup call */
6984         spin_lock(&ctx->completion_lock);
6985         WRITE_ONCE(ctx->rings->sq_flags,
6986                    ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
6987         spin_unlock(&ctx->completion_lock);
6988 }
6989
6990 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6991 {
6992         spin_lock(&ctx->completion_lock);
6993         WRITE_ONCE(ctx->rings->sq_flags,
6994                    ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
6995         spin_unlock(&ctx->completion_lock);
6996 }
6997
6998 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6999 {
7000         unsigned int to_submit;
7001         int ret = 0;
7002
7003         to_submit = io_sqring_entries(ctx);
7004         /* if we're handling multiple rings, cap submit size for fairness */
7005         if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7006                 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
7007
7008         if (!list_empty(&ctx->iopoll_list) || to_submit) {
7009                 unsigned nr_events = 0;
7010                 const struct cred *creds = NULL;
7011
7012                 if (ctx->sq_creds != current_cred())
7013                         creds = override_creds(ctx->sq_creds);
7014
7015                 mutex_lock(&ctx->uring_lock);
7016                 if (!list_empty(&ctx->iopoll_list))
7017                         io_do_iopoll(ctx, &nr_events, 0);
7018
7019                 /*
7020                  * Don't submit if refs are dying, good for io_uring_register(),
7021                  * but also it is relied upon by io_ring_exit_work()
7022                  */
7023                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7024                     !(ctx->flags & IORING_SETUP_R_DISABLED))
7025                         ret = io_submit_sqes(ctx, to_submit);
7026                 mutex_unlock(&ctx->uring_lock);
7027
7028                 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7029                         wake_up(&ctx->sqo_sq_wait);
7030                 if (creds)
7031                         revert_creds(creds);
7032         }
7033
7034         return ret;
7035 }
7036
7037 static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
7038 {
7039         struct io_ring_ctx *ctx;
7040         unsigned sq_thread_idle = 0;
7041
7042         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7043                 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
7044         sqd->sq_thread_idle = sq_thread_idle;
7045 }
7046
7047 static bool io_sqd_handle_event(struct io_sq_data *sqd)
7048 {
7049         bool did_sig = false;
7050         struct ksignal ksig;
7051
7052         if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7053             signal_pending(current)) {
7054                 mutex_unlock(&sqd->lock);
7055                 if (signal_pending(current))
7056                         did_sig = get_signal(&ksig);
7057                 cond_resched();
7058                 mutex_lock(&sqd->lock);
7059         }
7060         return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7061 }
7062
7063 static int io_sq_thread(void *data)
7064 {
7065         struct io_sq_data *sqd = data;
7066         struct io_ring_ctx *ctx;
7067         unsigned long timeout = 0;
7068         char buf[TASK_COMM_LEN];
7069         DEFINE_WAIT(wait);
7070
7071         snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
7072         set_task_comm(current, buf);
7073
7074         if (sqd->sq_cpu != -1)
7075                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
7076         else
7077                 set_cpus_allowed_ptr(current, cpu_online_mask);
7078         current->flags |= PF_NO_SETAFFINITY;
7079
7080         mutex_lock(&sqd->lock);
7081         while (1) {
7082                 bool cap_entries, sqt_spin = false;
7083
7084                 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
7085                         if (io_sqd_handle_event(sqd))
7086                                 break;
7087                         timeout = jiffies + sqd->sq_thread_idle;
7088                 }
7089
7090                 cap_entries = !list_is_singular(&sqd->ctx_list);
7091                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7092                         int ret = __io_sq_thread(ctx, cap_entries);
7093
7094                         if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
7095                                 sqt_spin = true;
7096                 }
7097                 if (io_run_task_work())
7098                         sqt_spin = true;
7099
7100                 if (sqt_spin || !time_after(jiffies, timeout)) {
7101                         cond_resched();
7102                         if (sqt_spin)
7103                                 timeout = jiffies + sqd->sq_thread_idle;
7104                         continue;
7105                 }
7106
7107                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7108                 if (!io_sqd_events_pending(sqd) && !current->task_works) {
7109                         bool needs_sched = true;
7110
7111                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7112                                 io_ring_set_wakeup_flag(ctx);
7113
7114                                 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
7115                                     !list_empty_careful(&ctx->iopoll_list)) {
7116                                         needs_sched = false;
7117                                         break;
7118                                 }
7119                                 if (io_sqring_entries(ctx)) {
7120                                         needs_sched = false;
7121                                         break;
7122                                 }
7123                         }
7124
7125                         if (needs_sched) {
7126                                 mutex_unlock(&sqd->lock);
7127                                 schedule();
7128                                 mutex_lock(&sqd->lock);
7129                         }
7130                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7131                                 io_ring_clear_wakeup_flag(ctx);
7132                 }
7133
7134                 finish_wait(&sqd->wait, &wait);
7135                 timeout = jiffies + sqd->sq_thread_idle;
7136         }
7137
7138         io_uring_cancel_generic(true, sqd);
7139         sqd->thread = NULL;
7140         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7141                 io_ring_set_wakeup_flag(ctx);
7142         io_run_task_work();
7143         mutex_unlock(&sqd->lock);
7144
7145         complete(&sqd->exited);
7146         do_exit(0);
7147 }
7148
7149 struct io_wait_queue {
7150         struct wait_queue_entry wq;
7151         struct io_ring_ctx *ctx;
7152         unsigned cq_tail;
7153         unsigned nr_timeouts;
7154 };
7155
7156 static inline bool io_should_wake(struct io_wait_queue *iowq)
7157 {
7158         struct io_ring_ctx *ctx = iowq->ctx;
7159         int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
7160
7161         /*
7162          * Wake up if we have enough events, or if a timeout occurred since we
7163          * started waiting. For timeouts, we always want to return to userspace,
7164          * regardless of event count.
7165          */
7166         return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
7167 }
7168
7169 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7170                             int wake_flags, void *key)
7171 {
7172         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7173                                                         wq);
7174
7175         /*
7176          * Cannot safely flush overflowed CQEs from here, ensure we wake up
7177          * the task, and the next invocation will do it.
7178          */
7179         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
7180                 return autoremove_wake_function(curr, mode, wake_flags, key);
7181         return -1;
7182 }
7183
7184 static int io_run_task_work_sig(void)
7185 {
7186         if (io_run_task_work())
7187                 return 1;
7188         if (!signal_pending(current))
7189                 return 0;
7190         if (test_thread_flag(TIF_NOTIFY_SIGNAL))
7191                 return -ERESTARTSYS;
7192         return -EINTR;
7193 }
7194
7195 /* when returns >0, the caller should retry */
7196 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7197                                           struct io_wait_queue *iowq,
7198                                           signed long *timeout)
7199 {
7200         int ret;
7201
7202         /* make sure we run task_work before checking for signals */
7203         ret = io_run_task_work_sig();
7204         if (ret || io_should_wake(iowq))
7205                 return ret;
7206         /* let the caller flush overflows, retry */
7207         if (test_bit(0, &ctx->check_cq_overflow))
7208                 return 1;
7209
7210         *timeout = schedule_timeout(*timeout);
7211         return !*timeout ? -ETIME : 1;
7212 }
7213
7214 /*
7215  * Wait until events become available, if we don't already have some. The
7216  * application must reap them itself, as they reside on the shared cq ring.
7217  */
7218 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
7219                           const sigset_t __user *sig, size_t sigsz,
7220                           struct __kernel_timespec __user *uts)
7221 {
7222         struct io_wait_queue iowq;
7223         struct io_rings *rings = ctx->rings;
7224         signed long timeout = MAX_SCHEDULE_TIMEOUT;
7225         int ret;
7226
7227         do {
7228                 io_cqring_overflow_flush(ctx);
7229                 if (io_cqring_events(ctx) >= min_events)
7230                         return 0;
7231                 if (!io_run_task_work())
7232                         break;
7233         } while (1);
7234
7235         if (sig) {
7236 #ifdef CONFIG_COMPAT
7237                 if (in_compat_syscall())
7238                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
7239                                                       sigsz);
7240                 else
7241 #endif
7242                         ret = set_user_sigmask(sig, sigsz);
7243
7244                 if (ret)
7245                         return ret;
7246         }
7247
7248         if (uts) {
7249                 struct timespec64 ts;
7250
7251                 if (get_timespec64(&ts, uts))
7252                         return -EFAULT;
7253                 timeout = timespec64_to_jiffies(&ts);
7254         }
7255
7256         init_waitqueue_func_entry(&iowq.wq, io_wake_function);
7257         iowq.wq.private = current;
7258         INIT_LIST_HEAD(&iowq.wq.entry);
7259         iowq.ctx = ctx;
7260         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
7261         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
7262
7263         trace_io_uring_cqring_wait(ctx, min_events);
7264         do {
7265                 /* if we can't even flush overflow, don't wait for more */
7266                 if (!io_cqring_overflow_flush(ctx)) {
7267                         ret = -EBUSY;
7268                         break;
7269                 }
7270                 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
7271                                                 TASK_INTERRUPTIBLE);
7272                 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
7273                 finish_wait(&ctx->cq_wait, &iowq.wq);
7274                 cond_resched();
7275         } while (ret > 0);
7276
7277         restore_saved_sigmask_unless(ret == -EINTR);
7278
7279         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
7280 }
7281
7282 static void io_free_page_table(void **table, size_t size)
7283 {
7284         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7285
7286         for (i = 0; i < nr_tables; i++)
7287                 kfree(table[i]);
7288         kfree(table);
7289 }
7290
7291 static void **io_alloc_page_table(size_t size)
7292 {
7293         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7294         size_t init_size = size;
7295         void **table;
7296
7297         table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
7298         if (!table)
7299                 return NULL;
7300
7301         for (i = 0; i < nr_tables; i++) {
7302                 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
7303
7304                 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
7305                 if (!table[i]) {
7306                         io_free_page_table(table, init_size);
7307                         return NULL;
7308                 }
7309                 size -= this_size;
7310         }
7311         return table;
7312 }
7313
7314 static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
7315 {
7316         percpu_ref_exit(&ref_node->refs);
7317         kfree(ref_node);
7318 }
7319
7320 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7321 {
7322         struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
7323         struct io_ring_ctx *ctx = node->rsrc_data->ctx;
7324         unsigned long flags;
7325         bool first_add = false;
7326
7327         spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
7328         node->done = true;
7329
7330         while (!list_empty(&ctx->rsrc_ref_list)) {
7331                 node = list_first_entry(&ctx->rsrc_ref_list,
7332                                             struct io_rsrc_node, node);
7333                 /* recycle ref nodes in order */
7334                 if (!node->done)
7335                         break;
7336                 list_del(&node->node);
7337                 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
7338         }
7339         spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
7340
7341         if (first_add)
7342                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
7343 }
7344
7345 static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
7346 {
7347         struct io_rsrc_node *ref_node;
7348
7349         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7350         if (!ref_node)
7351                 return NULL;
7352
7353         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7354                             0, GFP_KERNEL)) {
7355                 kfree(ref_node);
7356                 return NULL;
7357         }
7358         INIT_LIST_HEAD(&ref_node->node);
7359         INIT_LIST_HEAD(&ref_node->rsrc_list);
7360         ref_node->done = false;
7361         return ref_node;
7362 }
7363
7364 static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
7365                                 struct io_rsrc_data *data_to_kill)
7366 {
7367         WARN_ON_ONCE(!ctx->rsrc_backup_node);
7368         WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
7369
7370         if (data_to_kill) {
7371                 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
7372
7373                 rsrc_node->rsrc_data = data_to_kill;
7374                 spin_lock_irq(&ctx->rsrc_ref_lock);
7375                 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
7376                 spin_unlock_irq(&ctx->rsrc_ref_lock);
7377
7378                 atomic_inc(&data_to_kill->refs);
7379                 percpu_ref_kill(&rsrc_node->refs);
7380                 ctx->rsrc_node = NULL;
7381         }
7382
7383         if (!ctx->rsrc_node) {
7384                 ctx->rsrc_node = ctx->rsrc_backup_node;
7385                 ctx->rsrc_backup_node = NULL;
7386         }
7387 }
7388
7389 static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
7390 {
7391         if (ctx->rsrc_backup_node)
7392                 return 0;
7393         ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
7394         return ctx->rsrc_backup_node ? 0 : -ENOMEM;
7395 }
7396
7397 static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
7398 {
7399         int ret;
7400
7401         /* As we may drop ->uring_lock, other task may have started quiesce */
7402         if (data->quiesce)
7403                 return -ENXIO;
7404
7405         data->quiesce = true;
7406         do {
7407                 ret = io_rsrc_node_switch_start(ctx);
7408                 if (ret)
7409                         break;
7410                 io_rsrc_node_switch(ctx, data);
7411
7412                 /* kill initial ref, already quiesced if zero */
7413                 if (atomic_dec_and_test(&data->refs))
7414                         break;
7415                 mutex_unlock(&ctx->uring_lock);
7416                 flush_delayed_work(&ctx->rsrc_put_work);
7417                 ret = wait_for_completion_interruptible(&data->done);
7418                 if (!ret) {
7419                         mutex_lock(&ctx->uring_lock);
7420                         break;
7421                 }
7422
7423                 atomic_inc(&data->refs);
7424                 /* wait for all works potentially completing data->done */
7425                 flush_delayed_work(&ctx->rsrc_put_work);
7426                 reinit_completion(&data->done);
7427
7428                 ret = io_run_task_work_sig();
7429                 mutex_lock(&ctx->uring_lock);
7430         } while (ret >= 0);
7431         data->quiesce = false;
7432
7433         return ret;
7434 }
7435
7436 static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
7437 {
7438         unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
7439         unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
7440
7441         return &data->tags[table_idx][off];
7442 }
7443
7444 static void io_rsrc_data_free(struct io_rsrc_data *data)
7445 {
7446         size_t size = data->nr * sizeof(data->tags[0][0]);
7447
7448         if (data->tags)
7449                 io_free_page_table((void **)data->tags, size);
7450         kfree(data);
7451 }
7452
7453 static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
7454                               u64 __user *utags, unsigned nr,
7455                               struct io_rsrc_data **pdata)
7456 {
7457         struct io_rsrc_data *data;
7458         int ret = -ENOMEM;
7459         unsigned i;
7460
7461         data = kzalloc(sizeof(*data), GFP_KERNEL);
7462         if (!data)
7463                 return -ENOMEM;
7464         data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
7465         if (!data->tags) {
7466                 kfree(data);
7467                 return -ENOMEM;
7468         }
7469
7470         data->nr = nr;
7471         data->ctx = ctx;
7472         data->do_put = do_put;
7473         if (utags) {
7474                 ret = -EFAULT;
7475                 for (i = 0; i < nr; i++) {
7476                         u64 *tag_slot = io_get_tag_slot(data, i);
7477
7478                         if (copy_from_user(tag_slot, &utags[i],
7479                                            sizeof(*tag_slot)))
7480                                 goto fail;
7481                 }
7482         }
7483
7484         atomic_set(&data->refs, 1);
7485         init_completion(&data->done);
7486         *pdata = data;
7487         return 0;
7488 fail:
7489         io_rsrc_data_free(data);
7490         return ret;
7491 }
7492
7493 static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
7494 {
7495         table->files = kvcalloc(nr_files, sizeof(table->files[0]),
7496                                 GFP_KERNEL_ACCOUNT);
7497         return !!table->files;
7498 }
7499
7500 static void io_free_file_tables(struct io_file_table *table)
7501 {
7502         kvfree(table->files);
7503         table->files = NULL;
7504 }
7505
7506 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
7507 {
7508 #if defined(CONFIG_UNIX)
7509         if (ctx->ring_sock) {
7510                 struct sock *sock = ctx->ring_sock->sk;
7511                 struct sk_buff *skb;
7512
7513                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7514                         kfree_skb(skb);
7515         }
7516 #else
7517         int i;
7518
7519         for (i = 0; i < ctx->nr_user_files; i++) {
7520                 struct file *file;
7521
7522                 file = io_file_from_index(ctx, i);
7523                 if (file)
7524                         fput(file);
7525         }
7526 #endif
7527         io_free_file_tables(&ctx->file_table);
7528         io_rsrc_data_free(ctx->file_data);
7529         ctx->file_data = NULL;
7530         ctx->nr_user_files = 0;
7531 }
7532
7533 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7534 {
7535         int ret;
7536
7537         if (!ctx->file_data)
7538                 return -ENXIO;
7539         ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
7540         if (!ret)
7541                 __io_sqe_files_unregister(ctx);
7542         return ret;
7543 }
7544
/*
 * Counterpart of io_sq_thread_park(): drop this caller's park_pending count,
 * keep IO_SQ_THREAD_SHOULD_PARK set only while other park requests are still
 * pending, and release sqd->lock.  Warns once if invoked from the SQ thread
 * itself.
 */
static void io_sq_thread_unpark(struct io_sq_data *sqd)
	__releases(&sqd->lock)
{
	WARN_ON_ONCE(sqd->thread == current);

	/*
	 * Do the dance but not conditional clear_bit() because it'd race with
	 * other threads incrementing park_pending and setting the bit.
	 */
	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	/* non-zero result: somebody else still wants the thread parked */
	if (atomic_dec_return(&sqd->park_pending))
		set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_unlock(&sqd->lock);
}
7559
/*
 * Request that the SQ thread of @sqd parks and take sqd->lock.
 *
 * Bumps park_pending and sets IO_SQ_THREAD_SHOULD_PARK before acquiring the
 * lock, then wakes the thread if there is one.  Returns with sqd->lock held;
 * pair with io_sq_thread_unpark().  Warns once if invoked from the SQ thread
 * itself.
 */
static void io_sq_thread_park(struct io_sq_data *sqd)
	__acquires(&sqd->lock)
{
	WARN_ON_ONCE(sqd->thread == current);

	atomic_inc(&sqd->park_pending);
	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_lock(&sqd->lock);
	/* sqd->thread may be NULL, e.g. before a thread has been created */
	if (sqd->thread)
		wake_up_process(sqd->thread);
}
7571
/*
 * Ask the SQ thread of @sqd to exit and wait until sqd->exited is completed.
 *
 * Sets IO_SQ_THREAD_SHOULD_STOP, wakes the thread (if any) under sqd->lock and
 * then blocks on the completion.  Warns once if called from the SQ thread
 * itself or if a stop had already been requested.
 */
static void io_sq_thread_stop(struct io_sq_data *sqd)
{
	WARN_ON_ONCE(sqd->thread == current);
	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));

	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
	mutex_lock(&sqd->lock);
	if (sqd->thread)
		wake_up_process(sqd->thread);
	mutex_unlock(&sqd->lock);
	/* uninterruptible: returns only after ->exited has been completed */
	wait_for_completion(&sqd->exited);
}
7584
7585 static void io_put_sq_data(struct io_sq_data *sqd)
7586 {
7587         if (refcount_dec_and_test(&sqd->refs)) {
7588                 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
7589
7590                 io_sq_thread_stop(sqd);
7591                 kfree(sqd);
7592         }
7593 }
7594
7595 static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7596 {
7597         struct io_sq_data *sqd = ctx->sq_data;
7598
7599         if (sqd) {
7600                 io_sq_thread_park(sqd);
7601                 list_del_init(&ctx->sqd_list);
7602                 io_sqd_update_thread_idle(sqd);
7603                 io_sq_thread_unpark(sqd);
7604
7605                 io_put_sq_data(sqd);
7606                 ctx->sq_data = NULL;
7607         }
7608 }
7609
7610 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7611 {
7612         struct io_ring_ctx *ctx_attach;
7613         struct io_sq_data *sqd;
7614         struct fd f;
7615
7616         f = fdget(p->wq_fd);
7617         if (!f.file)
7618                 return ERR_PTR(-ENXIO);
7619         if (f.file->f_op != &io_uring_fops) {
7620                 fdput(f);
7621                 return ERR_PTR(-EINVAL);
7622         }
7623
7624         ctx_attach = f.file->private_data;
7625         sqd = ctx_attach->sq_data;
7626         if (!sqd) {
7627                 fdput(f);
7628                 return ERR_PTR(-EINVAL);
7629         }
7630         if (sqd->task_tgid != current->tgid) {
7631                 fdput(f);
7632                 return ERR_PTR(-EPERM);
7633         }
7634
7635         refcount_inc(&sqd->refs);
7636         fdput(f);
7637         return sqd;
7638 }
7639
7640 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
7641                                          bool *attached)
7642 {
7643         struct io_sq_data *sqd;
7644
7645         *attached = false;
7646         if (p->flags & IORING_SETUP_ATTACH_WQ) {
7647                 sqd = io_attach_sq_data(p);
7648                 if (!IS_ERR(sqd)) {
7649                         *attached = true;
7650                         return sqd;
7651                 }
7652                 /* fall through for EPERM case, setup new sqd/task */
7653                 if (PTR_ERR(sqd) != -EPERM)
7654                         return sqd;
7655         }
7656
7657         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7658         if (!sqd)
7659                 return ERR_PTR(-ENOMEM);
7660
7661         atomic_set(&sqd->park_pending, 0);
7662         refcount_set(&sqd->refs, 1);
7663         INIT_LIST_HEAD(&sqd->ctx_list);
7664         mutex_init(&sqd->lock);
7665         init_waitqueue_head(&sqd->wait);
7666         init_completion(&sqd->exited);
7667         return sqd;
7668 }
7669
7670 #if defined(CONFIG_UNIX)
7671 /*
7672  * Ensure the UNIX gc is aware of our file set, so we are certain that
7673  * the io_uring can be safely unregistered on process exit, even if we have
7674  * loops in the file referencing.
7675  */
7676 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7677 {
7678         struct sock *sk = ctx->ring_sock->sk;
7679         struct scm_fp_list *fpl;
7680         struct sk_buff *skb;
7681         int i, nr_files;
7682
7683         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7684         if (!fpl)
7685                 return -ENOMEM;
7686
7687         skb = alloc_skb(0, GFP_KERNEL);
7688         if (!skb) {
7689                 kfree(fpl);
7690                 return -ENOMEM;
7691         }
7692
7693         skb->sk = sk;
7694
7695         nr_files = 0;
7696         fpl->user = get_uid(current_user());
7697         for (i = 0; i < nr; i++) {
7698                 struct file *file = io_file_from_index(ctx, i + offset);
7699
7700                 if (!file)
7701                         continue;
7702                 fpl->fp[nr_files] = get_file(file);
7703                 unix_inflight(fpl->user, fpl->fp[nr_files]);
7704                 nr_files++;
7705         }
7706
7707         if (nr_files) {
7708                 fpl->max = SCM_MAX_FD;
7709                 fpl->count = nr_files;
7710                 UNIXCB(skb).fp = fpl;
7711                 skb->destructor = unix_destruct_scm;
7712                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7713                 skb_queue_head(&sk->sk_receive_queue, skb);
7714
7715                 for (i = 0; i < nr_files; i++)
7716                         fput(fpl->fp[i]);
7717         } else {
7718                 kfree_skb(skb);
7719                 kfree(fpl);
7720         }
7721
7722         return 0;
7723 }
7724
7725 /*
7726  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7727  * causes regular reference counting to break down. We rely on the UNIX
7728  * garbage collection to take care of this problem for us.
7729  */
7730 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7731 {
7732         unsigned left, total;
7733         int ret = 0;
7734
7735         total = 0;
7736         left = ctx->nr_user_files;
7737         while (left) {
7738                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
7739
7740                 ret = __io_sqe_files_scm(ctx, this_files, total);
7741                 if (ret)
7742                         break;
7743                 left -= this_files;
7744                 total += this_files;
7745         }
7746
7747         if (!ret)
7748                 return 0;
7749
7750         while (total < ctx->nr_user_files) {
7751                 struct file *file = io_file_from_index(ctx, total);
7752
7753                 if (file)
7754                         fput(file);
7755                 total++;
7756         }
7757
7758         return ret;
7759 }
7760 #else
/* Stub used when CONFIG_UNIX is not set: does nothing and reports success. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	return 0;
}
7765 #endif
7766
/*
 * Release the file carried by @prsrc.
 *
 * Without CONFIG_UNIX this is a plain fput().  With CONFIG_UNIX the file is
 * also recorded in the fp list of one of the skbs queued on the ring socket,
 * so that entry is located and removed, the in-flight accounting is undone,
 * and the file reference is dropped.
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
	struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	/* private list collecting the skbs taken off the receive queue */
	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	skb = skb_dequeue(head);
	while (skb) {
		struct scm_fp_list *fp;

		fp = UNIXCB(skb).fp;
		for (i = 0; i < fp->count; i++) {
			int left;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);
			/* close the gap left by the removed entry */
			left = fp->count - 1 - i;
			if (left) {
				memmove(&fp->fp[i], &fp->fp[i + 1],
						left * sizeof(struct file *));
			}
			fp->count--;
			if (!fp->count) {
				/* that was the last file in this skb */
				kfree_skb(skb);
				skb = NULL;
			} else {
				__skb_queue_tail(&list, skb);
			}
			fput(file);
			/* file == NULL doubles as the "found it" marker */
			file = NULL;
			break;
		}

		if (!file)
			break;

		/* not in this skb: set it aside and look at the next one */
		__skb_queue_tail(&list, skb);

		skb = skb_dequeue(head);
	}

	/* put the skbs that were set aside back at the tail of the queue */
	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}
7829
7830 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
7831 {
7832         struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
7833         struct io_ring_ctx *ctx = rsrc_data->ctx;
7834         struct io_rsrc_put *prsrc, *tmp;
7835
7836         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
7837                 list_del(&prsrc->list);
7838
7839                 if (prsrc->tag) {
7840                         bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
7841
7842                         io_ring_submit_lock(ctx, lock_ring);
7843                         spin_lock(&ctx->completion_lock);
7844                         io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
7845                         ctx->cq_extra++;
7846                         io_commit_cqring(ctx);
7847                         spin_unlock(&ctx->completion_lock);
7848                         io_cqring_ev_posted(ctx);
7849                         io_ring_submit_unlock(ctx, lock_ring);
7850                 }
7851
7852                 rsrc_data->do_put(ctx, prsrc);
7853                 kfree(prsrc);
7854         }
7855
7856         io_rsrc_node_destroy(ref_node);
7857         if (atomic_dec_and_test(&rsrc_data->refs))
7858                 complete(&rsrc_data->done);
7859 }
7860
7861 static void io_rsrc_put_work(struct work_struct *work)
7862 {
7863         struct io_ring_ctx *ctx;
7864         struct llist_node *node;
7865
7866         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7867         node = llist_del_all(&ctx->rsrc_put_llist);
7868
7869         while (node) {
7870                 struct io_rsrc_node *ref_node;
7871                 struct llist_node *next = node->next;
7872
7873                 ref_node = llist_entry(node, struct io_rsrc_node, llist);
7874                 __io_rsrc_put_work(ref_node);
7875                 node = next;
7876         }
7877 }
7878
7879 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
7880                                  unsigned nr_args, u64 __user *tags)
7881 {
7882         __s32 __user *fds = (__s32 __user *) arg;
7883         struct file *file;
7884         int fd, ret;
7885         unsigned i;
7886
7887         if (ctx->file_data)
7888                 return -EBUSY;
7889         if (!nr_args)
7890                 return -EINVAL;
7891         if (nr_args > IORING_MAX_FIXED_FILES)
7892                 return -EMFILE;
7893         if (nr_args > rlimit(RLIMIT_NOFILE))
7894                 return -EMFILE;
7895         ret = io_rsrc_node_switch_start(ctx);
7896         if (ret)
7897                 return ret;
7898         ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
7899                                  &ctx->file_data);
7900         if (ret)
7901                 return ret;
7902
7903         ret = -ENOMEM;
7904         if (!io_alloc_file_tables(&ctx->file_table, nr_args))
7905                 goto out_free;
7906
7907         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
7908                 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
7909                         ret = -EFAULT;
7910                         goto out_fput;
7911                 }
7912                 /* allow sparse sets */
7913                 if (fd == -1) {
7914                         ret = -EINVAL;
7915                         if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
7916                                 goto out_fput;
7917                         continue;
7918                 }
7919
7920                 file = fget(fd);
7921                 ret = -EBADF;
7922                 if (unlikely(!file))
7923                         goto out_fput;
7924
7925                 /*
7926                  * Don't allow io_uring instances to be registered. If UNIX
7927                  * isn't enabled, then this causes a reference cycle and this
7928                  * instance can never get freed. If UNIX is enabled we'll
7929                  * handle it just fine, but there's still no point in allowing
7930                  * a ring fd as it doesn't support regular read/write anyway.
7931                  */
7932                 if (file->f_op == &io_uring_fops) {
7933                         fput(file);
7934                         goto out_fput;
7935                 }
7936                 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
7937         }
7938
7939         ret = io_sqe_files_scm(ctx);
7940         if (ret) {
7941                 __io_sqe_files_unregister(ctx);
7942                 return ret;
7943         }
7944
7945         io_rsrc_node_switch(ctx, NULL);
7946         return ret;
7947 out_fput:
7948         for (i = 0; i < ctx->nr_user_files; i++) {
7949                 file = io_file_from_index(ctx, i);
7950                 if (file)
7951                         fput(file);
7952         }
7953         io_free_file_tables(&ctx->file_table);
7954         ctx->nr_user_files = 0;
7955 out_free:
7956         io_rsrc_data_free(ctx->file_data);
7957         ctx->file_data = NULL;
7958         return ret;
7959 }
7960
/*
 * Make the UNIX gc aware of @file, which is being installed in fixed-file
 * slot @index.  Without CONFIG_UNIX this does nothing and returns 0.
 *
 * If the skb at the head of the ring socket's receive queue still has room in
 * its fp list, @file is appended there: a reference is taken with get_file(),
 * the file is marked in flight, one reference is dropped with fput() and 0 is
 * returned.  Otherwise the result of __io_sqe_files_scm() for this single
 * slot is returned.
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sock->sk_receive_queue;
	struct sk_buff *skb;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			/*
			 * Take the skb off the queue so the fp list can be
			 * updated with the queue lock dropped, then put it
			 * back at the head.
			 */
			__skb_unlink(skb, head);
			spin_unlock_irq(&head->lock);
			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;
			spin_lock_irq(&head->lock);
			__skb_queue_head(head, skb);
		} else {
			/* head skb is full: signal the fallback path below */
			skb = NULL;
		}
	}
	spin_unlock_irq(&head->lock);

	/* non-NULL skb here means the file was merged above */
	if (skb) {
		fput(file);
		return 0;
	}

	return __io_sqe_files_scm(ctx, 1, index);
#else
	return 0;
#endif
}
8003
8004 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8005                                  unsigned int issue_flags, u32 slot_index)
8006 {
8007         struct io_ring_ctx *ctx = req->ctx;
8008         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
8009         struct io_fixed_file *file_slot;
8010         int ret = -EBADF;
8011
8012         io_ring_submit_lock(ctx, !force_nonblock);
8013         if (file->f_op == &io_uring_fops)
8014                 goto err;
8015         ret = -ENXIO;
8016         if (!ctx->file_data)
8017                 goto err;
8018         ret = -EINVAL;
8019         if (slot_index >= ctx->nr_user_files)
8020                 goto err;
8021
8022         slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8023         file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
8024         ret = -EBADF;
8025         if (file_slot->file_ptr)
8026                 goto err;
8027
8028         *io_get_tag_slot(ctx->file_data, slot_index) = 0;
8029         io_fixed_file_set(file_slot, file);
8030         ret = io_sqe_file_register(ctx, file, slot_index);
8031         if (ret) {
8032                 file_slot->file_ptr = 0;
8033                 goto err;
8034         }
8035
8036         ret = 0;
8037 err:
8038         io_ring_submit_unlock(ctx, !force_nonblock);
8039         if (ret)
8040                 fput(file);
8041         return ret;
8042 }
8043
8044 static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8045                                  struct io_rsrc_node *node, void *rsrc)
8046 {
8047         struct io_rsrc_put *prsrc;
8048
8049         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8050         if (!prsrc)
8051                 return -ENOMEM;
8052
8053         prsrc->tag = *io_get_tag_slot(data, idx);
8054         prsrc->rsrc = rsrc;
8055         list_add(&prsrc->list, &node->rsrc_list);
8056         return 0;
8057 }
8058
8059 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
8060                                  struct io_uring_rsrc_update2 *up,
8061                                  unsigned nr_args)
8062 {
8063         u64 __user *tags = u64_to_user_ptr(up->tags);
8064         __s32 __user *fds = u64_to_user_ptr(up->data);
8065         struct io_rsrc_data *data = ctx->file_data;
8066         struct io_fixed_file *file_slot;
8067         struct file *file;
8068         int fd, i, err = 0;
8069         unsigned int done;
8070         bool needs_switch = false;
8071
8072         if (!ctx->file_data)
8073                 return -ENXIO;
8074         if (up->offset + nr_args > ctx->nr_user_files)
8075                 return -EINVAL;
8076
8077         for (done = 0; done < nr_args; done++) {
8078                 u64 tag = 0;
8079
8080                 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
8081                     copy_from_user(&fd, &fds[done], sizeof(fd))) {
8082                         err = -EFAULT;
8083                         break;
8084                 }
8085                 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
8086                         err = -EINVAL;
8087                         break;
8088                 }
8089                 if (fd == IORING_REGISTER_FILES_SKIP)
8090                         continue;
8091
8092                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
8093                 file_slot = io_fixed_file_slot(&ctx->file_table, i);
8094
8095                 if (file_slot->file_ptr) {
8096                         file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8097                         err = io_queue_rsrc_removal(data, up->offset + done,
8098                                                     ctx->rsrc_node, file);
8099                         if (err)
8100                                 break;
8101                         file_slot->file_ptr = 0;
8102                         needs_switch = true;
8103                 }
8104                 if (fd != -1) {
8105                         file = fget(fd);
8106                         if (!file) {
8107                                 err = -EBADF;
8108                                 break;
8109                         }
8110                         /*
8111                          * Don't allow io_uring instances to be registered. If
8112                          * UNIX isn't enabled, then this causes a reference
8113                          * cycle and this instance can never get freed. If UNIX
8114                          * is enabled we'll handle it just fine, but there's
8115                          * still no point in allowing a ring fd as it doesn't
8116                          * support regular read/write anyway.
8117                          */
8118                         if (file->f_op == &io_uring_fops) {
8119                                 fput(file);
8120                                 err = -EBADF;
8121                                 break;
8122                         }
8123                         *io_get_tag_slot(data, up->offset + done) = tag;
8124                         io_fixed_file_set(file_slot, file);
8125                         err = io_sqe_file_register(ctx, file, i);
8126                         if (err) {
8127                                 file_slot->file_ptr = 0;
8128                                 fput(file);
8129                                 break;
8130                         }
8131                 }
8132         }
8133
8134         if (needs_switch)
8135                 io_rsrc_node_switch(ctx, data);
8136         return done ? done : err;
8137 }
8138
8139 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
8140                                         struct task_struct *task)
8141 {
8142         struct io_wq_hash *hash;
8143         struct io_wq_data data;
8144         unsigned int concurrency;
8145
8146         mutex_lock(&ctx->uring_lock);
8147         hash = ctx->hash_map;
8148         if (!hash) {
8149                 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
8150                 if (!hash) {
8151                         mutex_unlock(&ctx->uring_lock);
8152                         return ERR_PTR(-ENOMEM);
8153                 }
8154                 refcount_set(&hash->refs, 1);
8155                 init_waitqueue_head(&hash->wait);
8156                 ctx->hash_map = hash;
8157         }
8158         mutex_unlock(&ctx->uring_lock);
8159
8160         data.hash = hash;
8161         data.task = task;
8162         data.free_work = io_wq_free_work;
8163         data.do_work = io_wq_submit_work;
8164
8165         /* Do QD, or 4 * CPUS, whatever is smallest */
8166         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
8167
8168         return io_wq_create(concurrency, &data);
8169 }
8170
8171 static int io_uring_alloc_task_context(struct task_struct *task,
8172                                        struct io_ring_ctx *ctx)
8173 {
8174         struct io_uring_task *tctx;
8175         int ret;
8176
8177         tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
8178         if (unlikely(!tctx))
8179                 return -ENOMEM;
8180
8181         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
8182         if (unlikely(ret)) {
8183                 kfree(tctx);
8184                 return ret;
8185         }
8186
8187         tctx->io_wq = io_init_wq_offload(ctx, task);
8188         if (IS_ERR(tctx->io_wq)) {
8189                 ret = PTR_ERR(tctx->io_wq);
8190                 percpu_counter_destroy(&tctx->inflight);
8191                 kfree(tctx);
8192                 return ret;
8193         }
8194
8195         xa_init(&tctx->xa);
8196         init_waitqueue_head(&tctx->wait);
8197         atomic_set(&tctx->in_idle, 0);
8198         atomic_set(&tctx->inflight_tracked, 0);
8199         task->io_uring = tctx;
8200         spin_lock_init(&tctx->task_lock);
8201         INIT_WQ_LIST(&tctx->task_list);
8202         init_task_work(&tctx->task_work, tctx_task_work);
8203         return 0;
8204 }
8205
8206 void __io_uring_free(struct task_struct *tsk)
8207 {
8208         struct io_uring_task *tctx = tsk->io_uring;
8209
8210         WARN_ON_ONCE(!xa_empty(&tctx->xa));
8211         WARN_ON_ONCE(tctx->io_wq);
8212         WARN_ON_ONCE(tctx->cached_refs);
8213
8214         percpu_counter_destroy(&tctx->inflight);
8215         kfree(tctx);
8216         tsk->io_uring = NULL;
8217 }
8218
/*
 * Set up submission offload for @ctx according to the setup parameters @p.
 *
 * For IORING_SETUP_SQPOLL rings this obtains an io_sq_data (attached to an
 * existing one or newly allocated), links @ctx into its ctx_list and, for a
 * newly allocated sqd, creates the SQ poll thread and its task context.
 * IORING_SETUP_SQ_AFF without SQPOLL is rejected.
 *
 * Returns 0 on success or a negative error code.  All error paths past the
 * initial fd check go through io_sq_thread_finish() to undo the sq_data
 * setup.
 */
static int io_sq_offload_create(struct io_ring_ctx *ctx,
				struct io_uring_params *p)
{
	int ret;

	/* Retain compatibility with failing for an invalid attach attempt */
	if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
				IORING_SETUP_ATTACH_WQ) {
		struct fd f;

		/* the fd is only validated here, nothing is attached */
		f = fdget(p->wq_fd);
		if (!f.file)
			return -ENXIO;
		if (f.file->f_op != &io_uring_fops) {
			fdput(f);
			return -EINVAL;
		}
		fdput(f);
	}
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		struct task_struct *tsk;
		struct io_sq_data *sqd;
		bool attached;

		sqd = io_get_sq_data(p, &attached);
		if (IS_ERR(sqd)) {
			ret = PTR_ERR(sqd);
			goto err;
		}

		ctx->sq_creds = get_current_cred();
		ctx->sq_data = sqd;
		/* an idle time of 0 from userspace selects the default, HZ */
		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
		if (!ctx->sq_thread_idle)
			ctx->sq_thread_idle = HZ;

		/* the sqd's ctx list is modified while its thread is parked */
		io_sq_thread_park(sqd);
		list_add(&ctx->sqd_list, &sqd->ctx_list);
		io_sqd_update_thread_idle(sqd);
		/* don't attach to a dying SQPOLL thread, would be racy */
		ret = (attached && !sqd->thread) ? -ENXIO : 0;
		io_sq_thread_unpark(sqd);

		if (ret < 0)
			goto err;
		/* an attached sqd already has its thread; nothing to create */
		if (attached)
			return 0;

		if (p->flags & IORING_SETUP_SQ_AFF) {
			int cpu = p->sq_thread_cpu;

			ret = -EINVAL;
			if (cpu >= nr_cpu_ids || !cpu_online(cpu))
				goto err_sqpoll;
			sqd->sq_cpu = cpu;
		} else {
			/* -1: no CPU was requested */
			sqd->sq_cpu = -1;
		}

		sqd->task_pid = current->pid;
		sqd->task_tgid = current->tgid;
		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
		if (IS_ERR(tsk)) {
			ret = PTR_ERR(tsk);
			goto err_sqpoll;
		}

		sqd->thread = tsk;
		ret = io_uring_alloc_task_context(tsk, ctx);
		/* the new thread is started even if the allocation failed */
		wake_up_new_task(tsk);
		if (ret)
			goto err;
	} else if (p->flags & IORING_SETUP_SQ_AFF) {
		/* Can't have SQ_AFF without SQPOLL */
		ret = -EINVAL;
		goto err;
	}

	return 0;
err_sqpoll:
	/*
	 * No SQ thread was started on this path.  Complete ->exited here so
	 * that the wait in io_sq_thread_stop(), reached when the last sqd
	 * reference is dropped, does not block.
	 */
	complete(&ctx->sq_data->exited);
err:
	io_sq_thread_finish(ctx);
	return ret;
}
8304
8305 static inline void __io_unaccount_mem(struct user_struct *user,
8306                                       unsigned long nr_pages)
8307 {
8308         atomic_long_sub(nr_pages, &user->locked_vm);
8309 }
8310
8311 static inline int __io_account_mem(struct user_struct *user,
8312                                    unsigned long nr_pages)
8313 {
8314         unsigned long page_limit, cur_pages, new_pages;
8315
8316         /* Don't allow more pages than we can safely lock */
8317         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8318
8319         do {
8320                 cur_pages = atomic_long_read(&user->locked_vm);
8321                 new_pages = cur_pages + nr_pages;
8322                 if (new_pages > page_limit)
8323                         return -ENOMEM;
8324         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8325                                         new_pages) != cur_pages);
8326
8327         return 0;
8328 }
8329
8330 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8331 {
8332         if (ctx->user)
8333                 __io_unaccount_mem(ctx->user, nr_pages);
8334
8335         if (ctx->mm_account)
8336                 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
8337 }
8338
8339 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8340 {
8341         int ret;
8342
8343         if (ctx->user) {
8344                 ret = __io_account_mem(ctx->user, nr_pages);
8345                 if (ret)
8346                         return ret;
8347         }
8348
8349         if (ctx->mm_account)
8350                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
8351
8352         return 0;
8353 }
8354
/*
 * Drop one reference on the compound page backing @ptr and free it once
 * the count hits zero. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	if (ptr) {
		struct page *head = virt_to_head_page(ptr);

		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
8366
8367 static void *io_mem_alloc(size_t size)
8368 {
8369         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
8370                                 __GFP_NORETRY | __GFP_ACCOUNT;
8371
8372         return (void *) __get_free_pages(gfp_flags, get_order(size));
8373 }
8374
8375 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8376                                 size_t *sq_offset)
8377 {
8378         struct io_rings *rings;
8379         size_t off, sq_array_size;
8380
8381         off = struct_size(rings, cqes, cq_entries);
8382         if (off == SIZE_MAX)
8383                 return SIZE_MAX;
8384
8385 #ifdef CONFIG_SMP
8386         off = ALIGN(off, SMP_CACHE_BYTES);
8387         if (off == 0)
8388                 return SIZE_MAX;
8389 #endif
8390
8391         if (sq_offset)
8392                 *sq_offset = off;
8393
8394         sq_array_size = array_size(sizeof(u32), sq_entries);
8395         if (sq_array_size == SIZE_MAX)
8396                 return SIZE_MAX;
8397
8398         if (check_add_overflow(off, sq_array_size, &off))
8399                 return SIZE_MAX;
8400
8401         return off;
8402 }
8403
8404 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
8405 {
8406         struct io_mapped_ubuf *imu = *slot;
8407         unsigned int i;
8408
8409         if (imu != ctx->dummy_ubuf) {
8410                 for (i = 0; i < imu->nr_bvecs; i++)
8411                         unpin_user_page(imu->bvec[i].bv_page);
8412                 if (imu->acct_pages)
8413                         io_unaccount_mem(ctx, imu->acct_pages);
8414                 kvfree(imu);
8415         }
8416         *slot = NULL;
8417 }
8418
8419 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8420 {
8421         io_buffer_unmap(ctx, &prsrc->buf);
8422         prsrc->buf = NULL;
8423 }
8424
8425 static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8426 {
8427         unsigned int i;
8428
8429         for (i = 0; i < ctx->nr_user_bufs; i++)
8430                 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
8431         kfree(ctx->user_bufs);
8432         io_rsrc_data_free(ctx->buf_data);
8433         ctx->user_bufs = NULL;
8434         ctx->buf_data = NULL;
8435         ctx->nr_user_bufs = 0;
8436 }
8437
8438 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8439 {
8440         int ret;
8441
8442         if (!ctx->buf_data)
8443                 return -ENXIO;
8444
8445         ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
8446         if (!ret)
8447                 __io_sqe_buffers_unregister(ctx);
8448         return ret;
8449 }
8450
8451 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8452                        void __user *arg, unsigned index)
8453 {
8454         struct iovec __user *src;
8455
8456 #ifdef CONFIG_COMPAT
8457         if (ctx->compat) {
8458                 struct compat_iovec __user *ciovs;
8459                 struct compat_iovec ciov;
8460
8461                 ciovs = (struct compat_iovec __user *) arg;
8462                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8463                         return -EFAULT;
8464
8465                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8466                 dst->iov_len = ciov.iov_len;
8467                 return 0;
8468         }
8469 #endif
8470         src = (struct iovec __user *) arg;
8471         if (copy_from_user(dst, &src[index], sizeof(*dst)))
8472                 return -EFAULT;
8473         return 0;
8474 }
8475
8476 /*
8477  * Not super efficient, but this is just a registration time. And we do cache
8478  * the last compound head, so generally we'll only do a full search if we don't
8479  * match that one.
8480  *
8481  * We check if the given compound head page has already been accounted, to
8482  * avoid double accounting it. This allows us to account the full size of the
8483  * page, not just the constituent pages of a huge page.
8484  */
8485 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8486                                   int nr_pages, struct page *hpage)
8487 {
8488         int i, j;
8489
8490         /* check current page array */
8491         for (i = 0; i < nr_pages; i++) {
8492                 if (!PageCompound(pages[i]))
8493                         continue;
8494                 if (compound_head(pages[i]) == hpage)
8495                         return true;
8496         }
8497
8498         /* check previously registered pages */
8499         for (i = 0; i < ctx->nr_user_bufs; i++) {
8500                 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
8501
8502                 for (j = 0; j < imu->nr_bvecs; j++) {
8503                         if (!PageCompound(imu->bvec[j].bv_page))
8504                                 continue;
8505                         if (compound_head(imu->bvec[j].bv_page) == hpage)
8506                                 return true;
8507                 }
8508         }
8509
8510         return false;
8511 }
8512
8513 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8514                                  int nr_pages, struct io_mapped_ubuf *imu,
8515                                  struct page **last_hpage)
8516 {
8517         int i, ret;
8518
8519         imu->acct_pages = 0;
8520         for (i = 0; i < nr_pages; i++) {
8521                 if (!PageCompound(pages[i])) {
8522                         imu->acct_pages++;
8523                 } else {
8524                         struct page *hpage;
8525
8526                         hpage = compound_head(pages[i]);
8527                         if (hpage == *last_hpage)
8528                                 continue;
8529                         *last_hpage = hpage;
8530                         if (headpage_already_acct(ctx, pages, i, hpage))
8531                                 continue;
8532                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8533                 }
8534         }
8535
8536         if (!imu->acct_pages)
8537                 return 0;
8538
8539         ret = io_account_mem(ctx, imu->acct_pages);
8540         if (ret)
8541                 imu->acct_pages = 0;
8542         return ret;
8543 }
8544
8545 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8546                                   struct io_mapped_ubuf **pimu,
8547                                   struct page **last_hpage)
8548 {
8549         struct io_mapped_ubuf *imu = NULL;
8550         struct vm_area_struct **vmas = NULL;
8551         struct page **pages = NULL;
8552         unsigned long off, start, end, ubuf;
8553         size_t size;
8554         int ret, pret, nr_pages, i;
8555
8556         if (!iov->iov_base) {
8557                 *pimu = ctx->dummy_ubuf;
8558                 return 0;
8559         }
8560
8561         ubuf = (unsigned long) iov->iov_base;
8562         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8563         start = ubuf >> PAGE_SHIFT;
8564         nr_pages = end - start;
8565
8566         *pimu = NULL;
8567         ret = -ENOMEM;
8568
8569         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8570         if (!pages)
8571                 goto done;
8572
8573         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8574                               GFP_KERNEL);
8575         if (!vmas)
8576                 goto done;
8577
8578         imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
8579         if (!imu)
8580                 goto done;
8581
8582         ret = 0;
8583         mmap_read_lock(current->mm);
8584         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8585                               pages, vmas);
8586         if (pret == nr_pages) {
8587                 /* don't support file backed memory */
8588                 for (i = 0; i < nr_pages; i++) {
8589                         struct vm_area_struct *vma = vmas[i];
8590
8591                         if (vma_is_shmem(vma))
8592                                 continue;
8593                         if (vma->vm_file &&
8594                             !is_file_hugepages(vma->vm_file)) {
8595                                 ret = -EOPNOTSUPP;
8596                                 break;
8597                         }
8598                 }
8599         } else {
8600                 ret = pret < 0 ? pret : -EFAULT;
8601         }
8602         mmap_read_unlock(current->mm);
8603         if (ret) {
8604                 /*
8605                  * if we did partial map, or found file backed vmas,
8606                  * release any pages we did get
8607                  */
8608                 if (pret > 0)
8609                         unpin_user_pages(pages, pret);
8610                 goto done;
8611         }
8612
8613         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8614         if (ret) {
8615                 unpin_user_pages(pages, pret);
8616                 goto done;
8617         }
8618
8619         off = ubuf & ~PAGE_MASK;
8620         size = iov->iov_len;
8621         for (i = 0; i < nr_pages; i++) {
8622                 size_t vec_len;
8623
8624                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8625                 imu->bvec[i].bv_page = pages[i];
8626                 imu->bvec[i].bv_len = vec_len;
8627                 imu->bvec[i].bv_offset = off;
8628                 off = 0;
8629                 size -= vec_len;
8630         }
8631         /* store original address for later verification */
8632         imu->ubuf = ubuf;
8633         imu->ubuf_end = ubuf + iov->iov_len;
8634         imu->nr_bvecs = nr_pages;
8635         *pimu = imu;
8636         ret = 0;
8637 done:
8638         if (ret)
8639                 kvfree(imu);
8640         kvfree(pages);
8641         kvfree(vmas);
8642         return ret;
8643 }
8644
8645 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
8646 {
8647         ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
8648         return ctx->user_bufs ? 0 : -ENOMEM;
8649 }
8650
8651 static int io_buffer_validate(struct iovec *iov)
8652 {
8653         unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
8654
8655         /*
8656          * Don't impose further limits on the size and buffer
8657          * constraints here, we'll -EINVAL later when IO is
8658          * submitted if they are wrong.
8659          */
8660         if (!iov->iov_base)
8661                 return iov->iov_len ? -EFAULT : 0;
8662         if (!iov->iov_len)
8663                 return -EFAULT;
8664
8665         /* arbitrary limit, but we need something */
8666         if (iov->iov_len > SZ_1G)
8667                 return -EFAULT;
8668
8669         if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
8670                 return -EOVERFLOW;
8671
8672         return 0;
8673 }
8674
8675 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
8676                                    unsigned int nr_args, u64 __user *tags)
8677 {
8678         struct page *last_hpage = NULL;
8679         struct io_rsrc_data *data;
8680         int i, ret;
8681         struct iovec iov;
8682
8683         if (ctx->user_bufs)
8684                 return -EBUSY;
8685         if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
8686                 return -EINVAL;
8687         ret = io_rsrc_node_switch_start(ctx);
8688         if (ret)
8689                 return ret;
8690         ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
8691         if (ret)
8692                 return ret;
8693         ret = io_buffers_map_alloc(ctx, nr_args);
8694         if (ret) {
8695                 io_rsrc_data_free(data);
8696                 return ret;
8697         }
8698
8699         for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
8700                 ret = io_copy_iov(ctx, &iov, arg, i);
8701                 if (ret)
8702                         break;
8703                 ret = io_buffer_validate(&iov);
8704                 if (ret)
8705                         break;
8706                 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
8707                         ret = -EINVAL;
8708                         break;
8709                 }
8710
8711                 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
8712                                              &last_hpage);
8713                 if (ret)
8714                         break;
8715         }
8716
8717         WARN_ON_ONCE(ctx->buf_data);
8718
8719         ctx->buf_data = data;
8720         if (ret)
8721                 __io_sqe_buffers_unregister(ctx);
8722         else
8723                 io_rsrc_node_switch(ctx, NULL);
8724         return ret;
8725 }
8726
8727 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
8728                                    struct io_uring_rsrc_update2 *up,
8729                                    unsigned int nr_args)
8730 {
8731         u64 __user *tags = u64_to_user_ptr(up->tags);
8732         struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
8733         struct page *last_hpage = NULL;
8734         bool needs_switch = false;
8735         __u32 done;
8736         int i, err;
8737
8738         if (!ctx->buf_data)
8739                 return -ENXIO;
8740         if (up->offset + nr_args > ctx->nr_user_bufs)
8741                 return -EINVAL;
8742
8743         for (done = 0; done < nr_args; done++) {
8744                 struct io_mapped_ubuf *imu;
8745                 int offset = up->offset + done;
8746                 u64 tag = 0;
8747
8748                 err = io_copy_iov(ctx, &iov, iovs, done);
8749                 if (err)
8750                         break;
8751                 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
8752                         err = -EFAULT;
8753                         break;
8754                 }
8755                 err = io_buffer_validate(&iov);
8756                 if (err)
8757                         break;
8758                 if (!iov.iov_base && tag) {
8759                         err = -EINVAL;
8760                         break;
8761                 }
8762                 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
8763                 if (err)
8764                         break;
8765
8766                 i = array_index_nospec(offset, ctx->nr_user_bufs);
8767                 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
8768                         err = io_queue_rsrc_removal(ctx->buf_data, offset,
8769                                                     ctx->rsrc_node, ctx->user_bufs[i]);
8770                         if (unlikely(err)) {
8771                                 io_buffer_unmap(ctx, &imu);
8772                                 break;
8773                         }
8774                         ctx->user_bufs[i] = NULL;
8775                         needs_switch = true;
8776                 }
8777
8778                 ctx->user_bufs[i] = imu;
8779                 *io_get_tag_slot(ctx->buf_data, offset) = tag;
8780         }
8781
8782         if (needs_switch)
8783                 io_rsrc_node_switch(ctx, ctx->buf_data);
8784         return done ? done : err;
8785 }
8786
8787 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8788 {
8789         __s32 __user *fds = arg;
8790         int fd;
8791
8792         if (ctx->cq_ev_fd)
8793                 return -EBUSY;
8794
8795         if (copy_from_user(&fd, fds, sizeof(*fds)))
8796                 return -EFAULT;
8797
8798         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8799         if (IS_ERR(ctx->cq_ev_fd)) {
8800                 int ret = PTR_ERR(ctx->cq_ev_fd);
8801
8802                 ctx->cq_ev_fd = NULL;
8803                 return ret;
8804         }
8805
8806         return 0;
8807 }
8808
8809 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8810 {
8811         if (ctx->cq_ev_fd) {
8812                 eventfd_ctx_put(ctx->cq_ev_fd);
8813                 ctx->cq_ev_fd = NULL;
8814                 return 0;
8815         }
8816
8817         return -ENXIO;
8818 }
8819
8820 static void io_destroy_buffers(struct io_ring_ctx *ctx)
8821 {
8822         struct io_buffer *buf;
8823         unsigned long index;
8824
8825         xa_for_each(&ctx->io_buffers, index, buf)
8826                 __io_remove_buffers(ctx, buf, index, -1U);
8827 }
8828
8829 static void io_req_cache_free(struct list_head *list)
8830 {
8831         struct io_kiocb *req, *nxt;
8832
8833         list_for_each_entry_safe(req, nxt, list, inflight_entry) {
8834                 list_del(&req->inflight_entry);
8835                 kmem_cache_free(req_cachep, req);
8836         }
8837 }
8838
8839 static void io_req_caches_free(struct io_ring_ctx *ctx)
8840 {
8841         struct io_submit_state *state = &ctx->submit_state;
8842
8843         mutex_lock(&ctx->uring_lock);
8844
8845         if (state->free_reqs) {
8846                 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
8847                 state->free_reqs = 0;
8848         }
8849
8850         io_flush_cached_locked_reqs(ctx, state);
8851         io_req_cache_free(&state->free_list);
8852         mutex_unlock(&ctx->uring_lock);
8853 }
8854
8855 static void io_wait_rsrc_data(struct io_rsrc_data *data)
8856 {
8857         if (data && !atomic_dec_and_test(&data->refs))
8858                 wait_for_completion(&data->done);
8859 }
8860
8861 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8862 {
8863         io_sq_thread_finish(ctx);
8864
8865         if (ctx->mm_account) {
8866                 mmdrop(ctx->mm_account);
8867                 ctx->mm_account = NULL;
8868         }
8869
8870         /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
8871         io_wait_rsrc_data(ctx->buf_data);
8872         io_wait_rsrc_data(ctx->file_data);
8873
8874         mutex_lock(&ctx->uring_lock);
8875         if (ctx->buf_data)
8876                 __io_sqe_buffers_unregister(ctx);
8877         if (ctx->file_data)
8878                 __io_sqe_files_unregister(ctx);
8879         if (ctx->rings)
8880                 __io_cqring_overflow_flush(ctx, true);
8881         mutex_unlock(&ctx->uring_lock);
8882         io_eventfd_unregister(ctx);
8883         io_destroy_buffers(ctx);
8884         if (ctx->sq_creds)
8885                 put_cred(ctx->sq_creds);
8886
8887         /* there are no registered resources left, nobody uses it */
8888         if (ctx->rsrc_node)
8889                 io_rsrc_node_destroy(ctx->rsrc_node);
8890         if (ctx->rsrc_backup_node)
8891                 io_rsrc_node_destroy(ctx->rsrc_backup_node);
8892         flush_delayed_work(&ctx->rsrc_put_work);
8893
8894         WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
8895         WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
8896
8897 #if defined(CONFIG_UNIX)
8898         if (ctx->ring_sock) {
8899                 ctx->ring_sock->file = NULL; /* so that iput() is called */
8900                 sock_release(ctx->ring_sock);
8901         }
8902 #endif
8903
8904         io_mem_free(ctx->rings);
8905         io_mem_free(ctx->sq_sqes);
8906
8907         percpu_ref_exit(&ctx->refs);
8908         free_uid(ctx->user);
8909         io_req_caches_free(ctx);
8910         if (ctx->hash_map)
8911                 io_wq_put_hash(ctx->hash_map);
8912         kfree(ctx->cancel_hash);
8913         kfree(ctx->dummy_ubuf);
8914         kfree(ctx);
8915 }
8916
8917 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8918 {
8919         struct io_ring_ctx *ctx = file->private_data;
8920         __poll_t mask = 0;
8921
8922         poll_wait(file, &ctx->poll_wait, wait);
8923         /*
8924          * synchronizes with barrier from wq_has_sleeper call in
8925          * io_commit_cqring
8926          */
8927         smp_rmb();
8928         if (!io_sqring_full(ctx))
8929                 mask |= EPOLLOUT | EPOLLWRNORM;
8930
8931         /*
8932          * Don't flush cqring overflow list here, just do a simple check.
8933          * Otherwise there could possible be ABBA deadlock:
8934          *      CPU0                    CPU1
8935          *      ----                    ----
8936          * lock(&ctx->uring_lock);
8937          *                              lock(&ep->mtx);
8938          *                              lock(&ctx->uring_lock);
8939          * lock(&ep->mtx);
8940          *
8941          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
8942          * pushs them to do the flush.
8943          */
8944         if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
8945                 mask |= EPOLLIN | EPOLLRDNORM;
8946
8947         return mask;
8948 }
8949
8950 static int io_uring_fasync(int fd, struct file *file, int on)
8951 {
8952         struct io_ring_ctx *ctx = file->private_data;
8953
8954         return fasync_helper(fd, file, on, &ctx->cq_fasync);
8955 }
8956
8957 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8958 {
8959         const struct cred *creds;
8960
8961         creds = xa_erase(&ctx->personalities, id);
8962         if (creds) {
8963                 put_cred(creds);
8964                 return 0;
8965         }
8966
8967         return -EINVAL;
8968 }
8969
8970 struct io_tctx_exit {
8971         struct callback_head            task_work;
8972         struct completion               completion;
8973         struct io_ring_ctx              *ctx;
8974 };
8975
8976 static void io_tctx_exit_cb(struct callback_head *cb)
8977 {
8978         struct io_uring_task *tctx = current->io_uring;
8979         struct io_tctx_exit *work;
8980
8981         work = container_of(cb, struct io_tctx_exit, task_work);
8982         /*
8983          * When @in_idle, we're in cancellation and it's racy to remove the
8984          * node. It'll be removed by the end of cancellation, just ignore it.
8985          */
8986         if (!atomic_read(&tctx->in_idle))
8987                 io_uring_del_tctx_node((unsigned long)work->ctx);
8988         complete(&work->completion);
8989 }
8990
8991 static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
8992 {
8993         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8994
8995         return req->ctx == data;
8996 }
8997
8998 static void io_ring_exit_work(struct work_struct *work)
8999 {
9000         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
9001         unsigned long timeout = jiffies + HZ * 60 * 5;
9002         unsigned long interval = HZ / 20;
9003         struct io_tctx_exit exit;
9004         struct io_tctx_node *node;
9005         int ret;
9006
9007         /*
9008          * If we're doing polled IO and end up having requests being
9009          * submitted async (out-of-line), then completions can come in while
9010          * we're waiting for refs to drop. We need to reap these manually,
9011          * as nobody else will be looking for them.
9012          */
9013         do {
9014                 io_uring_try_cancel_requests(ctx, NULL, true);
9015                 if (ctx->sq_data) {
9016                         struct io_sq_data *sqd = ctx->sq_data;
9017                         struct task_struct *tsk;
9018
9019                         io_sq_thread_park(sqd);
9020                         tsk = sqd->thread;
9021                         if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
9022                                 io_wq_cancel_cb(tsk->io_uring->io_wq,
9023                                                 io_cancel_ctx_cb, ctx, true);
9024                         io_sq_thread_unpark(sqd);
9025                 }
9026
9027                 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
9028                         /* there is little hope left, don't run it too often */
9029                         interval = HZ * 60;
9030                 }
9031         } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
9032
9033         init_completion(&exit.completion);
9034         init_task_work(&exit.task_work, io_tctx_exit_cb);
9035         exit.ctx = ctx;
9036         /*
9037          * Some may use context even when all refs and requests have been put,
9038          * and they are free to do so while still holding uring_lock or
9039          * completion_lock, see io_req_task_submit(). Apart from other work,
9040          * this lock/unlock section also waits them to finish.
9041          */
9042         mutex_lock(&ctx->uring_lock);
9043         while (!list_empty(&ctx->tctx_list)) {
9044                 WARN_ON_ONCE(time_after(jiffies, timeout));
9045
9046                 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
9047                                         ctx_node);
9048                 /* don't spin on a single task if cancellation failed */
9049                 list_rotate_left(&ctx->tctx_list);
9050                 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
9051                 if (WARN_ON_ONCE(ret))
9052                         continue;
9053                 wake_up_process(node->task);
9054
9055                 mutex_unlock(&ctx->uring_lock);
9056                 wait_for_completion(&exit.completion);
9057                 mutex_lock(&ctx->uring_lock);
9058         }
9059         mutex_unlock(&ctx->uring_lock);
9060         spin_lock(&ctx->completion_lock);
9061         spin_unlock(&ctx->completion_lock);
9062
9063         io_ring_ctx_free(ctx);
9064 }
9065
9066 /* Returns true if we found and killed one or more timeouts */
9067 static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
9068                              bool cancel_all)
9069 {
9070         struct io_kiocb *req, *tmp;
9071         int canceled = 0;
9072
9073         spin_lock(&ctx->completion_lock);
9074         spin_lock_irq(&ctx->timeout_lock);
9075         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
9076                 if (io_match_task(req, tsk, cancel_all)) {
9077                         io_kill_timeout(req, -ECANCELED);
9078                         canceled++;
9079                 }
9080         }
9081         spin_unlock_irq(&ctx->timeout_lock);
9082         if (canceled != 0)
9083                 io_commit_cqring(ctx);
9084         spin_unlock(&ctx->completion_lock);
9085         if (canceled != 0)
9086                 io_cqring_ev_posted(ctx);
9087         return canceled != 0;
9088 }
9089
9090 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
9091 {
9092         unsigned long index;
9093         struct creds *creds;
9094
9095         mutex_lock(&ctx->uring_lock);
9096         percpu_ref_kill(&ctx->refs);
9097         if (ctx->rings)
9098                 __io_cqring_overflow_flush(ctx, true);
9099         xa_for_each(&ctx->personalities, index, creds)
9100                 io_unregister_personality(ctx, index);
9101         mutex_unlock(&ctx->uring_lock);
9102
9103         io_kill_timeouts(ctx, NULL, true);
9104         io_poll_remove_all(ctx, NULL, true);
9105
9106         /* if we failed setting up the ctx, we might not have any rings */
9107         io_iopoll_try_reap_events(ctx);
9108
9109         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
9110         /*
9111          * Use system_unbound_wq to avoid spawning tons of event kworkers
9112          * if we're exiting a ton of rings at the same time. It just adds
9113          * noise and overhead, there's no discernable change in runtime
9114          * over using system_wq.
9115          */
9116         queue_work(system_unbound_wq, &ctx->exit_work);
9117 }
9118
9119 static int io_uring_release(struct inode *inode, struct file *file)
9120 {
9121         struct io_ring_ctx *ctx = file->private_data;
9122
9123         file->private_data = NULL;
9124         io_ring_ctx_wait_and_kill(ctx);
9125         return 0;
9126 }
9127
9128 struct io_task_cancel {
9129         struct task_struct *task;
9130         bool all;
9131 };
9132
9133 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
9134 {
9135         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9136         struct io_task_cancel *cancel = data;
9137         bool ret;
9138
9139         if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
9140                 struct io_ring_ctx *ctx = req->ctx;
9141
9142                 /* protect against races with linked timeouts */
9143                 spin_lock(&ctx->completion_lock);
9144                 ret = io_match_task(req, cancel->task, cancel->all);
9145                 spin_unlock(&ctx->completion_lock);
9146         } else {
9147                 ret = io_match_task(req, cancel->task, cancel->all);
9148         }
9149         return ret;
9150 }
9151
9152 static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
9153                                   struct task_struct *task, bool cancel_all)
9154 {
9155         struct io_defer_entry *de;
9156         LIST_HEAD(list);
9157
9158         spin_lock(&ctx->completion_lock);
9159         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
9160                 if (io_match_task(de->req, task, cancel_all)) {
9161                         list_cut_position(&list, &ctx->defer_list, &de->list);
9162                         break;
9163                 }
9164         }
9165         spin_unlock(&ctx->completion_lock);
9166         if (list_empty(&list))
9167                 return false;
9168
9169         while (!list_empty(&list)) {
9170                 de = list_first_entry(&list, struct io_defer_entry, list);
9171                 list_del_init(&de->list);
9172                 io_req_complete_failed(de->req, -ECANCELED);
9173                 kfree(de);
9174         }
9175         return true;
9176 }
9177
9178 static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
9179 {
9180         struct io_tctx_node *node;
9181         enum io_wq_cancel cret;
9182         bool ret = false;
9183
9184         mutex_lock(&ctx->uring_lock);
9185         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
9186                 struct io_uring_task *tctx = node->task->io_uring;
9187
9188                 /*
9189                  * io_wq will stay alive while we hold uring_lock, because it's
9190                  * killed after ctx nodes, which requires to take the lock.
9191                  */
9192                 if (!tctx || !tctx->io_wq)
9193                         continue;
9194                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
9195                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9196         }
9197         mutex_unlock(&ctx->uring_lock);
9198
9199         return ret;
9200 }
9201
/*
 * Keep cancelling requests on @ctx until one full pass finds nothing to do.
 *
 * @ctx:        ring to cancel requests on.
 * @task:       task whose requests are targeted. When NULL, the io-wq step
 *              goes through io_uring_try_cancel_iowq() for the whole ring
 *              and no task work is run.
 * @cancel_all: passed through to the matching helpers; also enables the
 *              iopoll reaping step for non-SQPOLL rings.
 *
 * Each pass tries, in order: io-wq work, iopoll events, deferred requests,
 * poll requests, timeouts and (for a non-NULL @task) pending task work.
 */
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         bool cancel_all)
{
        struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
        struct io_uring_task *tctx = task ? task->io_uring : NULL;

        while (1) {
                enum io_wq_cancel cret;
                /* set by any step that made progress; false ends the loop */
                bool ret = false;

                if (!task) {
                        ret |= io_uring_try_cancel_iowq(ctx);
                } else if (tctx && tctx->io_wq) {
                        /*
                         * Cancels requests of all rings, not only @ctx, but
                         * it's fine as the task is in exit/exec.
                         */
                        cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
                                               &cancel, true);
                        ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
                }

                /* SQPOLL thread does its own polling */
                if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
                    (ctx->sq_data && ctx->sq_data->thread == current)) {
                        /* reap until the iopoll list has drained */
                        while (!list_empty_careful(&ctx->iopoll_list)) {
                                io_iopoll_try_reap_events(ctx);
                                ret = true;
                        }
                }

                ret |= io_cancel_defer_files(ctx, task, cancel_all);
                ret |= io_poll_remove_all(ctx, task, cancel_all);
                ret |= io_kill_timeouts(ctx, task, cancel_all);
                if (task)
                        ret |= io_run_task_work();
                if (!ret)
                        break;
                cond_resched();
        }
}
9244
9245 static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9246 {
9247         struct io_uring_task *tctx = current->io_uring;
9248         struct io_tctx_node *node;
9249         int ret;
9250
9251         if (unlikely(!tctx)) {
9252                 ret = io_uring_alloc_task_context(current, ctx);
9253                 if (unlikely(ret))
9254                         return ret;
9255                 tctx = current->io_uring;
9256         }
9257         if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
9258                 node = kmalloc(sizeof(*node), GFP_KERNEL);
9259                 if (!node)
9260                         return -ENOMEM;
9261                 node->ctx = ctx;
9262                 node->task = current;
9263
9264                 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
9265                                         node, GFP_KERNEL));
9266                 if (ret) {
9267                         kfree(node);
9268                         return ret;
9269                 }
9270
9271                 mutex_lock(&ctx->uring_lock);
9272                 list_add(&node->ctx_node, &ctx->tctx_list);
9273                 mutex_unlock(&ctx->uring_lock);
9274         }
9275         tctx->last = ctx;
9276         return 0;
9277 }
9278
9279 /*
9280  * Note that this task has used io_uring. We use it for cancelation purposes.
9281  */
9282 static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9283 {
9284         struct io_uring_task *tctx = current->io_uring;
9285
9286         if (likely(tctx && tctx->last == ctx))
9287                 return 0;
9288         return __io_uring_add_tctx_node(ctx);
9289 }
9290
9291 /*
9292  * Remove this io_uring_file -> task mapping.
9293  */
9294 static void io_uring_del_tctx_node(unsigned long index)
9295 {
9296         struct io_uring_task *tctx = current->io_uring;
9297         struct io_tctx_node *node;
9298
9299         if (!tctx)
9300                 return;
9301         node = xa_erase(&tctx->xa, index);
9302         if (!node)
9303                 return;
9304
9305         WARN_ON_ONCE(current != node->task);
9306         WARN_ON_ONCE(list_empty(&node->ctx_node));
9307
9308         mutex_lock(&node->ctx->uring_lock);
9309         list_del(&node->ctx_node);
9310         mutex_unlock(&node->ctx->uring_lock);
9311
9312         if (tctx->last == node->ctx)
9313                 tctx->last = NULL;
9314         kfree(node);
9315 }
9316
9317 static void io_uring_clean_tctx(struct io_uring_task *tctx)
9318 {
9319         struct io_wq *wq = tctx->io_wq;
9320         struct io_tctx_node *node;
9321         unsigned long index;
9322
9323         xa_for_each(&tctx->xa, index, node)
9324                 io_uring_del_tctx_node(index);
9325         if (wq) {
9326                 /*
9327                  * Must be after io_uring_del_task_file() (removes nodes under
9328                  * uring_lock) to avoid race with io_uring_try_cancel_iowq().
9329                  */
9330                 io_wq_put_and_exit(wq);
9331                 tctx->io_wq = NULL;
9332         }
9333 }
9334
9335 static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
9336 {
9337         if (tracked)
9338                 return atomic_read(&tctx->inflight_tracked);
9339         return percpu_counter_sum(&tctx->inflight);
9340 }
9341
9342 static void io_uring_drop_tctx_refs(struct task_struct *task)
9343 {
9344         struct io_uring_task *tctx = task->io_uring;
9345         unsigned int refs = tctx->cached_refs;
9346
9347         if (refs) {
9348                 tctx->cached_refs = 0;
9349                 percpu_counter_sub(&tctx->inflight, refs);
9350                 put_task_struct_many(task, refs);
9351         }
9352 }
9353
/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation.
 *
 * @cancel_all selects which inflight counter is waited on (all requests when
 * true, only tracked ones when false) and is forwarded to the per-ring
 * cancelation. When it is true the task's io_uring context is freed at the
 * end. The function loops until the chosen inflight count reaches zero.
 */
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
        struct io_uring_task *tctx = current->io_uring;
        struct io_ring_ctx *ctx;
        s64 inflight;
        DEFINE_WAIT(wait);

        WARN_ON_ONCE(sqd && sqd->thread != current);

        /* task never set up an io_uring context: nothing to cancel */
        if (!current->io_uring)
                return;
        if (tctx->io_wq)
                io_wq_exit_start(tctx->io_wq);

        /* in_idle stays elevated for the whole cancel/wait loop */
        atomic_inc(&tctx->in_idle);
        do {
                io_uring_drop_tctx_refs(current);
                /* read completions before cancelations */
                inflight = tctx_inflight(tctx, !cancel_all);
                if (!inflight)
                        break;

                if (!sqd) {
                        struct io_tctx_node *node;
                        unsigned long index;

                        /* regular task: walk every ring it has a node for */
                        xa_for_each(&tctx->xa, index, node) {
                                /* sqpoll task will cancel all its requests */
                                if (node->ctx->sq_data)
                                        continue;
                                io_uring_try_cancel_requests(node->ctx, current,
                                                             cancel_all);
                        }
                } else {
                        /* SQPOLL thread: walk the rings attached to @sqd */
                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
                                io_uring_try_cancel_requests(ctx, current,
                                                             cancel_all);
                }

                prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
                io_uring_drop_tctx_refs(current);
                /*
                 * If we've seen completions, retry without waiting. This
                 * avoids a race where a completion comes in before we did
                 * prepare_to_wait().
                 */
                if (inflight == tctx_inflight(tctx, !cancel_all))
                        schedule();
                finish_wait(&tctx->wait, &wait);
        } while (1);
        atomic_dec(&tctx->in_idle);

        io_uring_clean_tctx(tctx);
        if (cancel_all) {
                /* for exec all current's requests should be gone, kill tctx */
                __io_uring_free(current);
        }
}
9416
/*
 * Non-static entry point for cancelling the current task's io_uring requests.
 * Runs io_uring_cancel_generic() with no SQPOLL data, i.e. on behalf of a
 * regular task rather than an SQPOLL thread.
 */
void __io_uring_cancel(bool cancel_all)
{
        io_uring_cancel_generic(cancel_all, NULL);
}
9421
9422 static void *io_uring_validate_mmap_request(struct file *file,
9423                                             loff_t pgoff, size_t sz)
9424 {
9425         struct io_ring_ctx *ctx = file->private_data;
9426         loff_t offset = pgoff << PAGE_SHIFT;
9427         struct page *page;
9428         void *ptr;
9429
9430         switch (offset) {
9431         case IORING_OFF_SQ_RING:
9432         case IORING_OFF_CQ_RING:
9433                 ptr = ctx->rings;
9434                 break;
9435         case IORING_OFF_SQES:
9436                 ptr = ctx->sq_sqes;
9437                 break;
9438         default:
9439                 return ERR_PTR(-EINVAL);
9440         }
9441
9442         page = virt_to_head_page(ptr);
9443         if (sz > page_size(page))
9444                 return ERR_PTR(-EINVAL);
9445
9446         return ptr;
9447 }
9448
9449 #ifdef CONFIG_MMU
9450
9451 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9452 {
9453         size_t sz = vma->vm_end - vma->vm_start;
9454         unsigned long pfn;
9455         void *ptr;
9456
9457         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9458         if (IS_ERR(ptr))
9459                 return PTR_ERR(ptr);
9460
9461         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9462         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9463 }
9464
9465 #else /* !CONFIG_MMU */
9466
9467 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9468 {
9469         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9470 }
9471
/*
 * !MMU only: report the mapping capabilities of an io_uring file. The same
 * set is returned for every file: direct mapping, readable and writable.
 */
static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
        return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}
9476
/*
 * !MMU only: the "unmapped area" for an io_uring mapping is simply the kernel
 * address of the validated region. A validation failure is returned as the
 * errno value cast to unsigned long.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
        unsigned long addr, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
        void *region = io_uring_validate_mmap_request(file, pgoff, len);

        return IS_ERR(region) ? PTR_ERR(region) : (unsigned long) region;
}
9489
9490 #endif /* !CONFIG_MMU */
9491
9492 static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
9493 {
9494         DEFINE_WAIT(wait);
9495
9496         do {
9497                 if (!io_sqring_full(ctx))
9498                         break;
9499                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9500
9501                 if (!io_sqring_full(ctx))
9502                         break;
9503                 schedule();
9504         } while (!signal_pending(current));
9505
9506         finish_wait(&ctx->sqo_sq_wait, &wait);
9507         return 0;
9508 }
9509
9510 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9511                           struct __kernel_timespec __user **ts,
9512                           const sigset_t __user **sig)
9513 {
9514         struct io_uring_getevents_arg arg;
9515
9516         /*
9517          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9518          * is just a pointer to the sigset_t.
9519          */
9520         if (!(flags & IORING_ENTER_EXT_ARG)) {
9521                 *sig = (const sigset_t __user *) argp;
9522                 *ts = NULL;
9523                 return 0;
9524         }
9525
9526         /*
9527          * EXT_ARG is set - ensure we agree on the size of it and copy in our
9528          * timespec and sigset_t pointers if good.
9529          */
9530         if (*argsz != sizeof(arg))
9531                 return -EINVAL;
9532         if (copy_from_user(&arg, argp, sizeof(arg)))
9533                 return -EFAULT;
9534         *sig = u64_to_user_ptr(arg.sigmask);
9535         *argsz = arg.sigmask_sz;
9536         *ts = u64_to_user_ptr(arg.ts);
9537         return 0;
9538 }
9539
/*
 * io_uring_enter() system call: submit SQEs and/or wait for completions on
 * the ring referred to by @fd.
 *
 * @to_submit:    number of SQEs to submit. On an IORING_SETUP_SQPOLL ring
 *                nothing is submitted here; the value is reported back as-is.
 * @min_complete: with IORING_ENTER_GETEVENTS, number of completions to wait
 *                for, clamped to the CQ ring size.
 * @flags:        IORING_ENTER_* bits; any other bit yields -EINVAL.
 * @argp, @argsz: signal mask pointer and its size, or, with
 *                IORING_ENTER_EXT_ARG, a struct io_uring_getevents_arg and
 *                its size (see io_get_ext_arg()).
 *
 * Returns the submitted count when it is non-zero, otherwise the status of
 * the last step performed. Early failures: -EBADF (bad fd), -EOPNOTSUPP (not
 * an io_uring file), -ENXIO (ring reference could not be taken), -EBADFD
 * (ring still disabled), -EOWNERDEAD (SQPOLL thread is gone).
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                u32, min_complete, u32, flags, const void __user *, argp,
                size_t, argsz)
{
        struct io_ring_ctx *ctx;
        int submitted = 0;
        struct fd f;
        long ret;

        io_run_task_work();

        if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
                               IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
                return -EINVAL;

        f = fdget(fd);
        if (unlikely(!f.file))
                return -EBADF;

        /* the fd must refer to an io_uring instance */
        ret = -EOPNOTSUPP;
        if (unlikely(f.file->f_op != &io_uring_fops))
                goto out_fput;

        /* hold a ring reference for the rest of the call */
        ret = -ENXIO;
        ctx = f.file->private_data;
        if (unlikely(!percpu_ref_tryget(&ctx->refs)))
                goto out_fput;

        ret = -EBADFD;
        if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
                goto out;

        /*
         * For SQ polling, the thread will do all submissions and completions.
         * Just return the requested submit count, and wake the thread if
         * we were asked to.
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                io_cqring_overflow_flush(ctx);

                if (unlikely(ctx->sq_data->thread == NULL)) {
                        ret = -EOWNERDEAD;
                        goto out;
                }
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sq_data->wait);
                if (flags & IORING_ENTER_SQ_WAIT) {
                        ret = io_sqpoll_wait_sq(ctx);
                        if (ret)
                                goto out;
                }
                submitted = to_submit;
        } else if (to_submit) {
                /* register this task with the ring before submitting */
                ret = io_uring_add_tctx_node(ctx);
                if (unlikely(ret))
                        goto out;
                mutex_lock(&ctx->uring_lock);
                submitted = io_submit_sqes(ctx, to_submit);
                mutex_unlock(&ctx->uring_lock);

                /* short submission: report it and skip the wait below */
                if (submitted != to_submit)
                        goto out;
        }
        if (flags & IORING_ENTER_GETEVENTS) {
                const sigset_t __user *sig;
                struct __kernel_timespec __user *ts;

                ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
                if (unlikely(ret))
                        goto out;

                min_complete = min(min_complete, ctx->cq_entries);

                /*
                 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
                 * space applications don't need to do io completion events
                 * polling again, they can rely on io_sq_thread to do polling
                 * work, which can reduce cpu usage and uring_lock contention.
                 */
                if (ctx->flags & IORING_SETUP_IOPOLL &&
                    !(ctx->flags & IORING_SETUP_SQPOLL)) {
                        ret = io_iopoll_check(ctx, min_complete);
                } else {
                        ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
                }
        }

out:
        percpu_ref_put(&ctx->refs);
out_fput:
        fdput(f);
        /* a non-zero submit count takes precedence over ret */
        return submitted ? submitted : ret;
}
9634
9635 #ifdef CONFIG_PROC_FS
9636 static int io_uring_show_cred(struct seq_file *m, unsigned int id,
9637                 const struct cred *cred)
9638 {
9639         struct user_namespace *uns = seq_user_ns(m);
9640         struct group_info *gi;
9641         kernel_cap_t cap;
9642         unsigned __capi;
9643         int g;
9644
9645         seq_printf(m, "%5d\n", id);
9646         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9647         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9648         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9649         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9650         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9651         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9652         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9653         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9654         seq_puts(m, "\n\tGroups:\t");
9655         gi = cred->group_info;
9656         for (g = 0; g < gi->ngroups; g++) {
9657                 seq_put_decimal_ull(m, g ? " " : "",
9658                                         from_kgid_munged(uns, gi->gid[g]));
9659         }
9660         seq_puts(m, "\n\tCapEff:\t");
9661         cap = cred->cap_effective;
9662         CAP_FOR_EACH_U32(__capi)
9663                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9664         seq_putc(m, '\n');
9665         return 0;
9666 }
9667
9668 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9669 {
9670         struct io_sq_data *sq = NULL;
9671         bool has_lock;
9672         int i;
9673
9674         /*
9675          * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9676          * since fdinfo case grabs it in the opposite direction of normal use
9677          * cases. If we fail to get the lock, we just don't iterate any
9678          * structures that could be going away outside the io_uring mutex.
9679          */
9680         has_lock = mutex_trylock(&ctx->uring_lock);
9681
9682         if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
9683                 sq = ctx->sq_data;
9684                 if (!sq->thread)
9685                         sq = NULL;
9686         }
9687
9688         seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9689         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
9690         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
9691         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
9692                 struct file *f = io_file_from_index(ctx, i);
9693
9694                 if (f)
9695                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9696                 else
9697                         seq_printf(m, "%5u: <none>\n", i);
9698         }
9699         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
9700         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
9701                 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
9702                 unsigned int len = buf->ubuf_end - buf->ubuf;
9703
9704                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
9705         }
9706         if (has_lock && !xa_empty(&ctx->personalities)) {
9707                 unsigned long index;
9708                 const struct cred *cred;
9709
9710                 seq_printf(m, "Personalities:\n");
9711                 xa_for_each(&ctx->personalities, index, cred)
9712                         io_uring_show_cred(m, index, cred);
9713         }
9714         seq_printf(m, "PollList:\n");
9715         spin_lock(&ctx->completion_lock);
9716         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9717                 struct hlist_head *list = &ctx->cancel_hash[i];
9718                 struct io_kiocb *req;
9719
9720                 hlist_for_each_entry(req, list, hash_node)
9721                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
9722                                         req->task->task_works != NULL);
9723         }
9724         spin_unlock(&ctx->completion_lock);
9725         if (has_lock)
9726                 mutex_unlock(&ctx->uring_lock);
9727 }
9728
9729 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9730 {
9731         struct io_ring_ctx *ctx = f->private_data;
9732
9733         if (percpu_ref_tryget(&ctx->refs)) {
9734                 __io_uring_show_fdinfo(ctx, m);
9735                 percpu_ref_put(&ctx->refs);
9736         }
9737 }
9738 #endif
9739
/*
 * File operations of the anonymous io_uring file. io_uring_enter() compares
 * a file's f_op against this table to recognise io_uring instances.
 */
static const struct file_operations io_uring_fops = {
        .release        = io_uring_release,
        .mmap           = io_uring_mmap,
#ifndef CONFIG_MMU
        /* extra hooks only provided on !MMU builds */
        .get_unmapped_area = io_uring_nommu_get_unmapped_area,
        .mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
        .poll           = io_uring_poll,
        .fasync         = io_uring_fasync,
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = io_uring_show_fdinfo,
#endif
};
9753
9754 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9755                                   struct io_uring_params *p)
9756 {
9757         struct io_rings *rings;
9758         size_t size, sq_array_offset;
9759
9760         /* make sure these are sane, as we already accounted them */
9761         ctx->sq_entries = p->sq_entries;
9762         ctx->cq_entries = p->cq_entries;
9763
9764         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9765         if (size == SIZE_MAX)
9766                 return -EOVERFLOW;
9767
9768         rings = io_mem_alloc(size);
9769         if (!rings)
9770                 return -ENOMEM;
9771
9772         ctx->rings = rings;
9773         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9774         rings->sq_ring_mask = p->sq_entries - 1;
9775         rings->cq_ring_mask = p->cq_entries - 1;
9776         rings->sq_ring_entries = p->sq_entries;
9777         rings->cq_ring_entries = p->cq_entries;
9778
9779         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9780         if (size == SIZE_MAX) {
9781                 io_mem_free(ctx->rings);
9782                 ctx->rings = NULL;
9783                 return -EOVERFLOW;
9784         }
9785
9786         ctx->sq_sqes = io_mem_alloc(size);
9787         if (!ctx->sq_sqes) {
9788                 io_mem_free(ctx->rings);
9789                 ctx->rings = NULL;
9790                 return -ENOMEM;
9791         }
9792
9793         return 0;
9794 }
9795
9796 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9797 {
9798         int ret, fd;
9799
9800         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9801         if (fd < 0)
9802                 return fd;
9803
9804         ret = io_uring_add_tctx_node(ctx);
9805         if (ret) {
9806                 put_unused_fd(fd);
9807                 return ret;
9808         }
9809         fd_install(fd, file);
9810         return fd;
9811 }
9812
/*
 * Allocate an anonymous fd, this is what constitutes the application
 * visible backing of an io_uring instance. The application mmaps this
 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 * we have to tie this fd to a socket for file garbage collection purposes.
 *
 * Returns the new file, or an ERR_PTR() from socket or file creation. When
 * file creation fails, the socket created for the ring is released again.
 */
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
        struct file *file;
#if defined(CONFIG_UNIX)
        int ret;

        /* kernel-internal AF_UNIX socket stored in ctx->ring_sock */
        ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
                                &ctx->ring_sock);
        if (ret)
                return ERR_PTR(ret);
#endif

        file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
                                        O_RDWR | O_CLOEXEC);
#if defined(CONFIG_UNIX)
        if (IS_ERR(file)) {
                /* undo the socket creation; the error is returned below */
                sock_release(ctx->ring_sock);
                ctx->ring_sock = NULL;
        } else {
                /* link the socket to the ring file */
                ctx->ring_sock->file = file;
        }
#endif
        return file;
}
9843
/*
 * Create a new io_uring instance and return a file descriptor for it.
 *
 * @entries: requested SQ size; must be non-zero and is rounded up to a power
 *           of two. Values above IORING_MAX_ENTRIES are rejected unless
 *           IORING_SETUP_CLAMP is set, in which case they are clamped.
 * @p:       kernel copy of the setup parameters; ring sizes, ring offsets
 *           and feature bits are filled in here.
 * @params:  user pointer that receives the completed @p.
 *
 * Returns the installed fd on success or a negative errno. Failures after
 * the context has been allocated tear it down through
 * io_ring_ctx_wait_and_kill(), except for a failed fd install, where the
 * cleanup is left to fput() of the ring file.
 */
static int io_uring_create(unsigned entries, struct io_uring_params *p,
                           struct io_uring_params __user *params)
{
        struct io_ring_ctx *ctx;
        struct file *file;
        int ret;

        if (!entries)
                return -EINVAL;
        if (entries > IORING_MAX_ENTRIES) {
                if (!(p->flags & IORING_SETUP_CLAMP))
                        return -EINVAL;
                entries = IORING_MAX_ENTRIES;
        }

        /*
         * Use twice as many entries for the CQ ring. It's possible for the
         * application to drive a higher depth than the size of the SQ ring,
         * since the sqes are only used at submission time. This allows for
         * some flexibility in overcommitting a bit. If the application has
         * set IORING_SETUP_CQSIZE, it will have passed in the desired number
         * of CQ ring entries manually.
         */
        p->sq_entries = roundup_pow_of_two(entries);
        if (p->flags & IORING_SETUP_CQSIZE) {
                /*
                 * If IORING_SETUP_CQSIZE is set, we do the same roundup
                 * to a power-of-two, if it isn't already. We do NOT impose
                 * any cq vs sq ring sizing.
                 */
                if (!p->cq_entries)
                        return -EINVAL;
                if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
                        if (!(p->flags & IORING_SETUP_CLAMP))
                                return -EINVAL;
                        p->cq_entries = IORING_MAX_CQ_ENTRIES;
                }
                p->cq_entries = roundup_pow_of_two(p->cq_entries);
                /* the CQ ring may not be smaller than the SQ ring */
                if (p->cq_entries < p->sq_entries)
                        return -EINVAL;
        } else {
                p->cq_entries = 2 * p->sq_entries;
        }

        ctx = io_ring_ctx_alloc(p);
        if (!ctx)
                return -ENOMEM;
        ctx->compat = in_compat_syscall();
        /* a user reference is only taken for callers without CAP_IPC_LOCK */
        if (!capable(CAP_IPC_LOCK))
                ctx->user = get_uid(current_user());

        /*
         * This is just grabbed for accounting purposes. When a process exits,
         * the mm is exited and dropped before the files, hence we need to hang
         * on to this mm purely for the purposes of being able to unaccount
         * memory (locked/pinned vm). It's not used for anything else.
         */
        mmgrab(current->mm);
        ctx->mm_account = current->mm;

        ret = io_allocate_scq_urings(ctx, p);
        if (ret)
                goto err;

        ret = io_sq_offload_create(ctx, p);
        if (ret)
                goto err;
        /* always set a rsrc node */
        ret = io_rsrc_node_switch_start(ctx);
        if (ret)
                goto err;
        io_rsrc_node_switch(ctx, NULL);

        /* offsets of the SQ ring fields within the shared rings mapping */
        memset(&p->sq_off, 0, sizeof(p->sq_off));
        p->sq_off.head = offsetof(struct io_rings, sq.head);
        p->sq_off.tail = offsetof(struct io_rings, sq.tail);
        p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
        p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
        p->sq_off.flags = offsetof(struct io_rings, sq_flags);
        p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
        p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;

        /* offsets of the CQ ring fields within the shared rings mapping */
        memset(&p->cq_off, 0, sizeof(p->cq_off));
        p->cq_off.head = offsetof(struct io_rings, cq.head);
        p->cq_off.tail = offsetof(struct io_rings, cq.tail);
        p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
        p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
        p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
        p->cq_off.cqes = offsetof(struct io_rings, cqes);
        p->cq_off.flags = offsetof(struct io_rings, cq_flags);

        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
                        IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
                        IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
                        IORING_FEAT_RSRC_TAGS;

        /* report the final parameters before any file or fd exists */
        if (copy_to_user(params, p, sizeof(*p))) {
                ret = -EFAULT;
                goto err;
        }

        file = io_uring_get_file(ctx);
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
                goto err;
        }

        /*
         * Install ring fd as the very last thing, so we don't risk someone
         * having closed it before we finish setup
         */
        ret = io_uring_install_fd(ctx, file);
        if (ret < 0) {
                /* fput will clean it up */
                fput(file);
                return ret;
        }

        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
        return ret;
err:
        io_ring_ctx_wait_and_kill(ctx);
        return ret;
}
9970
9971 /*
9972  * Sets up an aio uring context, and returns the fd. Applications asks for a
9973  * ring size, we return the actual sq/cq ring sizes (among other things) in the
9974  * params structure passed in.
9975  */
9976 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9977 {
9978         struct io_uring_params p;
9979         int i;
9980
9981         if (copy_from_user(&p, params, sizeof(p)))
9982                 return -EFAULT;
9983         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9984                 if (p.resv[i])
9985                         return -EINVAL;
9986         }
9987
9988         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
9989                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9990                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9991                         IORING_SETUP_R_DISABLED))
9992                 return -EINVAL;
9993
9994         return  io_uring_create(entries, &p, params);
9995 }
9996
/*
 * io_uring_setup() system call entry point; all work is done by the static
 * io_uring_setup() helper.
 */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
                struct io_uring_params __user *, params)
{
        return io_uring_setup(entries, params);
}
10002
10003 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
10004 {
10005         struct io_uring_probe *p;
10006         size_t size;
10007         int i, ret;
10008
10009         size = struct_size(p, ops, nr_args);
10010         if (size == SIZE_MAX)
10011                 return -EOVERFLOW;
10012         p = kzalloc(size, GFP_KERNEL);
10013         if (!p)
10014                 return -ENOMEM;
10015
10016         ret = -EFAULT;
10017         if (copy_from_user(p, arg, size))
10018                 goto out;
10019         ret = -EINVAL;
10020         if (memchr_inv(p, 0, size))
10021                 goto out;
10022
10023         p->last_op = IORING_OP_LAST - 1;
10024         if (nr_args > IORING_OP_LAST)
10025                 nr_args = IORING_OP_LAST;
10026
10027         for (i = 0; i < nr_args; i++) {
10028                 p->ops[i].op = i;
10029                 if (!io_op_defs[i].not_supported)
10030                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
10031         }
10032         p->ops_len = i;
10033
10034         ret = 0;
10035         if (copy_to_user(arg, p, size))
10036                 ret = -EFAULT;
10037 out:
10038         kfree(p);
10039         return ret;
10040 }
10041
10042 static int io_register_personality(struct io_ring_ctx *ctx)
10043 {
10044         const struct cred *creds;
10045         u32 id;
10046         int ret;
10047
10048         creds = get_current_cred();
10049
10050         ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
10051                         XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
10052         if (ret < 0) {
10053                 put_cred(creds);
10054                 return ret;
10055         }
10056         return id;
10057 }
10058
10059 static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
10060                                     unsigned int nr_args)
10061 {
10062         struct io_uring_restriction *res;
10063         size_t size;
10064         int i, ret;
10065
10066         /* Restrictions allowed only if rings started disabled */
10067         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10068                 return -EBADFD;
10069
10070         /* We allow only a single restrictions registration */
10071         if (ctx->restrictions.registered)
10072                 return -EBUSY;
10073
10074         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
10075                 return -EINVAL;
10076
10077         size = array_size(nr_args, sizeof(*res));
10078         if (size == SIZE_MAX)
10079                 return -EOVERFLOW;
10080
10081         res = memdup_user(arg, size);
10082         if (IS_ERR(res))
10083                 return PTR_ERR(res);
10084
10085         ret = 0;
10086
10087         for (i = 0; i < nr_args; i++) {
10088                 switch (res[i].opcode) {
10089                 case IORING_RESTRICTION_REGISTER_OP:
10090                         if (res[i].register_op >= IORING_REGISTER_LAST) {
10091                                 ret = -EINVAL;
10092                                 goto out;
10093                         }
10094
10095                         __set_bit(res[i].register_op,
10096                                   ctx->restrictions.register_op);
10097                         break;
10098                 case IORING_RESTRICTION_SQE_OP:
10099                         if (res[i].sqe_op >= IORING_OP_LAST) {
10100                                 ret = -EINVAL;
10101                                 goto out;
10102                         }
10103
10104                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10105                         break;
10106                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10107                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10108                         break;
10109                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10110                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10111                         break;
10112                 default:
10113                         ret = -EINVAL;
10114                         goto out;
10115                 }
10116         }
10117
10118 out:
10119         /* Reset all restrictions if an error happened */
10120         if (ret != 0)
10121                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10122         else
10123                 ctx->restrictions.registered = true;
10124
10125         kfree(res);
10126         return ret;
10127 }
10128
10129 static int io_register_enable_rings(struct io_ring_ctx *ctx)
10130 {
10131         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10132                 return -EBADFD;
10133
10134         if (ctx->restrictions.registered)
10135                 ctx->restricted = 1;
10136
10137         ctx->flags &= ~IORING_SETUP_R_DISABLED;
10138         if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
10139                 wake_up(&ctx->sq_data->wait);
10140         return 0;
10141 }
10142
10143 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
10144                                      struct io_uring_rsrc_update2 *up,
10145                                      unsigned nr_args)
10146 {
10147         __u32 tmp;
10148         int err;
10149
10150         if (up->resv)
10151                 return -EINVAL;
10152         if (check_add_overflow(up->offset, nr_args, &tmp))
10153                 return -EOVERFLOW;
10154         err = io_rsrc_node_switch_start(ctx);
10155         if (err)
10156                 return err;
10157
10158         switch (type) {
10159         case IORING_RSRC_FILE:
10160                 return __io_sqe_files_update(ctx, up, nr_args);
10161         case IORING_RSRC_BUFFER:
10162                 return __io_sqe_buffers_update(ctx, up, nr_args);
10163         }
10164         return -EINVAL;
10165 }
10166
10167 static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
10168                                     unsigned nr_args)
10169 {
10170         struct io_uring_rsrc_update2 up;
10171
10172         if (!nr_args)
10173                 return -EINVAL;
10174         memset(&up, 0, sizeof(up));
10175         if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
10176                 return -EFAULT;
10177         return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
10178 }
10179
10180 static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
10181                                    unsigned size, unsigned type)
10182 {
10183         struct io_uring_rsrc_update2 up;
10184
10185         if (size != sizeof(up))
10186                 return -EINVAL;
10187         if (copy_from_user(&up, arg, sizeof(up)))
10188                 return -EFAULT;
10189         if (!up.nr || up.resv)
10190                 return -EINVAL;
10191         return __io_register_rsrc_update(ctx, type, &up, up.nr);
10192 }
10193
10194 static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
10195                             unsigned int size, unsigned int type)
10196 {
10197         struct io_uring_rsrc_register rr;
10198
10199         /* keep it extendible */
10200         if (size != sizeof(rr))
10201                 return -EINVAL;
10202
10203         memset(&rr, 0, sizeof(rr));
10204         if (copy_from_user(&rr, arg, size))
10205                 return -EFAULT;
10206         if (!rr.nr || rr.resv || rr.resv2)
10207                 return -EINVAL;
10208
10209         switch (type) {
10210         case IORING_RSRC_FILE:
10211                 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10212                                              rr.nr, u64_to_user_ptr(rr.tags));
10213         case IORING_RSRC_BUFFER:
10214                 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10215                                                rr.nr, u64_to_user_ptr(rr.tags));
10216         }
10217         return -EINVAL;
10218 }
10219
10220 static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
10221                                 unsigned len)
10222 {
10223         struct io_uring_task *tctx = current->io_uring;
10224         cpumask_var_t new_mask;
10225         int ret;
10226
10227         if (!tctx || !tctx->io_wq)
10228                 return -EINVAL;
10229
10230         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10231                 return -ENOMEM;
10232
10233         cpumask_clear(new_mask);
10234         if (len > cpumask_size())
10235                 len = cpumask_size();
10236
10237         if (copy_from_user(new_mask, arg, len)) {
10238                 free_cpumask_var(new_mask);
10239                 return -EFAULT;
10240         }
10241
10242         ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10243         free_cpumask_var(new_mask);
10244         return ret;
10245 }
10246
10247 static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
10248 {
10249         struct io_uring_task *tctx = current->io_uring;
10250
10251         if (!tctx || !tctx->io_wq)
10252                 return -EINVAL;
10253
10254         return io_wq_cpu_affinity(tctx->io_wq, NULL);
10255 }
10256
10257 static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
10258                                         void __user *arg)
10259 {
10260         struct io_uring_task *tctx = current->io_uring;
10261         __u32 new_count[2];
10262         int i, ret;
10263
10264         if (!tctx || !tctx->io_wq)
10265                 return -EINVAL;
10266         if (copy_from_user(new_count, arg, sizeof(new_count)))
10267                 return -EFAULT;
10268         for (i = 0; i < ARRAY_SIZE(new_count); i++)
10269                 if (new_count[i] > INT_MAX)
10270                         return -EINVAL;
10271
10272         ret = io_wq_max_workers(tctx->io_wq, new_count);
10273         if (ret)
10274                 return ret;
10275
10276         if (copy_to_user(arg, new_count, sizeof(new_count)))
10277                 return -EFAULT;
10278
10279         return 0;
10280 }
10281
10282 static bool io_register_op_must_quiesce(int op)
10283 {
10284         switch (op) {
10285         case IORING_REGISTER_BUFFERS:
10286         case IORING_UNREGISTER_BUFFERS:
10287         case IORING_REGISTER_FILES:
10288         case IORING_UNREGISTER_FILES:
10289         case IORING_REGISTER_FILES_UPDATE:
10290         case IORING_REGISTER_PROBE:
10291         case IORING_REGISTER_PERSONALITY:
10292         case IORING_UNREGISTER_PERSONALITY:
10293         case IORING_REGISTER_FILES2:
10294         case IORING_REGISTER_FILES_UPDATE2:
10295         case IORING_REGISTER_BUFFERS2:
10296         case IORING_REGISTER_BUFFERS_UPDATE:
10297         case IORING_REGISTER_IOWQ_AFF:
10298         case IORING_UNREGISTER_IOWQ_AFF:
10299         case IORING_REGISTER_IOWQ_MAX_WORKERS:
10300                 return false;
10301         default:
10302                 return true;
10303         }
10304 }
10305
10306 static int io_ctx_quiesce(struct io_ring_ctx *ctx)
10307 {
10308         long ret;
10309
10310         percpu_ref_kill(&ctx->refs);
10311
10312         /*
10313          * Drop uring mutex before waiting for references to exit. If another
10314          * thread is currently inside io_uring_enter() it might need to grab the
10315          * uring_lock to make progress. If we hold it here across the drain
10316          * wait, then we can deadlock. It's safe to drop the mutex here, since
10317          * no new references will come in after we've killed the percpu ref.
10318          */
10319         mutex_unlock(&ctx->uring_lock);
10320         do {
10321                 ret = wait_for_completion_interruptible(&ctx->ref_comp);
10322                 if (!ret)
10323                         break;
10324                 ret = io_run_task_work_sig();
10325         } while (ret >= 0);
10326         mutex_lock(&ctx->uring_lock);
10327
10328         if (ret)
10329                 io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10330         return ret;
10331 }
10332
10333 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10334                                void __user *arg, unsigned nr_args)
10335         __releases(ctx->uring_lock)
10336         __acquires(ctx->uring_lock)
10337 {
10338         int ret;
10339
10340         /*
10341          * We're inside the ring mutex, if the ref is already dying, then
10342          * someone else killed the ctx or is already going through
10343          * io_uring_register().
10344          */
10345         if (percpu_ref_is_dying(&ctx->refs))
10346                 return -ENXIO;
10347
10348         if (ctx->restricted) {
10349                 if (opcode >= IORING_REGISTER_LAST)
10350                         return -EINVAL;
10351                 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10352                 if (!test_bit(opcode, ctx->restrictions.register_op))
10353                         return -EACCES;
10354         }
10355
10356         if (io_register_op_must_quiesce(opcode)) {
10357                 ret = io_ctx_quiesce(ctx);
10358                 if (ret)
10359                         return ret;
10360         }
10361
10362         switch (opcode) {
10363         case IORING_REGISTER_BUFFERS:
10364                 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
10365                 break;
10366         case IORING_UNREGISTER_BUFFERS:
10367                 ret = -EINVAL;
10368                 if (arg || nr_args)
10369                         break;
10370                 ret = io_sqe_buffers_unregister(ctx);
10371                 break;
10372         case IORING_REGISTER_FILES:
10373                 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
10374                 break;
10375         case IORING_UNREGISTER_FILES:
10376                 ret = -EINVAL;
10377                 if (arg || nr_args)
10378                         break;
10379                 ret = io_sqe_files_unregister(ctx);
10380                 break;
10381         case IORING_REGISTER_FILES_UPDATE:
10382                 ret = io_register_files_update(ctx, arg, nr_args);
10383                 break;
10384         case IORING_REGISTER_EVENTFD:
10385         case IORING_REGISTER_EVENTFD_ASYNC:
10386                 ret = -EINVAL;
10387                 if (nr_args != 1)
10388                         break;
10389                 ret = io_eventfd_register(ctx, arg);
10390                 if (ret)
10391                         break;
10392                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10393                         ctx->eventfd_async = 1;
10394                 else
10395                         ctx->eventfd_async = 0;
10396                 break;
10397         case IORING_UNREGISTER_EVENTFD:
10398                 ret = -EINVAL;
10399                 if (arg || nr_args)
10400                         break;
10401                 ret = io_eventfd_unregister(ctx);
10402                 break;
10403         case IORING_REGISTER_PROBE:
10404                 ret = -EINVAL;
10405                 if (!arg || nr_args > 256)
10406                         break;
10407                 ret = io_probe(ctx, arg, nr_args);
10408                 break;
10409         case IORING_REGISTER_PERSONALITY:
10410                 ret = -EINVAL;
10411                 if (arg || nr_args)
10412                         break;
10413                 ret = io_register_personality(ctx);
10414                 break;
10415         case IORING_UNREGISTER_PERSONALITY:
10416                 ret = -EINVAL;
10417                 if (arg)
10418                         break;
10419                 ret = io_unregister_personality(ctx, nr_args);
10420                 break;
10421         case IORING_REGISTER_ENABLE_RINGS:
10422                 ret = -EINVAL;
10423                 if (arg || nr_args)
10424                         break;
10425                 ret = io_register_enable_rings(ctx);
10426                 break;
10427         case IORING_REGISTER_RESTRICTIONS:
10428                 ret = io_register_restrictions(ctx, arg, nr_args);
10429                 break;
10430         case IORING_REGISTER_FILES2:
10431                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
10432                 break;
10433         case IORING_REGISTER_FILES_UPDATE2:
10434                 ret = io_register_rsrc_update(ctx, arg, nr_args,
10435                                               IORING_RSRC_FILE);
10436                 break;
10437         case IORING_REGISTER_BUFFERS2:
10438                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
10439                 break;
10440         case IORING_REGISTER_BUFFERS_UPDATE:
10441                 ret = io_register_rsrc_update(ctx, arg, nr_args,
10442                                               IORING_RSRC_BUFFER);
10443                 break;
10444         case IORING_REGISTER_IOWQ_AFF:
10445                 ret = -EINVAL;
10446                 if (!arg || !nr_args)
10447                         break;
10448                 ret = io_register_iowq_aff(ctx, arg, nr_args);
10449                 break;
10450         case IORING_UNREGISTER_IOWQ_AFF:
10451                 ret = -EINVAL;
10452                 if (arg || nr_args)
10453                         break;
10454                 ret = io_unregister_iowq_aff(ctx);
10455                 break;
10456         case IORING_REGISTER_IOWQ_MAX_WORKERS:
10457                 ret = -EINVAL;
10458                 if (!arg || nr_args != 2)
10459                         break;
10460                 ret = io_register_iowq_max_workers(ctx, arg);
10461                 break;
10462         default:
10463                 ret = -EINVAL;
10464                 break;
10465         }
10466
10467         if (io_register_op_must_quiesce(opcode)) {
10468                 /* bring the ctx back to life */
10469                 percpu_ref_reinit(&ctx->refs);
10470                 reinit_completion(&ctx->ref_comp);
10471         }
10472         return ret;
10473 }
10474
10475 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
10476                 void __user *, arg, unsigned int, nr_args)
10477 {
10478         struct io_ring_ctx *ctx;
10479         long ret = -EBADF;
10480         struct fd f;
10481
10482         f = fdget(fd);
10483         if (!f.file)
10484                 return -EBADF;
10485
10486         ret = -EOPNOTSUPP;
10487         if (f.file->f_op != &io_uring_fops)
10488                 goto out_fput;
10489
10490         ctx = f.file->private_data;
10491
10492         io_run_task_work();
10493
10494         mutex_lock(&ctx->uring_lock);
10495         ret = __io_uring_register(ctx, opcode, arg, nr_args);
10496         mutex_unlock(&ctx->uring_lock);
10497         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
10498                                                         ctx->cq_ev_fd != NULL, ret);
10499 out_fput:
10500         fdput(f);
10501         return ret;
10502 }
10503
10504 static int __init io_uring_init(void)
10505 {
10506 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
10507         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
10508         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
10509 } while (0)
10510
10511 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
10512         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
10513         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
10514         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
10515         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
10516         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
10517         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
10518         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
10519         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
10520         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
10521         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
10522         BUILD_BUG_SQE_ELEM(24, __u32,  len);
10523         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
10524         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
10525         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
10526         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
10527         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
10528         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
10529         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
10530         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
10531         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
10532         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
10533         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
10534         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
10535         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
10536         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
10537         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
10538         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
10539         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
10540         BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
10541         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
10542         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
10543         BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
10544
10545         BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
10546                      sizeof(struct io_uring_rsrc_update));
10547         BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
10548                      sizeof(struct io_uring_rsrc_update2));
10549
10550         /* ->buf_index is u16 */
10551         BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
10552
10553         /* should fit into one byte */
10554         BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
10555
10556         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
10557         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
10558
10559         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
10560                                 SLAB_ACCOUNT);
10561         return 0;
10562 };
10563 __initcall(io_uring_init);