fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqe (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store to or load from
  35  * data shared between the kernel and application. This is done both
  36  * for ordering purposes and to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/blk-mq.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <net/busy_poll.h>
  67 #include <linux/anon_inodes.h>
  68 #include <linux/sched/mm.h>
  69 #include <linux/uaccess.h>
  70 #include <linux/nospec.h>
  71 #include <linux/sizes.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/highmem.h>
  74 #include <linux/namei.h>
  75 #include <linux/fsnotify.h>
  76 #include <linux/fadvise.h>
  77 #include <linux/eventpoll.h>
  78 #include <linux/splice.h>
  79 #include <linux/task_work.h>
  80 #include <linux/pagemap.h>
  81 #include <linux/io_uring.h>
  82 #include <linux/audit.h>
  83 #include <linux/security.h>
  84
  85 #define CREATE_TRACE_POINTS
  86 #include <trace/events/io_uring.h>
  87
  88 #include <uapi/linux/io_uring.h>
  89
  90 #include "internal.h"
  91 #include "io-wq.h"
  92
  93 #define IORING_MAX_ENTRIES      32768
     /* Defined as exactly twice IORING_MAX_ENTRIES, i.e. 65536. */
  94 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  95 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
  96
  97 /* only define max */
  98 #define IORING_MAX_FIXED_FILES  (1U << 15)
     /* Sum of the three *_LAST enumerators: restriction, register op, SQE opcode. */
  99 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
 100                                  IORING_REGISTER_LAST + IORING_OP_LAST)
 101
     /*
      * Tag table geometry: MAX is 2^(PAGE_SHIFT - 3) entries, i.e. PAGE_SIZE / 8,
      * and MASK (MAX - 1) extracts an index within one table.
      * NOTE(review): with 8-byte tags (struct io_rsrc_data keeps 'u64 **tags')
      * one table would fill exactly one page - confirm against the allocator.
      */
 102 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
 103 #define IO_RSRC_TAG_TABLE_MAX   (1U << IO_RSRC_TAG_TABLE_SHIFT)
 104 #define IO_RSRC_TAG_TABLE_MASK  (IO_RSRC_TAG_TABLE_MAX - 1)
 105
 106 #define IORING_MAX_REG_BUFFERS  (1U << 14)
 107
     /* Subset of IOSQE_* flags; SQE_VALID_FLAGS below is built on top of it. */
 108 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
 109                           IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 110
     /* SQE_COMMON_FLAGS plus the buffer-select, drain and CQE-skip flags. */
 111 #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
 112                         IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
 113
     /*
      * Mask of six REQ_F_* bits (see the REQ_F_* enum later in this file).
      * NOTE(review): going by the name and the per-flag comments there, these
      * mark state that needs teardown when a request is cleaned - confirm at
      * the use site.
      */
 114 #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 115                                 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
 116                                 REQ_F_ASYNC_DATA)
 117
     /* 1024. NOTE(review): presumably the refill size for io_uring_task.cached_refs - confirm. */
 118 #define IO_TCTX_REFS_CACHE_NR   (1U << 10)
 119
 120 struct io_uring {
             /*
              * Head/tail index pair of one ring; struct io_rings embeds one for
              * the SQ and one for the CQ. Both members carry
              * ____cacheline_aligned_in_smp.
              */
 121         u32 head ____cacheline_aligned_in_smp;
 122         u32 tail ____cacheline_aligned_in_smp;
 123 };
 124
 125 /*
 126  * This data is shared with the application through the mmap at offsets
 127  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 128  *
 129  * The offsets to the member fields are published through struct
 130  * io_sqring_offsets when calling io_uring_setup.
      * NOTE(review): the CQ-side members are presumably published through a
      * matching CQ offsets struct - confirm in io_uring_setup.
      *
      * Members, in order: the SQ and CQ head/tail pairs, the constant index
      * masks and ring sizes, the counters and flag words, and finally the CQE
      * array as a flexible array member. No SQE storage is part of this struct.
 131  */
 132 struct io_rings {
 133         /*
 134          * Head and tail offsets into the ring; the offsets need to be
 135          * masked to get valid indices.
 136          *
 137          * The kernel controls head of the sq ring and the tail of the cq ring,
 138          * and the application controls tail of the sq ring and the head of the
 139          * cq ring.
 140          */
 141         struct io_uring         sq, cq;
 142         /*
 143          * Bitmasks to apply to head and tail offsets (constant, equals
 144          * ring_entries - 1)
 145          */
 146         u32                     sq_ring_mask, cq_ring_mask;
 147         /* Ring sizes (constant, power of 2) */
 148         u32                     sq_ring_entries, cq_ring_entries;
 149         /*
 150          * Number of invalid entries dropped by the kernel due to
 151          * invalid index stored in array
 152          *
 153          * Written by the kernel, shouldn't be modified by the
 154          * application (i.e. get number of "new events" by comparing to
 155          * cached value).
 156          *
 157          * After a new SQ head value was read by the application this
 158          * counter includes all submissions that were dropped reaching
 159          * the new SQ head (and possibly more).
 160          */
 161         u32                     sq_dropped;
 162         /*
 163          * Runtime SQ flags
 164          *
 165          * Written by the kernel, shouldn't be modified by the
 166          * application.
 167          *
 168          * The application needs a full memory barrier before checking
 169          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 170          */
 171         u32                     sq_flags;
 172         /*
 173          * Runtime CQ flags
 174          *
 175          * Written by the application, shouldn't be modified by the
 176          * kernel.
 177          */
 178         u32                     cq_flags;
 179         /*
 180          * Number of completion events lost because the queue was full;
 181          * this should be avoided by the application by making sure
 182          * there are not more requests pending than there is space in
 183          * the completion queue.
 184          *
 185          * Written by the kernel, shouldn't be modified by the
 186          * application (i.e. get number of "new events" by comparing to
 187          * cached value).
 188          *
 189          * As completion events come in out of order this counter is not
 190          * ordered with any other data.
 191          */
 192         u32                     cq_overflow;
 193         /*
 194          * Ring buffer of completion events.
 195          *
 196          * The kernel writes completion events fresh every time they are
 197          * produced, so the application is allowed to modify pending
 198          * entries.
              *
              * Flexible array member: it must stay last, and its length is not
              * part of sizeof(struct io_rings).
 199          */
 200         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 201 };
 202
 203 enum io_uring_cmd_flags {
             /*
              * Three distinct single-bit values (bit 0, bit 1 and the sign bit of
              * an int), so they can be combined in one int-sized flags word.
              */
 204         IO_URING_F_COMPLETE_DEFER       = 1,
 205         IO_URING_F_UNLOCKED             = 2,
 206         /* int's last bit, sign checks are usually faster than a bit test */
 207         IO_URING_F_NONBLOCK             = INT_MIN,
 208 };
 209
 210 struct io_mapped_ubuf {
             /*
              * Descriptor of one mapped user buffer, ending in a flexible array of
              * bio_vec entries.
              * NOTE(review): by the names, ubuf/ubuf_end delimit the user address
              * range (confirm whether the end is exclusive), nr_bvecs counts the
              * entries in bvec[], and acct_pages is the page count charged for
              * accounting - verify against the registration code.
              */
 211         u64             ubuf;
 212         u64             ubuf_end;
 213         unsigned int    nr_bvecs;
 214         unsigned long   acct_pages;
 215         struct bio_vec  bvec[];
 216 };
 217
 218 struct io_ring_ctx;     /* forward declaration; the full definition follows later in this file */
 219
 220 struct io_overflow_cqe {
             /*
              * A CQE held by value together with a list link.
              * NOTE(review): presumably queued on io_ring_ctx.cq_overflow_list
              * while the CQ ring has no room - confirm.
              */
 221         struct io_uring_cqe cqe;
 222         struct list_head list;
 223 };
 224
 225 struct io_fixed_file {
             /*
              * One fixed-file slot. The pointer is kept as an unsigned long rather
              * than a 'struct file *' because flag bits share the word with it
              * (see the comment below); the FFS_* flags are defined outside this
              * excerpt.
              */
 226         /* file * with additional FFS_* flags */
 227         unsigned long file_ptr;
 228 };
 229
 230 struct io_rsrc_put {
             /*
              * One resource plus its u64 tag and a list link; this is the argument
              * type of rsrc_put_fn.
              * NOTE(review): presumably linked on io_rsrc_node.rsrc_list until it
              * is put - confirm.
              */
 231         struct list_head list;
 232         u64 tag;
             /* Three views of the same pointer: untyped, file, or mapped buffer. */
 233         union {
 234                 void *rsrc;
 235                 struct file *file;
 236                 struct io_mapped_ubuf *buf;
 237         };
 238 };
 239
 240 struct io_file_table {
             /*
              * Pointer to an array of io_fixed_file slots. The element count is not
              * stored here. NOTE(review): presumably io_ring_ctx.nr_user_files -
              * confirm.
              */
 241         struct io_fixed_file *files;
 242 };
 243
 244 struct io_rsrc_node {
             /*
              * Reference-counted (percpu_ref) node that points back at its
              * io_rsrc_data and carries two list links, an llist link and a 'done'
              * flag.
              * NOTE(review): by the names, 'node' links into
              * io_ring_ctx.rsrc_ref_list, 'rsrc_list' heads the io_rsrc_put
              * entries and 'llist' links into io_ring_ctx.rsrc_put_llist -
              * confirm.
              */
 245         struct percpu_ref               refs;
 246         struct list_head                node;
 247         struct list_head                rsrc_list;
 248         struct io_rsrc_data             *rsrc_data;
 249         struct llist_node               llist;
 250         bool                            done;
 251 };
 252
 253 typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
     /* ^ a function type, not a pointer type: struct io_rsrc_data stores it as 'rsrc_put_fn *do_put'. */
 254
 255 struct io_rsrc_data {
             /*
              * Bookkeeping for one kind of registered resource: owning ctx, a
              * two-level tag array, an element count, the put callback, an atomic
              * refcount and a completion.
              * NOTE(review): 'tags' is presumably an array of tables sized by the
              * IO_RSRC_TAG_TABLE_* macros, and 'done' presumably completes when
              * 'refs' drops to zero - confirm.
              */
 256         struct io_ring_ctx              *ctx;
 257
 258         u64                             **tags;
 259         unsigned int                    nr;
 260         rsrc_put_fn                     *do_put;
 261         atomic_t                        refs;
 262         struct completion               done;
 263         bool                            quiesce;
 264 };
 265
 266 struct io_buffer_list {
             /*
              * A 16-bit buffer group id plus two list heads.
              * NOTE(review): presumably 'list' links this group into a
              * ctx->io_buffers bucket and 'buf_list' heads the group's io_buffer
              * entries - confirm.
              */
 267         struct list_head list;
 268         struct list_head buf_list;
 269         __u16 bgid;
 270 };
 271
 272 struct io_buffer {
             /*
              * One buffer: 64-bit address, 32-bit length, 16-bit buffer id and
              * 16-bit group id, plus a list link.
              * NOTE(review): addr is presumably a userspace address - confirm.
              */
 273         struct list_head list;
 274         __u64 addr;
 275         __u32 len;
 276         __u16 bid;
 277         __u16 bgid;
 278 };
 279
 280 struct io_restriction {
             /*
              * Two bitmaps, one bit per register opcode (IORING_REGISTER_LAST bits)
              * and one per SQE opcode (IORING_OP_LAST bits), plus two 8-bit SQE
              * flag masks and a 'registered' flag.
              * NOTE(review): a set bit presumably means "allowed" - confirm.
              */
 281         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 282         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 283         u8 sqe_flags_allowed;
 284         u8 sqe_flags_required;
 285         bool registered;
 286 };
 287
 288 enum {
             /*
              * Values 0 and 1.
              * NOTE(review): presumably bit numbers (not masks) for the unsigned
              * long io_sq_data.state - confirm.
              */
 289         IO_SQ_THREAD_SHOULD_STOP = 0,
 290         IO_SQ_THREAD_SHOULD_PARK,
 291 };
 292
 293 struct io_sq_data {
             /*
              * State of one SQ polling thread: refcounted, mutex-protected, with
              * the list of ring contexts that use it (see ctx_list below).
              * NOTE(review): which members 'lock' protects is not visible here.
              */
 294         refcount_t              refs;
 295         atomic_t                park_pending;
 296         struct mutex            lock;
 297
 298         /* ctx's that are using this sqd */
 299         struct list_head        ctx_list;
 300
 301         struct task_struct      *thread;
 302         struct wait_queue_head  wait;
 303
 304         unsigned                sq_thread_idle;
 305         int                     sq_cpu;
 306         pid_t                   task_pid;
 307         pid_t                   task_tgid;
 308
             /* NOTE(review): presumably holds the IO_SQ_THREAD_SHOULD_* bits - confirm. */
 309         unsigned long           state;
 310         struct completion       exited;
 311 };
 312
 313 #define IO_COMPL_BATCH                  32
 314 #define IO_REQ_CACHE_SIZE               32
 315 #define IO_REQ_ALLOC_BATCH              8
     /*
      * NOTE(review): by the names, these are the completion batch size, the
      * request cache size and the request allocation batch size; none is used
      * in this excerpt - confirm at the use sites.
      */
 316
 317 struct io_submit_link {
             /*
              * Head and last request of a chain of io_kiocb.
              * NOTE(review): presumably the link chain currently being built during
              * submission - confirm.
              */
 318         struct io_kiocb         *head;
 319         struct io_kiocb         *last;
 320 };
 321
 322 struct io_submit_state {
             /*
              * Submission-time state, embedded by value in struct io_ring_ctx as
              * 'submit_state': a free list, a completion batch list, the current
              * link and block-plug bookkeeping.
              */
 323         /* inline/task_work completion list, under ->uring_lock */
 324         struct io_wq_work_node  free_list;
 325         /* batch completion logic */
 326         struct io_wq_work_list  compl_reqs;
 327         struct io_submit_link   link;
 328
             /* NOTE(review): the three flags and submit_nr presumably steer use of 'plug' below - confirm. */
 329         bool                    plug_started;
 330         bool                    need_plug;
 331         bool                    flush_cqes;
 332         unsigned short          submit_nr;
 333         struct blk_plug         plug;
 334 };
 335
 336 struct io_ev_fd {
             /*
              * Eventfd context pointer with a 1-bit 'eventfd_async' flag and an
              * embedded rcu_head. struct io_ring_ctx refers to this type through
              * an __rcu-annotated pointer (io_ev_fd).
              */
 337         struct eventfd_ctx      *cq_ev_fd;
 338         unsigned int            eventfd_async: 1;
 339         struct rcu_head         rcu;
 340 };
 341
 342 #define IO_BUFFERS_HASH_BITS    5       /* NOTE(review): presumably 2^5 = 32 buckets for ctx->io_buffers - confirm */
 343
 344 struct io_ring_ctx {
             /*
              * Context of one ring instance. Members are grouped into anonymous
              * structs, several of them ____cacheline_aligned_in_smp; the comment
              * heading each group states its role. Member order is part of that
              * grouping, so keep new members inside the matching group.
              */
 345         /* const or read-mostly hot data */
 346         struct {
 347                 struct percpu_ref       refs;
 348
 349                 struct io_rings         *rings;
 350                 unsigned int            flags;
 351                 unsigned int            compat: 1;
 352                 unsigned int            drain_next: 1;
 353                 unsigned int            restricted: 1;
 354                 unsigned int            off_timeout_used: 1;
 355                 unsigned int            drain_active: 1;
 356                 unsigned int            drain_disabled: 1;
 357                 unsigned int            has_evfd: 1;
 358         } ____cacheline_aligned_in_smp;
 359
 360         /* submission data */
 361         struct {
 362                 struct mutex            uring_lock;
 363
 364                 /*
 365                  * Ring buffer of indices into array of io_uring_sqe, which is
 366                  * mmapped by the application using the IORING_OFF_SQES offset.
 367                  *
 368                  * This indirection could e.g. be used to assign fixed
 369                  * io_uring_sqe entries to operations and only submit them to
 370                  * the queue when needed.
 371                  *
 372                  * The kernel modifies neither the indices array nor the entries
 373                  * array.
 374                  */
 375                 u32                     *sq_array;
 376                 struct io_uring_sqe     *sq_sqes;
 377                 unsigned                cached_sq_head;
 378                 unsigned                sq_entries;
 379                 struct list_head        defer_list;
 380
 381                 /*
 382                  * Fixed resources fast path, should be accessed only under
 383                  * uring_lock, and updated through io_uring_register(2)
 384                  */
 385                 struct io_rsrc_node     *rsrc_node;
 386                 int                     rsrc_cached_refs;
 387                 struct io_file_table    file_table;
 388                 unsigned                nr_user_files;
 389                 unsigned                nr_user_bufs;
 390                 struct io_mapped_ubuf   **user_bufs;
 391
 392                 struct io_submit_state  submit_state;
 393                 struct list_head        timeout_list;
 394                 struct list_head        ltimeout_list;
 395                 struct list_head        cq_overflow_list;
 396                 struct list_head        *io_buffers;
 397                 struct list_head        io_buffers_cache;
 398                 struct list_head        apoll_cache;
 399                 struct xarray           personalities;
 400                 u32                     pers_next;
 401                 unsigned                sq_thread_idle;
 402         } ____cacheline_aligned_in_smp;
 403
 404         /* IRQ completion list, under ->completion_lock */
 405         struct io_wq_work_list  locked_free_list;
 406         unsigned int            locked_free_nr;
 407
 408         const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
 409         struct io_sq_data       *sq_data;       /* if using sq thread polling */
 410
 411         struct wait_queue_head  sqo_sq_wait;
 412         struct list_head        sqd_list;
 413
 414         unsigned long           check_cq_overflow;
 415 #ifdef CONFIG_NET_RX_BUSY_POLL
 416         /* used to track busy poll napi_id */
 417         struct list_head        napi_list;
 418         spinlock_t              napi_lock;      /* napi_list lock */
 419 #endif
 420
             /* completion-side counters and waiters, on their own cacheline */
 421         struct {
 422                 unsigned                cached_cq_tail;
 423                 unsigned                cq_entries;
 424                 struct io_ev_fd __rcu   *io_ev_fd;
 425                 struct wait_queue_head  cq_wait;
 426                 unsigned                cq_extra;
 427                 atomic_t                cq_timeouts;
 428                 unsigned                cq_last_tm_flush;
 429         } ____cacheline_aligned_in_smp;
 430
 431         struct {
 432                 spinlock_t              completion_lock;
 433
 434                 spinlock_t              timeout_lock;
 435
 436                 /*
 437                  * ->iopoll_list is protected by the ctx->uring_lock for
 438                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 439                  * For SQPOLL, only the single threaded io_sq_thread() will
 440                  * manipulate the list, hence no extra locking is needed there.
 441                  */
 442                 struct io_wq_work_list  iopoll_list;
 443                 struct hlist_head       *cancel_hash;
 444                 unsigned                cancel_hash_bits;
 445                 bool                    poll_multi_queue;
 446
 447                 struct list_head        io_buffers_comp;
 448         } ____cacheline_aligned_in_smp;
 449
 450         struct io_restriction           restrictions;
 451
 452         /* slow path rsrc auxiliary data, used by update/register */
 453         struct {
 454                 struct io_rsrc_node             *rsrc_backup_node;
 455                 struct io_mapped_ubuf           *dummy_ubuf;
 456                 struct io_rsrc_data             *file_data;
 457                 struct io_rsrc_data             *buf_data;
 458
 459                 struct delayed_work             rsrc_put_work;
 460                 struct llist_head               rsrc_put_llist;
 461                 struct list_head                rsrc_ref_list;
 462                 spinlock_t                      rsrc_ref_lock;
 463
 464                 struct list_head        io_buffers_pages;
 465         };
 466
 467         /* Keep this last, we don't need it for the fast path */
 468         struct {
 469                 #if defined(CONFIG_UNIX)
 470                         struct socket           *ring_sock;
 471                 #endif
 472                 /* hashed buffered write serialization */
 473                 struct io_wq_hash               *hash_map;
 474
 475                 /* Only used for accounting purposes */
 476                 struct user_struct              *user;
 477                 struct mm_struct                *mm_account;
 478
 479                 /* ctx exit and cancelation */
 480                 struct llist_head               fallback_llist;
 481                 struct delayed_work             fallback_work;
 482                 struct work_struct              exit_work;
 483                 struct list_head                tctx_list;
 484                 struct completion               ref_comp;
 485                 u32                             iowq_limits[2];
 486                 bool                            iowq_limits_set;
 487         };
 488 };
 489
 490 /*
 491  * Arbitrary limit, can be raised if need be
      * NOTE(review): presumably the number of slots in
      * io_uring_task.registered_rings - confirm.
 492  */
 493 #define IO_RINGFD_REG_MAX 16
 494
 495 struct io_uring_task {
             /*
              * io_uring state attached to a task, going by the name: reference and
              * inflight accounting, an io_wq pointer, and task_work lists guarded
              * by a spinlock.
              * NOTE(review): what 'xa' indexes and which members 'task_lock'
              * protects are not visible here.
              */
 496         /* submission side */
 497         int                     cached_refs;
 498         struct xarray           xa;
 499         struct wait_queue_head  wait;
 500         const struct io_ring_ctx *last;
 501         struct io_wq            *io_wq;
 502         struct percpu_counter   inflight;
 503         atomic_t                inflight_tracked;
 504         atomic_t                in_idle;
 505
 506         spinlock_t              task_lock;
 507         struct io_wq_work_list  task_list;
 508         struct io_wq_work_list  prior_task_list;
 509         struct callback_head    task_work;
 510         struct file             **registered_rings;
 511         bool                    task_running;
 512 };
 513
 514 /*
 515  * First field must be the file pointer in all the
 516  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
      *
      * The structs below that begin with a 'struct file *' do so for this
      * reason: directly, via a leading kiocb in io_rw, or as file_out in
      * io_splice.
 517  */
 518 struct io_poll_iocb {
 519         struct file                     *file;
             /* NOTE(review): presumably the wait queue head that 'wait' is queued on - confirm. */
 520         struct wait_queue_head          *head;
 521         __poll_t                        events;
 522         struct wait_queue_entry         wait;
 523 };
 524
 525 struct io_poll_update {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb).
              * NOTE(review): by the names, old_user_data identifies the target poll
              * request and the two bools select whether 'events' and/or
              * 'new_user_data' are applied - confirm.
              */
 526         struct file                     *file;
 527         u64                             old_user_data;
 528         u64                             new_user_data;
 529         __poll_t                        events;
 530         bool                            update_events;
 531         bool                            update_user_data;
 532 };
 533
 534 struct io_close {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb).
              * NOTE(review): 'fd' and 'file_slot' presumably name the target as a
              * normal descriptor or as a fixed-file slot - confirm.
              */
 535         struct file                     *file;
 536         int                             fd;
 537         u32                             file_slot;
 538 };
 539
 540 struct io_timeout_data {
             /*
              * Embeds the hrtimer and its timespec and mode, with a back-pointer to
              * the owning request.
              * NOTE(review): this struct does not start with a file pointer, so it
              * is presumably not a member of the iocb union - confirm where it is
              * allocated.
              */
 541         struct io_kiocb                 *req;
 542         struct hrtimer                  timer;
 543         struct timespec64               ts;
 544         enum hrtimer_mode               mode;
 545         u32                             flags;
 546 };
 547
 548 struct io_accept {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). 'addr' and 'addr_len' are both __user pointers.
              * NOTE(review): 'nofile' is presumably a saved open-file limit -
              * confirm.
              */
 549         struct file                     *file;
 550         struct sockaddr __user          *addr;
 551         int __user                      *addr_len;
 552         int                             flags;
 553         u32                             file_slot;
 554         unsigned long                   nofile;
 555 };
 556
 557 struct io_sync {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). Holds a length/offset pair (loff_t) plus integer
              * flags and mode.
              * NOTE(review): which opcodes share this struct is not visible here.
              */
 558         struct file                     *file;
 559         loff_t                          len;
 560         loff_t                          off;
 561         int                             flags;
 562         int                             mode;
 563 };
 564
 565 struct io_cancel {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb).
              * NOTE(review): 'addr' is presumably the user_data of the request to
              * cancel, not a memory address - confirm.
              */
 566         struct file                     *file;
 567         u64                             addr;
 568 };
 569
 570 struct io_timeout {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb).
              * NOTE(review): 'list' presumably links into ctx->timeout_list or
              * ctx->ltimeout_list, and off/target_seq presumably encode a
              * completion-count target - confirm.
              */
 571         struct file                     *file;
 572         u32                             off;
 573         u32                             target_seq;
 574         struct list_head                list;
 575         /* head of the link, used by linked timeouts only */
 576         struct io_kiocb                 *head;
 577         /* for linked completions */
 578         struct io_kiocb                 *prev;
 579 };
 580
 581 struct io_timeout_rem {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb).
              * NOTE(review): 'addr' is presumably the user_data of the timeout to
              * act on, and 'ltimeout' presumably selects linked timeouts -
              * confirm.
              */
 582         struct file                     *file;
 583         u64                             addr;
 584
 585         /* timeout update */
 586         struct timespec64               ts;
 587         u32                             flags;
 588         bool                            ltimeout;
 589 };
 590
 591 struct io_rw {
             /*
              * The embedded kiocb must stay the first member: per the note below it
              * supplies the leading file pointer that the rule above struct
              * io_poll_iocb requires.
              */
 592         /* NOTE: kiocb has the file as the first member, so don't do it here */
 593         struct kiocb                    kiocb;
 594         u64                             addr;
 595         u64                             len;
 596 };
 597
 598 struct io_connect {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). 'addr' is a __user pointer; unlike struct io_accept,
              * addr_len is held by value.
              */
 599         struct file                     *file;
 600         struct sockaddr __user          *addr;
 601         int                             addr_len;
 602 };
 603
 604 struct io_sr_msg {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb).
              * NOTE(review): 'done_io' presumably accumulates bytes already
              * transferred, and 'bgid' is a plain int here although struct
              * io_buffer stores a __u16 - confirm.
              */
 605         struct file                     *file;
             /* Three __user views of one pointer: compat msghdr, native msghdr, or raw buffer. */
 606         union {
 607                 struct compat_msghdr __user     *umsg_compat;
 608                 struct user_msghdr __user       *umsg;
 609                 void __user                     *buf;
 610         };
 611         int                             msg_flags;
 612         int                             bgid;
 613         size_t                          len;
 614         size_t                          done_io;
 615 };
 616
 617 struct io_open {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). 'how' is embedded by value; 'filename' is a pointer.
              * NOTE(review): who frees 'filename' and what 'nofile' holds are not
              * visible here.
              */
 618         struct file                     *file;
 619         int                             dfd;
 620         u32                             file_slot;
 621         struct filename                 *filename;
 622         struct open_how                 how;
 623         unsigned long                   nofile;
 624 };
 625
 626 struct io_rsrc_update {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb).
              * NOTE(review): 'arg' is presumably a userspace address carried as a
              * u64, with nr_args entries applied starting at 'offset' - confirm.
              */
 627         struct file                     *file;
 628         u64                             arg;
 629         u32                             nr_args;
 630         u32                             offset;
 631 };
 632
 633 struct io_fadvise {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). Same layout as struct io_madvise, with a file offset
              * in place of an address. Note the 64-bit offset but 32-bit length.
              */
 634         struct file                     *file;
 635         u64                             offset;
 636         u32                             len;
 637         u32                             advice;
 638 };
 639
 640 struct io_madvise {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). Same layout as struct io_fadvise, with an address in
              * place of a file offset. Note the 64-bit address but 32-bit length.
              */
 641         struct file                     *file;
 642         u64                             addr;
 643         u32                             len;
 644         u32                             advice;
 645 };
 646
 647 struct io_epoll {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). Holds three ints (epfd, op, fd) and the epoll_event
              * by value, not as a user pointer.
              */
 648         struct file                     *file;
 649         int                             epfd;
 650         int                             op;
 651         int                             fd;
 652         struct epoll_event              event;
 653 };
 654
 655 struct io_splice {
             /*
              * file_out fills the leading file-pointer slot required by the rule
              * above struct io_poll_iocb; file_in follows it. Offsets are loff_t,
              * the length is 64-bit.
              */
 656         struct file                     *file_out;
 657         struct file                     *file_in;
 658         loff_t                          off_out;
 659         loff_t                          off_in;
 660         u64                             len;
 661         unsigned int                    flags;
 662 };
 663
 664 struct io_provide_buf {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb).
              * NOTE(review): bgid is 32 bits wide here but __u16 in struct io_buffer
              * and struct io_buffer_list - confirm the value is range-checked
              * before it is narrowed.
              */
 665         struct file                     *file;
 666         __u64                           addr;
 667         __u32                           len;
 668         __u32                           bgid;
 669         __u16                           nbufs;
 670         __u16                           bid;
 671 };
 672
 673 struct io_statx {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). 'buffer' is a __user pointer to the result struct;
              * 'filename' is a pointer to struct filename.
              */
 674         struct file                     *file;
 675         int                             dfd;
 676         unsigned int                    mask;
 677         unsigned int                    flags;
 678         struct filename                 *filename;
 679         struct statx __user             *buffer;
 680 };
 681
 682 struct io_shutdown {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb); the only other member is the int 'how'.
              */
 683         struct file                     *file;
 684         int                             how;
 685 };
 686
 687 struct io_rename {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). Same member set as struct io_hardlink: two directory
              * fds, two struct filename pointers and flags.
              */
 688         struct file                     *file;
 689         int                             old_dfd;
 690         int                             new_dfd;
 691         struct filename                 *oldpath;
 692         struct filename                 *newpath;
 693         int                             flags;
 694 };
 695
 696 struct io_unlink {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb), followed by a directory fd, flags and a struct
              * filename pointer.
              */
 697         struct file                     *file;
 698         int                             dfd;
 699         int                             flags;
 700         struct filename                 *filename;
 701 };
 702
 703 struct io_mkdir {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb), followed by a directory fd, a umode_t mode and a
              * struct filename pointer.
              */
 704         struct file                     *file;
 705         int                             dfd;
 706         umode_t                         mode;
 707         struct filename                 *filename;
 708 };
 709
 710 struct io_symlink {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). Only new_dfd is stored; there is no old_dfd member,
              * unlike struct io_hardlink.
              */
 711         struct file                     *file;
 712         int                             new_dfd;
 713         struct filename                 *oldpath;
 714         struct filename                 *newpath;
 715 };
 716
 717 struct io_hardlink {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb). Same member set as struct io_rename: two directory
              * fds, two struct filename pointers and flags.
              */
 718         struct file                     *file;
 719         int                             old_dfd;
 720         int                             new_dfd;
 721         struct filename                 *oldpath;
 722         struct filename                 *newpath;
 723         int                             flags;
 724 };
 725
 726 struct io_msg {
             /*
              * Starts with the file pointer (see the rule above struct
              * io_poll_iocb).
              * NOTE(review): user_data and len are presumably the values to post in
              * a CQE on the target ring - confirm.
              */
 727         struct file                     *file;
 728         u64 user_data;
 729         u32 len;
 730 };
 731
 732 struct io_async_connect {
             /*
              * Wraps a sockaddr_storage held by value.
              * NOTE(review): presumably a kernel copy of the user address from
              * struct io_connect, kept as async data - confirm.
              */
 733         struct sockaddr_storage         address;
 734 };
 735
 736 struct io_async_msghdr {
             /*
              * Message header state: an inline iovec array of UIO_FASTIOV entries,
              * an optional allocated iovec, the user address pointer, and the
              * msghdr and address storage held by value.
              */
 737         struct iovec                    fast_iov[UIO_FASTIOV];
 738         /* points to an allocated iov, if NULL we use fast_iov instead */
 739         struct iovec                    *free_iov;
 740         struct sockaddr __user          *uaddr;
 741         struct msghdr                   msg;
 742         struct sockaddr_storage         addr;
 743 };
 744
 745 struct io_rw_state {
             /*
              * An iov_iter, a saved iov_iter_state and an inline iovec array of
              * UIO_FASTIOV entries; embedded as 's' in struct io_async_rw.
              * NOTE(review): iter_state is presumably a snapshot used to restore
              * 'iter' on retry - confirm.
              */
 746         struct iov_iter                 iter;
 747         struct iov_iter_state           iter_state;
 748         struct iovec                    fast_iov[UIO_FASTIOV];
 749 };
 750
 751 struct io_async_rw {
             /*
              * Read/write async state: the embedded io_rw_state, a const iovec
              * pointer, a byte counter and a wait_page_queue.
              * NOTE(review): free_iovec is presumably an allocated iovec to free,
              * or NULL when s.fast_iov is in use - confirm.
              */
 752         struct io_rw_state              s;
 753         const struct iovec              *free_iovec;
 754         size_t                          bytes_done;
 755         struct wait_page_queue          wpq;
 756 };
 757
 758 enum {
             /*
              * Bit numbers (not masks) for request flags. The first seven reuse the
              * userspace IOSQE_*_BIT positions; kernel-internal bits start at 8.
              * The mask values are derived with BIT() in the enum that follows.
              */
 759         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 760         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 761         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 762         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 763         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 764         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 765         REQ_F_CQE_SKIP_BIT      = IOSQE_CQE_SKIP_SUCCESS_BIT,
 766
 767         /* first byte is taken by user flags, shift it to not overlap */
 768         REQ_F_FAIL_BIT          = 8,
 769         REQ_F_INFLIGHT_BIT,
 770         REQ_F_CUR_POS_BIT,
 771         REQ_F_NOWAIT_BIT,
 772         REQ_F_LINK_TIMEOUT_BIT,
 773         REQ_F_NEED_CLEANUP_BIT,
 774         REQ_F_POLLED_BIT,
 775         REQ_F_BUFFER_SELECTED_BIT,
 776         REQ_F_COMPLETE_INLINE_BIT,
 777         REQ_F_REISSUE_BIT,
 778         REQ_F_CREDS_BIT,
 779         REQ_F_REFCOUNT_BIT,
 780         REQ_F_ARM_LTIMEOUT_BIT,
 781         REQ_F_ASYNC_DATA_BIT,
 782         REQ_F_SKIP_LINK_CQES_BIT,
 783         REQ_F_SINGLE_POLL_BIT,
 784         REQ_F_DOUBLE_POLL_BIT,
 785         REQ_F_PARTIAL_IO_BIT,
 786         /* keep async read/write and isreg together and in order */
 787         REQ_F_SUPPORT_NOWAIT_BIT,
 788         REQ_F_ISREG_BIT,
 789
 790         /* not a real bit, just to check we're not overflowing the space */
             /* with the enumerators above this evaluates to 28 */
 791         __REQ_F_LAST_BIT,
 792 };
 793
 794 enum {
             /*
              * Mask form of each REQ_F_*_BIT, built with BIT(). The order below
              * differs from the bit-number order (SUPPORT_NOWAIT and ISREG appear
              * mid-list); this is harmless because each value comes from its own
              * *_BIT constant.
              */
 795         /* ctx owns file */
 796         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 797         /* drain existing IO first */
 798         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 799         /* linked sqes */
 800         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 801         /* doesn't sever on completion < 0 */
 802         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 803         /* IOSQE_ASYNC */
 804         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 805         /* IOSQE_BUFFER_SELECT */
 806         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 807         /* IOSQE_CQE_SKIP_SUCCESS */
 808         REQ_F_CQE_SKIP          = BIT(REQ_F_CQE_SKIP_BIT),
 809
 810         /* fail rest of links */
 811         REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
 812         /* on inflight list, should be cancelled and waited on exit reliably */
 813         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 814         /* read/write uses file position */
 815         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 816         /* must not punt to workers */
 817         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 818         /* has or had linked timeout */
 819         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 820         /* needs cleanup */
 821         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 822         /* already went through poll handler */
 823         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 824         /* buffer already selected */
 825         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 826         /* completion is deferred through io_comp_state */
 827         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 828         /* caller should reissue async */
 829         REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
 830         /* supports async reads/writes */
 831         REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
 832         /* regular file */
 833         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 834         /* has creds assigned */
 835         REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
 836         /* skip refcounting if not set */
 837         REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
 838         /* there is a linked timeout that has to be armed */
 839         REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
 840         /* ->async_data allocated */
 841         REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
 842         /* don't post CQEs while failing linked requests */
 843         REQ_F_SKIP_LINK_CQES    = BIT(REQ_F_SKIP_LINK_CQES_BIT),
 844         /* single poll may be active */
 845         REQ_F_SINGLE_POLL       = BIT(REQ_F_SINGLE_POLL_BIT),
 846         /* double poll may be active */
 847         REQ_F_DOUBLE_POLL       = BIT(REQ_F_DOUBLE_POLL_BIT),
 848         /* request has already done partial IO */
 849         REQ_F_PARTIAL_IO        = BIT(REQ_F_PARTIAL_IO_BIT),
 850 };
 851
/*
 * Poll state used for internal polling; struct io_kiocb refers to one of
 * these through its ->apoll pointer.
 */
struct async_poll {
        /* primary poll entry, embedded */
        struct io_poll_iocb     poll;
        /* second poll entry, held by pointer rather than embedded */
        struct io_poll_iocb     *double_poll;
};
 856
/*
 * Task-work callback for a request. @locked tells the callback whether
 * ctx->uring_lock is currently held; a callback that takes the lock (see
 * io_tw_lock()) sets it to true so the caller unlocks afterwards.
 */
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);

/* Per-request task-work linkage plus the callback to run for it. */
struct io_task_work {
        union {
                struct io_wq_work_node  node;
                /* linkage used on ctx->fallback_llist (see io_fallback_req_func()) */
                struct llist_node       fallback_node;
        };
        io_req_tw_func_t                func;
};
 866
/*
 * Registered resource kinds.
 * NOTE(review): presumably the values accepted as the 'type' argument of
 * __io_register_rsrc_update() -- confirm against its definition.
 */
enum {
        IORING_RSRC_FILE                = 0,
        IORING_RSRC_BUFFER              = 1,
};
 871
/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
        /* per-opcode command state; exactly one member is meaningful per request */
        union {
                struct file             *file;
                struct io_rw            rw;
                struct io_poll_iocb     poll;
                struct io_poll_update   poll_update;
                struct io_accept        accept;
                struct io_sync          sync;
                struct io_cancel        cancel;
                struct io_timeout       timeout;
                struct io_timeout_rem   timeout_rem;
                struct io_connect       connect;
                struct io_sr_msg        sr_msg;
                struct io_open          open;
                struct io_close         close;
                struct io_rsrc_update   rsrc_update;
                struct io_fadvise       fadvise;
                struct io_madvise       madvise;
                struct io_epoll         epoll;
                struct io_splice        splice;
                struct io_provide_buf   pbuf;
                struct io_statx         statx;
                struct io_shutdown      shutdown;
                struct io_rename        rename;
                struct io_unlink        unlink;
                struct io_mkdir         mkdir;
                struct io_symlink       symlink;
                struct io_hardlink      hardlink;
                struct io_msg           msg;
        };

        /* IORING_OP_* opcode; used as the index into io_op_defs[] */
        u8                              opcode;
        /* polled IO has completed */
        u8                              iopoll_completed;
        u16                             buf_index;
        /* REQ_F_* flags */
        unsigned int                    flags;

        u64                             user_data;
        u32                             result;
        u32                             cflags;

        /* ring this request was issued on */
        struct io_ring_ctx              *ctx;
        /* task the request is matched against when cancelling (io_match_task()) */
        struct task_struct              *task;

        /* rsrc node reference pinned by this request, NULL if none */
        struct percpu_ref               *fixed_rsrc_refs;
        /* store used ubuf, so we can prevent reloading */
        struct io_mapped_ubuf           *imu;

        /* used by request caches, completion batching and iopoll */
        struct io_wq_work_node          comp_list;
        /* reference count, only maintained when REQ_F_REFCOUNT is set */
        atomic_t                        refs;
        atomic_t                        poll_refs;
        struct io_task_work             io_task_work;
        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
        struct hlist_node               hash_node;
        /* internal polling, see IORING_FEAT_FAST_POLL */
        struct async_poll               *apoll;
        /* opcode allocated if it needs to store data for async defer */
        void                            *async_data;
        /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
        struct io_buffer                *kbuf;
        /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
        struct io_kiocb                 *link;
        /* custom credentials, valid IFF REQ_F_CREDS is set */
        const struct cred               *creds;
        struct io_wq_work               work;
};
 945
/*
 * Associates a task with a ring context.
 * NOTE(review): ctx_node is presumably the linkage on ctx->tctx_list
 * (initialised in io_ring_ctx_alloc()) -- confirm against the users.
 */
struct io_tctx_node {
        struct list_head        ctx_node;
        struct task_struct      *task;
        struct io_ring_ctx      *ctx;
};
 951
/*
 * One deferred request together with a sequence number.
 * NOTE(review): presumably queued on ctx->defer_list, with @seq being the
 * value later passed to req_need_defer() -- confirm against the users.
 */
struct io_defer_entry {
        struct list_head        list;
        struct io_kiocb         *req;
        u32                     seq;
};
 957
/*
 * Static per-opcode properties. io_op_defs[] holds one of these for each
 * IORING_OP_* opcode.
 */
struct io_op_def {
        /* needs req->file assigned */
        unsigned                needs_file : 1;
        /* should block plug */
        unsigned                plug : 1;
        /* hash wq insertion if file is a regular file */
        unsigned                hash_reg_file : 1;
        /* unbound wq insertion if file is a non-regular file */
        unsigned                unbound_nonreg_file : 1;
        /* set if opcode supports polled "wait" */
        unsigned                pollin : 1;
        unsigned                pollout : 1;
        unsigned                poll_exclusive : 1;
        /* op supports buffer selection */
        unsigned                buffer_select : 1;
        /* do prep async if is going to be punted */
        unsigned                needs_async_setup : 1;
        /* opcode is not supported by this kernel */
        unsigned                not_supported : 1;
        /* skip auditing */
        unsigned                audit_skip : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
};
 982
/*
 * Opcode property table, indexed by IORING_OP_* (looked up with
 * req->opcode). Fields not named in an entry are zero.
 */
static const struct io_op_def io_op_defs[] = {
        [IORING_OP_NOP] = {},
        [IORING_OP_READV] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .buffer_select          = 1,
                .needs_async_setup      = 1,
                .plug                   = 1,
                .audit_skip             = 1,
                .async_size             = sizeof(struct io_async_rw),
        },
        [IORING_OP_WRITEV] = {
                .needs_file             = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
                .needs_async_setup      = 1,
                .plug                   = 1,
                .audit_skip             = 1,
                .async_size             = sizeof(struct io_async_rw),
        },
        [IORING_OP_FSYNC] = {
                .needs_file             = 1,
                .audit_skip             = 1,
        },
        [IORING_OP_READ_FIXED] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .plug                   = 1,
                .audit_skip             = 1,
                .async_size             = sizeof(struct io_async_rw),
        },
        [IORING_OP_WRITE_FIXED] = {
                .needs_file             = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
                .plug                   = 1,
                .audit_skip             = 1,
                .async_size             = sizeof(struct io_async_rw),
        },
        [IORING_OP_POLL_ADD] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .audit_skip             = 1,
        },
        [IORING_OP_POLL_REMOVE] = {
                .audit_skip             = 1,
        },
        [IORING_OP_SYNC_FILE_RANGE] = {
                .needs_file             = 1,
                .audit_skip             = 1,
        },
        [IORING_OP_SENDMSG] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
                .needs_async_setup      = 1,
                .async_size             = sizeof(struct io_async_msghdr),
        },
        [IORING_OP_RECVMSG] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .buffer_select          = 1,
                .needs_async_setup      = 1,
                .async_size             = sizeof(struct io_async_msghdr),
        },
        [IORING_OP_TIMEOUT] = {
                .audit_skip             = 1,
                .async_size             = sizeof(struct io_timeout_data),
        },
        [IORING_OP_TIMEOUT_REMOVE] = {
                /* used by timeout updates' prep() */
                .audit_skip             = 1,
        },
        [IORING_OP_ACCEPT] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .poll_exclusive         = 1,
        },
        [IORING_OP_ASYNC_CANCEL] = {
                .audit_skip             = 1,
        },
        [IORING_OP_LINK_TIMEOUT] = {
                .audit_skip             = 1,
                .async_size             = sizeof(struct io_timeout_data),
        },
        [IORING_OP_CONNECT] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
                .needs_async_setup      = 1,
                .async_size             = sizeof(struct io_async_connect),
        },
        [IORING_OP_FALLOCATE] = {
                .needs_file             = 1,
        },
        [IORING_OP_OPENAT] = {},
        [IORING_OP_CLOSE] = {},
        [IORING_OP_FILES_UPDATE] = {
                .audit_skip             = 1,
        },
        [IORING_OP_STATX] = {
                .audit_skip             = 1,
        },
        [IORING_OP_READ] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .buffer_select          = 1,
                .plug                   = 1,
                .audit_skip             = 1,
                .async_size             = sizeof(struct io_async_rw),
        },
        [IORING_OP_WRITE] = {
                .needs_file             = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
                .plug                   = 1,
                .audit_skip             = 1,
                .async_size             = sizeof(struct io_async_rw),
        },
        [IORING_OP_FADVISE] = {
                .needs_file             = 1,
                .audit_skip             = 1,
        },
        [IORING_OP_MADVISE] = {},
        [IORING_OP_SEND] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollout                = 1,
                .audit_skip             = 1,
        },
        [IORING_OP_RECV] = {
                .needs_file             = 1,
                .unbound_nonreg_file    = 1,
                .pollin                 = 1,
                .buffer_select          = 1,
                .audit_skip             = 1,
        },
        [IORING_OP_OPENAT2] = {
        },
        [IORING_OP_EPOLL_CTL] = {
                .unbound_nonreg_file    = 1,
                .audit_skip             = 1,
        },
        [IORING_OP_SPLICE] = {
                .needs_file             = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
                .audit_skip             = 1,
        },
        [IORING_OP_PROVIDE_BUFFERS] = {
                .audit_skip             = 1,
        },
        [IORING_OP_REMOVE_BUFFERS] = {
                .audit_skip             = 1,
        },
        [IORING_OP_TEE] = {
                .needs_file             = 1,
                .hash_reg_file          = 1,
                .unbound_nonreg_file    = 1,
                .audit_skip             = 1,
        },
        [IORING_OP_SHUTDOWN] = {
                .needs_file             = 1,
        },
        [IORING_OP_RENAMEAT] = {},
        [IORING_OP_UNLINKAT] = {},
        [IORING_OP_MKDIRAT] = {},
        [IORING_OP_SYMLINKAT] = {},
        [IORING_OP_LINKAT] = {},
        [IORING_OP_MSG_RING] = {
                .needs_file             = 1,
        },
};
1164
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)

/* Prototypes for static helpers that are referenced ahead of their definitions. */
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);

static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
                                     struct io_uring_rsrc_update2 *up,
                                     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx,
                                struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
                                 unsigned int issue_flags, u32 slot_index);
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);

static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
static void io_eventfd_signal(struct io_ring_ctx *ctx);

/* NOTE(review): presumably the slab cache struct io_kiocb is allocated from -- confirm */
static struct kmem_cache *req_cachep;

/* tentative declaration; io_uring_get_socket() compares f_op against it */
static const struct file_operations io_uring_fops;
1204
1205 struct sock *io_uring_get_socket(struct file *file)
1206 {
1207 #if defined(CONFIG_UNIX)
1208         if (file->f_op == &io_uring_fops) {
1209                 struct io_ring_ctx *ctx = file->private_data;
1210
1211                 return ctx->ring_sock->sk;
1212         }
1213 #endif
1214         return NULL;
1215 }
1216 EXPORT_SYMBOL(io_uring_get_socket);
1217
1218 static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1219 {
1220         if (!*locked) {
1221                 mutex_lock(&ctx->uring_lock);
1222                 *locked = true;
1223         }
1224 }
1225
/*
 * Walk a chain of requests connected through ->link, starting at @head.
 * @pos is the cursor and must be an lvalue; it is NULL once the loop ends
 * without a break. Both arguments are parenthesised so that non-trivial
 * expressions (e.g. a dereferenced pointer as cursor) expand correctly.
 */
#define io_for_each_link(pos, head) \
        for ((pos) = (head); (pos); (pos) = (pos)->link)
1228
/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 *
 * True when the request's refcount, viewed as unsigned, lies in
 * [-127, 0], i.e. it is zero or has wrapped to a small negative value.
 * @req is parenthesised so any pointer expression can be passed.
 */
#define req_ref_zero_or_close_to_overflow(req)  \
        ((unsigned int) atomic_read(&(req)->refs) + 127u <= 127u)
1235
1236 static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1237 {
1238         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1239         return atomic_inc_not_zero(&req->refs);
1240 }
1241
1242 static inline bool req_ref_put_and_test(struct io_kiocb *req)
1243 {
1244         if (likely(!(req->flags & REQ_F_REFCOUNT)))
1245                 return true;
1246
1247         WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1248         return atomic_dec_and_test(&req->refs);
1249 }
1250
1251 static inline void req_ref_get(struct io_kiocb *req)
1252 {
1253         WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1254         WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1255         atomic_inc(&req->refs);
1256 }
1257
1258 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
1259 {
1260         if (!wq_list_empty(&ctx->submit_state.compl_reqs))
1261                 __io_submit_flush_completions(ctx);
1262 }
1263
1264 static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
1265 {
1266         if (!(req->flags & REQ_F_REFCOUNT)) {
1267                 req->flags |= REQ_F_REFCOUNT;
1268                 atomic_set(&req->refs, nr);
1269         }
1270 }
1271
/* Enable reference counting on @req, starting from a single reference. */
static inline void io_req_set_refcount(struct io_kiocb *req)
{
        const int initial_refs = 1;

        __io_req_set_refcount(req, initial_refs);
}
1276
/* number of rsrc node references io_rsrc_refs_refill() grabs in one go */
#define IO_RSRC_REF_BATCH       100
1278
1279 static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
1280                                           struct io_ring_ctx *ctx)
1281         __must_hold(&ctx->uring_lock)
1282 {
1283         struct percpu_ref *ref = req->fixed_rsrc_refs;
1284
1285         if (ref) {
1286                 if (ref == &ctx->rsrc_node->refs)
1287                         ctx->rsrc_cached_refs++;
1288                 else
1289                         percpu_ref_put(ref);
1290         }
1291 }
1292
1293 static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
1294 {
1295         if (req->fixed_rsrc_refs)
1296                 percpu_ref_put(req->fixed_rsrc_refs);
1297 }
1298
1299 static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
1300         __must_hold(&ctx->uring_lock)
1301 {
1302         if (ctx->rsrc_cached_refs) {
1303                 percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
1304                 ctx->rsrc_cached_refs = 0;
1305         }
1306 }
1307
1308 static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
1309         __must_hold(&ctx->uring_lock)
1310 {
1311         ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
1312         percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
1313 }
1314
1315 static inline void io_req_set_rsrc_node(struct io_kiocb *req,
1316                                         struct io_ring_ctx *ctx)
1317 {
1318         if (!req->fixed_rsrc_refs) {
1319                 req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
1320                 ctx->rsrc_cached_refs--;
1321                 if (unlikely(ctx->rsrc_cached_refs < 0))
1322                         io_rsrc_refs_refill(ctx);
1323         }
1324 }
1325
1326 static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
1327 {
1328         struct io_buffer *kbuf = req->kbuf;
1329         unsigned int cflags;
1330
1331         cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
1332         req->flags &= ~REQ_F_BUFFER_SELECTED;
1333         list_add(&kbuf->list, list);
1334         req->kbuf = NULL;
1335         return cflags;
1336 }
1337
1338 static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
1339 {
1340         lockdep_assert_held(&req->ctx->completion_lock);
1341
1342         if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1343                 return 0;
1344         return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
1345 }
1346
1347 static inline unsigned int io_put_kbuf(struct io_kiocb *req,
1348                                        unsigned issue_flags)
1349 {
1350         unsigned int cflags;
1351
1352         if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1353                 return 0;
1354
1355         /*
1356          * We can add this buffer back to two lists:
1357          *
1358          * 1) The io_buffers_cache list. This one is protected by the
1359          *    ctx->uring_lock. If we already hold this lock, add back to this
1360          *    list as we can grab it from issue as well.
1361          * 2) The io_buffers_comp list. This one is protected by the
1362          *    ctx->completion_lock.
1363          *
1364          * We migrate buffers from the comp_list to the issue cache list
1365          * when we need one.
1366          */
1367         if (issue_flags & IO_URING_F_UNLOCKED) {
1368                 struct io_ring_ctx *ctx = req->ctx;
1369
1370                 spin_lock(&ctx->completion_lock);
1371                 cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
1372                 spin_unlock(&ctx->completion_lock);
1373         } else {
1374                 lockdep_assert_held(&req->ctx->uring_lock);
1375
1376                 cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
1377         }
1378
1379         return cflags;
1380 }
1381
1382 static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
1383                                                  unsigned int bgid)
1384 {
1385         struct list_head *hash_list;
1386         struct io_buffer_list *bl;
1387
1388         hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
1389         list_for_each_entry(bl, hash_list, list)
1390                 if (bl->bgid == bgid || bgid == -1U)
1391                         return bl;
1392
1393         return NULL;
1394 }
1395
1396 static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
1397 {
1398         struct io_ring_ctx *ctx = req->ctx;
1399         struct io_buffer_list *bl;
1400         struct io_buffer *buf;
1401
1402         if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
1403                 return;
1404         /* don't recycle if we already did IO to this buffer */
1405         if (req->flags & REQ_F_PARTIAL_IO)
1406                 return;
1407
1408         if (issue_flags & IO_URING_F_UNLOCKED)
1409                 mutex_lock(&ctx->uring_lock);
1410
1411         lockdep_assert_held(&ctx->uring_lock);
1412
1413         buf = req->kbuf;
1414         bl = io_buffer_get_list(ctx, buf->bgid);
1415         list_add(&buf->list, &bl->buf_list);
1416         req->flags &= ~REQ_F_BUFFER_SELECTED;
1417         req->kbuf = NULL;
1418
1419         if (issue_flags & IO_URING_F_UNLOCKED)
1420                 mutex_unlock(&ctx->uring_lock);
1421 }
1422
1423 static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1424                           bool cancel_all)
1425         __must_hold(&req->ctx->timeout_lock)
1426 {
1427         struct io_kiocb *req;
1428
1429         if (task && head->task != task)
1430                 return false;
1431         if (cancel_all)
1432                 return true;
1433
1434         io_for_each_link(req, head) {
1435                 if (req->flags & REQ_F_INFLIGHT)
1436                         return true;
1437         }
1438         return false;
1439 }
1440
1441 static bool io_match_linked(struct io_kiocb *head)
1442 {
1443         struct io_kiocb *req;
1444
1445         io_for_each_link(req, head) {
1446                 if (req->flags & REQ_F_INFLIGHT)
1447                         return true;
1448         }
1449         return false;
1450 }
1451
1452 /*
1453  * As io_match_task() but protected against racing with linked timeouts.
1454  * User must not hold timeout_lock.
1455  */
1456 static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
1457                                bool cancel_all)
1458 {
1459         bool matched;
1460
1461         if (task && head->task != task)
1462                 return false;
1463         if (cancel_all)
1464                 return true;
1465
1466         if (head->flags & REQ_F_LINK_TIMEOUT) {
1467                 struct io_ring_ctx *ctx = head->ctx;
1468
1469                 /* protect against races with linked timeouts */
1470                 spin_lock_irq(&ctx->timeout_lock);
1471                 matched = io_match_linked(head);
1472                 spin_unlock_irq(&ctx->timeout_lock);
1473         } else {
1474                 matched = io_match_linked(head);
1475         }
1476         return matched;
1477 }
1478
1479 static inline bool req_has_async_data(struct io_kiocb *req)
1480 {
1481         return req->flags & REQ_F_ASYNC_DATA;
1482 }
1483
1484 static inline void req_set_fail(struct io_kiocb *req)
1485 {
1486         req->flags |= REQ_F_FAIL;
1487         if (req->flags & REQ_F_CQE_SKIP) {
1488                 req->flags &= ~REQ_F_CQE_SKIP;
1489                 req->flags |= REQ_F_SKIP_LINK_CQES;
1490         }
1491 }
1492
1493 static inline void req_fail_link_node(struct io_kiocb *req, int res)
1494 {
1495         req_set_fail(req);
1496         req->result = res;
1497 }
1498
1499 static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
1500 {
1501         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1502
1503         complete(&ctx->ref_comp);
1504 }
1505
1506 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1507 {
1508         return !req->timeout.off;
1509 }
1510
1511 static __cold void io_fallback_req_func(struct work_struct *work)
1512 {
1513         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1514                                                 fallback_work.work);
1515         struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1516         struct io_kiocb *req, *tmp;
1517         bool locked = false;
1518
1519         percpu_ref_get(&ctx->refs);
1520         llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
1521                 req->io_task_work.func(req, &locked);
1522
1523         if (locked) {
1524                 io_submit_flush_completions(ctx);
1525                 mutex_unlock(&ctx->uring_lock);
1526         }
1527         percpu_ref_put(&ctx->refs);
1528 }
1529
1530 static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1531 {
1532         struct io_ring_ctx *ctx;
1533         int i, hash_bits;
1534
1535         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1536         if (!ctx)
1537                 return NULL;
1538
1539         /*
1540          * Use 5 bits less than the max cq entries, that should give us around
1541          * 32 entries per hash list if totally full and uniformly spread.
1542          */
1543         hash_bits = ilog2(p->cq_entries);
1544         hash_bits -= 5;
1545         if (hash_bits <= 0)
1546                 hash_bits = 1;
1547         ctx->cancel_hash_bits = hash_bits;
1548         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1549                                         GFP_KERNEL);
1550         if (!ctx->cancel_hash)
1551                 goto err;
1552         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1553
1554         ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1555         if (!ctx->dummy_ubuf)
1556                 goto err;
1557         /* set invalid range, so io_import_fixed() fails meeting it */
1558         ctx->dummy_ubuf->ubuf = -1UL;
1559
1560         ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
1561                                         sizeof(struct list_head), GFP_KERNEL);
1562         if (!ctx->io_buffers)
1563                 goto err;
1564         for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
1565                 INIT_LIST_HEAD(&ctx->io_buffers[i]);
1566
1567         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1568                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1569                 goto err;
1570
1571         ctx->flags = p->flags;
1572         init_waitqueue_head(&ctx->sqo_sq_wait);
1573         INIT_LIST_HEAD(&ctx->sqd_list);
1574         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1575         INIT_LIST_HEAD(&ctx->io_buffers_cache);
1576         INIT_LIST_HEAD(&ctx->apoll_cache);
1577         init_completion(&ctx->ref_comp);
1578         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1579         mutex_init(&ctx->uring_lock);
1580         init_waitqueue_head(&ctx->cq_wait);
1581         spin_lock_init(&ctx->completion_lock);
1582         spin_lock_init(&ctx->timeout_lock);
1583         INIT_WQ_LIST(&ctx->iopoll_list);
1584         INIT_LIST_HEAD(&ctx->io_buffers_pages);
1585         INIT_LIST_HEAD(&ctx->io_buffers_comp);
1586         INIT_LIST_HEAD(&ctx->defer_list);
1587         INIT_LIST_HEAD(&ctx->timeout_list);
1588         INIT_LIST_HEAD(&ctx->ltimeout_list);
1589         spin_lock_init(&ctx->rsrc_ref_lock);
1590         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1591         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1592         init_llist_head(&ctx->rsrc_put_llist);
1593         INIT_LIST_HEAD(&ctx->tctx_list);
1594         ctx->submit_state.free_list.next = NULL;
1595         INIT_WQ_LIST(&ctx->locked_free_list);
1596         INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1597         INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
1598 #ifdef CONFIG_NET_RX_BUSY_POLL
1599         INIT_LIST_HEAD(&ctx->napi_list);
1600         spin_lock_init(&ctx->napi_lock);
1601 #endif
1602         return ctx;
1603 err:
1604         kfree(ctx->dummy_ubuf);
1605         kfree(ctx->cancel_hash);
1606         kfree(ctx->io_buffers);
1607         kfree(ctx);
1608         return NULL;
1609 }
1610
1611 static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1612 {
1613         struct io_rings *r = ctx->rings;
1614
1615         WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1616         ctx->cq_extra--;
1617 }
1618
1619 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1620 {
1621         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1622                 struct io_ring_ctx *ctx = req->ctx;
1623
1624                 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
1625         }
1626
1627         return false;
1628 }
1629
/*
 * FFS_NOWAIT and FFS_ISREG are two single-bit flags occupying bits 0 and 1;
 * FFS_MASK is the complement of both, i.e. and-ing with it strips them.
 * NOTE(review): presumably these bits are stored in the otherwise unused low
 * bits of a fixed-file slot value - confirm against the users of FFS_MASK.
 */
#define FFS_NOWAIT              0x1UL
#define FFS_ISREG               0x2UL
#define FFS_MASK                ~(FFS_NOWAIT|FFS_ISREG)
1633
1634 static inline bool io_req_ffs_set(struct io_kiocb *req)
1635 {
1636         return req->flags & REQ_F_FIXED_FILE;
1637 }
1638
1639 static inline void io_req_track_inflight(struct io_kiocb *req)
1640 {
1641         if (!(req->flags & REQ_F_INFLIGHT)) {
1642                 req->flags |= REQ_F_INFLIGHT;
1643                 atomic_inc(&current->io_uring->inflight_tracked);
1644         }
1645 }
1646
1647 static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1648 {
1649         if (WARN_ON_ONCE(!req->link))
1650                 return NULL;
1651
1652         req->flags &= ~REQ_F_ARM_LTIMEOUT;
1653         req->flags |= REQ_F_LINK_TIMEOUT;
1654
1655         /* linked timeouts should have two refs once prep'ed */
1656         io_req_set_refcount(req);
1657         __io_req_set_refcount(req->link, 2);
1658         return req->link;
1659 }
1660
1661 static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1662 {
1663         if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
1664                 return NULL;
1665         return __io_prep_linked_timeout(req);
1666 }
1667
1668 static void io_prep_async_work(struct io_kiocb *req)
1669 {
1670         const struct io_op_def *def = &io_op_defs[req->opcode];
1671         struct io_ring_ctx *ctx = req->ctx;
1672
1673         if (!(req->flags & REQ_F_CREDS)) {
1674                 req->flags |= REQ_F_CREDS;
1675                 req->creds = get_current_cred();
1676         }
1677
1678         req->work.list.next = NULL;
1679         req->work.flags = 0;
1680         if (req->flags & REQ_F_FORCE_ASYNC)
1681                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1682
1683         if (req->flags & REQ_F_ISREG) {
1684                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1685                         io_wq_hash_work(&req->work, file_inode(req->file));
1686         } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1687                 if (def->unbound_nonreg_file)
1688                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1689         }
1690
1691         switch (req->opcode) {
1692         case IORING_OP_SPLICE:
1693         case IORING_OP_TEE:
1694                 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
1695                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1696                 break;
1697         }
1698 }
1699
1700 static void io_prep_async_link(struct io_kiocb *req)
1701 {
1702         struct io_kiocb *cur;
1703
1704         if (req->flags & REQ_F_LINK_TIMEOUT) {
1705                 struct io_ring_ctx *ctx = req->ctx;
1706
1707                 spin_lock_irq(&ctx->timeout_lock);
1708                 io_for_each_link(cur, req)
1709                         io_prep_async_work(cur);
1710                 spin_unlock_irq(&ctx->timeout_lock);
1711         } else {
1712                 io_for_each_link(cur, req)
1713                         io_prep_async_work(cur);
1714         }
1715 }
1716
1717 static inline void io_req_add_compl_list(struct io_kiocb *req)
1718 {
1719         struct io_ring_ctx *ctx = req->ctx;
1720         struct io_submit_state *state = &ctx->submit_state;
1721
1722         if (!(req->flags & REQ_F_CQE_SKIP))
1723                 ctx->submit_state.flush_cqes = true;
1724         wq_list_add_tail(&req->comp_list, &state->compl_reqs);
1725 }
1726
/*
 * Hand @req over to the io-wq instance of the task that owns it.
 *
 * @req:      request to punt; ->work of its whole link chain is initialised
 * @dont_use: not referenced by this function
 *
 * A linked timeout, if one was prepared for @req, is queued only after the
 * request itself has been enqueued. A missing io_uring task context or
 * io-wq instance is treated as a fatal bug.
 */
static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link = io_prep_linked_timeout(req);
        struct io_uring_task *tctx = req->task->io_uring;

        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);

        /* init ->work of the whole link before punting */
        io_prep_async_link(req);

        /*
         * Not expected to happen, but if we do have a bug where this _can_
         * happen, catch it here and ensure the request is marked as
         * canceled. That will make io-wq go through the usual work cancel
         * procedure rather than attempt to run this request (or create a new
         * worker for it).
         */
        if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
                req->work.flags |= IO_WQ_WORK_CANCEL;

        trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
                                        &req->work, io_wq_is_hashed(&req->work));
        io_wq_enqueue(tctx->io_wq, &req->work);
        /* arm the linked timeout only once the request is on the queue */
        if (link)
                io_queue_linked_timeout(link);
}
1755
1756 static void io_kill_timeout(struct io_kiocb *req, int status)
1757         __must_hold(&req->ctx->completion_lock)
1758         __must_hold(&req->ctx->timeout_lock)
1759 {
1760         struct io_timeout_data *io = req->async_data;
1761
1762         if (hrtimer_try_to_cancel(&io->timer) != -1) {
1763                 if (status)
1764                         req_set_fail(req);
1765                 atomic_set(&req->ctx->cq_timeouts,
1766                         atomic_read(&req->ctx->cq_timeouts) + 1);
1767                 list_del_init(&req->timeout.list);
1768                 io_fill_cqe_req(req, status, 0);
1769                 io_put_req_deferred(req);
1770         }
1771 }
1772
1773 static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
1774 {
1775         while (!list_empty(&ctx->defer_list)) {
1776                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1777                                                 struct io_defer_entry, list);
1778
1779                 if (req_need_defer(de->req, de->seq))
1780                         break;
1781                 list_del_init(&de->list);
1782                 io_req_task_queue(de->req);
1783                 kfree(de);
1784         }
1785 }
1786
/*
 * Complete the sequence-based timeouts at the head of ctx->timeout_list
 * whose target sequence has been reached.
 *
 * The list is consumed from the front and the walk stops at the first
 * timeout that is not sequence based or that is not due yet. The caller
 * holds ->completion_lock; ->timeout_lock is taken here with interrupts
 * disabled. The sequence used for this pass is remembered in
 * ctx->cq_last_tm_flush for the next one.
 */
static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
        __must_hold(&ctx->completion_lock)
{
        /* cached CQ tail minus cq_timeouts, which io_kill_timeout() advances */
        u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);

        spin_lock_irq(&ctx->timeout_lock);
        while (!list_empty(&ctx->timeout_list)) {
                u32 events_needed, events_got;
                struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
                                                struct io_kiocb, timeout.list);

                if (io_is_timeout_noseq(req))
                        break;

                /*
                 * Since seq can easily wrap around over time, subtract
                 * the last seq at which timeouts were flushed before comparing.
                 * Assuming not more than 2^31-1 events have happened since,
                 * these subtractions won't have wrapped, so we can check if
                 * target is in [last_seq, current_seq] by comparing the two.
                 */
                events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
                events_got = seq - ctx->cq_last_tm_flush;
                if (events_got < events_needed)
                        break;

                /* io_kill_timeout() also does list_del_init() when it completes */
                list_del_init(&req->timeout.list);
                io_kill_timeout(req, 0);
        }
        ctx->cq_last_tm_flush = seq;
        spin_unlock_irq(&ctx->timeout_lock);
}
1819
1820 static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1821 {
1822         /* order cqe stores with ring update */
1823         smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1824 }
1825
/*
 * Slow path run after completions have been posted.
 *
 * If timeouts or draining are in use, flush due timeouts and/or queue
 * deferred requests under ->completion_lock, then commit the CQ ring.
 * Afterwards, outside the lock, signal the eventfd if one is registered.
 */
static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
        if (ctx->off_timeout_used || ctx->drain_active) {
                spin_lock(&ctx->completion_lock);
                /* both conditions are tested again now that the lock is held */
                if (ctx->off_timeout_used)
                        io_flush_timeouts(ctx);
                if (ctx->drain_active)
                        io_queue_deferred(ctx);
                io_commit_cqring(ctx);
                spin_unlock(&ctx->completion_lock);
        }
        if (ctx->has_evfd)
                io_eventfd_signal(ctx);
}
1840
1841 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1842 {
1843         struct io_rings *r = ctx->rings;
1844
1845         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
1846 }
1847
1848 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1849 {
1850         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1851 }
1852
1853 static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1854 {
1855         struct io_rings *rings = ctx->rings;
1856         unsigned tail, mask = ctx->cq_entries - 1;
1857
1858         /*
1859          * writes to the cq entry need to come after reading head; the
1860          * control dependency is enough as we're using WRITE_ONCE to
1861          * fill the cq entry
1862          */
1863         if (__io_cqring_events(ctx) == ctx->cq_entries)
1864                 return NULL;
1865
1866         tail = ctx->cached_cq_tail++;
1867         return &rings->cqes[tail & mask];
1868 }
1869
1870 static void io_eventfd_signal(struct io_ring_ctx *ctx)
1871 {
1872         struct io_ev_fd *ev_fd;
1873
1874         rcu_read_lock();
1875         /*
1876          * rcu_dereference ctx->io_ev_fd once and use it for both for checking
1877          * and eventfd_signal
1878          */
1879         ev_fd = rcu_dereference(ctx->io_ev_fd);
1880
1881         /*
1882          * Check again if ev_fd exists incase an io_eventfd_unregister call
1883          * completed between the NULL check of ctx->io_ev_fd at the start of
1884          * the function and rcu_read_lock.
1885          */
1886         if (unlikely(!ev_fd))
1887                 goto out;
1888         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1889                 goto out;
1890
1891         if (!ev_fd->eventfd_async || io_wq_current_is_worker())
1892                 eventfd_signal(ev_fd->cq_ev_fd, 1);
1893 out:
1894         rcu_read_unlock();
1895 }
1896
1897 static inline void io_cqring_wake(struct io_ring_ctx *ctx)
1898 {
1899         /*
1900          * wake_up_all() may seem excessive, but io_wake_function() and
1901          * io_should_wake() handle the termination of the loop and only
1902          * wake as many waiters as we need to.
1903          */
1904         if (wq_has_sleeper(&ctx->cq_wait))
1905                 wake_up_all(&ctx->cq_wait);
1906 }
1907
1908 /*
1909  * This should only get called when at least one event has been posted.
1910  * Some applications rely on the eventfd notification count only changing
1911  * IFF a new CQE has been added to the CQ ring. There's no depedency on
1912  * 1:1 relationship between how many times this function is called (and
1913  * hence the eventfd count) and number of CQEs posted to the CQ ring.
1914  */
1915 static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1916 {
1917         if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
1918                      ctx->has_evfd))
1919                 __io_commit_cqring_flush(ctx);
1920
1921         io_cqring_wake(ctx);
1922 }
1923
1924 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1925 {
1926         if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
1927                      ctx->has_evfd))
1928                 __io_commit_cqring_flush(ctx);
1929
1930         if (ctx->flags & IORING_SETUP_SQPOLL)
1931                 io_cqring_wake(ctx);
1932 }
1933
1934 /* Returns true if there are no backlogged entries after the flush */
1935 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1936 {
1937         bool all_flushed, posted;
1938
1939         if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
1940                 return false;
1941
1942         posted = false;
1943         spin_lock(&ctx->completion_lock);
1944         while (!list_empty(&ctx->cq_overflow_list)) {
1945                 struct io_uring_cqe *cqe = io_get_cqe(ctx);
1946                 struct io_overflow_cqe *ocqe;
1947
1948                 if (!cqe && !force)
1949                         break;
1950                 ocqe = list_first_entry(&ctx->cq_overflow_list,
1951                                         struct io_overflow_cqe, list);
1952                 if (cqe)
1953                         memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1954                 else
1955                         io_account_cq_overflow(ctx);
1956
1957                 posted = true;
1958                 list_del(&ocqe->list);
1959                 kfree(ocqe);
1960         }
1961
1962         all_flushed = list_empty(&ctx->cq_overflow_list);
1963         if (all_flushed) {
1964                 clear_bit(0, &ctx->check_cq_overflow);
1965                 WRITE_ONCE(ctx->rings->sq_flags,
1966                            ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
1967         }
1968
1969         if (posted)
1970                 io_commit_cqring(ctx);
1971         spin_unlock(&ctx->completion_lock);
1972         if (posted)
1973                 io_cqring_ev_posted(ctx);
1974         return all_flushed;
1975 }
1976
1977 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
1978 {
1979         bool ret = true;
1980
1981         if (test_bit(0, &ctx->check_cq_overflow)) {
1982                 /* iopoll syncs against uring_lock, not completion_lock */
1983                 if (ctx->flags & IORING_SETUP_IOPOLL)
1984                         mutex_lock(&ctx->uring_lock);
1985                 ret = __io_cqring_overflow_flush(ctx, false);
1986                 if (ctx->flags & IORING_SETUP_IOPOLL)
1987                         mutex_unlock(&ctx->uring_lock);
1988         }
1989
1990         return ret;
1991 }
1992
1993 /* must to be called somewhat shortly after putting a request */
1994 static inline void io_put_task(struct task_struct *task, int nr)
1995 {
1996         struct io_uring_task *tctx = task->io_uring;
1997
1998         if (likely(task == current)) {
1999                 tctx->cached_refs += nr;
2000         } else {
2001                 percpu_counter_sub(&tctx->inflight, nr);
2002                 if (unlikely(atomic_read(&tctx->in_idle)))
2003                         wake_up(&tctx->wait);
2004                 put_task_struct_many(task, nr);
2005         }
2006 }
2007
2008 static void io_task_refs_refill(struct io_uring_task *tctx)
2009 {
2010         unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
2011
2012         percpu_counter_add(&tctx->inflight, refill);
2013         refcount_add(refill, &current->usage);
2014         tctx->cached_refs += refill;
2015 }
2016
2017 static inline void io_get_task_refs(int nr)
2018 {
2019         struct io_uring_task *tctx = current->io_uring;
2020
2021         tctx->cached_refs -= nr;
2022         if (unlikely(tctx->cached_refs < 0))
2023                 io_task_refs_refill(tctx);
2024 }
2025
2026 static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
2027 {
2028         struct io_uring_task *tctx = task->io_uring;
2029         unsigned int refs = tctx->cached_refs;
2030
2031         if (refs) {
2032                 tctx->cached_refs = 0;
2033                 percpu_counter_sub(&tctx->inflight, refs);
2034                 put_task_struct_many(task, refs);
2035         }
2036 }
2037
2038 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
2039                                      s32 res, u32 cflags)
2040 {
2041         struct io_overflow_cqe *ocqe;
2042
2043         ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
2044         if (!ocqe) {
2045                 /*
2046                  * If we're in ring overflow flush mode, or in task cancel mode,
2047                  * or cannot allocate an overflow entry, then we need to drop it
2048                  * on the floor.
2049                  */
2050                 io_account_cq_overflow(ctx);
2051                 return false;
2052         }
2053         if (list_empty(&ctx->cq_overflow_list)) {
2054                 set_bit(0, &ctx->check_cq_overflow);
2055                 WRITE_ONCE(ctx->rings->sq_flags,
2056                            ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
2057
2058         }
2059         ocqe->cqe.user_data = user_data;
2060         ocqe->cqe.res = res;
2061         ocqe->cqe.flags = cflags;
2062         list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
2063         return true;
2064 }
2065
2066 static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
2067                                  s32 res, u32 cflags)
2068 {
2069         struct io_uring_cqe *cqe;
2070
2071         /*
2072          * If we can't get a cq entry, userspace overflowed the
2073          * submission (by quite a lot). Increment the overflow count in
2074          * the ring.
2075          */
2076         cqe = io_get_cqe(ctx);
2077         if (likely(cqe)) {
2078                 WRITE_ONCE(cqe->user_data, user_data);
2079                 WRITE_ONCE(cqe->res, res);
2080                 WRITE_ONCE(cqe->flags, cflags);
2081                 return true;
2082         }
2083         return io_cqring_event_overflow(ctx, user_data, res, cflags);
2084 }
2085
2086 static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
2087 {
2088         trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
2089         return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
2090 }
2091
2092 static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
2093 {
2094         if (!(req->flags & REQ_F_CQE_SKIP))
2095                 __io_fill_cqe_req(req, res, cflags);
2096 }
2097
2098 static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
2099                                      s32 res, u32 cflags)
2100 {
2101         ctx->cq_extra++;
2102         trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
2103         return __io_fill_cqe(ctx, user_data, res, cflags);
2104 }
2105
2106 static void __io_req_complete_post(struct io_kiocb *req, s32 res,
2107                                    u32 cflags)
2108 {
2109         struct io_ring_ctx *ctx = req->ctx;
2110
2111         if (!(req->flags & REQ_F_CQE_SKIP))
2112                 __io_fill_cqe_req(req, res, cflags);
2113         /*
2114          * If we're the last reference to this request, add to our locked
2115          * free_list cache.
2116          */
2117         if (req_ref_put_and_test(req)) {
2118                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
2119                         if (req->flags & IO_DISARM_MASK)
2120                                 io_disarm_next(req);
2121                         if (req->link) {
2122                                 io_req_task_queue(req->link);
2123                                 req->link = NULL;
2124                         }
2125                 }
2126                 io_req_put_rsrc(req, ctx);
2127                 /*
2128                  * Selected buffer deallocation in io_clean_op() assumes that
2129                  * we don't hold ->completion_lock. Clean them here to avoid
2130                  * deadlocks.
2131                  */
2132                 io_put_kbuf_comp(req);
2133                 io_dismantle_req(req);
2134                 io_put_task(req->task, 1);
2135                 wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
2136                 ctx->locked_free_nr++;
2137         }
2138 }
2139
2140 static void io_req_complete_post(struct io_kiocb *req, s32 res,
2141                                  u32 cflags)
2142 {
2143         struct io_ring_ctx *ctx = req->ctx;
2144
2145         spin_lock(&ctx->completion_lock);
2146         __io_req_complete_post(req, res, cflags);
2147         io_commit_cqring(ctx);
2148         spin_unlock(&ctx->completion_lock);
2149         io_cqring_ev_posted(ctx);
2150 }
2151
2152 static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
2153                                          u32 cflags)
2154 {
2155         req->result = res;
2156         req->cflags = cflags;
2157         req->flags |= REQ_F_COMPLETE_INLINE;
2158 }
2159
2160 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
2161                                      s32 res, u32 cflags)
2162 {
2163         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
2164                 io_req_complete_state(req, res, cflags);
2165         else
2166                 io_req_complete_post(req, res, cflags);
2167 }
2168
2169 static inline void io_req_complete(struct io_kiocb *req, s32 res)
2170 {
2171         __io_req_complete(req, 0, res, 0);
2172 }
2173
2174 static void io_req_complete_failed(struct io_kiocb *req, s32 res)
2175 {
2176         req_set_fail(req);
2177         io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
2178 }
2179
2180 static void io_req_complete_fail_submit(struct io_kiocb *req)
2181 {
2182         /*
2183          * We don't submit, fail them all, for that replace hardlinks with
2184          * normal links. Extra REQ_F_LINK is tolerated.
2185          */
2186         req->flags &= ~REQ_F_HARDLINK;
2187         req->flags |= REQ_F_LINK;
2188         io_req_complete_failed(req, req->result);
2189 }
2190
2191 /*
2192  * Don't initialise the fields below on every allocation, but do that in
2193  * advance and keep them valid across allocations.
2194  */
2195 static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
2196 {
2197         req->ctx = ctx;
2198         req->link = NULL;
2199         req->async_data = NULL;
2200         /* not necessary, but safer to zero */
2201         req->result = 0;
2202 }
2203
2204 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
2205                                         struct io_submit_state *state)
2206 {
2207         spin_lock(&ctx->completion_lock);
2208         wq_list_splice(&ctx->locked_free_list, &state->free_list);
2209         ctx->locked_free_nr = 0;
2210         spin_unlock(&ctx->completion_lock);
2211 }
2212
2213 /* Returns true IFF there are requests in the cache */
2214 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
2215 {
2216         struct io_submit_state *state = &ctx->submit_state;
2217
2218         /*
2219          * If we have more than a batch's worth of requests in our IRQ side
2220          * locked cache, grab the lock and move them over to our submission
2221          * side cache.
2222          */
2223         if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
2224                 io_flush_cached_locked_reqs(ctx, state);
2225         return !!state->free_list.next;
2226 }
2227
2228 /*
2229  * A request might get retired back into the request caches even before opcode
2230  * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
2231  * Because of that, io_alloc_req() should be called only under ->uring_lock
2232  * and with extra caution to not get a request that is still worked on.
2233  */
2234 static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
2235         __must_hold(&ctx->uring_lock)
2236 {
2237         struct io_submit_state *state = &ctx->submit_state;
2238         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2239         void *reqs[IO_REQ_ALLOC_BATCH];
2240         struct io_kiocb *req;
2241         int ret, i;
2242
2243         if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
2244                 return true;
2245
2246         ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
2247
2248         /*
2249          * Bulk alloc is all-or-nothing. If we fail to get a batch,
2250          * retry single alloc to be on the safe side.
2251          */
2252         if (unlikely(ret <= 0)) {
2253                 reqs[0] = kmem_cache_alloc(req_cachep, gfp);
2254                 if (!reqs[0])
2255                         return false;
2256                 ret = 1;
2257         }
2258
2259         percpu_ref_get_many(&ctx->refs, ret);
2260         for (i = 0; i < ret; i++) {
2261                 req = reqs[i];
2262
2263                 io_preinit_req(req, ctx);
2264                 wq_stack_add_head(&req->comp_list, &state->free_list);
2265         }
2266         return true;
2267 }
2268
2269 static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
2270 {
2271         if (unlikely(!ctx->submit_state.free_list.next))
2272                 return __io_alloc_req_refill(ctx);
2273         return true;
2274 }
2275
2276 static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2277 {
2278         struct io_wq_work_node *node;
2279
2280         node = wq_stack_extract(&ctx->submit_state.free_list);
2281         return container_of(node, struct io_kiocb, comp_list);
2282 }
2283
/* fput() wrapper that accepts a NULL @file and does nothing for it. */
static inline void io_put_file(struct file *file)
{
        if (!file)
                return;

        fput(file);
}
2289
2290 static inline void io_dismantle_req(struct io_kiocb *req)
2291 {
2292         unsigned int flags = req->flags;
2293
2294         if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
2295                 io_clean_op(req);
2296         if (!(flags & REQ_F_FIXED_FILE))
2297                 io_put_file(req->file);
2298 }
2299
2300 static __cold void __io_free_req(struct io_kiocb *req)
2301 {
2302         struct io_ring_ctx *ctx = req->ctx;
2303
2304         io_req_put_rsrc(req, ctx);
2305         io_dismantle_req(req);
2306         io_put_task(req->task, 1);
2307
2308         spin_lock(&ctx->completion_lock);
2309         wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
2310         ctx->locked_free_nr++;
2311         spin_unlock(&ctx->completion_lock);
2312 }
2313
2314 static inline void io_remove_next_linked(struct io_kiocb *req)
2315 {
2316         struct io_kiocb *nxt = req->link;
2317
2318         req->link = nxt->link;
2319         nxt->link = NULL;
2320 }
2321
2322 static bool io_kill_linked_timeout(struct io_kiocb *req)
2323         __must_hold(&req->ctx->completion_lock)
2324         __must_hold(&req->ctx->timeout_lock)
2325 {
2326         struct io_kiocb *link = req->link;
2327
2328         if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2329                 struct io_timeout_data *io = link->async_data;
2330
2331                 io_remove_next_linked(req);
2332                 link->timeout.head = NULL;
2333                 if (hrtimer_try_to_cancel(&io->timer) != -1) {
2334                         list_del(&link->timeout.list);
2335                         /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
2336                         io_fill_cqe_req(link, -ECANCELED, 0);
2337                         io_put_req_deferred(link);
2338                         return true;
2339                 }
2340         }
2341         return false;
2342 }
2343
2344 static void io_fail_links(struct io_kiocb *req)
2345         __must_hold(&req->ctx->completion_lock)
2346 {
2347         struct io_kiocb *nxt, *link = req->link;
2348         bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES;
2349
2350         req->link = NULL;
2351         while (link) {
2352                 long res = -ECANCELED;
2353
2354                 if (link->flags & REQ_F_FAIL)
2355                         res = link->result;
2356
2357                 nxt = link->link;
2358                 link->link = NULL;
2359
2360                 trace_io_uring_fail_link(req->ctx, req, req->user_data,
2361                                         req->opcode, link);
2362
2363                 if (!ignore_cqes) {
2364                         link->flags &= ~REQ_F_CQE_SKIP;
2365                         io_fill_cqe_req(link, res, 0);
2366                 }
2367                 io_put_req_deferred(link);
2368                 link = nxt;
2369         }
2370 }
2371
/*
 * Get rid of whatever must not outlive @req in its link chain.
 *
 * - A linked timeout that was never armed (REQ_F_ARM_LTIMEOUT) is unhooked,
 *   completed with -ECANCELED and put.
 * - An armed one (REQ_F_LINK_TIMEOUT) is cancelled under ->timeout_lock.
 * - If @req failed and is not hard-linked, the remaining links are failed.
 *
 * Returns true when a linked timeout was disarmed or there were links to
 * fail; __io_req_find_next_prep() uses this to decide whether the CQ ring
 * has to be committed.
 */
static bool io_disarm_next(struct io_kiocb *req)
        __must_hold(&req->ctx->completion_lock)
{
        bool posted = false;

        if (req->flags & REQ_F_ARM_LTIMEOUT) {
                struct io_kiocb *link = req->link;

                req->flags &= ~REQ_F_ARM_LTIMEOUT;
                if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
                        io_remove_next_linked(req);
                        /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */
                        io_fill_cqe_req(link, -ECANCELED, 0);
                        io_put_req_deferred(link);
                        posted = true;
                }
        } else if (req->flags & REQ_F_LINK_TIMEOUT) {
                struct io_ring_ctx *ctx = req->ctx;

                /* the timer may be armed: cancel it under ->timeout_lock */
                spin_lock_irq(&ctx->timeout_lock);
                posted = io_kill_linked_timeout(req);
                spin_unlock_irq(&ctx->timeout_lock);
        }
        if (unlikely((req->flags & REQ_F_FAIL) &&
                     !(req->flags & REQ_F_HARDLINK))) {
                /* sample req->link first, io_fail_links() clears it */
                posted |= (req->link != NULL);
                io_fail_links(req);
        }
        return posted;
}
2402
2403 static void __io_req_find_next_prep(struct io_kiocb *req)
2404 {
2405         struct io_ring_ctx *ctx = req->ctx;
2406         bool posted;
2407
2408         spin_lock(&ctx->completion_lock);
2409         posted = io_disarm_next(req);
2410         if (posted)
2411                 io_commit_cqring(ctx);
2412         spin_unlock(&ctx->completion_lock);
2413         if (posted)
2414                 io_cqring_ev_posted(ctx);
2415 }
2416
2417 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
2418 {
2419         struct io_kiocb *nxt;
2420
2421         if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
2422                 return NULL;
2423         /*
2424          * If LINK is set, we have dependent requests in this chain. If we
2425          * didn't fail this request, queue the first one up, moving any other
2426          * dependencies to the next request. In case of failure, fail the rest
2427          * of the chain.
2428          */
2429         if (unlikely(req->flags & IO_DISARM_MASK))
2430                 __io_req_find_next_prep(req);
2431         nxt = req->link;
2432         req->link = NULL;
2433         return nxt;
2434 }
2435
2436 static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2437 {
2438         if (!ctx)
2439                 return;
2440         if (*locked) {
2441                 io_submit_flush_completions(ctx);
2442                 mutex_unlock(&ctx->uring_lock);
2443                 *locked = false;
2444         }
2445         percpu_ref_put(&ctx->refs);
2446 }
2447
2448 static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx)
2449 {
2450         io_commit_cqring(ctx);
2451         spin_unlock(&ctx->completion_lock);
2452         io_cqring_ev_posted(ctx);
2453 }
2454
/*
 * Process a list of task-work nodes, starting at @node.
 *
 * @node:         first node of the list; must not be NULL
 * @ctx:          in/out: the ring context currently being worked on
 * @uring_locked: in/out: whether ->uring_lock of *@ctx is currently held
 *
 * Whenever the next request belongs to a different context, the previous
 * one is committed/flushed and put, and ->uring_lock of the new one is
 * tried. With the mutex held the request's task-work callback is invoked.
 * Without it, the request is completed directly under ->completion_lock,
 * using the result stored in the request.
 */
static void handle_prev_tw_list(struct io_wq_work_node *node,
                                struct io_ring_ctx **ctx, bool *uring_locked)
{
        /* continuing on a ctx without the mutex: completions need the spinlock */
        if (*ctx && !*uring_locked)
                spin_lock(&(*ctx)->completion_lock);

        do {
                struct io_wq_work_node *next = node->next;
                struct io_kiocb *req = container_of(node, struct io_kiocb,
                                                    io_task_work.node);

                prefetch(container_of(next, struct io_kiocb, io_task_work.node));

                if (req->ctx != *ctx) {
                        /* leave the previous ctx before switching over */
                        if (unlikely(!*uring_locked && *ctx))
                                ctx_commit_and_unlock(*ctx);

                        ctx_flush_and_put(*ctx, uring_locked);
                        *ctx = req->ctx;
                        /* if not contended, grab and improve batching */
                        *uring_locked = mutex_trylock(&(*ctx)->uring_lock);
                        percpu_ref_get(&(*ctx)->refs);
                        if (unlikely(!*uring_locked))
                                spin_lock(&(*ctx)->completion_lock);
                }
                if (likely(*uring_locked))
                        req->io_task_work.func(req, uring_locked);
                else
                        __io_req_complete_post(req, req->result,
                                                io_put_kbuf_comp(req));
                node = next;
        } while (node);

        /* the last ctx was handled under ->completion_lock: commit and drop it */
        if (unlikely(!*uring_locked))
                ctx_commit_and_unlock(*ctx);
}
2491
2492 static void handle_tw_list(struct io_wq_work_node *node,
2493                            struct io_ring_ctx **ctx, bool *locked)
2494 {
2495         do {
2496                 struct io_wq_work_node *next = node->next;
2497                 struct io_kiocb *req = container_of(node, struct io_kiocb,
2498                                                     io_task_work.node);
2499
2500                 prefetch(container_of(next, struct io_kiocb, io_task_work.node));
2501
2502                 if (req->ctx != *ctx) {
2503                         ctx_flush_and_put(*ctx, locked);
2504                         *ctx = req->ctx;
2505                         /* if not contended, grab and improve batching */
2506                         *locked = mutex_trylock(&(*ctx)->uring_lock);
2507                         percpu_ref_get(&(*ctx)->refs);
2508                 }
2509                 req->io_task_work.func(req, locked);
2510                 node = next;
2511         } while (node);
2512 }
2513
2514 static void tctx_task_work(struct callback_head *cb)
2515 {
2516         bool uring_locked = false;
2517         struct io_ring_ctx *ctx = NULL;
2518         struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2519                                                   task_work);
2520
2521         while (1) {
2522                 struct io_wq_work_node *node1, *node2;
2523
2524                 if (!tctx->task_list.first &&
2525                     !tctx->prior_task_list.first && uring_locked)
2526                         io_submit_flush_completions(ctx);
2527
2528                 spin_lock_irq(&tctx->task_lock);
2529                 node1 = tctx->prior_task_list.first;
2530                 node2 = tctx->task_list.first;
2531                 INIT_WQ_LIST(&tctx->task_list);
2532                 INIT_WQ_LIST(&tctx->prior_task_list);
2533                 if (!node2 && !node1)
2534                         tctx->task_running = false;
2535                 spin_unlock_irq(&tctx->task_lock);
2536                 if (!node2 && !node1)
2537                         break;
2538
2539                 if (node1)
2540                         handle_prev_tw_list(node1, &ctx, &uring_locked);
2541
2542                 if (node2)
2543                         handle_tw_list(node2, &ctx, &uring_locked);
2544                 cond_resched();
2545         }
2546
2547         ctx_flush_and_put(ctx, &uring_locked);
2548
2549         /* relaxed read is enough as only the task itself sets ->in_idle */
2550         if (unlikely(atomic_read(&tctx->in_idle)))
2551                 io_uring_drop_tctx_refs(current);
2552 }
2553
2554 static void io_req_task_work_add(struct io_kiocb *req, bool priority)
2555 {
2556         struct task_struct *tsk = req->task;
2557         struct io_uring_task *tctx = tsk->io_uring;
2558         enum task_work_notify_mode notify;
2559         struct io_wq_work_node *node;
2560         unsigned long flags;
2561         bool running;
2562
2563         WARN_ON_ONCE(!tctx);
2564
2565         spin_lock_irqsave(&tctx->task_lock, flags);
2566         if (priority)
2567                 wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
2568         else
2569                 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
2570         running = tctx->task_running;
2571         if (!running)
2572                 tctx->task_running = true;
2573         spin_unlock_irqrestore(&tctx->task_lock, flags);
2574
2575         /* task_work already pending, we're done */
2576         if (running)
2577                 return;
2578
2579         /*
2580          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2581          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2582          * processing task_work. There's no reliable way to tell if TWA_RESUME
2583          * will do the job.
2584          */
2585         notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
2586         if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
2587                 if (notify == TWA_NONE)
2588                         wake_up_process(tsk);
2589                 return;
2590         }
2591
2592         spin_lock_irqsave(&tctx->task_lock, flags);
2593         tctx->task_running = false;
2594         node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
2595         spin_unlock_irqrestore(&tctx->task_lock, flags);
2596
2597         while (node) {
2598                 req = container_of(node, struct io_kiocb, io_task_work.node);
2599                 node = node->next;
2600                 if (llist_add(&req->io_task_work.fallback_node,
2601                               &req->ctx->fallback_llist))
2602                         schedule_delayed_work(&req->ctx->fallback_work, 1);
2603         }
2604 }
2605
2606 static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
2607 {
2608         struct io_ring_ctx *ctx = req->ctx;
2609
2610         /* not needed for normal modes, but SQPOLL depends on it */
2611         io_tw_lock(ctx, locked);
2612         io_req_complete_failed(req, req->result);
2613 }
2614
2615 static void io_req_task_submit(struct io_kiocb *req, bool *locked)
2616 {
2617         struct io_ring_ctx *ctx = req->ctx;
2618
2619         io_tw_lock(ctx, locked);
2620         /* req->task == current here, checking PF_EXITING is safe */
2621         if (likely(!(req->task->flags & PF_EXITING)))
2622                 __io_queue_sqe(req);
2623         else
2624                 io_req_complete_failed(req, -EFAULT);
2625 }
2626
2627 static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2628 {
2629         req->result = ret;
2630         req->io_task_work.func = io_req_task_cancel;
2631         io_req_task_work_add(req, false);
2632 }
2633
2634 static void io_req_task_queue(struct io_kiocb *req)
2635 {
2636         req->io_task_work.func = io_req_task_submit;
2637         io_req_task_work_add(req, false);
2638 }
2639
2640 static void io_req_task_queue_reissue(struct io_kiocb *req)
2641 {
2642         req->io_task_work.func = io_queue_async_work;
2643         io_req_task_work_add(req, false);
2644 }
2645
/* If @req has a linked successor, detach it and queue it via task_work. */
static inline void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *linked = io_req_find_next(req);

	if (!linked)
		return;

	io_req_task_queue(linked);
}
2653
/* Queue the linked successor of @req (if any), then free @req itself. */
static void io_free_req(struct io_kiocb *req)
{
	io_queue_next(req);

	__io_free_req(req);
}
2659
2660 static void io_free_req_work(struct io_kiocb *req, bool *locked)
2661 {
2662         io_free_req(req);
2663 }
2664
2665 static void io_free_batch_list(struct io_ring_ctx *ctx,
2666                                 struct io_wq_work_node *node)
2667         __must_hold(&ctx->uring_lock)
2668 {
2669         struct task_struct *task = NULL;
2670         int task_refs = 0;
2671
2672         do {
2673                 struct io_kiocb *req = container_of(node, struct io_kiocb,
2674                                                     comp_list);
2675
2676                 if (unlikely(req->flags & REQ_F_REFCOUNT)) {
2677                         node = req->comp_list.next;
2678                         if (!req_ref_put_and_test(req))
2679                                 continue;
2680                 }
2681
2682                 io_req_put_rsrc_locked(req, ctx);
2683                 io_queue_next(req);
2684                 io_dismantle_req(req);
2685
2686                 if (req->task != task) {
2687                         if (task)
2688                                 io_put_task(task, task_refs);
2689                         task = req->task;
2690                         task_refs = 0;
2691                 }
2692                 task_refs++;
2693                 node = req->comp_list.next;
2694                 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
2695         } while (node);
2696
2697         if (task)
2698                 io_put_task(task, task_refs);
2699 }
2700
2701 static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
2702         __must_hold(&ctx->uring_lock)
2703 {
2704         struct io_wq_work_node *node, *prev;
2705         struct io_submit_state *state = &ctx->submit_state;
2706
2707         if (state->flush_cqes) {
2708                 spin_lock(&ctx->completion_lock);
2709                 wq_list_for_each(node, prev, &state->compl_reqs) {
2710                         struct io_kiocb *req = container_of(node, struct io_kiocb,
2711                                                     comp_list);
2712
2713                         if (!(req->flags & REQ_F_CQE_SKIP))
2714                                 __io_fill_cqe_req(req, req->result, req->cflags);
2715                         if ((req->flags & REQ_F_POLLED) && req->apoll) {
2716                                 struct async_poll *apoll = req->apoll;
2717
2718                                 if (apoll->double_poll)
2719                                         kfree(apoll->double_poll);
2720                                 list_add(&apoll->poll.wait.entry,
2721                                                 &ctx->apoll_cache);
2722                                 req->flags &= ~REQ_F_POLLED;
2723                         }
2724                 }
2725
2726                 io_commit_cqring(ctx);
2727                 spin_unlock(&ctx->completion_lock);
2728                 io_cqring_ev_posted(ctx);
2729                 state->flush_cqes = false;
2730         }
2731
2732         io_free_batch_list(ctx, state->compl_reqs.first);
2733         INIT_WQ_LIST(&state->compl_reqs);
2734 }
2735
2736 /*
2737  * Drop reference to request, return next in chain (if there is one) if this
2738  * was the last reference to this request.
2739  */
2740 static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2741 {
2742         struct io_kiocb *nxt = NULL;
2743
2744         if (req_ref_put_and_test(req)) {
2745                 nxt = io_req_find_next(req);
2746                 __io_free_req(req);
2747         }
2748         return nxt;
2749 }
2750
/* Drop one reference on @req and free it if that was the last one. */
static inline void io_put_req(struct io_kiocb *req)
{
	if (!req_ref_put_and_test(req))
		return;

	io_free_req(req);
}
2756
2757 static inline void io_put_req_deferred(struct io_kiocb *req)
2758 {
2759         if (req_ref_put_and_test(req)) {
2760                 req->io_task_work.func = io_free_req_work;
2761                 io_req_task_work_add(req, false);
2762         }
2763 }
2764
/*
 * Return __io_cqring_events() for @ctx after issuing a read barrier; the
 * barrier pairing is described in the comment at the top of this file.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	unsigned events;

	/* See comment at the top of this file */
	smp_rmb();
	events = __io_cqring_events(ctx);
	return events;
}
2771
2772 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2773 {
2774         struct io_rings *rings = ctx->rings;
2775
2776         /* make sure SQ entry isn't read before tail */
2777         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2778 }
2779
2780 static inline bool io_run_task_work(void)
2781 {
2782         if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
2783                 __set_current_state(TASK_RUNNING);
2784                 clear_notify_signal();
2785                 if (task_work_pending(current))
2786                         task_work_run();
2787                 return true;
2788         }
2789
2790         return false;
2791 }
2792
2793 static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
2794 {
2795         struct io_wq_work_node *pos, *start, *prev;
2796         unsigned int poll_flags = BLK_POLL_NOSLEEP;
2797         DEFINE_IO_COMP_BATCH(iob);
2798         int nr_events = 0;
2799
2800         /*
2801          * Only spin for completions if we don't have multiple devices hanging
2802          * off our complete list.
2803          */
2804         if (ctx->poll_multi_queue || force_nonspin)
2805                 poll_flags |= BLK_POLL_ONESHOT;
2806
2807         wq_list_for_each(pos, start, &ctx->iopoll_list) {
2808                 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2809                 struct kiocb *kiocb = &req->rw.kiocb;
2810                 int ret;
2811
2812                 /*
2813                  * Move completed and retryable entries to our local lists.
2814                  * If we find a request that requires polling, break out
2815                  * and complete those lists first, if we have entries there.
2816                  */
2817                 if (READ_ONCE(req->iopoll_completed))
2818                         break;
2819
2820                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
2821                 if (unlikely(ret < 0))
2822                         return ret;
2823                 else if (ret)
2824                         poll_flags |= BLK_POLL_ONESHOT;
2825
2826                 /* iopoll may have completed current req */
2827                 if (!rq_list_empty(iob.req_list) ||
2828                     READ_ONCE(req->iopoll_completed))
2829                         break;
2830         }
2831
2832         if (!rq_list_empty(iob.req_list))
2833                 iob.complete(&iob);
2834         else if (!pos)
2835                 return 0;
2836
2837         prev = start;
2838         wq_list_for_each_resume(pos, prev) {
2839                 struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2840
2841                 /* order with io_complete_rw_iopoll(), e.g. ->result updates */
2842                 if (!smp_load_acquire(&req->iopoll_completed))
2843                         break;
2844                 if (unlikely(req->flags & REQ_F_CQE_SKIP))
2845                         continue;
2846
2847                 __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
2848                 nr_events++;
2849         }
2850
2851         if (unlikely(!nr_events))
2852                 return 0;
2853
2854         io_commit_cqring(ctx);
2855         io_cqring_ev_posted_iopoll(ctx);
2856         pos = start ? start->next : ctx->iopoll_list.first;
2857         wq_list_cut(&ctx->iopoll_list, prev, start);
2858         io_free_batch_list(ctx, pos);
2859         return nr_events;
2860 }
2861
2862 /*
2863  * We can't just wait for polled events to come to us, we have to actively
2864  * find and complete them.
2865  */
2866 static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2867 {
2868         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2869                 return;
2870
2871         mutex_lock(&ctx->uring_lock);
2872         while (!wq_list_empty(&ctx->iopoll_list)) {
2873                 /* let it sleep and repeat later if can't complete a request */
2874                 if (io_do_iopoll(ctx, true) == 0)
2875                         break;
2876                 /*
2877                  * Ensure we allow local-to-the-cpu processing to take place,
2878                  * in this case we need to ensure that we reap all events.
2879                  * Also let task_work, etc. to progress by releasing the mutex
2880                  */
2881                 if (need_resched()) {
2882                         mutex_unlock(&ctx->uring_lock);
2883                         cond_resched();
2884                         mutex_lock(&ctx->uring_lock);
2885                 }
2886         }
2887         mutex_unlock(&ctx->uring_lock);
2888 }
2889
2890 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2891 {
2892         unsigned int nr_events = 0;
2893         int ret = 0;
2894
2895         /*
2896          * We disallow the app entering submit/complete with polling, but we
2897          * still need to lock the ring to prevent racing with polled issue
2898          * that got punted to a workqueue.
2899          */
2900         mutex_lock(&ctx->uring_lock);
2901         /*
2902          * Don't enter poll loop if we already have events pending.
2903          * If we do, we can potentially be spinning for commands that
2904          * already triggered a CQE (eg in error).
2905          */
2906         if (test_bit(0, &ctx->check_cq_overflow))
2907                 __io_cqring_overflow_flush(ctx, false);
2908         if (io_cqring_events(ctx))
2909                 goto out;
2910         do {
2911                 /*
2912                  * If a submit got punted to a workqueue, we can have the
2913                  * application entering polling for a command before it gets
2914                  * issued. That app will hold the uring_lock for the duration
2915                  * of the poll right here, so we need to take a breather every
2916                  * now and then to ensure that the issue has a chance to add
2917                  * the poll to the issued list. Otherwise we can spin here
2918                  * forever, while the workqueue is stuck trying to acquire the
2919                  * very same mutex.
2920                  */
2921                 if (wq_list_empty(&ctx->iopoll_list)) {
2922                         u32 tail = ctx->cached_cq_tail;
2923
2924                         mutex_unlock(&ctx->uring_lock);
2925                         io_run_task_work();
2926                         mutex_lock(&ctx->uring_lock);
2927
2928                         /* some requests don't go through iopoll_list */
2929                         if (tail != ctx->cached_cq_tail ||
2930                             wq_list_empty(&ctx->iopoll_list))
2931                                 break;
2932                 }
2933                 ret = io_do_iopoll(ctx, !min);
2934                 if (ret < 0)
2935                         break;
2936                 nr_events += ret;
2937                 ret = 0;
2938         } while (nr_events < min && !need_resched());
2939 out:
2940         mutex_unlock(&ctx->uring_lock);
2941         return ret;
2942 }
2943
2944 static void kiocb_end_write(struct io_kiocb *req)
2945 {
2946         /*
2947          * Tell lockdep we inherited freeze protection from submission
2948          * thread.
2949          */
2950         if (req->flags & REQ_F_ISREG) {
2951                 struct super_block *sb = file_inode(req->file)->i_sb;
2952
2953                 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2954                 sb_end_write(sb);
2955         }
2956 }
2957
2958 #ifdef CONFIG_BLOCK
2959 static bool io_resubmit_prep(struct io_kiocb *req)
2960 {
2961         struct io_async_rw *rw = req->async_data;
2962
2963         if (!req_has_async_data(req))
2964                 return !io_req_prep_async(req);
2965         iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
2966         return true;
2967 }
2968
2969 static bool io_rw_should_reissue(struct io_kiocb *req)
2970 {
2971         umode_t mode = file_inode(req->file)->i_mode;
2972         struct io_ring_ctx *ctx = req->ctx;
2973
2974         if (!S_ISBLK(mode) && !S_ISREG(mode))
2975                 return false;
2976         if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2977             !(ctx->flags & IORING_SETUP_IOPOLL)))
2978                 return false;
2979         /*
2980          * If ref is dying, we might be running poll reap from the exit work.
2981          * Don't attempt to reissue from that path, just let it fail with
2982          * -EAGAIN.
2983          */
2984         if (percpu_ref_is_dying(&ctx->refs))
2985                 return false;
2986         /*
2987          * Play it safe and assume not safe to re-import and reissue if we're
2988          * not in the original thread group (or in task context).
2989          */
2990         if (!same_thread_group(req->task, current) || !in_task())
2991                 return false;
2992         return true;
2993 }
2994 #else
2995 static bool io_resubmit_prep(struct io_kiocb *req)
2996 {
2997         return false;
2998 }
2999 static bool io_rw_should_reissue(struct io_kiocb *req)
3000 {
3001         return false;
3002 }
3003 #endif
3004
3005 static bool __io_complete_rw_common(struct io_kiocb *req, long res)
3006 {
3007         if (req->rw.kiocb.ki_flags & IOCB_WRITE) {
3008                 kiocb_end_write(req);
3009                 fsnotify_modify(req->file);
3010         } else {
3011                 fsnotify_access(req->file);
3012         }
3013         if (unlikely(res != req->result)) {
3014                 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
3015                     io_rw_should_reissue(req)) {
3016                         req->flags |= REQ_F_REISSUE;
3017                         return true;
3018                 }
3019                 req_set_fail(req);
3020                 req->result = res;
3021         }
3022         return false;
3023 }
3024
3025 static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
3026 {
3027         int res = req->result;
3028
3029         if (*locked) {
3030                 io_req_complete_state(req, res, io_put_kbuf(req, 0));
3031                 io_req_add_compl_list(req);
3032         } else {
3033                 io_req_complete_post(req, res,
3034                                         io_put_kbuf(req, IO_URING_F_UNLOCKED));
3035         }
3036 }
3037
3038 static void __io_complete_rw(struct io_kiocb *req, long res,
3039                              unsigned int issue_flags)
3040 {
3041         if (__io_complete_rw_common(req, res))
3042                 return;
3043         __io_req_complete(req, issue_flags, req->result,
3044                                 io_put_kbuf(req, issue_flags));
3045 }
3046
3047 static void io_complete_rw(struct kiocb *kiocb, long res)
3048 {
3049         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
3050
3051         if (__io_complete_rw_common(req, res))
3052                 return;
3053         req->result = res;
3054         req->io_task_work.func = io_req_task_complete;
3055         io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
3056 }
3057
3058 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
3059 {
3060         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
3061
3062         if (kiocb->ki_flags & IOCB_WRITE)
3063                 kiocb_end_write(req);
3064         if (unlikely(res != req->result)) {
3065                 if (res == -EAGAIN && io_rw_should_reissue(req)) {
3066                         req->flags |= REQ_F_REISSUE;
3067                         return;
3068                 }
3069                 req->result = res;
3070         }
3071
3072         /* order with io_iopoll_complete() checking ->iopoll_completed */
3073         smp_store_release(&req->iopoll_completed, 1);
3074 }
3075
3076 /*
3077  * After the iocb has been issued, it's safe to be found on the poll list.
3078  * Adding the kiocb to the list AFTER submission ensures that we don't
3079  * find it from a io_do_iopoll() thread before the issuer is done
3080  * accessing the kiocb cookie.
3081  */
3082 static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
3083 {
3084         struct io_ring_ctx *ctx = req->ctx;
3085         const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
3086
3087         /* workqueue context doesn't hold uring_lock, grab it now */
3088         if (unlikely(needs_lock))
3089                 mutex_lock(&ctx->uring_lock);
3090
3091         /*
3092          * Track whether we have multiple files in our lists. This will impact
3093          * how we do polling eventually, not spinning if we're on potentially
3094          * different devices.
3095          */
3096         if (wq_list_empty(&ctx->iopoll_list)) {
3097                 ctx->poll_multi_queue = false;
3098         } else if (!ctx->poll_multi_queue) {
3099                 struct io_kiocb *list_req;
3100
3101                 list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
3102                                         comp_list);
3103                 if (list_req->file != req->file)
3104                         ctx->poll_multi_queue = true;
3105         }
3106
3107         /*
3108          * For fast devices, IO may have already completed. If it has, add
3109          * it to the front so we find it first.
3110          */
3111         if (READ_ONCE(req->iopoll_completed))
3112                 wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
3113         else
3114                 wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
3115
3116         if (unlikely(needs_lock)) {
3117                 /*
3118                  * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
3119                  * in sq thread task context or in io worker task context. If
3120                  * current task context is sq thread, we don't need to check
3121                  * whether should wake up sq thread.
3122                  */
3123                 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
3124                     wq_has_sleeper(&ctx->sq_data->wait))
3125                         wake_up(&ctx->sq_data->wait);
3126
3127                 mutex_unlock(&ctx->uring_lock);
3128         }
3129 }
3130
3131 static bool io_bdev_nowait(struct block_device *bdev)
3132 {
3133         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
3134 }
3135
3136 /*
3137  * If we tracked the file through the SCM inflight mechanism, we could support
3138  * any file. For now, just ensure that anything potentially problematic is done
3139  * inline.
3140  */
3141 static bool __io_file_supports_nowait(struct file *file, umode_t mode)
3142 {
3143         if (S_ISBLK(mode)) {
3144                 if (IS_ENABLED(CONFIG_BLOCK) &&
3145                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
3146                         return true;
3147                 return false;
3148         }
3149         if (S_ISSOCK(mode))
3150                 return true;
3151         if (S_ISREG(mode)) {
3152                 if (IS_ENABLED(CONFIG_BLOCK) &&
3153                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
3154                     file->f_op != &io_uring_fops)
3155                         return true;
3156                 return false;
3157         }
3158
3159         /* any ->read/write should understand O_NONBLOCK */
3160         if (file->f_flags & O_NONBLOCK)
3161                 return true;
3162         return file->f_mode & FMODE_NOWAIT;
3163 }
3164
3165 /*
3166  * If we tracked the file through the SCM inflight mechanism, we could support
3167  * any file. For now, just ensure that anything potentially problematic is done
3168  * inline.
3169  */
3170 static unsigned int io_file_get_flags(struct file *file)
3171 {
3172         umode_t mode = file_inode(file)->i_mode;
3173         unsigned int res = 0;
3174
3175         if (S_ISREG(mode))
3176                 res |= FFS_ISREG;
3177         if (__io_file_supports_nowait(file, mode))
3178                 res |= FFS_NOWAIT;
3179         return res;
3180 }
3181
3182 static inline bool io_file_supports_nowait(struct io_kiocb *req)
3183 {
3184         return req->flags & REQ_F_SUPPORT_NOWAIT;
3185 }
3186
3187 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3188 {
3189         struct io_ring_ctx *ctx = req->ctx;
3190         struct kiocb *kiocb = &req->rw.kiocb;
3191         struct file *file = req->file;
3192         unsigned ioprio;
3193         int ret;
3194
3195         if (!io_req_ffs_set(req))
3196                 req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
3197
3198         kiocb->ki_pos = READ_ONCE(sqe->off);
3199         kiocb->ki_flags = iocb_flags(file);
3200         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
3201         if (unlikely(ret))
3202                 return ret;
3203
3204         /*
3205          * If the file is marked O_NONBLOCK, still allow retry for it if it
3206          * supports async. Otherwise it's impossible to use O_NONBLOCK files
3207          * reliably. If not, or it IOCB_NOWAIT is set, don't retry.
3208          */
3209         if ((kiocb->ki_flags & IOCB_NOWAIT) ||
3210             ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
3211                 req->flags |= REQ_F_NOWAIT;
3212
3213         if (ctx->flags & IORING_SETUP_IOPOLL) {
3214                 if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
3215                         return -EOPNOTSUPP;
3216
3217                 kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
3218                 kiocb->ki_complete = io_complete_rw_iopoll;
3219                 req->iopoll_completed = 0;
3220         } else {
3221                 if (kiocb->ki_flags & IOCB_HIPRI)
3222                         return -EINVAL;
3223                 kiocb->ki_complete = io_complete_rw;
3224         }
3225
3226         ioprio = READ_ONCE(sqe->ioprio);
3227         if (ioprio) {
3228                 ret = ioprio_check_cap(ioprio);
3229                 if (ret)
3230                         return ret;
3231
3232                 kiocb->ki_ioprio = ioprio;
3233         } else {
3234                 kiocb->ki_ioprio = get_current_ioprio();
3235         }
3236
3237         req->imu = NULL;
3238         req->rw.addr = READ_ONCE(sqe->addr);
3239         req->rw.len = READ_ONCE(sqe->len);
3240         req->buf_index = READ_ONCE(sqe->buf_index);
3241         return 0;
3242 }
3243
3244 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
3245 {
3246         switch (ret) {
3247         case -EIOCBQUEUED:
3248                 break;
3249         case -ERESTARTSYS:
3250         case -ERESTARTNOINTR:
3251         case -ERESTARTNOHAND:
3252         case -ERESTART_RESTARTBLOCK:
3253                 /*
3254                  * We can't just restart the syscall, since previously
3255                  * submitted sqes may already be in progress. Just fail this
3256                  * IO with EINTR.
3257                  */
3258                 ret = -EINTR;
3259                 fallthrough;
3260         default:
3261                 kiocb->ki_complete(kiocb, ret);
3262         }
3263 }
3264
3265 static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
3266 {
3267         struct kiocb *kiocb = &req->rw.kiocb;
3268         bool is_stream = req->file->f_mode & FMODE_STREAM;
3269
3270         if (kiocb->ki_pos == -1) {
3271                 if (!is_stream) {
3272                         req->flags |= REQ_F_CUR_POS;
3273                         kiocb->ki_pos = req->file->f_pos;
3274                         return &kiocb->ki_pos;
3275                 } else {
3276                         kiocb->ki_pos = 0;
3277                         return NULL;
3278                 }
3279         }
3280         return is_stream ? NULL : &kiocb->ki_pos;
3281 }
3282
3283 static void kiocb_done(struct io_kiocb *req, ssize_t ret,
3284                        unsigned int issue_flags)
3285 {
3286         struct io_async_rw *io = req->async_data;
3287
3288         /* add previously done IO, if any */
3289         if (req_has_async_data(req) && io->bytes_done > 0) {
3290                 if (ret < 0)
3291                         ret = io->bytes_done;
3292                 else
3293                         ret += io->bytes_done;
3294         }
3295
3296         if (req->flags & REQ_F_CUR_POS)
3297                 req->file->f_pos = req->rw.kiocb.ki_pos;
3298         if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw))
3299                 __io_complete_rw(req, ret, issue_flags);
3300         else
3301                 io_rw_done(&req->rw.kiocb, ret);
3302
3303         if (req->flags & REQ_F_REISSUE) {
3304                 req->flags &= ~REQ_F_REISSUE;
3305                 if (io_resubmit_prep(req))
3306                         io_req_task_queue_reissue(req);
3307                 else
3308                         io_req_task_queue_fail(req, ret);
3309         }
3310 }
3311
3312 static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3313                              struct io_mapped_ubuf *imu)
3314 {
3315         size_t len = req->rw.len;
3316         u64 buf_end, buf_addr = req->rw.addr;
3317         size_t offset;
3318
3319         if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
3320                 return -EFAULT;
3321         /* not inside the mapped region */
3322         if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
3323                 return -EFAULT;
3324
3325         /*
3326          * May not be a start of buffer, set size appropriately
3327          * and advance us to the beginning.
3328          */
3329         offset = buf_addr - imu->ubuf;
3330         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
3331
3332         if (offset) {
3333                 /*
3334                  * Don't use iov_iter_advance() here, as it's really slow for
3335                  * using the latter parts of a big fixed buffer - it iterates
3336                  * over each segment manually. We can cheat a bit here, because
3337                  * we know that:
3338                  *
3339                  * 1) it's a BVEC iter, we set it up
3340                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
3341                  *    first and last bvec
3342                  *
3343                  * So just find our index, and adjust the iterator afterwards.
3344                  * If the offset is within the first bvec (or the whole first
3345                  * bvec, just use iov_iter_advance(). This makes it easier
3346                  * since we can just skip the first segment, which may not
3347                  * be PAGE_SIZE aligned.
3348                  */
3349                 const struct bio_vec *bvec = imu->bvec;
3350
3351                 if (offset <= bvec->bv_len) {
3352                         iov_iter_advance(iter, offset);
3353                 } else {
3354                         unsigned long seg_skip;
3355
3356                         /* skip first vec */
3357                         offset -= bvec->bv_len;
3358                         seg_skip = 1 + (offset >> PAGE_SHIFT);
3359
3360                         iter->bvec = bvec + seg_skip;
3361                         iter->nr_segs -= seg_skip;
3362                         iter->count -= bvec->bv_len + offset;
3363                         iter->iov_offset = offset & ~PAGE_MASK;
3364                 }
3365         }
3366
3367         return 0;
3368 }
3369
3370 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
3371 {
3372         struct io_mapped_ubuf *imu = req->imu;
3373         u16 index, buf_index = req->buf_index;
3374
3375         if (likely(!imu)) {
3376                 struct io_ring_ctx *ctx = req->ctx;
3377
3378                 if (unlikely(buf_index >= ctx->nr_user_bufs))
3379                         return -EFAULT;
3380                 io_req_set_rsrc_node(req, ctx);
3381                 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3382                 imu = READ_ONCE(ctx->user_bufs[index]);
3383                 req->imu = imu;
3384         }
3385         return __io_import_fixed(req, rw, iter, imu);
3386 }
3387
3388 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
3389 {
3390         if (needs_lock)
3391                 mutex_unlock(&ctx->uring_lock);
3392 }
3393
3394 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
3395 {
3396         /*
3397          * "Normal" inline submissions always hold the uring_lock, since we
3398          * grab it from the system call. Same is true for the SQPOLL offload.
3399          * The only exception is when we've detached the request and issue it
3400          * from an async worker thread, grab the lock for that case.
3401          */
3402         if (needs_lock)
3403                 mutex_lock(&ctx->uring_lock);
3404 }
3405
3406 static void io_buffer_add_list(struct io_ring_ctx *ctx,
3407                                struct io_buffer_list *bl, unsigned int bgid)
3408 {
3409         struct list_head *list;
3410
3411         list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
3412         INIT_LIST_HEAD(&bl->buf_list);
3413         bl->bgid = bgid;
3414         list_add(&bl->list, list);
3415 }
3416
3417 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
3418                                           int bgid, unsigned int issue_flags)
3419 {
3420         struct io_buffer *kbuf = req->kbuf;
3421         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
3422         struct io_ring_ctx *ctx = req->ctx;
3423         struct io_buffer_list *bl;
3424
3425         if (req->flags & REQ_F_BUFFER_SELECTED)
3426                 return kbuf;
3427
3428         io_ring_submit_lock(ctx, needs_lock);
3429
3430         lockdep_assert_held(&ctx->uring_lock);
3431
3432         bl = io_buffer_get_list(ctx, bgid);
3433         if (bl && !list_empty(&bl->buf_list)) {
3434                 kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
3435                 list_del(&kbuf->list);
3436                 if (*len > kbuf->len)
3437                         *len = kbuf->len;
3438                 req->flags |= REQ_F_BUFFER_SELECTED;
3439                 req->kbuf = kbuf;
3440         } else {
3441                 kbuf = ERR_PTR(-ENOBUFS);
3442         }
3443
3444         io_ring_submit_unlock(req->ctx, needs_lock);
3445         return kbuf;
3446 }
3447
3448 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3449                                         unsigned int issue_flags)
3450 {
3451         struct io_buffer *kbuf;
3452         u16 bgid;
3453
3454         bgid = req->buf_index;
3455         kbuf = io_buffer_select(req, len, bgid, issue_flags);
3456         if (IS_ERR(kbuf))
3457                 return kbuf;
3458         return u64_to_user_ptr(kbuf->addr);
3459 }
3460
3461 #ifdef CONFIG_COMPAT
3462 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3463                                 unsigned int issue_flags)
3464 {
3465         struct compat_iovec __user *uiov;
3466         compat_ssize_t clen;
3467         void __user *buf;
3468         ssize_t len;
3469
3470         uiov = u64_to_user_ptr(req->rw.addr);
3471         if (!access_ok(uiov, sizeof(*uiov)))
3472                 return -EFAULT;
3473         if (__get_user(clen, &uiov->iov_len))
3474                 return -EFAULT;
3475         if (clen < 0)
3476                 return -EINVAL;
3477
3478         len = clen;
3479         buf = io_rw_buffer_select(req, &len, issue_flags);
3480         if (IS_ERR(buf))
3481                 return PTR_ERR(buf);
3482         iov[0].iov_base = buf;
3483         iov[0].iov_len = (compat_size_t) len;
3484         return 0;
3485 }
3486 #endif
3487
3488 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3489                                       unsigned int issue_flags)
3490 {
3491         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3492         void __user *buf;
3493         ssize_t len;
3494
3495         if (copy_from_user(iov, uiov, sizeof(*uiov)))
3496                 return -EFAULT;
3497
3498         len = iov[0].iov_len;
3499         if (len < 0)
3500                 return -EINVAL;
3501         buf = io_rw_buffer_select(req, &len, issue_flags);
3502         if (IS_ERR(buf))
3503                 return PTR_ERR(buf);
3504         iov[0].iov_base = buf;
3505         iov[0].iov_len = len;
3506         return 0;
3507 }
3508
3509 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3510                                     unsigned int issue_flags)
3511 {
3512         if (req->flags & REQ_F_BUFFER_SELECTED) {
3513                 struct io_buffer *kbuf = req->kbuf;
3514
3515                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3516                 iov[0].iov_len = kbuf->len;
3517                 return 0;
3518         }
3519         if (req->rw.len != 1)
3520                 return -EINVAL;
3521
3522 #ifdef CONFIG_COMPAT
3523         if (req->ctx->compat)
3524                 return io_compat_import(req, iov, issue_flags);
3525 #endif
3526
3527         return __io_iov_buffer_select(req, iov, issue_flags);
3528 }
3529
3530 static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
3531                                        struct io_rw_state *s,
3532                                        unsigned int issue_flags)
3533 {
3534         struct iov_iter *iter = &s->iter;
3535         u8 opcode = req->opcode;
3536         struct iovec *iovec;
3537         void __user *buf;
3538         size_t sqe_len;
3539         ssize_t ret;
3540
3541         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
3542                 ret = io_import_fixed(req, rw, iter);
3543                 if (ret)
3544                         return ERR_PTR(ret);
3545                 return NULL;
3546         }
3547
3548         /* buffer index only valid with fixed read/write, or buffer select  */
3549         if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
3550                 return ERR_PTR(-EINVAL);
3551
3552         buf = u64_to_user_ptr(req->rw.addr);
3553         sqe_len = req->rw.len;
3554
3555         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3556                 if (req->flags & REQ_F_BUFFER_SELECT) {
3557                         buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
3558                         if (IS_ERR(buf))
3559                                 return ERR_CAST(buf);
3560                         req->rw.len = sqe_len;
3561                 }
3562
3563                 ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
3564                 if (ret)
3565                         return ERR_PTR(ret);
3566                 return NULL;
3567         }
3568
3569         iovec = s->fast_iov;
3570         if (req->flags & REQ_F_BUFFER_SELECT) {
3571                 ret = io_iov_buffer_select(req, iovec, issue_flags);
3572                 if (ret)
3573                         return ERR_PTR(ret);
3574                 iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
3575                 return NULL;
3576         }
3577
3578         ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
3579                               req->ctx->compat);
3580         if (unlikely(ret < 0))
3581                 return ERR_PTR(ret);
3582         return iovec;
3583 }
3584
3585 static inline int io_import_iovec(int rw, struct io_kiocb *req,
3586                                   struct iovec **iovec, struct io_rw_state *s,
3587                                   unsigned int issue_flags)
3588 {
3589         *iovec = __io_import_iovec(rw, req, s, issue_flags);
3590         if (unlikely(IS_ERR(*iovec)))
3591                 return PTR_ERR(*iovec);
3592
3593         iov_iter_save_state(&s->iter, &s->iter_state);
3594         return 0;
3595 }
3596
3597 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3598 {
3599         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3600 }
3601
3602 /*
3603  * For files that don't have ->read_iter() and ->write_iter(), handle them
3604  * by looping over ->read() or ->write() manually.
3605  */
3606 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3607 {
3608         struct kiocb *kiocb = &req->rw.kiocb;
3609         struct file *file = req->file;
3610         ssize_t ret = 0;
3611         loff_t *ppos;
3612
3613         /*
3614          * Don't support polled IO through this interface, and we can't
3615          * support non-blocking either. For the latter, this just causes
3616          * the kiocb to be handled from an async context.
3617          */
3618         if (kiocb->ki_flags & IOCB_HIPRI)
3619                 return -EOPNOTSUPP;
3620         if ((kiocb->ki_flags & IOCB_NOWAIT) &&
3621             !(kiocb->ki_filp->f_flags & O_NONBLOCK))
3622                 return -EAGAIN;
3623
3624         ppos = io_kiocb_ppos(kiocb);
3625
3626         while (iov_iter_count(iter)) {
3627                 struct iovec iovec;
3628                 ssize_t nr;
3629
3630                 if (!iov_iter_is_bvec(iter)) {
3631                         iovec = iov_iter_iovec(iter);
3632                 } else {
3633                         iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3634                         iovec.iov_len = req->rw.len;
3635                 }
3636
3637                 if (rw == READ) {
3638                         nr = file->f_op->read(file, iovec.iov_base,
3639                                               iovec.iov_len, ppos);
3640                 } else {
3641                         nr = file->f_op->write(file, iovec.iov_base,
3642                                                iovec.iov_len, ppos);
3643                 }
3644
3645                 if (nr < 0) {
3646                         if (!ret)
3647                                 ret = nr;
3648                         break;
3649                 }
3650                 ret += nr;
3651                 if (!iov_iter_is_bvec(iter)) {
3652                         iov_iter_advance(iter, nr);
3653                 } else {
3654                         req->rw.addr += nr;
3655                         req->rw.len -= nr;
3656                         if (!req->rw.len)
3657                                 break;
3658                 }
3659                 if (nr != iovec.iov_len)
3660                         break;
3661         }
3662
3663         return ret;
3664 }
3665
3666 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3667                           const struct iovec *fast_iov, struct iov_iter *iter)
3668 {
3669         struct io_async_rw *rw = req->async_data;
3670
3671         memcpy(&rw->s.iter, iter, sizeof(*iter));
3672         rw->free_iovec = iovec;
3673         rw->bytes_done = 0;
3674         /* can only be fixed buffers, no need to do anything */
3675         if (iov_iter_is_bvec(iter))
3676                 return;
3677         if (!iovec) {
3678                 unsigned iov_off = 0;
3679
3680                 rw->s.iter.iov = rw->s.fast_iov;
3681                 if (iter->iov != fast_iov) {
3682                         iov_off = iter->iov - fast_iov;
3683                         rw->s.iter.iov += iov_off;
3684                 }
3685                 if (rw->s.fast_iov != fast_iov)
3686                         memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
3687                                sizeof(struct iovec) * iter->nr_segs);
3688         } else {
3689                 req->flags |= REQ_F_NEED_CLEANUP;
3690         }
3691 }
3692
3693 static inline bool io_alloc_async_data(struct io_kiocb *req)
3694 {
3695         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3696         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3697         if (req->async_data) {
3698                 req->flags |= REQ_F_ASYNC_DATA;
3699                 return false;
3700         }
3701         return true;
3702 }
3703
3704 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3705                              struct io_rw_state *s, bool force)
3706 {
3707         if (!force && !io_op_defs[req->opcode].needs_async_setup)
3708                 return 0;
3709         if (!req_has_async_data(req)) {
3710                 struct io_async_rw *iorw;
3711
3712                 if (io_alloc_async_data(req)) {
3713                         kfree(iovec);
3714                         return -ENOMEM;
3715                 }
3716
3717                 io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
3718                 iorw = req->async_data;
3719                 /* we've copied and mapped the iter, ensure state is saved */
3720                 iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
3721         }
3722         return 0;
3723 }
3724
3725 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3726 {
3727         struct io_async_rw *iorw = req->async_data;
3728         struct iovec *iov;
3729         int ret;
3730
3731         /* submission path, ->uring_lock should already be taken */
3732         ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
3733         if (unlikely(ret < 0))
3734                 return ret;
3735
3736         iorw->bytes_done = 0;
3737         iorw->free_iovec = iov;
3738         if (iov)
3739                 req->flags |= REQ_F_NEED_CLEANUP;
3740         return 0;
3741 }
3742
3743 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3744 {
3745         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3746                 return -EBADF;
3747         return io_prep_rw(req, sqe);
3748 }
3749
3750 /*
3751  * This is our waitqueue callback handler, registered through __folio_lock_async()
3752  * when we initially tried to do the IO with the iocb armed our waitqueue.
3753  * This gets called when the page is unlocked, and we generally expect that to
3754  * happen when the page IO is completed and the page is now uptodate. This will
3755  * queue a task_work based retry of the operation, attempting to copy the data
3756  * again. If the latter fails because the page was NOT uptodate, then we will
3757  * do a thread based blocking retry of the operation. That's the unexpected
3758  * slow path.
3759  */
3760 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3761                              int sync, void *arg)
3762 {
3763         struct wait_page_queue *wpq;
3764         struct io_kiocb *req = wait->private;
3765         struct wait_page_key *key = arg;
3766
3767         wpq = container_of(wait, struct wait_page_queue, wait);
3768
3769         if (!wake_page_match(wpq, key))
3770                 return 0;
3771
3772         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3773         list_del_init(&wait->entry);
3774         io_req_task_queue(req);
3775         return 1;
3776 }
3777
3778 /*
3779  * This controls whether a given IO request should be armed for async page
3780  * based retry. If we return false here, the request is handed to the async
3781  * worker threads for retry. If we're doing buffered reads on a regular file,
3782  * we prepare a private wait_page_queue entry and retry the operation. This
3783  * will either succeed because the page is now uptodate and unlocked, or it
3784  * will register a callback when the page is unlocked at IO completion. Through
3785  * that callback, io_uring uses task_work to setup a retry of the operation.
3786  * That retry will attempt the buffered read again. The retry will generally
3787  * succeed, or in rare cases where it fails, we then fall back to using the
3788  * async worker threads for a blocking retry.
3789  */
3790 static bool io_rw_should_retry(struct io_kiocb *req)
3791 {
3792         struct io_async_rw *rw = req->async_data;
3793         struct wait_page_queue *wait = &rw->wpq;
3794         struct kiocb *kiocb = &req->rw.kiocb;
3795
3796         /* never retry for NOWAIT, we just complete with -EAGAIN */
3797         if (req->flags & REQ_F_NOWAIT)
3798                 return false;
3799
3800         /* Only for buffered IO */
3801         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3802                 return false;
3803
3804         /*
3805          * just use poll if we can, and don't attempt if the fs doesn't
3806          * support callback based unlocks
3807          */
3808         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3809                 return false;
3810
3811         wait->wait.func = io_async_buf_func;
3812         wait->wait.private = req;
3813         wait->wait.flags = 0;
3814         INIT_LIST_HEAD(&wait->wait.entry);
3815         kiocb->ki_flags |= IOCB_WAITQ;
3816         kiocb->ki_flags &= ~IOCB_NOWAIT;
3817         kiocb->ki_waitq = wait;
3818         return true;
3819 }
3820
3821 static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3822 {
3823         if (likely(req->file->f_op->read_iter))
3824                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3825         else if (req->file->f_op->read)
3826                 return loop_rw_iter(READ, req, iter);
3827         else
3828                 return -EINVAL;
3829 }
3830
3831 static bool need_read_all(struct io_kiocb *req)
3832 {
3833         return req->flags & REQ_F_ISREG ||
3834                 S_ISBLK(file_inode(req->file)->i_mode);
3835 }
3836
3837 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3838 {
3839         struct io_rw_state __s, *s = &__s;
3840         struct iovec *iovec;
3841         struct kiocb *kiocb = &req->rw.kiocb;
3842         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3843         struct io_async_rw *rw;
3844         ssize_t ret, ret2;
3845         loff_t *ppos;
3846
3847         if (!req_has_async_data(req)) {
3848                 ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3849                 if (unlikely(ret < 0))
3850                         return ret;
3851         } else {
3852                 /*
3853                  * Safe and required to re-import if we're using provided
3854                  * buffers, as we dropped the selected one before retry.
3855                  */
3856                 if (req->flags & REQ_F_BUFFER_SELECT) {
3857                         ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3858                         if (unlikely(ret < 0))
3859                                 return ret;
3860                 }
3861
3862                 rw = req->async_data;
3863                 s = &rw->s;
3864                 /*
3865                  * We come here from an earlier attempt, restore our state to
3866                  * match in case it doesn't. It's cheap enough that we don't
3867                  * need to make this conditional.
3868                  */
3869                 iov_iter_restore(&s->iter, &s->iter_state);
3870                 iovec = NULL;
3871         }
3872         req->result = iov_iter_count(&s->iter);
3873
3874         if (force_nonblock) {
3875                 /* If the file doesn't support async, just async punt */
3876                 if (unlikely(!io_file_supports_nowait(req))) {
3877                         ret = io_setup_async_rw(req, iovec, s, true);
3878                         return ret ?: -EAGAIN;
3879                 }
3880                 kiocb->ki_flags |= IOCB_NOWAIT;
3881         } else {
3882                 /* Ensure we clear previously set non-block flag */
3883                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3884         }
3885
3886         ppos = io_kiocb_update_pos(req);
3887
3888         ret = rw_verify_area(READ, req->file, ppos, req->result);
3889         if (unlikely(ret)) {
3890                 kfree(iovec);
3891                 return ret;
3892         }
3893
3894         ret = io_iter_do_read(req, &s->iter);
3895
3896         if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
3897                 req->flags &= ~REQ_F_REISSUE;
3898                 /* if we can poll, just do that */
3899                 if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
3900                         return -EAGAIN;
3901                 /* IOPOLL retry should happen for io-wq threads */
3902                 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3903                         goto done;
3904                 /* no retry on NONBLOCK nor RWF_NOWAIT */
3905                 if (req->flags & REQ_F_NOWAIT)
3906                         goto done;
3907                 ret = 0;
3908         } else if (ret == -EIOCBQUEUED) {
3909                 goto out_free;
3910         } else if (ret == req->result || ret <= 0 || !force_nonblock ||
3911                    (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
3912                 /* read all, failed, already did sync or don't want to retry */
3913                 goto done;
3914         }
3915
3916         /*
3917          * Don't depend on the iter state matching what was consumed, or being
3918          * untouched in case of error. Restore it and we'll advance it
3919          * manually if we need to.
3920          */
3921         iov_iter_restore(&s->iter, &s->iter_state);
3922
3923         ret2 = io_setup_async_rw(req, iovec, s, true);
3924         if (ret2)
3925                 return ret2;
3926
3927         iovec = NULL;
3928         rw = req->async_data;
3929         s = &rw->s;
3930         /*
3931          * Now use our persistent iterator and state, if we aren't already.
3932          * We've restored and mapped the iter to match.
3933          */
3934
3935         do {
3936                 /*
3937                  * We end up here because of a partial read, either from
3938                  * above or inside this loop. Advance the iter by the bytes
3939                  * that were consumed.
3940                  */
3941                 iov_iter_advance(&s->iter, ret);
3942                 if (!iov_iter_count(&s->iter))
3943                         break;
3944                 rw->bytes_done += ret;
3945                 iov_iter_save_state(&s->iter, &s->iter_state);
3946
3947                 /* if we can retry, do so with the callbacks armed */
3948                 if (!io_rw_should_retry(req)) {
3949                         kiocb->ki_flags &= ~IOCB_WAITQ;
3950                         return -EAGAIN;
3951                 }
3952
3953                 /*
3954                  * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3955                  * we get -EIOCBQUEUED, then we'll get a notification when the
3956                  * desired page gets unlocked. We can also get a partial read
3957                  * here, and if we do, then just retry at the new offset.
3958                  */
3959                 ret = io_iter_do_read(req, &s->iter);
3960                 if (ret == -EIOCBQUEUED)
3961                         return 0;
3962                 /* we got some bytes, but not all. retry. */
3963                 kiocb->ki_flags &= ~IOCB_WAITQ;
3964                 iov_iter_restore(&s->iter, &s->iter_state);
3965         } while (ret > 0);
3966 done:
3967         kiocb_done(req, ret, issue_flags);
3968 out_free:
3969         /* it's faster to check here then delegate to kfree */
3970         if (iovec)
3971                 kfree(iovec);
3972         return 0;
3973 }
3974
3975 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3976 {
3977         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3978                 return -EBADF;
3979         return io_prep_rw(req, sqe);
3980 }
3981
3982 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3983 {
3984         struct io_rw_state __s, *s = &__s;
3985         struct iovec *iovec;
3986         struct kiocb *kiocb = &req->rw.kiocb;
3987         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3988         ssize_t ret, ret2;
3989         loff_t *ppos;
3990
3991         if (!req_has_async_data(req)) {
3992                 ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
3993                 if (unlikely(ret < 0))
3994                         return ret;
3995         } else {
3996                 struct io_async_rw *rw = req->async_data;
3997
3998                 s = &rw->s;
3999                 iov_iter_restore(&s->iter, &s->iter_state);
4000                 iovec = NULL;
4001         }
4002         req->result = iov_iter_count(&s->iter);
4003
4004         if (force_nonblock) {
4005                 /* If the file doesn't support async, just async punt */
4006                 if (unlikely(!io_file_supports_nowait(req)))
4007                         goto copy_iov;
4008
4009                 /* file path doesn't support NOWAIT for non-direct_IO */
4010                 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
4011                     (req->flags & REQ_F_ISREG))
4012                         goto copy_iov;
4013
4014                 kiocb->ki_flags |= IOCB_NOWAIT;
4015         } else {
4016                 /* Ensure we clear previously set non-block flag */
4017                 kiocb->ki_flags &= ~IOCB_NOWAIT;
4018         }
4019
4020         ppos = io_kiocb_update_pos(req);
4021
4022         ret = rw_verify_area(WRITE, req->file, ppos, req->result);
4023         if (unlikely(ret))
4024                 goto out_free;
4025
4026         /*
4027          * Open-code file_start_write here to grab freeze protection,
4028          * which will be released by another thread in
4029          * io_complete_rw().  Fool lockdep by telling it the lock got
4030          * released so that it doesn't complain about the held lock when
4031          * we return to userspace.
4032          */
4033         if (req->flags & REQ_F_ISREG) {
4034                 sb_start_write(file_inode(req->file)->i_sb);
4035                 __sb_writers_release(file_inode(req->file)->i_sb,
4036                                         SB_FREEZE_WRITE);
4037         }
4038         kiocb->ki_flags |= IOCB_WRITE;
4039
4040         if (likely(req->file->f_op->write_iter))
4041                 ret2 = call_write_iter(req->file, kiocb, &s->iter);
4042         else if (req->file->f_op->write)
4043                 ret2 = loop_rw_iter(WRITE, req, &s->iter);
4044         else
4045                 ret2 = -EINVAL;
4046
4047         if (req->flags & REQ_F_REISSUE) {
4048                 req->flags &= ~REQ_F_REISSUE;
4049                 ret2 = -EAGAIN;
4050         }
4051
4052         /*
4053          * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
4054          * retry them without IOCB_NOWAIT.
4055          */
4056         if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
4057                 ret2 = -EAGAIN;
4058         /* no retry on NONBLOCK nor RWF_NOWAIT */
4059         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
4060                 goto done;
4061         if (!force_nonblock || ret2 != -EAGAIN) {
4062                 /* IOPOLL retry should happen for io-wq threads */
4063                 if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
4064                         goto copy_iov;
4065 done:
4066                 kiocb_done(req, ret2, issue_flags);
4067         } else {
4068 copy_iov:
4069                 iov_iter_restore(&s->iter, &s->iter_state);
4070                 ret = io_setup_async_rw(req, iovec, s, false);
4071                 return ret ?: -EAGAIN;
4072         }
4073 out_free:
4074         /* it's reportedly faster than delegating the null check to kfree() */
4075         if (iovec)
4076                 kfree(iovec);
4077         return ret;
4078 }
4079
4080 static int io_renameat_prep(struct io_kiocb *req,
4081                             const struct io_uring_sqe *sqe)
4082 {
4083         struct io_rename *ren = &req->rename;
4084         const char __user *oldf, *newf;
4085
4086         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4087                 return -EINVAL;
4088         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4089                 return -EINVAL;
4090         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4091                 return -EBADF;
4092
4093         ren->old_dfd = READ_ONCE(sqe->fd);
4094         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4095         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4096         ren->new_dfd = READ_ONCE(sqe->len);
4097         ren->flags = READ_ONCE(sqe->rename_flags);
4098
4099         ren->oldpath = getname(oldf);
4100         if (IS_ERR(ren->oldpath))
4101                 return PTR_ERR(ren->oldpath);
4102
4103         ren->newpath = getname(newf);
4104         if (IS_ERR(ren->newpath)) {
4105                 putname(ren->oldpath);
4106                 return PTR_ERR(ren->newpath);
4107         }
4108
4109         req->flags |= REQ_F_NEED_CLEANUP;
4110         return 0;
4111 }
4112
4113 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
4114 {
4115         struct io_rename *ren = &req->rename;
4116         int ret;
4117
4118         if (issue_flags & IO_URING_F_NONBLOCK)
4119                 return -EAGAIN;
4120
4121         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
4122                                 ren->newpath, ren->flags);
4123
4124         req->flags &= ~REQ_F_NEED_CLEANUP;
4125         if (ret < 0)
4126                 req_set_fail(req);
4127         io_req_complete(req, ret);
4128         return 0;
4129 }
4130
4131 static int io_unlinkat_prep(struct io_kiocb *req,
4132                             const struct io_uring_sqe *sqe)
4133 {
4134         struct io_unlink *un = &req->unlink;
4135         const char __user *fname;
4136
4137         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4138                 return -EINVAL;
4139         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4140             sqe->splice_fd_in)
4141                 return -EINVAL;
4142         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4143                 return -EBADF;
4144
4145         un->dfd = READ_ONCE(sqe->fd);
4146
4147         un->flags = READ_ONCE(sqe->unlink_flags);
4148         if (un->flags & ~AT_REMOVEDIR)
4149                 return -EINVAL;
4150
4151         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4152         un->filename = getname(fname);
4153         if (IS_ERR(un->filename))
4154                 return PTR_ERR(un->filename);
4155
4156         req->flags |= REQ_F_NEED_CLEANUP;
4157         return 0;
4158 }
4159
4160 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
4161 {
4162         struct io_unlink *un = &req->unlink;
4163         int ret;
4164
4165         if (issue_flags & IO_URING_F_NONBLOCK)
4166                 return -EAGAIN;
4167
4168         if (un->flags & AT_REMOVEDIR)
4169                 ret = do_rmdir(un->dfd, un->filename);
4170         else
4171                 ret = do_unlinkat(un->dfd, un->filename);
4172
4173         req->flags &= ~REQ_F_NEED_CLEANUP;
4174         if (ret < 0)
4175                 req_set_fail(req);
4176         io_req_complete(req, ret);
4177         return 0;
4178 }
4179
4180 static int io_mkdirat_prep(struct io_kiocb *req,
4181                             const struct io_uring_sqe *sqe)
4182 {
4183         struct io_mkdir *mkd = &req->mkdir;
4184         const char __user *fname;
4185
4186         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4187                 return -EINVAL;
4188         if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
4189             sqe->splice_fd_in)
4190                 return -EINVAL;
4191         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4192                 return -EBADF;
4193
4194         mkd->dfd = READ_ONCE(sqe->fd);
4195         mkd->mode = READ_ONCE(sqe->len);
4196
4197         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4198         mkd->filename = getname(fname);
4199         if (IS_ERR(mkd->filename))
4200                 return PTR_ERR(mkd->filename);
4201
4202         req->flags |= REQ_F_NEED_CLEANUP;
4203         return 0;
4204 }
4205
4206 static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
4207 {
4208         struct io_mkdir *mkd = &req->mkdir;
4209         int ret;
4210
4211         if (issue_flags & IO_URING_F_NONBLOCK)
4212                 return -EAGAIN;
4213
4214         ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
4215
4216         req->flags &= ~REQ_F_NEED_CLEANUP;
4217         if (ret < 0)
4218                 req_set_fail(req);
4219         io_req_complete(req, ret);
4220         return 0;
4221 }
4222
4223 static int io_symlinkat_prep(struct io_kiocb *req,
4224                             const struct io_uring_sqe *sqe)
4225 {
4226         struct io_symlink *sl = &req->symlink;
4227         const char __user *oldpath, *newpath;
4228
4229         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4230                 return -EINVAL;
4231         if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
4232             sqe->splice_fd_in)
4233                 return -EINVAL;
4234         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4235                 return -EBADF;
4236
4237         sl->new_dfd = READ_ONCE(sqe->fd);
4238         oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
4239         newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4240
4241         sl->oldpath = getname(oldpath);
4242         if (IS_ERR(sl->oldpath))
4243                 return PTR_ERR(sl->oldpath);
4244
4245         sl->newpath = getname(newpath);
4246         if (IS_ERR(sl->newpath)) {
4247                 putname(sl->oldpath);
4248                 return PTR_ERR(sl->newpath);
4249         }
4250
4251         req->flags |= REQ_F_NEED_CLEANUP;
4252         return 0;
4253 }
4254
4255 static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
4256 {
4257         struct io_symlink *sl = &req->symlink;
4258         int ret;
4259
4260         if (issue_flags & IO_URING_F_NONBLOCK)
4261                 return -EAGAIN;
4262
4263         ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
4264
4265         req->flags &= ~REQ_F_NEED_CLEANUP;
4266         if (ret < 0)
4267                 req_set_fail(req);
4268         io_req_complete(req, ret);
4269         return 0;
4270 }
4271
4272 static int io_linkat_prep(struct io_kiocb *req,
4273                             const struct io_uring_sqe *sqe)
4274 {
4275         struct io_hardlink *lnk = &req->hardlink;
4276         const char __user *oldf, *newf;
4277
4278         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4279                 return -EINVAL;
4280         if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
4281                 return -EINVAL;
4282         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4283                 return -EBADF;
4284
4285         lnk->old_dfd = READ_ONCE(sqe->fd);
4286         lnk->new_dfd = READ_ONCE(sqe->len);
4287         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
4288         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4289         lnk->flags = READ_ONCE(sqe->hardlink_flags);
4290
4291         lnk->oldpath = getname(oldf);
4292         if (IS_ERR(lnk->oldpath))
4293                 return PTR_ERR(lnk->oldpath);
4294
4295         lnk->newpath = getname(newf);
4296         if (IS_ERR(lnk->newpath)) {
4297                 putname(lnk->oldpath);
4298                 return PTR_ERR(lnk->newpath);
4299         }
4300
4301         req->flags |= REQ_F_NEED_CLEANUP;
4302         return 0;
4303 }
4304
4305 static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
4306 {
4307         struct io_hardlink *lnk = &req->hardlink;
4308         int ret;
4309
4310         if (issue_flags & IO_URING_F_NONBLOCK)
4311                 return -EAGAIN;
4312
4313         ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
4314                                 lnk->newpath, lnk->flags);
4315
4316         req->flags &= ~REQ_F_NEED_CLEANUP;
4317         if (ret < 0)
4318                 req_set_fail(req);
4319         io_req_complete(req, ret);
4320         return 0;
4321 }
4322
4323 static int io_shutdown_prep(struct io_kiocb *req,
4324                             const struct io_uring_sqe *sqe)
4325 {
4326 #if defined(CONFIG_NET)
4327         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4328                 return -EINVAL;
4329         if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
4330                      sqe->buf_index || sqe->splice_fd_in))
4331                 return -EINVAL;
4332
4333         req->shutdown.how = READ_ONCE(sqe->len);
4334         return 0;
4335 #else
4336         return -EOPNOTSUPP;
4337 #endif
4338 }
4339
4340 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
4341 {
4342 #if defined(CONFIG_NET)
4343         struct socket *sock;
4344         int ret;
4345
4346         if (issue_flags & IO_URING_F_NONBLOCK)
4347                 return -EAGAIN;
4348
4349         sock = sock_from_file(req->file);
4350         if (unlikely(!sock))
4351                 return -ENOTSOCK;
4352
4353         ret = __sys_shutdown_sock(sock, req->shutdown.how);
4354         if (ret < 0)
4355                 req_set_fail(req);
4356         io_req_complete(req, ret);
4357         return 0;
4358 #else
4359         return -EOPNOTSUPP;
4360 #endif
4361 }
4362
4363 static int __io_splice_prep(struct io_kiocb *req,
4364                             const struct io_uring_sqe *sqe)
4365 {
4366         struct io_splice *sp = &req->splice;
4367         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
4368
4369         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4370                 return -EINVAL;
4371
4372         sp->file_in = NULL;
4373         sp->len = READ_ONCE(sqe->len);
4374         sp->flags = READ_ONCE(sqe->splice_flags);
4375
4376         if (unlikely(sp->flags & ~valid_flags))
4377                 return -EINVAL;
4378
4379         sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
4380                                   (sp->flags & SPLICE_F_FD_IN_FIXED));
4381         if (!sp->file_in)
4382                 return -EBADF;
4383         req->flags |= REQ_F_NEED_CLEANUP;
4384         return 0;
4385 }
4386
4387 static int io_tee_prep(struct io_kiocb *req,
4388                        const struct io_uring_sqe *sqe)
4389 {
4390         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
4391                 return -EINVAL;
4392         return __io_splice_prep(req, sqe);
4393 }
4394
4395 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
4396 {
4397         struct io_splice *sp = &req->splice;
4398         struct file *in = sp->file_in;
4399         struct file *out = sp->file_out;
4400         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4401         long ret = 0;
4402
4403         if (issue_flags & IO_URING_F_NONBLOCK)
4404                 return -EAGAIN;
4405         if (sp->len)
4406                 ret = do_tee(in, out, sp->len, flags);
4407
4408         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4409                 io_put_file(in);
4410         req->flags &= ~REQ_F_NEED_CLEANUP;
4411
4412         if (ret != sp->len)
4413                 req_set_fail(req);
4414         io_req_complete(req, ret);
4415         return 0;
4416 }
4417
4418 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4419 {
4420         struct io_splice *sp = &req->splice;
4421
4422         sp->off_in = READ_ONCE(sqe->splice_off_in);
4423         sp->off_out = READ_ONCE(sqe->off);
4424         return __io_splice_prep(req, sqe);
4425 }
4426
4427 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
4428 {
4429         struct io_splice *sp = &req->splice;
4430         struct file *in = sp->file_in;
4431         struct file *out = sp->file_out;
4432         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4433         loff_t *poff_in, *poff_out;
4434         long ret = 0;
4435
4436         if (issue_flags & IO_URING_F_NONBLOCK)
4437                 return -EAGAIN;
4438
4439         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
4440         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
4441
4442         if (sp->len)
4443                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
4444
4445         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4446                 io_put_file(in);
4447         req->flags &= ~REQ_F_NEED_CLEANUP;
4448
4449         if (ret != sp->len)
4450                 req_set_fail(req);
4451         io_req_complete(req, ret);
4452         return 0;
4453 }
4454
4455 /*
4456  * IORING_OP_NOP just posts a completion event, nothing else.
4457  */
4458 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
4459 {
4460         struct io_ring_ctx *ctx = req->ctx;
4461
4462         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4463                 return -EINVAL;
4464
4465         __io_req_complete(req, issue_flags, 0, 0);
4466         return 0;
4467 }
4468
4469 static int io_msg_ring_prep(struct io_kiocb *req,
4470                             const struct io_uring_sqe *sqe)
4471 {
4472         if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
4473                      sqe->splice_fd_in || sqe->buf_index || sqe->personality))
4474                 return -EINVAL;
4475
4476         req->msg.user_data = READ_ONCE(sqe->off);
4477         req->msg.len = READ_ONCE(sqe->len);
4478         return 0;
4479 }
4480
4481 static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
4482 {
4483         struct io_ring_ctx *target_ctx;
4484         struct io_msg *msg = &req->msg;
4485         bool filled;
4486         int ret;
4487
4488         ret = -EBADFD;
4489         if (req->file->f_op != &io_uring_fops)
4490                 goto done;
4491
4492         ret = -EOVERFLOW;
4493         target_ctx = req->file->private_data;
4494
4495         spin_lock(&target_ctx->completion_lock);
4496         filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0);
4497         io_commit_cqring(target_ctx);
4498         spin_unlock(&target_ctx->completion_lock);
4499
4500         if (filled) {
4501                 io_cqring_ev_posted(target_ctx);
4502                 ret = 0;
4503         }
4504
4505 done:
4506         if (ret < 0)
4507                 req_set_fail(req);
4508         __io_req_complete(req, issue_flags, ret, 0);
4509         return 0;
4510 }
4511
4512 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4513 {
4514         struct io_ring_ctx *ctx = req->ctx;
4515
4516         if (!req->file)
4517                 return -EBADF;
4518
4519         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4520                 return -EINVAL;
4521         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4522                      sqe->splice_fd_in))
4523                 return -EINVAL;
4524
4525         req->sync.flags = READ_ONCE(sqe->fsync_flags);
4526         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4527                 return -EINVAL;
4528
4529         req->sync.off = READ_ONCE(sqe->off);
4530         req->sync.len = READ_ONCE(sqe->len);
4531         return 0;
4532 }
4533
4534 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
4535 {
4536         loff_t end = req->sync.off + req->sync.len;
4537         int ret;
4538
4539         /* fsync always requires a blocking context */
4540         if (issue_flags & IO_URING_F_NONBLOCK)
4541                 return -EAGAIN;
4542
4543         ret = vfs_fsync_range(req->file, req->sync.off,
4544                                 end > 0 ? end : LLONG_MAX,
4545                                 req->sync.flags & IORING_FSYNC_DATASYNC);
4546         if (ret < 0)
4547                 req_set_fail(req);
4548         io_req_complete(req, ret);
4549         return 0;
4550 }
4551
4552 static int io_fallocate_prep(struct io_kiocb *req,
4553                              const struct io_uring_sqe *sqe)
4554 {
4555         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
4556             sqe->splice_fd_in)
4557                 return -EINVAL;
4558         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4559                 return -EINVAL;
4560
4561         req->sync.off = READ_ONCE(sqe->off);
4562         req->sync.len = READ_ONCE(sqe->addr);
4563         req->sync.mode = READ_ONCE(sqe->len);
4564         return 0;
4565 }
4566
4567 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
4568 {
4569         int ret;
4570
4571         /* fallocate always requiring blocking context */
4572         if (issue_flags & IO_URING_F_NONBLOCK)
4573                 return -EAGAIN;
4574         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4575                                 req->sync.len);
4576         if (ret < 0)
4577                 req_set_fail(req);
4578         else
4579                 fsnotify_modify(req->file);
4580         io_req_complete(req, ret);
4581         return 0;
4582 }
4583
4584 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4585 {
4586         const char __user *fname;
4587         int ret;
4588
4589         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4590                 return -EINVAL;
4591         if (unlikely(sqe->ioprio || sqe->buf_index))
4592                 return -EINVAL;
4593         if (unlikely(req->flags & REQ_F_FIXED_FILE))
4594                 return -EBADF;
4595
4596         /* open.how should be already initialised */
4597         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
4598                 req->open.how.flags |= O_LARGEFILE;
4599
4600         req->open.dfd = READ_ONCE(sqe->fd);
4601         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4602         req->open.filename = getname(fname);
4603         if (IS_ERR(req->open.filename)) {
4604                 ret = PTR_ERR(req->open.filename);
4605                 req->open.filename = NULL;
4606                 return ret;
4607         }
4608
4609         req->open.file_slot = READ_ONCE(sqe->file_index);
4610         if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
4611                 return -EINVAL;
4612
4613         req->open.nofile = rlimit(RLIMIT_NOFILE);
4614         req->flags |= REQ_F_NEED_CLEANUP;
4615         return 0;
4616 }
4617
4618 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4619 {
4620         u64 mode = READ_ONCE(sqe->len);
4621         u64 flags = READ_ONCE(sqe->open_flags);
4622
4623         req->open.how = build_open_how(flags, mode);
4624         return __io_openat_prep(req, sqe);
4625 }
4626
4627 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4628 {
4629         struct open_how __user *how;
4630         size_t len;
4631         int ret;
4632
4633         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4634         len = READ_ONCE(sqe->len);
4635         if (len < OPEN_HOW_SIZE_VER0)
4636                 return -EINVAL;
4637
4638         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4639                                         len);
4640         if (ret)
4641                 return ret;
4642
4643         return __io_openat_prep(req, sqe);
4644 }
4645
4646 static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
4647 {
4648         struct open_flags op;
4649         struct file *file;
4650         bool resolve_nonblock, nonblock_set;
4651         bool fixed = !!req->open.file_slot;
4652         int ret;
4653
4654         ret = build_open_flags(&req->open.how, &op);
4655         if (ret)
4656                 goto err;
4657         nonblock_set = op.open_flag & O_NONBLOCK;
4658         resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
4659         if (issue_flags & IO_URING_F_NONBLOCK) {
4660                 /*
4661                  * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4662                  * it'll always -EAGAIN
4663                  */
4664                 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4665                         return -EAGAIN;
4666                 op.lookup_flags |= LOOKUP_CACHED;
4667                 op.open_flag |= O_NONBLOCK;
4668         }
4669
4670         if (!fixed) {
4671                 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
4672                 if (ret < 0)
4673                         goto err;
4674         }
4675
4676         file = do_filp_open(req->open.dfd, req->open.filename, &op);
4677         if (IS_ERR(file)) {
4678                 /*
4679                  * We could hang on to this 'fd' on retrying, but seems like
4680                  * marginal gain for something that is now known to be a slower
4681                  * path. So just put it, and we'll get a new one when we retry.
4682                  */
4683                 if (!fixed)
4684                         put_unused_fd(ret);
4685
4686                 ret = PTR_ERR(file);
4687                 /* only retry if RESOLVE_CACHED wasn't already set by application */
4688                 if (ret == -EAGAIN &&
4689                     (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4690                         return -EAGAIN;
4691                 goto err;
4692         }
4693
4694         if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4695                 file->f_flags &= ~O_NONBLOCK;
4696         fsnotify_open(file);
4697
4698         if (!fixed)
4699                 fd_install(ret, file);
4700         else
4701                 ret = io_install_fixed_file(req, file, issue_flags,
4702                                             req->open.file_slot - 1);
4703 err:
4704         putname(req->open.filename);
4705         req->flags &= ~REQ_F_NEED_CLEANUP;
4706         if (ret < 0)
4707                 req_set_fail(req);
4708         __io_req_complete(req, issue_flags, ret, 0);
4709         return 0;
4710 }
4711
/* openat shares its issue path with openat2; only the preparation differs. */
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
        int rc = io_openat2(req, issue_flags);

        return rc;
}
4716
4717 static int io_remove_buffers_prep(struct io_kiocb *req,
4718                                   const struct io_uring_sqe *sqe)
4719 {
4720         struct io_provide_buf *p = &req->pbuf;
4721         u64 tmp;
4722
4723         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4724             sqe->splice_fd_in)
4725                 return -EINVAL;
4726
4727         tmp = READ_ONCE(sqe->fd);
4728         if (!tmp || tmp > USHRT_MAX)
4729                 return -EINVAL;
4730
4731         memset(p, 0, sizeof(*p));
4732         p->nbufs = tmp;
4733         p->bgid = READ_ONCE(sqe->buf_group);
4734         return 0;
4735 }
4736
4737 static int __io_remove_buffers(struct io_ring_ctx *ctx,
4738                                struct io_buffer_list *bl, unsigned nbufs)
4739 {
4740         unsigned i = 0;
4741
4742         /* shouldn't happen */
4743         if (!nbufs)
4744                 return 0;
4745
4746         /* the head kbuf is the list itself */
4747         while (!list_empty(&bl->buf_list)) {
4748                 struct io_buffer *nxt;
4749
4750                 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
4751                 list_del(&nxt->list);
4752                 if (++i == nbufs)
4753                         return i;
4754                 cond_resched();
4755         }
4756         i++;
4757
4758         return i;
4759 }
4760
4761 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
4762 {
4763         struct io_provide_buf *p = &req->pbuf;
4764         struct io_ring_ctx *ctx = req->ctx;
4765         struct io_buffer_list *bl;
4766         int ret = 0;
4767         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
4768
4769         io_ring_submit_lock(ctx, needs_lock);
4770
4771         lockdep_assert_held(&ctx->uring_lock);
4772
4773         ret = -ENOENT;
4774         bl = io_buffer_get_list(ctx, p->bgid);
4775         if (bl)
4776                 ret = __io_remove_buffers(ctx, bl, p->nbufs);
4777         if (ret < 0)
4778                 req_set_fail(req);
4779
4780         /* complete before unlock, IOPOLL may need the lock */
4781         __io_req_complete(req, issue_flags, ret, 0);
4782         io_ring_submit_unlock(ctx, needs_lock);
4783         return 0;
4784 }
4785
4786 static int io_provide_buffers_prep(struct io_kiocb *req,
4787                                    const struct io_uring_sqe *sqe)
4788 {
4789         unsigned long size, tmp_check;
4790         struct io_provide_buf *p = &req->pbuf;
4791         u64 tmp;
4792
4793         if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
4794                 return -EINVAL;
4795
4796         tmp = READ_ONCE(sqe->fd);
4797         if (!tmp || tmp > USHRT_MAX)
4798                 return -E2BIG;
4799         p->nbufs = tmp;
4800         p->addr = READ_ONCE(sqe->addr);
4801         p->len = READ_ONCE(sqe->len);
4802
4803         if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4804                                 &size))
4805                 return -EOVERFLOW;
4806         if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4807                 return -EOVERFLOW;
4808
4809         size = (unsigned long)p->len * p->nbufs;
4810         if (!access_ok(u64_to_user_ptr(p->addr), size))
4811                 return -EFAULT;
4812
4813         p->bgid = READ_ONCE(sqe->buf_group);
4814         tmp = READ_ONCE(sqe->off);
4815         if (tmp > USHRT_MAX)
4816                 return -E2BIG;
4817         p->bid = tmp;
4818         return 0;
4819 }
4820
4821 static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
4822 {
4823         struct io_buffer *buf;
4824         struct page *page;
4825         int bufs_in_page;
4826
4827         /*
4828          * Completions that don't happen inline (eg not under uring_lock) will
4829          * add to ->io_buffers_comp. If we don't have any free buffers, check
4830          * the completion list and splice those entries first.
4831          */
4832         if (!list_empty_careful(&ctx->io_buffers_comp)) {
4833                 spin_lock(&ctx->completion_lock);
4834                 if (!list_empty(&ctx->io_buffers_comp)) {
4835                         list_splice_init(&ctx->io_buffers_comp,
4836                                                 &ctx->io_buffers_cache);
4837                         spin_unlock(&ctx->completion_lock);
4838                         return 0;
4839                 }
4840                 spin_unlock(&ctx->completion_lock);
4841         }
4842
4843         /*
4844          * No free buffers and no completion entries either. Allocate a new
4845          * page worth of buffer entries and add those to our freelist.
4846          */
4847         page = alloc_page(GFP_KERNEL_ACCOUNT);
4848         if (!page)
4849                 return -ENOMEM;
4850
4851         list_add(&page->lru, &ctx->io_buffers_pages);
4852
4853         buf = page_address(page);
4854         bufs_in_page = PAGE_SIZE / sizeof(*buf);
4855         while (bufs_in_page) {
4856                 list_add_tail(&buf->list, &ctx->io_buffers_cache);
4857                 buf++;
4858                 bufs_in_page--;
4859         }
4860
4861         return 0;
4862 }
4863
4864 static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
4865                           struct io_buffer_list *bl)
4866 {
4867         struct io_buffer *buf;
4868         u64 addr = pbuf->addr;
4869         int i, bid = pbuf->bid;
4870
4871         for (i = 0; i < pbuf->nbufs; i++) {
4872                 if (list_empty(&ctx->io_buffers_cache) &&
4873                     io_refill_buffer_cache(ctx))
4874                         break;
4875                 buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
4876                                         list);
4877                 list_move_tail(&buf->list, &bl->buf_list);
4878                 buf->addr = addr;
4879                 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
4880                 buf->bid = bid;
4881                 buf->bgid = pbuf->bgid;
4882                 addr += pbuf->len;
4883                 bid++;
4884                 cond_resched();
4885         }
4886
4887         return i ? 0 : -ENOMEM;
4888 }
4889
4890 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4891 {
4892         struct io_provide_buf *p = &req->pbuf;
4893         struct io_ring_ctx *ctx = req->ctx;
4894         struct io_buffer_list *bl;
4895         int ret = 0;
4896         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
4897
4898         io_ring_submit_lock(ctx, needs_lock);
4899
4900         lockdep_assert_held(&ctx->uring_lock);
4901
4902         bl = io_buffer_get_list(ctx, p->bgid);
4903         if (unlikely(!bl)) {
4904                 bl = kmalloc(sizeof(*bl), GFP_KERNEL);
4905                 if (!bl) {
4906                         ret = -ENOMEM;
4907                         goto err;
4908                 }
4909                 io_buffer_add_list(ctx, bl, p->bgid);
4910         }
4911
4912         ret = io_add_buffers(ctx, p, bl);
4913 err:
4914         if (ret < 0)
4915                 req_set_fail(req);
4916         /* complete before unlock, IOPOLL may need the lock */
4917         __io_req_complete(req, issue_flags, ret, 0);
4918         io_ring_submit_unlock(ctx, needs_lock);
4919         return 0;
4920 }
4921
4922 static int io_epoll_ctl_prep(struct io_kiocb *req,
4923                              const struct io_uring_sqe *sqe)
4924 {
4925 #if defined(CONFIG_EPOLL)
4926         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4927                 return -EINVAL;
4928         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4929                 return -EINVAL;
4930
4931         req->epoll.epfd = READ_ONCE(sqe->fd);
4932         req->epoll.op = READ_ONCE(sqe->len);
4933         req->epoll.fd = READ_ONCE(sqe->off);
4934
4935         if (ep_op_has_event(req->epoll.op)) {
4936                 struct epoll_event __user *ev;
4937
4938                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4939                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4940                         return -EFAULT;
4941         }
4942
4943         return 0;
4944 #else
4945         return -EOPNOTSUPP;
4946 #endif
4947 }
4948
4949 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4950 {
4951 #if defined(CONFIG_EPOLL)
4952         struct io_epoll *ie = &req->epoll;
4953         int ret;
4954         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4955
4956         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4957         if (force_nonblock && ret == -EAGAIN)
4958                 return -EAGAIN;
4959
4960         if (ret < 0)
4961                 req_set_fail(req);
4962         __io_req_complete(req, issue_flags, ret, 0);
4963         return 0;
4964 #else
4965         return -EOPNOTSUPP;
4966 #endif
4967 }
4968
4969 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4970 {
4971 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4972         if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
4973                 return -EINVAL;
4974         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4975                 return -EINVAL;
4976
4977         req->madvise.addr = READ_ONCE(sqe->addr);
4978         req->madvise.len = READ_ONCE(sqe->len);
4979         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4980         return 0;
4981 #else
4982         return -EOPNOTSUPP;
4983 #endif
4984 }
4985
4986 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4987 {
4988 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4989         struct io_madvise *ma = &req->madvise;
4990         int ret;
4991
4992         if (issue_flags & IO_URING_F_NONBLOCK)
4993                 return -EAGAIN;
4994
4995         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4996         if (ret < 0)
4997                 req_set_fail(req);
4998         io_req_complete(req, ret);
4999         return 0;
5000 #else
5001         return -EOPNOTSUPP;
5002 #endif
5003 }
5004
5005 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5006 {
5007         if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
5008                 return -EINVAL;
5009         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5010                 return -EINVAL;
5011
5012         req->fadvise.offset = READ_ONCE(sqe->off);
5013         req->fadvise.len = READ_ONCE(sqe->len);
5014         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
5015         return 0;
5016 }
5017
5018 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
5019 {
5020         struct io_fadvise *fa = &req->fadvise;
5021         int ret;
5022
5023         if (issue_flags & IO_URING_F_NONBLOCK) {
5024                 switch (fa->advice) {
5025                 case POSIX_FADV_NORMAL:
5026                 case POSIX_FADV_RANDOM:
5027                 case POSIX_FADV_SEQUENTIAL:
5028                         break;
5029                 default:
5030                         return -EAGAIN;
5031                 }
5032         }
5033
5034         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
5035         if (ret < 0)
5036                 req_set_fail(req);
5037         __io_req_complete(req, issue_flags, ret, 0);
5038         return 0;
5039 }
5040
5041 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5042 {
5043         const char __user *path;
5044
5045         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5046                 return -EINVAL;
5047         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
5048                 return -EINVAL;
5049         if (req->flags & REQ_F_FIXED_FILE)
5050                 return -EBADF;
5051
5052         req->statx.dfd = READ_ONCE(sqe->fd);
5053         req->statx.mask = READ_ONCE(sqe->len);
5054         path = u64_to_user_ptr(READ_ONCE(sqe->addr));
5055         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5056         req->statx.flags = READ_ONCE(sqe->statx_flags);
5057
5058         req->statx.filename = getname_flags(path,
5059                                         getname_statx_lookup_flags(req->statx.flags),
5060                                         NULL);
5061
5062         if (IS_ERR(req->statx.filename)) {
5063                 int ret = PTR_ERR(req->statx.filename);
5064
5065                 req->statx.filename = NULL;
5066                 return ret;
5067         }
5068
5069         req->flags |= REQ_F_NEED_CLEANUP;
5070         return 0;
5071 }
5072
5073 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
5074 {
5075         struct io_statx *ctx = &req->statx;
5076         int ret;
5077
5078         if (issue_flags & IO_URING_F_NONBLOCK)
5079                 return -EAGAIN;
5080
5081         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
5082                        ctx->buffer);
5083
5084         if (ret < 0)
5085                 req_set_fail(req);
5086         io_req_complete(req, ret);
5087         return 0;
5088 }
5089
5090 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5091 {
5092         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5093                 return -EINVAL;
5094         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
5095             sqe->rw_flags || sqe->buf_index)
5096                 return -EINVAL;
5097         if (req->flags & REQ_F_FIXED_FILE)
5098                 return -EBADF;
5099
5100         req->close.fd = READ_ONCE(sqe->fd);
5101         req->close.file_slot = READ_ONCE(sqe->file_index);
5102         if (req->close.file_slot && req->close.fd)
5103                 return -EINVAL;
5104
5105         return 0;
5106 }
5107
5108 static int io_close(struct io_kiocb *req, unsigned int issue_flags)
5109 {
5110         struct files_struct *files = current->files;
5111         struct io_close *close = &req->close;
5112         struct fdtable *fdt;
5113         struct file *file = NULL;
5114         int ret = -EBADF;
5115
5116         if (req->close.file_slot) {
5117                 ret = io_close_fixed(req, issue_flags);
5118                 goto err;
5119         }
5120
5121         spin_lock(&files->file_lock);
5122         fdt = files_fdtable(files);
5123         if (close->fd >= fdt->max_fds) {
5124                 spin_unlock(&files->file_lock);
5125                 goto err;
5126         }
5127         file = fdt->fd[close->fd];
5128         if (!file || file->f_op == &io_uring_fops) {
5129                 spin_unlock(&files->file_lock);
5130                 file = NULL;
5131                 goto err;
5132         }
5133
5134         /* if the file has a flush method, be safe and punt to async */
5135         if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
5136                 spin_unlock(&files->file_lock);
5137                 return -EAGAIN;
5138         }
5139
5140         ret = __close_fd_get_file(close->fd, &file);
5141         spin_unlock(&files->file_lock);
5142         if (ret < 0) {
5143                 if (ret == -ENOENT)
5144                         ret = -EBADF;
5145                 goto err;
5146         }
5147
5148         /* No ->flush() or already async, safely close from here */
5149         ret = filp_close(file, current->files);
5150 err:
5151         if (ret < 0)
5152                 req_set_fail(req);
5153         if (file)
5154                 fput(file);
5155         __io_req_complete(req, issue_flags, ret, 0);
5156         return 0;
5157 }
5158
5159 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5160 {
5161         struct io_ring_ctx *ctx = req->ctx;
5162
5163         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
5164                 return -EINVAL;
5165         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
5166                      sqe->splice_fd_in))
5167                 return -EINVAL;
5168
5169         req->sync.off = READ_ONCE(sqe->off);
5170         req->sync.len = READ_ONCE(sqe->len);
5171         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
5172         return 0;
5173 }
5174
5175 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
5176 {
5177         int ret;
5178
5179         /* sync_file_range always requires a blocking context */
5180         if (issue_flags & IO_URING_F_NONBLOCK)
5181                 return -EAGAIN;
5182
5183         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
5184                                 req->sync.flags);
5185         if (ret < 0)
5186                 req_set_fail(req);
5187         io_req_complete(req, ret);
5188         return 0;
5189 }
5190
5191 #if defined(CONFIG_NET)
5192 static int io_setup_async_msg(struct io_kiocb *req,
5193                               struct io_async_msghdr *kmsg)
5194 {
5195         struct io_async_msghdr *async_msg = req->async_data;
5196
5197         if (async_msg)
5198                 return -EAGAIN;
5199         if (io_alloc_async_data(req)) {
5200                 kfree(kmsg->free_iov);
5201                 return -ENOMEM;
5202         }
5203         async_msg = req->async_data;
5204         req->flags |= REQ_F_NEED_CLEANUP;
5205         memcpy(async_msg, kmsg, sizeof(*kmsg));
5206         async_msg->msg.msg_name = &async_msg->addr;
5207         /* if were using fast_iov, set it to the new one */
5208         if (!async_msg->free_iov)
5209                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
5210
5211         return -EAGAIN;
5212 }
5213
5214 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
5215                                struct io_async_msghdr *iomsg)
5216 {
5217         iomsg->msg.msg_name = &iomsg->addr;
5218         iomsg->free_iov = iomsg->fast_iov;
5219         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
5220                                    req->sr_msg.msg_flags, &iomsg->free_iov);
5221 }
5222
5223 static int io_sendmsg_prep_async(struct io_kiocb *req)
5224 {
5225         int ret;
5226
5227         ret = io_sendmsg_copy_hdr(req, req->async_data);
5228         if (!ret)
5229                 req->flags |= REQ_F_NEED_CLEANUP;
5230         return ret;
5231 }
5232
5233 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5234 {
5235         struct io_sr_msg *sr = &req->sr_msg;
5236
5237         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5238                 return -EINVAL;
5239
5240         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
5241         sr->len = READ_ONCE(sqe->len);
5242         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5243         if (sr->msg_flags & MSG_DONTWAIT)
5244                 req->flags |= REQ_F_NOWAIT;
5245
5246 #ifdef CONFIG_COMPAT
5247         if (req->ctx->compat)
5248                 sr->msg_flags |= MSG_CMSG_COMPAT;
5249 #endif
5250         return 0;
5251 }
5252
5253 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
5254 {
5255         struct io_async_msghdr iomsg, *kmsg;
5256         struct socket *sock;
5257         unsigned flags;
5258         int min_ret = 0;
5259         int ret;
5260
5261         sock = sock_from_file(req->file);
5262         if (unlikely(!sock))
5263                 return -ENOTSOCK;
5264
5265         if (req_has_async_data(req)) {
5266                 kmsg = req->async_data;
5267         } else {
5268                 ret = io_sendmsg_copy_hdr(req, &iomsg);
5269                 if (ret)
5270                         return ret;
5271                 kmsg = &iomsg;
5272         }
5273
5274         flags = req->sr_msg.msg_flags;
5275         if (issue_flags & IO_URING_F_NONBLOCK)
5276                 flags |= MSG_DONTWAIT;
5277         if (flags & MSG_WAITALL)
5278                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5279
5280         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
5281
5282         if (ret < min_ret) {
5283                 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
5284                         return io_setup_async_msg(req, kmsg);
5285                 if (ret == -ERESTARTSYS)
5286                         ret = -EINTR;
5287                 req_set_fail(req);
5288         }
5289         /* fast path, check for non-NULL to avoid function call */
5290         if (kmsg->free_iov)
5291                 kfree(kmsg->free_iov);
5292         req->flags &= ~REQ_F_NEED_CLEANUP;
5293         __io_req_complete(req, issue_flags, ret, 0);
5294         return 0;
5295 }
5296
5297 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
5298 {
5299         struct io_sr_msg *sr = &req->sr_msg;
5300         struct msghdr msg;
5301         struct iovec iov;
5302         struct socket *sock;
5303         unsigned flags;
5304         int min_ret = 0;
5305         int ret;
5306
5307         sock = sock_from_file(req->file);
5308         if (unlikely(!sock))
5309                 return -ENOTSOCK;
5310
5311         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
5312         if (unlikely(ret))
5313                 return ret;
5314
5315         msg.msg_name = NULL;
5316         msg.msg_control = NULL;
5317         msg.msg_controllen = 0;
5318         msg.msg_namelen = 0;
5319
5320         flags = req->sr_msg.msg_flags;
5321         if (issue_flags & IO_URING_F_NONBLOCK)
5322                 flags |= MSG_DONTWAIT;
5323         if (flags & MSG_WAITALL)
5324                 min_ret = iov_iter_count(&msg.msg_iter);
5325
5326         msg.msg_flags = flags;
5327         ret = sock_sendmsg(sock, &msg);
5328         if (ret < min_ret) {
5329                 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
5330                         return -EAGAIN;
5331                 if (ret == -ERESTARTSYS)
5332                         ret = -EINTR;
5333                 req_set_fail(req);
5334         }
5335         __io_req_complete(req, issue_flags, ret, 0);
5336         return 0;
5337 }
5338
5339 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
5340                                  struct io_async_msghdr *iomsg)
5341 {
5342         struct io_sr_msg *sr = &req->sr_msg;
5343         struct iovec __user *uiov;
5344         size_t iov_len;
5345         int ret;
5346
5347         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
5348                                         &iomsg->uaddr, &uiov, &iov_len);
5349         if (ret)
5350                 return ret;
5351
5352         if (req->flags & REQ_F_BUFFER_SELECT) {
5353                 if (iov_len > 1)
5354                         return -EINVAL;
5355                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
5356                         return -EFAULT;
5357                 sr->len = iomsg->fast_iov[0].iov_len;
5358                 iomsg->free_iov = NULL;
5359         } else {
5360                 iomsg->free_iov = iomsg->fast_iov;
5361                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
5362                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
5363                                      false);
5364                 if (ret > 0)
5365                         ret = 0;
5366         }
5367
5368         return ret;
5369 }
5370
5371 #ifdef CONFIG_COMPAT
5372 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
5373                                         struct io_async_msghdr *iomsg)
5374 {
5375         struct io_sr_msg *sr = &req->sr_msg;
5376         struct compat_iovec __user *uiov;
5377         compat_uptr_t ptr;
5378         compat_size_t len;
5379         int ret;
5380
5381         ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
5382                                   &ptr, &len);
5383         if (ret)
5384                 return ret;
5385
5386         uiov = compat_ptr(ptr);
5387         if (req->flags & REQ_F_BUFFER_SELECT) {
5388                 compat_ssize_t clen;
5389
5390                 if (len > 1)
5391                         return -EINVAL;
5392                 if (!access_ok(uiov, sizeof(*uiov)))
5393                         return -EFAULT;
5394                 if (__get_user(clen, &uiov->iov_len))
5395                         return -EFAULT;
5396                 if (clen < 0)
5397                         return -EINVAL;
5398                 sr->len = clen;
5399                 iomsg->free_iov = NULL;
5400         } else {
5401                 iomsg->free_iov = iomsg->fast_iov;
5402                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
5403                                    UIO_FASTIOV, &iomsg->free_iov,
5404                                    &iomsg->msg.msg_iter, true);
5405                 if (ret < 0)
5406                         return ret;
5407         }
5408
5409         return 0;
5410 }
5411 #endif
5412
5413 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
5414                                struct io_async_msghdr *iomsg)
5415 {
5416         iomsg->msg.msg_name = &iomsg->addr;
5417
5418 #ifdef CONFIG_COMPAT
5419         if (req->ctx->compat)
5420                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
5421 #endif
5422
5423         return __io_recvmsg_copy_hdr(req, iomsg);
5424 }
5425
5426 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
5427                                                unsigned int issue_flags)
5428 {
5429         struct io_sr_msg *sr = &req->sr_msg;
5430
5431         return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
5432 }
5433
5434 static int io_recvmsg_prep_async(struct io_kiocb *req)
5435 {
5436         int ret;
5437
5438         ret = io_recvmsg_copy_hdr(req, req->async_data);
5439         if (!ret)
5440                 req->flags |= REQ_F_NEED_CLEANUP;
5441         return ret;
5442 }
5443
5444 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5445 {
5446         struct io_sr_msg *sr = &req->sr_msg;
5447
5448         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5449                 return -EINVAL;
5450
5451         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
5452         sr->len = READ_ONCE(sqe->len);
5453         sr->bgid = READ_ONCE(sqe->buf_group);
5454         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5455         if (sr->msg_flags & MSG_DONTWAIT)
5456                 req->flags |= REQ_F_NOWAIT;
5457
5458 #ifdef CONFIG_COMPAT
5459         if (req->ctx->compat)
5460                 sr->msg_flags |= MSG_CMSG_COMPAT;
5461 #endif
5462         sr->done_io = 0;
5463         return 0;
5464 }
5465
5466 static bool io_net_retry(struct socket *sock, int flags)
5467 {
5468         if (!(flags & MSG_WAITALL))
5469                 return false;
5470         return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
5471 }
5472
5473 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
5474 {
5475         struct io_async_msghdr iomsg, *kmsg;
5476         struct io_sr_msg *sr = &req->sr_msg;
5477         struct socket *sock;
5478         struct io_buffer *kbuf;
5479         unsigned flags;
5480         int ret, min_ret = 0;
5481         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5482
5483         sock = sock_from_file(req->file);
5484         if (unlikely(!sock))
5485                 return -ENOTSOCK;
5486
5487         if (req_has_async_data(req)) {
5488                 kmsg = req->async_data;
5489         } else {
5490                 ret = io_recvmsg_copy_hdr(req, &iomsg);
5491                 if (ret)
5492                         return ret;
5493                 kmsg = &iomsg;
5494         }
5495
5496         if (req->flags & REQ_F_BUFFER_SELECT) {
5497                 kbuf = io_recv_buffer_select(req, issue_flags);
5498                 if (IS_ERR(kbuf))
5499                         return PTR_ERR(kbuf);
5500                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5501                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
5502                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
5503                                 1, req->sr_msg.len);
5504         }
5505
5506         flags = req->sr_msg.msg_flags;
5507         if (force_nonblock)
5508                 flags |= MSG_DONTWAIT;
5509         if (flags & MSG_WAITALL)
5510                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5511
5512         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
5513                                         kmsg->uaddr, flags);
5514         if (ret < min_ret) {
5515                 if (ret == -EAGAIN && force_nonblock)
5516                         return io_setup_async_msg(req, kmsg);
5517                 if (ret == -ERESTARTSYS)
5518                         ret = -EINTR;
5519                 if (ret > 0 && io_net_retry(sock, flags)) {
5520                         sr->done_io += ret;
5521                         req->flags |= REQ_F_PARTIAL_IO;
5522                         return io_setup_async_msg(req, kmsg);
5523                 }
5524                 req_set_fail(req);
5525         } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5526                 req_set_fail(req);
5527         }
5528
5529         /* fast path, check for non-NULL to avoid function call */
5530         if (kmsg->free_iov)
5531                 kfree(kmsg->free_iov);
5532         req->flags &= ~REQ_F_NEED_CLEANUP;
5533         if (ret >= 0)
5534                 ret += sr->done_io;
5535         else if (sr->done_io)
5536                 ret = sr->done_io;
5537         __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
5538         return 0;
5539 }
5540
5541 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
5542 {
5543         struct io_buffer *kbuf;
5544         struct io_sr_msg *sr = &req->sr_msg;
5545         struct msghdr msg;
5546         void __user *buf = sr->buf;
5547         struct socket *sock;
5548         struct iovec iov;
5549         unsigned flags;
5550         int ret, min_ret = 0;
5551         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5552
5553         sock = sock_from_file(req->file);
5554         if (unlikely(!sock))
5555                 return -ENOTSOCK;
5556
5557         if (req->flags & REQ_F_BUFFER_SELECT) {
5558                 kbuf = io_recv_buffer_select(req, issue_flags);
5559                 if (IS_ERR(kbuf))
5560                         return PTR_ERR(kbuf);
5561                 buf = u64_to_user_ptr(kbuf->addr);
5562         }
5563
5564         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
5565         if (unlikely(ret))
5566                 goto out_free;
5567
5568         msg.msg_name = NULL;
5569         msg.msg_control = NULL;
5570         msg.msg_controllen = 0;
5571         msg.msg_namelen = 0;
5572         msg.msg_iocb = NULL;
5573         msg.msg_flags = 0;
5574
5575         flags = req->sr_msg.msg_flags;
5576         if (force_nonblock)
5577                 flags |= MSG_DONTWAIT;
5578         if (flags & MSG_WAITALL)
5579                 min_ret = iov_iter_count(&msg.msg_iter);
5580
5581         ret = sock_recvmsg(sock, &msg, flags);
5582         if (ret < min_ret) {
5583                 if (ret == -EAGAIN && force_nonblock)
5584                         return -EAGAIN;
5585                 if (ret == -ERESTARTSYS)
5586                         ret = -EINTR;
5587                 if (ret > 0 && io_net_retry(sock, flags)) {
5588                         sr->len -= ret;
5589                         sr->buf += ret;
5590                         sr->done_io += ret;
5591                         req->flags |= REQ_F_PARTIAL_IO;
5592                         return -EAGAIN;
5593                 }
5594                 req_set_fail(req);
5595         } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5596 out_free:
5597                 req_set_fail(req);
5598         }
5599
5600         if (ret >= 0)
5601                 ret += sr->done_io;
5602         else if (sr->done_io)
5603                 ret = sr->done_io;
5604         __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
5605         return 0;
5606 }
5607
5608 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5609 {
5610         struct io_accept *accept = &req->accept;
5611
5612         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5613                 return -EINVAL;
5614         if (sqe->ioprio || sqe->len || sqe->buf_index)
5615                 return -EINVAL;
5616
5617         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5618         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5619         accept->flags = READ_ONCE(sqe->accept_flags);
5620         accept->nofile = rlimit(RLIMIT_NOFILE);
5621
5622         accept->file_slot = READ_ONCE(sqe->file_index);
5623         if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
5624                 return -EINVAL;
5625         if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
5626                 return -EINVAL;
5627         if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
5628                 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
5629         return 0;
5630 }
5631
5632 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
5633 {
5634         struct io_accept *accept = &req->accept;
5635         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5636         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
5637         bool fixed = !!accept->file_slot;
5638         struct file *file;
5639         int ret, fd;
5640
5641         if (!fixed) {
5642                 fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5643                 if (unlikely(fd < 0))
5644                         return fd;
5645         }
5646         file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
5647                          accept->flags);
5648         if (IS_ERR(file)) {
5649                 if (!fixed)
5650                         put_unused_fd(fd);
5651                 ret = PTR_ERR(file);
5652                 if (ret == -EAGAIN && force_nonblock)
5653                         return -EAGAIN;
5654                 if (ret == -ERESTARTSYS)
5655                         ret = -EINTR;
5656                 req_set_fail(req);
5657         } else if (!fixed) {
5658                 fd_install(fd, file);
5659                 ret = fd;
5660         } else {
5661                 ret = io_install_fixed_file(req, file, issue_flags,
5662                                             accept->file_slot - 1);
5663         }
5664         __io_req_complete(req, issue_flags, ret, 0);
5665         return 0;
5666 }
5667
5668 static int io_connect_prep_async(struct io_kiocb *req)
5669 {
5670         struct io_async_connect *io = req->async_data;
5671         struct io_connect *conn = &req->connect;
5672
5673         return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5674 }
5675
5676 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5677 {
5678         struct io_connect *conn = &req->connect;
5679
5680         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5681                 return -EINVAL;
5682         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
5683             sqe->splice_fd_in)
5684                 return -EINVAL;
5685
5686         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5687         conn->addr_len =  READ_ONCE(sqe->addr2);
5688         return 0;
5689 }
5690
5691 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
5692 {
5693         struct io_async_connect __io, *io;
5694         unsigned file_flags;
5695         int ret;
5696         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5697
5698         if (req_has_async_data(req)) {
5699                 io = req->async_data;
5700         } else {
5701                 ret = move_addr_to_kernel(req->connect.addr,
5702                                                 req->connect.addr_len,
5703                                                 &__io.address);
5704                 if (ret)
5705                         goto out;
5706                 io = &__io;
5707         }
5708
5709         file_flags = force_nonblock ? O_NONBLOCK : 0;
5710
5711         ret = __sys_connect_file(req->file, &io->address,
5712                                         req->connect.addr_len, file_flags);
5713         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
5714                 if (req_has_async_data(req))
5715                         return -EAGAIN;
5716                 if (io_alloc_async_data(req)) {
5717                         ret = -ENOMEM;
5718                         goto out;
5719                 }
5720                 memcpy(req->async_data, &__io, sizeof(__io));
5721                 return -EAGAIN;
5722         }
5723         if (ret == -ERESTARTSYS)
5724                 ret = -EINTR;
5725 out:
5726         if (ret < 0)
5727                 req_set_fail(req);
5728         __io_req_complete(req, issue_flags, ret, 0);
5729         return 0;
5730 }
5731 #else /* !CONFIG_NET */
/*
 * Stub generators for kernels built without CONFIG_NET: every handler they
 * define simply returns -EOPNOTSUPP.
 */

/* Define the issue handler io_<op>(). */
#define IO_NETOP_FN(op)							\
static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
{									\
	return -EOPNOTSUPP;						\
}

/* Define io_<op>() and the SQE prep handler io_<op>_prep(). */
#define IO_NETOP_PREP(op)						\
IO_NETOP_FN(op)								\
static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
{									\
	return -EOPNOTSUPP;						\
}									\

/* Define io_<op>(), io_<op>_prep() and io_<op>_prep_async(). */
#define IO_NETOP_PREP_ASYNC(op)						\
IO_NETOP_PREP(op)							\
static int io_##op##_prep_async(struct io_kiocb *req)			\
{									\
	return -EOPNOTSUPP;						\
}

/* Instantiate the stubs for each networking handler. */
IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
5758 #endif /* CONFIG_NET */
5759
5760 #ifdef CONFIG_NET_RX_BUSY_POLL
5761
5762 #define NAPI_TIMEOUT                    (60 * SEC_CONVERSION)
5763
5764 struct napi_entry {
5765         struct list_head        list;
5766         unsigned int            napi_id;
5767         unsigned long           timeout;
5768 };
5769
5770 /*
5771  * Add busy poll NAPI ID from sk.
5772  */
5773 static void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
5774 {
5775         unsigned int napi_id;
5776         struct socket *sock;
5777         struct sock *sk;
5778         struct napi_entry *ne;
5779
5780         if (!net_busy_loop_on())
5781                 return;
5782
5783         sock = sock_from_file(file);
5784         if (!sock)
5785                 return;
5786
5787         sk = sock->sk;
5788         if (!sk)
5789                 return;
5790
5791         napi_id = READ_ONCE(sk->sk_napi_id);
5792
5793         /* Non-NAPI IDs can be rejected */
5794         if (napi_id < MIN_NAPI_ID)
5795                 return;
5796
5797         spin_lock(&ctx->napi_lock);
5798         list_for_each_entry(ne, &ctx->napi_list, list) {
5799                 if (ne->napi_id == napi_id) {
5800                         ne->timeout = jiffies + NAPI_TIMEOUT;
5801                         goto out;
5802                 }
5803         }
5804
5805         ne = kmalloc(sizeof(*ne), GFP_NOWAIT);
5806         if (!ne)
5807                 goto out;
5808
5809         ne->napi_id = napi_id;
5810         ne->timeout = jiffies + NAPI_TIMEOUT;
5811         list_add_tail(&ne->list, &ctx->napi_list);
5812 out:
5813         spin_unlock(&ctx->napi_lock);
5814 }
5815
5816 static inline void io_check_napi_entry_timeout(struct napi_entry *ne)
5817 {
5818         if (time_after(jiffies, ne->timeout)) {
5819                 list_del(&ne->list);
5820                 kfree(ne);
5821         }
5822 }
5823
5824 /*
5825  * Busy poll if globally on and supporting sockets found
5826  */
5827 static bool io_napi_busy_loop(struct list_head *napi_list)
5828 {
5829         struct napi_entry *ne, *n;
5830
5831         list_for_each_entry_safe(ne, n, napi_list, list) {
5832                 napi_busy_loop(ne->napi_id, NULL, NULL, true,
5833                                BUSY_POLL_BUDGET);
5834                 io_check_napi_entry_timeout(ne);
5835         }
5836         return !list_empty(napi_list);
5837 }
5838
5839 static void io_free_napi_list(struct io_ring_ctx *ctx)
5840 {
5841         spin_lock(&ctx->napi_lock);
5842         while (!list_empty(&ctx->napi_list)) {
5843                 struct napi_entry *ne =
5844                         list_first_entry(&ctx->napi_list, struct napi_entry,
5845                                          list);
5846
5847                 list_del(&ne->list);
5848                 kfree(ne);
5849         }
5850         spin_unlock(&ctx->napi_lock);
5851 }
5852 #else
/* CONFIG_NET_RX_BUSY_POLL is off: NAPI tracking is a no-op. */
static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
{
}

/* CONFIG_NET_RX_BUSY_POLL is off: there is no NAPI list to free. */
static inline void io_free_napi_list(struct io_ring_ctx *ctx)
{
}
5860 #endif /* CONFIG_NET_RX_BUSY_POLL */
5861
/*
 * Arming state handed to vfs_poll(); the queue callbacks recover it from
 * the embedded poll_table_struct via container_of().
 */
struct io_poll_table {
        /* poll table passed to vfs_poll() */
        struct poll_table_struct pt;
        /* request being armed */
        struct io_kiocb *req;
        /* number of waitqueue entries added by __io_queue_proc() */
        int nr_entries;
        /* negative errno recorded by __io_queue_proc(), 0 if none */
        int error;
};
5868
/* Layout of req->poll_refs: bit 31 flags cancellation, bits 30..0 count refs */
#define IO_POLL_CANCEL_FLAG     BIT(31)
#define IO_POLL_REF_MASK        GENMASK(30, 0)
5871
5872 /*
5873  * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
5874  * bump it and acquire ownership. It's disallowed to modify requests while not
5875  * owning it, that prevents from races for enqueueing task_work's and b/w
5876  * arming poll and wakeups.
5877  */
5878 static inline bool io_poll_get_ownership(struct io_kiocb *req)
5879 {
5880         return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
5881 }
5882
5883 static void io_poll_mark_cancelled(struct io_kiocb *req)
5884 {
5885         atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
5886 }
5887
5888 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
5889 {
5890         /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
5891         if (req->opcode == IORING_OP_POLL_ADD)
5892                 return req->async_data;
5893         return req->apoll->double_poll;
5894 }
5895
5896 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5897 {
5898         if (req->opcode == IORING_OP_POLL_ADD)
5899                 return &req->poll;
5900         return &req->apoll->poll;
5901 }
5902
5903 static void io_poll_req_insert(struct io_kiocb *req)
5904 {
5905         struct io_ring_ctx *ctx = req->ctx;
5906         struct hlist_head *list;
5907
5908         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5909         hlist_add_head(&req->hash_node, list);
5910 }
5911
5912 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5913                               wait_queue_func_t wake_func)
5914 {
5915         poll->head = NULL;
5916 #define IO_POLL_UNMASK  (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5917         /* mask in events that we always want/need */
5918         poll->events = events | IO_POLL_UNMASK;
5919         INIT_LIST_HEAD(&poll->wait.entry);
5920         init_waitqueue_func_entry(&poll->wait, wake_func);
5921 }
5922
5923 static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
5924 {
5925         struct wait_queue_head *head = smp_load_acquire(&poll->head);
5926
5927         if (head) {
5928                 spin_lock_irq(&head->lock);
5929                 list_del_init(&poll->wait.entry);
5930                 poll->head = NULL;
5931                 spin_unlock_irq(&head->lock);
5932         }
5933 }
5934
5935 static void io_poll_remove_entries(struct io_kiocb *req)
5936 {
5937         /*
5938          * Nothing to do if neither of those flags are set. Avoid dipping
5939          * into the poll/apoll/double cachelines if we can.
5940          */
5941         if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
5942                 return;
5943
5944         /*
5945          * While we hold the waitqueue lock and the waitqueue is nonempty,
5946          * wake_up_pollfree() will wait for us.  However, taking the waitqueue
5947          * lock in the first place can race with the waitqueue being freed.
5948          *
5949          * We solve this as eventpoll does: by taking advantage of the fact that
5950          * all users of wake_up_pollfree() will RCU-delay the actual free.  If
5951          * we enter rcu_read_lock() and see that the pointer to the queue is
5952          * non-NULL, we can then lock it without the memory being freed out from
5953          * under us.
5954          *
5955          * Keep holding rcu_read_lock() as long as we hold the queue lock, in
5956          * case the caller deletes the entry from the queue, leaving it empty.
5957          * In that case, only RCU prevents the queue memory from being freed.
5958          */
5959         rcu_read_lock();
5960         if (req->flags & REQ_F_SINGLE_POLL)
5961                 io_poll_remove_entry(io_poll_get_single(req));
5962         if (req->flags & REQ_F_DOUBLE_POLL)
5963                 io_poll_remove_entry(io_poll_get_double(req));
5964         rcu_read_unlock();
5965 }
5966
5967 /*
5968  * All poll tw should go through this. Checks for poll events, manages
5969  * references, does rewait, etc.
5970  *
5971  * Returns a negative error on failure. >0 when no action require, which is
5972  * either spurious wakeup or multishot CQE is served. 0 when it's done with
5973  * the request, then the mask is stored in req->result.
5974  */
5975 static int io_poll_check_events(struct io_kiocb *req)
5976 {
5977         struct io_ring_ctx *ctx = req->ctx;
5978         struct io_poll_iocb *poll = io_poll_get_single(req);
5979         int v;
5980
5981         /* req->task == current here, checking PF_EXITING is safe */
5982         if (unlikely(req->task->flags & PF_EXITING))
5983                 io_poll_mark_cancelled(req);
5984
5985         do {
5986                 v = atomic_read(&req->poll_refs);
5987
5988                 /* tw handler should be the owner, and so have some references */
5989                 if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
5990                         return 0;
5991                 if (v & IO_POLL_CANCEL_FLAG)
5992                         return -ECANCELED;
5993
5994                 if (!req->result) {
5995                         struct poll_table_struct pt = { ._key = req->cflags };
5996
5997                         req->result = vfs_poll(req->file, &pt) & req->cflags;
5998                 }
5999
6000                 /* multishot, just fill an CQE and proceed */
6001                 if (req->result && !(req->cflags & EPOLLONESHOT)) {
6002                         __poll_t mask = mangle_poll(req->result & poll->events);
6003                         bool filled;
6004
6005                         spin_lock(&ctx->completion_lock);
6006                         filled = io_fill_cqe_aux(ctx, req->user_data, mask,
6007                                                  IORING_CQE_F_MORE);
6008                         io_commit_cqring(ctx);
6009                         spin_unlock(&ctx->completion_lock);
6010                         if (unlikely(!filled))
6011                                 return -ECANCELED;
6012                         io_cqring_ev_posted(ctx);
6013                         io_add_napi(req->file, ctx);
6014                 } else if (req->result) {
6015                         return 0;
6016                 }
6017
6018                 /*
6019                  * Release all references, retry if someone tried to restart
6020                  * task_work while we were executing it.
6021                  */
6022         } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
6023
6024         return 1;
6025 }
6026
6027 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
6028 {
6029         struct io_ring_ctx *ctx = req->ctx;
6030         int ret;
6031
6032         ret = io_poll_check_events(req);
6033         if (ret > 0)
6034                 return;
6035
6036         if (!ret) {
6037                 req->result = mangle_poll(req->result & req->poll.events);
6038         } else {
6039                 req->result = ret;
6040                 req_set_fail(req);
6041         }
6042
6043         io_poll_remove_entries(req);
6044         spin_lock(&ctx->completion_lock);
6045         hash_del(&req->hash_node);
6046         __io_req_complete_post(req, req->result, 0);
6047         io_commit_cqring(ctx);
6048         spin_unlock(&ctx->completion_lock);
6049         io_cqring_ev_posted(ctx);
6050 }
6051
6052 static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
6053 {
6054         struct io_ring_ctx *ctx = req->ctx;
6055         int ret;
6056
6057         ret = io_poll_check_events(req);
6058         if (ret > 0)
6059                 return;
6060
6061         io_poll_remove_entries(req);
6062         spin_lock(&ctx->completion_lock);
6063         hash_del(&req->hash_node);
6064         spin_unlock(&ctx->completion_lock);
6065
6066         if (!ret)
6067                 io_req_task_submit(req, locked);
6068         else
6069                 io_req_complete_failed(req, ret);
6070 }
6071
6072 static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
6073 {
6074         req->result = mask;
6075         /*
6076          * This is useful for poll that is armed on behalf of another
6077          * request, and where the wakeup path could be on a different
6078          * CPU. We want to avoid pulling in req->apoll->events for that
6079          * case.
6080          */
6081         req->cflags = events;
6082         if (req->opcode == IORING_OP_POLL_ADD)
6083                 req->io_task_work.func = io_poll_task_func;
6084         else
6085                 req->io_task_work.func = io_apoll_task_func;
6086
6087         trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
6088         io_req_task_work_add(req, false);
6089 }
6090
/* Queue the poll task_work, but only if we manage to become the owner. */
static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
{
        if (!io_poll_get_ownership(req))
                return;

        __io_poll_execute(req, res, events);
}
6096
/* Flag @req as cancelled and make sure task_work runs to complete it. */
static void io_poll_cancel_req(struct io_kiocb *req)
{
        io_poll_mark_cancelled(req);

        /* kick tw unless somebody else already owns the request */
        if (io_poll_get_ownership(req))
                __io_poll_execute(req, 0, 0);
}
6103
/*
 * wait->private holds the owning io_kiocb pointer. Bit 0 is set by
 * __io_queue_proc() on the second ("double") poll entry of a request.
 */
#define wqe_to_req(wait)        ((void *)((unsigned long) (wait)->private & ~1))
#define wqe_is_double(wait)     ((unsigned long) (wait)->private & 1)
6106
/*
 * Waitqueue callback installed by __io_arm_poll_handler().
 *
 * Handles POLLFREE by cancelling the request and detaching it from the
 * dying waitqueue. Otherwise, if the reported events match, tries to take
 * ownership of the request and queue its task_work.
 *
 * Returns 0 when @key carries events that don't match poll->events, 1 in
 * all other cases.
 */
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
                        void *key)
{
        struct io_kiocb *req = wqe_to_req(wait);
        struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
                                                 wait);
        __poll_t mask = key_to_poll(key);

        if (unlikely(mask & POLLFREE)) {
                io_poll_mark_cancelled(req);
                /* we have to kick tw in case it's not already */
                io_poll_execute(req, 0, poll->events);

                /*
                 * If the waitqueue is being freed early but someone already
                 * holds ownership over the request, we have to tear down the
                 * request as best we can. That means immediately removing the
                 * request from its waitqueue and preventing all further
                 * accesses to the waitqueue via the request.
                 */
                list_del_init(&poll->wait.entry);

                /*
                 * Careful: this *must* be the last step, since as soon
                 * as poll->head is NULL'ed out, the request can be
                 * completed and freed, since io_poll_remove_entry()
                 * will no longer need to take the waitqueue lock.
                 */
                smp_store_release(&poll->head, NULL);
                return 1;
        }

        /* for instances that support it check for an event match first */
        if (mask && !(mask & poll->events))
                return 0;

        if (io_poll_get_ownership(req)) {
                /* optional, saves extra locking for removal in tw handler */
                if (mask && poll->events & EPOLLONESHOT) {
                        list_del_init(&poll->wait.entry);
                        poll->head = NULL;
                        /* entry is gone, io_poll_remove_entries() can skip it */
                        if (wqe_is_double(wait))
                                req->flags &= ~REQ_F_DOUBLE_POLL;
                        else
                                req->flags &= ~REQ_F_SINGLE_POLL;
                }
                __io_poll_execute(req, mask, poll->events);
        }
        return 1;
}
6157
6158 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
6159                             struct wait_queue_head *head,
6160                             struct io_poll_iocb **poll_ptr)
6161 {
6162         struct io_kiocb *req = pt->req;
6163         unsigned long wqe_private = (unsigned long) req;
6164
6165         /*
6166          * The file being polled uses multiple waitqueues for poll handling
6167          * (e.g. one for read, one for write). Setup a separate io_poll_iocb
6168          * if this happens.
6169          */
6170         if (unlikely(pt->nr_entries)) {
6171                 struct io_poll_iocb *first = poll;
6172
6173                 /* double add on the same waitqueue head, ignore */
6174                 if (first->head == head)
6175                         return;
6176                 /* already have a 2nd entry, fail a third attempt */
6177                 if (*poll_ptr) {
6178                         if ((*poll_ptr)->head == head)
6179                                 return;
6180                         pt->error = -EINVAL;
6181                         return;
6182                 }
6183
6184                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
6185                 if (!poll) {
6186                         pt->error = -ENOMEM;
6187                         return;
6188                 }
6189                 /* mark as double wq entry */
6190                 wqe_private |= 1;
6191                 req->flags |= REQ_F_DOUBLE_POLL;
6192                 io_init_poll_iocb(poll, first->events, first->wait.func);
6193                 *poll_ptr = poll;
6194                 if (req->opcode == IORING_OP_POLL_ADD)
6195                         req->flags |= REQ_F_ASYNC_DATA;
6196         }
6197
6198         req->flags |= REQ_F_SINGLE_POLL;
6199         pt->nr_entries++;
6200         poll->head = head;
6201         poll->wait.private = (void *) wqe_private;
6202
6203         if (poll->events & EPOLLEXCLUSIVE)
6204                 add_wait_queue_exclusive(head, &poll->wait);
6205         else
6206                 add_wait_queue(head, &poll->wait);
6207 }
6208
6209 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
6210                                struct poll_table_struct *p)
6211 {
6212         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
6213
6214         __io_queue_proc(&pt->req->poll, pt, head,
6215                         (struct io_poll_iocb **) &pt->req->async_data);
6216 }
6217
/*
 * Arm @poll for @req on req->file, waiting for @mask.
 *
 * The caller must have set ipt->pt._qproc; the remaining @ipt fields are
 * initialised here.
 *
 * Returns the ready event mask if a oneshot poll found events right away;
 * the entries are removed again and nothing was queued in that case.
 * Returns 0 otherwise, and the caller has to look at ipt->error to tell
 * whether arming failed.
 */
static int __io_arm_poll_handler(struct io_kiocb *req,
                                 struct io_poll_iocb *poll,
                                 struct io_poll_table *ipt, __poll_t mask)
{
        struct io_ring_ctx *ctx = req->ctx;
        int v;

        INIT_HLIST_NODE(&req->hash_node);
        io_init_poll_iocb(poll, mask, io_poll_wake);
        poll->file = req->file;

        ipt->pt._key = mask;
        ipt->req = req;
        ipt->error = 0;
        ipt->nr_entries = 0;

        /*
         * Take the ownership to delay any tw execution up until we're done
         * with poll arming. see io_poll_get_ownership().
         */
        atomic_set(&req->poll_refs, 1);
        mask = vfs_poll(req->file, &ipt->pt) & poll->events;

        if (mask && (poll->events & EPOLLONESHOT)) {
                io_poll_remove_entries(req);
                /* no one else has access to the req, forget about the ref */
                return mask;
        }
        /* nothing ready and nothing (successfully) queued: arming failed */
        if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
                io_poll_remove_entries(req);
                if (!ipt->error)
                        ipt->error = -EINVAL;
                return 0;
        }

        /* from here on the request can be found via the cancel hash */
        spin_lock(&ctx->completion_lock);
        io_poll_req_insert(req);
        spin_unlock(&ctx->completion_lock);

        if (mask) {
                /* can't multishot if failed, just queue the event we've got */
                if (unlikely(ipt->error || !ipt->nr_entries))
                        poll->events |= EPOLLONESHOT;
                __io_poll_execute(req, mask, poll->events);
                return 0;
        }
        io_add_napi(req->file, req->ctx);

        /*
         * Release ownership. If someone tried to queue a tw while it was
         * locked, kick it off for them.
         */
        v = atomic_dec_return(&req->poll_refs);
        if (unlikely(v & IO_POLL_REF_MASK))
                __io_poll_execute(req, 0, poll->events);
        return 0;
}
6275
6276 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
6277                                struct poll_table_struct *p)
6278 {
6279         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
6280         struct async_poll *apoll = pt->req->apoll;
6281
6282         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
6283 }
6284
/* Return values of io_arm_poll_handler() */
enum {
        /* poll was armed */
        IO_APOLL_OK,
        /* poll could not be used or arming failed */
        IO_APOLL_ABORTED,
        /* events were already pending when arming */
        IO_APOLL_READY
};
6290
6291 static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
6292 {
6293         const struct io_op_def *def = &io_op_defs[req->opcode];
6294         struct io_ring_ctx *ctx = req->ctx;
6295         struct async_poll *apoll;
6296         struct io_poll_table ipt;
6297         __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
6298         int ret;
6299
6300         if (!def->pollin && !def->pollout)
6301                 return IO_APOLL_ABORTED;
6302         if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
6303                 return IO_APOLL_ABORTED;
6304
6305         if (def->pollin) {
6306                 mask |= POLLIN | POLLRDNORM;
6307
6308                 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
6309                 if ((req->opcode == IORING_OP_RECVMSG) &&
6310                     (req->sr_msg.msg_flags & MSG_ERRQUEUE))
6311                         mask &= ~POLLIN;
6312         } else {
6313                 mask |= POLLOUT | POLLWRNORM;
6314         }
6315         if (def->poll_exclusive)
6316                 mask |= EPOLLEXCLUSIVE;
6317         if (!(issue_flags & IO_URING_F_UNLOCKED) &&
6318             !list_empty(&ctx->apoll_cache)) {
6319                 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
6320                                                 poll.wait.entry);
6321                 list_del_init(&apoll->poll.wait.entry);
6322         } else {
6323                 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
6324                 if (unlikely(!apoll))
6325                         return IO_APOLL_ABORTED;
6326         }
6327         apoll->double_poll = NULL;
6328         req->apoll = apoll;
6329         req->flags |= REQ_F_POLLED;
6330         ipt.pt._qproc = io_async_queue_proc;
6331
6332         io_kbuf_recycle(req, issue_flags);
6333
6334         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
6335         if (ret || ipt.error)
6336                 return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
6337
6338         trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
6339                                 mask, apoll->poll.events);
6340         return IO_APOLL_OK;
6341 }
6342
6343 /*
6344  * Returns true if we found and killed one or more poll requests
6345  */
6346 static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
6347                                       struct task_struct *tsk, bool cancel_all)
6348 {
6349         struct hlist_node *tmp;
6350         struct io_kiocb *req;
6351         bool found = false;
6352         int i;
6353
6354         spin_lock(&ctx->completion_lock);
6355         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
6356                 struct hlist_head *list;
6357
6358                 list = &ctx->cancel_hash[i];
6359                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
6360                         if (io_match_task_safe(req, tsk, cancel_all)) {
6361                                 hlist_del_init(&req->hash_node);
6362                                 io_poll_cancel_req(req);
6363                                 found = true;
6364                         }
6365                 }
6366         }
6367         spin_unlock(&ctx->completion_lock);
6368         return found;
6369 }
6370
6371 static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
6372                                      bool poll_only)
6373         __must_hold(&ctx->completion_lock)
6374 {
6375         struct hlist_head *list;
6376         struct io_kiocb *req;
6377
6378         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
6379         hlist_for_each_entry(req, list, hash_node) {
6380                 if (sqe_addr != req->user_data)
6381                         continue;
6382                 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
6383                         continue;
6384                 return req;
6385         }
6386         return NULL;
6387 }
6388
6389 static bool io_poll_disarm(struct io_kiocb *req)
6390         __must_hold(&ctx->completion_lock)
6391 {
6392         if (!io_poll_get_ownership(req))
6393                 return false;
6394         io_poll_remove_entries(req);
6395         hash_del(&req->hash_node);
6396         return true;
6397 }
6398
6399 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
6400                           bool poll_only)
6401         __must_hold(&ctx->completion_lock)
6402 {
6403         struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
6404
6405         if (!req)
6406                 return -ENOENT;
6407         io_poll_cancel_req(req);
6408         return 0;
6409 }
6410
6411 static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
6412                                      unsigned int flags)
6413 {
6414         u32 events;
6415
6416         events = READ_ONCE(sqe->poll32_events);
6417 #ifdef __BIG_ENDIAN
6418         events = swahw32(events);
6419 #endif
6420         if (!(flags & IORING_POLL_ADD_MULTI))
6421                 events |= EPOLLONESHOT;
6422         return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
6423 }
6424
6425 static int io_poll_update_prep(struct io_kiocb *req,
6426                                const struct io_uring_sqe *sqe)
6427 {
6428         struct io_poll_update *upd = &req->poll_update;
6429         u32 flags;
6430
6431         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6432                 return -EINVAL;
6433         if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
6434                 return -EINVAL;
6435         flags = READ_ONCE(sqe->len);
6436         if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
6437                       IORING_POLL_ADD_MULTI))
6438                 return -EINVAL;
6439         /* meaningless without update */
6440         if (flags == IORING_POLL_ADD_MULTI)
6441                 return -EINVAL;
6442
6443         upd->old_user_data = READ_ONCE(sqe->addr);
6444         upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
6445         upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
6446
6447         upd->new_user_data = READ_ONCE(sqe->off);
6448         if (!upd->update_user_data && upd->new_user_data)
6449                 return -EINVAL;
6450         if (upd->update_events)
6451                 upd->events = io_poll_parse_events(sqe, flags);
6452         else if (sqe->poll32_events)
6453                 return -EINVAL;
6454
6455         return 0;
6456 }
6457
6458 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6459 {
6460         struct io_poll_iocb *poll = &req->poll;
6461         u32 flags;
6462
6463         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6464                 return -EINVAL;
6465         if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
6466                 return -EINVAL;
6467         flags = READ_ONCE(sqe->len);
6468         if (flags & ~IORING_POLL_ADD_MULTI)
6469                 return -EINVAL;
6470         if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
6471                 return -EINVAL;
6472
6473         io_req_set_refcount(req);
6474         req->cflags = poll->events = io_poll_parse_events(sqe, flags);
6475         return 0;
6476 }
6477
6478 static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
6479 {
6480         struct io_poll_iocb *poll = &req->poll;
6481         struct io_poll_table ipt;
6482         int ret;
6483
6484         ipt.pt._qproc = io_poll_queue_proc;
6485
6486         ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
6487         ret = ret ?: ipt.error;
6488         if (ret)
6489                 __io_req_complete(req, issue_flags, ret, 0);
6490         return 0;
6491 }
6492
6493 static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
6494 {
6495         struct io_ring_ctx *ctx = req->ctx;
6496         struct io_kiocb *preq;
6497         int ret2, ret = 0;
6498         bool locked;
6499
6500         spin_lock(&ctx->completion_lock);
6501         preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
6502         if (!preq || !io_poll_disarm(preq)) {
6503                 spin_unlock(&ctx->completion_lock);
6504                 ret = preq ? -EALREADY : -ENOENT;
6505                 goto out;
6506         }
6507         spin_unlock(&ctx->completion_lock);
6508
6509         if (req->poll_update.update_events || req->poll_update.update_user_data) {
6510                 /* only mask one event flags, keep behavior flags */
6511                 if (req->poll_update.update_events) {
6512                         preq->poll.events &= ~0xffff;
6513                         preq->poll.events |= req->poll_update.events & 0xffff;
6514                         preq->poll.events |= IO_POLL_UNMASK;
6515                 }
6516                 if (req->poll_update.update_user_data)
6517                         preq->user_data = req->poll_update.new_user_data;
6518
6519                 ret2 = io_poll_add(preq, issue_flags);
6520                 /* successfully updated, don't complete poll request */
6521                 if (!ret2)
6522                         goto out;
6523         }
6524
6525         req_set_fail(preq);
6526         preq->result = -ECANCELED;
6527         locked = !(issue_flags & IO_URING_F_UNLOCKED);
6528         io_req_task_complete(preq, &locked);
6529 out:
6530         if (ret < 0)
6531                 req_set_fail(req);
6532         /* complete update request, we're done with it */
6533         __io_req_complete(req, issue_flags, ret, 0);
6534         return 0;
6535 }
6536
6537 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
6538 {
6539         struct io_timeout_data *data = container_of(timer,
6540                                                 struct io_timeout_data, timer);
6541         struct io_kiocb *req = data->req;
6542         struct io_ring_ctx *ctx = req->ctx;
6543         unsigned long flags;
6544
6545         spin_lock_irqsave(&ctx->timeout_lock, flags);
6546         list_del_init(&req->timeout.list);
6547         atomic_set(&req->ctx->cq_timeouts,
6548                 atomic_read(&req->ctx->cq_timeouts) + 1);
6549         spin_unlock_irqrestore(&ctx->timeout_lock, flags);
6550
6551         if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
6552                 req_set_fail(req);
6553
6554         req->result = -ETIME;
6555         req->io_task_work.func = io_req_task_complete;
6556         io_req_task_work_add(req, false);
6557         return HRTIMER_NORESTART;
6558 }
6559
6560 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
6561                                            __u64 user_data)
6562         __must_hold(&ctx->timeout_lock)
6563 {
6564         struct io_timeout_data *io;
6565         struct io_kiocb *req;
6566         bool found = false;
6567
6568         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
6569                 found = user_data == req->user_data;
6570                 if (found)
6571                         break;
6572         }
6573         if (!found)
6574                 return ERR_PTR(-ENOENT);
6575
6576         io = req->async_data;
6577         if (hrtimer_try_to_cancel(&io->timer) == -1)
6578                 return ERR_PTR(-EALREADY);
6579         list_del_init(&req->timeout.list);
6580         return req;
6581 }
6582
6583 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
6584         __must_hold(&ctx->completion_lock)
6585         __must_hold(&ctx->timeout_lock)
6586 {
6587         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6588
6589         if (IS_ERR(req))
6590                 return PTR_ERR(req);
6591         io_req_task_queue_fail(req, -ECANCELED);
6592         return 0;
6593 }
6594
6595 static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
6596 {
6597         switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
6598         case IORING_TIMEOUT_BOOTTIME:
6599                 return CLOCK_BOOTTIME;
6600         case IORING_TIMEOUT_REALTIME:
6601                 return CLOCK_REALTIME;
6602         default:
6603                 /* can't happen, vetted at prep time */
6604                 WARN_ON_ONCE(1);
6605                 fallthrough;
6606         case 0:
6607                 return CLOCK_MONOTONIC;
6608         }
6609 }
6610
6611 static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6612                                     struct timespec64 *ts, enum hrtimer_mode mode)
6613         __must_hold(&ctx->timeout_lock)
6614 {
6615         struct io_timeout_data *io;
6616         struct io_kiocb *req;
6617         bool found = false;
6618
6619         list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
6620                 found = user_data == req->user_data;
6621                 if (found)
6622                         break;
6623         }
6624         if (!found)
6625                 return -ENOENT;
6626
6627         io = req->async_data;
6628         if (hrtimer_try_to_cancel(&io->timer) == -1)
6629                 return -EALREADY;
6630         hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
6631         io->timer.function = io_link_timeout_fn;
6632         hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
6633         return 0;
6634 }
6635
6636 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6637                              struct timespec64 *ts, enum hrtimer_mode mode)
6638         __must_hold(&ctx->timeout_lock)
6639 {
6640         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6641         struct io_timeout_data *data;
6642
6643         if (IS_ERR(req))
6644                 return PTR_ERR(req);
6645
6646         req->timeout.off = 0; /* noseq */
6647         data = req->async_data;
6648         list_add_tail(&req->timeout.list, &ctx->timeout_list);
6649         hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
6650         data->timer.function = io_timeout_fn;
6651         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
6652         return 0;
6653 }
6654
6655 static int io_timeout_remove_prep(struct io_kiocb *req,
6656                                   const struct io_uring_sqe *sqe)
6657 {
6658         struct io_timeout_rem *tr = &req->timeout_rem;
6659
6660         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6661                 return -EINVAL;
6662         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6663                 return -EINVAL;
6664         if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
6665                 return -EINVAL;
6666
6667         tr->ltimeout = false;
6668         tr->addr = READ_ONCE(sqe->addr);
6669         tr->flags = READ_ONCE(sqe->timeout_flags);
6670         if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
6671                 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6672                         return -EINVAL;
6673                 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
6674                         tr->ltimeout = true;
6675                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
6676                         return -EINVAL;
6677                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
6678                         return -EFAULT;
6679                 if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0)
6680                         return -EINVAL;
6681         } else if (tr->flags) {
6682                 /* timeout removal doesn't support flags */
6683                 return -EINVAL;
6684         }
6685
6686         return 0;
6687 }
6688
6689 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
6690 {
6691         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
6692                                             : HRTIMER_MODE_REL;
6693 }
6694
6695 /*
6696  * Remove or update an existing timeout command
6697  */
6698 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
6699 {
6700         struct io_timeout_rem *tr = &req->timeout_rem;
6701         struct io_ring_ctx *ctx = req->ctx;
6702         int ret;
6703
6704         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
6705                 spin_lock(&ctx->completion_lock);
6706                 spin_lock_irq(&ctx->timeout_lock);
6707                 ret = io_timeout_cancel(ctx, tr->addr);
6708                 spin_unlock_irq(&ctx->timeout_lock);
6709                 spin_unlock(&ctx->completion_lock);
6710         } else {
6711                 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
6712
6713                 spin_lock_irq(&ctx->timeout_lock);
6714                 if (tr->ltimeout)
6715                         ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
6716                 else
6717                         ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
6718                 spin_unlock_irq(&ctx->timeout_lock);
6719         }
6720
6721         if (ret < 0)
6722                 req_set_fail(req);
6723         io_req_complete_post(req, ret, 0);
6724         return 0;
6725 }
6726
6727 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6728                            bool is_timeout_link)
6729 {
6730         struct io_timeout_data *data;
6731         unsigned flags;
6732         u32 off = READ_ONCE(sqe->off);
6733
6734         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6735                 return -EINVAL;
6736         if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
6737             sqe->splice_fd_in)
6738                 return -EINVAL;
6739         if (off && is_timeout_link)
6740                 return -EINVAL;
6741         flags = READ_ONCE(sqe->timeout_flags);
6742         if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
6743                       IORING_TIMEOUT_ETIME_SUCCESS))
6744                 return -EINVAL;
6745         /* more than one clock specified is invalid, obviously */
6746         if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6747                 return -EINVAL;
6748
6749         INIT_LIST_HEAD(&req->timeout.list);
6750         req->timeout.off = off;
6751         if (unlikely(off && !req->ctx->off_timeout_used))
6752                 req->ctx->off_timeout_used = true;
6753
6754         if (WARN_ON_ONCE(req_has_async_data(req)))
6755                 return -EFAULT;
6756         if (io_alloc_async_data(req))
6757                 return -ENOMEM;
6758
6759         data = req->async_data;
6760         data->req = req;
6761         data->flags = flags;
6762
6763         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
6764                 return -EFAULT;
6765
6766         if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
6767                 return -EINVAL;
6768
6769         data->mode = io_translate_timeout_mode(flags);
6770         hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
6771
6772         if (is_timeout_link) {
6773                 struct io_submit_link *link = &req->ctx->submit_state.link;
6774
6775                 if (!link->head)
6776                         return -EINVAL;
6777                 if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
6778                         return -EINVAL;
6779                 req->timeout.head = link->last;
6780                 link->last->flags |= REQ_F_ARM_LTIMEOUT;
6781         }
6782         return 0;
6783 }
6784
6785 static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
6786 {
6787         struct io_ring_ctx *ctx = req->ctx;
6788         struct io_timeout_data *data = req->async_data;
6789         struct list_head *entry;
6790         u32 tail, off = req->timeout.off;
6791
6792         spin_lock_irq(&ctx->timeout_lock);
6793
6794         /*
6795          * sqe->off holds how many events that need to occur for this
6796          * timeout event to be satisfied. If it isn't set, then this is
6797          * a pure timeout request, sequence isn't used.
6798          */
6799         if (io_is_timeout_noseq(req)) {
6800                 entry = ctx->timeout_list.prev;
6801                 goto add;
6802         }
6803
6804         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
6805         req->timeout.target_seq = tail + off;
6806
6807         /* Update the last seq here in case io_flush_timeouts() hasn't.
6808          * This is safe because ->completion_lock is held, and submissions
6809          * and completions are never mixed in the same ->completion_lock section.
6810          */
6811         ctx->cq_last_tm_flush = tail;
6812
6813         /*
6814          * Insertion sort, ensuring the first entry in the list is always
6815          * the one we need first.
6816          */
6817         list_for_each_prev(entry, &ctx->timeout_list) {
6818                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
6819                                                   timeout.list);
6820
6821                 if (io_is_timeout_noseq(nxt))
6822                         continue;
6823                 /* nxt.seq is behind @tail, otherwise would've been completed */
6824                 if (off >= nxt->timeout.target_seq - tail)
6825                         break;
6826         }
6827 add:
6828         list_add(&req->timeout.list, entry);
6829         data->timer.function = io_timeout_fn;
6830         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
6831         spin_unlock_irq(&ctx->timeout_lock);
6832         return 0;
6833 }
6834
6835 struct io_cancel_data {
6836         struct io_ring_ctx *ctx;
6837         u64 user_data;
6838 };
6839
6840 static bool io_cancel_cb(struct io_wq_work *work, void *data)
6841 {
6842         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6843         struct io_cancel_data *cd = data;
6844
6845         return req->ctx == cd->ctx && req->user_data == cd->user_data;
6846 }
6847
6848 static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
6849                                struct io_ring_ctx *ctx)
6850 {
6851         struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
6852         enum io_wq_cancel cancel_ret;
6853         int ret = 0;
6854
6855         if (!tctx || !tctx->io_wq)
6856                 return -ENOENT;
6857
6858         cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
6859         switch (cancel_ret) {
6860         case IO_WQ_CANCEL_OK:
6861                 ret = 0;
6862                 break;
6863         case IO_WQ_CANCEL_RUNNING:
6864                 ret = -EALREADY;
6865                 break;
6866         case IO_WQ_CANCEL_NOTFOUND:
6867                 ret = -ENOENT;
6868                 break;
6869         }
6870
6871         return ret;
6872 }
6873
6874 static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
6875 {
6876         struct io_ring_ctx *ctx = req->ctx;
6877         int ret;
6878
6879         WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
6880
6881         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
6882         /*
6883          * Fall-through even for -EALREADY, as we may have poll armed
6884          * that need unarming.
6885          */
6886         if (!ret)
6887                 return 0;
6888
6889         spin_lock(&ctx->completion_lock);
6890         ret = io_poll_cancel(ctx, sqe_addr, false);
6891         if (ret != -ENOENT)
6892                 goto out;
6893
6894         spin_lock_irq(&ctx->timeout_lock);
6895         ret = io_timeout_cancel(ctx, sqe_addr);
6896         spin_unlock_irq(&ctx->timeout_lock);
6897 out:
6898         spin_unlock(&ctx->completion_lock);
6899         return ret;
6900 }
6901
6902 static int io_async_cancel_prep(struct io_kiocb *req,
6903                                 const struct io_uring_sqe *sqe)
6904 {
6905         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6906                 return -EINVAL;
6907         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6908                 return -EINVAL;
6909         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
6910             sqe->splice_fd_in)
6911                 return -EINVAL;
6912
6913         req->cancel.addr = READ_ONCE(sqe->addr);
6914         return 0;
6915 }
6916
6917 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
6918 {
6919         struct io_ring_ctx *ctx = req->ctx;
6920         u64 sqe_addr = req->cancel.addr;
6921         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
6922         struct io_tctx_node *node;
6923         int ret;
6924
6925         ret = io_try_cancel_userdata(req, sqe_addr);
6926         if (ret != -ENOENT)
6927                 goto done;
6928
6929         /* slow path, try all io-wq's */
6930         io_ring_submit_lock(ctx, needs_lock);
6931         ret = -ENOENT;
6932         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6933                 struct io_uring_task *tctx = node->task->io_uring;
6934
6935                 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6936                 if (ret != -ENOENT)
6937                         break;
6938         }
6939         io_ring_submit_unlock(ctx, needs_lock);
6940 done:
6941         if (ret < 0)
6942                 req_set_fail(req);
6943         io_req_complete_post(req, ret, 0);
6944         return 0;
6945 }
6946
6947 static int io_rsrc_update_prep(struct io_kiocb *req,
6948                                 const struct io_uring_sqe *sqe)
6949 {
6950         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6951                 return -EINVAL;
6952         if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
6953                 return -EINVAL;
6954
6955         req->rsrc_update.offset = READ_ONCE(sqe->off);
6956         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6957         if (!req->rsrc_update.nr_args)
6958                 return -EINVAL;
6959         req->rsrc_update.arg = READ_ONCE(sqe->addr);
6960         return 0;
6961 }
6962
6963 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
6964 {
6965         struct io_ring_ctx *ctx = req->ctx;
6966         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
6967         struct io_uring_rsrc_update2 up;
6968         int ret;
6969
6970         up.offset = req->rsrc_update.offset;
6971         up.data = req->rsrc_update.arg;
6972         up.nr = 0;
6973         up.tags = 0;
6974         up.resv = 0;
6975
6976         io_ring_submit_lock(ctx, needs_lock);
6977         ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
6978                                         &up, req->rsrc_update.nr_args);
6979         io_ring_submit_unlock(ctx, needs_lock);
6980
6981         if (ret < 0)
6982                 req_set_fail(req);
6983         __io_req_complete(req, issue_flags, ret, 0);
6984         return 0;
6985 }
6986
6987 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6988 {
6989         switch (req->opcode) {
6990         case IORING_OP_NOP:
6991                 return 0;
6992         case IORING_OP_READV:
6993         case IORING_OP_READ_FIXED:
6994         case IORING_OP_READ:
6995                 return io_read_prep(req, sqe);
6996         case IORING_OP_WRITEV:
6997         case IORING_OP_WRITE_FIXED:
6998         case IORING_OP_WRITE:
6999                 return io_write_prep(req, sqe);
7000         case IORING_OP_POLL_ADD:
7001                 return io_poll_add_prep(req, sqe);
7002         case IORING_OP_POLL_REMOVE:
7003                 return io_poll_update_prep(req, sqe);
7004         case IORING_OP_FSYNC:
7005                 return io_fsync_prep(req, sqe);
7006         case IORING_OP_SYNC_FILE_RANGE:
7007                 return io_sfr_prep(req, sqe);
7008         case IORING_OP_SENDMSG:
7009         case IORING_OP_SEND:
7010                 return io_sendmsg_prep(req, sqe);
7011         case IORING_OP_RECVMSG:
7012         case IORING_OP_RECV:
7013                 return io_recvmsg_prep(req, sqe);
7014         case IORING_OP_CONNECT:
7015                 return io_connect_prep(req, sqe);
7016         case IORING_OP_TIMEOUT:
7017                 return io_timeout_prep(req, sqe, false);
7018         case IORING_OP_TIMEOUT_REMOVE:
7019                 return io_timeout_remove_prep(req, sqe);
7020         case IORING_OP_ASYNC_CANCEL:
7021                 return io_async_cancel_prep(req, sqe);
7022         case IORING_OP_LINK_TIMEOUT:
7023                 return io_timeout_prep(req, sqe, true);
7024         case IORING_OP_ACCEPT:
7025                 return io_accept_prep(req, sqe);
7026         case IORING_OP_FALLOCATE:
7027                 return io_fallocate_prep(req, sqe);
7028         case IORING_OP_OPENAT:
7029                 return io_openat_prep(req, sqe);
7030         case IORING_OP_CLOSE:
7031                 return io_close_prep(req, sqe);
7032         case IORING_OP_FILES_UPDATE:
7033                 return io_rsrc_update_prep(req, sqe);
7034         case IORING_OP_STATX:
7035                 return io_statx_prep(req, sqe);
7036         case IORING_OP_FADVISE:
7037                 return io_fadvise_prep(req, sqe);
7038         case IORING_OP_MADVISE:
7039                 return io_madvise_prep(req, sqe);
7040         case IORING_OP_OPENAT2:
7041                 return io_openat2_prep(req, sqe);
7042         case IORING_OP_EPOLL_CTL:
7043                 return io_epoll_ctl_prep(req, sqe);
7044         case IORING_OP_SPLICE:
7045                 return io_splice_prep(req, sqe);
7046         case IORING_OP_PROVIDE_BUFFERS:
7047                 return io_provide_buffers_prep(req, sqe);
7048         case IORING_OP_REMOVE_BUFFERS:
7049                 return io_remove_buffers_prep(req, sqe);
7050         case IORING_OP_TEE:
7051                 return io_tee_prep(req, sqe);
7052         case IORING_OP_SHUTDOWN:
7053                 return io_shutdown_prep(req, sqe);
7054         case IORING_OP_RENAMEAT:
7055                 return io_renameat_prep(req, sqe);
7056         case IORING_OP_UNLINKAT:
7057                 return io_unlinkat_prep(req, sqe);
7058         case IORING_OP_MKDIRAT:
7059                 return io_mkdirat_prep(req, sqe);
7060         case IORING_OP_SYMLINKAT:
7061                 return io_symlinkat_prep(req, sqe);
7062         case IORING_OP_LINKAT:
7063                 return io_linkat_prep(req, sqe);
7064         case IORING_OP_MSG_RING:
7065                 return io_msg_ring_prep(req, sqe);
7066         }
7067
7068         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
7069                         req->opcode);
7070         return -EINVAL;
7071 }
7072
7073 static int io_req_prep_async(struct io_kiocb *req)
7074 {
7075         if (!io_op_defs[req->opcode].needs_async_setup)
7076                 return 0;
7077         if (WARN_ON_ONCE(req_has_async_data(req)))
7078                 return -EFAULT;
7079         if (io_alloc_async_data(req))
7080                 return -EAGAIN;
7081
7082         switch (req->opcode) {
7083         case IORING_OP_READV:
7084                 return io_rw_prep_async(req, READ);
7085         case IORING_OP_WRITEV:
7086                 return io_rw_prep_async(req, WRITE);
7087         case IORING_OP_SENDMSG:
7088                 return io_sendmsg_prep_async(req);
7089         case IORING_OP_RECVMSG:
7090                 return io_recvmsg_prep_async(req);
7091         case IORING_OP_CONNECT:
7092                 return io_connect_prep_async(req);
7093         }
7094         printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
7095                     req->opcode);
7096         return -EFAULT;
7097 }
7098
7099 static u32 io_get_sequence(struct io_kiocb *req)
7100 {
7101         u32 seq = req->ctx->cached_sq_head;
7102
7103         /* need original cached_sq_head, but it was increased for each req */
7104         io_for_each_link(req, req)
7105                 seq--;
7106         return seq;
7107 }
7108
7109 static __cold void io_drain_req(struct io_kiocb *req)
7110 {
7111         struct io_ring_ctx *ctx = req->ctx;
7112         struct io_defer_entry *de;
7113         int ret;
7114         u32 seq = io_get_sequence(req);
7115
7116         /* Still need defer if there is pending req in defer list. */
7117         spin_lock(&ctx->completion_lock);
7118         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
7119                 spin_unlock(&ctx->completion_lock);
7120 queue:
7121                 ctx->drain_active = false;
7122                 io_req_task_queue(req);
7123                 return;
7124         }
7125         spin_unlock(&ctx->completion_lock);
7126
7127         ret = io_req_prep_async(req);
7128         if (ret) {
7129 fail:
7130                 io_req_complete_failed(req, ret);
7131                 return;
7132         }
7133         io_prep_async_link(req);
7134         de = kmalloc(sizeof(*de), GFP_KERNEL);
7135         if (!de) {
7136                 ret = -ENOMEM;
7137                 goto fail;
7138         }
7139
7140         spin_lock(&ctx->completion_lock);
7141         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
7142                 spin_unlock(&ctx->completion_lock);
7143                 kfree(de);
7144                 goto queue;
7145         }
7146
7147         trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
7148         de->req = req;
7149         de->seq = seq;
7150         list_add_tail(&de->list, &ctx->defer_list);
7151         spin_unlock(&ctx->completion_lock);
7152 }
7153
7154 static void io_clean_op(struct io_kiocb *req)
7155 {
7156         if (req->flags & REQ_F_BUFFER_SELECTED) {
7157                 spin_lock(&req->ctx->completion_lock);
7158                 io_put_kbuf_comp(req);
7159                 spin_unlock(&req->ctx->completion_lock);
7160         }
7161
7162         if (req->flags & REQ_F_NEED_CLEANUP) {
7163                 switch (req->opcode) {
7164                 case IORING_OP_READV:
7165                 case IORING_OP_READ_FIXED:
7166                 case IORING_OP_READ:
7167                 case IORING_OP_WRITEV:
7168                 case IORING_OP_WRITE_FIXED:
7169                 case IORING_OP_WRITE: {
7170                         struct io_async_rw *io = req->async_data;
7171
7172                         kfree(io->free_iovec);
7173                         break;
7174                         }
7175                 case IORING_OP_RECVMSG:
7176                 case IORING_OP_SENDMSG: {
7177                         struct io_async_msghdr *io = req->async_data;
7178
7179                         kfree(io->free_iov);
7180                         break;
7181                         }
7182                 case IORING_OP_SPLICE:
7183                 case IORING_OP_TEE:
7184                         if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
7185                                 io_put_file(req->splice.file_in);
7186                         break;
7187                 case IORING_OP_OPENAT:
7188                 case IORING_OP_OPENAT2:
7189                         if (req->open.filename)
7190                                 putname(req->open.filename);
7191                         break;
7192                 case IORING_OP_RENAMEAT:
7193                         putname(req->rename.oldpath);
7194                         putname(req->rename.newpath);
7195                         break;
7196                 case IORING_OP_UNLINKAT:
7197                         putname(req->unlink.filename);
7198                         break;
7199                 case IORING_OP_MKDIRAT:
7200                         putname(req->mkdir.filename);
7201                         break;
7202                 case IORING_OP_SYMLINKAT:
7203                         putname(req->symlink.oldpath);
7204                         putname(req->symlink.newpath);
7205                         break;
7206                 case IORING_OP_LINKAT:
7207                         putname(req->hardlink.oldpath);
7208                         putname(req->hardlink.newpath);
7209                         break;
7210                 case IORING_OP_STATX:
7211                         if (req->statx.filename)
7212                                 putname(req->statx.filename);
7213                         break;
7214                 }
7215         }
7216         if ((req->flags & REQ_F_POLLED) && req->apoll) {
7217                 kfree(req->apoll->double_poll);
7218                 kfree(req->apoll);
7219                 req->apoll = NULL;
7220         }
7221         if (req->flags & REQ_F_INFLIGHT) {
7222                 struct io_uring_task *tctx = req->task->io_uring;
7223
7224                 atomic_dec(&tctx->inflight_tracked);
7225         }
7226         if (req->flags & REQ_F_CREDS)
7227                 put_cred(req->creds);
7228         if (req->flags & REQ_F_ASYNC_DATA) {
7229                 kfree(req->async_data);
7230                 req->async_data = NULL;
7231         }
7232         req->flags &= ~IO_REQ_CLEAN_FLAGS;
7233 }
7234
7235 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
7236 {
7237         const struct cred *creds = NULL;
7238         int ret;
7239
7240         if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
7241                 creds = override_creds(req->creds);
7242
7243         if (!io_op_defs[req->opcode].audit_skip)
7244                 audit_uring_entry(req->opcode);
7245
7246         switch (req->opcode) {
7247         case IORING_OP_NOP:
7248                 ret = io_nop(req, issue_flags);
7249                 break;
7250         case IORING_OP_READV:
7251         case IORING_OP_READ_FIXED:
7252         case IORING_OP_READ:
7253                 ret = io_read(req, issue_flags);
7254                 break;
7255         case IORING_OP_WRITEV:
7256         case IORING_OP_WRITE_FIXED:
7257         case IORING_OP_WRITE:
7258                 ret = io_write(req, issue_flags);
7259                 break;
7260         case IORING_OP_FSYNC:
7261                 ret = io_fsync(req, issue_flags);
7262                 break;
7263         case IORING_OP_POLL_ADD:
7264                 ret = io_poll_add(req, issue_flags);
7265                 break;
7266         case IORING_OP_POLL_REMOVE:
7267                 ret = io_poll_update(req, issue_flags);
7268                 break;
7269         case IORING_OP_SYNC_FILE_RANGE:
7270                 ret = io_sync_file_range(req, issue_flags);
7271                 break;
7272         case IORING_OP_SENDMSG:
7273                 ret = io_sendmsg(req, issue_flags);
7274                 break;
7275         case IORING_OP_SEND:
7276                 ret = io_send(req, issue_flags);
7277                 break;
7278         case IORING_OP_RECVMSG:
7279                 ret = io_recvmsg(req, issue_flags);
7280                 break;
7281         case IORING_OP_RECV:
7282                 ret = io_recv(req, issue_flags);
7283                 break;
7284         case IORING_OP_TIMEOUT:
7285                 ret = io_timeout(req, issue_flags);
7286                 break;
7287         case IORING_OP_TIMEOUT_REMOVE:
7288                 ret = io_timeout_remove(req, issue_flags);
7289                 break;
7290         case IORING_OP_ACCEPT:
7291                 ret = io_accept(req, issue_flags);
7292                 break;
7293         case IORING_OP_CONNECT:
7294                 ret = io_connect(req, issue_flags);
7295                 break;
7296         case IORING_OP_ASYNC_CANCEL:
7297                 ret = io_async_cancel(req, issue_flags);
7298                 break;
7299         case IORING_OP_FALLOCATE:
7300                 ret = io_fallocate(req, issue_flags);
7301                 break;
7302         case IORING_OP_OPENAT:
7303                 ret = io_openat(req, issue_flags);
7304                 break;
7305         case IORING_OP_CLOSE:
7306                 ret = io_close(req, issue_flags);
7307                 break;
7308         case IORING_OP_FILES_UPDATE:
7309                 ret = io_files_update(req, issue_flags);
7310                 break;
7311         case IORING_OP_STATX:
7312                 ret = io_statx(req, issue_flags);
7313                 break;
7314         case IORING_OP_FADVISE:
7315                 ret = io_fadvise(req, issue_flags);
7316                 break;
7317         case IORING_OP_MADVISE:
7318                 ret = io_madvise(req, issue_flags);
7319                 break;
7320         case IORING_OP_OPENAT2:
7321                 ret = io_openat2(req, issue_flags);
7322                 break;
7323         case IORING_OP_EPOLL_CTL:
7324                 ret = io_epoll_ctl(req, issue_flags);
7325                 break;
7326         case IORING_OP_SPLICE:
7327                 ret = io_splice(req, issue_flags);
7328                 break;
7329         case IORING_OP_PROVIDE_BUFFERS:
7330                 ret = io_provide_buffers(req, issue_flags);
7331                 break;
7332         case IORING_OP_REMOVE_BUFFERS:
7333                 ret = io_remove_buffers(req, issue_flags);
7334                 break;
7335         case IORING_OP_TEE:
7336                 ret = io_tee(req, issue_flags);
7337                 break;
7338         case IORING_OP_SHUTDOWN:
7339                 ret = io_shutdown(req, issue_flags);
7340                 break;
7341         case IORING_OP_RENAMEAT:
7342                 ret = io_renameat(req, issue_flags);
7343                 break;
7344         case IORING_OP_UNLINKAT:
7345                 ret = io_unlinkat(req, issue_flags);
7346                 break;
7347         case IORING_OP_MKDIRAT:
7348                 ret = io_mkdirat(req, issue_flags);
7349                 break;
7350         case IORING_OP_SYMLINKAT:
7351                 ret = io_symlinkat(req, issue_flags);
7352                 break;
7353         case IORING_OP_LINKAT:
7354                 ret = io_linkat(req, issue_flags);
7355                 break;
7356         case IORING_OP_MSG_RING:
7357                 ret = io_msg_ring(req, issue_flags);
7358                 break;
7359         default:
7360                 ret = -EINVAL;
7361                 break;
7362         }
7363
7364         if (!io_op_defs[req->opcode].audit_skip)
7365                 audit_uring_exit(!ret, ret);
7366
7367         if (creds)
7368                 revert_creds(creds);
7369         if (ret)
7370                 return ret;
7371         /* If the op doesn't have a file, we're not polling for it */
7372         if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
7373                 io_iopoll_req_issued(req, issue_flags);
7374
7375         return 0;
7376 }
7377
7378 static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
7379 {
7380         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7381
7382         req = io_put_req_find_next(req);
7383         return req ? &req->work : NULL;
7384 }
7385
7386 static void io_wq_submit_work(struct io_wq_work *work)
7387 {
7388         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7389         unsigned int issue_flags = IO_URING_F_UNLOCKED;
7390         bool needs_poll = false;
7391         struct io_kiocb *timeout;
7392         int ret = 0;
7393
7394         /* one will be dropped by ->io_free_work() after returning to io-wq */
7395         if (!(req->flags & REQ_F_REFCOUNT))
7396                 __io_req_set_refcount(req, 2);
7397         else
7398                 req_ref_get(req);
7399
7400         timeout = io_prep_linked_timeout(req);
7401         if (timeout)
7402                 io_queue_linked_timeout(timeout);
7403
7404         /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
7405         if (work->flags & IO_WQ_WORK_CANCEL) {
7406                 io_req_task_queue_fail(req, -ECANCELED);
7407                 return;
7408         }
7409
7410         if (req->flags & REQ_F_FORCE_ASYNC) {
7411                 const struct io_op_def *def = &io_op_defs[req->opcode];
7412                 bool opcode_poll = def->pollin || def->pollout;
7413
7414                 if (opcode_poll && file_can_poll(req->file)) {
7415                         needs_poll = true;
7416                         issue_flags |= IO_URING_F_NONBLOCK;
7417                 }
7418         }
7419
7420         do {
7421                 ret = io_issue_sqe(req, issue_flags);
7422                 if (ret != -EAGAIN)
7423                         break;
7424                 /*
7425                  * We can get EAGAIN for iopolled IO even though we're
7426                  * forcing a sync submission from here, since we can't
7427                  * wait for request slots on the block side.
7428                  */
7429                 if (!needs_poll) {
7430                         cond_resched();
7431                         continue;
7432                 }
7433
7434                 if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
7435                         return;
7436                 /* aborted or ready, in either case retry blocking */
7437                 needs_poll = false;
7438                 issue_flags &= ~IO_URING_F_NONBLOCK;
7439         } while (1);
7440
7441         /* avoid locking problems by failing it from a clean context */
7442         if (ret)
7443                 io_req_task_queue_fail(req, ret);
7444 }
7445
7446 static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
7447                                                        unsigned i)
7448 {
7449         return &table->files[i];
7450 }
7451
7452 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
7453                                               int index)
7454 {
7455         struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
7456
7457         return (struct file *) (slot->file_ptr & FFS_MASK);
7458 }
7459
7460 static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
7461 {
7462         unsigned long file_ptr = (unsigned long) file;
7463
7464         file_ptr |= io_file_get_flags(file);
7465         file_slot->file_ptr = file_ptr;
7466 }
7467
7468 static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
7469                                              struct io_kiocb *req, int fd)
7470 {
7471         struct file *file;
7472         unsigned long file_ptr;
7473
7474         if (unlikely((unsigned int)fd >= ctx->nr_user_files))
7475                 return NULL;
7476         fd = array_index_nospec(fd, ctx->nr_user_files);
7477         file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
7478         file = (struct file *) (file_ptr & FFS_MASK);
7479         file_ptr &= ~FFS_MASK;
7480         /* mask in overlapping REQ_F and FFS bits */
7481         req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
7482         io_req_set_rsrc_node(req, ctx);
7483         return file;
7484 }
7485
7486 static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
7487                                        struct io_kiocb *req, int fd)
7488 {
7489         struct file *file = fget(fd);
7490
7491         trace_io_uring_file_get(ctx, req, req->user_data, fd);
7492
7493         /* we don't allow fixed io_uring files */
7494         if (file && unlikely(file->f_op == &io_uring_fops))
7495                 io_req_track_inflight(req);
7496         return file;
7497 }
7498
7499 static inline struct file *io_file_get(struct io_ring_ctx *ctx,
7500                                        struct io_kiocb *req, int fd, bool fixed)
7501 {
7502         if (fixed)
7503                 return io_file_get_fixed(ctx, req, fd);
7504         else
7505                 return io_file_get_normal(ctx, req, fd);
7506 }
7507
7508 static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
7509 {
7510         struct io_kiocb *prev = req->timeout.prev;
7511         int ret = -ENOENT;
7512
7513         if (prev) {
7514                 if (!(req->task->flags & PF_EXITING))
7515                         ret = io_try_cancel_userdata(req, prev->user_data);
7516                 io_req_complete_post(req, ret ?: -ETIME, 0);
7517                 io_put_req(prev);
7518         } else {
7519                 io_req_complete_post(req, -ETIME, 0);
7520         }
7521 }
7522
7523 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
7524 {
7525         struct io_timeout_data *data = container_of(timer,
7526                                                 struct io_timeout_data, timer);
7527         struct io_kiocb *prev, *req = data->req;
7528         struct io_ring_ctx *ctx = req->ctx;
7529         unsigned long flags;
7530
7531         spin_lock_irqsave(&ctx->timeout_lock, flags);
7532         prev = req->timeout.head;
7533         req->timeout.head = NULL;
7534
7535         /*
7536          * We don't expect the list to be empty, that will only happen if we
7537          * race with the completion of the linked work.
7538          */
7539         if (prev) {
7540                 io_remove_next_linked(prev);
7541                 if (!req_ref_inc_not_zero(prev))
7542                         prev = NULL;
7543         }
7544         list_del(&req->timeout.list);
7545         req->timeout.prev = prev;
7546         spin_unlock_irqrestore(&ctx->timeout_lock, flags);
7547
7548         req->io_task_work.func = io_req_task_link_timeout;
7549         io_req_task_work_add(req, false);
7550         return HRTIMER_NORESTART;
7551 }
7552
7553 static void io_queue_linked_timeout(struct io_kiocb *req)
7554 {
7555         struct io_ring_ctx *ctx = req->ctx;
7556
7557         spin_lock_irq(&ctx->timeout_lock);
7558         /*
7559          * If the back reference is NULL, then our linked request finished
7560          * before we got a chance to setup the timer
7561          */
7562         if (req->timeout.head) {
7563                 struct io_timeout_data *data = req->async_data;
7564
7565                 data->timer.function = io_link_timeout_fn;
7566                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
7567                                 data->mode);
7568                 list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
7569         }
7570         spin_unlock_irq(&ctx->timeout_lock);
7571         /* drop submission reference */
7572         io_put_req(req);
7573 }
7574
7575 static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
7576         __must_hold(&req->ctx->uring_lock)
7577 {
7578         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
7579
7580         switch (io_arm_poll_handler(req, 0)) {
7581         case IO_APOLL_READY:
7582                 io_req_task_queue(req);
7583                 break;
7584         case IO_APOLL_ABORTED:
7585                 /*
7586                  * Queued up for async execution, worker will release
7587                  * submit reference when the iocb is actually submitted.
7588                  */
7589                 io_queue_async_work(req, NULL);
7590                 break;
7591         case IO_APOLL_OK:
7592                 break;
7593         }
7594
7595         if (linked_timeout)
7596                 io_queue_linked_timeout(linked_timeout);
7597 }
7598
7599 static inline void __io_queue_sqe(struct io_kiocb *req)
7600         __must_hold(&req->ctx->uring_lock)
7601 {
7602         struct io_kiocb *linked_timeout;
7603         int ret;
7604
7605         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
7606
7607         if (req->flags & REQ_F_COMPLETE_INLINE) {
7608                 io_req_add_compl_list(req);
7609                 return;
7610         }
7611         /*
7612          * We async punt it if the file wasn't marked NOWAIT, or if the file
7613          * doesn't support non-blocking read/write attempts
7614          */
7615         if (likely(!ret)) {
7616                 linked_timeout = io_prep_linked_timeout(req);
7617                 if (linked_timeout)
7618                         io_queue_linked_timeout(linked_timeout);
7619         } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
7620                 io_queue_sqe_arm_apoll(req);
7621         } else {
7622                 io_req_complete_failed(req, ret);
7623         }
7624 }
7625
7626 static void io_queue_sqe_fallback(struct io_kiocb *req)
7627         __must_hold(&req->ctx->uring_lock)
7628 {
7629         if (req->flags & REQ_F_FAIL) {
7630                 io_req_complete_fail_submit(req);
7631         } else if (unlikely(req->ctx->drain_active)) {
7632                 io_drain_req(req);
7633         } else {
7634                 int ret = io_req_prep_async(req);
7635
7636                 if (unlikely(ret))
7637                         io_req_complete_failed(req, ret);
7638                 else
7639                         io_queue_async_work(req, NULL);
7640         }
7641 }
7642
7643 static inline void io_queue_sqe(struct io_kiocb *req)
7644         __must_hold(&req->ctx->uring_lock)
7645 {
7646         if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
7647                 __io_queue_sqe(req);
7648         else
7649                 io_queue_sqe_fallback(req);
7650 }
7651
7652 /*
7653  * Check SQE restrictions (opcode and flags).
7654  *
7655  * Returns 'true' if SQE is allowed, 'false' otherwise.
7656  */
7657 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
7658                                         struct io_kiocb *req,
7659                                         unsigned int sqe_flags)
7660 {
7661         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
7662                 return false;
7663
7664         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
7665             ctx->restrictions.sqe_flags_required)
7666                 return false;
7667
7668         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
7669                           ctx->restrictions.sqe_flags_required))
7670                 return false;
7671
7672         return true;
7673 }
7674
7675 static void io_init_req_drain(struct io_kiocb *req)
7676 {
7677         struct io_ring_ctx *ctx = req->ctx;
7678         struct io_kiocb *head = ctx->submit_state.link.head;
7679
7680         ctx->drain_active = true;
7681         if (head) {
7682                 /*
7683                  * If we need to drain a request in the middle of a link, drain
7684                  * the head request and the next request/link after the current
7685                  * link. Considering sequential execution of links,
7686                  * REQ_F_IO_DRAIN will be maintained for every request of our
7687                  * link.
7688                  */
7689                 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
7690                 ctx->drain_next = true;
7691         }
7692 }
7693
7694 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
7695                        const struct io_uring_sqe *sqe)
7696         __must_hold(&ctx->uring_lock)
7697 {
7698         unsigned int sqe_flags;
7699         int personality;
7700         u8 opcode;
7701
7702         /* req is partially pre-initialised, see io_preinit_req() */
7703         req->opcode = opcode = READ_ONCE(sqe->opcode);
7704         /* same numerical values with corresponding REQ_F_*, safe to copy */
7705         req->flags = sqe_flags = READ_ONCE(sqe->flags);
7706         req->user_data = READ_ONCE(sqe->user_data);
7707         req->file = NULL;
7708         req->fixed_rsrc_refs = NULL;
7709         req->task = current;
7710
7711         if (unlikely(opcode >= IORING_OP_LAST)) {
7712                 req->opcode = 0;
7713                 return -EINVAL;
7714         }
7715         if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
7716                 /* enforce forwards compatibility on users */
7717                 if (sqe_flags & ~SQE_VALID_FLAGS)
7718                         return -EINVAL;
7719                 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
7720                     !io_op_defs[opcode].buffer_select)
7721                         return -EOPNOTSUPP;
7722                 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
7723                         ctx->drain_disabled = true;
7724                 if (sqe_flags & IOSQE_IO_DRAIN) {
7725                         if (ctx->drain_disabled)
7726                                 return -EOPNOTSUPP;
7727                         io_init_req_drain(req);
7728                 }
7729         }
7730         if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
7731                 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
7732                         return -EACCES;
7733                 /* knock it to the slow queue path, will be drained there */
7734                 if (ctx->drain_active)
7735                         req->flags |= REQ_F_FORCE_ASYNC;
7736                 /* if there is no link, we're at "next" request and need to drain */
7737                 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
7738                         ctx->drain_next = false;
7739                         ctx->drain_active = true;
7740                         req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
7741                 }
7742         }
7743
7744         if (io_op_defs[opcode].needs_file) {
7745                 struct io_submit_state *state = &ctx->submit_state;
7746
7747                 /*
7748                  * Plug now if we have more than 2 IO left after this, and the
7749                  * target is potentially a read/write to block based storage.
7750                  */
7751                 if (state->need_plug && io_op_defs[opcode].plug) {
7752                         state->plug_started = true;
7753                         state->need_plug = false;
7754                         blk_start_plug_nr_ios(&state->plug, state->submit_nr);
7755                 }
7756
7757                 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
7758                                         (sqe_flags & IOSQE_FIXED_FILE));
7759                 if (unlikely(!req->file))
7760                         return -EBADF;
7761         }
7762
7763         personality = READ_ONCE(sqe->personality);
7764         if (personality) {
7765                 int ret;
7766
7767                 req->creds = xa_load(&ctx->personalities, personality);
7768                 if (!req->creds)
7769                         return -EINVAL;
7770                 get_cred(req->creds);
7771                 ret = security_uring_override_creds(req->creds);
7772                 if (ret) {
7773                         put_cred(req->creds);
7774                         return ret;
7775                 }
7776                 req->flags |= REQ_F_CREDS;
7777         }
7778
7779         return io_req_prep(req, sqe);
7780 }
7781
7782 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
7783                          const struct io_uring_sqe *sqe)
7784         __must_hold(&ctx->uring_lock)
7785 {
7786         struct io_submit_link *link = &ctx->submit_state.link;
7787         int ret;
7788
7789         ret = io_init_req(ctx, req, sqe);
7790         if (unlikely(ret)) {
7791                 trace_io_uring_req_failed(sqe, ctx, req, ret);
7792
7793                 /* fail even hard links since we don't submit */
7794                 if (link->head) {
7795                         /*
7796                          * we can judge a link req is failed or cancelled by if
7797                          * REQ_F_FAIL is set, but the head is an exception since
7798                          * it may be set REQ_F_FAIL because of other req's failure
7799                          * so let's leverage req->result to distinguish if a head
7800                          * is set REQ_F_FAIL because of its failure or other req's
7801                          * failure so that we can set the correct ret code for it.
7802                          * init result here to avoid affecting the normal path.
7803                          */
7804                         if (!(link->head->flags & REQ_F_FAIL))
7805                                 req_fail_link_node(link->head, -ECANCELED);
7806                 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
7807                         /*
7808                          * the current req is a normal req, we should return
7809                          * error and thus break the submittion loop.
7810                          */
7811                         io_req_complete_failed(req, ret);
7812                         return ret;
7813                 }
7814                 req_fail_link_node(req, ret);
7815         }
7816
7817         /* don't need @sqe from now on */
7818         trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
7819                                   req->flags, true,
7820                                   ctx->flags & IORING_SETUP_SQPOLL);
7821
7822         /*
7823          * If we already have a head request, queue this one for async
7824          * submittal once the head completes. If we don't have a head but
7825          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
7826          * submitted sync once the chain is complete. If none of those
7827          * conditions are true (normal request), then just queue it.
7828          */
7829         if (link->head) {
7830                 struct io_kiocb *head = link->head;
7831
7832                 if (!(req->flags & REQ_F_FAIL)) {
7833                         ret = io_req_prep_async(req);
7834                         if (unlikely(ret)) {
7835                                 req_fail_link_node(req, ret);
7836                                 if (!(head->flags & REQ_F_FAIL))
7837                                         req_fail_link_node(head, -ECANCELED);
7838                         }
7839                 }
7840                 trace_io_uring_link(ctx, req, head);
7841                 link->last->link = req;
7842                 link->last = req;
7843
7844                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
7845                         return 0;
7846                 /* last request of a link, enqueue the link */
7847                 link->head = NULL;
7848                 req = head;
7849         } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
7850                 link->head = req;
7851                 link->last = req;
7852                 return 0;
7853         }
7854
7855         io_queue_sqe(req);
7856         return 0;
7857 }
7858
7859 /*
7860  * Batched submission is done, ensure local IO is flushed out.
7861  */
7862 static void io_submit_state_end(struct io_ring_ctx *ctx)
7863 {
7864         struct io_submit_state *state = &ctx->submit_state;
7865
7866         if (state->link.head)
7867                 io_queue_sqe(state->link.head);
7868         /* flush only after queuing links as they can generate completions */
7869         io_submit_flush_completions(ctx);
7870         if (state->plug_started)
7871                 blk_finish_plug(&state->plug);
7872 }
7873
7874 /*
7875  * Start submission side cache.
7876  */
7877 static void io_submit_state_start(struct io_submit_state *state,
7878                                   unsigned int max_ios)
7879 {
7880         state->plug_started = false;
7881         state->need_plug = max_ios > 2;
7882         state->submit_nr = max_ios;
7883         /* set only head, no need to init link_last in advance */
7884         state->link.head = NULL;
7885 }
7886
7887 static void io_commit_sqring(struct io_ring_ctx *ctx)
7888 {
7889         struct io_rings *rings = ctx->rings;
7890
7891         /*
7892          * Ensure any loads from the SQEs are done at this point,
7893          * since once we write the new head, the application could
7894          * write new data to them.
7895          */
7896         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
7897 }
7898
7899 /*
7900  * Fetch an sqe, if one is available. Note this returns a pointer to memory
7901  * that is mapped by userspace. This means that care needs to be taken to
7902  * ensure that reads are stable, as we cannot rely on userspace always
7903  * being a good citizen. If members of the sqe are validated and then later
7904  * used, it's important that those reads are done through READ_ONCE() to
7905  * prevent a re-load down the line.
7906  */
7907 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
7908 {
7909         unsigned head, mask = ctx->sq_entries - 1;
7910         unsigned sq_idx = ctx->cached_sq_head++ & mask;
7911
7912         /*
7913          * The cached sq head (or cq tail) serves two purposes:
7914          *
7915          * 1) allows us to batch the cost of updating the user visible
7916          *    head updates.
7917          * 2) allows the kernel side to track the head on its own, even
7918          *    though the application is the one updating it.
7919          */
7920         head = READ_ONCE(ctx->sq_array[sq_idx]);
7921         if (likely(head < ctx->sq_entries))
7922                 return &ctx->sq_sqes[head];
7923
7924         /* drop invalid entries */
7925         ctx->cq_extra--;
7926         WRITE_ONCE(ctx->rings->sq_dropped,
7927                    READ_ONCE(ctx->rings->sq_dropped) + 1);
7928         return NULL;
7929 }
7930
7931 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
7932         __must_hold(&ctx->uring_lock)
7933 {
7934         unsigned int entries = io_sqring_entries(ctx);
7935         int submitted = 0;
7936
7937         if (unlikely(!entries))
7938                 return 0;
7939         /* make sure SQ entry isn't read before tail */
7940         nr = min3(nr, ctx->sq_entries, entries);
7941         io_get_task_refs(nr);
7942
7943         io_submit_state_start(&ctx->submit_state, nr);
7944         do {
7945                 const struct io_uring_sqe *sqe;
7946                 struct io_kiocb *req;
7947
7948                 if (unlikely(!io_alloc_req_refill(ctx))) {
7949                         if (!submitted)
7950                                 submitted = -EAGAIN;
7951                         break;
7952                 }
7953                 req = io_alloc_req(ctx);
7954                 sqe = io_get_sqe(ctx);
7955                 if (unlikely(!sqe)) {
7956                         wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
7957                         break;
7958                 }
7959                 /* will complete beyond this point, count as submitted */
7960                 submitted++;
7961                 if (io_submit_sqe(ctx, req, sqe)) {
7962                         /*
7963                          * Continue submitting even for sqe failure if the
7964                          * ring was setup with IORING_SETUP_SUBMIT_ALL
7965                          */
7966                         if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
7967                                 break;
7968                 }
7969         } while (submitted < nr);
7970
7971         if (unlikely(submitted != nr)) {
7972                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
7973                 int unused = nr - ref_used;
7974
7975                 current->io_uring->cached_refs += unused;
7976         }
7977
7978         io_submit_state_end(ctx);
7979          /* Commit SQ ring head once we've consumed and submitted all SQEs */
7980         io_commit_sqring(ctx);
7981
7982         return submitted;
7983 }
7984
7985 static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7986 {
7987         return READ_ONCE(sqd->state);
7988 }
7989
7990 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7991 {
7992         /* Tell userspace we may need a wakeup call */
7993         spin_lock(&ctx->completion_lock);
7994         WRITE_ONCE(ctx->rings->sq_flags,
7995                    ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
7996         spin_unlock(&ctx->completion_lock);
7997 }
7998
7999 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
8000 {
8001         spin_lock(&ctx->completion_lock);
8002         WRITE_ONCE(ctx->rings->sq_flags,
8003                    ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
8004         spin_unlock(&ctx->completion_lock);
8005 }
8006
8007 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
8008 {
8009         unsigned int to_submit;
8010         int ret = 0;
8011
8012         to_submit = io_sqring_entries(ctx);
8013         /* if we're handling multiple rings, cap submit size for fairness */
8014         if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
8015                 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
8016
8017         if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
8018                 const struct cred *creds = NULL;
8019
8020                 if (ctx->sq_creds != current_cred())
8021                         creds = override_creds(ctx->sq_creds);
8022
8023                 mutex_lock(&ctx->uring_lock);
8024                 if (!wq_list_empty(&ctx->iopoll_list))
8025                         io_do_iopoll(ctx, true);
8026
8027                 /*
8028                  * Don't submit if refs are dying, good for io_uring_register(),
8029                  * but also it is relied upon by io_ring_exit_work()
8030                  */
8031                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
8032                     !(ctx->flags & IORING_SETUP_R_DISABLED))
8033                         ret = io_submit_sqes(ctx, to_submit);
8034                 mutex_unlock(&ctx->uring_lock);
8035 #ifdef CONFIG_NET_RX_BUSY_POLL
8036                 spin_lock(&ctx->napi_lock);
8037                 if (!list_empty(&ctx->napi_list) &&
8038                     io_napi_busy_loop(&ctx->napi_list))
8039                         ++ret;
8040                 spin_unlock(&ctx->napi_lock);
8041 #endif
8042                 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
8043                         wake_up(&ctx->sqo_sq_wait);
8044                 if (creds)
8045                         revert_creds(creds);
8046         }
8047
8048         return ret;
8049 }
8050
8051 static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
8052 {
8053         struct io_ring_ctx *ctx;
8054         unsigned sq_thread_idle = 0;
8055
8056         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
8057                 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
8058         sqd->sq_thread_idle = sq_thread_idle;
8059 }
8060
8061 static bool io_sqd_handle_event(struct io_sq_data *sqd)
8062 {
8063         bool did_sig = false;
8064         struct ksignal ksig;
8065
8066         if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
8067             signal_pending(current)) {
8068                 mutex_unlock(&sqd->lock);
8069                 if (signal_pending(current))
8070                         did_sig = get_signal(&ksig);
8071                 cond_resched();
8072                 mutex_lock(&sqd->lock);
8073         }
8074         return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
8075 }
8076
/*
 * Main function of the SQ poll thread.
 *
 * @data is the struct io_sq_data shared by every ring served by this
 * thread. The thread loops with sqd->lock held, submitting and polling
 * for each attached ring. Once nothing has happened for sqd->sq_thread_idle
 * jiffies it sets IORING_SQ_NEED_WAKEUP on the rings and sleeps on
 * sqd->wait. The loop ends when io_sqd_handle_event() returns true, after
 * which outstanding requests are cancelled and the thread exits.
 */
static int io_sq_thread(void *data)
{
	struct io_sq_data *sqd = data;
	struct io_ring_ctx *ctx;
	unsigned long timeout = 0;
	char buf[TASK_COMM_LEN];
	DEFINE_WAIT(wait);

	/* name the thread after the pid stored in sqd->task_pid */
	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
	set_task_comm(current, buf);

	/* pin to the requested CPU if one was given, else allow any online CPU */
	if (sqd->sq_cpu != -1)
		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
	else
		set_cpus_allowed_ptr(current, cpu_online_mask);
	current->flags |= PF_NO_SETAFFINITY;

	audit_alloc_kernel(current);

	mutex_lock(&sqd->lock);
	while (1) {
		bool cap_entries, sqt_spin = false;

		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
			if (io_sqd_handle_event(sqd))
				break;
			/* restart the idle period after handling an event */
			timeout = jiffies + sqd->sq_thread_idle;
		}

		/* cap per-ring batches only when serving more than one ring */
		cap_entries = !list_is_singular(&sqd->ctx_list);
		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
			int ret = __io_sq_thread(ctx, cap_entries);

			/* keep spinning if work was done or iopoll is pending */
			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
				sqt_spin = true;
		}
		if (io_run_task_work())
			sqt_spin = true;

		/* stay busy while there was work or the idle period has not expired */
		if (sqt_spin || !time_after(jiffies, timeout)) {
			cond_resched();
			if (sqt_spin)
				timeout = jiffies + sqd->sq_thread_idle;
			continue;
		}

		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
		if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
			bool needs_sched = true;

			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
				io_ring_set_wakeup_flag(ctx);

				/* pending iopoll requests: do not sleep */
				if ((ctx->flags & IORING_SETUP_IOPOLL) &&
				    !wq_list_empty(&ctx->iopoll_list)) {
					needs_sched = false;
					break;
				}

				/*
				 * Ensure the store of the wakeup flag is not
				 * reordered with the load of the SQ tail
				 */
				smp_mb();

				/* new SQEs showed up after the flag was set: do not sleep */
				if (io_sqring_entries(ctx)) {
					needs_sched = false;
					break;
				}
			}

			if (needs_sched) {
				/* sleep without holding sqd->lock */
				mutex_unlock(&sqd->lock);
				schedule();
				mutex_lock(&sqd->lock);
			}
			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
				io_ring_clear_wakeup_flag(ctx);
		}

		finish_wait(&sqd->wait, &wait);
		timeout = jiffies + sqd->sq_thread_idle;
	}

	/* exit path: cancel outstanding requests, then clear the thread pointer */
	io_uring_cancel_generic(true, sqd);
	sqd->thread = NULL;
	/* leave NEED_WAKEUP set on every attached ring */
	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
		io_ring_set_wakeup_flag(ctx);
	io_run_task_work();
	mutex_unlock(&sqd->lock);

	audit_free(current);

	/* signal whoever waits on sqd->exited, then terminate */
	complete(&sqd->exited);
	do_exit(0);
}
8173
/*
 * Per-waiter state for a task waiting on CQ ring events.
 */
struct io_wait_queue {
	/* wait queue entry; io_wake_function() recovers the container from it */
	struct wait_queue_entry wq;
	/* ring being waited on */
	struct io_ring_ctx *ctx;
	/* CQ tail value that io_should_wake() compares ctx->cached_cq_tail against */
	unsigned cq_tail;
	/* value io_should_wake() compares ctx->cq_timeouts against */
	unsigned nr_timeouts;
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* busy-poll budget, in usec as set by io_adjust_busy_loop_timeout() */
	unsigned busy_poll_to;
#endif
};
8183
8184 static inline bool io_should_wake(struct io_wait_queue *iowq)
8185 {
8186         struct io_ring_ctx *ctx = iowq->ctx;
8187         int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
8188
8189         /*
8190          * Wake up if we have enough events, or if a timeout occurred since we
8191          * started waiting. For timeouts, we always want to return to userspace,
8192          * regardless of event count.
8193          */
8194         return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
8195 }
8196
8197 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
8198                             int wake_flags, void *key)
8199 {
8200         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
8201                                                         wq);
8202
8203         /*
8204          * Cannot safely flush overflowed CQEs from here, ensure we wake up
8205          * the task, and the next invocation will do it.
8206          */
8207         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
8208                 return autoremove_wake_function(curr, mode, wake_flags, key);
8209         return -1;
8210 }
8211
8212 static int io_run_task_work_sig(void)
8213 {
8214         if (io_run_task_work())
8215                 return 1;
8216         if (test_thread_flag(TIF_NOTIFY_SIGNAL))
8217                 return -ERESTARTSYS;
8218         if (task_sigpending(current))
8219                 return -EINTR;
8220         return 0;
8221 }
8222
8223 /* when returns >0, the caller should retry */
8224 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
8225                                           struct io_wait_queue *iowq,
8226                                           ktime_t timeout)
8227 {
8228         int ret;
8229
8230         /* make sure we run task_work before checking for signals */
8231         ret = io_run_task_work_sig();
8232         if (ret || io_should_wake(iowq))
8233                 return ret;
8234         /* let the caller flush overflows, retry */
8235         if (test_bit(0, &ctx->check_cq_overflow))
8236                 return 1;
8237
8238         if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
8239                 return -ETIME;
8240         return 1;
8241 }
8242
8243 #ifdef CONFIG_NET_RX_BUSY_POLL
8244 static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
8245                                         struct io_wait_queue *iowq)
8246 {
8247         unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
8248         struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to);
8249
8250         if (timespec64_compare(ts, &pollto) > 0) {
8251                 *ts = timespec64_sub(*ts, pollto);
8252                 iowq->busy_poll_to = busy_poll_to;
8253         } else {
8254                 u64 to = timespec64_to_ns(ts);
8255
8256                 do_div(to, 1000);
8257                 iowq->busy_poll_to = to;
8258                 ts->tv_sec = 0;
8259                 ts->tv_nsec = 0;
8260         }
8261 }
8262
8263 static inline bool io_busy_loop_timeout(unsigned long start_time,
8264                                         unsigned long bp_usec)
8265 {
8266         if (bp_usec) {
8267                 unsigned long end_time = start_time + bp_usec;
8268                 unsigned long now = busy_loop_current_time();
8269
8270                 return time_after(now, end_time);
8271         }
8272         return true;
8273 }
8274
8275 static bool io_busy_loop_end(void *p, unsigned long start_time)
8276 {
8277         struct io_wait_queue *iowq = p;
8278
8279         return signal_pending(current) ||
8280                io_should_wake(iowq) ||
8281                io_busy_loop_timeout(start_time, iowq->busy_poll_to);
8282 }
8283
8284 static void io_blocking_napi_busy_loop(struct list_head *napi_list,
8285                                        struct io_wait_queue *iowq)
8286 {
8287         unsigned long start_time =
8288                 list_is_singular(napi_list) ? 0 :
8289                 busy_loop_current_time();
8290
8291         do {
8292                 if (list_is_singular(napi_list)) {
8293                         struct napi_entry *ne =
8294                                 list_first_entry(napi_list,
8295                                                  struct napi_entry, list);
8296
8297                         napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq,
8298                                        true, BUSY_POLL_BUDGET);
8299                         io_check_napi_entry_timeout(ne);
8300                         break;
8301                 }
8302         } while (io_napi_busy_loop(napi_list) &&
8303                  !io_busy_loop_end(iowq, start_time));
8304 }
8305
8306 static void io_putback_napi_list(struct io_ring_ctx *ctx,
8307                                  struct list_head *napi_list)
8308 {
8309         struct napi_entry *cne, *lne;
8310
8311         spin_lock(&ctx->napi_lock);
8312         list_for_each_entry(cne, &ctx->napi_list, list)
8313                 list_for_each_entry(lne, napi_list, list)
8314                         if (cne->napi_id == lne->napi_id) {
8315                                 list_del(&lne->list);
8316                                 kfree(lne);
8317                                 break;
8318                         }
8319         list_splice(napi_list, &ctx->napi_list);
8320         spin_unlock(&ctx->napi_lock);
8321 }
8322 #endif /* CONFIG_NET_RX_BUSY_POLL */
8323
8324 /*
8325  * Wait until events become available, if we don't already have some. The
8326  * application must reap them itself, as they reside on the shared cq ring.
8327  */
8328 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
8329                           const sigset_t __user *sig, size_t sigsz,
8330                           struct __kernel_timespec __user *uts)
8331 {
8332         struct io_wait_queue iowq;
8333         struct io_rings *rings = ctx->rings;
8334         ktime_t timeout = KTIME_MAX;
8335         int ret;
8336 #ifdef CONFIG_NET_RX_BUSY_POLL
8337         LIST_HEAD(local_napi_list);
8338 #endif
8339
8340         do {
8341                 io_cqring_overflow_flush(ctx);
8342                 if (io_cqring_events(ctx) >= min_events)
8343                         return 0;
8344                 if (!io_run_task_work())
8345                         break;
8346         } while (1);
8347
8348         if (sig) {
8349 #ifdef CONFIG_COMPAT
8350                 if (in_compat_syscall())
8351                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
8352                                                       sigsz);
8353                 else
8354 #endif
8355                         ret = set_user_sigmask(sig, sigsz);
8356
8357                 if (ret)
8358                         return ret;
8359         }
8360
8361 #ifdef CONFIG_NET_RX_BUSY_POLL
8362         iowq.busy_poll_to = 0;
8363         if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
8364                 spin_lock(&ctx->napi_lock);
8365                 list_splice_init(&ctx->napi_list, &local_napi_list);
8366                 spin_unlock(&ctx->napi_lock);
8367         }
8368 #endif
8369         if (uts) {
8370                 struct timespec64 ts;
8371
8372                 if (get_timespec64(&ts, uts))
8373                         return -EFAULT;
8374 #ifdef CONFIG_NET_RX_BUSY_POLL
8375                 if (!list_empty(&local_napi_list))
8376                         io_adjust_busy_loop_timeout(&ts, &iowq);
8377 #endif
8378                 timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
8379         }
8380 #ifdef CONFIG_NET_RX_BUSY_POLL
8381         else if (!list_empty(&local_napi_list))
8382                 iowq.busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
8383 #endif
8384
8385         init_waitqueue_func_entry(&iowq.wq, io_wake_function);
8386         iowq.wq.private = current;
8387         INIT_LIST_HEAD(&iowq.wq.entry);
8388         iowq.ctx = ctx;
8389         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
8390         iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
8391
8392         trace_io_uring_cqring_wait(ctx, min_events);
8393 #ifdef CONFIG_NET_RX_BUSY_POLL
8394         if (iowq.busy_poll_to)
8395                 io_blocking_napi_busy_loop(&local_napi_list, &iowq);
8396         if (!list_empty(&local_napi_list))
8397                 io_putback_napi_list(ctx, &local_napi_list);
8398 #endif
8399         do {
8400                 /* if we can't even flush overflow, don't wait for more */
8401                 if (!io_cqring_overflow_flush(ctx)) {
8402                         ret = -EBUSY;
8403                         break;
8404                 }
8405                 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
8406                                                 TASK_INTERRUPTIBLE);
8407                 ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
8408                 finish_wait(&ctx->cq_wait, &iowq.wq);
8409                 cond_resched();
8410         } while (ret > 0);
8411
8412         restore_saved_sigmask_unless(ret == -EINTR);
8413
8414         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
8415 }
8416
8417 static void io_free_page_table(void **table, size_t size)
8418 {
8419         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
8420
8421         for (i = 0; i < nr_tables; i++)
8422                 kfree(table[i]);
8423         kfree(table);
8424 }
8425
8426 static __cold void **io_alloc_page_table(size_t size)
8427 {
8428         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
8429         size_t init_size = size;
8430         void **table;
8431
8432         table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
8433         if (!table)
8434                 return NULL;
8435
8436         for (i = 0; i < nr_tables; i++) {
8437                 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
8438
8439                 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
8440                 if (!table[i]) {
8441                         io_free_page_table(table, init_size);
8442                         return NULL;
8443                 }
8444                 size -= this_size;
8445         }
8446         return table;
8447 }
8448
8449 static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
8450 {
8451         percpu_ref_exit(&ref_node->refs);
8452         kfree(ref_node);
8453 }
8454
8455 static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
8456 {
8457         struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
8458         struct io_ring_ctx *ctx = node->rsrc_data->ctx;
8459         unsigned long flags;
8460         bool first_add = false;
8461         unsigned long delay = HZ;
8462
8463         spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
8464         node->done = true;
8465
8466         /* if we are mid-quiesce then do not delay */
8467         if (node->rsrc_data->quiesce)
8468                 delay = 0;
8469
8470         while (!list_empty(&ctx->rsrc_ref_list)) {
8471                 node = list_first_entry(&ctx->rsrc_ref_list,
8472                                             struct io_rsrc_node, node);
8473                 /* recycle ref nodes in order */
8474                 if (!node->done)
8475                         break;
8476                 list_del(&node->node);
8477                 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
8478         }
8479         spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
8480
8481         if (first_add)
8482                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
8483 }
8484
8485 static struct io_rsrc_node *io_rsrc_node_alloc(void)
8486 {
8487         struct io_rsrc_node *ref_node;
8488
8489         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
8490         if (!ref_node)
8491                 return NULL;
8492
8493         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
8494                             0, GFP_KERNEL)) {
8495                 kfree(ref_node);
8496                 return NULL;
8497         }
8498         INIT_LIST_HEAD(&ref_node->node);
8499         INIT_LIST_HEAD(&ref_node->rsrc_list);
8500         ref_node->done = false;
8501         return ref_node;
8502 }
8503
8504 static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
8505                                 struct io_rsrc_data *data_to_kill)
8506         __must_hold(&ctx->uring_lock)
8507 {
8508         WARN_ON_ONCE(!ctx->rsrc_backup_node);
8509         WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
8510
8511         io_rsrc_refs_drop(ctx);
8512
8513         if (data_to_kill) {
8514                 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
8515
8516                 rsrc_node->rsrc_data = data_to_kill;
8517                 spin_lock_irq(&ctx->rsrc_ref_lock);
8518                 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
8519                 spin_unlock_irq(&ctx->rsrc_ref_lock);
8520
8521                 atomic_inc(&data_to_kill->refs);
8522                 percpu_ref_kill(&rsrc_node->refs);
8523                 ctx->rsrc_node = NULL;
8524         }
8525
8526         if (!ctx->rsrc_node) {
8527                 ctx->rsrc_node = ctx->rsrc_backup_node;
8528                 ctx->rsrc_backup_node = NULL;
8529         }
8530 }
8531
8532 static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
8533 {
8534         if (ctx->rsrc_backup_node)
8535                 return 0;
8536         ctx->rsrc_backup_node = io_rsrc_node_alloc();
8537         return ctx->rsrc_backup_node ? 0 : -ENOMEM;
8538 }
8539
/*
 * Wait for the references on @data to drain.
 *
 * Entered with ctx->uring_lock held. The lock is dropped while waiting and
 * is held again on every return path. data->quiesce is set for the duration
 * so that a second caller backs off.
 *
 * Returns 0 once data->refs has dropped to zero, -ENXIO if a quiesce is
 * already in progress on @data, or a negative error from
 * io_rsrc_node_switch_start() or io_run_task_work_sig().
 */
8540 static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
8541                                       struct io_ring_ctx *ctx)
8542 {
8543         int ret;
8544
8545         /* As we may drop ->uring_lock, other task may have started quiesce */
8546         if (data->quiesce)
8547                 return -ENXIO;
8548
8549         data->quiesce = true;
8550         do {
8551                 ret = io_rsrc_node_switch_start(ctx);
8552                 if (ret)
8553                         break;
                     /* retire the current node on behalf of @data */
8554                 io_rsrc_node_switch(ctx, data);
8555
8556                 /* kill initial ref, already quiesced if zero */
8557                 if (atomic_dec_and_test(&data->refs))
8558                         break;
                     /* sleep for data->done without holding the ring lock */
8559                 mutex_unlock(&ctx->uring_lock);
8560                 flush_delayed_work(&ctx->rsrc_put_work);
8561                 ret = wait_for_completion_interruptible(&data->done);
8562                 if (!ret) {
8563                         mutex_lock(&ctx->uring_lock);
8564                         if (atomic_read(&data->refs) > 0) {
8565                                 /*
8566                                  * it has been revived by another thread while
8567                                  * we were unlocked
8568                                  */
8569                                 mutex_unlock(&ctx->uring_lock);
8570                         } else {
8571                                 break;
8572                         }
8573                 }
8574
                     /*
                      * Interrupted or revived: take back the ref dropped
                      * above and reset the completion before retrying.
                      */
8575                 atomic_inc(&data->refs);
8576                 /* wait for all works potentially completing data->done */
8577                 flush_delayed_work(&ctx->rsrc_put_work);
8578                 reinit_completion(&data->done);
8579
                     /* a negative result here ends the loop with that error */
8580                 ret = io_run_task_work_sig();
8581                 mutex_lock(&ctx->uring_lock);
8582         } while (ret >= 0);
8583         data->quiesce = false;
8584
8585         return ret;
8586 }
8587
8588 static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
8589 {
8590         unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
8591         unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
8592
8593         return &data->tags[table_idx][off];
8594 }
8595
8596 static void io_rsrc_data_free(struct io_rsrc_data *data)
8597 {
8598         size_t size = data->nr * sizeof(data->tags[0][0]);
8599
8600         if (data->tags)
8601                 io_free_page_table((void **)data->tags, size);
8602         kfree(data);
8603 }
8604
8605 static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
8606                                      u64 __user *utags, unsigned nr,
8607                                      struct io_rsrc_data **pdata)
8608 {
8609         struct io_rsrc_data *data;
8610         int ret = -ENOMEM;
8611         unsigned i;
8612
8613         data = kzalloc(sizeof(*data), GFP_KERNEL);
8614         if (!data)
8615                 return -ENOMEM;
8616         data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
8617         if (!data->tags) {
8618                 kfree(data);
8619                 return -ENOMEM;
8620         }
8621
8622         data->nr = nr;
8623         data->ctx = ctx;
8624         data->do_put = do_put;
8625         if (utags) {
8626                 ret = -EFAULT;
8627                 for (i = 0; i < nr; i++) {
8628                         u64 *tag_slot = io_get_tag_slot(data, i);
8629
8630                         if (copy_from_user(tag_slot, &utags[i],
8631                                            sizeof(*tag_slot)))
8632                                 goto fail;
8633                 }
8634         }
8635
8636         atomic_set(&data->refs, 1);
8637         init_completion(&data->done);
8638         *pdata = data;
8639         return 0;
8640 fail:
8641         io_rsrc_data_free(data);
8642         return ret;
8643 }
8644
8645 static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
8646 {
8647         table->files = kvcalloc(nr_files, sizeof(table->files[0]),
8648                                 GFP_KERNEL_ACCOUNT);
8649         return !!table->files;
8650 }
8651
8652 static void io_free_file_tables(struct io_file_table *table)
8653 {
8654         kvfree(table->files);
8655         table->files = NULL;
8656 }
8657
8658 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
8659 {
8660 #if defined(CONFIG_UNIX)
8661         if (ctx->ring_sock) {
8662                 struct sock *sock = ctx->ring_sock->sk;
8663                 struct sk_buff *skb;
8664
8665                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
8666                         kfree_skb(skb);
8667         }
8668 #else
8669         int i;
8670
8671         for (i = 0; i < ctx->nr_user_files; i++) {
8672                 struct file *file;
8673
8674                 file = io_file_from_index(ctx, i);
8675                 if (file)
8676                         fput(file);
8677         }
8678 #endif
8679         io_free_file_tables(&ctx->file_table);
8680         io_rsrc_data_free(ctx->file_data);
8681         ctx->file_data = NULL;
8682         ctx->nr_user_files = 0;
8683 }
8684
8685 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
8686 {
8687         int ret;
8688
8689         if (!ctx->file_data)
8690                 return -ENXIO;
8691         ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
8692         if (!ret)
8693                 __io_sqe_files_unregister(ctx);
8694         return ret;
8695 }
8696
8697 static void io_sq_thread_unpark(struct io_sq_data *sqd)
8698         __releases(&sqd->lock)
8699 {
8700         WARN_ON_ONCE(sqd->thread == current);
8701
8702         /*
8703          * Do the dance but not conditional clear_bit() because it'd race with
8704          * other threads incrementing park_pending and setting the bit.
8705          */
8706         clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
8707         if (atomic_dec_return(&sqd->park_pending))
8708                 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
8709         mutex_unlock(&sqd->lock);
8710 }
8711
8712 static void io_sq_thread_park(struct io_sq_data *sqd)
8713         __acquires(&sqd->lock)
8714 {
8715         WARN_ON_ONCE(sqd->thread == current);
8716
8717         atomic_inc(&sqd->park_pending);
8718         set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
8719         mutex_lock(&sqd->lock);
8720         if (sqd->thread)
8721                 wake_up_process(sqd->thread);
8722 }
8723
8724 static void io_sq_thread_stop(struct io_sq_data *sqd)
8725 {
8726         WARN_ON_ONCE(sqd->thread == current);
8727         WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
8728
8729         set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
8730         mutex_lock(&sqd->lock);
8731         if (sqd->thread)
8732                 wake_up_process(sqd->thread);
8733         mutex_unlock(&sqd->lock);
8734         wait_for_completion(&sqd->exited);
8735 }
8736
8737 static void io_put_sq_data(struct io_sq_data *sqd)
8738 {
8739         if (refcount_dec_and_test(&sqd->refs)) {
8740                 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
8741
8742                 io_sq_thread_stop(sqd);
8743                 kfree(sqd);
8744         }
8745 }
8746
8747 static void io_sq_thread_finish(struct io_ring_ctx *ctx)
8748 {
8749         struct io_sq_data *sqd = ctx->sq_data;
8750
8751         if (sqd) {
8752                 io_sq_thread_park(sqd);
8753                 list_del_init(&ctx->sqd_list);
8754                 io_sqd_update_thread_idle(sqd);
8755                 io_sq_thread_unpark(sqd);
8756
8757                 io_put_sq_data(sqd);
8758                 ctx->sq_data = NULL;
8759         }
8760 }
8761
8762 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
8763 {
8764         struct io_ring_ctx *ctx_attach;
8765         struct io_sq_data *sqd;
8766         struct fd f;
8767
8768         f = fdget(p->wq_fd);
8769         if (!f.file)
8770                 return ERR_PTR(-ENXIO);
8771         if (f.file->f_op != &io_uring_fops) {
8772                 fdput(f);
8773                 return ERR_PTR(-EINVAL);
8774         }
8775
8776         ctx_attach = f.file->private_data;
8777         sqd = ctx_attach->sq_data;
8778         if (!sqd) {
8779                 fdput(f);
8780                 return ERR_PTR(-EINVAL);
8781         }
8782         if (sqd->task_tgid != current->tgid) {
8783                 fdput(f);
8784                 return ERR_PTR(-EPERM);
8785         }
8786
8787         refcount_inc(&sqd->refs);
8788         fdput(f);
8789         return sqd;
8790 }
8791
8792 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
8793                                          bool *attached)
8794 {
8795         struct io_sq_data *sqd;
8796
8797         *attached = false;
8798         if (p->flags & IORING_SETUP_ATTACH_WQ) {
8799                 sqd = io_attach_sq_data(p);
8800                 if (!IS_ERR(sqd)) {
8801                         *attached = true;
8802                         return sqd;
8803                 }
8804                 /* fall through for EPERM case, setup new sqd/task */
8805                 if (PTR_ERR(sqd) != -EPERM)
8806                         return sqd;
8807         }
8808
8809         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
8810         if (!sqd)
8811                 return ERR_PTR(-ENOMEM);
8812
8813         atomic_set(&sqd->park_pending, 0);
8814         refcount_set(&sqd->refs, 1);
8815         INIT_LIST_HEAD(&sqd->ctx_list);
8816         mutex_init(&sqd->lock);
8817         init_waitqueue_head(&sqd->wait);
8818         init_completion(&sqd->exited);
8819         return sqd;
8820 }
8821
8822 #if defined(CONFIG_UNIX)
8823 /*
8824  * Ensure the UNIX gc is aware of our file set, so we are certain that
8825  * the io_uring can be safely unregistered on process exit, even if we have
8826  * loops in the file referencing.
8827  */
8828 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
8829 {
8830         struct sock *sk = ctx->ring_sock->sk;
8831         struct scm_fp_list *fpl;
8832         struct sk_buff *skb;
8833         int i, nr_files;
8834
8835         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
8836         if (!fpl)
8837                 return -ENOMEM;
8838
8839         skb = alloc_skb(0, GFP_KERNEL);
8840         if (!skb) {
8841                 kfree(fpl);
8842                 return -ENOMEM;
8843         }
8844
8845         skb->sk = sk;
8846
8847         nr_files = 0;
8848         fpl->user = get_uid(current_user());
8849         for (i = 0; i < nr; i++) {
8850                 struct file *file = io_file_from_index(ctx, i + offset);
8851
8852                 if (!file)
8853                         continue;
8854                 fpl->fp[nr_files] = get_file(file);
8855                 unix_inflight(fpl->user, fpl->fp[nr_files]);
8856                 nr_files++;
8857         }
8858
8859         if (nr_files) {
8860                 fpl->max = SCM_MAX_FD;
8861                 fpl->count = nr_files;
8862                 UNIXCB(skb).fp = fpl;
8863                 skb->destructor = unix_destruct_scm;
8864                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
8865                 skb_queue_head(&sk->sk_receive_queue, skb);
8866
8867                 for (i = 0; i < nr_files; i++)
8868                         fput(fpl->fp[i]);
8869         } else {
8870                 kfree_skb(skb);
8871                 free_uid(fpl->user);
8872                 kfree(fpl);
8873         }
8874
8875         return 0;
8876 }
8877
8878 /*
8879  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
8880  * causes regular reference counting to break down. We rely on the UNIX
8881  * garbage collection to take care of this problem for us.
8882  */
8883 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8884 {
8885         unsigned left, total;
8886         int ret = 0;
8887
8888         total = 0;
8889         left = ctx->nr_user_files;
8890         while (left) {
8891                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
8892
8893                 ret = __io_sqe_files_scm(ctx, this_files, total);
8894                 if (ret)
8895                         break;
8896                 left -= this_files;
8897                 total += this_files;
8898         }
8899
8900         if (!ret)
8901                 return 0;
8902
8903         while (total < ctx->nr_user_files) {
8904                 struct file *file = io_file_from_index(ctx, total);
8905
8906                 if (file)
8907                         fput(file);
8908                 total++;
8909         }
8910
8911         return ret;
8912 }
8913 #else
/* Without CONFIG_UNIX there is no gc to inform; always succeeds. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	int ret = 0;

	return ret;
}
8918 #endif
8919
/*
 * Release the registered file carried by @prsrc.
 *
 * With CONFIG_UNIX the file is searched for in the SCM_RIGHTS file lists of
 * the skbs queued on the ring socket's receive queue. When found it is
 * removed from that list, taken out of inflight accounting and put. Without
 * CONFIG_UNIX this is a plain fput().
 */
8920 static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8921 {
8922         struct file *file = prsrc->file;
8923 #if defined(CONFIG_UNIX)
8924         struct sock *sock = ctx->ring_sock->sk;
8925         struct sk_buff_head list, *head = &sock->sk_receive_queue;
8926         struct sk_buff *skb;
8927         int i;
8928
             /* private list for skbs taken off the receive queue */
8929         __skb_queue_head_init(&list);
8930
8931         /*
8932          * Find the skb that holds this file in its SCM_RIGHTS. When found,
8933          * remove this entry and rearrange the file array.
8934          */
8935         skb = skb_dequeue(head);
8936         while (skb) {
8937                 struct scm_fp_list *fp;
8938
8939                 fp = UNIXCB(skb).fp;
8940                 for (i = 0; i < fp->count; i++) {
8941                         int left;
8942
8943                         if (fp->fp[i] != file)
8944                                 continue;
8945
8946                         unix_notinflight(fp->user, fp->fp[i]);
                             /* close the gap left by the removed entry */
8947                         left = fp->count - 1 - i;
8948                         if (left) {
8949                                 memmove(&fp->fp[i], &fp->fp[i + 1],
8950                                                 left * sizeof(struct file *));
8951                         }
8952                         fp->count--;
                             /* an skb with no files left is freed, not kept */
8953                         if (!fp->count) {
8954                                 kfree_skb(skb);
8955                                 skb = NULL;
8956                         } else {
8957                                 __skb_queue_tail(&list, skb);
8958                         }
8959                         fput(file);
                             /* file == NULL marks "found" for the outer loop */
8960                         file = NULL;
8961                         break;
8962                 }
8963
8964                 if (!file)
8965                         break;
8966
                     /* not in this skb: park it and look at the next one */
8967                 __skb_queue_tail(&list, skb);
8968
8969                 skb = skb_dequeue(head);
8970         }
8971
             /* put the parked skbs back at the tail of the receive queue */
8972         if (skb_peek(&list)) {
8973                 spin_lock_irq(&head->lock);
8974                 while ((skb = __skb_dequeue(&list)) != NULL)
8975                         __skb_queue_tail(head, skb);
8976                 spin_unlock_irq(&head->lock);
8977         }
8978 #else
8979         fput(file);
8980 #endif
8981 }
8982
8983 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
8984 {
8985         struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
8986         struct io_ring_ctx *ctx = rsrc_data->ctx;
8987         struct io_rsrc_put *prsrc, *tmp;
8988
8989         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8990                 list_del(&prsrc->list);
8991
8992                 if (prsrc->tag) {
8993                         bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
8994
8995                         io_ring_submit_lock(ctx, lock_ring);
8996                         spin_lock(&ctx->completion_lock);
8997                         io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
8998                         io_commit_cqring(ctx);
8999                         spin_unlock(&ctx->completion_lock);
9000                         io_cqring_ev_posted(ctx);
9001                         io_ring_submit_unlock(ctx, lock_ring);
9002                 }
9003
9004                 rsrc_data->do_put(ctx, prsrc);
9005                 kfree(prsrc);
9006         }
9007
9008         io_rsrc_node_destroy(ref_node);
9009         if (atomic_dec_and_test(&rsrc_data->refs))
9010                 complete(&rsrc_data->done);
9011 }
9012
9013 static void io_rsrc_put_work(struct work_struct *work)
9014 {
9015         struct io_ring_ctx *ctx;
9016         struct llist_node *node;
9017
9018         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
9019         node = llist_del_all(&ctx->rsrc_put_llist);
9020
9021         while (node) {
9022                 struct io_rsrc_node *ref_node;
9023                 struct llist_node *next = node->next;
9024
9025                 ref_node = llist_entry(node, struct io_rsrc_node, llist);
9026                 __io_rsrc_put_work(ref_node);
9027                 node = next;
9028         }
9029 }
9030
9031 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
9032                                  unsigned nr_args, u64 __user *tags)
9033 {
9034         __s32 __user *fds = (__s32 __user *) arg;
9035         struct file *file;
9036         int fd, ret;
9037         unsigned i;
9038
9039         if (ctx->file_data)
9040                 return -EBUSY;
9041         if (!nr_args)
9042                 return -EINVAL;
9043         if (nr_args > IORING_MAX_FIXED_FILES)
9044                 return -EMFILE;
9045         if (nr_args > rlimit(RLIMIT_NOFILE))
9046                 return -EMFILE;
9047         ret = io_rsrc_node_switch_start(ctx);
9048         if (ret)
9049                 return ret;
9050         ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
9051                                  &ctx->file_data);
9052         if (ret)
9053                 return ret;
9054
9055         ret = -ENOMEM;
9056         if (!io_alloc_file_tables(&ctx->file_table, nr_args))
9057                 goto out_free;
9058
9059         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
9060                 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
9061                         ret = -EFAULT;
9062                         goto out_fput;
9063                 }
9064                 /* allow sparse sets */
9065                 if (fd == -1) {
9066                         ret = -EINVAL;
9067                         if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
9068                                 goto out_fput;
9069                         continue;
9070                 }
9071
9072                 file = fget(fd);
9073                 ret = -EBADF;
9074                 if (unlikely(!file))
9075                         goto out_fput;
9076
9077                 /*
9078                  * Don't allow io_uring instances to be registered. If UNIX
9079                  * isn't enabled, then this causes a reference cycle and this
9080                  * instance can never get freed. If UNIX is enabled we'll
9081                  * handle it just fine, but there's still no point in allowing
9082                  * a ring fd as it doesn't support regular read/write anyway.
9083                  */
9084                 if (file->f_op == &io_uring_fops) {
9085                         fput(file);
9086                         goto out_fput;
9087                 }
9088                 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
9089         }
9090
9091         ret = io_sqe_files_scm(ctx);
9092         if (ret) {
9093                 __io_sqe_files_unregister(ctx);
9094                 return ret;
9095         }
9096
9097         io_rsrc_node_switch(ctx, NULL);
9098         return ret;
9099 out_fput:
9100         for (i = 0; i < ctx->nr_user_files; i++) {
9101                 file = io_file_from_index(ctx, i);
9102                 if (file)
9103                         fput(file);
9104         }
9105         io_free_file_tables(&ctx->file_table);
9106         ctx->nr_user_files = 0;
9107 out_free:
9108         io_rsrc_data_free(ctx->file_data);
9109         ctx->file_data = NULL;
9110         return ret;
9111 }
9112
/*
 * Account a newly installed fixed file with the ring's UNIX socket.
 *
 * With CONFIG_UNIX, first try to append @file to the SCM_RIGHTS file list of
 * the skb at the head of the ring socket's receive queue. If there is no such
 * skb, or its list already holds SCM_MAX_FD entries, defer to
 * __io_sqe_files_scm() for slot @index. Without CONFIG_UNIX this is a no-op
 * that returns 0.
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sk_buff_head *queue = &ctx->ring_sock->sk->sk_receive_queue;
	struct sk_buff *skb;
	bool merged = false;

	spin_lock_irq(&queue->lock);
	skb = skb_peek(queue);
	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		/*
		 * Take the skb off the queue while the file list is being
		 * extended outside the lock, then put it back at the head.
		 */
		__skb_unlink(skb, queue);
		spin_unlock_irq(&queue->lock);

		fpl->fp[fpl->count] = get_file(file);
		unix_inflight(fpl->user, fpl->fp[fpl->count]);
		fpl->count++;

		spin_lock_irq(&queue->lock);
		__skb_queue_head(queue, skb);
		merged = true;
	}
	spin_unlock_irq(&queue->lock);

	/* no skb, or no room left in it: fall back to a fresh one */
	if (!merged)
		return __io_sqe_files_scm(ctx, 1, index);

	/* drop one reference now that the fp list took its own via get_file() */
	fput(file);
	return 0;
#else
	return 0;
#endif
}
9155
9156 static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
9157                                  struct io_rsrc_node *node, void *rsrc)
9158 {
9159         struct io_rsrc_put *prsrc;
9160
9161         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
9162         if (!prsrc)
9163                 return -ENOMEM;
9164
9165         prsrc->tag = *io_get_tag_slot(data, idx);
9166         prsrc->rsrc = rsrc;
9167         list_add(&prsrc->list, &node->rsrc_list);
9168         return 0;
9169 }
9170
9171 static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
9172                                  unsigned int issue_flags, u32 slot_index)
9173 {
9174         struct io_ring_ctx *ctx = req->ctx;
9175         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
9176         bool needs_switch = false;
9177         struct io_fixed_file *file_slot;
9178         int ret = -EBADF;
9179
9180         io_ring_submit_lock(ctx, needs_lock);
9181         if (file->f_op == &io_uring_fops)
9182                 goto err;
9183         ret = -ENXIO;
9184         if (!ctx->file_data)
9185                 goto err;
9186         ret = -EINVAL;
9187         if (slot_index >= ctx->nr_user_files)
9188                 goto err;
9189
9190         slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
9191         file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
9192
9193         if (file_slot->file_ptr) {
9194                 struct file *old_file;
9195
9196                 ret = io_rsrc_node_switch_start(ctx);
9197                 if (ret)
9198                         goto err;
9199
9200                 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
9201                 ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
9202                                             ctx->rsrc_node, old_file);
9203                 if (ret)
9204                         goto err;
9205                 file_slot->file_ptr = 0;
9206                 needs_switch = true;
9207         }
9208
9209         *io_get_tag_slot(ctx->file_data, slot_index) = 0;
9210         io_fixed_file_set(file_slot, file);
9211         ret = io_sqe_file_register(ctx, file, slot_index);
9212         if (ret) {
9213                 file_slot->file_ptr = 0;
9214                 goto err;
9215         }
9216
9217         ret = 0;
9218 err:
9219         if (needs_switch)
9220                 io_rsrc_node_switch(ctx, ctx->file_data);
9221         io_ring_submit_unlock(ctx, needs_lock);
9222         if (ret)
9223                 fput(file);
9224         return ret;
9225 }
9226
9227 static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
9228 {
9229         unsigned int offset = req->close.file_slot - 1;
9230         struct io_ring_ctx *ctx = req->ctx;
9231         bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
9232         struct io_fixed_file *file_slot;
9233         struct file *file;
9234         int ret, i;
9235
9236         io_ring_submit_lock(ctx, needs_lock);
9237         ret = -ENXIO;
9238         if (unlikely(!ctx->file_data))
9239                 goto out;
9240         ret = -EINVAL;
9241         if (offset >= ctx->nr_user_files)
9242                 goto out;
9243         ret = io_rsrc_node_switch_start(ctx);
9244         if (ret)
9245                 goto out;
9246
9247         i = array_index_nospec(offset, ctx->nr_user_files);
9248         file_slot = io_fixed_file_slot(&ctx->file_table, i);
9249         ret = -EBADF;
9250         if (!file_slot->file_ptr)
9251                 goto out;
9252
9253         file = (struct file *)(file_slot->file_ptr & FFS_MASK);
9254         ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
9255         if (ret)
9256                 goto out;
9257
9258         file_slot->file_ptr = 0;
9259         io_rsrc_node_switch(ctx, ctx->file_data);
9260         ret = 0;
9261 out:
9262         io_ring_submit_unlock(ctx, needs_lock);
9263         return ret;
9264 }
9265
9266 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
9267                                  struct io_uring_rsrc_update2 *up,
9268                                  unsigned nr_args)
9269 {
9270         u64 __user *tags = u64_to_user_ptr(up->tags);
9271         __s32 __user *fds = u64_to_user_ptr(up->data);
9272         struct io_rsrc_data *data = ctx->file_data;
9273         struct io_fixed_file *file_slot;
9274         struct file *file;
9275         int fd, i, err = 0;
9276         unsigned int done;
9277         bool needs_switch = false;
9278
9279         if (!ctx->file_data)
9280                 return -ENXIO;
9281         if (up->offset + nr_args > ctx->nr_user_files)
9282                 return -EINVAL;
9283
9284         for (done = 0; done < nr_args; done++) {
9285                 u64 tag = 0;
9286
9287                 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
9288                     copy_from_user(&fd, &fds[done], sizeof(fd))) {
9289                         err = -EFAULT;
9290                         break;
9291                 }
9292                 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
9293                         err = -EINVAL;
9294                         break;
9295                 }
9296                 if (fd == IORING_REGISTER_FILES_SKIP)
9297                         continue;
9298
9299                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
9300                 file_slot = io_fixed_file_slot(&ctx->file_table, i);
9301
9302                 if (file_slot->file_ptr) {
9303                         file = (struct file *)(file_slot->file_ptr & FFS_MASK);
9304                         err = io_queue_rsrc_removal(data, up->offset + done,
9305                                                     ctx->rsrc_node, file);
9306                         if (err)
9307                                 break;
9308                         file_slot->file_ptr = 0;
9309                         needs_switch = true;
9310                 }
9311                 if (fd != -1) {
9312                         file = fget(fd);
9313                         if (!file) {
9314                                 err = -EBADF;
9315                                 break;
9316                         }
9317                         /*
9318                          * Don't allow io_uring instances to be registered. If
9319                          * UNIX isn't enabled, then this causes a reference
9320                          * cycle and this instance can never get freed. If UNIX
9321                          * is enabled we'll handle it just fine, but there's
9322                          * still no point in allowing a ring fd as it doesn't
9323                          * support regular read/write anyway.
9324                          */
9325                         if (file->f_op == &io_uring_fops) {
9326                                 fput(file);
9327                                 err = -EBADF;
9328                                 break;
9329                         }
9330                         *io_get_tag_slot(data, up->offset + done) = tag;
9331                         io_fixed_file_set(file_slot, file);
9332                         err = io_sqe_file_register(ctx, file, i);
9333                         if (err) {
9334                                 file_slot->file_ptr = 0;
9335                                 fput(file);
9336                                 break;
9337                         }
9338                 }
9339         }
9340
9341         if (needs_switch)
9342                 io_rsrc_node_switch(ctx, data);
9343         return done ? done : err;
9344 }
9345
9346 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
9347                                         struct task_struct *task)
9348 {
9349         struct io_wq_hash *hash;
9350         struct io_wq_data data;
9351         unsigned int concurrency;
9352
9353         mutex_lock(&ctx->uring_lock);
9354         hash = ctx->hash_map;
9355         if (!hash) {
9356                 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
9357                 if (!hash) {
9358                         mutex_unlock(&ctx->uring_lock);
9359                         return ERR_PTR(-ENOMEM);
9360                 }
9361                 refcount_set(&hash->refs, 1);
9362                 init_waitqueue_head(&hash->wait);
9363                 ctx->hash_map = hash;
9364         }
9365         mutex_unlock(&ctx->uring_lock);
9366
9367         data.hash = hash;
9368         data.task = task;
9369         data.free_work = io_wq_free_work;
9370         data.do_work = io_wq_submit_work;
9371
9372         /* Do QD, or 4 * CPUS, whatever is smallest */
9373         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
9374
9375         return io_wq_create(concurrency, &data);
9376 }
9377
9378 static __cold int io_uring_alloc_task_context(struct task_struct *task,
9379                                               struct io_ring_ctx *ctx)
9380 {
9381         struct io_uring_task *tctx;
9382         int ret;
9383
9384         tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
9385         if (unlikely(!tctx))
9386                 return -ENOMEM;
9387
9388         tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
9389                                          sizeof(struct file *), GFP_KERNEL);
9390         if (unlikely(!tctx->registered_rings)) {
9391                 kfree(tctx);
9392                 return -ENOMEM;
9393         }
9394
9395         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
9396         if (unlikely(ret)) {
9397                 kfree(tctx->registered_rings);
9398                 kfree(tctx);
9399                 return ret;
9400         }
9401
9402         tctx->io_wq = io_init_wq_offload(ctx, task);
9403         if (IS_ERR(tctx->io_wq)) {
9404                 ret = PTR_ERR(tctx->io_wq);
9405                 percpu_counter_destroy(&tctx->inflight);
9406                 kfree(tctx->registered_rings);
9407                 kfree(tctx);
9408                 return ret;
9409         }
9410
9411         xa_init(&tctx->xa);
9412         init_waitqueue_head(&tctx->wait);
9413         atomic_set(&tctx->in_idle, 0);
9414         atomic_set(&tctx->inflight_tracked, 0);
9415         task->io_uring = tctx;
9416         spin_lock_init(&tctx->task_lock);
9417         INIT_WQ_LIST(&tctx->task_list);
9418         INIT_WQ_LIST(&tctx->prior_task_list);
9419         init_task_work(&tctx->task_work, tctx_task_work);
9420         return 0;
9421 }
9422
9423 void __io_uring_free(struct task_struct *tsk)
9424 {
9425         struct io_uring_task *tctx = tsk->io_uring;
9426
9427         WARN_ON_ONCE(!xa_empty(&tctx->xa));
9428         WARN_ON_ONCE(tctx->io_wq);
9429         WARN_ON_ONCE(tctx->cached_refs);
9430
9431         kfree(tctx->registered_rings);
9432         percpu_counter_destroy(&tctx->inflight);
9433         kfree(tctx);
9434         tsk->io_uring = NULL;
9435 }
9436
9437 static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
9438                                        struct io_uring_params *p)
9439 {
9440         int ret;
9441
9442         /* Retain compatibility with failing for an invalid attach attempt */
9443         if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
9444                                 IORING_SETUP_ATTACH_WQ) {
9445                 struct fd f;
9446
9447                 f = fdget(p->wq_fd);
9448                 if (!f.file)
9449                         return -ENXIO;
9450                 if (f.file->f_op != &io_uring_fops) {
9451                         fdput(f);
9452                         return -EINVAL;
9453                 }
9454                 fdput(f);
9455         }
9456         if (ctx->flags & IORING_SETUP_SQPOLL) {
9457                 struct task_struct *tsk;
9458                 struct io_sq_data *sqd;
9459                 bool attached;
9460
9461                 ret = security_uring_sqpoll();
9462                 if (ret)
9463                         return ret;
9464
9465                 sqd = io_get_sq_data(p, &attached);
9466                 if (IS_ERR(sqd)) {
9467                         ret = PTR_ERR(sqd);
9468                         goto err;
9469                 }
9470
9471                 ctx->sq_creds = get_current_cred();
9472                 ctx->sq_data = sqd;
9473                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
9474                 if (!ctx->sq_thread_idle)
9475                         ctx->sq_thread_idle = HZ;
9476
9477                 io_sq_thread_park(sqd);
9478                 list_add(&ctx->sqd_list, &sqd->ctx_list);
9479                 io_sqd_update_thread_idle(sqd);
9480                 /* don't attach to a dying SQPOLL thread, would be racy */
9481                 ret = (attached && !sqd->thread) ? -ENXIO : 0;
9482                 io_sq_thread_unpark(sqd);
9483
9484                 if (ret < 0)
9485                         goto err;
9486                 if (attached)
9487                         return 0;
9488
9489                 if (p->flags & IORING_SETUP_SQ_AFF) {
9490                         int cpu = p->sq_thread_cpu;
9491
9492                         ret = -EINVAL;
9493                         if (cpu >= nr_cpu_ids || !cpu_online(cpu))
9494                                 goto err_sqpoll;
9495                         sqd->sq_cpu = cpu;
9496                 } else {
9497                         sqd->sq_cpu = -1;
9498                 }
9499
9500                 sqd->task_pid = current->pid;
9501                 sqd->task_tgid = current->tgid;
9502                 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
9503                 if (IS_ERR(tsk)) {
9504                         ret = PTR_ERR(tsk);
9505                         goto err_sqpoll;
9506                 }
9507
9508                 sqd->thread = tsk;
9509                 ret = io_uring_alloc_task_context(tsk, ctx);
9510                 wake_up_new_task(tsk);
9511                 if (ret)
9512                         goto err;
9513         } else if (p->flags & IORING_SETUP_SQ_AFF) {
9514                 /* Can't have SQ_AFF without SQPOLL */
9515                 ret = -EINVAL;
9516                 goto err;
9517         }
9518
9519         return 0;
9520 err_sqpoll:
9521         complete(&ctx->sq_data->exited);
9522 err:
9523         io_sq_thread_finish(ctx);
9524         return ret;
9525 }
9526
9527 static inline void __io_unaccount_mem(struct user_struct *user,
9528                                       unsigned long nr_pages)
9529 {
9530         atomic_long_sub(nr_pages, &user->locked_vm);
9531 }
9532
9533 static inline int __io_account_mem(struct user_struct *user,
9534                                    unsigned long nr_pages)
9535 {
9536         unsigned long page_limit, cur_pages, new_pages;
9537
9538         /* Don't allow more pages than we can safely lock */
9539         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
9540
9541         do {
9542                 cur_pages = atomic_long_read(&user->locked_vm);
9543                 new_pages = cur_pages + nr_pages;
9544                 if (new_pages > page_limit)
9545                         return -ENOMEM;
9546         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
9547                                         new_pages) != cur_pages);
9548
9549         return 0;
9550 }
9551
9552 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
9553 {
9554         if (ctx->user)
9555                 __io_unaccount_mem(ctx->user, nr_pages);
9556
9557         if (ctx->mm_account)
9558                 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
9559 }
9560
9561 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
9562 {
9563         int ret;
9564
9565         if (ctx->user) {
9566                 ret = __io_account_mem(ctx->user, nr_pages);
9567                 if (ret)
9568                         return ret;
9569         }
9570
9571         if (ctx->mm_account)
9572                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
9573
9574         return 0;
9575 }
9576
/*
 * Drop a reference on the compound page backing @ptr and free the page when
 * that was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	if (ptr) {
		struct page *head = virt_to_head_page(ptr);

		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
9588
9589 static void *io_mem_alloc(size_t size)
9590 {
9591         gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
9592
9593         return (void *) __get_free_pages(gfp, get_order(size));
9594 }
9595
9596 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
9597                                 size_t *sq_offset)
9598 {
9599         struct io_rings *rings;
9600         size_t off, sq_array_size;
9601
9602         off = struct_size(rings, cqes, cq_entries);
9603         if (off == SIZE_MAX)
9604                 return SIZE_MAX;
9605
9606 #ifdef CONFIG_SMP
9607         off = ALIGN(off, SMP_CACHE_BYTES);
9608         if (off == 0)
9609                 return SIZE_MAX;
9610 #endif
9611
9612         if (sq_offset)
9613                 *sq_offset = off;
9614
9615         sq_array_size = array_size(sizeof(u32), sq_entries);
9616         if (sq_array_size == SIZE_MAX)
9617                 return SIZE_MAX;
9618
9619         if (check_add_overflow(off, sq_array_size, &off))
9620                 return SIZE_MAX;
9621
9622         return off;
9623 }
9624
9625 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
9626 {
9627         struct io_mapped_ubuf *imu = *slot;
9628         unsigned int i;
9629
9630         if (imu != ctx->dummy_ubuf) {
9631                 for (i = 0; i < imu->nr_bvecs; i++)
9632                         unpin_user_page(imu->bvec[i].bv_page);
9633                 if (imu->acct_pages)
9634                         io_unaccount_mem(ctx, imu->acct_pages);
9635                 kvfree(imu);
9636         }
9637         *slot = NULL;
9638 }
9639
9640 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
9641 {
9642         io_buffer_unmap(ctx, &prsrc->buf);
9643         prsrc->buf = NULL;
9644 }
9645
9646 static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
9647 {
9648         unsigned int i;
9649
9650         for (i = 0; i < ctx->nr_user_bufs; i++)
9651                 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
9652         kfree(ctx->user_bufs);
9653         io_rsrc_data_free(ctx->buf_data);
9654         ctx->user_bufs = NULL;
9655         ctx->buf_data = NULL;
9656         ctx->nr_user_bufs = 0;
9657 }
9658
9659 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
9660 {
9661         int ret;
9662
9663         if (!ctx->buf_data)
9664                 return -ENXIO;
9665
9666         ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
9667         if (!ret)
9668                 __io_sqe_buffers_unregister(ctx);
9669         return ret;
9670 }
9671
9672 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
9673                        void __user *arg, unsigned index)
9674 {
9675         struct iovec __user *src;
9676
9677 #ifdef CONFIG_COMPAT
9678         if (ctx->compat) {
9679                 struct compat_iovec __user *ciovs;
9680                 struct compat_iovec ciov;
9681
9682                 ciovs = (struct compat_iovec __user *) arg;
9683                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
9684                         return -EFAULT;
9685
9686                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
9687                 dst->iov_len = ciov.iov_len;
9688                 return 0;
9689         }
9690 #endif
9691         src = (struct iovec __user *) arg;
9692         if (copy_from_user(dst, &src[index], sizeof(*dst)))
9693                 return -EFAULT;
9694         return 0;
9695 }
9696
9697 /*
9698  * Not super efficient, but this is just a registration time. And we do cache
9699  * the last compound head, so generally we'll only do a full search if we don't
9700  * match that one.
9701  *
9702  * We check if the given compound head page has already been accounted, to
9703  * avoid double accounting it. This allows us to account the full size of the
9704  * page, not just the constituent pages of a huge page.
9705  */
9706 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
9707                                   int nr_pages, struct page *hpage)
9708 {
9709         int i, j;
9710
9711         /* check current page array */
9712         for (i = 0; i < nr_pages; i++) {
9713                 if (!PageCompound(pages[i]))
9714                         continue;
9715                 if (compound_head(pages[i]) == hpage)
9716                         return true;
9717         }
9718
9719         /* check previously registered pages */
9720         for (i = 0; i < ctx->nr_user_bufs; i++) {
9721                 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
9722
9723                 for (j = 0; j < imu->nr_bvecs; j++) {
9724                         if (!PageCompound(imu->bvec[j].bv_page))
9725                                 continue;
9726                         if (compound_head(imu->bvec[j].bv_page) == hpage)
9727                                 return true;
9728                 }
9729         }
9730
9731         return false;
9732 }
9733
9734 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
9735                                  int nr_pages, struct io_mapped_ubuf *imu,
9736                                  struct page **last_hpage)
9737 {
9738         int i, ret;
9739
9740         imu->acct_pages = 0;
9741         for (i = 0; i < nr_pages; i++) {
9742                 if (!PageCompound(pages[i])) {
9743                         imu->acct_pages++;
9744                 } else {
9745                         struct page *hpage;
9746
9747                         hpage = compound_head(pages[i]);
9748                         if (hpage == *last_hpage)
9749                                 continue;
9750                         *last_hpage = hpage;
9751                         if (headpage_already_acct(ctx, pages, i, hpage))
9752                                 continue;
9753                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
9754                 }
9755         }
9756
9757         if (!imu->acct_pages)
9758                 return 0;
9759
9760         ret = io_account_mem(ctx, imu->acct_pages);
9761         if (ret)
9762                 imu->acct_pages = 0;
9763         return ret;
9764 }
9765
9766 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
9767                                   struct io_mapped_ubuf **pimu,
9768                                   struct page **last_hpage)
9769 {
9770         struct io_mapped_ubuf *imu = NULL;
9771         struct vm_area_struct **vmas = NULL;
9772         struct page **pages = NULL;
9773         unsigned long off, start, end, ubuf;
9774         size_t size;
9775         int ret, pret, nr_pages, i;
9776
9777         if (!iov->iov_base) {
9778                 *pimu = ctx->dummy_ubuf;
9779                 return 0;
9780         }
9781
9782         ubuf = (unsigned long) iov->iov_base;
9783         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
9784         start = ubuf >> PAGE_SHIFT;
9785         nr_pages = end - start;
9786
9787         *pimu = NULL;
9788         ret = -ENOMEM;
9789
9790         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
9791         if (!pages)
9792                 goto done;
9793
9794         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
9795                               GFP_KERNEL);
9796         if (!vmas)
9797                 goto done;
9798
9799         imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
9800         if (!imu)
9801                 goto done;
9802
9803         ret = 0;
9804         mmap_read_lock(current->mm);
9805         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
9806                               pages, vmas);
9807         if (pret == nr_pages) {
9808                 /* don't support file backed memory */
9809                 for (i = 0; i < nr_pages; i++) {
9810                         struct vm_area_struct *vma = vmas[i];
9811
9812                         if (vma_is_shmem(vma))
9813                                 continue;
9814                         if (vma->vm_file &&
9815                             !is_file_hugepages(vma->vm_file)) {
9816                                 ret = -EOPNOTSUPP;
9817                                 break;
9818                         }
9819                 }
9820         } else {
9821                 ret = pret < 0 ? pret : -EFAULT;
9822         }
9823         mmap_read_unlock(current->mm);
9824         if (ret) {
9825                 /*
9826                  * if we did partial map, or found file backed vmas,
9827                  * release any pages we did get
9828                  */
9829                 if (pret > 0)
9830                         unpin_user_pages(pages, pret);
9831                 goto done;
9832         }
9833
9834         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
9835         if (ret) {
9836                 unpin_user_pages(pages, pret);
9837                 goto done;
9838         }
9839
9840         off = ubuf & ~PAGE_MASK;
9841         size = iov->iov_len;
9842         for (i = 0; i < nr_pages; i++) {
9843                 size_t vec_len;
9844
9845                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
9846                 imu->bvec[i].bv_page = pages[i];
9847                 imu->bvec[i].bv_len = vec_len;
9848                 imu->bvec[i].bv_offset = off;
9849                 off = 0;
9850                 size -= vec_len;
9851         }
9852         /* store original address for later verification */
9853         imu->ubuf = ubuf;
9854         imu->ubuf_end = ubuf + iov->iov_len;
9855         imu->nr_bvecs = nr_pages;
9856         *pimu = imu;
9857         ret = 0;
9858 done:
9859         if (ret)
9860                 kvfree(imu);
9861         kvfree(pages);
9862         kvfree(vmas);
9863         return ret;
9864 }
9865
9866 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
9867 {
9868         ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
9869         return ctx->user_bufs ? 0 : -ENOMEM;
9870 }
9871
9872 static int io_buffer_validate(struct iovec *iov)
9873 {
9874         unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
9875
9876         /*
9877          * Don't impose further limits on the size and buffer
9878          * constraints here, we'll -EINVAL later when IO is
9879          * submitted if they are wrong.
9880          */
9881         if (!iov->iov_base)
9882                 return iov->iov_len ? -EFAULT : 0;
9883         if (!iov->iov_len)
9884                 return -EFAULT;
9885
9886         /* arbitrary limit, but we need something */
9887         if (iov->iov_len > SZ_1G)
9888                 return -EFAULT;
9889
9890         if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
9891                 return -EOVERFLOW;
9892
9893         return 0;
9894 }
9895
9896 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
9897                                    unsigned int nr_args, u64 __user *tags)
9898 {
9899         struct page *last_hpage = NULL;
9900         struct io_rsrc_data *data;
9901         int i, ret;
9902         struct iovec iov;
9903
9904         if (ctx->user_bufs)
9905                 return -EBUSY;
9906         if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
9907                 return -EINVAL;
9908         ret = io_rsrc_node_switch_start(ctx);
9909         if (ret)
9910                 return ret;
9911         ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9912         if (ret)
9913                 return ret;
9914         ret = io_buffers_map_alloc(ctx, nr_args);
9915         if (ret) {
9916                 io_rsrc_data_free(data);
9917                 return ret;
9918         }
9919
9920         for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
9921                 ret = io_copy_iov(ctx, &iov, arg, i);
9922                 if (ret)
9923                         break;
9924                 ret = io_buffer_validate(&iov);
9925                 if (ret)
9926                         break;
9927                 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
9928                         ret = -EINVAL;
9929                         break;
9930                 }
9931
9932                 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9933                                              &last_hpage);
9934                 if (ret)
9935                         break;
9936         }
9937
9938         WARN_ON_ONCE(ctx->buf_data);
9939
9940         ctx->buf_data = data;
9941         if (ret)
9942                 __io_sqe_buffers_unregister(ctx);
9943         else
9944                 io_rsrc_node_switch(ctx, NULL);
9945         return ret;
9946 }
9947
9948 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9949                                    struct io_uring_rsrc_update2 *up,
9950                                    unsigned int nr_args)
9951 {
9952         u64 __user *tags = u64_to_user_ptr(up->tags);
9953         struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
9954         struct page *last_hpage = NULL;
9955         bool needs_switch = false;
9956         __u32 done;
9957         int i, err;
9958
9959         if (!ctx->buf_data)
9960                 return -ENXIO;
9961         if (up->offset + nr_args > ctx->nr_user_bufs)
9962                 return -EINVAL;
9963
9964         for (done = 0; done < nr_args; done++) {
9965                 struct io_mapped_ubuf *imu;
9966                 int offset = up->offset + done;
9967                 u64 tag = 0;
9968
9969                 err = io_copy_iov(ctx, &iov, iovs, done);
9970                 if (err)
9971                         break;
9972                 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9973                         err = -EFAULT;
9974                         break;
9975                 }
9976                 err = io_buffer_validate(&iov);
9977                 if (err)
9978                         break;
9979                 if (!iov.iov_base && tag) {
9980                         err = -EINVAL;
9981                         break;
9982                 }
9983                 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9984                 if (err)
9985                         break;
9986
9987                 i = array_index_nospec(offset, ctx->nr_user_bufs);
9988                 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
9989                         err = io_queue_rsrc_removal(ctx->buf_data, offset,
9990                                                     ctx->rsrc_node, ctx->user_bufs[i]);
9991                         if (unlikely(err)) {
9992                                 io_buffer_unmap(ctx, &imu);
9993                                 break;
9994                         }
9995                         ctx->user_bufs[i] = NULL;
9996                         needs_switch = true;
9997                 }
9998
9999                 ctx->user_bufs[i] = imu;
10000                 *io_get_tag_slot(ctx->buf_data, offset) = tag;
10001         }
10002
10003         if (needs_switch)
10004                 io_rsrc_node_switch(ctx, ctx->buf_data);
10005         return done ? done : err;
10006 }
10007
10008 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
10009                                unsigned int eventfd_async)
10010 {
10011         struct io_ev_fd *ev_fd;
10012         __s32 __user *fds = arg;
10013         int fd;
10014
10015         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
10016                                         lockdep_is_held(&ctx->uring_lock));
10017         if (ev_fd)
10018                 return -EBUSY;
10019
10020         if (copy_from_user(&fd, fds, sizeof(*fds)))
10021                 return -EFAULT;
10022
10023         ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
10024         if (!ev_fd)
10025                 return -ENOMEM;
10026
10027         ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
10028         if (IS_ERR(ev_fd->cq_ev_fd)) {
10029                 int ret = PTR_ERR(ev_fd->cq_ev_fd);
10030                 kfree(ev_fd);
10031                 return ret;
10032         }
10033         ev_fd->eventfd_async = eventfd_async;
10034         ctx->has_evfd = true;
10035         rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
10036         return 0;
10037 }
10038
10039 static void io_eventfd_put(struct rcu_head *rcu)
10040 {
10041         struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
10042
10043         eventfd_ctx_put(ev_fd->cq_ev_fd);
10044         kfree(ev_fd);
10045 }
10046
10047 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
10048 {
10049         struct io_ev_fd *ev_fd;
10050
10051         ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
10052                                         lockdep_is_held(&ctx->uring_lock));
10053         if (ev_fd) {
10054                 ctx->has_evfd = false;
10055                 rcu_assign_pointer(ctx->io_ev_fd, NULL);
10056                 call_rcu(&ev_fd->rcu, io_eventfd_put);
10057                 return 0;
10058         }
10059
10060         return -ENXIO;
10061 }
10062
10063 static void io_destroy_buffers(struct io_ring_ctx *ctx)
10064 {
10065         int i;
10066
10067         for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
10068                 struct list_head *list = &ctx->io_buffers[i];
10069
10070                 while (!list_empty(list)) {
10071                         struct io_buffer_list *bl;
10072
10073                         bl = list_first_entry(list, struct io_buffer_list, list);
10074                         __io_remove_buffers(ctx, bl, -1U);
10075                         list_del(&bl->list);
10076                         kfree(bl);
10077                 }
10078         }
10079
10080         while (!list_empty(&ctx->io_buffers_pages)) {
10081                 struct page *page;
10082
10083                 page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
10084                 list_del_init(&page->lru);
10085                 __free_page(page);
10086         }
10087 }
10088
10089 static void io_req_caches_free(struct io_ring_ctx *ctx)
10090 {
10091         struct io_submit_state *state = &ctx->submit_state;
10092         int nr = 0;
10093
10094         mutex_lock(&ctx->uring_lock);
10095         io_flush_cached_locked_reqs(ctx, state);
10096
10097         while (state->free_list.next) {
10098                 struct io_wq_work_node *node;
10099                 struct io_kiocb *req;
10100
10101                 node = wq_stack_extract(&state->free_list);
10102                 req = container_of(node, struct io_kiocb, comp_list);
10103                 kmem_cache_free(req_cachep, req);
10104                 nr++;
10105         }
10106         if (nr)
10107                 percpu_ref_put_many(&ctx->refs, nr);
10108         mutex_unlock(&ctx->uring_lock);
10109 }
10110
10111 static void io_wait_rsrc_data(struct io_rsrc_data *data)
10112 {
10113         if (data && !atomic_dec_and_test(&data->refs))
10114                 wait_for_completion(&data->done);
10115 }
10116
10117 static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
10118 {
10119         struct async_poll *apoll;
10120
10121         while (!list_empty(&ctx->apoll_cache)) {
10122                 apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
10123                                                 poll.wait.entry);
10124                 list_del(&apoll->poll.wait.entry);
10125                 kfree(apoll);
10126         }
10127 }
10128
10129 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
10130 {
10131         io_sq_thread_finish(ctx);
10132
10133         if (ctx->mm_account) {
10134                 mmdrop(ctx->mm_account);
10135                 ctx->mm_account = NULL;
10136         }
10137
10138         io_rsrc_refs_drop(ctx);
10139         /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
10140         io_wait_rsrc_data(ctx->buf_data);
10141         io_wait_rsrc_data(ctx->file_data);
10142
10143         mutex_lock(&ctx->uring_lock);
10144         if (ctx->buf_data)
10145                 __io_sqe_buffers_unregister(ctx);
10146         if (ctx->file_data)
10147                 __io_sqe_files_unregister(ctx);
10148         if (ctx->rings)
10149                 __io_cqring_overflow_flush(ctx, true);
10150         io_eventfd_unregister(ctx);
10151         io_flush_apoll_cache(ctx);
10152         mutex_unlock(&ctx->uring_lock);
10153         io_destroy_buffers(ctx);
10154         if (ctx->sq_creds)
10155                 put_cred(ctx->sq_creds);
10156
10157         /* there are no registered resources left, nobody uses it */
10158         if (ctx->rsrc_node)
10159                 io_rsrc_node_destroy(ctx->rsrc_node);
10160         if (ctx->rsrc_backup_node)
10161                 io_rsrc_node_destroy(ctx->rsrc_backup_node);
10162         flush_delayed_work(&ctx->rsrc_put_work);
10163         flush_delayed_work(&ctx->fallback_work);
10164
10165         WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
10166         WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
10167
10168 #if defined(CONFIG_UNIX)
10169         if (ctx->ring_sock) {
10170                 ctx->ring_sock->file = NULL; /* so that iput() is called */
10171                 sock_release(ctx->ring_sock);
10172         }
10173 #endif
10174         WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
10175
10176         io_mem_free(ctx->rings);
10177         io_mem_free(ctx->sq_sqes);
10178
10179         percpu_ref_exit(&ctx->refs);
10180         free_uid(ctx->user);
10181         io_req_caches_free(ctx);
10182         if (ctx->hash_map)
10183                 io_wq_put_hash(ctx->hash_map);
10184         io_free_napi_list(ctx);
10185         kfree(ctx->cancel_hash);
10186         kfree(ctx->dummy_ubuf);
10187         kfree(ctx->io_buffers);
10188         kfree(ctx);
10189 }
10190
10191 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
10192 {
10193         struct io_ring_ctx *ctx = file->private_data;
10194         __poll_t mask = 0;
10195
10196         poll_wait(file, &ctx->cq_wait, wait);
10197         /*
10198          * synchronizes with barrier from wq_has_sleeper call in
10199          * io_commit_cqring
10200          */
10201         smp_rmb();
10202         if (!io_sqring_full(ctx))
10203                 mask |= EPOLLOUT | EPOLLWRNORM;
10204
10205         /*
10206          * Don't flush cqring overflow list here, just do a simple check.
10207          * Otherwise there could possible be ABBA deadlock:
10208          *      CPU0                    CPU1
10209          *      ----                    ----
10210          * lock(&ctx->uring_lock);
10211          *                              lock(&ep->mtx);
10212          *                              lock(&ctx->uring_lock);
10213          * lock(&ep->mtx);
10214          *
10215          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
10216          * pushs them to do the flush.
10217          */
10218         if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
10219                 mask |= EPOLLIN | EPOLLRDNORM;
10220
10221         return mask;
10222 }
10223
10224 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
10225 {
10226         const struct cred *creds;
10227
10228         creds = xa_erase(&ctx->personalities, id);
10229         if (creds) {
10230                 put_cred(creds);
10231                 return 0;
10232         }
10233
10234         return -EINVAL;
10235 }
10236
10237 struct io_tctx_exit {
10238         struct callback_head            task_work;
10239         struct completion               completion;
10240         struct io_ring_ctx              *ctx;
10241 };
10242
10243 static __cold void io_tctx_exit_cb(struct callback_head *cb)
10244 {
10245         struct io_uring_task *tctx = current->io_uring;
10246         struct io_tctx_exit *work;
10247
10248         work = container_of(cb, struct io_tctx_exit, task_work);
10249         /*
10250          * When @in_idle, we're in cancellation and it's racy to remove the
10251          * node. It'll be removed by the end of cancellation, just ignore it.
10252          */
10253         if (!atomic_read(&tctx->in_idle))
10254                 io_uring_del_tctx_node((unsigned long)work->ctx);
10255         complete(&work->completion);
10256 }
10257
10258 static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
10259 {
10260         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
10261
10262         return req->ctx == data;
10263 }
10264
10265 static __cold void io_ring_exit_work(struct work_struct *work)
10266 {
10267         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
10268         unsigned long timeout = jiffies + HZ * 60 * 5;
10269         unsigned long interval = HZ / 20;
10270         struct io_tctx_exit exit;
10271         struct io_tctx_node *node;
10272         int ret;
10273
10274         /*
10275          * If we're doing polled IO and end up having requests being
10276          * submitted async (out-of-line), then completions can come in while
10277          * we're waiting for refs to drop. We need to reap these manually,
10278          * as nobody else will be looking for them.
10279          */
10280         do {
10281                 io_uring_try_cancel_requests(ctx, NULL, true);
10282                 if (ctx->sq_data) {
10283                         struct io_sq_data *sqd = ctx->sq_data;
10284                         struct task_struct *tsk;
10285
10286                         io_sq_thread_park(sqd);
10287                         tsk = sqd->thread;
10288                         if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
10289                                 io_wq_cancel_cb(tsk->io_uring->io_wq,
10290                                                 io_cancel_ctx_cb, ctx, true);
10291                         io_sq_thread_unpark(sqd);
10292                 }
10293
10294                 io_req_caches_free(ctx);
10295
10296                 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
10297                         /* there is little hope left, don't run it too often */
10298                         interval = HZ * 60;
10299                 }
10300         } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
10301
10302         init_completion(&exit.completion);
10303         init_task_work(&exit.task_work, io_tctx_exit_cb);
10304         exit.ctx = ctx;
10305         /*
10306          * Some may use context even when all refs and requests have been put,
10307          * and they are free to do so while still holding uring_lock or
10308          * completion_lock, see io_req_task_submit(). Apart from other work,
10309          * this lock/unlock section also waits them to finish.
10310          */
10311         mutex_lock(&ctx->uring_lock);
10312         while (!list_empty(&ctx->tctx_list)) {
10313                 WARN_ON_ONCE(time_after(jiffies, timeout));
10314
10315                 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
10316                                         ctx_node);
10317                 /* don't spin on a single task if cancellation failed */
10318                 list_rotate_left(&ctx->tctx_list);
10319                 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
10320                 if (WARN_ON_ONCE(ret))
10321                         continue;
10322
10323                 mutex_unlock(&ctx->uring_lock);
10324                 wait_for_completion(&exit.completion);
10325                 mutex_lock(&ctx->uring_lock);
10326         }
10327         mutex_unlock(&ctx->uring_lock);
10328         spin_lock(&ctx->completion_lock);
10329         spin_unlock(&ctx->completion_lock);
10330
10331         io_ring_ctx_free(ctx);
10332 }
10333
10334 /* Returns true if we found and killed one or more timeouts */
10335 static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
10336                                     struct task_struct *tsk, bool cancel_all)
10337 {
10338         struct io_kiocb *req, *tmp;
10339         int canceled = 0;
10340
10341         spin_lock(&ctx->completion_lock);
10342         spin_lock_irq(&ctx->timeout_lock);
10343         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
10344                 if (io_match_task(req, tsk, cancel_all)) {
10345                         io_kill_timeout(req, -ECANCELED);
10346                         canceled++;
10347                 }
10348         }
10349         spin_unlock_irq(&ctx->timeout_lock);
10350         if (canceled != 0)
10351                 io_commit_cqring(ctx);
10352         spin_unlock(&ctx->completion_lock);
10353         if (canceled != 0)
10354                 io_cqring_ev_posted(ctx);
10355         return canceled != 0;
10356 }
10357
10358 static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
10359 {
10360         unsigned long index;
10361         struct creds *creds;
10362
10363         mutex_lock(&ctx->uring_lock);
10364         percpu_ref_kill(&ctx->refs);
10365         if (ctx->rings)
10366                 __io_cqring_overflow_flush(ctx, true);
10367         xa_for_each(&ctx->personalities, index, creds)
10368                 io_unregister_personality(ctx, index);
10369         mutex_unlock(&ctx->uring_lock);
10370
10371         io_kill_timeouts(ctx, NULL, true);
10372         io_poll_remove_all(ctx, NULL, true);
10373
10374         /* if we failed setting up the ctx, we might not have any rings */
10375         io_iopoll_try_reap_events(ctx);
10376
10377         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
10378         /*
10379          * Use system_unbound_wq to avoid spawning tons of event kworkers
10380          * if we're exiting a ton of rings at the same time. It just adds
10381          * noise and overhead, there's no discernable change in runtime
10382          * over using system_wq.
10383          */
10384         queue_work(system_unbound_wq, &ctx->exit_work);
10385 }
10386
10387 static int io_uring_release(struct inode *inode, struct file *file)
10388 {
10389         struct io_ring_ctx *ctx = file->private_data;
10390
10391         file->private_data = NULL;
10392         io_ring_ctx_wait_and_kill(ctx);
10393         return 0;
10394 }
10395
10396 struct io_task_cancel {
10397         struct task_struct *task;
10398         bool all;
10399 };
10400
10401 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
10402 {
10403         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
10404         struct io_task_cancel *cancel = data;
10405
10406         return io_match_task_safe(req, cancel->task, cancel->all);
10407 }
10408
10409 static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
10410                                          struct task_struct *task,
10411                                          bool cancel_all)
10412 {
10413         struct io_defer_entry *de;
10414         LIST_HEAD(list);
10415
10416         spin_lock(&ctx->completion_lock);
10417         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
10418                 if (io_match_task_safe(de->req, task, cancel_all)) {
10419                         list_cut_position(&list, &ctx->defer_list, &de->list);
10420                         break;
10421                 }
10422         }
10423         spin_unlock(&ctx->completion_lock);
10424         if (list_empty(&list))
10425                 return false;
10426
10427         while (!list_empty(&list)) {
10428                 de = list_first_entry(&list, struct io_defer_entry, list);
10429                 list_del_init(&de->list);
10430                 io_req_complete_failed(de->req, -ECANCELED);
10431                 kfree(de);
10432         }
10433         return true;
10434 }
10435
10436 static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
10437 {
10438         struct io_tctx_node *node;
10439         enum io_wq_cancel cret;
10440         bool ret = false;
10441
10442         mutex_lock(&ctx->uring_lock);
10443         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
10444                 struct io_uring_task *tctx = node->task->io_uring;
10445
10446                 /*
10447                  * io_wq will stay alive while we hold uring_lock, because it's
10448                  * killed after ctx nodes, which requires to take the lock.
10449                  */
10450                 if (!tctx || !tctx->io_wq)
10451                         continue;
10452                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
10453                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
10454         }
10455         mutex_unlock(&ctx->uring_lock);
10456
10457         return ret;
10458 }
10459
10460 static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
10461                                                 struct task_struct *task,
10462                                                 bool cancel_all)
10463 {
10464         struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
10465         struct io_uring_task *tctx = task ? task->io_uring : NULL;
10466
10467         while (1) {
10468                 enum io_wq_cancel cret;
10469                 bool ret = false;
10470
10471                 if (!task) {
10472                         ret |= io_uring_try_cancel_iowq(ctx);
10473                 } else if (tctx && tctx->io_wq) {
10474                         /*
10475                          * Cancels requests of all rings, not only @ctx, but
10476                          * it's fine as the task is in exit/exec.
10477                          */
10478                         cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
10479                                                &cancel, true);
10480                         ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
10481                 }
10482
10483                 /* SQPOLL thread does its own polling */
10484                 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
10485                     (ctx->sq_data && ctx->sq_data->thread == current)) {
10486                         while (!wq_list_empty(&ctx->iopoll_list)) {
10487                                 io_iopoll_try_reap_events(ctx);
10488                                 ret = true;
10489                         }
10490                 }
10491
10492                 ret |= io_cancel_defer_files(ctx, task, cancel_all);
10493                 ret |= io_poll_remove_all(ctx, task, cancel_all);
10494                 ret |= io_kill_timeouts(ctx, task, cancel_all);
10495                 if (task)
10496                         ret |= io_run_task_work();
10497                 if (!ret)
10498                         break;
10499                 cond_resched();
10500         }
10501 }
10502
10503 static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
10504 {
10505         struct io_uring_task *tctx = current->io_uring;
10506         struct io_tctx_node *node;
10507         int ret;
10508
10509         if (unlikely(!tctx)) {
10510                 ret = io_uring_alloc_task_context(current, ctx);
10511                 if (unlikely(ret))
10512                         return ret;
10513
10514                 tctx = current->io_uring;
10515                 if (ctx->iowq_limits_set) {
10516                         unsigned int limits[2] = { ctx->iowq_limits[0],
10517                                                    ctx->iowq_limits[1], };
10518
10519                         ret = io_wq_max_workers(tctx->io_wq, limits);
10520                         if (ret)
10521                                 return ret;
10522                 }
10523         }
10524         if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
10525                 node = kmalloc(sizeof(*node), GFP_KERNEL);
10526                 if (!node)
10527                         return -ENOMEM;
10528                 node->ctx = ctx;
10529                 node->task = current;
10530
10531                 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
10532                                         node, GFP_KERNEL));
10533                 if (ret) {
10534                         kfree(node);
10535                         return ret;
10536                 }
10537
10538                 mutex_lock(&ctx->uring_lock);
10539                 list_add(&node->ctx_node, &ctx->tctx_list);
10540                 mutex_unlock(&ctx->uring_lock);
10541         }
10542         tctx->last = ctx;
10543         return 0;
10544 }
10545
10546 /*
10547  * Note that this task has used io_uring. We use it for cancelation purposes.
10548  */
10549 static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
10550 {
10551         struct io_uring_task *tctx = current->io_uring;
10552
10553         if (likely(tctx && tctx->last == ctx))
10554                 return 0;
10555         return __io_uring_add_tctx_node(ctx);
10556 }
10557
10558 /*
10559  * Remove this io_uring_file -> task mapping.
10560  */
10561 static __cold void io_uring_del_tctx_node(unsigned long index)
10562 {
10563         struct io_uring_task *tctx = current->io_uring;
10564         struct io_tctx_node *node;
10565
10566         if (!tctx)
10567                 return;
10568         node = xa_erase(&tctx->xa, index);
10569         if (!node)
10570                 return;
10571
10572         WARN_ON_ONCE(current != node->task);
10573         WARN_ON_ONCE(list_empty(&node->ctx_node));
10574
10575         mutex_lock(&node->ctx->uring_lock);
10576         list_del(&node->ctx_node);
10577         mutex_unlock(&node->ctx->uring_lock);
10578
10579         if (tctx->last == node->ctx)
10580                 tctx->last = NULL;
10581         kfree(node);
10582 }
10583
10584 static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
10585 {
10586         struct io_wq *wq = tctx->io_wq;
10587         struct io_tctx_node *node;
10588         unsigned long index;
10589
10590         xa_for_each(&tctx->xa, index, node) {
10591                 io_uring_del_tctx_node(index);
10592                 cond_resched();
10593         }
10594         if (wq) {
10595                 /*
10596                  * Must be after io_uring_del_tctx_node() (removes nodes under
10597                  * uring_lock) to avoid race with io_uring_try_cancel_iowq().
10598                  */
10599                 io_wq_put_and_exit(wq);
10600                 tctx->io_wq = NULL;
10601         }
10602 }
10603
10604 static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
10605 {
10606         if (tracked)
10607                 return atomic_read(&tctx->inflight_tracked);
10608         return percpu_counter_sum(&tctx->inflight);
10609 }
10610
10611 /*
10612  * Find any io_uring ctx that this task has registered or done IO on, and cancel
10613  * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
10614  */
10615 static __cold void io_uring_cancel_generic(bool cancel_all,
10616                                            struct io_sq_data *sqd)
10617 {
10618         struct io_uring_task *tctx = current->io_uring;
10619         struct io_ring_ctx *ctx;
10620         s64 inflight;
10621         DEFINE_WAIT(wait);
10622
10623         WARN_ON_ONCE(sqd && sqd->thread != current);
10624
10625         if (!current->io_uring)
10626                 return;
10627         if (tctx->io_wq)
10628                 io_wq_exit_start(tctx->io_wq);
10629
10630         atomic_inc(&tctx->in_idle);
10631         do {
10632                 io_uring_drop_tctx_refs(current);
10633                 /* read completions before cancelations */
10634                 inflight = tctx_inflight(tctx, !cancel_all);
10635                 if (!inflight)
10636                         break;
10637
10638                 if (!sqd) {
10639                         struct io_tctx_node *node;
10640                         unsigned long index;
10641
10642                         xa_for_each(&tctx->xa, index, node) {
10643                                 /* sqpoll task will cancel all its requests */
10644                                 if (node->ctx->sq_data)
10645                                         continue;
10646                                 io_uring_try_cancel_requests(node->ctx, current,
10647                                                              cancel_all);
10648                         }
10649                 } else {
10650                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
10651                                 io_uring_try_cancel_requests(ctx, current,
10652                                                              cancel_all);
10653                 }
10654
10655                 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
10656                 io_run_task_work();
10657                 io_uring_drop_tctx_refs(current);
10658
10659                 /*
10660                  * If we've seen completions, retry without waiting. This
10661                  * avoids a race where a completion comes in before we did
10662                  * prepare_to_wait().
10663                  */
10664                 if (inflight == tctx_inflight(tctx, !cancel_all))
10665                         schedule();
10666                 finish_wait(&tctx->wait, &wait);
10667         } while (1);
10668
10669         io_uring_clean_tctx(tctx);
10670         if (cancel_all) {
10671                 /*
10672                  * We shouldn't run task_works after cancel, so just leave
10673                  * ->in_idle set for normal exit.
10674                  */
10675                 atomic_dec(&tctx->in_idle);
10676                 /* for exec all current's requests should be gone, kill tctx */
10677                 __io_uring_free(current);
10678         }
10679 }
10680
10681 void __io_uring_cancel(bool cancel_all)
10682 {
10683         io_uring_cancel_generic(cancel_all, NULL);
10684 }
10685
10686 void io_uring_unreg_ringfd(void)
10687 {
10688         struct io_uring_task *tctx = current->io_uring;
10689         int i;
10690
10691         for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
10692                 if (tctx->registered_rings[i]) {
10693                         fput(tctx->registered_rings[i]);
10694                         tctx->registered_rings[i] = NULL;
10695                 }
10696         }
10697 }
10698
10699 static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
10700                                      int start, int end)
10701 {
10702         struct file *file;
10703         int offset;
10704
10705         for (offset = start; offset < end; offset++) {
10706                 offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
10707                 if (tctx->registered_rings[offset])
10708                         continue;
10709
10710                 file = fget(fd);
10711                 if (!file) {
10712                         return -EBADF;
10713                 } else if (file->f_op != &io_uring_fops) {
10714                         fput(file);
10715                         return -EOPNOTSUPP;
10716                 }
10717                 tctx->registered_rings[offset] = file;
10718                 return offset;
10719         }
10720
10721         return -EBUSY;
10722 }
10723
10724 /*
10725  * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
10726  * invocation. User passes in an array of struct io_uring_rsrc_update
10727  * with ->data set to the ring_fd, and ->offset given for the desired
10728  * index. If no index is desired, application may set ->offset == -1U
10729  * and we'll find an available index. Returns number of entries
10730  * successfully processed, or < 0 on error if none were processed.
10731  */
10732 static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
10733                               unsigned nr_args)
10734 {
10735         struct io_uring_rsrc_update __user *arg = __arg;
10736         struct io_uring_rsrc_update reg;
10737         struct io_uring_task *tctx;
10738         int ret, i;
10739
10740         if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
10741                 return -EINVAL;
10742
10743         mutex_unlock(&ctx->uring_lock);
10744         ret = io_uring_add_tctx_node(ctx);
10745         mutex_lock(&ctx->uring_lock);
10746         if (ret)
10747                 return ret;
10748
10749         tctx = current->io_uring;
10750         for (i = 0; i < nr_args; i++) {
10751                 int start, end;
10752
10753                 if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
10754                         ret = -EFAULT;
10755                         break;
10756                 }
10757
10758                 if (reg.offset == -1U) {
10759                         start = 0;
10760                         end = IO_RINGFD_REG_MAX;
10761                 } else {
10762                         if (reg.offset >= IO_RINGFD_REG_MAX) {
10763                                 ret = -EINVAL;
10764                                 break;
10765                         }
10766                         start = reg.offset;
10767                         end = start + 1;
10768                 }
10769
10770                 ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
10771                 if (ret < 0)
10772                         break;
10773
10774                 reg.offset = ret;
10775                 if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
10776                         fput(tctx->registered_rings[reg.offset]);
10777                         tctx->registered_rings[reg.offset] = NULL;
10778                         ret = -EFAULT;
10779                         break;
10780                 }
10781         }
10782
10783         return i ? i : ret;
10784 }
10785
10786 static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
10787                                 unsigned nr_args)
10788 {
10789         struct io_uring_rsrc_update __user *arg = __arg;
10790         struct io_uring_task *tctx = current->io_uring;
10791         struct io_uring_rsrc_update reg;
10792         int ret = 0, i;
10793
10794         if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
10795                 return -EINVAL;
10796         if (!tctx)
10797                 return 0;
10798
10799         for (i = 0; i < nr_args; i++) {
10800                 if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
10801                         ret = -EFAULT;
10802                         break;
10803                 }
10804                 if (reg.offset >= IO_RINGFD_REG_MAX) {
10805                         ret = -EINVAL;
10806                         break;
10807                 }
10808
10809                 reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
10810                 if (tctx->registered_rings[reg.offset]) {
10811                         fput(tctx->registered_rings[reg.offset]);
10812                         tctx->registered_rings[reg.offset] = NULL;
10813                 }
10814         }
10815
10816         return i ? i : ret;
10817 }
10818
10819 static void *io_uring_validate_mmap_request(struct file *file,
10820                                             loff_t pgoff, size_t sz)
10821 {
10822         struct io_ring_ctx *ctx = file->private_data;
10823         loff_t offset = pgoff << PAGE_SHIFT;
10824         struct page *page;
10825         void *ptr;
10826
10827         switch (offset) {
10828         case IORING_OFF_SQ_RING:
10829         case IORING_OFF_CQ_RING:
10830                 ptr = ctx->rings;
10831                 break;
10832         case IORING_OFF_SQES:
10833                 ptr = ctx->sq_sqes;
10834                 break;
10835         default:
10836                 return ERR_PTR(-EINVAL);
10837         }
10838
10839         page = virt_to_head_page(ptr);
10840         if (sz > page_size(page))
10841                 return ERR_PTR(-EINVAL);
10842
10843         return ptr;
10844 }
10845
10846 #ifdef CONFIG_MMU
10847
10848 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
10849 {
10850         size_t sz = vma->vm_end - vma->vm_start;
10851         unsigned long pfn;
10852         void *ptr;
10853
10854         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
10855         if (IS_ERR(ptr))
10856                 return PTR_ERR(ptr);
10857
10858         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
10859         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
10860 }
10861
10862 #else /* !CONFIG_MMU */
10863
10864 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
10865 {
10866         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
10867 }
10868
10869 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
10870 {
10871         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
10872 }
10873
/*
 * !MMU only: the "unmapped area" is simply the kernel address of the ring
 * region selected by @pgoff, or the negative error from validation.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *base = io_uring_validate_mmap_request(file, pgoff, len);

	if (!IS_ERR(base))
		return (unsigned long) base;

	return PTR_ERR(base);
}
10886
10887 #endif /* !CONFIG_MMU */
10888
10889 static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
10890 {
10891         DEFINE_WAIT(wait);
10892
10893         do {
10894                 if (!io_sqring_full(ctx))
10895                         break;
10896                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
10897
10898                 if (!io_sqring_full(ctx))
10899                         break;
10900                 schedule();
10901         } while (!signal_pending(current));
10902
10903         finish_wait(&ctx->sqo_sq_wait, &wait);
10904         return 0;
10905 }
10906
10907 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
10908                           struct __kernel_timespec __user **ts,
10909                           const sigset_t __user **sig)
10910 {
10911         struct io_uring_getevents_arg arg;
10912
10913         /*
10914          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
10915          * is just a pointer to the sigset_t.
10916          */
10917         if (!(flags & IORING_ENTER_EXT_ARG)) {
10918                 *sig = (const sigset_t __user *) argp;
10919                 *ts = NULL;
10920                 return 0;
10921         }
10922
10923         /*
10924          * EXT_ARG is set - ensure we agree on the size of it and copy in our
10925          * timespec and sigset_t pointers if good.
10926          */
10927         if (*argsz != sizeof(arg))
10928                 return -EINVAL;
10929         if (copy_from_user(&arg, argp, sizeof(arg)))
10930                 return -EFAULT;
10931         *sig = u64_to_user_ptr(arg.sigmask);
10932         *argsz = arg.sigmask_sz;
10933         *ts = u64_to_user_ptr(arg.ts);
10934         return 0;
10935 }
10936
10937 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
10938                 u32, min_complete, u32, flags, const void __user *, argp,
10939                 size_t, argsz)
10940 {
10941         struct io_ring_ctx *ctx;
10942         int submitted = 0;
10943         struct fd f;
10944         long ret;
10945
10946         io_run_task_work();
10947
10948         if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
10949                                IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
10950                                IORING_ENTER_REGISTERED_RING)))
10951                 return -EINVAL;
10952
10953         /*
10954          * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
10955          * need only dereference our task private array to find it.
10956          */
10957         if (flags & IORING_ENTER_REGISTERED_RING) {
10958                 struct io_uring_task *tctx = current->io_uring;
10959
10960                 if (!tctx || fd >= IO_RINGFD_REG_MAX)
10961                         return -EINVAL;
10962                 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
10963                 f.file = tctx->registered_rings[fd];
10964                 if (unlikely(!f.file))
10965                         return -EBADF;
10966         } else {
10967                 f = fdget(fd);
10968                 if (unlikely(!f.file))
10969                         return -EBADF;
10970         }
10971
10972         ret = -EOPNOTSUPP;
10973         if (unlikely(f.file->f_op != &io_uring_fops))
10974                 goto out_fput;
10975
10976         ret = -ENXIO;
10977         ctx = f.file->private_data;
10978         if (unlikely(!percpu_ref_tryget(&ctx->refs)))
10979                 goto out_fput;
10980
10981         ret = -EBADFD;
10982         if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
10983                 goto out;
10984
10985         /*
10986          * For SQ polling, the thread will do all submissions and completions.
10987          * Just return the requested submit count, and wake the thread if
10988          * we were asked to.
10989          */
10990         ret = 0;
10991         if (ctx->flags & IORING_SETUP_SQPOLL) {
10992                 io_cqring_overflow_flush(ctx);
10993
10994                 if (unlikely(ctx->sq_data->thread == NULL)) {
10995                         ret = -EOWNERDEAD;
10996                         goto out;
10997                 }
10998                 if (flags & IORING_ENTER_SQ_WAKEUP)
10999                         wake_up(&ctx->sq_data->wait);
11000                 if (flags & IORING_ENTER_SQ_WAIT) {
11001                         ret = io_sqpoll_wait_sq(ctx);
11002                         if (ret)
11003                                 goto out;
11004                 }
11005                 submitted = to_submit;
11006         } else if (to_submit) {
11007                 ret = io_uring_add_tctx_node(ctx);
11008                 if (unlikely(ret))
11009                         goto out;
11010                 mutex_lock(&ctx->uring_lock);
11011                 submitted = io_submit_sqes(ctx, to_submit);
11012                 mutex_unlock(&ctx->uring_lock);
11013
11014                 if (submitted != to_submit)
11015                         goto out;
11016         }
11017         if (flags & IORING_ENTER_GETEVENTS) {
11018                 const sigset_t __user *sig;
11019                 struct __kernel_timespec __user *ts;
11020
11021                 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
11022                 if (unlikely(ret))
11023                         goto out;
11024
11025                 min_complete = min(min_complete, ctx->cq_entries);
11026
11027                 /*
11028                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
11029                  * space applications don't need to do io completion events
11030                  * polling again, they can rely on io_sq_thread to do polling
11031                  * work, which can reduce cpu usage and uring_lock contention.
11032                  */
11033                 if (ctx->flags & IORING_SETUP_IOPOLL &&
11034                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
11035                         ret = io_iopoll_check(ctx, min_complete);
11036                 } else {
11037                         ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
11038                 }
11039         }
11040
11041 out:
11042         percpu_ref_put(&ctx->refs);
11043 out_fput:
11044         if (!(flags & IORING_ENTER_REGISTERED_RING))
11045                 fdput(f);
11046         return submitted ? submitted : ret;
11047 }
11048
11049 #ifdef CONFIG_PROC_FS
11050 static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
11051                 const struct cred *cred)
11052 {
11053         struct user_namespace *uns = seq_user_ns(m);
11054         struct group_info *gi;
11055         kernel_cap_t cap;
11056         unsigned __capi;
11057         int g;
11058
11059         seq_printf(m, "%5d\n", id);
11060         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
11061         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
11062         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
11063         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
11064         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
11065         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
11066         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
11067         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
11068         seq_puts(m, "\n\tGroups:\t");
11069         gi = cred->group_info;
11070         for (g = 0; g < gi->ngroups; g++) {
11071                 seq_put_decimal_ull(m, g ? " " : "",
11072                                         from_kgid_munged(uns, gi->gid[g]));
11073         }
11074         seq_puts(m, "\n\tCapEff:\t");
11075         cap = cred->cap_effective;
11076         CAP_FOR_EACH_U32(__capi)
11077                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
11078         seq_putc(m, '\n');
11079         return 0;
11080 }
11081
11082 static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
11083                                           struct seq_file *m)
11084 {
11085         struct io_sq_data *sq = NULL;
11086         struct io_overflow_cqe *ocqe;
11087         struct io_rings *r = ctx->rings;
11088         unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
11089         unsigned int sq_head = READ_ONCE(r->sq.head);
11090         unsigned int sq_tail = READ_ONCE(r->sq.tail);
11091         unsigned int cq_head = READ_ONCE(r->cq.head);
11092         unsigned int cq_tail = READ_ONCE(r->cq.tail);
11093         unsigned int sq_entries, cq_entries;
11094         bool has_lock;
11095         unsigned int i;
11096
11097         /*
11098          * we may get imprecise sqe and cqe info if uring is actively running
11099          * since we get cached_sq_head and cached_cq_tail without uring_lock
11100          * and sq_tail and cq_head are changed by userspace. But it's ok since
11101          * we usually use these info when it is stuck.
11102          */
11103         seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
11104         seq_printf(m, "SqHead:\t%u\n", sq_head);
11105         seq_printf(m, "SqTail:\t%u\n", sq_tail);
11106         seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
11107         seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
11108         seq_printf(m, "CqHead:\t%u\n", cq_head);
11109         seq_printf(m, "CqTail:\t%u\n", cq_tail);
11110         seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
11111         seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
11112         sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
11113         for (i = 0; i < sq_entries; i++) {
11114                 unsigned int entry = i + sq_head;
11115                 unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
11116                 struct io_uring_sqe *sqe;
11117
11118                 if (sq_idx > sq_mask)
11119                         continue;
11120                 sqe = &ctx->sq_sqes[sq_idx];
11121                 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
11122                            sq_idx, sqe->opcode, sqe->fd, sqe->flags,
11123                            sqe->user_data);
11124         }
11125         seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
11126         cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
11127         for (i = 0; i < cq_entries; i++) {
11128                 unsigned int entry = i + cq_head;
11129                 struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
11130
11131                 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
11132                            entry & cq_mask, cqe->user_data, cqe->res,
11133                            cqe->flags);
11134         }
11135
11136         /*
11137          * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
11138          * since fdinfo case grabs it in the opposite direction of normal use
11139          * cases. If we fail to get the lock, we just don't iterate any
11140          * structures that could be going away outside the io_uring mutex.
11141          */
11142         has_lock = mutex_trylock(&ctx->uring_lock);
11143
11144         if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
11145                 sq = ctx->sq_data;
11146                 if (!sq->thread)
11147                         sq = NULL;
11148         }
11149
11150         seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
11151         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
11152         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
11153         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
11154                 struct file *f = io_file_from_index(ctx, i);
11155
11156                 if (f)
11157                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
11158                 else
11159                         seq_printf(m, "%5u: <none>\n", i);
11160         }
11161         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
11162         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
11163                 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
11164                 unsigned int len = buf->ubuf_end - buf->ubuf;
11165
11166                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
11167         }
11168         if (has_lock && !xa_empty(&ctx->personalities)) {
11169                 unsigned long index;
11170                 const struct cred *cred;
11171
11172                 seq_printf(m, "Personalities:\n");
11173                 xa_for_each(&ctx->personalities, index, cred)
11174                         io_uring_show_cred(m, index, cred);
11175         }
11176         if (has_lock)
11177                 mutex_unlock(&ctx->uring_lock);
11178
11179         seq_puts(m, "PollList:\n");
11180         spin_lock(&ctx->completion_lock);
11181         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
11182                 struct hlist_head *list = &ctx->cancel_hash[i];
11183                 struct io_kiocb *req;
11184
11185                 hlist_for_each_entry(req, list, hash_node)
11186                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
11187                                         task_work_pending(req->task));
11188         }
11189
11190         seq_puts(m, "CqOverflowList:\n");
11191         list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
11192                 struct io_uring_cqe *cqe = &ocqe->cqe;
11193
11194                 seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
11195                            cqe->user_data, cqe->res, cqe->flags);
11196
11197         }
11198
11199         spin_unlock(&ctx->completion_lock);
11200 }
11201
11202 static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
11203 {
11204         struct io_ring_ctx *ctx = f->private_data;
11205
11206         if (percpu_ref_tryget(&ctx->refs)) {
11207                 __io_uring_show_fdinfo(ctx, m);
11208                 percpu_ref_put(&ctx->refs);
11209         }
11210 }
11211 #endif
11212
11213 static const struct file_operations io_uring_fops = {
11214         .release        = io_uring_release,
11215         .mmap           = io_uring_mmap,
11216 #ifndef CONFIG_MMU
11217         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
11218         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
11219 #endif
11220         .poll           = io_uring_poll,
11221 #ifdef CONFIG_PROC_FS
11222         .show_fdinfo    = io_uring_show_fdinfo,
11223 #endif
11224 };
11225
11226 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
11227                                          struct io_uring_params *p)
11228 {
11229         struct io_rings *rings;
11230         size_t size, sq_array_offset;
11231
11232         /* make sure these are sane, as we already accounted them */
11233         ctx->sq_entries = p->sq_entries;
11234         ctx->cq_entries = p->cq_entries;
11235
11236         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
11237         if (size == SIZE_MAX)
11238                 return -EOVERFLOW;
11239
11240         rings = io_mem_alloc(size);
11241         if (!rings)
11242                 return -ENOMEM;
11243
11244         ctx->rings = rings;
11245         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
11246         rings->sq_ring_mask = p->sq_entries - 1;
11247         rings->cq_ring_mask = p->cq_entries - 1;
11248         rings->sq_ring_entries = p->sq_entries;
11249         rings->cq_ring_entries = p->cq_entries;
11250
11251         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
11252         if (size == SIZE_MAX) {
11253                 io_mem_free(ctx->rings);
11254                 ctx->rings = NULL;
11255                 return -EOVERFLOW;
11256         }
11257
11258         ctx->sq_sqes = io_mem_alloc(size);
11259         if (!ctx->sq_sqes) {
11260                 io_mem_free(ctx->rings);
11261                 ctx->rings = NULL;
11262                 return -ENOMEM;
11263         }
11264
11265         return 0;
11266 }
11267
11268 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
11269 {
11270         int ret, fd;
11271
11272         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
11273         if (fd < 0)
11274                 return fd;
11275
11276         ret = io_uring_add_tctx_node(ctx);
11277         if (ret) {
11278                 put_unused_fd(fd);
11279                 return ret;
11280         }
11281         fd_install(fd, file);
11282         return fd;
11283 }
11284
11285 /*
11286  * Allocate an anonymous fd, this is what constitutes the application
11287  * visible backing of an io_uring instance. The application mmaps this
11288  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
11289  * we have to tie this fd to a socket for file garbage collection purposes.
11290  */
11291 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
11292 {
11293         struct file *file;
11294 #if defined(CONFIG_UNIX)
11295         int ret;
11296
11297         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
11298                                 &ctx->ring_sock);
11299         if (ret)
11300                 return ERR_PTR(ret);
11301 #endif
11302
11303         file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
11304                                          O_RDWR | O_CLOEXEC, NULL);
11305 #if defined(CONFIG_UNIX)
11306         if (IS_ERR(file)) {
11307                 sock_release(ctx->ring_sock);
11308                 ctx->ring_sock = NULL;
11309         } else {
11310                 ctx->ring_sock->file = file;
11311         }
11312 #endif
11313         return file;
11314 }
11315
11316 static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
11317                                   struct io_uring_params __user *params)
11318 {
11319         struct io_ring_ctx *ctx;
11320         struct file *file;
11321         int ret;
11322
11323         if (!entries)
11324                 return -EINVAL;
11325         if (entries > IORING_MAX_ENTRIES) {
11326                 if (!(p->flags & IORING_SETUP_CLAMP))
11327                         return -EINVAL;
11328                 entries = IORING_MAX_ENTRIES;
11329         }
11330
11331         /*
11332          * Use twice as many entries for the CQ ring. It's possible for the
11333          * application to drive a higher depth than the size of the SQ ring,
11334          * since the sqes are only used at submission time. This allows for
11335          * some flexibility in overcommitting a bit. If the application has
11336          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
11337          * of CQ ring entries manually.
11338          */
11339         p->sq_entries = roundup_pow_of_two(entries);
11340         if (p->flags & IORING_SETUP_CQSIZE) {
11341                 /*
11342                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
11343                  * to a power-of-two, if it isn't already. We do NOT impose
11344                  * any cq vs sq ring sizing.
11345                  */
11346                 if (!p->cq_entries)
11347                         return -EINVAL;
11348                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
11349                         if (!(p->flags & IORING_SETUP_CLAMP))
11350                                 return -EINVAL;
11351                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
11352                 }
11353                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
11354                 if (p->cq_entries < p->sq_entries)
11355                         return -EINVAL;
11356         } else {
11357                 p->cq_entries = 2 * p->sq_entries;
11358         }
11359
11360         ctx = io_ring_ctx_alloc(p);
11361         if (!ctx)
11362                 return -ENOMEM;
11363         ctx->compat = in_compat_syscall();
11364         if (!capable(CAP_IPC_LOCK))
11365                 ctx->user = get_uid(current_user());
11366
11367         /*
11368          * This is just grabbed for accounting purposes. When a process exits,
11369          * the mm is exited and dropped before the files, hence we need to hang
11370          * on to this mm purely for the purposes of being able to unaccount
11371          * memory (locked/pinned vm). It's not used for anything else.
11372          */
11373         mmgrab(current->mm);
11374         ctx->mm_account = current->mm;
11375
11376         ret = io_allocate_scq_urings(ctx, p);
11377         if (ret)
11378                 goto err;
11379
11380         ret = io_sq_offload_create(ctx, p);
11381         if (ret)
11382                 goto err;
11383         /* always set a rsrc node */
11384         ret = io_rsrc_node_switch_start(ctx);
11385         if (ret)
11386                 goto err;
11387         io_rsrc_node_switch(ctx, NULL);
11388
11389         memset(&p->sq_off, 0, sizeof(p->sq_off));
11390         p->sq_off.head = offsetof(struct io_rings, sq.head);
11391         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
11392         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
11393         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
11394         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
11395         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
11396         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
11397
11398         memset(&p->cq_off, 0, sizeof(p->cq_off));
11399         p->cq_off.head = offsetof(struct io_rings, cq.head);
11400         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
11401         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
11402         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
11403         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
11404         p->cq_off.cqes = offsetof(struct io_rings, cqes);
11405         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
11406
11407         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
11408                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
11409                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
11410                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
11411                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
11412                         IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP;
11413
11414         if (copy_to_user(params, p, sizeof(*p))) {
11415                 ret = -EFAULT;
11416                 goto err;
11417         }
11418
11419         file = io_uring_get_file(ctx);
11420         if (IS_ERR(file)) {
11421                 ret = PTR_ERR(file);
11422                 goto err;
11423         }
11424
11425         /*
11426          * Install ring fd as the very last thing, so we don't risk someone
11427          * having closed it before we finish setup
11428          */
11429         ret = io_uring_install_fd(ctx, file);
11430         if (ret < 0) {
11431                 /* fput will clean it up */
11432                 fput(file);
11433                 return ret;
11434         }
11435
11436         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
11437         return ret;
11438 err:
11439         io_ring_ctx_wait_and_kill(ctx);
11440         return ret;
11441 }
11442
11443 /*
11444  * Sets up an aio uring context, and returns the fd. Applications asks for a
11445  * ring size, we return the actual sq/cq ring sizes (among other things) in the
11446  * params structure passed in.
11447  */
11448 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
11449 {
11450         struct io_uring_params p;
11451         int i;
11452
11453         if (copy_from_user(&p, params, sizeof(p)))
11454                 return -EFAULT;
11455         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
11456                 if (p.resv[i])
11457                         return -EINVAL;
11458         }
11459
11460         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
11461                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
11462                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
11463                         IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
11464                 return -EINVAL;
11465
11466         return  io_uring_create(entries, &p, params);
11467 }
11468
11469 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
11470                 struct io_uring_params __user *, params)
11471 {
11472         return io_uring_setup(entries, params);
11473 }
11474
11475 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
11476                            unsigned nr_args)
11477 {
11478         struct io_uring_probe *p;
11479         size_t size;
11480         int i, ret;
11481
11482         size = struct_size(p, ops, nr_args);
11483         if (size == SIZE_MAX)
11484                 return -EOVERFLOW;
11485         p = kzalloc(size, GFP_KERNEL);
11486         if (!p)
11487                 return -ENOMEM;
11488
11489         ret = -EFAULT;
11490         if (copy_from_user(p, arg, size))
11491                 goto out;
11492         ret = -EINVAL;
11493         if (memchr_inv(p, 0, size))
11494                 goto out;
11495
11496         p->last_op = IORING_OP_LAST - 1;
11497         if (nr_args > IORING_OP_LAST)
11498                 nr_args = IORING_OP_LAST;
11499
11500         for (i = 0; i < nr_args; i++) {
11501                 p->ops[i].op = i;
11502                 if (!io_op_defs[i].not_supported)
11503                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
11504         }
11505         p->ops_len = i;
11506
11507         ret = 0;
11508         if (copy_to_user(arg, p, size))
11509                 ret = -EFAULT;
11510 out:
11511         kfree(p);
11512         return ret;
11513 }
11514
11515 static int io_register_personality(struct io_ring_ctx *ctx)
11516 {
11517         const struct cred *creds;
11518         u32 id;
11519         int ret;
11520
11521         creds = get_current_cred();
11522
11523         ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
11524                         XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
11525         if (ret < 0) {
11526                 put_cred(creds);
11527                 return ret;
11528         }
11529         return id;
11530 }
11531
11532 static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
11533                                            void __user *arg, unsigned int nr_args)
11534 {
11535         struct io_uring_restriction *res;
11536         size_t size;
11537         int i, ret;
11538
11539         /* Restrictions allowed only if rings started disabled */
11540         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
11541                 return -EBADFD;
11542
11543         /* We allow only a single restrictions registration */
11544         if (ctx->restrictions.registered)
11545                 return -EBUSY;
11546
11547         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
11548                 return -EINVAL;
11549
11550         size = array_size(nr_args, sizeof(*res));
11551         if (size == SIZE_MAX)
11552                 return -EOVERFLOW;
11553
11554         res = memdup_user(arg, size);
11555         if (IS_ERR(res))
11556                 return PTR_ERR(res);
11557
11558         ret = 0;
11559
11560         for (i = 0; i < nr_args; i++) {
11561                 switch (res[i].opcode) {
11562                 case IORING_RESTRICTION_REGISTER_OP:
11563                         if (res[i].register_op >= IORING_REGISTER_LAST) {
11564                                 ret = -EINVAL;
11565                                 goto out;
11566                         }
11567
11568                         __set_bit(res[i].register_op,
11569                                   ctx->restrictions.register_op);
11570                         break;
11571                 case IORING_RESTRICTION_SQE_OP:
11572                         if (res[i].sqe_op >= IORING_OP_LAST) {
11573                                 ret = -EINVAL;
11574                                 goto out;
11575                         }
11576
11577                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
11578                         break;
11579                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
11580                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
11581                         break;
11582                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
11583                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
11584                         break;
11585                 default:
11586                         ret = -EINVAL;
11587                         goto out;
11588                 }
11589         }
11590
11591 out:
11592         /* Reset all restrictions if an error happened */
11593         if (ret != 0)
11594                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
11595         else
11596                 ctx->restrictions.registered = true;
11597
11598         kfree(res);
11599         return ret;
11600 }
11601
11602 static int io_register_enable_rings(struct io_ring_ctx *ctx)
11603 {
11604         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
11605                 return -EBADFD;
11606
11607         if (ctx->restrictions.registered)
11608                 ctx->restricted = 1;
11609
11610         ctx->flags &= ~IORING_SETUP_R_DISABLED;
11611         if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
11612                 wake_up(&ctx->sq_data->wait);
11613         return 0;
11614 }
11615
11616 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
11617                                      struct io_uring_rsrc_update2 *up,
11618                                      unsigned nr_args)
11619 {
11620         __u32 tmp;
11621         int err;
11622
11623         if (up->resv)
11624                 return -EINVAL;
11625         if (check_add_overflow(up->offset, nr_args, &tmp))
11626                 return -EOVERFLOW;
11627         err = io_rsrc_node_switch_start(ctx);
11628         if (err)
11629                 return err;
11630
11631         switch (type) {
11632         case IORING_RSRC_FILE:
11633                 return __io_sqe_files_update(ctx, up, nr_args);
11634         case IORING_RSRC_BUFFER:
11635                 return __io_sqe_buffers_update(ctx, up, nr_args);
11636         }
11637         return -EINVAL;
11638 }
11639
11640 static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
11641                                     unsigned nr_args)
11642 {
11643         struct io_uring_rsrc_update2 up;
11644
11645         if (!nr_args)
11646                 return -EINVAL;
11647         memset(&up, 0, sizeof(up));
11648         if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
11649                 return -EFAULT;
11650         return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
11651 }
11652
11653 static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
11654                                    unsigned size, unsigned type)
11655 {
11656         struct io_uring_rsrc_update2 up;
11657
11658         if (size != sizeof(up))
11659                 return -EINVAL;
11660         if (copy_from_user(&up, arg, sizeof(up)))
11661                 return -EFAULT;
11662         if (!up.nr || up.resv)
11663                 return -EINVAL;
11664         return __io_register_rsrc_update(ctx, type, &up, up.nr);
11665 }
11666
11667 static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
11668                             unsigned int size, unsigned int type)
11669 {
11670         struct io_uring_rsrc_register rr;
11671
11672         /* keep it extendible */
11673         if (size != sizeof(rr))
11674                 return -EINVAL;
11675
11676         memset(&rr, 0, sizeof(rr));
11677         if (copy_from_user(&rr, arg, size))
11678                 return -EFAULT;
11679         if (!rr.nr || rr.resv || rr.resv2)
11680                 return -EINVAL;
11681
11682         switch (type) {
11683         case IORING_RSRC_FILE:
11684                 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
11685                                              rr.nr, u64_to_user_ptr(rr.tags));
11686         case IORING_RSRC_BUFFER:
11687                 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
11688                                                rr.nr, u64_to_user_ptr(rr.tags));
11689         }
11690         return -EINVAL;
11691 }
11692
11693 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
11694                                        void __user *arg, unsigned len)
11695 {
11696         struct io_uring_task *tctx = current->io_uring;
11697         cpumask_var_t new_mask;
11698         int ret;
11699
11700         if (!tctx || !tctx->io_wq)
11701                 return -EINVAL;
11702
11703         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
11704                 return -ENOMEM;
11705
11706         cpumask_clear(new_mask);
11707         if (len > cpumask_size())
11708                 len = cpumask_size();
11709
11710         if (copy_from_user(new_mask, arg, len)) {
11711                 free_cpumask_var(new_mask);
11712                 return -EFAULT;
11713         }
11714
11715         ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
11716         free_cpumask_var(new_mask);
11717         return ret;
11718 }
11719
11720 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
11721 {
11722         struct io_uring_task *tctx = current->io_uring;
11723
11724         if (!tctx || !tctx->io_wq)
11725                 return -EINVAL;
11726
11727         return io_wq_cpu_affinity(tctx->io_wq, NULL);
11728 }
11729
11730 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
11731                                                void __user *arg)
11732         __must_hold(&ctx->uring_lock)
11733 {
11734         struct io_tctx_node *node;
11735         struct io_uring_task *tctx = NULL;
11736         struct io_sq_data *sqd = NULL;
11737         __u32 new_count[2];
11738         int i, ret;
11739
11740         if (copy_from_user(new_count, arg, sizeof(new_count)))
11741                 return -EFAULT;
11742         for (i = 0; i < ARRAY_SIZE(new_count); i++)
11743                 if (new_count[i] > INT_MAX)
11744                         return -EINVAL;
11745
11746         if (ctx->flags & IORING_SETUP_SQPOLL) {
11747                 sqd = ctx->sq_data;
11748                 if (sqd) {
11749                         /*
11750                          * Observe the correct sqd->lock -> ctx->uring_lock
11751                          * ordering. Fine to drop uring_lock here, we hold
11752                          * a ref to the ctx.
11753                          */
11754                         refcount_inc(&sqd->refs);
11755                         mutex_unlock(&ctx->uring_lock);
11756                         mutex_lock(&sqd->lock);
11757                         mutex_lock(&ctx->uring_lock);
11758                         if (sqd->thread)
11759                                 tctx = sqd->thread->io_uring;
11760                 }
11761         } else {
11762                 tctx = current->io_uring;
11763         }
11764
11765         BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
11766
11767         for (i = 0; i < ARRAY_SIZE(new_count); i++)
11768                 if (new_count[i])
11769                         ctx->iowq_limits[i] = new_count[i];
11770         ctx->iowq_limits_set = true;
11771
11772         if (tctx && tctx->io_wq) {
11773                 ret = io_wq_max_workers(tctx->io_wq, new_count);
11774                 if (ret)
11775                         goto err;
11776         } else {
11777                 memset(new_count, 0, sizeof(new_count));
11778         }
11779
11780         if (sqd) {
11781                 mutex_unlock(&sqd->lock);
11782                 io_put_sq_data(sqd);
11783         }
11784
11785         if (copy_to_user(arg, new_count, sizeof(new_count)))
11786                 return -EFAULT;
11787
11788         /* that's it for SQPOLL, only the SQPOLL task creates requests */
11789         if (sqd)
11790                 return 0;
11791
11792         /* now propagate the restriction to all registered users */
11793         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
11794                 struct io_uring_task *tctx = node->task->io_uring;
11795
11796                 if (WARN_ON_ONCE(!tctx->io_wq))
11797                         continue;
11798
11799                 for (i = 0; i < ARRAY_SIZE(new_count); i++)
11800                         new_count[i] = ctx->iowq_limits[i];
11801                 /* ignore errors, it always returns zero anyway */
11802                 (void)io_wq_max_workers(tctx->io_wq, new_count);
11803         }
11804         return 0;
11805 err:
11806         if (sqd) {
11807                 mutex_unlock(&sqd->lock);
11808                 io_put_sq_data(sqd);
11809         }
11810         return ret;
11811 }
11812
11813 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
11814                                void __user *arg, unsigned nr_args)
11815         __releases(ctx->uring_lock)
11816         __acquires(ctx->uring_lock)
11817 {
11818         int ret;
11819
11820         /*
11821          * We're inside the ring mutex, if the ref is already dying, then
11822          * someone else killed the ctx or is already going through
11823          * io_uring_register().
11824          */
11825         if (percpu_ref_is_dying(&ctx->refs))
11826                 return -ENXIO;
11827
11828         if (ctx->restricted) {
11829                 if (opcode >= IORING_REGISTER_LAST)
11830                         return -EINVAL;
11831                 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
11832                 if (!test_bit(opcode, ctx->restrictions.register_op))
11833                         return -EACCES;
11834         }
11835
11836         switch (opcode) {
11837         case IORING_REGISTER_BUFFERS:
11838                 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
11839                 break;
11840         case IORING_UNREGISTER_BUFFERS:
11841                 ret = -EINVAL;
11842                 if (arg || nr_args)
11843                         break;
11844                 ret = io_sqe_buffers_unregister(ctx);
11845                 break;
11846         case IORING_REGISTER_FILES:
11847                 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
11848                 break;
11849         case IORING_UNREGISTER_FILES:
11850                 ret = -EINVAL;
11851                 if (arg || nr_args)
11852                         break;
11853                 ret = io_sqe_files_unregister(ctx);
11854                 break;
11855         case IORING_REGISTER_FILES_UPDATE:
11856                 ret = io_register_files_update(ctx, arg, nr_args);
11857                 break;
11858         case IORING_REGISTER_EVENTFD:
11859                 ret = -EINVAL;
11860                 if (nr_args != 1)
11861                         break;
11862                 ret = io_eventfd_register(ctx, arg, 0);
11863                 break;
11864         case IORING_REGISTER_EVENTFD_ASYNC:
11865                 ret = -EINVAL;
11866                 if (nr_args != 1)
11867                         break;
11868                 ret = io_eventfd_register(ctx, arg, 1);
11869                 break;
11870         case IORING_UNREGISTER_EVENTFD:
11871                 ret = -EINVAL;
11872                 if (arg || nr_args)
11873                         break;
11874                 ret = io_eventfd_unregister(ctx);
11875                 break;
11876         case IORING_REGISTER_PROBE:
11877                 ret = -EINVAL;
11878                 if (!arg || nr_args > 256)
11879                         break;
11880                 ret = io_probe(ctx, arg, nr_args);
11881                 break;
11882         case IORING_REGISTER_PERSONALITY:
11883                 ret = -EINVAL;
11884                 if (arg || nr_args)
11885                         break;
11886                 ret = io_register_personality(ctx);
11887                 break;
11888         case IORING_UNREGISTER_PERSONALITY:
11889                 ret = -EINVAL;
11890                 if (arg)
11891                         break;
11892                 ret = io_unregister_personality(ctx, nr_args);
11893                 break;
11894         case IORING_REGISTER_ENABLE_RINGS:
11895                 ret = -EINVAL;
11896                 if (arg || nr_args)
11897                         break;
11898                 ret = io_register_enable_rings(ctx);
11899                 break;
11900         case IORING_REGISTER_RESTRICTIONS:
11901                 ret = io_register_restrictions(ctx, arg, nr_args);
11902                 break;
11903         case IORING_REGISTER_FILES2:
11904                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
11905                 break;
11906         case IORING_REGISTER_FILES_UPDATE2:
11907                 ret = io_register_rsrc_update(ctx, arg, nr_args,
11908                                               IORING_RSRC_FILE);
11909                 break;
11910         case IORING_REGISTER_BUFFERS2:
11911                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
11912                 break;
11913         case IORING_REGISTER_BUFFERS_UPDATE:
11914                 ret = io_register_rsrc_update(ctx, arg, nr_args,
11915                                               IORING_RSRC_BUFFER);
11916                 break;
11917         case IORING_REGISTER_IOWQ_AFF:
11918                 ret = -EINVAL;
11919                 if (!arg || !nr_args)
11920                         break;
11921                 ret = io_register_iowq_aff(ctx, arg, nr_args);
11922                 break;
11923         case IORING_UNREGISTER_IOWQ_AFF:
11924                 ret = -EINVAL;
11925                 if (arg || nr_args)
11926                         break;
11927                 ret = io_unregister_iowq_aff(ctx);
11928                 break;
11929         case IORING_REGISTER_IOWQ_MAX_WORKERS:
11930                 ret = -EINVAL;
11931                 if (!arg || nr_args != 2)
11932                         break;
11933                 ret = io_register_iowq_max_workers(ctx, arg);
11934                 break;
11935         case IORING_REGISTER_RING_FDS:
11936                 ret = io_ringfd_register(ctx, arg, nr_args);
11937                 break;
11938         case IORING_UNREGISTER_RING_FDS:
11939                 ret = io_ringfd_unregister(ctx, arg, nr_args);
11940                 break;
11941         default:
11942                 ret = -EINVAL;
11943                 break;
11944         }
11945
11946         return ret;
11947 }
11948
11949 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
11950                 void __user *, arg, unsigned int, nr_args)
11951 {
11952         struct io_ring_ctx *ctx;
11953         long ret = -EBADF;
11954         struct fd f;
11955
11956         f = fdget(fd);
11957         if (!f.file)
11958                 return -EBADF;
11959
11960         ret = -EOPNOTSUPP;
11961         if (f.file->f_op != &io_uring_fops)
11962                 goto out_fput;
11963
11964         ctx = f.file->private_data;
11965
11966         io_run_task_work();
11967
11968         mutex_lock(&ctx->uring_lock);
11969         ret = __io_uring_register(ctx, opcode, arg, nr_args);
11970         mutex_unlock(&ctx->uring_lock);
11971         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
11972 out_fput:
11973         fdput(f);
11974         return ret;
11975 }
11976
11977 static int __init io_uring_init(void)
11978 {
11979 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
11980         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
11981         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
11982 } while (0)
11983
11984 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
11985         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
11986         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
11987         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
11988         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
11989         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
11990         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
11991         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
11992         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
11993         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
11994         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
11995         BUILD_BUG_SQE_ELEM(24, __u32,  len);
11996         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
11997         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
11998         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
11999         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
12000         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
12001         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
12002         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
12003         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
12004         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
12005         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
12006         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
12007         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
12008         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
12009         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
12010         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
12011         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
12012         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
12013         BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
12014         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
12015         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
12016         BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
12017
12018         BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
12019                      sizeof(struct io_uring_rsrc_update));
12020         BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
12021                      sizeof(struct io_uring_rsrc_update2));
12022
12023         /* ->buf_index is u16 */
12024         BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
12025
12026         /* should fit into one byte */
12027         BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
12028         BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
12029         BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
12030
12031         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
12032         BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
12033
12034         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
12035                                 SLAB_ACCOUNT);
12036         return 0;
12037 };
12038 __initcall(io_uring_init);