fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store to or load from
  35  * data shared between the kernel and application. This is done both for
  36  * ordering purposes and to ensure that once a value is loaded from data
  37  * that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73 #include <linux/namei.h>
  74 #include <linux/fsnotify.h>
  75 #include <linux/fadvise.h>
  76 #include <linux/eventpoll.h>
  77 #include <linux/fs_struct.h>
  78 #include <linux/splice.h>
  79 #include <linux/task_work.h>
  80 #include <linux/pagemap.h>
  81 #include <linux/io_uring.h>
  82 #include <linux/blk-cgroup.h>
  83 #include <linux/audit.h>
  84
  85 #define CREATE_TRACE_POINTS
  86 #include <trace/events/io_uring.h>
  87
  88 #include <uapi/linux/io_uring.h>
  89
  90 #include "internal.h"
  91 #include "io-wq.h"
  92
  93 #define IORING_MAX_ENTRIES      32768   /* NOTE(review): presumably the cap on SQ ring entries; confirm in the setup path */
  94 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)   /* twice IORING_MAX_ENTRIES, i.e. 65536 */
  95
  96 /*
  97  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
      * (NOTE(review): presumably 512 pointer-sized slots in a 4K page; confirm
      * against the table allocation code).
      *
      * Derived values: IORING_MAX_FILES_TABLE is 512, IORING_FILE_TABLE_MASK is
      * 511 and IORING_MAX_FIXED_FILES is 64 * 512 = 32768.
      * IORING_MAX_RESTRICTIONS is the sum of three *_LAST constants that are
      * declared elsewhere (NOTE(review): presumably the uapi io_uring header).
  98  */
  99 #define IORING_FILE_TABLE_SHIFT 9
 100 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
 101 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
 102 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
 103 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
 104                                  IORING_REGISTER_LAST + IORING_OP_LAST)
 105
 106 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
 107                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
 108                                 IOSQE_BUFFER_SELECT)
     /*
      * SQE_VALID_FLAGS is the OR of the six IOSQE_* flags listed above.
      * NOTE(review): presumably the sqe flags accepted from userspace; confirm
      * at the use site.
      */
 109
 110 struct io_uring {
             /*
              * Head/tail index pair of one ring; struct io_rings embeds one
              * instance for the SQ and one for the CQ. Both indices carry
              * ____cacheline_aligned_in_smp.
              */
 111         u32 head ____cacheline_aligned_in_smp;
 112         u32 tail ____cacheline_aligned_in_smp;
 113 };
 114
 115 /*
 116  * This data is shared with the application through the mmap at offsets
 117  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 118  *
 119  * The offsets to the member fields are published through struct
 120  * io_sqring_offsets when calling io_uring_setup.
      *
      * The completion entries (cqes[]) are the trailing flexible array member,
      * so any new fixed-size field has to be added before them.
 121  */
 122 struct io_rings {
 123         /*
 124          * Head and tail offsets into the ring; the offsets need to be
 125          * masked to get valid indices.
 126          *
 127          * The kernel controls head of the sq ring and the tail of the cq ring,
 128          * and the application controls tail of the sq ring and the head of the
 129          * cq ring.
 130          */
 131         struct io_uring         sq, cq;
 132         /*
 133          * Bitmasks to apply to head and tail offsets (constant, equals
 134          * ring_entries - 1)
 135          */
 136         u32                     sq_ring_mask, cq_ring_mask;
 137         /* Ring sizes (constant, power of 2) */
 138         u32                     sq_ring_entries, cq_ring_entries;
 139         /*
 140          * Number of invalid entries dropped by the kernel due to
 141          * invalid index stored in array
 142          *
 143          * Written by the kernel, shouldn't be modified by the
 144          * application (i.e. get number of "new events" by comparing to
 145          * cached value).
 146          *
 147          * After a new SQ head value was read by the application this
 148          * counter includes all submissions that were dropped reaching
 149          * the new SQ head (and possibly more).
 150          */
 151         u32                     sq_dropped;
 152         /*
 153          * Runtime SQ flags
 154          *
 155          * Written by the kernel, shouldn't be modified by the
 156          * application.
 157          *
 158          * The application needs a full memory barrier before checking
 159          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 160          */
 161         u32                     sq_flags;
 162         /*
 163          * Runtime CQ flags
 164          *
 165          * Written by the application, shouldn't be modified by the
 166          * kernel.
 167          */
 168         u32                     cq_flags;
 169         /*
 170          * Number of completion events lost because the queue was full;
 171          * this should be avoided by the application by making sure
 172          * there are not more requests pending than there is space in
 173          * the completion queue.
 174          *
 175          * Written by the kernel, shouldn't be modified by the
 176          * application (i.e. get number of "new events" by comparing to
 177          * cached value).
 178          *
 179          * As completion events come in out of order this counter is not
 180          * ordered with any other data.
 181          */
 182         u32                     cq_overflow;
 183         /*
 184          * Ring buffer of completion events.
 185          *
 186          * The kernel writes completion events fresh every time they are
 187          * produced, so the application is allowed to modify pending
 188          * entries.
              *
              * Flexible array member, cacheline aligned on SMP; must stay the
              * last member of the struct.
 189          */
 190         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 191 };
 192
 193 enum io_uring_cmd_flags {
             /*
              * Single-bit flag values (1 and 2), so they can be OR-ed together.
              * NOTE(review): presumably passed as the issue flags of opcode
              * handlers; confirm at the call sites.
              */
 194         IO_URING_F_NONBLOCK             = 1,
 195         IO_URING_F_COMPLETE_DEFER       = 2,
 196 };
 197
 198 struct io_mapped_ubuf {
             /*
              * One fixed mapped user buffer; io_ring_ctx::user_bufs points at
              * an array of these.
              * NOTE(review): ubuf/len are presumably the user address and byte
              * length, and acct_pages the pages charged for it; confirm in
              * the buffer registration code.
              */
 199         u64             ubuf;
 200         size_t          len;
 201         struct          bio_vec *bvec;         /* bio_vec array, nr_bvecs entries */
 202         unsigned int    nr_bvecs;
 203         unsigned long   acct_pages;
 204 };
 205
 206 struct io_ring_ctx;
 207
 208 struct io_rsrc_put {
             /*
              * List entry carrying one resource pointer. This is the argument
              * type of the fixed_rsrc_ref_node::rsrc_put callback.
              */
 209         struct list_head list;
             /* rsrc and file alias the same pointer storage */
 210         union {
 211                 void *rsrc;
 212                 struct file *file;
 213         };
 214 };
 215
 216 struct fixed_rsrc_table {
             /*
              * Array of struct file pointers.
              * NOTE(review): presumably one IORING_MAX_FILES_TABLE-sized chunk
              * of the fixed file set; confirm in the file registration code.
              */
 217         struct file             **files;
 218 };
 219
 220 struct fixed_rsrc_ref_node {
             /*
              * Reference node for fixed resources: a percpu_ref, list linkage,
              * a back pointer to the owning fixed_rsrc_data and a callback
              * that takes the ring context and an io_rsrc_put.
              * NOTE(review): presumably rsrc_list queues io_rsrc_put entries
              * that are released through ->rsrc_put once refs drop, and llist
              * links the node into io_ring_ctx::rsrc_put_llist; confirm in
              * the put path.
              */
 221         struct percpu_ref               refs;
 222         struct list_head                node;
 223         struct list_head                rsrc_list;
 224         struct fixed_rsrc_data          *rsrc_data;
 225         void                            (*rsrc_put)(struct io_ring_ctx *ctx,
 226                                                     struct io_rsrc_put *prsrc);
 227         struct llist_node               llist;
 228         bool                            done;
 229 };
 230
 231 struct fixed_rsrc_data {
             /*
              * State of a fixed resource set; io_ring_ctx::file_data has this
              * type. Holds the table pointer, the owning ring context, a
              * fixed_rsrc_ref_node pointer and its own percpu_ref.
              * NOTE(review): done/quiesce are presumably used while waiting
              * for refs to drain; confirm in the unregister path.
              */
 232         struct fixed_rsrc_table         *table;
 233         struct io_ring_ctx              *ctx;
 234
 235         struct fixed_rsrc_ref_node      *node;
 236         struct percpu_ref               refs;
 237         struct completion               done;
 238         bool                            quiesce;
 239 };
 240
 241 struct io_buffer {
             /*
              * List-linked buffer descriptor: 64-bit address, signed 32-bit
              * length and 16-bit buffer id. io_sr_msg::kbuf points at one.
              * NOTE(review): presumably a buffer provided for
              * IOSQE_BUFFER_SELECT; confirm in the provide-buffers code.
              */
 242         struct list_head list;
 243         __u64 addr;
 244         __s32 len;
 245         __u16 bid;
 246 };
 247
 248 struct io_restriction {
             /*
              * Restriction state, embedded in io_ring_ctx as ->restrictions.
              * The two bitmaps are sized by IORING_REGISTER_LAST and
              * IORING_OP_LAST respectively.
              * NOTE(review): presumably a set bit means the register op or
              * sqe opcode is allowed; confirm where the bitmaps are tested.
              */
 249         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 250         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 251         u8 sqe_flags_allowed;
 252         u8 sqe_flags_required;
 253         bool registered;
 254 };
 255
 256 enum {
             /*
              * Values are 0 and 1.
              * NOTE(review): presumably bit numbers within io_sq_data::state;
              * confirm at the set_bit()/test_bit() call sites.
              */
 257         IO_SQ_THREAD_SHOULD_STOP = 0,
 258         IO_SQ_THREAD_SHOULD_PARK,
 259 };
 260
 261 struct io_sq_data {
             /*
              * State for SQ thread polling; io_ring_ctx::sq_data points here
              * when sq thread polling is in use. Reference counted via ->refs.
              */
 262         refcount_t              refs;
 263         struct mutex            lock;
 264
 265         /* ctx's that are using this sqd */
 266         struct list_head        ctx_list;
 267         struct list_head        ctx_new_list;
 268         struct mutex            ctx_lock;
 269
 270         struct task_struct      *thread;
 271         struct wait_queue_head  wait;
 272
 273         unsigned                sq_thread_idle;
 274         int                     sq_cpu;
 275         pid_t                   task_pid;
 276
             /* NOTE(review): presumably holds the IO_SQ_THREAD_* bits; confirm */
 277         unsigned long           state;
 278         struct completion       startup;
 279         struct completion       completion;
 280         struct completion       exited;
 281 };
 282
 283 #define IO_IOPOLL_BATCH                 8
 284 #define IO_COMPL_BATCH                  32      /* sizes io_comp_state::reqs[] */
 285 #define IO_REQ_CACHE_SIZE               32      /* sizes io_submit_state::reqs[] */
 286 #define IO_REQ_ALLOC_BATCH              8
 287
 288 struct io_comp_state {
             /*
              * Batched completion state, embedded in io_submit_state as ->comp.
              * reqs[] holds up to IO_COMPL_BATCH request pointers.
              * NOTE(review): nr is presumably the number of valid entries in
              * reqs[] and locked_free_nr the length of locked_free_list.
              */
 289         struct io_kiocb         *reqs[IO_COMPL_BATCH];
 290         unsigned int            nr;
 291         unsigned int            locked_free_nr;
 292         /* inline/task_work completion list, under ->uring_lock */
 293         struct list_head        free_list;
 294         /* IRQ completion list, under ->completion_lock */
 295         struct list_head        locked_free_list;
 296 };
 297
 298 struct io_submit_link {
             /*
              * Pair of request pointers, embedded in io_submit_state as ->link.
              * NOTE(review): presumably the first and the most recently added
              * request of the link chain being assembled; confirm in the
              * submission path.
              */
 299         struct io_kiocb         *head;
 300         struct io_kiocb         *last;
 301 };
 302
 303 struct io_submit_state {
             /*
              * Submission-side state, embedded in io_ring_ctx as
              * ->submit_state: block plug, link under construction, request
              * allocation cache, batched completions and a file reference
              * cache.
              */
 304         struct blk_plug         plug;
 305         struct io_submit_link   link;
 306
 307         /*
 308          * io_kiocb alloc cache
              * (up to IO_REQ_CACHE_SIZE pointers; NOTE(review): free_reqs is
              * presumably the number of cached entries -- confirm)
 309          */
 310         void                    *reqs[IO_REQ_CACHE_SIZE];
 311         unsigned int            free_reqs;
 312
 313         bool                    plug_started;
 314
 315         /*
 316          * Batch completion logic
 317          */
 318         struct io_comp_state    comp;
 319
 320         /*
 321          * File reference cache
 322          */
 323         struct file             *file;
 324         unsigned int            fd;
 325         unsigned int            file_refs;
 326         unsigned int            ios_left;
 327 };
 328
 329 struct io_ring_ctx {
             /*
              * Per-ring context. Several member groups are wrapped in anonymous
              * structs that are each ____cacheline_aligned_in_smp.
              */
 330         struct {
 331                 struct percpu_ref       refs;
 332         } ____cacheline_aligned_in_smp;
 333
             /* setup flags, state bits and submission-queue side data */
 334         struct {
 335                 unsigned int            flags;
 336                 unsigned int            compat: 1;
 337                 unsigned int            limit_mem: 1;
 338                 unsigned int            cq_overflow_flushed: 1;
 339                 unsigned int            drain_next: 1;
 340                 unsigned int            eventfd_async: 1;
 341                 unsigned int            restricted: 1;
 342                 unsigned int            sqo_dead: 1;
 343
 344                 /*
 345                  * Ring buffer of indices into array of io_uring_sqe, which is
 346                  * mmapped by the application using the IORING_OFF_SQES offset.
 347                  *
 348                  * This indirection could e.g. be used to assign fixed
 349                  * io_uring_sqe entries to operations and only submit them to
 350                  * the queue when needed.
 351                  *
 352                  * The kernel modifies neither the indices array nor the entries
 353                  * array.
 354                  */
 355                 u32                     *sq_array;
 356                 unsigned                cached_sq_head;
 357                 unsigned                sq_entries;
 358                 unsigned                sq_mask;
 359                 unsigned                sq_thread_idle;
 360                 unsigned                cached_sq_dropped;
 361                 unsigned                cached_cq_overflow;
 362                 unsigned long           sq_check_overflow;
 363
 364                 struct list_head        defer_list;
 365                 struct list_head        timeout_list;
 366                 struct list_head        cq_overflow_list;
 367
 368                 struct io_uring_sqe     *sq_sqes;
 369         } ____cacheline_aligned_in_smp;
 370
 371         struct {
 372                 struct mutex            uring_lock;
 373                 wait_queue_head_t       wait;
 374         } ____cacheline_aligned_in_smp;
 375
 376         struct io_submit_state          submit_state;
 377
             /* ring data shared with the application, see struct io_rings */
 378         struct io_rings *rings;
 379
 380         /*
 381          * For SQPOLL usage
 382          */
 383         struct task_struct      *sqo_task;
 384
 385         /* Only used for accounting purposes */
 386         struct mm_struct        *mm_account;
 387
 388         struct io_sq_data       *sq_data;       /* if using sq thread polling */
 389
 390         struct wait_queue_head  sqo_sq_wait;
 391         struct list_head        sqd_list;
 392
 393         /*
 394          * If used, fixed file set. Writers must ensure that ->refs is dead,
 395          * readers must ensure that ->refs is alive as long as the file* is
 396          * used. Only updated through io_uring_register(2).
 397          */
 398         struct fixed_rsrc_data  *file_data;
 399         unsigned                nr_user_files;
 400
 401         /* if used, fixed mapped user buffers */
 402         unsigned                nr_user_bufs;
 403         struct io_mapped_ubuf   *user_bufs;
 404
 405         struct user_struct      *user;
 406
 407         struct completion       ref_comp;
 408         struct completion       sq_thread_comp;
 409
 410 #if defined(CONFIG_UNIX)
 411         struct socket           *ring_sock;
 412 #endif
 413
 414         struct idr              io_buffer_idr;
 415
 416         struct idr              personality_idr;
 417
             /* completion-queue side data */
 418         struct {
 419                 unsigned                cached_cq_tail;
 420                 unsigned                cq_entries;
 421                 unsigned                cq_mask;
 422                 atomic_t                cq_timeouts;
 423                 unsigned                cq_last_tm_flush;
 424                 unsigned long           cq_check_overflow;
 425                 struct wait_queue_head  cq_wait;
 426                 struct fasync_struct    *cq_fasync;
 427                 struct eventfd_ctx      *cq_ev_fd;
 428         } ____cacheline_aligned_in_smp;
 429
 430         struct {
 431                 spinlock_t              completion_lock;
 432
 433                 /*
 434                  * ->iopoll_list is protected by the ctx->uring_lock for
 435                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 436                  * For SQPOLL, only the single threaded io_sq_thread() will
 437                  * manipulate the list, hence no extra locking is needed there.
 438                  */
 439                 struct list_head        iopoll_list;
 440                 struct hlist_head       *cancel_hash;
 441                 unsigned                cancel_hash_bits;
 442                 bool                    poll_multi_file;
 443
 444                 spinlock_t              inflight_lock;
 445                 struct list_head        inflight_list;
 446         } ____cacheline_aligned_in_smp;
 447
             /*
              * NOTE(review): presumably the deferred release machinery for
              * fixed resources (see struct fixed_rsrc_ref_node); confirm in
              * the rsrc put path.
              */
 448         struct delayed_work             rsrc_put_work;
 449         struct llist_head               rsrc_put_llist;
 450         struct list_head                rsrc_ref_list;
 451         spinlock_t                      rsrc_ref_lock;
 452
 453         struct io_restriction           restrictions;
 454
 455         /* exit task_work */
 456         struct callback_head            *exit_task_work;
 457
 458         /* Keep this last, we don't need it for the fast path */
 459         struct work_struct              exit_work;
 460 };
 461
 462 /*
 463  * First field must be the file pointer in all the
 464  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
      *
      * struct io_poll_iocb itself is embedded in io_kiocb's union as ->poll
      * and in struct async_poll; async_poll::double_poll points at one.
 465  */
 466 struct io_poll_iocb {
 467         struct file                     *file;
 468         struct wait_queue_head          *head;
 469         __poll_t                        events;
 470         bool                            done;
 471         bool                            canceled;
 472         struct wait_queue_entry         wait;
 473 };
 474
 475 struct io_poll_remove {
             /* Per-request state kept in io_kiocb's union as ->poll_remove. */
 476         struct file                     *file;
 477         u64                             addr;
 478 };
 479
 480 struct io_close {
             /* Per-request state kept in io_kiocb's union as ->close. */
 481         struct file                     *file;
 482         int                             fd;
 483 };
 484
 485 struct io_timeout_data {
             /*
              * Timer state for a request: back pointer to the request, the
              * hrtimer, a timespec64 and the hrtimer mode. Unlike the iocb
              * union members it does not start with a file pointer.
              * NOTE(review): presumably allocated as io_kiocb::async_data for
              * timeout requests; confirm in the timeout prep code.
              */
 486         struct io_kiocb                 *req;
 487         struct hrtimer                  timer;
 488         struct timespec64               ts;
 489         enum hrtimer_mode               mode;
 490 };
 491
 492 struct io_accept {
             /*
              * Per-request state kept in io_kiocb's union as ->accept.
              * addr and addr_len are user-space pointers (__user).
              */
 493         struct file                     *file;
 494         struct sockaddr __user          *addr;
 495         int __user                      *addr_len;
 496         int                             flags;
 497         unsigned long                   nofile;
 498 };
 499
 500 struct io_sync {
             /* Per-request state kept in io_kiocb's union as ->sync. */
 501         struct file                     *file;
 502         loff_t                          len;
 503         loff_t                          off;
 504         int                             flags;
 505         int                             mode;
 506 };
 507
 508 struct io_cancel {
             /* Per-request state kept in io_kiocb's union as ->cancel. */
 509         struct file                     *file;
 510         u64                             addr;
 511 };
 512
 513 struct io_timeout {
             /*
              * Per-request state kept in io_kiocb's union as ->timeout.
              * NOTE(review): list presumably links the request into
              * io_ring_ctx::timeout_list; confirm in the timeout code.
              */
 514         struct file                     *file;
 515         u32                             off;
 516         u32                             target_seq;
 517         struct list_head                list;
 518         /* head of the link, used by linked timeouts only */
 519         struct io_kiocb                 *head;
 520 };
 521
 522 struct io_timeout_rem {
             /* Per-request state kept in io_kiocb's union as ->timeout_rem. */
 523         struct file                     *file;
 524         u64                             addr;
 525
 526         /* timeout update */
 527         struct timespec64               ts;
 528         u32                             flags;
 529 };
 530
 531 struct io_rw {
             /* Per-request state kept in io_kiocb's union as ->rw. */
 532         /* NOTE: kiocb has the file as the first member, so don't do it here */
 533         struct kiocb                    kiocb;
 534         u64                             addr;
 535         u64                             len;
 536 };
 537
 538 struct io_connect {
             /*
              * Per-request state kept in io_kiocb's union as ->connect.
              * addr is a user-space pointer (__user).
              */
 539         struct file                     *file;
 540         struct sockaddr __user          *addr;
 541         int                             addr_len;
 542 };
 543
 544 struct io_sr_msg {
             /* Per-request state kept in io_kiocb's union as ->sr_msg. */
 545         struct file                     *file;
             /* umsg and buf alias the same user-space pointer storage */
 546         union {
 547                 struct user_msghdr __user *umsg;
 548                 void __user             *buf;
 549         };
 550         int                             msg_flags;
 551         int                             bgid;
 552         size_t                          len;
 553         struct io_buffer                *kbuf;
 554 };
 555
 556 struct io_open {
             /* Per-request state kept in io_kiocb's union as ->open. */
 557         struct file                     *file;
 558         int                             dfd;
 559         struct filename                 *filename;
 560         struct open_how                 how;
 561         unsigned long                   nofile;
 562 };
 563
 564 struct io_rsrc_update {
             /* Per-request state kept in io_kiocb's union as ->rsrc_update. */
 565         struct file                     *file;
 566         u64                             arg;
 567         u32                             nr_args;
 568         u32                             offset;
 569 };
 570
 571 struct io_fadvise {
             /*
              * Per-request state kept in io_kiocb's union as ->fadvise.
              * Note that offset is 64-bit while len is 32-bit.
              */
 572         struct file                     *file;
 573         u64                             offset;
 574         u32                             len;
 575         u32                             advice;
 576 };
 577
 578 struct io_madvise {
             /*
              * Per-request state kept in io_kiocb's union as ->madvise.
              * Note that addr is 64-bit while len is 32-bit.
              */
 579         struct file                     *file;
 580         u64                             addr;
 581         u32                             len;
 582         u32                             advice;
 583 };
 584
 585 struct io_epoll {
             /*
              * Per-request state kept in io_kiocb's union as ->epoll.
              * The epoll_event is stored by value.
              */
 586         struct file                     *file;
 587         int                             epfd;
 588         int                             op;
 589         int                             fd;
 590         struct epoll_event              event;
 591 };
 592
 593 struct io_splice {
             /*
              * Per-request state kept in io_kiocb's union as ->splice.
              * file_out is the first member, so it occupies the leading
              * file-pointer slot shared by all union members.
              */
 594         struct file                     *file_out;
 595         struct file                     *file_in;
 596         loff_t                          off_out;
 597         loff_t                          off_in;
 598         u64                             len;
 599         unsigned int                    flags;
 600 };
 601
 602 struct io_provide_buf {
             /* Per-request state kept in io_kiocb's union as ->pbuf. */
 603         struct file                     *file;
 604         __u64                           addr;
 605         __s32                           len;
 606         __u32                           bgid;
 607         __u16                           nbufs;
 608         __u16                           bid;
 609 };
 610
 611 struct io_statx {
             /*
              * Per-request state kept in io_kiocb's union as ->statx.
              * filename and buffer are user-space pointers (__user).
              */
 612         struct file                     *file;
 613         int                             dfd;
 614         unsigned int                    mask;
 615         unsigned int                    flags;
 616         const char __user               *filename;
 617         struct statx __user             *buffer;
 618 };
 619
 620 struct io_shutdown {
             /* Per-request state kept in io_kiocb's union as ->shutdown. */
 621         struct file                     *file;
 622         int                             how;
 623 };
 624
 625 struct io_rename {
             /* Per-request state kept in io_kiocb's union as ->rename. */
 626         struct file                     *file;
 627         int                             old_dfd;
 628         int                             new_dfd;
 629         struct filename                 *oldpath;
 630         struct filename                 *newpath;
 631         int                             flags;
 632 };
 633
 634 struct io_unlink {
             /* Per-request state kept in io_kiocb's union as ->unlink. */
 635         struct file                     *file;
 636         int                             dfd;
 637         int                             flags;
 638         struct filename                 *filename;
 639 };
 640
 641 struct io_completion {
             /*
              * Completion-side view of io_kiocb's union (->compl). It shares
              * storage with the per-opcode data, and struct io_kiocb notes it
              * may only be used after cleaning that data (io_clean_op()).
              */
 642         struct file                     *file;
 643         struct list_head                list;
 644         int                             cflags;
 645 };
 646
 647 struct io_async_connect {
             /*
              * Holds one sockaddr_storage by value.
              * NOTE(review): presumably the async_data payload for connect
              * requests; confirm against the io_op_defs[] entry.
              */
 648         struct sockaddr_storage         address;
 649 };
 650
 651 struct io_async_msghdr {
             /*
              * Message header state with inline storage for up to UIO_FASTIOV
              * iovecs.
              * NOTE(review): presumably the async_data payload for
              * sendmsg/recvmsg requests; confirm against io_op_defs[].
              */
 652         struct iovec                    fast_iov[UIO_FASTIOV];
 653         /* points to an allocated iov, if NULL we use fast_iov instead */
 654         struct iovec                    *free_iov;
 655         struct sockaddr __user          *uaddr;
 656         struct msghdr                   msg;
 657         struct sockaddr_storage         addr;
 658 };
 659
 660 struct io_async_rw {
             /*
              * Async read/write state; its size is the async_size of the
              * IORING_OP_READV and IORING_OP_WRITEV entries in io_op_defs[].
              * NOTE(review): free_iovec is presumably an allocated iovec array
              * used when fast_iov is too small; confirm in the rw import code.
              */
 661         struct iovec                    fast_iov[UIO_FASTIOV];
 662         const struct iovec              *free_iovec;
 663         struct iov_iter                 iter;
 664         size_t                          bytes_done;
 665         struct wait_page_queue          wpq;
 666 };
 667
 668 enum {
             /*
              * Bit numbers for io_kiocb flags. The first six are set equal to
              * the corresponding IOSQE_*_BIT values; the remaining ones take
              * consecutive values after REQ_F_BUFFER_SELECT_BIT.
              */
 669         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 670         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 671         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 672         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 673         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 674         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 675
 676         REQ_F_FAIL_LINK_BIT,
 677         REQ_F_INFLIGHT_BIT,
 678         REQ_F_CUR_POS_BIT,
 679         REQ_F_NOWAIT_BIT,
 680         REQ_F_LINK_TIMEOUT_BIT,
 681         REQ_F_ISREG_BIT,
 682         REQ_F_NEED_CLEANUP_BIT,
 683         REQ_F_POLLED_BIT,
 684         REQ_F_BUFFER_SELECTED_BIT,
 685         REQ_F_NO_FILE_TABLE_BIT,
 686         REQ_F_WORK_INITIALIZED_BIT,
 687         REQ_F_LTIMEOUT_ACTIVE_BIT,
 688         REQ_F_COMPLETE_INLINE_BIT,
 689
 690         /* not a real bit, just to check we're not overflowing the space */
 691         __REQ_F_LAST_BIT,
 692 };
 693
 694 enum {
             /*
              * Flag masks, each defined as BIT() of the REQ_F_*_BIT number
              * with the matching name.
              */
 695         /* ctx owns file */
 696         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 697         /* drain existing IO first */
 698         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 699         /* linked sqes */
 700         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 701         /* doesn't sever on completion < 0 */
 702         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 703         /* IOSQE_ASYNC */
 704         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 705         /* IOSQE_BUFFER_SELECT */
 706         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 707
 708         /* fail rest of links */
 709         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 710         /* on inflight list */
 711         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 712         /* read/write uses file position */
 713         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 714         /* must not punt to workers */
 715         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 716         /* has or had linked timeout */
 717         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 718         /* regular file */
 719         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 720         /* needs cleanup */
 721         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 722         /* already went through poll handler */
 723         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 724         /* buffer already selected */
 725         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 726         /* doesn't need file table for this request */
 727         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 728         /* io_wq_work is initialized */
 729         REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
 730         /* linked timeout is active, i.e. prepared by link's head */
 731         REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
 732         /* completion is deferred through io_comp_state */
 733         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 734 };
 735
 736 struct async_poll {
             /*
              * Poll state referenced from io_kiocb::apoll: one embedded
              * io_poll_iocb plus a pointer to a second one.
              * NOTE(review): double_poll is presumably only set when a second
              * wait queue entry is needed; confirm in the poll arming code.
              */
 737         struct io_poll_iocb     poll;
 738         struct io_poll_iocb     *double_poll;
 739 };
 740
 741 struct io_task_work {
             /*
              * A work-list node plus the function to run. In struct io_kiocb it
              * shares a union with a plain struct callback_head.
              */
 742         struct io_wq_work_node  node;
 743         task_work_func_t        func;
 744 };
 745
 746 /*
 747  * NOTE! Each of the iocb union members has the file pointer
 748  * as the first entry in their struct definition. So you can
 749  * access the file pointer through any of the sub-structs,
 750  * or directly as just 'file' in this struct.
 751  */
 752 struct io_kiocb {
 753         union {
 754                 struct file             *file;
 755                 struct io_rw            rw;
 756                 struct io_poll_iocb     poll;
 757                 struct io_poll_remove   poll_remove;
 758                 struct io_accept        accept;
 759                 struct io_sync          sync;
 760                 struct io_cancel        cancel;
 761                 struct io_timeout       timeout;
 762                 struct io_timeout_rem   timeout_rem;
 763                 struct io_connect       connect;
 764                 struct io_sr_msg        sr_msg;
 765                 struct io_open          open;
 766                 struct io_close         close;
 767                 struct io_rsrc_update   rsrc_update;
 768                 struct io_fadvise       fadvise;
 769                 struct io_madvise       madvise;
 770                 struct io_epoll         epoll;
 771                 struct io_splice        splice;
 772                 struct io_provide_buf   pbuf;
 773                 struct io_statx         statx;
 774                 struct io_shutdown      shutdown;
 775                 struct io_rename        rename;
 776                 struct io_unlink        unlink;
 777                 /* use only after cleaning per-op data, see io_clean_op() */
 778                 struct io_completion    compl;
 779         };
 780
 781         /* allocated if the opcode needs to store data for async defer */
 782         void                            *async_data;
 783         u8                              opcode;
 784         /* polled IO has completed */
 785         u8                              iopoll_completed;
 786
 787         u16                             buf_index;
 788         u32                             result;
 789
 790         struct io_ring_ctx              *ctx;
             /* REQ_F_* flag masks */
 791         unsigned int                    flags;
 792         refcount_t                      refs;
 793         struct task_struct              *task;
 794         u64                             user_data;
 795
 796         struct io_kiocb                 *link;
 797         struct percpu_ref               *fixed_rsrc_refs;
 798
 799         /*
 800          * 1. used with ctx->iopoll_list with reads/writes
 801          * 2. to track reqs with ->files (see io_op_def::file_table)
 802          */
 803         struct list_head                inflight_entry;
             /* the two task-work representations share storage */
 804         union {
 805                 struct io_task_work     io_task_work;
 806                 struct callback_head    task_work;
 807         };
 808         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 809         struct hlist_node               hash_node;
 810         struct async_poll               *apoll;
 811         struct io_wq_work               work;
 812 };
 813
 814 struct io_defer_entry {
             /*
              * List entry pairing a request pointer with a 32-bit sequence
              * number.
              * NOTE(review): presumably queued on io_ring_ctx::defer_list for
              * drained requests; confirm in the defer path.
              */
 815         struct list_head        list;
 816         struct io_kiocb         *req;
 817         u32                     seq;
 818 };
 819
 820 struct io_op_def {
             /*
              * Static per-opcode properties; io_op_defs[] holds one entry per
              * IORING_OP_* value. All properties except async_size are
              * one-bit flags.
              */
 821         /* needs req->file assigned */
 822         unsigned                needs_file : 1;
 823         /* hash wq insertion if file is a regular file */
 824         unsigned                hash_reg_file : 1;
 825         /* unbound wq insertion if file is a non-regular file */
 826         unsigned                unbound_nonreg_file : 1;
 827         /* opcode is not supported by this kernel */
 828         unsigned                not_supported : 1;
 829         /* set if opcode supports polled "wait" */
 830         unsigned                pollin : 1;
 831         unsigned                pollout : 1;
 832         /* op supports buffer selection */
 833         unsigned                buffer_select : 1;
 834         /* must always have async data allocated */
 835         unsigned                needs_async_data : 1;
 836         /* should block plug */
 837         unsigned                plug : 1;
 838         /* size of async data needed, if any */
 839         unsigned short          async_size;
 840 };
 841
 842 static const struct io_op_def io_op_defs[] = {
 843         [IORING_OP_NOP] = {},
 844         [IORING_OP_READV] = {
 845                 .needs_file             = 1,
 846                 .unbound_nonreg_file    = 1,
 847                 .pollin                 = 1,
 848                 .buffer_select          = 1,
 849                 .needs_async_data       = 1,
 850                 .plug                   = 1,
 851                 .async_size             = sizeof(struct io_async_rw),
 852         },
 853         [IORING_OP_WRITEV] = {
 854                 .needs_file             = 1,
 855                 .hash_reg_file          = 1,
 856                 .unbound_nonreg_file    = 1,
 857                 .pollout                = 1,
 858                 .needs_async_data       = 1,
 859                 .plug                   = 1,
 860                 .async_size             = sizeof(struct io_async_rw),
 861         },
 862         [IORING_OP_FSYNC] = {
 863                 .needs_file             = 1,
 864         },
 865         [IORING_OP_READ_FIXED] = {
 866                 .needs_file             = 1,
 867                 .unbound_nonreg_file    = 1,
 868                 .pollin                 = 1,
 869                 .plug                   = 1,
 870                 .async_size             = sizeof(struct io_async_rw),
 871         },
 872         [IORING_OP_WRITE_FIXED] = {
 873                 .needs_file             = 1,
 874                 .hash_reg_file          = 1,
 875                 .unbound_nonreg_file    = 1,
 876                 .pollout                = 1,
 877                 .plug                   = 1,
 878                 .async_size             = sizeof(struct io_async_rw),
 879         },
 880         [IORING_OP_POLL_ADD] = {
 881                 .needs_file             = 1,
 882                 .unbound_nonreg_file    = 1,
 883         },
 884         [IORING_OP_POLL_REMOVE] = {},
 885         [IORING_OP_SYNC_FILE_RANGE] = {
 886                 .needs_file             = 1,
 887         },
 888         [IORING_OP_SENDMSG] = {
 889                 .needs_file             = 1,
 890                 .unbound_nonreg_file    = 1,
 891                 .pollout                = 1,
 892                 .needs_async_data       = 1,
 893                 .async_size             = sizeof(struct io_async_msghdr),
 894         },
 895         [IORING_OP_RECVMSG] = {
 896                 .needs_file             = 1,
 897                 .unbound_nonreg_file    = 1,
 898                 .pollin                 = 1,
 899                 .buffer_select          = 1,
 900                 .needs_async_data       = 1,
 901                 .async_size             = sizeof(struct io_async_msghdr),
 902         },
 903         [IORING_OP_TIMEOUT] = {
 904                 .needs_async_data       = 1,
 905                 .async_size             = sizeof(struct io_timeout_data),
 906         },
 907         [IORING_OP_TIMEOUT_REMOVE] = {
 908                 /* used by timeout updates' prep() */
 909         },
 910         [IORING_OP_ACCEPT] = {
 911                 .needs_file             = 1,
 912                 .unbound_nonreg_file    = 1,
 913                 .pollin                 = 1,
 914         },
 915         [IORING_OP_ASYNC_CANCEL] = {},
 916         [IORING_OP_LINK_TIMEOUT] = {
 917                 .needs_async_data       = 1,
 918                 .async_size             = sizeof(struct io_timeout_data),
 919         },
 920         [IORING_OP_CONNECT] = {
 921                 .needs_file             = 1,
 922                 .unbound_nonreg_file    = 1,
 923                 .pollout                = 1,
 924                 .needs_async_data       = 1,
 925                 .async_size             = sizeof(struct io_async_connect),
 926         },
 927         [IORING_OP_FALLOCATE] = {
 928                 .needs_file             = 1,
 929         },
 930         [IORING_OP_OPENAT] = {},
 931         [IORING_OP_CLOSE] = {},
 932         [IORING_OP_FILES_UPDATE] = {},
 933         [IORING_OP_STATX] = {},
 934         [IORING_OP_READ] = {
 935                 .needs_file             = 1,
 936                 .unbound_nonreg_file    = 1,
 937                 .pollin                 = 1,
 938                 .buffer_select          = 1,
 939                 .plug                   = 1,
 940                 .async_size             = sizeof(struct io_async_rw),
 941         },
 942         [IORING_OP_WRITE] = {
 943                 .needs_file             = 1,
 944                 .unbound_nonreg_file    = 1,
 945                 .pollout                = 1,
 946                 .plug                   = 1,
 947                 .async_size             = sizeof(struct io_async_rw),
 948         },
 949         [IORING_OP_FADVISE] = {
 950                 .needs_file             = 1,
 951         },
 952         [IORING_OP_MADVISE] = {},
 953         [IORING_OP_SEND] = {
 954                 .needs_file             = 1,
 955                 .unbound_nonreg_file    = 1,
 956                 .pollout                = 1,
 957         },
 958         [IORING_OP_RECV] = {
 959                 .needs_file             = 1,
 960                 .unbound_nonreg_file    = 1,
 961                 .pollin                 = 1,
 962                 .buffer_select          = 1,
 963         },
 964         [IORING_OP_OPENAT2] = {
 965         },
 966         [IORING_OP_EPOLL_CTL] = {
 967                 .unbound_nonreg_file    = 1,
 968         },
 969         [IORING_OP_SPLICE] = {
 970                 .needs_file             = 1,
 971                 .hash_reg_file          = 1,
 972                 .unbound_nonreg_file    = 1,
 973         },
 974         [IORING_OP_PROVIDE_BUFFERS] = {},
 975         [IORING_OP_REMOVE_BUFFERS] = {},
 976         [IORING_OP_TEE] = {
 977                 .needs_file             = 1,
 978                 .hash_reg_file          = 1,
 979                 .unbound_nonreg_file    = 1,
 980         },
 981         [IORING_OP_SHUTDOWN] = {
 982                 .needs_file             = 1,
 983         },
 984         [IORING_OP_RENAMEAT] = {},
 985         [IORING_OP_UNLINKAT] = {},
 986 };
 987
/*
 * Prototypes for static helpers that are used in this file ahead of their
 * definitions.
 */

/* cancellation and fixed-resource management */
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 struct files_struct *files);
static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx);
static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
			struct io_ring_ctx *ctx);
static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

/* request completion, reference dropping and queueing */
static bool io_rw_reissue(struct io_kiocb *req);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_double_put_req(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_put_task(struct task_struct *task, int nr);
static void io_queue_next(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update *ip,
				 unsigned nr_args);
static void __io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
				struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

/* read/write setup and deferred completion flushing */
static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
			   struct iov_iter *iter, bool needs_lock);
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     const struct iovec *fast_iov,
			     struct iov_iter *iter, bool force);
static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_comp_state *cs,
					struct io_ring_ctx *ctx);

/* slab cache pointer; presumably backs struct io_kiocb allocations - confirm */
static struct kmem_cache *req_cachep;

/* declared here so code below can recognise io_uring files by their f_op */
static const struct file_operations io_uring_fops;
1029
1030 struct sock *io_uring_get_socket(struct file *file)
1031 {
1032 #if defined(CONFIG_UNIX)
1033         if (file->f_op == &io_uring_fops) {
1034                 struct io_ring_ctx *ctx = file->private_data;
1035
1036                 return ctx->ring_sock->sk;
1037         }
1038 #endif
1039         return NULL;
1040 }
1041 EXPORT_SYMBOL(io_uring_get_socket);
1042
/*
 * Walk a chain of requests: @pos starts at @head and follows ->link until it
 * becomes NULL. @head itself is visited first.
 */
#define io_for_each_link(pos, head) \
	for (pos = (head); pos; pos = pos->link)
1045
1046 static inline void io_clean_op(struct io_kiocb *req)
1047 {
1048         if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
1049                 __io_clean_op(req);
1050 }
1051
1052 static inline void io_set_resource_node(struct io_kiocb *req)
1053 {
1054         struct io_ring_ctx *ctx = req->ctx;
1055
1056         if (!req->fixed_rsrc_refs) {
1057                 req->fixed_rsrc_refs = &ctx->file_data->node->refs;
1058                 percpu_ref_get(req->fixed_rsrc_refs);
1059         }
1060 }
1061
1062 static bool io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
1063 {
1064         if (!percpu_ref_tryget(ref)) {
1065                 /* already at zero, wait for ->release() */
1066                 if (!try_wait_for_completion(compl))
1067                         synchronize_rcu();
1068                 return false;
1069         }
1070
1071         percpu_ref_resurrect(ref);
1072         reinit_completion(compl);
1073         percpu_ref_put(ref);
1074         return true;
1075 }
1076
1077 static bool io_match_task(struct io_kiocb *head,
1078                           struct task_struct *task,
1079                           struct files_struct *files)
1080 {
1081         struct io_kiocb *req;
1082
1083         if (task && head->task != task) {
1084                 /* in terms of cancelation, always match if req task is dead */
1085                 if (head->task->flags & PF_EXITING)
1086                         return true;
1087                 return false;
1088         }
1089         if (!files)
1090                 return true;
1091
1092         io_for_each_link(req, head) {
1093                 if (!(req->flags & REQ_F_WORK_INITIALIZED))
1094                         continue;
1095                 if (req->file && req->file->f_op == &io_uring_fops)
1096                         return true;
1097                 if (req->task->files == files)
1098                         return true;
1099         }
1100         return false;
1101 }
1102
1103 static inline void req_set_fail_links(struct io_kiocb *req)
1104 {
1105         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1106                 req->flags |= REQ_F_FAIL_LINK;
1107 }
1108
1109 static inline void __io_req_init_async(struct io_kiocb *req)
1110 {
1111         memset(&req->work, 0, sizeof(req->work));
1112         req->flags |= REQ_F_WORK_INITIALIZED;
1113 }
1114
1115 /*
1116  * Note: must call io_req_init_async() for the first time you
1117  * touch any members of io_wq_work.
1118  */
1119 static inline void io_req_init_async(struct io_kiocb *req)
1120 {
1121         if (req->flags & REQ_F_WORK_INITIALIZED)
1122                 return;
1123
1124         __io_req_init_async(req);
1125 }
1126
1127 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1128 {
1129         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1130
1131         complete(&ctx->ref_comp);
1132 }
1133
1134 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1135 {
1136         return !req->timeout.off;
1137 }
1138
1139 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1140 {
1141         struct io_ring_ctx *ctx;
1142         int hash_bits;
1143
1144         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1145         if (!ctx)
1146                 return NULL;
1147
1148         /*
1149          * Use 5 bits less than the max cq entries, that should give us around
1150          * 32 entries per hash list if totally full and uniformly spread.
1151          */
1152         hash_bits = ilog2(p->cq_entries);
1153         hash_bits -= 5;
1154         if (hash_bits <= 0)
1155                 hash_bits = 1;
1156         ctx->cancel_hash_bits = hash_bits;
1157         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1158                                         GFP_KERNEL);
1159         if (!ctx->cancel_hash)
1160                 goto err;
1161         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1162
1163         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1164                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1165                 goto err;
1166
1167         ctx->flags = p->flags;
1168         init_waitqueue_head(&ctx->sqo_sq_wait);
1169         INIT_LIST_HEAD(&ctx->sqd_list);
1170         init_waitqueue_head(&ctx->cq_wait);
1171         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1172         init_completion(&ctx->ref_comp);
1173         init_completion(&ctx->sq_thread_comp);
1174         idr_init(&ctx->io_buffer_idr);
1175         idr_init(&ctx->personality_idr);
1176         mutex_init(&ctx->uring_lock);
1177         init_waitqueue_head(&ctx->wait);
1178         spin_lock_init(&ctx->completion_lock);
1179         INIT_LIST_HEAD(&ctx->iopoll_list);
1180         INIT_LIST_HEAD(&ctx->defer_list);
1181         INIT_LIST_HEAD(&ctx->timeout_list);
1182         spin_lock_init(&ctx->inflight_lock);
1183         INIT_LIST_HEAD(&ctx->inflight_list);
1184         spin_lock_init(&ctx->rsrc_ref_lock);
1185         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1186         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1187         init_llist_head(&ctx->rsrc_put_llist);
1188         INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
1189         INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
1190         return ctx;
1191 err:
1192         kfree(ctx->cancel_hash);
1193         kfree(ctx);
1194         return NULL;
1195 }
1196
1197 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1198 {
1199         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1200                 struct io_ring_ctx *ctx = req->ctx;
1201
1202                 return seq != ctx->cached_cq_tail
1203                                 + READ_ONCE(ctx->cached_cq_overflow);
1204         }
1205
1206         return false;
1207 }
1208
1209 static void io_req_clean_work(struct io_kiocb *req)
1210 {
1211         if (!(req->flags & REQ_F_WORK_INITIALIZED))
1212                 return;
1213
1214         if (req->work.creds) {
1215                 put_cred(req->work.creds);
1216                 req->work.creds = NULL;
1217         }
1218         if (req->flags & REQ_F_INFLIGHT) {
1219                 struct io_ring_ctx *ctx = req->ctx;
1220                 struct io_uring_task *tctx = req->task->io_uring;
1221                 unsigned long flags;
1222
1223                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1224                 list_del(&req->inflight_entry);
1225                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1226                 req->flags &= ~REQ_F_INFLIGHT;
1227                 if (atomic_read(&tctx->in_idle))
1228                         wake_up(&tctx->wait);
1229         }
1230
1231         req->flags &= ~REQ_F_WORK_INITIALIZED;
1232 }
1233
1234 static void io_req_track_inflight(struct io_kiocb *req)
1235 {
1236         struct io_ring_ctx *ctx = req->ctx;
1237
1238         if (!(req->flags & REQ_F_INFLIGHT)) {
1239                 io_req_init_async(req);
1240                 req->flags |= REQ_F_INFLIGHT;
1241
1242                 spin_lock_irq(&ctx->inflight_lock);
1243                 list_add(&req->inflight_entry, &ctx->inflight_list);
1244                 spin_unlock_irq(&ctx->inflight_lock);
1245         }
1246 }
1247
1248 static void io_prep_async_work(struct io_kiocb *req)
1249 {
1250         const struct io_op_def *def = &io_op_defs[req->opcode];
1251         struct io_ring_ctx *ctx = req->ctx;
1252
1253         io_req_init_async(req);
1254
1255         if (req->flags & REQ_F_FORCE_ASYNC)
1256                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1257
1258         if (req->flags & REQ_F_ISREG) {
1259                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1260                         io_wq_hash_work(&req->work, file_inode(req->file));
1261         } else {
1262                 if (def->unbound_nonreg_file)
1263                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1264         }
1265         if (!req->work.creds)
1266                 req->work.creds = get_current_cred();
1267 }
1268
1269 static void io_prep_async_link(struct io_kiocb *req)
1270 {
1271         struct io_kiocb *cur;
1272
1273         io_for_each_link(cur, req)
1274                 io_prep_async_work(cur);
1275 }
1276
1277 static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
1278 {
1279         struct io_ring_ctx *ctx = req->ctx;
1280         struct io_kiocb *link = io_prep_linked_timeout(req);
1281         struct io_uring_task *tctx = req->task->io_uring;
1282
1283         BUG_ON(!tctx);
1284         BUG_ON(!tctx->io_wq);
1285
1286         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1287                                         &req->work, req->flags);
1288         io_wq_enqueue(tctx->io_wq, &req->work);
1289         return link;
1290 }
1291
/*
 * Punt @req to io-wq. ->work of the whole link chain is initialized before
 * queueing, and a linked timeout reported by __io_queue_async_work() is
 * queued afterwards.
 */
static void io_queue_async_work(struct io_kiocb *req)
{
	struct io_kiocb *timeout_req;

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	timeout_req = __io_queue_async_work(req);
	if (!timeout_req)
		return;

	io_queue_linked_timeout(timeout_req);
}
1303
1304 static void io_kill_timeout(struct io_kiocb *req)
1305 {
1306         struct io_timeout_data *io = req->async_data;
1307         int ret;
1308
1309         ret = hrtimer_try_to_cancel(&io->timer);
1310         if (ret != -1) {
1311                 atomic_set(&req->ctx->cq_timeouts,
1312                         atomic_read(&req->ctx->cq_timeouts) + 1);
1313                 list_del_init(&req->timeout.list);
1314                 io_cqring_fill_event(req, 0);
1315                 io_put_req_deferred(req, 1);
1316         }
1317 }
1318
1319 /*
1320  * Returns true if we found and killed one or more timeouts
1321  */
1322 static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
1323                              struct files_struct *files)
1324 {
1325         struct io_kiocb *req, *tmp;
1326         int canceled = 0;
1327
1328         spin_lock_irq(&ctx->completion_lock);
1329         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1330                 if (io_match_task(req, tsk, files)) {
1331                         io_kill_timeout(req);
1332                         canceled++;
1333                 }
1334         }
1335         spin_unlock_irq(&ctx->completion_lock);
1336         return canceled != 0;
1337 }
1338
1339 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1340 {
1341         do {
1342                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1343                                                 struct io_defer_entry, list);
1344
1345                 if (req_need_defer(de->req, de->seq))
1346                         break;
1347                 list_del_init(&de->list);
1348                 io_req_task_queue(de->req);
1349                 kfree(de);
1350         } while (!list_empty(&ctx->defer_list));
1351 }
1352
1353 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1354 {
1355         u32 seq;
1356
1357         if (list_empty(&ctx->timeout_list))
1358                 return;
1359
1360         seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1361
1362         do {
1363                 u32 events_needed, events_got;
1364                 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1365                                                 struct io_kiocb, timeout.list);
1366
1367                 if (io_is_timeout_noseq(req))
1368                         break;
1369
1370                 /*
1371                  * Since seq can easily wrap around over time, subtract
1372                  * the last seq at which timeouts were flushed before comparing.
1373                  * Assuming not more than 2^31-1 events have happened since,
1374                  * these subtractions won't have wrapped, so we can check if
1375                  * target is in [last_seq, current_seq] by comparing the two.
1376                  */
1377                 events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1378                 events_got = seq - ctx->cq_last_tm_flush;
1379                 if (events_got < events_needed)
1380                         break;
1381
1382                 list_del_init(&req->timeout.list);
1383                 io_kill_timeout(req);
1384         } while (!list_empty(&ctx->timeout_list));
1385
1386         ctx->cq_last_tm_flush = seq;
1387 }
1388
/*
 * Publish the kernel's cached CQ tail to the shared ring.
 *
 * Pending sequence timeouts are flushed first, then ctx->cached_cq_tail is
 * stored into rings->cq.tail with release semantics, and finally deferred
 * requests are re-evaluated if ctx->defer_list is not empty.
 */
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
	io_flush_timeouts(ctx);

	/* order cqe stores with ring update */
	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);

	if (unlikely(!list_empty(&ctx->defer_list)))
		__io_queue_deferred(ctx);
}
1399
1400 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1401 {
1402         struct io_rings *r = ctx->rings;
1403
1404         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1405 }
1406
1407 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1408 {
1409         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1410 }
1411
/*
 * Reserve the next completion queue entry.
 *
 * Returns NULL when __io_cqring_events() equals cq_ring_entries (the CQ ring
 * is full). Otherwise the cached tail is post-incremented and a pointer to
 * the slot at (old tail & cq_mask) is returned. The new tail is not
 * published to rings->cq.tail here.
 */
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;
	unsigned tail;

	/*
	 * writes to the cq entry need to come after reading head; the
	 * control dependency is enough as we're using WRITE_ONCE to
	 * fill the cq entry
	 */
	if (__io_cqring_events(ctx) == rings->cq_ring_entries)
		return NULL;

	tail = ctx->cached_cq_tail++;
	return &rings->cqes[tail & ctx->cq_mask];
}
1428
1429 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1430 {
1431         if (!ctx->cq_ev_fd)
1432                 return false;
1433         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1434                 return false;
1435         if (!ctx->eventfd_async)
1436                 return true;
1437         return io_wq_current_is_worker();
1438 }
1439
/*
 * Notify everyone who may be waiting for completions on @ctx: waiters on
 * ctx->wait, the sq_data wait queue when ctx->sq_data is set, the eventfd
 * (subject to io_should_trigger_evfd()), and waiters on ctx->cq_wait
 * together with the SIGIO fasync subscribers.
 */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	/* see waitqueue_active() comment */
	smp_mb();

	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);
	if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	if (io_should_trigger_evfd(ctx))
		eventfd_signal(ctx->cq_ev_fd, 1);
	/* fasync is only kicked when someone is sleeping on cq_wait */
	if (waitqueue_active(&ctx->cq_wait)) {
		wake_up_interruptible(&ctx->cq_wait);
		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
	}
}
1456
1457 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1458 {
1459         /* see waitqueue_active() comment */
1460         smp_mb();
1461
1462         if (ctx->flags & IORING_SETUP_SQPOLL) {
1463                 if (waitqueue_active(&ctx->wait))
1464                         wake_up(&ctx->wait);
1465         }
1466         if (io_should_trigger_evfd(ctx))
1467                 eventfd_signal(ctx->cq_ev_fd, 1);
1468         if (waitqueue_active(&ctx->cq_wait)) {
1469                 wake_up_interruptible(&ctx->cq_wait);
1470                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1471         }
1472 }
1473
/*
 * Move backlogged completions from ctx->cq_overflow_list into the CQ ring.
 *
 * @force: when set, matching entries are taken off the overflow list even if
 *         no CQE slot is available; those are accounted in cq_overflow
 *         instead of being posted. When clear, the flush stops at the first
 *         entry for which no slot is available, and bails out up front if
 *         the ring is already full.
 * @tsk, @files: filter passed to io_match_task(); non-matching entries stay
 *         on the overflow list.
 *
 * Returns true if there are no backlogged entries after the flush
 */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
				       struct task_struct *tsk,
				       struct files_struct *files)
{
	struct io_rings *rings = ctx->rings;
	struct io_kiocb *req, *tmp;
	struct io_uring_cqe *cqe;
	unsigned long flags;
	bool all_flushed, posted;
	LIST_HEAD(list);

	/* lockless early exit: nothing can be posted into a full ring */
	if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries)
		return false;

	posted = false;
	spin_lock_irqsave(&ctx->completion_lock, flags);
	list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
		if (!io_match_task(req, tsk, files))
			continue;

		cqe = io_get_cqring(ctx);
		if (!cqe && !force)
			break;

		/* collect on a private list; references are dropped unlocked */
		list_move(&req->compl.list, &list);
		if (cqe) {
			WRITE_ONCE(cqe->user_data, req->user_data);
			WRITE_ONCE(cqe->res, req->result);
			WRITE_ONCE(cqe->flags, req->compl.cflags);
		} else {
			/* forced flush without a free slot: count it as dropped */
			ctx->cached_cq_overflow++;
			WRITE_ONCE(ctx->rings->cq_overflow,
				   ctx->cached_cq_overflow);
		}
		posted = true;
	}

	/* backlog fully drained: clear the overflow indications */
	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
		clear_bit(0, &ctx->sq_check_overflow);
		clear_bit(0, &ctx->cq_check_overflow);
		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
	}

	if (posted)
		io_commit_cqring(ctx);
	spin_unlock_irqrestore(&ctx->completion_lock, flags);
	if (posted)
		io_cqring_ev_posted(ctx);

	/* drop the reference taken when each request was put on the backlog */
	while (!list_empty(&list)) {
		req = list_first_entry(&list, struct io_kiocb, compl.list);
		list_del(&req->compl.list);
		io_put_req(req);
	}

	return all_flushed;
}
1533
1534 static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1535                                      struct task_struct *tsk,
1536                                      struct files_struct *files)
1537 {
1538         if (test_bit(0, &ctx->cq_check_overflow)) {
1539                 /* iopoll syncs against uring_lock, not completion_lock */
1540                 if (ctx->flags & IORING_SETUP_IOPOLL)
1541                         mutex_lock(&ctx->uring_lock);
1542                 __io_cqring_overflow_flush(ctx, force, tsk, files);
1543                 if (ctx->flags & IORING_SETUP_IOPOLL)
1544                         mutex_unlock(&ctx->uring_lock);
1545         }
1546 }
1547
/*
 * Record the completion of @req with result @res and CQE flags @cflags.
 *
 * Three outcomes are possible:
 *  - a CQE slot is available: the entry is filled in place;
 *  - no slot, and either ctx->cq_overflow_flushed is set or the owning task
 *    is in idle (cancel) mode: the completion is dropped and only counted in
 *    cq_overflow;
 *  - no slot otherwise: the request is parked on ctx->cq_overflow_list with
 *    an extra reference, to be posted by a later overflow flush.
 *
 * The CQ tail is not committed here; callers follow up with
 * io_commit_cqring().
 */
static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_cqe *cqe;

	trace_io_uring_complete(ctx, req->user_data, res);

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	cqe = io_get_cqring(ctx);
	if (likely(cqe)) {
		WRITE_ONCE(cqe->user_data, req->user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);
	} else if (ctx->cq_overflow_flushed ||
		   atomic_read(&req->task->io_uring->in_idle)) {
		/*
		 * If we're in ring overflow flush mode, or in task cancel mode,
		 * then we cannot store the request for later flushing, we need
		 * to drop it on the floor.
		 */
		ctx->cached_cq_overflow++;
		WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
	} else {
		/* first backlogged entry: raise the overflow indications */
		if (list_empty(&ctx->cq_overflow_list)) {
			set_bit(0, &ctx->sq_check_overflow);
			set_bit(0, &ctx->cq_check_overflow);
			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
		}
		/* req->compl may only be used after per-op data is cleaned */
		io_clean_op(req);
		req->result = res;
		req->compl.cflags = cflags;
		/* the backlog holds its own reference until it is flushed */
		refcount_inc(&req->refs);
		list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
	}
}
1587
/* Shorthand for __io_cqring_fill_event() with CQE flags of 0. */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
	__io_cqring_fill_event(req, res, 0);
}
1592
/*
 * Complete @req right away: fill its CQE and commit the CQ ring under
 * completion_lock, then drop one request reference.
 *
 * If that was the last reference, the request is dismantled, its task
 * reference is put and it is parked on the locked free list while the lock
 * is still held. After unlocking, waiters are notified and, for a request
 * that was just freed, io_queue_next() is called and a ctx reference is
 * dropped.
 */
static inline void io_req_complete_post(struct io_kiocb *req, long res,
					unsigned int cflags)
{
	struct io_ring_ctx *ctx = req->ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->completion_lock, flags);
	__io_cqring_fill_event(req, res, cflags);
	io_commit_cqring(ctx);
	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
	if (refcount_dec_and_test(&req->refs)) {
		struct io_comp_state *cs = &ctx->submit_state.comp;

		io_dismantle_req(req);
		io_put_task(req->task, 1);
		list_add(&req->compl.list, &cs->locked_free_list);
		cs->locked_free_nr++;
	} else
		req = NULL;	/* someone else still holds the request */
	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	io_cqring_ev_posted(ctx);
	if (req) {
		/*
		 * NOTE(review): req is already on locked_free_list at this
		 * point - confirm nothing can recycle it before
		 * io_queue_next() is done with it.
		 */
		io_queue_next(req);
		percpu_ref_put(&ctx->refs);
	}
}
1623
/*
 * Record the completion of @req without posting a CQE: run io_clean_op(),
 * stash @res and @cflags in the request and flag it REQ_F_COMPLETE_INLINE.
 * The stored fields are the ones io_submit_flush_completions() passes to
 * __io_cqring_fill_event(); presumably the submission path collects flagged
 * requests for that batch flush - confirm against the issue path.
 */
1624 static void io_req_complete_state(struct io_kiocb *req, long res,
1625                                   unsigned int cflags)
1626 {
1627         io_clean_op(req);
1628         req->result = res;
1629         req->compl.cflags = cflags;
1630         req->flags |= REQ_F_COMPLETE_INLINE;
1631 }
1632
/*
 * Complete @req with @res/@cflags. With IO_URING_F_COMPLETE_DEFER set in
 * @issue_flags the result is only recorded in the request
 * (io_req_complete_state()); otherwise the CQE is posted immediately
 * (io_req_complete_post()).
 */
1633 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1634                                      long res, unsigned cflags)
1635 {
1636         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1637                 io_req_complete_state(req, res, cflags);
1638         else
1639                 io_req_complete_post(req, res, cflags);
1640 }
1641
/*
 * Complete @req with result @res and no CQE flags. issue_flags is passed as
 * 0, so this always takes the immediate io_req_complete_post() path.
 */
1642 static inline void io_req_complete(struct io_kiocb *req, long res)
1643 {
1644         __io_req_complete(req, 0, res, 0);
1645 }
1646
/*
 * Refill the submission-side request array (state->reqs) from the cached
 * free lists.
 *
 * Requests are taken from cs->free_list until that list is empty or
 * state->reqs is full. Returns true if at least one request was moved.
 */
1647 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1648 {
1649         struct io_submit_state *state = &ctx->submit_state;
1650         struct io_comp_state *cs = &state->comp;
1651         struct io_kiocb *req = NULL;
1652
1653         /*
1654          * If we have more than a batch's worth of requests in our IRQ side
1655          * locked cache, grab the lock and move them over to our submission
1656          * side cache.
1657          */
             /* the counter is sampled without the lock; the splice is locked */
1658         if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
1659                 spin_lock_irq(&ctx->completion_lock);
1660                 list_splice_init(&cs->locked_free_list, &cs->free_list);
1661                 cs->locked_free_nr = 0;
1662                 spin_unlock_irq(&ctx->completion_lock);
1663         }
1664
             /* move cached requests into the array until it is full */
1665         while (!list_empty(&cs->free_list)) {
1666                 req = list_first_entry(&cs->free_list, struct io_kiocb,
1667                                         compl.list);
1668                 list_del(&req->compl.list);
1669                 state->reqs[state->free_reqs++] = req;
1670                 if (state->free_reqs == ARRAY_SIZE(state->reqs))
1671                         break;
1672         }
1673
             /* req is only non-NULL if the loop body ran at least once */
1674         return req != NULL;
1675 }
1676
/*
 * Hand out one io_kiocb for a new submission.
 *
 * Requests are served from state->reqs. When that array is empty it is
 * refilled, first from the recycled free lists (io_flush_cached_reqs()), then
 * with a bulk slab allocation of IO_REQ_ALLOC_BATCH objects, and finally with
 * a single allocation if the bulk call yields nothing.
 *
 * Returns the request, or NULL if no memory could be allocated.
 */
1677 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1678 {
1679         struct io_submit_state *state = &ctx->submit_state;
1680
             /* a full batch must fit into the request array */
1681         BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
1682
1683         if (!state->free_reqs) {
1684                 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1685                 int ret;
1686
1687                 if (io_flush_cached_reqs(ctx))
1688                         goto got_req;
1689
1690                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1691                                             state->reqs);
1692
1693                 /*
1694                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1695                  * retry single alloc to be on the safe side.
1696                  */
1697                 if (unlikely(ret <= 0)) {
1698                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1699                         if (!state->reqs[0])
1700                                 return NULL;
1701                         ret = 1;
1702                 }
1703                 state->free_reqs = ret;
1704         }
     /* every path reaching this label has state->free_reqs >= 1 */
1705 got_req:
1706         state->free_reqs--;
1707         return state->reqs[state->free_reqs];
1708 }
1709
/*
 * Drop the reference on @file with fput() unless @fixed is set, in which case
 * nothing is done. @req is not used by this helper.
 */
1710 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1711                           bool fixed)
1712 {
1713         if (!fixed)
1714                 fput(file);
1715 }
1716
/*
 * Release what @req holds on to, without freeing the io_kiocb itself:
 * whatever io_clean_op() cleans up, the async_data allocation, the file
 * reference (skipped for REQ_F_FIXED_FILE), the fixed resource reference and
 * the work state (io_req_clean_work()). Callers in this file either free the
 * request afterwards or put it back on a free list for reuse.
 */
1717 static void io_dismantle_req(struct io_kiocb *req)
1718 {
1719         io_clean_op(req);
1720
             /* NOTE(review): kfree() accepts NULL, so this guard is optional */
1721         if (req->async_data)
1722                 kfree(req->async_data);
1723         if (req->file)
1724                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1725         if (req->fixed_rsrc_refs)
1726                 percpu_ref_put(req->fixed_rsrc_refs);
1727         io_req_clean_work(req);
1728 }
1729
/*
 * Give back @nr request references held against @task: subtract @nr from the
 * task's io_uring inflight counter, wake tctx->wait if tctx->in_idle is
 * non-zero, and drop @nr references on the task_struct.
 */
1730 static inline void io_put_task(struct task_struct *task, int nr)
1731 {
1732         struct io_uring_task *tctx = task->io_uring;
1733
1734         percpu_counter_sub(&tctx->inflight, nr);
1735         if (unlikely(atomic_read(&tctx->in_idle)))
1736                 wake_up(&tctx->wait);
1737         put_task_struct_many(task, nr);
1738 }
1739
/*
 * Tear down @req, release its task reference and return the io_kiocb to the
 * req_cachep slab, then drop one ctx reference. ctx is read up front because
 * @req must not be touched once it has been handed back to the slab.
 */
1740 static void __io_free_req(struct io_kiocb *req)
1741 {
1742         struct io_ring_ctx *ctx = req->ctx;
1743
1744         io_dismantle_req(req);
1745         io_put_task(req->task, 1);
1746
1747         kmem_cache_free(req_cachep, req);
1748         percpu_ref_put(&ctx->refs);
1749 }
1750
/*
 * Unlink the request that directly follows @req in its link chain; @req then
 * points at the successor's successor and the removed request's link is
 * cleared. req->link is dereferenced unconditionally, so the caller must make
 * sure it is not NULL.
 */
1751 static inline void io_remove_next_linked(struct io_kiocb *req)
1752 {
1753         struct io_kiocb *nxt = req->link;
1754
1755         req->link = nxt->link;
1756         nxt->link = NULL;
1757 }
1758
/*
 * Cancel the linked timeout that directly follows @req, if it is still armed.
 *
 * Under ctx->completion_lock: if req->link exists and has
 * REQ_F_LTIMEOUT_ACTIVE set, it is unlinked from the chain and its hrtimer is
 * cancelled. Unless hrtimer_try_to_cancel() returned -1 (timer callback
 * currently running), a -ECANCELED CQE is posted for the timeout.
 * REQ_F_LINK_TIMEOUT is cleared on @req in all cases. After unlock, a
 * cancelled timeout triggers io_cqring_ev_posted() and has one reference
 * dropped.
 */
1759 static void io_kill_linked_timeout(struct io_kiocb *req)
1760 {
1761         struct io_ring_ctx *ctx = req->ctx;
1762         struct io_kiocb *link;
1763         bool cancelled = false;
1764         unsigned long flags;
1765
1766         spin_lock_irqsave(&ctx->completion_lock, flags);
1767         link = req->link;
1768
1769         /*
1770          * Can happen if a linked timeout fired and link had been like
1771          * req -> link t-out -> link t-out [-> ...]
1772          */
1773         if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1774                 struct io_timeout_data *io = link->async_data;
1775                 int ret;
1776
1777                 io_remove_next_linked(req);
1778                 link->timeout.head = NULL;
1779                 ret = hrtimer_try_to_cancel(&io->timer);
                     /* -1: the callback is running, leave completion to it */
1780                 if (ret != -1) {
1781                         io_cqring_fill_event(link, -ECANCELED);
1782                         io_commit_cqring(ctx);
1783                         cancelled = true;
1784                 }
1785         }
1786         req->flags &= ~REQ_F_LINK_TIMEOUT;
1787         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1788
1789         if (cancelled) {
1790                 io_cqring_ev_posted(ctx);
1791                 io_put_req(link);
1792         }
1793 }
1794
1795
/*
 * Fail every request linked behind @req.
 *
 * Under ctx->completion_lock the chain is detached from @req and walked; each
 * link is unlinked, gets a -ECANCELED CQE and has two references dropped
 * (deferred to task work when REQ_F_WORK_INITIALIZED is set). The ring is
 * committed once after the walk and io_cqring_ev_posted() is called after the
 * lock is released.
 */
1796 static void io_fail_links(struct io_kiocb *req)
1797 {
1798         struct io_kiocb *link, *nxt;
1799         struct io_ring_ctx *ctx = req->ctx;
1800         unsigned long flags;
1801
1802         spin_lock_irqsave(&ctx->completion_lock, flags);
1803         link = req->link;
1804         req->link = NULL;
1805
1806         while (link) {
                     /* fetch the successor first: the put below may free link */
1807                 nxt = link->link;
1808                 link->link = NULL;
1809
1810                 trace_io_uring_fail_link(req, link);
1811                 io_cqring_fill_event(link, -ECANCELED);
1812
1813                 /*
1814                  * It's ok to free under spinlock as they're not linked anymore,
1815                  * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
1816                  * work.fs->lock.
1817                  */
1818                 if (link->flags & REQ_F_WORK_INITIALIZED)
1819                         io_put_req_deferred(link, 2);
1820                 else
1821                         io_double_put_req(link);
1822                 link = nxt;
1823         }
1824         io_commit_cqring(ctx);
1825         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1826
1827         io_cqring_ev_posted(ctx);
1828 }
1829
/*
 * Slow path of io_req_find_next(). Kills a linked timeout if
 * REQ_F_LINK_TIMEOUT is set. Then, unless REQ_F_FAIL_LINK is set, detaches
 * and returns req->link (which may be NULL); otherwise fails the remaining
 * chain and returns NULL.
 */
1830 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
1831 {
1832         if (req->flags & REQ_F_LINK_TIMEOUT)
1833                 io_kill_linked_timeout(req);
1834
1835         /*
1836          * If LINK is set, we have dependent requests in this chain. If we
1837          * didn't fail this request, queue the first one up, moving any other
1838          * dependencies to the next request. In case of failure, fail the rest
1839          * of the chain.
1840          */
1841         if (likely(!(req->flags & REQ_F_FAIL_LINK))) {
1842                 struct io_kiocb *nxt = req->link;
1843
1844                 req->link = NULL;
1845                 return nxt;
1846         }
1847         io_fail_links(req);
1848         return NULL;
1849 }
1850
/*
 * Return the request to run after @req, or NULL. Requests with neither
 * REQ_F_LINK nor REQ_F_HARDLINK set take the early exit and never look at the
 * link chain.
 */
1851 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1852 {
1853         if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
1854                 return NULL;
1855         return __io_req_find_next(req);
1856 }
1857
/*
 * Run one batch of queued io_uring task work for @tctx.
 *
 * The pending list is detached under tctx->task_lock and then processed
 * without the lock. Deferred completions are flushed (under uring_lock)
 * whenever the batch moves on to a request of a different ctx, and once more
 * at the end for the last ctx if it has completions pending.
 *
 * Returns true if a non-empty batch was detached and run, false if there was
 * nothing to do; tctx_task_work() loops on that result.
 */
1858 static bool __tctx_task_work(struct io_uring_task *tctx)
1859 {
1860         struct io_ring_ctx *ctx = NULL;
1861         struct io_wq_work_list list;
1862         struct io_wq_work_node *node;
1863
             /* lockless peek; the list is re-read under the lock below */
1864         if (wq_list_empty(&tctx->task_list))
1865                 return false;
1866
1867         spin_lock_irq(&tctx->task_lock);
1868         list = tctx->task_list;
1869         INIT_WQ_LIST(&tctx->task_list);
1870         spin_unlock_irq(&tctx->task_lock);
1871
1872         node = list.first;
1873         while (node) {
1874                 struct io_wq_work_node *next = node->next;
1875                 struct io_ring_ctx *this_ctx;
1876                 struct io_kiocb *req;
1877
1878                 req = container_of(node, struct io_kiocb, io_task_work.node);
                     /*
                      * next and this_ctx are sampled before the callback runs;
                      * presumably because the callback may free req - confirm.
                      */
1879                 this_ctx = req->ctx;
1880                 req->task_work.func(&req->task_work);
1881                 node = next;
1882
1883                 if (!ctx) {
1884                         ctx = this_ctx;
1885                 } else if (ctx != this_ctx) {
1886                         mutex_lock(&ctx->uring_lock);
1887                         io_submit_flush_completions(&ctx->submit_state.comp, ctx);
1888                         mutex_unlock(&ctx->uring_lock);
1889                         ctx = this_ctx;
1890                 }
1891         }
1892
1893         if (ctx && ctx->submit_state.comp.nr) {
1894                 mutex_lock(&ctx->uring_lock);
1895                 io_submit_flush_completions(&ctx->submit_state.comp, ctx);
1896                 mutex_unlock(&ctx->uring_lock);
1897         }
1898
1899         return list.first != NULL;
1900 }
1901
/*
 * task_work callback embedded in struct io_uring_task: keep running batches
 * until __tctx_task_work() reports nothing left, calling cond_resched()
 * between batches, then clear bit 0 of tctx->task_state so that
 * io_task_work_add() will schedule the callback again.
 */
1902 static void tctx_task_work(struct callback_head *cb)
1903 {
1904         struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
1905
1906         while (__tctx_task_work(tctx))
1907                 cond_resched();
1908
1909         clear_bit(0, &tctx->task_state);
1910 }
1911
/*
 * Queue @req on @tsk's io_uring task work list and make sure the tctx
 * task_work callback is scheduled, using @notify as the notification mode.
 *
 * Bit 0 of tctx->task_state marks the callback as already pending; only the
 * caller that manages to set it calls task_work_add().
 *
 * Returns 0 if the request was queued (or had already been run by the time
 * task_work_add() failed), or 1 if task_work_add() failed and @req was taken
 * off the list again, in which case the caller has to handle @req itself.
 */
1912 static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
1913                             enum task_work_notify_mode notify)
1914 {
1915         struct io_uring_task *tctx = tsk->io_uring;
1916         struct io_wq_work_node *node, *prev;
1917         unsigned long flags;
1918         int ret;
1919
             /* only warns: tctx is still dereferenced below if it is NULL */
1920         WARN_ON_ONCE(!tctx);
1921
1922         spin_lock_irqsave(&tctx->task_lock, flags);
1923         wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
1924         spin_unlock_irqrestore(&tctx->task_lock, flags);
1925
1926         /* task_work already pending, we're done */
1927         if (test_bit(0, &tctx->task_state) ||
1928             test_and_set_bit(0, &tctx->task_state))
1929                 return 0;
1930
1931         if (!task_work_add(tsk, &tctx->task_work, notify))
1932                 return 0;
1933
1934         /*
1935          * Slow path - we failed, find and delete work. If the work is not
1936          * in the list, it got run and we're fine.
1937          */
1938         ret = 0;
1939         spin_lock_irqsave(&tctx->task_lock, flags);
1940         wq_list_for_each(node, prev, &tctx->task_list) {
1941                 if (&req->io_task_work.node == node) {
1942                         wq_list_del(&tctx->task_list, node, prev);
1943                         ret = 1;
1944                         break;
1945                 }
1946         }
1947         spin_unlock_irqrestore(&tctx->task_lock, flags);
1948         clear_bit(0, &tctx->task_state);
1949         return ret;
1950 }
1951
/*
 * Queue @req's task_work on the task recorded in req->task.
 *
 * Returns -ESRCH without queueing if that task has PF_EXITING set, otherwise
 * the result of io_task_work_add(); when that result is 0 the task is woken
 * up with wake_up_process().
 */
1952 static int io_req_task_work_add(struct io_kiocb *req)
1953 {
1954         struct task_struct *tsk = req->task;
1955         struct io_ring_ctx *ctx = req->ctx;
1956         enum task_work_notify_mode notify;
1957         int ret;
1958
1959         if (tsk->flags & PF_EXITING)
1960                 return -ESRCH;
1961
1962         /*
1963          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
1964          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
1965          * processing task_work. There's no reliable way to tell if TWA_RESUME
1966          * will do the job.
1967          */
1968         notify = TWA_NONE;
1969         if (!(ctx->flags & IORING_SETUP_SQPOLL))
1970                 notify = TWA_SIGNAL;
1971
1972         ret = io_task_work_add(tsk, req, notify);
1973         if (!ret)
1974                 wake_up_process(tsk);
1975
1976         return ret;
1977 }
1978
/*
 * Fallback for callers whose io_req_task_work_add() failed: initialise
 * @req's task_work with @cb and push it onto the head of the ctx's
 * exit_task_work list. The push is a compare-and-exchange loop that retries
 * if the list head changed between the read and the cmpxchg().
 */
1979 static void io_req_task_work_add_fallback(struct io_kiocb *req,
1980                                           task_work_func_t cb)
1981 {
1982         struct io_ring_ctx *ctx = req->ctx;
1983         struct callback_head *head;
1984
1985         init_task_work(&req->task_work, cb);
1986         do {
1987                 head = READ_ONCE(ctx->exit_task_work);
1988                 req->task_work.next = head;
1989         } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head);
1990 }
1991
/*
 * Complete @req with @error: fill and commit the CQE under
 * ctx->completion_lock, call io_cqring_ev_posted(), mark the request via
 * req_set_fail_links() and drop two request references.
 */
1992 static void __io_req_task_cancel(struct io_kiocb *req, int error)
1993 {
1994         struct io_ring_ctx *ctx = req->ctx;
1995
1996         spin_lock_irq(&ctx->completion_lock);
1997         io_cqring_fill_event(req, error);
1998         io_commit_cqring(ctx);
1999         spin_unlock_irq(&ctx->completion_lock);
2000
2001         io_cqring_ev_posted(ctx);
2002         req_set_fail_links(req);
2003         io_double_put_req(req);
2004 }
2005
/*
 * task_work callback that cancels a request with the error stored in
 * req->result. The cancel runs under ctx->uring_lock; afterwards the ctx
 * reference taken by the queueing side (io_req_task_queue() and
 * io_req_task_queue_fail() both call percpu_ref_get() before installing this
 * callback) is dropped. ctx is read up front because the cancel drops request
 * references and may free @req.
 */
2006 static void io_req_task_cancel(struct callback_head *cb)
2007 {
2008         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2009         struct io_ring_ctx *ctx = req->ctx;
2010
2011         mutex_lock(&ctx->uring_lock);
2012         __io_req_task_cancel(req, req->result);
2013         mutex_unlock(&ctx->uring_lock);
2014         percpu_ref_put(&ctx->refs);
2015 }
2016
/*
 * Issue @req from task context under ctx->uring_lock. If ctx->sqo_dead is set
 * or the current task has PF_EXITING set, the request is cancelled with
 * -EFAULT instead of being queued.
 */
2017 static void __io_req_task_submit(struct io_kiocb *req)
2018 {
2019         struct io_ring_ctx *ctx = req->ctx;
2020
2021         /* ctx stays valid until unlock, even if we drop all our ctx->refs */
2022         mutex_lock(&ctx->uring_lock);
2023         if (!ctx->sqo_dead && !(current->flags & PF_EXITING))
2024                 __io_queue_sqe(req);
2025         else
2026                 __io_req_task_cancel(req, -EFAULT);
2027         mutex_unlock(&ctx->uring_lock);
2028 }
2029
/*
 * task_work callback: recover the request embedding @cb and submit it through
 * __io_req_task_submit().
 */
2030 static void io_req_task_submit(struct callback_head *cb)
2031 {
2032         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2033
2034         __io_req_task_submit(req);
2035 }
2036
/*
 * Arrange for @req to be submitted from its task via task_work. If the task
 * work cannot be added, the request is put on the fallback list to be
 * cancelled with -ECANCELED instead; a ctx reference is taken for that path
 * because io_req_task_cancel() drops one.
 */
2037 static void io_req_task_queue(struct io_kiocb *req)
2038 {
2039         int ret;
2040
2041         req->task_work.func = io_req_task_submit;
2042         ret = io_req_task_work_add(req);
2043         if (unlikely(ret)) {
2044                 req->result = -ECANCELED;
2045                 percpu_ref_get(&req->ctx->refs);
2046                 io_req_task_work_add_fallback(req, io_req_task_cancel);
2047         }
2048 }
2049
/*
 * Arrange for @req to be failed with @ret from task context. A ctx reference
 * is taken up front and released again by io_req_task_cancel(). If the task
 * work cannot be added, the same callback is queued on the ctx fallback list.
 */
2050 static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2051 {
2052         percpu_ref_get(&req->ctx->refs);
2053         req->result = ret;
2054         req->task_work.func = io_req_task_cancel;
2055
2056         if (unlikely(io_req_task_work_add(req)))
2057                 io_req_task_work_add_fallback(req, io_req_task_cancel);
2058 }
2059
/*
 * If io_req_find_next() yields a follow-up request for @req, queue it for
 * submission via task_work.
 */
2060 static inline void io_queue_next(struct io_kiocb *req)
2061 {
2062         struct io_kiocb *nxt = io_req_find_next(req);
2063
2064         if (nxt)
2065                 io_req_task_queue(nxt);
2066 }
2067
/*
 * Free @req: queue the next request of its link chain (if any) first, then
 * tear the request down and release it through __io_free_req().
 */
2068 static void io_free_req(struct io_kiocb *req)
2069 {
2070         io_queue_next(req);
2071         __io_free_req(req);
2072 }
2073
/*
 * Accumulates reference drops while a batch of requests is being freed, so
 * they can be released in bulk by io_req_free_batch_finish().
 *
 * @task:      task the pending @task_refs belong to, NULL if none yet
 * @task_refs: number of task references to give back for @task
 * @ctx_refs:  number of ctx references to give back
 */
2074 struct req_batch {
2075         struct task_struct      *task;
2076         int                     task_refs;
2077         int                     ctx_refs;
2078 };
2079
/* Reset @rb to an empty batch: no task and zero pending references. */
2080 static inline void io_init_req_batch(struct req_batch *rb)
2081 {
2082         rb->task_refs = 0;
2083         rb->ctx_refs = 0;
2084         rb->task = NULL;
2085 }
2086
/*
 * Release whatever @rb has accumulated: the task references for the last
 * task seen (if any) and the ctx references against @ctx (if any).
 */
2087 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2088                                      struct req_batch *rb)
2089 {
2090         if (rb->task)
2091                 io_put_task(rb->task, rb->task_refs);
2092         if (rb->ctx_refs)
2093                 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2094 }
2095
/*
 * Free @req as part of a batch: queue its next linked request, account one
 * task and one ctx reference in @rb (flushing the task references gathered
 * so far if @req belongs to a different task), dismantle the request and
 * recycle the io_kiocb into state->reqs, or onto the completion free list
 * when that array is full.
 */
2096 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2097                               struct io_submit_state *state)
2098 {
2099         io_queue_next(req);
2100
             /* task changed: hand back the refs collected for the old one */
2101         if (req->task != rb->task) {
2102                 if (rb->task)
2103                         io_put_task(rb->task, rb->task_refs);
2104                 rb->task = req->task;
2105                 rb->task_refs = 0;
2106         }
2107         rb->task_refs++;
2108         rb->ctx_refs++;
2109
2110         io_dismantle_req(req);
2111         if (state->free_reqs != ARRAY_SIZE(state->reqs))
2112                 state->reqs[state->free_reqs++] = req;
2113         else
2114                 list_add(&req->compl.list, &state->comp.free_list);
2115 }
2116
/*
 * Post all completions batched in @cs to the CQ ring of @ctx.
 *
 * The CQEs for cs->reqs[0..cs->nr) are filled in one completion_lock section
 * and committed together. Afterwards each request has its submission and
 * completion references dropped; those that reach zero are freed through the
 * request batch. cs->nr is reset to 0 at the end.
 */
2117 static void io_submit_flush_completions(struct io_comp_state *cs,
2118                                         struct io_ring_ctx *ctx)
2119 {
2120         int i, nr = cs->nr;
2121         struct io_kiocb *req;
2122         struct req_batch rb;
2123
2124         io_init_req_batch(&rb);
2125         spin_lock_irq(&ctx->completion_lock);
2126         for (i = 0; i < nr; i++) {
2127                 req = cs->reqs[i];
2128                 __io_cqring_fill_event(req, req->result, req->compl.cflags);
2129         }
2130         io_commit_cqring(ctx);
2131         spin_unlock_irq(&ctx->completion_lock);
2132
2133         io_cqring_ev_posted(ctx);
2134         for (i = 0; i < nr; i++) {
2135                 req = cs->reqs[i];
2136
2137                 /* submission and completion refs */
2138                 if (refcount_sub_and_test(2, &req->refs))
2139                         io_req_free_batch(&rb, req, &ctx->submit_state);
2140         }
2141
2142         io_req_free_batch_finish(ctx, &rb);
2143         cs->nr = 0;
2144 }
2145
2146 /*
2147  * Drop reference to request, return next in chain (if there is one) if this
2148  * was the last reference to this request.
      *
      * On the final put the request is freed with __io_free_req() after the
      * next request has been looked up. Returns NULL when other references
      * remain or when there is no next request.
2149  */
2150 static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2151 {
2152         struct io_kiocb *nxt = NULL;
2153
2154         if (refcount_dec_and_test(&req->refs)) {
2155                 nxt = io_req_find_next(req);
2156                 __io_free_req(req);
2157         }
2158         return nxt;
2159 }
2160
/* Drop one reference on @req and free it if that was the last one. */
2161 static void io_put_req(struct io_kiocb *req)
2162 {
2163         if (refcount_dec_and_test(&req->refs))
2164                 io_free_req(req);
2165 }
2166
/*
 * task_work callback installed by io_free_req_deferred(): recover the request
 * embedding @cb and free it.
 */
2167 static void io_put_req_deferred_cb(struct callback_head *cb)
2168 {
2169         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2170
2171         io_free_req(req);
2172 }
2173
/*
 * Free @req from task_work instead of the current context. If the task work
 * cannot be added, the free callback is queued on the ctx fallback list.
 */
2174 static void io_free_req_deferred(struct io_kiocb *req)
2175 {
2176         int ret;
2177
2178         req->task_work.func = io_put_req_deferred_cb;
2179         ret = io_req_task_work_add(req);
2180         if (unlikely(ret))
2181                 io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
2182 }
2183
/*
 * Drop @refs references on @req; if that brings the count to zero, schedule
 * the actual free through io_free_req_deferred().
 */
2184 static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2185 {
2186         if (refcount_sub_and_test(refs, &req->refs))
2187                 io_free_req_deferred(req);
2188 }
2189
/*
 * Drop two references on @req in one step and free the request if the count
 * reaches zero.
 */
2190 static void io_double_put_req(struct io_kiocb *req)
2191 {
2192         /* drop both submit and complete references */
2193         if (refcount_sub_and_test(2, &req->refs))
2194                 io_free_req(req);
2195 }
2196
/*
 * Return the value of __io_cqring_events() for @ctx, issuing a read barrier
 * first.
 */
2197 static unsigned io_cqring_events(struct io_ring_ctx *ctx)
2198 {
2199         /* See comment at the top of this file */
2200         smp_rmb();
2201         return __io_cqring_events(ctx);
2202 }
2203
/*
 * Return the distance between the SQ ring tail (read with acquire semantics)
 * and the kernel's cached SQ head, i.e. the entries not yet consumed
 * according to ctx->cached_sq_head.
 */
2204 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2205 {
2206         struct io_rings *rings = ctx->rings;
2207
2208         /* make sure SQ entry isn't read before tail */
2209         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2210 }
2211
/*
 * Release the selected buffer @kbuf of @req and build the matching CQE flags:
 * the buffer id shifted left by IORING_CQE_BUFFER_SHIFT, ORed with
 * IORING_CQE_F_BUFFER. REQ_F_BUFFER_SELECTED is cleared on @req and @kbuf is
 * freed.
 *
 * Returns the CQE flags.
 */
2212 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2213 {
2214         unsigned int cflags;
2215
2216         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2217         cflags |= IORING_CQE_F_BUFFER;
2218         req->flags &= ~REQ_F_BUFFER_SELECTED;
2219         kfree(kbuf);
2220         return cflags;
2221 }
2222
/*
 * Read/write flavour of io_put_kbuf(): the buffer pointer is recovered from
 * req->rw.addr, which is reinterpreted as a struct io_buffer pointer.
 * Callers in this file only use it when REQ_F_BUFFER_SELECTED is set.
 */
2223 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2224 {
2225         struct io_buffer *kbuf;
2226
2227         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2228         return io_put_kbuf(req, kbuf);
2229 }
2230
/*
 * Run pending task_work of the current task, if any. The task state is set
 * to TASK_RUNNING before running it.
 *
 * Returns true if task_work_run() was called, false if the task is exiting
 * or has nothing queued.
 */
2231 static inline bool io_run_task_work(void)
2232 {
2233         /*
2234          * Not safe to run on exiting task, and the task_work handling will
2235          * not add work to such a task.
2236          */
2237         if (unlikely(current->flags & PF_EXITING))
2238                 return false;
2239         if (current->task_works) {
2240                 __set_current_state(TASK_RUNNING);
2241                 task_work_run();
2242                 return true;
2243         }
2244
2245         return false;
2246 }
2247
2248 /*
2249  * Find and free completed poll iocbs
      *
      * Every request on @done is removed from the list. A request whose result
      * is -EAGAIN is given to io_rw_reissue(); if that succeeds it is skipped
      * here. Otherwise a CQE is filled (including the selected-buffer flags if
      * a buffer was selected), *@nr_events is incremented and one reference is
      * dropped, freeing the request through the batch on the final put. The
      * ring is committed once after the loop.
2250  */
2251 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2252                                struct list_head *done)
2253 {
2254         struct req_batch rb;
2255         struct io_kiocb *req;
2256
2257         /* order with ->result store in io_complete_rw_iopoll() */
2258         smp_rmb();
2259
2260         io_init_req_batch(&rb);
2261         while (!list_empty(done)) {
2262                 int cflags = 0;
2263
2264                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
2265                 list_del(&req->inflight_entry);
2266
2267                 if (READ_ONCE(req->result) == -EAGAIN) {
                             /* cleared so a reissued request can complete again */
2268                         req->iopoll_completed = 0;
2269                         if (io_rw_reissue(req))
2270                                 continue;
2271                 }
2272
2273                 if (req->flags & REQ_F_BUFFER_SELECTED)
2274                         cflags = io_put_rw_kbuf(req);
2275
2276                 __io_cqring_fill_event(req, req->result, cflags);
2277                 (*nr_events)++;
2278
2279                 if (refcount_dec_and_test(&req->refs))
2280                         io_req_free_batch(&rb, req, &ctx->submit_state);
2281         }
2282
2283         io_commit_cqring(ctx);
2284         io_cqring_ev_posted_iopoll(ctx);
2285         io_req_free_batch_finish(ctx, &rb);
2286 }
2287
/*
 * Make one pass over ctx->iopoll_list.
 *
 * Requests already flagged iopoll_completed are moved to a local done list.
 * Requests that are not complete are polled through their file's ->iopoll()
 * as long as no completed request has been collected yet; once the done list
 * is non-empty the pass stops at the next incomplete request. Collected
 * requests are then completed via io_iopoll_complete(), which increments
 * *@nr_events.
 *
 * @min only influences whether ->iopoll() is asked to spin.
 *
 * Returns 0, or the negative value returned by ->iopoll().
 */
2288 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2289                         long min)
2290 {
2291         struct io_kiocb *req, *tmp;
2292         LIST_HEAD(done);
2293         bool spin;
2294         int ret;
2295
2296         /*
2297          * Only spin for completions if we don't have multiple devices hanging
2298          * off our complete list, and we're under the requested amount.
2299          */
2300         spin = !ctx->poll_multi_file && *nr_events < min;
2301
2302         ret = 0;
2303         list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2304                 struct kiocb *kiocb = &req->rw.kiocb;
2305
2306                 /*
2307                  * Move completed and retryable entries to our local lists.
2308                  * If we find a request that requires polling, break out
2309                  * and complete those lists first, if we have entries there.
2310                  */
2311                 if (READ_ONCE(req->iopoll_completed)) {
2312                         list_move_tail(&req->inflight_entry, &done);
2313                         continue;
2314                 }
2315                 if (!list_empty(&done))
2316                         break;
2317
2318                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2319                 if (ret < 0)
2320                         break;
2321
2322                 /* iopoll may have completed current req */
2323                 if (READ_ONCE(req->iopoll_completed))
2324                         list_move_tail(&req->inflight_entry, &done);
2325
                     /* a non-zero poll result turns spinning off for the rest */
2326                 if (ret && spin)
2327                         spin = false;
2328                 ret = 0;
2329         }
2330
2331         if (!list_empty(&done))
2332                 io_iopoll_complete(ctx, nr_events, &done);
2333
2334         return ret;
2335 }
2336
2337 /*
2338  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
2339  * non-spinning poll check - we'll still enter the driver poll loop, but only
2340  * as a non-spinning completion check.
      *
      * Returns a negative error from io_do_iopoll(), 0 once *nr_events has
      * reached min, or 1 if the loop ended because iopoll_list became empty or
      * a reschedule is due.
2341  */
2342 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2343                                 long min)
2344 {
2345         while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
2346                 int ret;
2347
2348                 ret = io_do_iopoll(ctx, nr_events, min);
2349                 if (ret < 0)
2350                         return ret;
2351                 if (*nr_events >= min)
2352                         return 0;
2353         }
2354
2355         return 1;
2356 }
2357
2358 /*
2359  * We can't just wait for polled events to come to us, we have to actively
2360  * find and complete them.
      *
      * Does nothing unless the ring was set up with IORING_SETUP_IOPOLL. Runs
      * under ctx->uring_lock and stops as soon as iopoll_list is empty or a
      * pass reaps no events.
2361  */
2362 static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2363 {
2364         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2365                 return;
2366
2367         mutex_lock(&ctx->uring_lock);
2368         while (!list_empty(&ctx->iopoll_list)) {
2369                 unsigned int nr_events = 0;
2370
2371                 io_do_iopoll(ctx, &nr_events, 0);
2372
2373                 /* let it sleep and repeat later if can't complete a request */
2374                 if (nr_events == 0)
2375                         break;
2376                 /*
2377                  * Ensure we allow local-to-the-cpu processing to take place,
2378                  * in this case we need to ensure that we reap all events.
2379                  * Also let task_work, etc. make progress by releasing the mutex.
2380                  */
2381                 if (need_resched()) {
2382                         mutex_unlock(&ctx->uring_lock);
2383                         cond_resched();
2384                         mutex_lock(&ctx->uring_lock);
2385                 }
2386         }
2387         mutex_unlock(&ctx->uring_lock);
2388 }
2389
/*
 * Poll for completions on an IOPOLL ring until events are available.
 *
 * Runs under ctx->uring_lock. The loop ends when CQ events are already
 * pending, when io_iopoll_getevents() reports an error or that @min events
 * were reaped, or when @min is 0, events were found, or a reschedule is due.
 * Every eighth iteration the lock is dropped briefly and pending task_work
 * is run.
 *
 * Returns 0, or a negative error from io_iopoll_getevents().
 */
2390 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2391 {
2392         unsigned int nr_events = 0;
2393         int iters = 0, ret = 0;
2394
2395         /*
2396          * We disallow the app entering submit/complete with polling, but we
2397          * still need to lock the ring to prevent racing with polled issue
2398          * that got punted to a workqueue.
2399          */
2400         mutex_lock(&ctx->uring_lock);
2401         do {
2402                 /*
2403                  * Don't enter poll loop if we already have events pending.
2404                  * If we do, we can potentially be spinning for commands that
2405                  * already triggered a CQE (eg in error).
2406                  */
2407                 if (test_bit(0, &ctx->cq_check_overflow))
2408                         __io_cqring_overflow_flush(ctx, false, NULL, NULL);
2409                 if (io_cqring_events(ctx))
2410                         break;
2411
2412                 /*
2413                  * If a submit got punted to a workqueue, we can have the
2414                  * application entering polling for a command before it gets
2415                  * issued. That app will hold the uring_lock for the duration
2416                  * of the poll right here, so we need to take a breather every
2417                  * now and then to ensure that the issue has a chance to add
2418                  * the poll to the issued list. Otherwise we can spin here
2419                  * forever, while the workqueue is stuck trying to acquire the
2420                  * very same mutex.
2421                  */
2422                 if (!(++iters & 7)) {
2423                         mutex_unlock(&ctx->uring_lock);
2424                         io_run_task_work();
2425                         mutex_lock(&ctx->uring_lock);
2426                 }
2427
2428                 ret = io_iopoll_getevents(ctx, &nr_events, min);
2429                 if (ret <= 0)
2430                         break;
                     /* 1 only means "keep going"; it is not returned to callers */
2431                 ret = 0;
2432         } while (min && !nr_events && !need_resched());
2433
2434         mutex_unlock(&ctx->uring_lock);
2435         return ret;
2436 }
2437
/*
 * Finish a write on req->file by calling file_end_write(). For requests
 * flagged REQ_F_ISREG, lockdep is first told that the SB_FREEZE_WRITE
 * protection of the file's superblock is held by this context.
 */
2438 static void kiocb_end_write(struct io_kiocb *req)
2439 {
2440         /*
2441          * Tell lockdep we inherited freeze protection from submission
2442          * thread.
2443          */
2444         if (req->flags & REQ_F_ISREG) {
2445                 struct inode *inode = file_inode(req->file);
2446
2447                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2448         }
2449         file_end_write(req->file);
2450 }
2451
2452 #ifdef CONFIG_BLOCK
/*
 * Prepare @req for being issued again from async context.
 *
 * Nothing is done if async_data is already set. Otherwise the transfer
 * direction is derived from the opcode, the iovec is imported again and the
 * async read/write state is set up.
 *
 * Returns true if the request is ready for resubmission, false for an opcode
 * that is not a read or write, or if the import or the async setup failed.
 */
2453 static bool io_resubmit_prep(struct io_kiocb *req)
2454 {
2455         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2456         int rw, ret;
2457         struct iov_iter iter;
2458
2459         /* already prepared */
2460         if (req->async_data)
2461                 return true;
2462
2463         switch (req->opcode) {
2464         case IORING_OP_READV:
2465         case IORING_OP_READ_FIXED:
2466         case IORING_OP_READ:
2467                 rw = READ;
2468                 break;
2469         case IORING_OP_WRITEV:
2470         case IORING_OP_WRITE_FIXED:
2471         case IORING_OP_WRITE:
2472                 rw = WRITE;
2473                 break;
2474         default:
2475                 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2476                                 req->opcode);
2477                 return false;
2478         }
2479
2480         ret = io_import_iovec(rw, req, &iovec, &iter, false);
2481         if (ret < 0)
2482                 return false;
             /* io_setup_async_rw() returning 0 is the success case */
2483         return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
2484 }
2485 #endif
2486
/*
 * Try to reissue a read/write request through async work.
 *
 * Only attempted for block devices and regular files, and not for requests
 * flagged REQ_F_NOWAIT or when already running on an io-wq worker. The caller
 * must hold ctx->uring_lock. On success an extra request reference is taken
 * and the request is queued; if preparation fails the request is marked via
 * req_set_fail_links().
 *
 * Returns true if the request was queued again, false otherwise (always
 * false without CONFIG_BLOCK).
 */
2487 static bool io_rw_reissue(struct io_kiocb *req)
2488 {
2489 #ifdef CONFIG_BLOCK
2490         umode_t mode = file_inode(req->file)->i_mode;
2491
2492         if (!S_ISBLK(mode) && !S_ISREG(mode))
2493                 return false;
2494         if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
2495                 return false;
2496
2497         lockdep_assert_held(&req->ctx->uring_lock);
2498
2499         if (io_resubmit_prep(req)) {
2500                 refcount_inc(&req->refs);
2501                 io_queue_async_work(req);
2502                 return true;
2503         }
2504         req_set_fail_links(req);
2505 #endif
2506         return false;
2507 }
2508
/*
 * Common completion path for read/write requests.
 *
 * A result of -EAGAIN or -EOPNOTSUPP is first offered to io_rw_reissue();
 * if the request was requeued nothing else happens here. Otherwise a result
 * that differs from req->result marks the request via req_set_fail_links(),
 * a write is ended, a selected buffer is released into the CQE flags, and
 * the request is completed with @res according to @issue_flags.
 *
 * @res2 is not used.
 */
2509 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2510                              unsigned int issue_flags)
2511 {
2512         int cflags = 0;
2513
2514         if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
2515                 return;
2516         if (res != req->result)
2517                 req_set_fail_links(req);
2518
2519         if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2520                 kiocb_end_write(req);
2521         if (req->flags & REQ_F_BUFFER_SELECTED)
2522                 cflags = io_put_rw_kbuf(req);
2523         __io_req_complete(req, issue_flags, res, cflags);
2524 }
2525
/*
 * ->ki_complete handler installed by io_prep_rw() for rings without
 * IORING_SETUP_IOPOLL. Maps the kiocb back to its io_kiocb and completes it
 * with issue_flags of 0.
 */
2526 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2527 {
2528         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2529
2530         __io_complete_rw(req, res, res2, 0);
2531 }
2532
/*
 * ->ki_complete handler installed by io_prep_rw() for IORING_SETUP_IOPOLL
 * rings. It only records the result in req->result and then sets
 * req->iopoll_completed; res2 is not used.
 */
2533 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2534 {
2535         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2536
2537         if (kiocb->ki_flags & IOCB_WRITE)
2538                 kiocb_end_write(req);
2539
             /* -EAGAIN is not flagged as a failure here */
2540         if (res != -EAGAIN && res != req->result)
2541                 req_set_fail_links(req);
2542
2543         WRITE_ONCE(req->result, res);
2544         /* order with io_poll_complete() checking ->result */
2545         smp_wmb();
2546         WRITE_ONCE(req->iopoll_completed, 1);
2547 }
2548
2549 /*
2550  * After the iocb has been issued, it's safe to be found on the poll list.
2551  * Adding the kiocb to the list AFTER submission ensures that we don't
2552  * find it from a io_iopoll_getevents() thread before the issuer is done
2553  * accessing the kiocb cookie.
      *
      * The request is linked into ctx->iopoll_list through ->inflight_entry.
      * If in_async is set and the ring uses IORING_SETUP_SQPOLL, a sleeper on
      * ctx->sq_data->wait is woken up afterwards.
2554  */
2555 static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
2556 {
2557         struct io_ring_ctx *ctx = req->ctx;
2558
2559         /*
2560          * Track whether we have multiple files in our lists. This will impact
2561          * how we do polling eventually, not spinning if we're on potentially
2562          * different devices.
2563          */
2564         if (list_empty(&ctx->iopoll_list)) {
2565                 ctx->poll_multi_file = false;
2566         } else if (!ctx->poll_multi_file) {
2567                 struct io_kiocb *list_req;
2568
                     /* only the first list entry is compared against the new request */
2569                 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2570                                                 inflight_entry);
2571                 if (list_req->file != req->file)
2572                         ctx->poll_multi_file = true;
2573         }
2574
2575         /*
2576          * For fast devices, IO may have already completed. If it has, add
2577          * it to the front so we find it first.
2578          */
2579         if (READ_ONCE(req->iopoll_completed))
2580                 list_add(&req->inflight_entry, &ctx->iopoll_list);
2581         else
2582                 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2583
2584         /*
2585          * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread
2586          * task context or in io worker task context. If current task context is
2587          * sq thread, we don't need to check whether should wake up sq thread.
2588          */
2589         if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) &&
2590             wq_has_sleeper(&ctx->sq_data->wait))
2591                 wake_up(&ctx->sq_data->wait);
2592 }
2593
/*
 * Drop the file references still held in the submit state, if any, and
 * reset the cached count to zero.
 */
2594 static inline void io_state_file_put(struct io_submit_state *state)
2595 {
2596         if (state->file_refs) {
2597                 fput_many(state->file, state->file_refs);
2598                 state->file_refs = 0;
2599         }
2600 }
2601
2602 /*
2603  * Get as many references to a file as we have IOs left in this submission,
2604  * assuming most submissions are for one file, or at least that each file
2605  * has more than one submission.
      *
      * Without a submit state this is a plain fget(). Returns the file, or NULL
      * if fget_many() fails to return one.
2606  */
2607 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2608 {
2609         if (!state)
2610                 return fget(fd);
2611
2612         if (state->file_refs) {
                     /* same fd as last time: hand out one of the cached references */
2613                 if (state->fd == fd) {
2614                         state->file_refs--;
2615                         return state->file;
2616                 }
                     /* different fd: drop the leftover references to the old file */
2617                 io_state_file_put(state);
2618         }
2619         state->file = fget_many(fd, state->ios_left);
2620         if (unlikely(!state->file))
2621                 return NULL;
2622
2623         state->fd = fd;
             /* one of the ios_left references is used by the file returned now */
2624         state->file_refs = state->ios_left - 1;
2625         return state->file;
2626 }
2627
/*
 * True if there is no block device at all (bdev is NULL), or if the
 * device's queue passes blk_queue_nowait().
 */
2628 static bool io_bdev_nowait(struct block_device *bdev)
2629 {
2630         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2631 }
2632
2633 /*
2634  * If we tracked the file through the SCM inflight mechanism, we could support
2635  * any file. For now, just ensure that anything potentially problematic is done
2636  * inline.
      *
      * rw is READ or WRITE and only matters for the final ->read_iter /
      * ->write_iter check. Returns true if the file is considered async capable.
2637  */
2638 static bool io_file_supports_async(struct file *file, int rw)
2639 {
2640         umode_t mode = file_inode(file)->i_mode;
2641
             /* block device: needs CONFIG_BLOCK and a nowait capable queue */
2642         if (S_ISBLK(mode)) {
2643                 if (IS_ENABLED(CONFIG_BLOCK) &&
2644                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2645                         return true;
2646                 return false;
2647         }
2648         if (S_ISCHR(mode) || S_ISSOCK(mode))
2649                 return true;
             /*
              * regular file: checks the superblock's backing device (if any) and
              * excludes io_uring files themselves
              */
2650         if (S_ISREG(mode)) {
2651                 if (IS_ENABLED(CONFIG_BLOCK) &&
2652                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2653                     file->f_op != &io_uring_fops)
2654                         return true;
2655                 return false;
2656         }
2657
2658         /* any ->read/write should understand O_NONBLOCK */
2659         if (file->f_flags & O_NONBLOCK)
2660                 return true;
2661
2662         if (!(file->f_mode & FMODE_NOWAIT))
2663                 return false;
2664
             /* FMODE_NOWAIT is set: require the iter based method for this direction */
2665         if (rw == READ)
2666                 return file->f_op->read_iter != NULL;
2667
2668         return file->f_op->write_iter != NULL;
2669 }
2670
/*
 * Initialize the kiocb and the rw fields of a read/write request from its
 * SQE. Returns 0 on success, or:
 *   - the error from kiocb_set_rw_flags() or ioprio_check_cap(),
 *   - -EOPNOTSUPP on an IOPOLL ring if IOCB_DIRECT is not set or the file
 *     has no ->iopoll method,
 *   - -EINVAL if IOCB_HIPRI is set on a ring without IORING_SETUP_IOPOLL.
 */
2671 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2672 {
2673         struct io_ring_ctx *ctx = req->ctx;
2674         struct kiocb *kiocb = &req->rw.kiocb;
2675         struct file *file = req->file;
2676         unsigned ioprio;
2677         int ret;
2678
2679         if (S_ISREG(file_inode(file)->i_mode))
2680                 req->flags |= REQ_F_ISREG;
2681
2682         kiocb->ki_pos = READ_ONCE(sqe->off);
             /*
              * An offset of -1 on a non-stream file means "use the file position";
              * REQ_F_CUR_POS makes kiocb_done() write the new position back.
              */
2683         if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
2684                 req->flags |= REQ_F_CUR_POS;
2685                 kiocb->ki_pos = file->f_pos;
2686         }
2687         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2688         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2689         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2690         if (unlikely(ret))
2691                 return ret;
2692
2693         /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
2694         if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
2695                 req->flags |= REQ_F_NOWAIT;
2696
             /* an ioprio of 0 in the SQE falls back to the current task's ioprio */
2697         ioprio = READ_ONCE(sqe->ioprio);
2698         if (ioprio) {
2699                 ret = ioprio_check_cap(ioprio);
2700                 if (ret)
2701                         return ret;
2702
2703                 kiocb->ki_ioprio = ioprio;
2704         } else
2705                 kiocb->ki_ioprio = get_current_ioprio();
2706
             /* pick the completion handler depending on whether the ring polls */
2707         if (ctx->flags & IORING_SETUP_IOPOLL) {
2708                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2709                     !kiocb->ki_filp->f_op->iopoll)
2710                         return -EOPNOTSUPP;
2711
2712                 kiocb->ki_flags |= IOCB_HIPRI;
2713                 kiocb->ki_complete = io_complete_rw_iopoll;
2714                 req->iopoll_completed = 0;
2715         } else {
2716                 if (kiocb->ki_flags & IOCB_HIPRI)
2717                         return -EINVAL;
2718                 kiocb->ki_complete = io_complete_rw;
2719         }
2720
2721         req->rw.addr = READ_ONCE(sqe->addr);
2722         req->rw.len = READ_ONCE(sqe->len);
2723         req->buf_index = READ_ONCE(sqe->buf_index);
2724         return 0;
2725 }
2726
/*
 * Complete a kiocb through its ->ki_complete handler, unless ret is
 * -EIOCBQUEUED, in which case nothing is done here. The -ERESTART* codes
 * are reported as -EINTR.
 */
2727 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2728 {
2729         switch (ret) {
2730         case -EIOCBQUEUED:
2731                 break;
2732         case -ERESTARTSYS:
2733         case -ERESTARTNOINTR:
2734         case -ERESTARTNOHAND:
2735         case -ERESTART_RESTARTBLOCK:
2736                 /*
2737                  * We can't just restart the syscall, since previously
2738                  * submitted sqes may already be in progress. Just fail this
2739                  * IO with EINTR.
2740                  */
2741                 ret = -EINTR;
2742                 fallthrough;
2743         default:
2744                 kiocb->ki_complete(kiocb, ret, 0);
2745         }
2746 }
2747
/*
 * Finish a read/write attempt: fold in bytes done by earlier attempts,
 * write back the file position for REQ_F_CUR_POS requests and complete the
 * request with the final result.
 */
2748 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2749                        unsigned int issue_flags)
2750 {
2751         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2752         struct io_async_rw *io = req->async_data;
2753
2754         /* add previously done IO, if any */
2755         if (io && io->bytes_done > 0) {
                     /* an error after partial progress reports the bytes done instead */
2756                 if (ret < 0)
2757                         ret = io->bytes_done;
2758                 else
2759                         ret += io->bytes_done;
2760         }
2761
2762         if (req->flags & REQ_F_CUR_POS)
2763                 req->file->f_pos = kiocb->ki_pos;
             /*
              * Non-negative results of non-IOPOLL requests are completed directly so
              * that issue_flags is passed along; everything else goes through
              * io_rw_done() and the kiocb's ->ki_complete handler.
              */
2764         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2765                 __io_complete_rw(req, ret, 0, issue_flags);
2766         else
2767                 io_rw_done(kiocb, ret);
2768 }
2769
/*
 * Set up iter as a bvec iterator over the registered buffer selected by
 * req->buf_index, positioned at req->rw.addr and covering req->rw.len bytes.
 *
 * Returns 0 on success, or -EFAULT if the buffer index is out of range, if
 * addr + len wraps around, or if the range is not fully inside the
 * registered buffer.
 */
2770 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
2771 {
2772         struct io_ring_ctx *ctx = req->ctx;
2773         size_t len = req->rw.len;
2774         struct io_mapped_ubuf *imu;
2775         u16 index, buf_index = req->buf_index;
2776         size_t offset;
2777         u64 buf_addr;
2778
2779         if (unlikely(buf_index >= ctx->nr_user_bufs))
2780                 return -EFAULT;
             /* index is passed through array_index_nospec() before it is used */
2781         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2782         imu = &ctx->user_bufs[index];
2783         buf_addr = req->rw.addr;
2784
2785         /* overflow */
2786         if (buf_addr + len < buf_addr)
2787                 return -EFAULT;
2788         /* not inside the mapped region */
2789         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2790                 return -EFAULT;
2791
2792         /*
2793          * May not be a start of buffer, set size appropriately
2794          * and advance us to the beginning.
2795          */
2796         offset = buf_addr - imu->ubuf;
2797         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2798
2799         if (offset) {
2800                 /*
2801                  * Don't use iov_iter_advance() here, as it's really slow for
2802                  * using the latter parts of a big fixed buffer - it iterates
2803                  * over each segment manually. We can cheat a bit here, because
2804                  * we know that:
2805                  *
2806                  * 1) it's a BVEC iter, we set it up
2807                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
2808                  *    first and last bvec
2809                  *
2810                  * So just find our index, and adjust the iterator afterwards.
2811                  * If the offset is within the first bvec (or the whole first
2812                  * bvec), just use iov_iter_advance(). This makes it easier
2813                  * since we can just skip the first segment, which may not
2814                  * be PAGE_SIZE aligned.
2815                  */
2816                 const struct bio_vec *bvec = imu->bvec;
2817
2818                 if (offset <= bvec->bv_len) {
2819                         iov_iter_advance(iter, offset);
2820                 } else {
2821                         unsigned long seg_skip;
2822
2823                         /* skip first vec */
2824                         offset -= bvec->bv_len;
                             /* first bvec plus the whole pages covered by the rest of offset */
2825                         seg_skip = 1 + (offset >> PAGE_SHIFT);
2826
2827                         iter->bvec = bvec + seg_skip;
2828                         iter->nr_segs -= seg_skip;
                             /* total bytes skipped: the first bvec plus the remaining offset */
2829                         iter->count -= bvec->bv_len + offset;
                             /* what is left of offset within the segment we landed on */
2830                         iter->iov_offset = offset & ~PAGE_MASK;
2831                 }
2832         }
2833
2834         return 0;
2835 }
2836
/*
 * Counterpart of io_ring_submit_lock(): release ctx->uring_lock, but only
 * if needs_lock is set.
 */
2837 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2838 {
2839         if (needs_lock)
2840                 mutex_unlock(&ctx->uring_lock);
2841 }
2842
/*
 * Take ctx->uring_lock if needs_lock is set; otherwise do nothing. Pairs
 * with io_ring_submit_unlock() called with the same needs_lock value.
 */
2843 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2844 {
2845         /*
2846          * "Normal" inline submissions always hold the uring_lock, since we
2847          * grab it from the system call. Same is true for the SQPOLL offload.
2848          * The only exception is when we've detached the request and issue it
2849          * from an async worker thread, grab the lock for that case.
2850          */
2851         if (needs_lock)
2852                 mutex_lock(&ctx->uring_lock);
2853 }
2854
/*
 * Pick a buffer from the buffer group bgid registered in
 * ctx->io_buffer_idr.
 *
 * If the request already has REQ_F_BUFFER_SELECTED set, kbuf is returned
 * unchanged. Otherwise *len is clamped to the length of the buffer that was
 * picked. Returns the buffer, or ERR_PTR(-ENOBUFS) if the group is not
 * found. uring_lock is taken around the lookup only if needs_lock is set.
 */
2855 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2856                                           int bgid, struct io_buffer *kbuf,
2857                                           bool needs_lock)
2858 {
2859         struct io_buffer *head;
2860
2861         if (req->flags & REQ_F_BUFFER_SELECTED)
2862                 return kbuf;
2863
2864         io_ring_submit_lock(req->ctx, needs_lock);
2865
2866         lockdep_assert_held(&req->ctx->uring_lock);
2867
2868         head = idr_find(&req->ctx->io_buffer_idr, bgid);
2869         if (head) {
                     /*
                      * Take the last entry of the group's list; if the list is empty
                      * the head itself is used and the group is removed from the idr.
                      */
2870                 if (!list_empty(&head->list)) {
2871                         kbuf = list_last_entry(&head->list, struct io_buffer,
2872                                                         list);
2873                         list_del(&kbuf->list);
2874                 } else {
2875                         kbuf = head;
2876                         idr_remove(&req->ctx->io_buffer_idr, bgid);
2877                 }
2878                 if (*len > kbuf->len)
2879                         *len = kbuf->len;
2880         } else {
2881                 kbuf = ERR_PTR(-ENOBUFS);
2882         }
2883
2884         io_ring_submit_unlock(req->ctx, needs_lock);
2885
2886         return kbuf;
2887 }
2888
/*
 * Select a buffer for a read/write request from the group given by
 * req->buf_index and return its user address, or the error pointer from
 * io_buffer_select(). On success req->rw.addr is overwritten with the
 * io_buffer pointer and REQ_F_BUFFER_SELECTED is set; *len may have been
 * reduced by io_buffer_select().
 */
2889 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2890                                         bool needs_lock)
2891 {
2892         struct io_buffer *kbuf;
2893         u16 bgid;
2894
             /* only meaningful if a buffer was selected before, see io_buffer_select() */
2895         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2896         bgid = req->buf_index;
2897         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2898         if (IS_ERR(kbuf))
2899                 return kbuf;
2900         req->rw.addr = (u64) (unsigned long) kbuf;
2901         req->flags |= REQ_F_BUFFER_SELECTED;
2902         return u64_to_user_ptr(kbuf->addr);
2903 }
2904
2905 #ifdef CONFIG_COMPAT
/*
 * Compat variant of __io_iov_buffer_select(): read the length from the
 * single compat_iovec at req->rw.addr, select a buffer and fill in iov[0].
 *
 * Returns 0 on success, -EFAULT if the user iovec can't be read, -EINVAL
 * for a negative length, or the error from io_rw_buffer_select().
 */
2906 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2907                                 bool needs_lock)
2908 {
2909         struct compat_iovec __user *uiov;
2910         compat_ssize_t clen;
2911         void __user *buf;
2912         ssize_t len;
2913
2914         uiov = u64_to_user_ptr(req->rw.addr);
2915         if (!access_ok(uiov, sizeof(*uiov)))
2916                 return -EFAULT;
             /* only iov_len is read; the base comes from the selected buffer */
2917         if (__get_user(clen, &uiov->iov_len))
2918                 return -EFAULT;
2919         if (clen < 0)
2920                 return -EINVAL;
2921
2922         len = clen;
2923         buf = io_rw_buffer_select(req, &len, needs_lock);
2924         if (IS_ERR(buf))
2925                 return PTR_ERR(buf);
2926         iov[0].iov_base = buf;
2927         iov[0].iov_len = (compat_size_t) len;
2928         return 0;
2929 }
2930 #endif
2931
/*
 * Copy the single user iovec at req->rw.addr, select a buffer for its
 * length and store the buffer address and (possibly reduced) length in
 * iov[0].
 *
 * Returns 0 on success, -EFAULT if the copy from user space fails, -EINVAL
 * for a negative length, or the error from io_rw_buffer_select().
 */
2932 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2933                                       bool needs_lock)
2934 {
2935         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2936         void __user *buf;
2937         ssize_t len;
2938
2939         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2940                 return -EFAULT;
2941
2942         len = iov[0].iov_len;
2943         if (len < 0)
2944                 return -EINVAL;
2945         buf = io_rw_buffer_select(req, &len, needs_lock);
2946         if (IS_ERR(buf))
2947                 return PTR_ERR(buf);
2948         iov[0].iov_base = buf;
2949         iov[0].iov_len = len;
2950         return 0;
2951 }
2952
/*
 * Buffer selection for the vectored case: fill iov[0] from a selected
 * buffer.
 *
 * If a buffer was already selected, it is reused with its full length.
 * Otherwise exactly one iovec is accepted (req->rw.len must be 1, else
 * -EINVAL) and the compat or native helper does the selection.
 */
2953 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2954                                     bool needs_lock)
2955 {
2956         if (req->flags & REQ_F_BUFFER_SELECTED) {
2957                 struct io_buffer *kbuf;
2958
                     /* req->rw.addr holds the io_buffer pointer once selected */
2959                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2960                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2961                 iov[0].iov_len = kbuf->len;
2962                 return 0;
2963         }
2964         if (req->rw.len != 1)
2965                 return -EINVAL;
2966
2967 #ifdef CONFIG_COMPAT
2968         if (req->ctx->compat)
2969                 return io_compat_import(req, iov, needs_lock);
2970 #endif
2971
2972         return __io_iov_buffer_select(req, iov, needs_lock);
2973 }
2974
/*
 * Build the iov_iter for a read/write request according to its opcode:
 * registered buffer (READ_FIXED/WRITE_FIXED), single range (READ/WRITE),
 * a selected buffer, or a user supplied iovec array.
 *
 * On entry *iovec points at the caller's inline iovec storage. It is set
 * to NULL on the fixed, single-range and buffer-select paths; on the
 * vectored path it is handed to __import_iovec(). Returns the result of
 * the underlying import helper, or -EINVAL if a buffer index is given
 * without fixed buffers or buffer select.
 */
2975 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
2976                            struct iov_iter *iter, bool needs_lock)
2977 {
2978         void __user *buf = u64_to_user_ptr(req->rw.addr);
2979         size_t sqe_len = req->rw.len;
2980         u8 opcode = req->opcode;
2981         ssize_t ret;
2982
2983         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2984                 *iovec = NULL;
2985                 return io_import_fixed(req, rw, iter);
2986         }
2987
2988         /* buffer index only valid with fixed read/write, or buffer select */
2989         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2990                 return -EINVAL;
2991
2992         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2993                 if (req->flags & REQ_F_BUFFER_SELECT) {
2994                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2995                         if (IS_ERR(buf))
2996                                 return PTR_ERR(buf);
                             /* remember the length, which buffer selection may have reduced */
2997                         req->rw.len = sqe_len;
2998                 }
2999
3000                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3001                 *iovec = NULL;
3002                 return ret;
3003         }
3004
3005         if (req->flags & REQ_F_BUFFER_SELECT) {
3006                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
                     /* a single segment described by the first inline iovec */
3007                 if (!ret)
3008                         iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
3009                 *iovec = NULL;
3010                 return ret;
3011         }
3012
3013         return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3014                               req->ctx->compat);
3015 }
3016
/*
 * Position pointer to use for this kiocb: NULL for FMODE_STREAM files,
 * otherwise the address of kiocb->ki_pos.
 */
3017 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3018 {
3019         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3020 }
3021
3022 /*
3023  * For files that don't have ->read_iter() and ->write_iter(), handle them
3024  * by looping over ->read() or ->write() manually.
      *
      * Returns the number of bytes transferred, or the error of the first call
      * if nothing was transferred before it failed. IOCB_HIPRI is rejected with
      * -EOPNOTSUPP and IOCB_NOWAIT with -EAGAIN.
3025  */
3026 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3027 {
3028         struct kiocb *kiocb = &req->rw.kiocb;
3029         struct file *file = req->file;
3030         ssize_t ret = 0;
3031
3032         /*
3033          * Don't support polled IO through this interface, and we can't
3034          * support non-blocking either. For the latter, this just causes
3035          * the kiocb to be handled from an async context.
3036          */
3037         if (kiocb->ki_flags & IOCB_HIPRI)
3038                 return -EOPNOTSUPP;
3039         if (kiocb->ki_flags & IOCB_NOWAIT)
3040                 return -EAGAIN;
3041
3042         while (iov_iter_count(iter)) {
3043                 struct iovec iovec;
3044                 ssize_t nr;
3045
                     /*
                      * For a bvec iterator the user range is taken from
                      * req->rw.addr/len, which are advanced below as data is
                      * transferred.
                      */
3046                 if (!iov_iter_is_bvec(iter)) {
3047                         iovec = iov_iter_iovec(iter);
3048                 } else {
3049                         iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3050                         iovec.iov_len = req->rw.len;
3051                 }
3052
3053                 if (rw == READ) {
3054                         nr = file->f_op->read(file, iovec.iov_base,
3055                                               iovec.iov_len, io_kiocb_ppos(kiocb));
3056                 } else {
3057                         nr = file->f_op->write(file, iovec.iov_base,
3058                                                iovec.iov_len, io_kiocb_ppos(kiocb));
3059                 }
3060
                     /* keep the byte count if an earlier iteration made progress */
3061                 if (nr < 0) {
3062                         if (!ret)
3063                                 ret = nr;
3064                         break;
3065                 }
3066                 ret += nr;
                     /* short transfer: stop and report what we have */
3067                 if (nr != iovec.iov_len)
3068                         break;
3069                 req->rw.len -= nr;
3070                 req->rw.addr += nr;
3071                 iov_iter_advance(iter, nr);
3072         }
3073
3074         return ret;
3075 }
3076
/*
 * Copy the iterator state of a read/write into req->async_data.
 *
 * iovec is the separately allocated iovec array, or NULL if the data lives
 * in fast_iov, the caller's inline array. The inline entries still in use
 * are copied into rw->fast_iov and the saved iterator is pointed at that
 * copy; an allocated iovec is stored in rw->free_iovec and
 * REQ_F_NEED_CLEANUP is set.
 */
3077 static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3078                           const struct iovec *fast_iov, struct iov_iter *iter)
3079 {
3080         struct io_async_rw *rw = req->async_data;
3081
3082         memcpy(&rw->iter, iter, sizeof(*iter));
3083         rw->free_iovec = iovec;
3084         rw->bytes_done = 0;
3085         /* can only be fixed buffers, no need to do anything */
3086         if (iov_iter_is_bvec(iter))
3087                 return;
3088         if (!iovec) {
3089                 unsigned iov_off = 0;
3090
3091                 rw->iter.iov = rw->fast_iov;
                     /* the iterator has moved past the first entry: keep the same index */
3092                 if (iter->iov != fast_iov) {
3093                         iov_off = iter->iov - fast_iov;
3094                         rw->iter.iov += iov_off;
3095                 }
                     /* nothing to copy if the caller's array already is rw->fast_iov */
3096                 if (rw->fast_iov != fast_iov)
3097                         memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3098                                sizeof(struct iovec) * iter->nr_segs);
3099         } else {
3100                 req->flags |= REQ_F_NEED_CLEANUP;
3101         }
3102 }
3103
/*
 * Allocate req->async_data with the size given by the opcode's
 * io_op_defs[] entry (warns once if that size is zero). Returns 0 on
 * success and 1 if the allocation failed. The memory is not zeroed.
 */
3104 static inline int __io_alloc_async_data(struct io_kiocb *req)
3105 {
3106         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3107         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3108         return req->async_data == NULL;
3109 }
3110
/*
 * Allocate async data only for opcodes whose io_op_defs[] entry has
 * needs_async_data set. Returns 0 if nothing was needed or the allocation
 * succeeded, non-zero if it failed.
 */
3111 static int io_alloc_async_data(struct io_kiocb *req)
3112 {
3113         if (!io_op_defs[req->opcode].needs_async_data)
3114                 return 0;
3115
3116         return  __io_alloc_async_data(req);
3117 }
3118
/*
 * Make sure a read/write request has async data holding its iterator.
 *
 * Nothing is done unless force is set or the opcode needs async data, and
 * already existing async data is left untouched. Returns 0 on success or
 * -ENOMEM if the allocation fails, in which case iovec is freed here.
 */
3119 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3120                              const struct iovec *fast_iov,
3121                              struct iov_iter *iter, bool force)
3122 {
3123         if (!force && !io_op_defs[req->opcode].needs_async_data)
3124                 return 0;
3125         if (!req->async_data) {
3126                 if (__io_alloc_async_data(req)) {
3127                         kfree(iovec);
3128                         return -ENOMEM;
3129                 }
3130
3131                 io_req_map_rw(req, iovec, fast_iov, iter);
3132         }
3133         return 0;
3134 }
3135
/*
 * Import the request's iovec straight into its async data. req->async_data
 * must already be allocated; it is dereferenced without a check. Returns 0
 * on success or the negative error from io_import_iovec().
 */
3136 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3137 {
3138         struct io_async_rw *iorw = req->async_data;
3139         struct iovec *iov = iorw->fast_iov;
3140         int ret;
3141
3142         ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3143         if (unlikely(ret < 0))
3144                 return ret;
3145
3146         iorw->bytes_done = 0;
             /* a non-NULL iov after the import is stored for later cleanup */
3147         iorw->free_iovec = iov;
3148         if (iov)
3149                 req->flags |= REQ_F_NEED_CLEANUP;
3150         return 0;
3151 }
3152
/*
 * Prepare a read request: fails with -EBADF if the file was not opened
 * for reading, otherwise returns the result of io_prep_rw().
 */
3153 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3154 {
3155         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3156                 return -EBADF;
3157         return io_prep_rw(req, sqe);
3158 }
3159
3160 /*
3161  * This is our waitqueue callback handler, registered through lock_page_async()
3162  * when we initially tried to do the IO with the iocb armed our waitqueue.
3163  * This gets called when the page is unlocked, and we generally expect that to
3164  * happen when the page IO is completed and the page is now uptodate. This will
3165  * queue a task_work based retry of the operation, attempting to copy the data
3166  * again. If the latter fails because the page was NOT uptodate, then we will
3167  * do a thread based blocking retry of the operation. That's the unexpected
3168  * slow path.
      *
      * Returns 0 if the wakeup key does not match this waiter, and 1 once the
      * waiter has been removed and the retry queued.
3169  */
3170 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3171                              int sync, void *arg)
3172 {
3173         struct wait_page_queue *wpq;
3174         struct io_kiocb *req = wait->private;
3175         struct wait_page_key *key = arg;
3176
3177         wpq = container_of(wait, struct wait_page_queue, wait);
3178
3179         if (!wake_page_match(wpq, key))
3180                 return 0;
3181
             /* disarm: clear IOCB_WAITQ and take the entry off the wait queue */
3182         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3183         list_del_init(&wait->entry);
3184
3185         /* submit ref gets dropped, acquire a new one */
3186         refcount_inc(&req->refs);
3187         io_req_task_queue(req);
3188         return 1;
3189 }
3190
3191 /*
3192  * This controls whether a given IO request should be armed for async page
3193  * based retry. If we return false here, the request is handed to the async
3194  * worker threads for retry. If we're doing buffered reads on a regular file,
3195  * we prepare a private wait_page_queue entry and retry the operation. This
3196  * will either succeed because the page is now uptodate and unlocked, or it
3197  * will register a callback when the page is unlocked at IO completion. Through
3198  * that callback, io_uring uses task_work to setup a retry of the operation.
3199  * That retry will attempt the buffered read again. The retry will generally
3200  * succeed, or in rare cases where it fails, we then fall back to using the
3201  * async worker threads for a blocking retry.
      *
      * Returns true if the wait entry in the request's async data was armed
      * (IOCB_WAITQ set, IOCB_NOWAIT cleared), false if no retry is attempted.
      * req->async_data must be valid when this is called.
3202  */
3203 static bool io_rw_should_retry(struct io_kiocb *req)
3204 {
3205         struct io_async_rw *rw = req->async_data;
3206         struct wait_page_queue *wait = &rw->wpq;
3207         struct kiocb *kiocb = &req->rw.kiocb;
3208
3209         /* never retry for NOWAIT, we just complete with -EAGAIN */
3210         if (req->flags & REQ_F_NOWAIT)
3211                 return false;
3212
3213         /* Only for buffered IO */
3214         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3215                 return false;
3216
3217         /*
3218          * just use poll if we can, and don't attempt if the fs doesn't
3219          * support callback based unlocks
3220          */
3221         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3222                 return false;
3223
             /* arm the wait entry; io_async_buf_func() gets the request via ->private */
3224         wait->wait.func = io_async_buf_func;
3225         wait->wait.private = req;
3226         wait->wait.flags = 0;
3227         INIT_LIST_HEAD(&wait->wait.entry);
3228         kiocb->ki_flags |= IOCB_WAITQ;
3229         kiocb->ki_flags &= ~IOCB_NOWAIT;
3230         kiocb->ki_waitq = wait;
3231         return true;
3232 }
3233
/*
 * Do the actual read: through ->read_iter if the file has it, else by
 * looping over ->read via loop_rw_iter(). Returns -EINVAL if the file has
 * neither method.
 */
3234 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3235 {
3236         if (req->file->f_op->read_iter)
3237                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3238         else if (req->file->f_op->read)
3239                 return loop_rw_iter(READ, req, iter);
3240         else
3241                 return -EINVAL;
3242 }
3243
/*
 * Issue a read request.
 *
 * With IO_URING_F_NONBLOCK in issue_flags the read is attempted with
 * IOCB_NOWAIT set. If it can't complete that way, the iterator state is
 * saved in req->async_data and either a page-unlock based retry is armed or
 * -EAGAIN is returned.
 *
 * Returns 0 once the request has been completed here or is in flight,
 * -EAGAIN to ask the caller for a retry (presumably from async context; the
 * caller is outside this chunk), or a negative error from
 * io_import_iovec(), rw_verify_area() or io_setup_async_rw().
 */
3244 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3245 {
3246         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3247         struct kiocb *kiocb = &req->rw.kiocb;
3248         struct iov_iter __iter, *iter = &__iter;
3249         struct io_async_rw *rw = req->async_data;
3250         ssize_t io_size, ret, ret2;
3251         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3252
             /* async data already exists: use its iterator instead of importing */
3253         if (rw) {
3254                 iter = &rw->iter;
3255                 iovec = NULL;
3256         } else {
3257                 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3258                 if (ret < 0)
3259                         return ret;
3260         }
3261         io_size = iov_iter_count(iter);
             /* expected byte count; the completion handlers compare the result to it */
3262         req->result = io_size;
3263
3264         /* Ensure we clear previously set non-block flag */
3265         if (!force_nonblock)
3266                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3267         else
3268                 kiocb->ki_flags |= IOCB_NOWAIT;
3269
3270         /* If the file doesn't support async, just async punt */
3271         if (force_nonblock && !io_file_supports_async(req->file, READ)) {
3272                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3273                 return ret ?: -EAGAIN;
3274         }
3275
3276         ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
3277         if (unlikely(ret)) {
3278                 kfree(iovec);
3279                 return ret;
3280         }
3281
3282         ret = io_iter_do_read(req, iter);
3283
3284         if (ret == -EIOCBQUEUED) {
3285                 goto out_free;
3286         } else if (ret == -EAGAIN) {
3287                 /* IOPOLL retry should happen for io-wq threads */
3288                 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3289                         goto done;
3290                 /* no retry on NONBLOCK nor RWF_NOWAIT */
3291                 if (req->flags & REQ_F_NOWAIT)
3292                         goto done;
3293                 /* some cases will consume bytes even on error returns */
3294                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
                     /* count nothing as done, so the retry loop below starts from zero */
3295                 ret = 0;
3296         } else if (ret <= 0 || ret == io_size || !force_nonblock ||
3297                    (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
3298                 /* read all, failed, already did sync or don't want to retry */
3299                 goto done;
3300         }
3301
             /* save the iterator state in async data so the read can be retried */
3302         ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3303         if (ret2)
3304                 return ret2;
3305
             /* iovec was handed to io_setup_async_rw(); don't free it at out_free */
3306         iovec = NULL;
3307         rw = req->async_data;
3308         /* now use our persistent iterator, if we aren't already */
3309         iter = &rw->iter;
3310
3311         do {
                     /* account for what the previous attempt transferred */
3312                 io_size -= ret;
3313                 rw->bytes_done += ret;
3314                 /* if we can retry, do so with the callbacks armed */
3315                 if (!io_rw_should_retry(req)) {
3316                         kiocb->ki_flags &= ~IOCB_WAITQ;
3317                         return -EAGAIN;
3318                 }
3319
3320                 /*
3321                  * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3322                  * we get -EIOCBQUEUED, then we'll get a notification when the
3323                  * desired page gets unlocked. We can also get a partial read
3324                  * here, and if we do, then just retry at the new offset.
3325                  */
3326                 ret = io_iter_do_read(req, iter);
3327                 if (ret == -EIOCBQUEUED)
3328                         return 0;
3329                 /* we got some bytes, but not all. retry. */
3330         } while (ret > 0 && ret < io_size);
3331 done:
3332         kiocb_done(kiocb, ret, issue_flags);
3333 out_free:
3334         /* it's faster to check here than delegate to kfree */
3335         if (iovec)
3336                 kfree(iovec);
3337         return 0;
3338 }
3339
3340 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3341 {
3342         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3343                 return -EBADF;
3344         return io_prep_rw(req, sqe);
3345 }
3346
/*
 * Issue a write request.
 *
 * @req:         the request; req->rw.kiocb carries the kiocb and
 *               req->async_data, when set, holds previously saved iterator
 *               state
 * @issue_flags: IO_URING_F_* flags; IO_URING_F_NONBLOCK selects a nonblocking
 *               attempt
 *
 * Return: 0 once the result has been handed to kiocb_done(); -EAGAIN after
 * the iterator state has been stashed with io_setup_async_rw() so the write
 * can be retried; otherwise a negative error from io_import_iovec(),
 * rw_verify_area() or io_setup_async_rw().
 */
static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        ssize_t ret, ret2, io_size;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        if (rw) {
                /* saved state exists: use its iterator, nothing to free here */
                iter = &rw->iter;
                iovec = NULL;
        } else {
                ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
                if (ret < 0)
                        return ret;
        }
        /* remember the full request size; used below to undo partial consumption */
        io_size = iov_iter_count(iter);
        req->result = io_size;

        /* Ensure we clear previously set non-block flag */
        if (!force_nonblock)
                kiocb->ki_flags &= ~IOCB_NOWAIT;
        else
                kiocb->ki_flags |= IOCB_NOWAIT;

        /* If the file doesn't support async, just async punt */
        if (force_nonblock && !io_file_supports_async(req->file, WRITE))
                goto copy_iov;

        /* file path doesn't support NOWAIT for non-direct_IO */
        if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
            (req->flags & REQ_F_ISREG))
                goto copy_iov;

        /* a non-zero result here is returned as-is after freeing the iovec */
        ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret))
                goto out_free;

        /*
         * Open-code file_start_write here to grab freeze protection,
         * which will be released by another thread in
         * io_complete_rw().  Fool lockdep by telling it the lock got
         * released so that it doesn't complain about the held lock when
         * we return to userspace.
         */
        if (req->flags & REQ_F_ISREG) {
                sb_start_write(file_inode(req->file)->i_sb);
                __sb_writers_release(file_inode(req->file)->i_sb,
                                        SB_FREEZE_WRITE);
        }
        kiocb->ki_flags |= IOCB_WRITE;

        /* prefer ->write_iter, fall back to looping over ->write */
        if (req->file->f_op->write_iter)
                ret2 = call_write_iter(req->file, kiocb, iter);
        else if (req->file->f_op->write)
                ret2 = loop_rw_iter(WRITE, req, iter);
        else
                ret2 = -EINVAL;

        /*
         * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
         * retry them without IOCB_NOWAIT.
         */
        if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
                ret2 = -EAGAIN;
        /* no retry on NONBLOCK nor RWF_NOWAIT */
        if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
                goto done;
        if (!force_nonblock || ret2 != -EAGAIN) {
                /* IOPOLL retry should happen for io-wq threads */
                if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
                        goto copy_iov;
done:
                /* ret is 0 here (rw_verify_area() passed), so 0 is returned below */
                kiocb_done(kiocb, ret2, issue_flags);
        } else {
copy_iov:
                /* some cases will consume bytes even on error returns */
                iov_iter_revert(iter, io_size - iov_iter_count(iter));
                /* the iovec is handed to io_setup_async_rw(); it is not freed here */
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
                return ret ?: -EAGAIN;
        }
out_free:
        /* it's reportedly faster than delegating the null check to kfree() */
        if (iovec)
                kfree(iovec);
        return ret;
}
3435
3436 static int io_renameat_prep(struct io_kiocb *req,
3437                             const struct io_uring_sqe *sqe)
3438 {
3439         struct io_rename *ren = &req->rename;
3440         const char __user *oldf, *newf;
3441
3442         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3443                 return -EBADF;
3444
3445         ren->old_dfd = READ_ONCE(sqe->fd);
3446         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3447         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3448         ren->new_dfd = READ_ONCE(sqe->len);
3449         ren->flags = READ_ONCE(sqe->rename_flags);
3450
3451         ren->oldpath = getname(oldf);
3452         if (IS_ERR(ren->oldpath))
3453                 return PTR_ERR(ren->oldpath);
3454
3455         ren->newpath = getname(newf);
3456         if (IS_ERR(ren->newpath)) {
3457                 putname(ren->oldpath);
3458                 return PTR_ERR(ren->newpath);
3459         }
3460
3461         req->flags |= REQ_F_NEED_CLEANUP;
3462         return 0;
3463 }
3464
3465 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3466 {
3467         struct io_rename *ren = &req->rename;
3468         int ret;
3469
3470         if (issue_flags & IO_URING_F_NONBLOCK)
3471                 return -EAGAIN;
3472
3473         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3474                                 ren->newpath, ren->flags);
3475
3476         req->flags &= ~REQ_F_NEED_CLEANUP;
3477         if (ret < 0)
3478                 req_set_fail_links(req);
3479         io_req_complete(req, ret);
3480         return 0;
3481 }
3482
3483 static int io_unlinkat_prep(struct io_kiocb *req,
3484                             const struct io_uring_sqe *sqe)
3485 {
3486         struct io_unlink *un = &req->unlink;
3487         const char __user *fname;
3488
3489         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3490                 return -EBADF;
3491
3492         un->dfd = READ_ONCE(sqe->fd);
3493
3494         un->flags = READ_ONCE(sqe->unlink_flags);
3495         if (un->flags & ~AT_REMOVEDIR)
3496                 return -EINVAL;
3497
3498         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3499         un->filename = getname(fname);
3500         if (IS_ERR(un->filename))
3501                 return PTR_ERR(un->filename);
3502
3503         req->flags |= REQ_F_NEED_CLEANUP;
3504         return 0;
3505 }
3506
3507 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3508 {
3509         struct io_unlink *un = &req->unlink;
3510         int ret;
3511
3512         if (issue_flags & IO_URING_F_NONBLOCK)
3513                 return -EAGAIN;
3514
3515         if (un->flags & AT_REMOVEDIR)
3516                 ret = do_rmdir(un->dfd, un->filename);
3517         else
3518                 ret = do_unlinkat(un->dfd, un->filename);
3519
3520         req->flags &= ~REQ_F_NEED_CLEANUP;
3521         if (ret < 0)
3522                 req_set_fail_links(req);
3523         io_req_complete(req, ret);
3524         return 0;
3525 }
3526
3527 static int io_shutdown_prep(struct io_kiocb *req,
3528                             const struct io_uring_sqe *sqe)
3529 {
3530 #if defined(CONFIG_NET)
3531         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3532                 return -EINVAL;
3533         if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3534             sqe->buf_index)
3535                 return -EINVAL;
3536
3537         req->shutdown.how = READ_ONCE(sqe->len);
3538         return 0;
3539 #else
3540         return -EOPNOTSUPP;
3541 #endif
3542 }
3543
3544 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3545 {
3546 #if defined(CONFIG_NET)
3547         struct socket *sock;
3548         int ret;
3549
3550         if (issue_flags & IO_URING_F_NONBLOCK)
3551                 return -EAGAIN;
3552
3553         sock = sock_from_file(req->file);
3554         if (unlikely(!sock))
3555                 return -ENOTSOCK;
3556
3557         ret = __sys_shutdown_sock(sock, req->shutdown.how);
3558         if (ret < 0)
3559                 req_set_fail_links(req);
3560         io_req_complete(req, ret);
3561         return 0;
3562 #else
3563         return -EOPNOTSUPP;
3564 #endif
3565 }
3566
3567 static int __io_splice_prep(struct io_kiocb *req,
3568                             const struct io_uring_sqe *sqe)
3569 {
3570         struct io_splice* sp = &req->splice;
3571         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3572
3573         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3574                 return -EINVAL;
3575
3576         sp->file_in = NULL;
3577         sp->len = READ_ONCE(sqe->len);
3578         sp->flags = READ_ONCE(sqe->splice_flags);
3579
3580         if (unlikely(sp->flags & ~valid_flags))
3581                 return -EINVAL;
3582
3583         sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3584                                   (sp->flags & SPLICE_F_FD_IN_FIXED));
3585         if (!sp->file_in)
3586                 return -EBADF;
3587         req->flags |= REQ_F_NEED_CLEANUP;
3588
3589         if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3590                 /*
3591                  * Splice operation will be punted aync, and here need to
3592                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
3593                  */
3594                 io_req_init_async(req);
3595                 req->work.flags |= IO_WQ_WORK_UNBOUND;
3596         }
3597
3598         return 0;
3599 }
3600
3601 static int io_tee_prep(struct io_kiocb *req,
3602                        const struct io_uring_sqe *sqe)
3603 {
3604         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3605                 return -EINVAL;
3606         return __io_splice_prep(req, sqe);
3607 }
3608
3609 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3610 {
3611         struct io_splice *sp = &req->splice;
3612         struct file *in = sp->file_in;
3613         struct file *out = sp->file_out;
3614         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3615         long ret = 0;
3616
3617         if (issue_flags & IO_URING_F_NONBLOCK)
3618                 return -EAGAIN;
3619         if (sp->len)
3620                 ret = do_tee(in, out, sp->len, flags);
3621
3622         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3623         req->flags &= ~REQ_F_NEED_CLEANUP;
3624
3625         if (ret != sp->len)
3626                 req_set_fail_links(req);
3627         io_req_complete(req, ret);
3628         return 0;
3629 }
3630
3631 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3632 {
3633         struct io_splice* sp = &req->splice;
3634
3635         sp->off_in = READ_ONCE(sqe->splice_off_in);
3636         sp->off_out = READ_ONCE(sqe->off);
3637         return __io_splice_prep(req, sqe);
3638 }
3639
3640 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
3641 {
3642         struct io_splice *sp = &req->splice;
3643         struct file *in = sp->file_in;
3644         struct file *out = sp->file_out;
3645         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3646         loff_t *poff_in, *poff_out;
3647         long ret = 0;
3648
3649         if (issue_flags & IO_URING_F_NONBLOCK)
3650                 return -EAGAIN;
3651
3652         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3653         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3654
3655         if (sp->len)
3656                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3657
3658         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3659         req->flags &= ~REQ_F_NEED_CLEANUP;
3660
3661         if (ret != sp->len)
3662                 req_set_fail_links(req);
3663         io_req_complete(req, ret);
3664         return 0;
3665 }
3666
3667 /*
3668  * IORING_OP_NOP just posts a completion event, nothing else.
3669  */
3670 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
3671 {
3672         struct io_ring_ctx *ctx = req->ctx;
3673
3674         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3675                 return -EINVAL;
3676
3677         __io_req_complete(req, issue_flags, 0, 0);
3678         return 0;
3679 }
3680
3681 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3682 {
3683         struct io_ring_ctx *ctx = req->ctx;
3684
3685         if (!req->file)
3686                 return -EBADF;
3687
3688         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3689                 return -EINVAL;
3690         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3691                 return -EINVAL;
3692
3693         req->sync.flags = READ_ONCE(sqe->fsync_flags);
3694         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3695                 return -EINVAL;
3696
3697         req->sync.off = READ_ONCE(sqe->off);
3698         req->sync.len = READ_ONCE(sqe->len);
3699         return 0;
3700 }
3701
3702 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
3703 {
3704         loff_t end = req->sync.off + req->sync.len;
3705         int ret;
3706
3707         /* fsync always requires a blocking context */
3708         if (issue_flags & IO_URING_F_NONBLOCK)
3709                 return -EAGAIN;
3710
3711         ret = vfs_fsync_range(req->file, req->sync.off,
3712                                 end > 0 ? end : LLONG_MAX,
3713                                 req->sync.flags & IORING_FSYNC_DATASYNC);
3714         if (ret < 0)
3715                 req_set_fail_links(req);
3716         io_req_complete(req, ret);
3717         return 0;
3718 }
3719
3720 static int io_fallocate_prep(struct io_kiocb *req,
3721                              const struct io_uring_sqe *sqe)
3722 {
3723         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3724                 return -EINVAL;
3725         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3726                 return -EINVAL;
3727
3728         req->sync.off = READ_ONCE(sqe->off);
3729         req->sync.len = READ_ONCE(sqe->addr);
3730         req->sync.mode = READ_ONCE(sqe->len);
3731         return 0;
3732 }
3733
3734 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
3735 {
3736         int ret;
3737
3738         /* fallocate always requiring blocking context */
3739         if (issue_flags & IO_URING_F_NONBLOCK)
3740                 return -EAGAIN;
3741         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3742                                 req->sync.len);
3743         if (ret < 0)
3744                 req_set_fail_links(req);
3745         io_req_complete(req, ret);
3746         return 0;
3747 }
3748
3749 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3750 {
3751         const char __user *fname;
3752         int ret;
3753
3754         if (unlikely(sqe->ioprio || sqe->buf_index))
3755                 return -EINVAL;
3756         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3757                 return -EBADF;
3758
3759         /* open.how should be already initialised */
3760         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3761                 req->open.how.flags |= O_LARGEFILE;
3762
3763         req->open.dfd = READ_ONCE(sqe->fd);
3764         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3765         req->open.filename = getname(fname);
3766         if (IS_ERR(req->open.filename)) {
3767                 ret = PTR_ERR(req->open.filename);
3768                 req->open.filename = NULL;
3769                 return ret;
3770         }
3771         req->open.nofile = rlimit(RLIMIT_NOFILE);
3772         req->flags |= REQ_F_NEED_CLEANUP;
3773         return 0;
3774 }
3775
3776 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3777 {
3778         u64 flags, mode;
3779
3780         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3781                 return -EINVAL;
3782         mode = READ_ONCE(sqe->len);
3783         flags = READ_ONCE(sqe->open_flags);
3784         req->open.how = build_open_how(flags, mode);
3785         return __io_openat_prep(req, sqe);
3786 }
3787
3788 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3789 {
3790         struct open_how __user *how;
3791         size_t len;
3792         int ret;
3793
3794         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3795                 return -EINVAL;
3796         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3797         len = READ_ONCE(sqe->len);
3798         if (len < OPEN_HOW_SIZE_VER0)
3799                 return -EINVAL;
3800
3801         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3802                                         len);
3803         if (ret)
3804                 return ret;
3805
3806         return __io_openat_prep(req, sqe);
3807 }
3808
/*
 * Issue handler for openat2 (also used by io_openat()).
 *
 * In nonblocking context the open is attempted with LOOKUP_CACHED and
 * O_NONBLOCK forced on; O_TRUNC/O_CREAT/O_TMPFILE opens are not attempted
 * at all and -EAGAIN is returned for them.
 *
 * Return: -EAGAIN when the request should be retried, otherwise 0 after the
 * result (new fd or negative error) has been posted via io_req_complete().
 */
static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
        struct open_flags op;
        struct file *file;
        bool nonblock_set;
        bool resolve_nonblock;
        int ret;

        ret = build_open_flags(&req->open.how, &op);
        if (ret)
                goto err;
        /* remember what the application itself asked for before we add flags */
        nonblock_set = op.open_flag & O_NONBLOCK;
        resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
        if (issue_flags & IO_URING_F_NONBLOCK) {
                /*
                 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
                 * it'll always -EAGAIN
                 */
                if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
                        return -EAGAIN;
                op.lookup_flags |= LOOKUP_CACHED;
                op.open_flag |= O_NONBLOCK;
        }

        /* on success ret holds the reserved fd number from here on */
        ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
        if (ret < 0)
                goto err;

        file = do_filp_open(req->open.dfd, req->open.filename, &op);
        /* only retry if RESOLVE_CACHED wasn't already set by application */
        if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
            file == ERR_PTR(-EAGAIN)) {
                /*
                 * We could hang on to this 'fd', but seems like marginal
                 * gain for something that is now known to be a slower path.
                 * So just put it, and we'll get a new one when we retry.
                 */
                put_unused_fd(ret);
                return -EAGAIN;
        }

        if (IS_ERR(file)) {
                /* give the reserved fd back and report the open error */
                put_unused_fd(ret);
                ret = PTR_ERR(file);
        } else {
                /* undo the O_NONBLOCK we added unless the application set it too */
                if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
                        file->f_flags &= ~O_NONBLOCK;
                fsnotify_open(file);
                fd_install(ret, file);
        }
err:
        /* the name is released here, so the cleanup flag is cleared as well */
        putname(req->open.filename);
        req->flags &= ~REQ_F_NEED_CLEANUP;
        if (ret < 0)
                req_set_fail_links(req);
        io_req_complete(req, ret);
        return 0;
}
3867
3868 static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
3869 {
3870         return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK);
3871 }
3872
3873 static int io_remove_buffers_prep(struct io_kiocb *req,
3874                                   const struct io_uring_sqe *sqe)
3875 {
3876         struct io_provide_buf *p = &req->pbuf;
3877         u64 tmp;
3878
3879         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3880                 return -EINVAL;
3881
3882         tmp = READ_ONCE(sqe->fd);
3883         if (!tmp || tmp > USHRT_MAX)
3884                 return -EINVAL;
3885
3886         memset(p, 0, sizeof(*p));
3887         p->nbufs = tmp;
3888         p->bgid = READ_ONCE(sqe->buf_group);
3889         return 0;
3890 }
3891
3892 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3893                                int bgid, unsigned nbufs)
3894 {
3895         unsigned i = 0;
3896
3897         /* shouldn't happen */
3898         if (!nbufs)
3899                 return 0;
3900
3901         /* the head kbuf is the list itself */
3902         while (!list_empty(&buf->list)) {
3903                 struct io_buffer *nxt;
3904
3905                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3906                 list_del(&nxt->list);
3907                 kfree(nxt);
3908                 if (++i == nbufs)
3909                         return i;
3910         }
3911         i++;
3912         kfree(buf);
3913         idr_remove(&ctx->io_buffer_idr, bgid);
3914
3915         return i;
3916 }
3917
3918 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
3919 {
3920         struct io_provide_buf *p = &req->pbuf;
3921         struct io_ring_ctx *ctx = req->ctx;
3922         struct io_buffer *head;
3923         int ret = 0;
3924         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3925
3926         io_ring_submit_lock(ctx, !force_nonblock);
3927
3928         lockdep_assert_held(&ctx->uring_lock);
3929
3930         ret = -ENOENT;
3931         head = idr_find(&ctx->io_buffer_idr, p->bgid);
3932         if (head)
3933                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3934         if (ret < 0)
3935                 req_set_fail_links(req);
3936
3937         /* need to hold the lock to complete IOPOLL requests */
3938         if (ctx->flags & IORING_SETUP_IOPOLL) {
3939                 __io_req_complete(req, issue_flags, ret, 0);
3940                 io_ring_submit_unlock(ctx, !force_nonblock);
3941         } else {
3942                 io_ring_submit_unlock(ctx, !force_nonblock);
3943                 __io_req_complete(req, issue_flags, ret, 0);
3944         }
3945         return 0;
3946 }
3947
3948 static int io_provide_buffers_prep(struct io_kiocb *req,
3949                                    const struct io_uring_sqe *sqe)
3950 {
3951         struct io_provide_buf *p = &req->pbuf;
3952         u64 tmp;
3953
3954         if (sqe->ioprio || sqe->rw_flags)
3955                 return -EINVAL;
3956
3957         tmp = READ_ONCE(sqe->fd);
3958         if (!tmp || tmp > USHRT_MAX)
3959                 return -E2BIG;
3960         p->nbufs = tmp;
3961         p->addr = READ_ONCE(sqe->addr);
3962         p->len = READ_ONCE(sqe->len);
3963
3964         if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3965                 return -EFAULT;
3966
3967         p->bgid = READ_ONCE(sqe->buf_group);
3968         tmp = READ_ONCE(sqe->off);
3969         if (tmp > USHRT_MAX)
3970                 return -E2BIG;
3971         p->bid = tmp;
3972         return 0;
3973 }
3974
3975 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3976 {
3977         struct io_buffer *buf;
3978         u64 addr = pbuf->addr;
3979         int i, bid = pbuf->bid;
3980
3981         for (i = 0; i < pbuf->nbufs; i++) {
3982                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3983                 if (!buf)
3984                         break;
3985
3986                 buf->addr = addr;
3987                 buf->len = pbuf->len;
3988                 buf->bid = bid;
3989                 addr += pbuf->len;
3990                 bid++;
3991                 if (!*head) {
3992                         INIT_LIST_HEAD(&buf->list);
3993                         *head = buf;
3994                 } else {
3995                         list_add_tail(&buf->list, &(*head)->list);
3996                 }
3997         }
3998
3999         return i ? i : -ENOMEM;
4000 }
4001
4002 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4003 {
4004         struct io_provide_buf *p = &req->pbuf;
4005         struct io_ring_ctx *ctx = req->ctx;
4006         struct io_buffer *head, *list;
4007         int ret = 0;
4008         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4009
4010         io_ring_submit_lock(ctx, !force_nonblock);
4011
4012         lockdep_assert_held(&ctx->uring_lock);
4013
4014         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
4015
4016         ret = io_add_buffers(p, &head);
4017         if (ret < 0)
4018                 goto out;
4019
4020         if (!list) {
4021                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
4022                                         GFP_KERNEL);
4023                 if (ret < 0) {
4024                         __io_remove_buffers(ctx, head, p->bgid, -1U);
4025                         goto out;
4026                 }
4027         }
4028 out:
4029         if (ret < 0)
4030                 req_set_fail_links(req);
4031
4032         /* need to hold the lock to complete IOPOLL requests */
4033         if (ctx->flags & IORING_SETUP_IOPOLL) {
4034                 __io_req_complete(req, issue_flags, ret, 0);
4035                 io_ring_submit_unlock(ctx, !force_nonblock);
4036         } else {
4037                 io_ring_submit_unlock(ctx, !force_nonblock);
4038                 __io_req_complete(req, issue_flags, ret, 0);
4039         }
4040         return 0;
4041 }
4042
4043 static int io_epoll_ctl_prep(struct io_kiocb *req,
4044                              const struct io_uring_sqe *sqe)
4045 {
4046 #if defined(CONFIG_EPOLL)
4047         if (sqe->ioprio || sqe->buf_index)
4048                 return -EINVAL;
4049         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4050                 return -EINVAL;
4051
4052         req->epoll.epfd = READ_ONCE(sqe->fd);
4053         req->epoll.op = READ_ONCE(sqe->len);
4054         req->epoll.fd = READ_ONCE(sqe->off);
4055
4056         if (ep_op_has_event(req->epoll.op)) {
4057                 struct epoll_event __user *ev;
4058
4059                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4060                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4061                         return -EFAULT;
4062         }
4063
4064         return 0;
4065 #else
4066         return -EOPNOTSUPP;
4067 #endif
4068 }
4069
4070 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4071 {
4072 #if defined(CONFIG_EPOLL)
4073         struct io_epoll *ie = &req->epoll;
4074         int ret;
4075         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4076
4077         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4078         if (force_nonblock && ret == -EAGAIN)
4079                 return -EAGAIN;
4080
4081         if (ret < 0)
4082                 req_set_fail_links(req);
4083         __io_req_complete(req, issue_flags, ret, 0);
4084         return 0;
4085 #else
4086         return -EOPNOTSUPP;
4087 #endif
4088 }
4089
4090 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4091 {
4092 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4093         if (sqe->ioprio || sqe->buf_index || sqe->off)
4094                 return -EINVAL;
4095         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4096                 return -EINVAL;
4097
4098         req->madvise.addr = READ_ONCE(sqe->addr);
4099         req->madvise.len = READ_ONCE(sqe->len);
4100         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4101         return 0;
4102 #else
4103         return -EOPNOTSUPP;
4104 #endif
4105 }
4106
4107 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4108 {
4109 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4110         struct io_madvise *ma = &req->madvise;
4111         int ret;
4112
4113         if (issue_flags & IO_URING_F_NONBLOCK)
4114                 return -EAGAIN;
4115
4116         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4117         if (ret < 0)
4118                 req_set_fail_links(req);
4119         io_req_complete(req, ret);
4120         return 0;
4121 #else
4122         return -EOPNOTSUPP;
4123 #endif
4124 }
4125
4126 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4127 {
4128         if (sqe->ioprio || sqe->buf_index || sqe->addr)
4129                 return -EINVAL;
4130         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4131                 return -EINVAL;
4132
4133         req->fadvise.offset = READ_ONCE(sqe->off);
4134         req->fadvise.len = READ_ONCE(sqe->len);
4135         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4136         return 0;
4137 }
4138
4139 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4140 {
4141         struct io_fadvise *fa = &req->fadvise;
4142         int ret;
4143
4144         if (issue_flags & IO_URING_F_NONBLOCK) {
4145                 switch (fa->advice) {
4146                 case POSIX_FADV_NORMAL:
4147                 case POSIX_FADV_RANDOM:
4148                 case POSIX_FADV_SEQUENTIAL:
4149                         break;
4150                 default:
4151                         return -EAGAIN;
4152                 }
4153         }
4154
4155         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4156         if (ret < 0)
4157                 req_set_fail_links(req);
4158         io_req_complete(req, ret);
4159         return 0;
4160 }
4161
4162 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4163 {
4164         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4165                 return -EINVAL;
4166         if (sqe->ioprio || sqe->buf_index)
4167                 return -EINVAL;
4168         if (req->flags & REQ_F_FIXED_FILE)
4169                 return -EBADF;
4170
4171         req->statx.dfd = READ_ONCE(sqe->fd);
4172         req->statx.mask = READ_ONCE(sqe->len);
4173         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4174         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4175         req->statx.flags = READ_ONCE(sqe->statx_flags);
4176
4177         return 0;
4178 }
4179
4180 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4181 {
4182         struct io_statx *ctx = &req->statx;
4183         int ret;
4184
4185         if (issue_flags & IO_URING_F_NONBLOCK) {
4186                 /* only need file table for an actual valid fd */
4187                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
4188                         req->flags |= REQ_F_NO_FILE_TABLE;
4189                 return -EAGAIN;
4190         }
4191
4192         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4193                        ctx->buffer);
4194
4195         if (ret < 0)
4196                 req_set_fail_links(req);
4197         io_req_complete(req, ret);
4198         return 0;
4199 }
4200
4201 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4202 {
4203         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4204                 return -EINVAL;
4205         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4206             sqe->rw_flags || sqe->buf_index)
4207                 return -EINVAL;
4208         if (req->flags & REQ_F_FIXED_FILE)
4209                 return -EBADF;
4210
4211         req->close.fd = READ_ONCE(sqe->fd);
4212         return 0;
4213 }
4214
/*
 * Issue handler for close.
 *
 * Looks the fd up in current->files under files->file_lock. An fd that is
 * out of range, unused, or refers to an io_uring file completes with
 * -EBADF. A file with a ->flush() method is not closed from nonblocking
 * context; -EAGAIN is returned instead.
 *
 * Return: -EAGAIN when the close must be retried, otherwise 0 after the
 * result has been posted via __io_req_complete().
 */
static int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
        struct files_struct *files = current->files;
        struct io_close *close = &req->close;
        struct fdtable *fdt;
        struct file *file;
        int ret;

        /* defaults for the early-exit paths below: nothing to put, -EBADF */
        file = NULL;
        ret = -EBADF;
        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (close->fd >= fdt->max_fds) {
                spin_unlock(&files->file_lock);
                goto err;
        }
        file = fdt->fd[close->fd];
        if (!file) {
                spin_unlock(&files->file_lock);
                goto err;
        }

        /* io_uring files are refused; no reference was taken, so forget the pointer */
        if (file->f_op == &io_uring_fops) {
                spin_unlock(&files->file_lock);
                file = NULL;
                goto err;
        }

        /* if the file has a flush method, be safe and punt to async */
        if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
                spin_unlock(&files->file_lock);
                return -EAGAIN;
        }

        /* called with file_lock still held; it is released right after */
        ret = __close_fd_get_file(close->fd, &file);
        spin_unlock(&files->file_lock);
        if (ret < 0) {
                /* report a missing fd the same way as the checks above */
                if (ret == -ENOENT)
                        ret = -EBADF;
                goto err;
        }

        /* No ->flush() or already async, safely close from here */
        ret = filp_close(file, current->files);
err:
        if (ret < 0)
                req_set_fail_links(req);
        /* NOTE(review): presumably drops the reference returned by __close_fd_get_file() - confirm */
        if (file)
                fput(file);
        __io_req_complete(req, issue_flags, ret, 0);
        return 0;
}
4267
4268 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4269 {
4270         struct io_ring_ctx *ctx = req->ctx;
4271
4272         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4273                 return -EINVAL;
4274         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4275                 return -EINVAL;
4276
4277         req->sync.off = READ_ONCE(sqe->off);
4278         req->sync.len = READ_ONCE(sqe->len);
4279         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4280         return 0;
4281 }
4282
4283 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4284 {
4285         int ret;
4286
4287         /* sync_file_range always requires a blocking context */
4288         if (issue_flags & IO_URING_F_NONBLOCK)
4289                 return -EAGAIN;
4290
4291         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4292                                 req->sync.flags);
4293         if (ret < 0)
4294                 req_set_fail_links(req);
4295         io_req_complete(req, ret);
4296         return 0;
4297 }
4298
4299 #if defined(CONFIG_NET)
4300 static int io_setup_async_msg(struct io_kiocb *req,
4301                               struct io_async_msghdr *kmsg)
4302 {
4303         struct io_async_msghdr *async_msg = req->async_data;
4304
4305         if (async_msg)
4306                 return -EAGAIN;
4307         if (io_alloc_async_data(req)) {
4308                 kfree(kmsg->free_iov);
4309                 return -ENOMEM;
4310         }
4311         async_msg = req->async_data;
4312         req->flags |= REQ_F_NEED_CLEANUP;
4313         memcpy(async_msg, kmsg, sizeof(*kmsg));
4314         async_msg->msg.msg_name = &async_msg->addr;
4315         /* if were using fast_iov, set it to the new one */
4316         if (!async_msg->free_iov)
4317                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4318
4319         return -EAGAIN;
4320 }
4321
4322 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4323                                struct io_async_msghdr *iomsg)
4324 {
4325         iomsg->msg.msg_name = &iomsg->addr;
4326         iomsg->free_iov = iomsg->fast_iov;
4327         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4328                                    req->sr_msg.msg_flags, &iomsg->free_iov);
4329 }
4330
4331 static int io_sendmsg_prep_async(struct io_kiocb *req)
4332 {
4333         int ret;
4334
4335         if (!io_op_defs[req->opcode].needs_async_data)
4336                 return 0;
4337         ret = io_sendmsg_copy_hdr(req, req->async_data);
4338         if (!ret)
4339                 req->flags |= REQ_F_NEED_CLEANUP;
4340         return ret;
4341 }
4342
4343 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4344 {
4345         struct io_sr_msg *sr = &req->sr_msg;
4346
4347         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4348                 return -EINVAL;
4349
4350         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4351         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4352         sr->len = READ_ONCE(sqe->len);
4353
4354 #ifdef CONFIG_COMPAT
4355         if (req->ctx->compat)
4356                 sr->msg_flags |= MSG_CMSG_COMPAT;
4357 #endif
4358         return 0;
4359 }
4360
4361 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4362 {
4363         struct io_async_msghdr iomsg, *kmsg;
4364         struct socket *sock;
4365         unsigned flags;
4366         int ret;
4367
4368         sock = sock_from_file(req->file);
4369         if (unlikely(!sock))
4370                 return -ENOTSOCK;
4371
4372         kmsg = req->async_data;
4373         if (!kmsg) {
4374                 ret = io_sendmsg_copy_hdr(req, &iomsg);
4375                 if (ret)
4376                         return ret;
4377                 kmsg = &iomsg;
4378         }
4379
4380         flags = req->sr_msg.msg_flags;
4381         if (flags & MSG_DONTWAIT)
4382                 req->flags |= REQ_F_NOWAIT;
4383         else if (issue_flags & IO_URING_F_NONBLOCK)
4384                 flags |= MSG_DONTWAIT;
4385
4386         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4387         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4388                 return io_setup_async_msg(req, kmsg);
4389         if (ret == -ERESTARTSYS)
4390                 ret = -EINTR;
4391
4392         /* fast path, check for non-NULL to avoid function call */
4393         if (kmsg->free_iov)
4394                 kfree(kmsg->free_iov);
4395         req->flags &= ~REQ_F_NEED_CLEANUP;
4396         if (ret < 0)
4397                 req_set_fail_links(req);
4398         __io_req_complete(req, issue_flags, ret, 0);
4399         return 0;
4400 }
4401
4402 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4403 {
4404         struct io_sr_msg *sr = &req->sr_msg;
4405         struct msghdr msg;
4406         struct iovec iov;
4407         struct socket *sock;
4408         unsigned flags;
4409         int ret;
4410
4411         sock = sock_from_file(req->file);
4412         if (unlikely(!sock))
4413                 return -ENOTSOCK;
4414
4415         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4416         if (unlikely(ret))
4417                 return ret;
4418
4419         msg.msg_name = NULL;
4420         msg.msg_control = NULL;
4421         msg.msg_controllen = 0;
4422         msg.msg_namelen = 0;
4423
4424         flags = req->sr_msg.msg_flags;
4425         if (flags & MSG_DONTWAIT)
4426                 req->flags |= REQ_F_NOWAIT;
4427         else if (issue_flags & IO_URING_F_NONBLOCK)
4428                 flags |= MSG_DONTWAIT;
4429
4430         msg.msg_flags = flags;
4431         ret = sock_sendmsg(sock, &msg);
4432         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4433                 return -EAGAIN;
4434         if (ret == -ERESTARTSYS)
4435                 ret = -EINTR;
4436
4437         if (ret < 0)
4438                 req_set_fail_links(req);
4439         __io_req_complete(req, issue_flags, ret, 0);
4440         return 0;
4441 }
4442
4443 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4444                                  struct io_async_msghdr *iomsg)
4445 {
4446         struct io_sr_msg *sr = &req->sr_msg;
4447         struct iovec __user *uiov;
4448         size_t iov_len;
4449         int ret;
4450
4451         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4452                                         &iomsg->uaddr, &uiov, &iov_len);
4453         if (ret)
4454                 return ret;
4455
4456         if (req->flags & REQ_F_BUFFER_SELECT) {
4457                 if (iov_len > 1)
4458                         return -EINVAL;
4459                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4460                         return -EFAULT;
4461                 sr->len = iomsg->fast_iov[0].iov_len;
4462                 iomsg->free_iov = NULL;
4463         } else {
4464                 iomsg->free_iov = iomsg->fast_iov;
4465                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4466                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
4467                                      false);
4468                 if (ret > 0)
4469                         ret = 0;
4470         }
4471
4472         return ret;
4473 }
4474
4475 #ifdef CONFIG_COMPAT
4476 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4477                                         struct io_async_msghdr *iomsg)
4478 {
4479         struct compat_msghdr __user *msg_compat;
4480         struct io_sr_msg *sr = &req->sr_msg;
4481         struct compat_iovec __user *uiov;
4482         compat_uptr_t ptr;
4483         compat_size_t len;
4484         int ret;
4485
4486         msg_compat = (struct compat_msghdr __user *) sr->umsg;
4487         ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
4488                                         &ptr, &len);
4489         if (ret)
4490                 return ret;
4491
4492         uiov = compat_ptr(ptr);
4493         if (req->flags & REQ_F_BUFFER_SELECT) {
4494                 compat_ssize_t clen;
4495
4496                 if (len > 1)
4497                         return -EINVAL;
4498                 if (!access_ok(uiov, sizeof(*uiov)))
4499                         return -EFAULT;
4500                 if (__get_user(clen, &uiov->iov_len))
4501                         return -EFAULT;
4502                 if (clen < 0)
4503                         return -EINVAL;
4504                 sr->len = clen;
4505                 iomsg->free_iov = NULL;
4506         } else {
4507                 iomsg->free_iov = iomsg->fast_iov;
4508                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4509                                    UIO_FASTIOV, &iomsg->free_iov,
4510                                    &iomsg->msg.msg_iter, true);
4511                 if (ret < 0)
4512                         return ret;
4513         }
4514
4515         return 0;
4516 }
4517 #endif
4518
4519 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4520                                struct io_async_msghdr *iomsg)
4521 {
4522         iomsg->msg.msg_name = &iomsg->addr;
4523
4524 #ifdef CONFIG_COMPAT
4525         if (req->ctx->compat)
4526                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
4527 #endif
4528
4529         return __io_recvmsg_copy_hdr(req, iomsg);
4530 }
4531
4532 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4533                                                bool needs_lock)
4534 {
4535         struct io_sr_msg *sr = &req->sr_msg;
4536         struct io_buffer *kbuf;
4537
4538         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4539         if (IS_ERR(kbuf))
4540                 return kbuf;
4541
4542         sr->kbuf = kbuf;
4543         req->flags |= REQ_F_BUFFER_SELECTED;
4544         return kbuf;
4545 }
4546
4547 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4548 {
4549         return io_put_kbuf(req, req->sr_msg.kbuf);
4550 }
4551
4552 static int io_recvmsg_prep_async(struct io_kiocb *req)
4553 {
4554         int ret;
4555
4556         if (!io_op_defs[req->opcode].needs_async_data)
4557                 return 0;
4558         ret = io_recvmsg_copy_hdr(req, req->async_data);
4559         if (!ret)
4560                 req->flags |= REQ_F_NEED_CLEANUP;
4561         return ret;
4562 }
4563
4564 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4565 {
4566         struct io_sr_msg *sr = &req->sr_msg;
4567
4568         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4569                 return -EINVAL;
4570
4571         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4572         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4573         sr->len = READ_ONCE(sqe->len);
4574         sr->bgid = READ_ONCE(sqe->buf_group);
4575
4576 #ifdef CONFIG_COMPAT
4577         if (req->ctx->compat)
4578                 sr->msg_flags |= MSG_CMSG_COMPAT;
4579 #endif
4580         return 0;
4581 }
4582
4583 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4584 {
4585         struct io_async_msghdr iomsg, *kmsg;
4586         struct socket *sock;
4587         struct io_buffer *kbuf;
4588         unsigned flags;
4589         int ret, cflags = 0;
4590         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4591
4592         sock = sock_from_file(req->file);
4593         if (unlikely(!sock))
4594                 return -ENOTSOCK;
4595
4596         kmsg = req->async_data;
4597         if (!kmsg) {
4598                 ret = io_recvmsg_copy_hdr(req, &iomsg);
4599                 if (ret)
4600                         return ret;
4601                 kmsg = &iomsg;
4602         }
4603
4604         if (req->flags & REQ_F_BUFFER_SELECT) {
4605                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4606                 if (IS_ERR(kbuf))
4607                         return PTR_ERR(kbuf);
4608                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4609                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4610                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4611                                 1, req->sr_msg.len);
4612         }
4613
4614         flags = req->sr_msg.msg_flags;
4615         if (flags & MSG_DONTWAIT)
4616                 req->flags |= REQ_F_NOWAIT;
4617         else if (force_nonblock)
4618                 flags |= MSG_DONTWAIT;
4619
4620         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4621                                         kmsg->uaddr, flags);
4622         if (force_nonblock && ret == -EAGAIN)
4623                 return io_setup_async_msg(req, kmsg);
4624         if (ret == -ERESTARTSYS)
4625                 ret = -EINTR;
4626
4627         if (req->flags & REQ_F_BUFFER_SELECTED)
4628                 cflags = io_put_recv_kbuf(req);
4629         /* fast path, check for non-NULL to avoid function call */
4630         if (kmsg->free_iov)
4631                 kfree(kmsg->free_iov);
4632         req->flags &= ~REQ_F_NEED_CLEANUP;
4633         if (ret < 0)
4634                 req_set_fail_links(req);
4635         __io_req_complete(req, issue_flags, ret, cflags);
4636         return 0;
4637 }
4638
4639 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
4640 {
4641         struct io_buffer *kbuf;
4642         struct io_sr_msg *sr = &req->sr_msg;
4643         struct msghdr msg;
4644         void __user *buf = sr->buf;
4645         struct socket *sock;
4646         struct iovec iov;
4647         unsigned flags;
4648         int ret, cflags = 0;
4649         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4650
4651         sock = sock_from_file(req->file);
4652         if (unlikely(!sock))
4653                 return -ENOTSOCK;
4654
4655         if (req->flags & REQ_F_BUFFER_SELECT) {
4656                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4657                 if (IS_ERR(kbuf))
4658                         return PTR_ERR(kbuf);
4659                 buf = u64_to_user_ptr(kbuf->addr);
4660         }
4661
4662         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4663         if (unlikely(ret))
4664                 goto out_free;
4665
4666         msg.msg_name = NULL;
4667         msg.msg_control = NULL;
4668         msg.msg_controllen = 0;
4669         msg.msg_namelen = 0;
4670         msg.msg_iocb = NULL;
4671         msg.msg_flags = 0;
4672
4673         flags = req->sr_msg.msg_flags;
4674         if (flags & MSG_DONTWAIT)
4675                 req->flags |= REQ_F_NOWAIT;
4676         else if (force_nonblock)
4677                 flags |= MSG_DONTWAIT;
4678
4679         ret = sock_recvmsg(sock, &msg, flags);
4680         if (force_nonblock && ret == -EAGAIN)
4681                 return -EAGAIN;
4682         if (ret == -ERESTARTSYS)
4683                 ret = -EINTR;
4684 out_free:
4685         if (req->flags & REQ_F_BUFFER_SELECTED)
4686                 cflags = io_put_recv_kbuf(req);
4687         if (ret < 0)
4688                 req_set_fail_links(req);
4689         __io_req_complete(req, issue_flags, ret, cflags);
4690         return 0;
4691 }
4692
4693 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4694 {
4695         struct io_accept *accept = &req->accept;
4696
4697         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4698                 return -EINVAL;
4699         if (sqe->ioprio || sqe->len || sqe->buf_index)
4700                 return -EINVAL;
4701
4702         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4703         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4704         accept->flags = READ_ONCE(sqe->accept_flags);
4705         accept->nofile = rlimit(RLIMIT_NOFILE);
4706         return 0;
4707 }
4708
4709 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
4710 {
4711         struct io_accept *accept = &req->accept;
4712         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4713         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4714         int ret;
4715
4716         if (req->file->f_flags & O_NONBLOCK)
4717                 req->flags |= REQ_F_NOWAIT;
4718
4719         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4720                                         accept->addr_len, accept->flags,
4721                                         accept->nofile);
4722         if (ret == -EAGAIN && force_nonblock)
4723                 return -EAGAIN;
4724         if (ret < 0) {
4725                 if (ret == -ERESTARTSYS)
4726                         ret = -EINTR;
4727                 req_set_fail_links(req);
4728         }
4729         __io_req_complete(req, issue_flags, ret, 0);
4730         return 0;
4731 }
4732
4733 static int io_connect_prep_async(struct io_kiocb *req)
4734 {
4735         struct io_async_connect *io = req->async_data;
4736         struct io_connect *conn = &req->connect;
4737
4738         return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4739 }
4740
4741 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4742 {
4743         struct io_connect *conn = &req->connect;
4744
4745         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4746                 return -EINVAL;
4747         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4748                 return -EINVAL;
4749
4750         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4751         conn->addr_len =  READ_ONCE(sqe->addr2);
4752         return 0;
4753 }
4754
4755 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
4756 {
4757         struct io_async_connect __io, *io;
4758         unsigned file_flags;
4759         int ret;
4760         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4761
4762         if (req->async_data) {
4763                 io = req->async_data;
4764         } else {
4765                 ret = move_addr_to_kernel(req->connect.addr,
4766                                                 req->connect.addr_len,
4767                                                 &__io.address);
4768                 if (ret)
4769                         goto out;
4770                 io = &__io;
4771         }
4772
4773         file_flags = force_nonblock ? O_NONBLOCK : 0;
4774
4775         ret = __sys_connect_file(req->file, &io->address,
4776                                         req->connect.addr_len, file_flags);
4777         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4778                 if (req->async_data)
4779                         return -EAGAIN;
4780                 if (io_alloc_async_data(req)) {
4781                         ret = -ENOMEM;
4782                         goto out;
4783                 }
4784                 io = req->async_data;
4785                 memcpy(req->async_data, &__io, sizeof(__io));
4786                 return -EAGAIN;
4787         }
4788         if (ret == -ERESTARTSYS)
4789                 ret = -EINTR;
4790 out:
4791         if (ret < 0)
4792                 req_set_fail_links(req);
4793         __io_req_complete(req, issue_flags, ret, 0);
4794         return 0;
4795 }
4796 #else /* !CONFIG_NET */
4797 #define IO_NETOP_FN(op)                                                 \
4798 static int io_##op(struct io_kiocb *req, unsigned int issue_flags)      \
4799 {                                                                       \
4800         return -EOPNOTSUPP;                                             \
4801 }
4802
4803 #define IO_NETOP_PREP(op)                                               \
4804 IO_NETOP_FN(op)                                                         \
4805 static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
4806 {                                                                       \
4807         return -EOPNOTSUPP;                                             \
4808 }                                                                       \
4809
4810 #define IO_NETOP_PREP_ASYNC(op)                                         \
4811 IO_NETOP_PREP(op)                                                       \
4812 static int io_##op##_prep_async(struct io_kiocb *req)                   \
4813 {                                                                       \
4814         return -EOPNOTSUPP;                                             \
4815 }
4816
4817 IO_NETOP_PREP_ASYNC(sendmsg);
4818 IO_NETOP_PREP_ASYNC(recvmsg);
4819 IO_NETOP_PREP_ASYNC(connect);
4820 IO_NETOP_PREP(accept);
4821 IO_NETOP_FN(send);
4822 IO_NETOP_FN(recv);
4823 #endif /* CONFIG_NET */
4824
4825 struct io_poll_table {
4826         struct poll_table_struct pt;
4827         struct io_kiocb *req;
4828         int error;
4829 };
4830
4831 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4832                            __poll_t mask, task_work_func_t func)
4833 {
4834         int ret;
4835
4836         /* for instances that support it check for an event match first: */
4837         if (mask && !(mask & poll->events))
4838                 return 0;
4839
4840         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4841
4842         list_del_init(&poll->wait.entry);
4843
4844         req->result = mask;
4845         req->task_work.func = func;
4846         percpu_ref_get(&req->ctx->refs);
4847
4848         /*
4849          * If this fails, then the task is exiting. When a task exits, the
4850          * work gets canceled, so just cancel this request as well instead
4851          * of executing it. We can't safely execute it anyway, as we may not
4852          * have the needed state needed for it anyway.
4853          */
4854         ret = io_req_task_work_add(req);
4855         if (unlikely(ret)) {
4856                 WRITE_ONCE(poll->canceled, true);
4857                 io_req_task_work_add_fallback(req, func);
4858         }
4859         return 1;
4860 }
4861
4862 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4863         __acquires(&req->ctx->completion_lock)
4864 {
4865         struct io_ring_ctx *ctx = req->ctx;
4866
4867         if (!req->result && !READ_ONCE(poll->canceled)) {
4868                 struct poll_table_struct pt = { ._key = poll->events };
4869
4870                 req->result = vfs_poll(req->file, &pt) & poll->events;
4871         }
4872
4873         spin_lock_irq(&ctx->completion_lock);
4874         if (!req->result && !READ_ONCE(poll->canceled)) {
4875                 add_wait_queue(poll->head, &poll->wait);
4876                 return true;
4877         }
4878
4879         return false;
4880 }
4881
4882 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
4883 {
4884         /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
4885         if (req->opcode == IORING_OP_POLL_ADD)
4886                 return req->async_data;
4887         return req->apoll->double_poll;
4888 }
4889
4890 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4891 {
4892         if (req->opcode == IORING_OP_POLL_ADD)
4893                 return &req->poll;
4894         return &req->apoll->poll;
4895 }
4896
4897 static void io_poll_remove_double(struct io_kiocb *req)
4898 {
4899         struct io_poll_iocb *poll = io_poll_get_double(req);
4900
4901         lockdep_assert_held(&req->ctx->completion_lock);
4902
4903         if (poll && poll->head) {
4904                 struct wait_queue_head *head = poll->head;
4905
4906                 spin_lock(&head->lock);
4907                 list_del_init(&poll->wait.entry);
4908                 if (poll->wait.private)
4909                         refcount_dec(&req->refs);
4910                 poll->head = NULL;
4911                 spin_unlock(&head->lock);
4912         }
4913 }
4914
4915 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4916 {
4917         struct io_ring_ctx *ctx = req->ctx;
4918
4919         io_poll_remove_double(req);
4920         req->poll.done = true;
4921         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4922         io_commit_cqring(ctx);
4923 }
4924
4925 static void io_poll_task_func(struct callback_head *cb)
4926 {
4927         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4928         struct io_ring_ctx *ctx = req->ctx;
4929         struct io_kiocb *nxt;
4930
4931         if (io_poll_rewait(req, &req->poll)) {
4932                 spin_unlock_irq(&ctx->completion_lock);
4933         } else {
4934                 hash_del(&req->hash_node);
4935                 io_poll_complete(req, req->result, 0);
4936                 spin_unlock_irq(&ctx->completion_lock);
4937
4938                 nxt = io_put_req_find_next(req);
4939                 io_cqring_ev_posted(ctx);
4940                 if (nxt)
4941                         __io_req_task_submit(nxt);
4942         }
4943
4944         percpu_ref_put(&ctx->refs);
4945 }
4946
4947 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4948                                int sync, void *key)
4949 {
4950         struct io_kiocb *req = wait->private;
4951         struct io_poll_iocb *poll = io_poll_get_single(req);
4952         __poll_t mask = key_to_poll(key);
4953
4954         /* for instances that support it check for an event match first: */
4955         if (mask && !(mask & poll->events))
4956                 return 0;
4957
4958         list_del_init(&wait->entry);
4959
4960         if (poll && poll->head) {
4961                 bool done;
4962
4963                 spin_lock(&poll->head->lock);
4964                 done = list_empty(&poll->wait.entry);
4965                 if (!done)
4966                         list_del_init(&poll->wait.entry);
4967                 /* make sure double remove sees this as being gone */
4968                 wait->private = NULL;
4969                 spin_unlock(&poll->head->lock);
4970                 if (!done) {
4971                         /* use wait func handler, so it matches the rq type */
4972                         poll->wait.func(&poll->wait, mode, sync, key);
4973                 }
4974         }
4975         refcount_dec(&req->refs);
4976         return 1;
4977 }
4978
4979 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4980                               wait_queue_func_t wake_func)
4981 {
4982         poll->head = NULL;
4983         poll->done = false;
4984         poll->canceled = false;
4985         poll->events = events;
4986         INIT_LIST_HEAD(&poll->wait.entry);
4987         init_waitqueue_func_entry(&poll->wait, wake_func);
4988 }
4989
4990 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4991                             struct wait_queue_head *head,
4992                             struct io_poll_iocb **poll_ptr)
4993 {
4994         struct io_kiocb *req = pt->req;
4995
4996         /*
4997          * If poll->head is already set, it's because the file being polled
4998          * uses multiple waitqueues for poll handling (eg one for read, one
4999          * for write). Setup a separate io_poll_iocb if this happens.
5000          */
5001         if (unlikely(poll->head)) {
5002                 struct io_poll_iocb *poll_one = poll;
5003
5004                 /* already have a 2nd entry, fail a third attempt */
5005                 if (*poll_ptr) {
5006                         pt->error = -EINVAL;
5007                         return;
5008                 }
5009                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5010                 if (!poll) {
5011                         pt->error = -ENOMEM;
5012                         return;
5013                 }
5014                 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
5015                 refcount_inc(&req->refs);
5016                 poll->wait.private = req;
5017                 *poll_ptr = poll;
5018         }
5019
5020         pt->error = 0;
5021         poll->head = head;
5022
5023         if (poll->events & EPOLLEXCLUSIVE)
5024                 add_wait_queue_exclusive(head, &poll->wait);
5025         else
5026                 add_wait_queue(head, &poll->wait);
5027 }
5028
5029 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5030                                struct poll_table_struct *p)
5031 {
5032         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5033         struct async_poll *apoll = pt->req->apoll;
5034
5035         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5036 }
5037
5038 static void io_async_task_func(struct callback_head *cb)
5039 {
5040         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
5041         struct async_poll *apoll = req->apoll;
5042         struct io_ring_ctx *ctx = req->ctx;
5043
5044         trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
5045
5046         if (io_poll_rewait(req, &apoll->poll)) {
5047                 spin_unlock_irq(&ctx->completion_lock);
5048                 percpu_ref_put(&ctx->refs);
5049                 return;
5050         }
5051
5052         /* If req is still hashed, it cannot have been canceled. Don't check. */
5053         if (hash_hashed(&req->hash_node))
5054                 hash_del(&req->hash_node);
5055
5056         io_poll_remove_double(req);
5057         spin_unlock_irq(&ctx->completion_lock);
5058
5059         if (!READ_ONCE(apoll->poll.canceled))
5060                 __io_req_task_submit(req);
5061         else
5062                 __io_req_task_cancel(req, -ECANCELED);
5063
5064         percpu_ref_put(&ctx->refs);
5065         kfree(apoll->double_poll);
5066         kfree(apoll);
5067 }
5068
5069 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5070                         void *key)
5071 {
5072         struct io_kiocb *req = wait->private;
5073         struct io_poll_iocb *poll = &req->apoll->poll;
5074
5075         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5076                                         key_to_poll(key));
5077
5078         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5079 }
5080
5081 static void io_poll_req_insert(struct io_kiocb *req)
5082 {
5083         struct io_ring_ctx *ctx = req->ctx;
5084         struct hlist_head *list;
5085
5086         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5087         hlist_add_head(&req->hash_node, list);
5088 }
5089
/*
 * Common arming path used by io_arm_poll_handler() and io_poll_add().
 *
 * Initialises @poll for @req with @wake_func, runs vfs_poll() with @ipt's
 * queue callback, then takes ->completion_lock and reconciles the outcome.
 *
 * ipt->error starts out as -EINVAL; the queue callback clears it when it
 * adds the wait entry.
 *
 * Returns the events reported by vfs_poll() masked with poll->events, or 0.
 * ->completion_lock is held on return (see the __acquires annotation); the
 * caller is responsible for dropping it.
 */
static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
				      struct io_poll_iocb *poll,
				      struct io_poll_table *ipt, __poll_t mask,
				      wait_queue_func_t wake_func)
	__acquires(&ctx->completion_lock)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool cancel = false;

	INIT_HLIST_NODE(&req->hash_node);
	io_init_poll_iocb(poll, mask, wake_func);
	poll->file = req->file;
	poll->wait.private = req;

	ipt->pt._key = mask;
	ipt->req = req;
	ipt->error = -EINVAL;

	mask = vfs_poll(req->file, &ipt->pt) & poll->events;

	spin_lock_irq(&ctx->completion_lock);
	if (likely(poll->head)) {
		spin_lock(&poll->head->lock);
		/*
		 * A head was recorded but the wait entry is no longer on a
		 * list. NOTE(review): presumably a wakeup already dequeued
		 * it - confirm. Report nothing to the caller; if an error
		 * was pending, mark the poll canceled instead.
		 */
		if (unlikely(list_empty(&poll->wait.entry))) {
			if (ipt->error)
				cancel = true;
			ipt->error = 0;
			mask = 0;
		}
		/* events already pending or an error: dequeue the wait entry */
		if (mask || ipt->error)
			list_del_init(&poll->wait.entry);
		else if (cancel)
			WRITE_ONCE(poll->canceled, true);
		else if (!poll->done) /* actually waiting for an event */
			io_poll_req_insert(req);
		spin_unlock(&poll->head->lock);
	}

	return mask;
}
5130
5131 static bool io_arm_poll_handler(struct io_kiocb *req)
5132 {
5133         const struct io_op_def *def = &io_op_defs[req->opcode];
5134         struct io_ring_ctx *ctx = req->ctx;
5135         struct async_poll *apoll;
5136         struct io_poll_table ipt;
5137         __poll_t mask, ret;
5138         int rw;
5139
5140         if (!req->file || !file_can_poll(req->file))
5141                 return false;
5142         if (req->flags & REQ_F_POLLED)
5143                 return false;
5144         if (def->pollin)
5145                 rw = READ;
5146         else if (def->pollout)
5147                 rw = WRITE;
5148         else
5149                 return false;
5150         /* if we can't nonblock try, then no point in arming a poll handler */
5151         if (!io_file_supports_async(req->file, rw))
5152                 return false;
5153
5154         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5155         if (unlikely(!apoll))
5156                 return false;
5157         apoll->double_poll = NULL;
5158
5159         req->flags |= REQ_F_POLLED;
5160         req->apoll = apoll;
5161
5162         mask = 0;
5163         if (def->pollin)
5164                 mask |= POLLIN | POLLRDNORM;
5165         if (def->pollout)
5166                 mask |= POLLOUT | POLLWRNORM;
5167
5168         /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5169         if ((req->opcode == IORING_OP_RECVMSG) &&
5170             (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5171                 mask &= ~POLLIN;
5172
5173         mask |= POLLERR | POLLPRI;
5174
5175         ipt.pt._qproc = io_async_queue_proc;
5176
5177         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5178                                         io_async_wake);
5179         if (ret || ipt.error) {
5180                 io_poll_remove_double(req);
5181                 spin_unlock_irq(&ctx->completion_lock);
5182                 kfree(apoll->double_poll);
5183                 kfree(apoll);
5184                 return false;
5185         }
5186         spin_unlock_irq(&ctx->completion_lock);
5187         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5188                                         apoll->poll.events);
5189         return true;
5190 }
5191
5192 static bool __io_poll_remove_one(struct io_kiocb *req,
5193                                  struct io_poll_iocb *poll)
5194 {
5195         bool do_complete = false;
5196
5197         spin_lock(&poll->head->lock);
5198         WRITE_ONCE(poll->canceled, true);
5199         if (!list_empty(&poll->wait.entry)) {
5200                 list_del_init(&poll->wait.entry);
5201                 do_complete = true;
5202         }
5203         spin_unlock(&poll->head->lock);
5204         hash_del(&req->hash_node);
5205         return do_complete;
5206 }
5207
5208 static bool io_poll_remove_one(struct io_kiocb *req)
5209 {
5210         bool do_complete;
5211
5212         io_poll_remove_double(req);
5213
5214         if (req->opcode == IORING_OP_POLL_ADD) {
5215                 do_complete = __io_poll_remove_one(req, &req->poll);
5216         } else {
5217                 struct async_poll *apoll = req->apoll;
5218
5219                 /* non-poll requests have submit ref still */
5220                 do_complete = __io_poll_remove_one(req, &apoll->poll);
5221                 if (do_complete) {
5222                         io_put_req(req);
5223                         kfree(apoll->double_poll);
5224                         kfree(apoll);
5225                 }
5226         }
5227
5228         if (do_complete) {
5229                 io_cqring_fill_event(req, -ECANCELED);
5230                 io_commit_cqring(req->ctx);
5231                 req_set_fail_links(req);
5232                 io_put_req_deferred(req, 1);
5233         }
5234
5235         return do_complete;
5236 }
5237
5238 /*
5239  * Returns true if we found and killed one or more poll requests
5240  */
5241 static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5242                                struct files_struct *files)
5243 {
5244         struct hlist_node *tmp;
5245         struct io_kiocb *req;
5246         int posted = 0, i;
5247
5248         spin_lock_irq(&ctx->completion_lock);
5249         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5250                 struct hlist_head *list;
5251
5252                 list = &ctx->cancel_hash[i];
5253                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5254                         if (io_match_task(req, tsk, files))
5255                                 posted += io_poll_remove_one(req);
5256                 }
5257         }
5258         spin_unlock_irq(&ctx->completion_lock);
5259
5260         if (posted)
5261                 io_cqring_ev_posted(ctx);
5262
5263         return posted != 0;
5264 }
5265
5266 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5267 {
5268         struct hlist_head *list;
5269         struct io_kiocb *req;
5270
5271         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5272         hlist_for_each_entry(req, list, hash_node) {
5273                 if (sqe_addr != req->user_data)
5274                         continue;
5275                 if (io_poll_remove_one(req))
5276                         return 0;
5277                 return -EALREADY;
5278         }
5279
5280         return -ENOENT;
5281 }
5282
5283 static int io_poll_remove_prep(struct io_kiocb *req,
5284                                const struct io_uring_sqe *sqe)
5285 {
5286         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5287                 return -EINVAL;
5288         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5289             sqe->poll_events)
5290                 return -EINVAL;
5291
5292         req->poll_remove.addr = READ_ONCE(sqe->addr);
5293         return 0;
5294 }
5295
5296 /*
5297  * Find a running poll command that matches one specified in sqe->addr,
5298  * and remove it if found.
5299  */
5300 static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
5301 {
5302         struct io_ring_ctx *ctx = req->ctx;
5303         int ret;
5304
5305         spin_lock_irq(&ctx->completion_lock);
5306         ret = io_poll_cancel(ctx, req->poll_remove.addr);
5307         spin_unlock_irq(&ctx->completion_lock);
5308
5309         if (ret < 0)
5310                 req_set_fail_links(req);
5311         io_req_complete(req, ret);
5312         return 0;
5313 }
5314
5315 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5316                         void *key)
5317 {
5318         struct io_kiocb *req = wait->private;
5319         struct io_poll_iocb *poll = &req->poll;
5320
5321         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5322 }
5323
5324 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5325                                struct poll_table_struct *p)
5326 {
5327         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5328
5329         __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5330 }
5331
5332 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5333 {
5334         struct io_poll_iocb *poll = &req->poll;
5335         u32 events;
5336
5337         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5338                 return -EINVAL;
5339         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5340                 return -EINVAL;
5341
5342         events = READ_ONCE(sqe->poll32_events);
5343 #ifdef __BIG_ENDIAN
5344         events = swahw32(events);
5345 #endif
5346         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5347                        (events & EPOLLEXCLUSIVE);
5348         return 0;
5349 }
5350
5351 static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5352 {
5353         struct io_poll_iocb *poll = &req->poll;
5354         struct io_ring_ctx *ctx = req->ctx;
5355         struct io_poll_table ipt;
5356         __poll_t mask;
5357
5358         ipt.pt._qproc = io_poll_queue_proc;
5359
5360         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5361                                         io_poll_wake);
5362
5363         if (mask) { /* no async, we'd stolen it */
5364                 ipt.error = 0;
5365                 io_poll_complete(req, mask, 0);
5366         }
5367         spin_unlock_irq(&ctx->completion_lock);
5368
5369         if (mask) {
5370                 io_cqring_ev_posted(ctx);
5371                 io_put_req(req);
5372         }
5373         return ipt.error;
5374 }
5375
5376 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5377 {
5378         struct io_timeout_data *data = container_of(timer,
5379                                                 struct io_timeout_data, timer);
5380         struct io_kiocb *req = data->req;
5381         struct io_ring_ctx *ctx = req->ctx;
5382         unsigned long flags;
5383
5384         spin_lock_irqsave(&ctx->completion_lock, flags);
5385         list_del_init(&req->timeout.list);
5386         atomic_set(&req->ctx->cq_timeouts,
5387                 atomic_read(&req->ctx->cq_timeouts) + 1);
5388
5389         io_cqring_fill_event(req, -ETIME);
5390         io_commit_cqring(ctx);
5391         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5392
5393         io_cqring_ev_posted(ctx);
5394         req_set_fail_links(req);
5395         io_put_req(req);
5396         return HRTIMER_NORESTART;
5397 }
5398
5399 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5400                                            __u64 user_data)
5401 {
5402         struct io_timeout_data *io;
5403         struct io_kiocb *req;
5404         int ret = -ENOENT;
5405
5406         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5407                 if (user_data == req->user_data) {
5408                         ret = 0;
5409                         break;
5410                 }
5411         }
5412
5413         if (ret == -ENOENT)
5414                 return ERR_PTR(ret);
5415
5416         io = req->async_data;
5417         ret = hrtimer_try_to_cancel(&io->timer);
5418         if (ret == -1)
5419                 return ERR_PTR(-EALREADY);
5420         list_del_init(&req->timeout.list);
5421         return req;
5422 }
5423
5424 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5425 {
5426         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5427
5428         if (IS_ERR(req))
5429                 return PTR_ERR(req);
5430
5431         req_set_fail_links(req);
5432         io_cqring_fill_event(req, -ECANCELED);
5433         io_put_req_deferred(req, 1);
5434         return 0;
5435 }
5436
5437 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5438                              struct timespec64 *ts, enum hrtimer_mode mode)
5439 {
5440         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5441         struct io_timeout_data *data;
5442
5443         if (IS_ERR(req))
5444                 return PTR_ERR(req);
5445
5446         req->timeout.off = 0; /* noseq */
5447         data = req->async_data;
5448         list_add_tail(&req->timeout.list, &ctx->timeout_list);
5449         hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5450         data->timer.function = io_timeout_fn;
5451         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5452         return 0;
5453 }
5454
5455 static int io_timeout_remove_prep(struct io_kiocb *req,
5456                                   const struct io_uring_sqe *sqe)
5457 {
5458         struct io_timeout_rem *tr = &req->timeout_rem;
5459
5460         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5461                 return -EINVAL;
5462         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5463                 return -EINVAL;
5464         if (sqe->ioprio || sqe->buf_index || sqe->len)
5465                 return -EINVAL;
5466
5467         tr->addr = READ_ONCE(sqe->addr);
5468         tr->flags = READ_ONCE(sqe->timeout_flags);
5469         if (tr->flags & IORING_TIMEOUT_UPDATE) {
5470                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5471                         return -EINVAL;
5472                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5473                         return -EFAULT;
5474         } else if (tr->flags) {
5475                 /* timeout removal doesn't support flags */
5476                 return -EINVAL;
5477         }
5478
5479         return 0;
5480 }
5481
5482 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5483 {
5484         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5485                                             : HRTIMER_MODE_REL;
5486 }
5487
5488 /*
5489  * Remove or update an existing timeout command
5490  */
5491 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
5492 {
5493         struct io_timeout_rem *tr = &req->timeout_rem;
5494         struct io_ring_ctx *ctx = req->ctx;
5495         int ret;
5496
5497         spin_lock_irq(&ctx->completion_lock);
5498         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
5499                 ret = io_timeout_cancel(ctx, tr->addr);
5500         else
5501                 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5502                                         io_translate_timeout_mode(tr->flags));
5503
5504         io_cqring_fill_event(req, ret);
5505         io_commit_cqring(ctx);
5506         spin_unlock_irq(&ctx->completion_lock);
5507         io_cqring_ev_posted(ctx);
5508         if (ret < 0)
5509                 req_set_fail_links(req);
5510         io_put_req(req);
5511         return 0;
5512 }
5513
5514 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5515                            bool is_timeout_link)
5516 {
5517         struct io_timeout_data *data;
5518         unsigned flags;
5519         u32 off = READ_ONCE(sqe->off);
5520
5521         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5522                 return -EINVAL;
5523         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5524                 return -EINVAL;
5525         if (off && is_timeout_link)
5526                 return -EINVAL;
5527         flags = READ_ONCE(sqe->timeout_flags);
5528         if (flags & ~IORING_TIMEOUT_ABS)
5529                 return -EINVAL;
5530
5531         req->timeout.off = off;
5532
5533         if (!req->async_data && io_alloc_async_data(req))
5534                 return -ENOMEM;
5535
5536         data = req->async_data;
5537         data->req = req;
5538
5539         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5540                 return -EFAULT;
5541
5542         data->mode = io_translate_timeout_mode(flags);
5543         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5544         return 0;
5545 }
5546
/*
 * Issue handler for timeout requests: inserts @req into ctx->timeout_list
 * and starts its hrtimer, all under ->completion_lock.
 *
 * Non-sequenced timeouts go to the tail of the list. Sequenced ones get a
 * target sequence of (current tail + off) and are placed by scanning the
 * list backwards. Always returns 0.
 */
static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_timeout_data *data = req->async_data;
	struct list_head *entry;
	u32 tail, off = req->timeout.off;

	spin_lock_irq(&ctx->completion_lock);

	/*
	 * sqe->off holds how many events that need to occur for this
	 * timeout event to be satisfied. If it isn't set, then this is
	 * a pure timeout request, sequence isn't used.
	 */
	if (io_is_timeout_noseq(req)) {
		entry = ctx->timeout_list.prev;
		goto add;
	}

	/* CQ tail with the completions counted in cq_timeouts subtracted */
	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
	req->timeout.target_seq = tail + off;

	/* Update the last seq here in case io_flush_timeouts() hasn't.
	 * This is safe because ->completion_lock is held, and submissions
	 * and completions are never mixed in the same ->completion_lock section.
	 */
	ctx->cq_last_tm_flush = tail;

	/*
	 * Insertion sort, ensuring the first entry in the list is always
	 * the one we need first.
	 */
	list_for_each_prev(entry, &ctx->timeout_list) {
		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
						  timeout.list);

		if (io_is_timeout_noseq(nxt))
			continue;
		/* nxt.seq is behind @tail, otherwise would've been completed */
		if (off >= nxt->timeout.target_seq - tail)
			break;
	}
	/* if the loop ran to completion, entry is the list head itself */
add:
	list_add(&req->timeout.list, entry);
	data->timer.function = io_timeout_fn;
	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
	spin_unlock_irq(&ctx->completion_lock);
	return 0;
}
5596
5597 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5598 {
5599         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5600
5601         return req->user_data == (unsigned long) data;
5602 }
5603
5604 static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr)
5605 {
5606         enum io_wq_cancel cancel_ret;
5607         int ret = 0;
5608
5609         if (!tctx->io_wq)
5610                 return -ENOENT;
5611
5612         cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false);
5613         switch (cancel_ret) {
5614         case IO_WQ_CANCEL_OK:
5615                 ret = 0;
5616                 break;
5617         case IO_WQ_CANCEL_RUNNING:
5618                 ret = -EALREADY;
5619                 break;
5620         case IO_WQ_CANCEL_NOTFOUND:
5621                 ret = -ENOENT;
5622                 break;
5623         }
5624
5625         return ret;
5626 }
5627
5628 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5629                                      struct io_kiocb *req, __u64 sqe_addr,
5630                                      int success_ret)
5631 {
5632         unsigned long flags;
5633         int ret;
5634
5635         ret = io_async_cancel_one(req->task->io_uring,
5636                                         (void *) (unsigned long) sqe_addr);
5637         if (ret != -ENOENT) {
5638                 spin_lock_irqsave(&ctx->completion_lock, flags);
5639                 goto done;
5640         }
5641
5642         spin_lock_irqsave(&ctx->completion_lock, flags);
5643         ret = io_timeout_cancel(ctx, sqe_addr);
5644         if (ret != -ENOENT)
5645                 goto done;
5646         ret = io_poll_cancel(ctx, sqe_addr);
5647 done:
5648         if (!ret)
5649                 ret = success_ret;
5650         io_cqring_fill_event(req, ret);
5651         io_commit_cqring(ctx);
5652         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5653         io_cqring_ev_posted(ctx);
5654
5655         if (ret < 0)
5656                 req_set_fail_links(req);
5657         io_put_req(req);
5658 }
5659
5660 static int io_async_cancel_prep(struct io_kiocb *req,
5661                                 const struct io_uring_sqe *sqe)
5662 {
5663         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5664                 return -EINVAL;
5665         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5666                 return -EINVAL;
5667         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
5668                 return -EINVAL;
5669
5670         req->cancel.addr = READ_ONCE(sqe->addr);
5671         return 0;
5672 }
5673
5674 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
5675 {
5676         struct io_ring_ctx *ctx = req->ctx;
5677
5678         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5679         return 0;
5680 }
5681
5682 static int io_rsrc_update_prep(struct io_kiocb *req,
5683                                 const struct io_uring_sqe *sqe)
5684 {
5685         if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
5686                 return -EINVAL;
5687         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5688                 return -EINVAL;
5689         if (sqe->ioprio || sqe->rw_flags)
5690                 return -EINVAL;
5691
5692         req->rsrc_update.offset = READ_ONCE(sqe->off);
5693         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
5694         if (!req->rsrc_update.nr_args)
5695                 return -EINVAL;
5696         req->rsrc_update.arg = READ_ONCE(sqe->addr);
5697         return 0;
5698 }
5699
5700 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
5701 {
5702         struct io_ring_ctx *ctx = req->ctx;
5703         struct io_uring_rsrc_update up;
5704         int ret;
5705
5706         if (issue_flags & IO_URING_F_NONBLOCK)
5707                 return -EAGAIN;
5708
5709         up.offset = req->rsrc_update.offset;
5710         up.data = req->rsrc_update.arg;
5711
5712         mutex_lock(&ctx->uring_lock);
5713         ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
5714         mutex_unlock(&ctx->uring_lock);
5715
5716         if (ret < 0)
5717                 req_set_fail_links(req);
5718         __io_req_complete(req, issue_flags, ret, 0);
5719         return 0;
5720 }
5721
5722 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5723 {
5724         switch (req->opcode) {
5725         case IORING_OP_NOP:
5726                 return 0;
5727         case IORING_OP_READV:
5728         case IORING_OP_READ_FIXED:
5729         case IORING_OP_READ:
5730                 return io_read_prep(req, sqe);
5731         case IORING_OP_WRITEV:
5732         case IORING_OP_WRITE_FIXED:
5733         case IORING_OP_WRITE:
5734                 return io_write_prep(req, sqe);
5735         case IORING_OP_POLL_ADD:
5736                 return io_poll_add_prep(req, sqe);
5737         case IORING_OP_POLL_REMOVE:
5738                 return io_poll_remove_prep(req, sqe);
5739         case IORING_OP_FSYNC:
5740                 return io_fsync_prep(req, sqe);
5741         case IORING_OP_SYNC_FILE_RANGE:
5742                 return io_sfr_prep(req, sqe);
5743         case IORING_OP_SENDMSG:
5744         case IORING_OP_SEND:
5745                 return io_sendmsg_prep(req, sqe);
5746         case IORING_OP_RECVMSG:
5747         case IORING_OP_RECV:
5748                 return io_recvmsg_prep(req, sqe);
5749         case IORING_OP_CONNECT:
5750                 return io_connect_prep(req, sqe);
5751         case IORING_OP_TIMEOUT:
5752                 return io_timeout_prep(req, sqe, false);
5753         case IORING_OP_TIMEOUT_REMOVE:
5754                 return io_timeout_remove_prep(req, sqe);
5755         case IORING_OP_ASYNC_CANCEL:
5756                 return io_async_cancel_prep(req, sqe);
5757         case IORING_OP_LINK_TIMEOUT:
5758                 return io_timeout_prep(req, sqe, true);
5759         case IORING_OP_ACCEPT:
5760                 return io_accept_prep(req, sqe);
5761         case IORING_OP_FALLOCATE:
5762                 return io_fallocate_prep(req, sqe);
5763         case IORING_OP_OPENAT:
5764                 return io_openat_prep(req, sqe);
5765         case IORING_OP_CLOSE:
5766                 return io_close_prep(req, sqe);
5767         case IORING_OP_FILES_UPDATE:
5768                 return io_rsrc_update_prep(req, sqe);
5769         case IORING_OP_STATX:
5770                 return io_statx_prep(req, sqe);
5771         case IORING_OP_FADVISE:
5772                 return io_fadvise_prep(req, sqe);
5773         case IORING_OP_MADVISE:
5774                 return io_madvise_prep(req, sqe);
5775         case IORING_OP_OPENAT2:
5776                 return io_openat2_prep(req, sqe);
5777         case IORING_OP_EPOLL_CTL:
5778                 return io_epoll_ctl_prep(req, sqe);
5779         case IORING_OP_SPLICE:
5780                 return io_splice_prep(req, sqe);
5781         case IORING_OP_PROVIDE_BUFFERS:
5782                 return io_provide_buffers_prep(req, sqe);
5783         case IORING_OP_REMOVE_BUFFERS:
5784                 return io_remove_buffers_prep(req, sqe);
5785         case IORING_OP_TEE:
5786                 return io_tee_prep(req, sqe);
5787         case IORING_OP_SHUTDOWN:
5788                 return io_shutdown_prep(req, sqe);
5789         case IORING_OP_RENAMEAT:
5790                 return io_renameat_prep(req, sqe);
5791         case IORING_OP_UNLINKAT:
5792                 return io_unlinkat_prep(req, sqe);
5793         }
5794
5795         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5796                         req->opcode);
5797         return-EINVAL;
5798 }
5799
5800 static int io_req_prep_async(struct io_kiocb *req)
5801 {
5802         switch (req->opcode) {
5803         case IORING_OP_READV:
5804         case IORING_OP_READ_FIXED:
5805         case IORING_OP_READ:
5806                 return io_rw_prep_async(req, READ);
5807         case IORING_OP_WRITEV:
5808         case IORING_OP_WRITE_FIXED:
5809         case IORING_OP_WRITE:
5810                 return io_rw_prep_async(req, WRITE);
5811         case IORING_OP_SENDMSG:
5812         case IORING_OP_SEND:
5813                 return io_sendmsg_prep_async(req);
5814         case IORING_OP_RECVMSG:
5815         case IORING_OP_RECV:
5816                 return io_recvmsg_prep_async(req);
5817         case IORING_OP_CONNECT:
5818                 return io_connect_prep_async(req);
5819         }
5820         return 0;
5821 }
5822
5823 static int io_req_defer_prep(struct io_kiocb *req)
5824 {
5825         if (!io_op_defs[req->opcode].needs_async_data)
5826                 return 0;
5827         /* some opcodes init it during the inital prep */
5828         if (req->async_data)
5829                 return 0;
5830         if (__io_alloc_async_data(req))
5831                 return -EAGAIN;
5832         return io_req_prep_async(req);
5833 }
5834
5835 static u32 io_get_sequence(struct io_kiocb *req)
5836 {
5837         struct io_kiocb *pos;
5838         struct io_ring_ctx *ctx = req->ctx;
5839         u32 total_submitted, nr_reqs = 0;
5840
5841         io_for_each_link(pos, req)
5842                 nr_reqs++;
5843
5844         total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
5845         return total_submitted - nr_reqs;
5846 }
5847
/*
 * Decide whether @req must be deferred, and defer it if so.
 *
 * Returns 0 when the request can be issued right away, a negative errno
 * from io_req_defer_prep() or -ENOMEM on failure, and -EIOCBQUEUED when the
 * request was either handed to async work or added to ctx->defer_list.
 */
static int io_req_defer(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_defer_entry *de;
	int ret;
	u32 seq;

	/* Still need defer if there is pending req in defer list. */
	if (likely(list_empty_careful(&ctx->defer_list) &&
		!(req->flags & REQ_F_IO_DRAIN)))
		return 0;

	seq = io_get_sequence(req);
	/* Still a chance to pass the sequence check */
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
		return 0;

	ret = io_req_defer_prep(req);
	if (ret)
		return ret;
	io_prep_async_link(req);
	de = kmalloc(sizeof(*de), GFP_KERNEL);
	if (!de)
		return -ENOMEM;

	/*
	 * Re-check under ->completion_lock: the checks above ran without it.
	 * If deferral is no longer needed, queue the request as async work
	 * instead of parking it.
	 */
	spin_lock_irq(&ctx->completion_lock);
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
		spin_unlock_irq(&ctx->completion_lock);
		kfree(de);
		io_queue_async_work(req);
		return -EIOCBQUEUED;
	}

	trace_io_uring_defer(ctx, req, req->user_data);
	de->req = req;
	de->seq = seq;
	list_add_tail(&de->list, &ctx->defer_list);
	spin_unlock_irq(&ctx->completion_lock);
	return -EIOCBQUEUED;
}
5888
5889 static void __io_clean_op(struct io_kiocb *req)
5890 {
5891         if (req->flags & REQ_F_BUFFER_SELECTED) {
5892                 switch (req->opcode) {
5893                 case IORING_OP_READV:
5894                 case IORING_OP_READ_FIXED:
5895                 case IORING_OP_READ:
5896                         kfree((void *)(unsigned long)req->rw.addr);
5897                         break;
5898                 case IORING_OP_RECVMSG:
5899                 case IORING_OP_RECV:
5900                         kfree(req->sr_msg.kbuf);
5901                         break;
5902                 }
5903                 req->flags &= ~REQ_F_BUFFER_SELECTED;
5904         }
5905
5906         if (req->flags & REQ_F_NEED_CLEANUP) {
5907                 switch (req->opcode) {
5908                 case IORING_OP_READV:
5909                 case IORING_OP_READ_FIXED:
5910                 case IORING_OP_READ:
5911                 case IORING_OP_WRITEV:
5912                 case IORING_OP_WRITE_FIXED:
5913                 case IORING_OP_WRITE: {
5914                         struct io_async_rw *io = req->async_data;
5915                         if (io->free_iovec)
5916                                 kfree(io->free_iovec);
5917                         break;
5918                         }
5919                 case IORING_OP_RECVMSG:
5920                 case IORING_OP_SENDMSG: {
5921                         struct io_async_msghdr *io = req->async_data;
5922
5923                         kfree(io->free_iov);
5924                         break;
5925                         }
5926                 case IORING_OP_SPLICE:
5927                 case IORING_OP_TEE:
5928                         io_put_file(req, req->splice.file_in,
5929                                     (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5930                         break;
5931                 case IORING_OP_OPENAT:
5932                 case IORING_OP_OPENAT2:
5933                         if (req->open.filename)
5934                                 putname(req->open.filename);
5935                         break;
5936                 case IORING_OP_RENAMEAT:
5937                         putname(req->rename.oldpath);
5938                         putname(req->rename.newpath);
5939                         break;
5940                 case IORING_OP_UNLINKAT:
5941                         putname(req->unlink.filename);
5942                         break;
5943                 }
5944                 req->flags &= ~REQ_F_NEED_CLEANUP;
5945         }
5946 }
5947
5948 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
5949 {
5950         struct io_ring_ctx *ctx = req->ctx;
5951         int ret;
5952
5953         switch (req->opcode) {
5954         case IORING_OP_NOP:
5955                 ret = io_nop(req, issue_flags);
5956                 break;
5957         case IORING_OP_READV:
5958         case IORING_OP_READ_FIXED:
5959         case IORING_OP_READ:
5960                 ret = io_read(req, issue_flags);
5961                 break;
5962         case IORING_OP_WRITEV:
5963         case IORING_OP_WRITE_FIXED:
5964         case IORING_OP_WRITE:
5965                 ret = io_write(req, issue_flags);
5966                 break;
5967         case IORING_OP_FSYNC:
5968                 ret = io_fsync(req, issue_flags);
5969                 break;
5970         case IORING_OP_POLL_ADD:
5971                 ret = io_poll_add(req, issue_flags);
5972                 break;
5973         case IORING_OP_POLL_REMOVE:
5974                 ret = io_poll_remove(req, issue_flags);
5975                 break;
5976         case IORING_OP_SYNC_FILE_RANGE:
5977                 ret = io_sync_file_range(req, issue_flags);
5978                 break;
5979         case IORING_OP_SENDMSG:
5980                 ret = io_sendmsg(req, issue_flags);
5981                 break;
5982         case IORING_OP_SEND:
5983                 ret = io_send(req, issue_flags);
5984                 break;
5985         case IORING_OP_RECVMSG:
5986                 ret = io_recvmsg(req, issue_flags);
5987                 break;
5988         case IORING_OP_RECV:
5989                 ret = io_recv(req, issue_flags);
5990                 break;
5991         case IORING_OP_TIMEOUT:
5992                 ret = io_timeout(req, issue_flags);
5993                 break;
5994         case IORING_OP_TIMEOUT_REMOVE:
5995                 ret = io_timeout_remove(req, issue_flags);
5996                 break;
5997         case IORING_OP_ACCEPT:
5998                 ret = io_accept(req, issue_flags);
5999                 break;
6000         case IORING_OP_CONNECT:
6001                 ret = io_connect(req, issue_flags);
6002                 break;
6003         case IORING_OP_ASYNC_CANCEL:
6004                 ret = io_async_cancel(req, issue_flags);
6005                 break;
6006         case IORING_OP_FALLOCATE:
6007                 ret = io_fallocate(req, issue_flags);
6008                 break;
6009         case IORING_OP_OPENAT:
6010                 ret = io_openat(req, issue_flags);
6011                 break;
6012         case IORING_OP_CLOSE:
6013                 ret = io_close(req, issue_flags);
6014                 break;
6015         case IORING_OP_FILES_UPDATE:
6016                 ret = io_files_update(req, issue_flags);
6017                 break;
6018         case IORING_OP_STATX:
6019                 ret = io_statx(req, issue_flags);
6020                 break;
6021         case IORING_OP_FADVISE:
6022                 ret = io_fadvise(req, issue_flags);
6023                 break;
6024         case IORING_OP_MADVISE:
6025                 ret = io_madvise(req, issue_flags);
6026                 break;
6027         case IORING_OP_OPENAT2:
6028                 ret = io_openat2(req, issue_flags);
6029                 break;
6030         case IORING_OP_EPOLL_CTL:
6031                 ret = io_epoll_ctl(req, issue_flags);
6032                 break;
6033         case IORING_OP_SPLICE:
6034                 ret = io_splice(req, issue_flags);
6035                 break;
6036         case IORING_OP_PROVIDE_BUFFERS:
6037                 ret = io_provide_buffers(req, issue_flags);
6038                 break;
6039         case IORING_OP_REMOVE_BUFFERS:
6040                 ret = io_remove_buffers(req, issue_flags);
6041                 break;
6042         case IORING_OP_TEE:
6043                 ret = io_tee(req, issue_flags);
6044                 break;
6045         case IORING_OP_SHUTDOWN:
6046                 ret = io_shutdown(req, issue_flags);
6047                 break;
6048         case IORING_OP_RENAMEAT:
6049                 ret = io_renameat(req, issue_flags);
6050                 break;
6051         case IORING_OP_UNLINKAT:
6052                 ret = io_unlinkat(req, issue_flags);
6053                 break;
6054         default:
6055                 ret = -EINVAL;
6056                 break;
6057         }
6058
6059         if (ret)
6060                 return ret;
6061
6062         /* If the op doesn't have a file, we're not polling for it */
6063         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
6064                 const bool in_async = io_wq_current_is_worker();
6065
6066                 /* workqueue context doesn't hold uring_lock, grab it now */
6067                 if (in_async)
6068                         mutex_lock(&ctx->uring_lock);
6069
6070                 io_iopoll_req_issued(req, in_async);
6071
6072                 if (in_async)
6073                         mutex_unlock(&ctx->uring_lock);
6074         }
6075
6076         return 0;
6077 }
6078
6079 static void io_wq_submit_work(struct io_wq_work *work)
6080 {
6081         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6082         struct io_kiocb *timeout;
6083         int ret = 0;
6084
6085         timeout = io_prep_linked_timeout(req);
6086         if (timeout)
6087                 io_queue_linked_timeout(timeout);
6088
6089         if (work->flags & IO_WQ_WORK_CANCEL)
6090                 ret = -ECANCELED;
6091
6092         if (!ret) {
6093                 do {
6094                         ret = io_issue_sqe(req, 0);
6095                         /*
6096                          * We can get EAGAIN for polled IO even though we're
6097                          * forcing a sync submission from here, since we can't
6098                          * wait for request slots on the block side.
6099                          */
6100                         if (ret != -EAGAIN)
6101                                 break;
6102                         cond_resched();
6103                 } while (1);
6104         }
6105
6106         /* avoid locking problems by failing it from a clean context */
6107         if (ret) {
6108                 /* io-wq is going to take one down */
6109                 refcount_inc(&req->refs);
6110                 io_req_task_queue_fail(req, ret);
6111         }
6112 }
6113
6114 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6115                                               int index)
6116 {
6117         struct fixed_rsrc_table *table;
6118
6119         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
6120         return table->files[index & IORING_FILE_TABLE_MASK];
6121 }
6122
6123 static struct file *io_file_get(struct io_submit_state *state,
6124                                 struct io_kiocb *req, int fd, bool fixed)
6125 {
6126         struct io_ring_ctx *ctx = req->ctx;
6127         struct file *file;
6128
6129         if (fixed) {
6130                 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6131                         return NULL;
6132                 fd = array_index_nospec(fd, ctx->nr_user_files);
6133                 file = io_file_from_index(ctx, fd);
6134                 io_set_resource_node(req);
6135         } else {
6136                 trace_io_uring_file_get(ctx, fd);
6137                 file = __io_file_get(state, fd);
6138         }
6139
6140         if (file && unlikely(file->f_op == &io_uring_fops))
6141                 io_req_track_inflight(req);
6142         return file;
6143 }
6144
6145 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6146 {
6147         struct io_timeout_data *data = container_of(timer,
6148                                                 struct io_timeout_data, timer);
6149         struct io_kiocb *prev, *req = data->req;
6150         struct io_ring_ctx *ctx = req->ctx;
6151         unsigned long flags;
6152
6153         spin_lock_irqsave(&ctx->completion_lock, flags);
6154         prev = req->timeout.head;
6155         req->timeout.head = NULL;
6156
6157         /*
6158          * We don't expect the list to be empty, that will only happen if we
6159          * race with the completion of the linked work.
6160          */
6161         if (prev && refcount_inc_not_zero(&prev->refs))
6162                 io_remove_next_linked(prev);
6163         else
6164                 prev = NULL;
6165         spin_unlock_irqrestore(&ctx->completion_lock, flags);
6166
6167         if (prev) {
6168                 req_set_fail_links(prev);
6169                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
6170                 io_put_req_deferred(prev, 1);
6171         } else {
6172                 io_req_complete_post(req, -ETIME, 0);
6173                 io_put_req_deferred(req, 1);
6174         }
6175         return HRTIMER_NORESTART;
6176 }
6177
6178 static void __io_queue_linked_timeout(struct io_kiocb *req)
6179 {
6180         /*
6181          * If the back reference is NULL, then our linked request finished
6182          * before we got a chance to setup the timer
6183          */
6184         if (req->timeout.head) {
6185                 struct io_timeout_data *data = req->async_data;
6186
6187                 data->timer.function = io_link_timeout_fn;
6188                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6189                                 data->mode);
6190         }
6191 }
6192
6193 static void io_queue_linked_timeout(struct io_kiocb *req)
6194 {
6195         struct io_ring_ctx *ctx = req->ctx;
6196
6197         spin_lock_irq(&ctx->completion_lock);
6198         __io_queue_linked_timeout(req);
6199         spin_unlock_irq(&ctx->completion_lock);
6200
6201         /* drop submission reference */
6202         io_put_req(req);
6203 }
6204
6205 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
6206 {
6207         struct io_kiocb *nxt = req->link;
6208
6209         if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
6210             nxt->opcode != IORING_OP_LINK_TIMEOUT)
6211                 return NULL;
6212
6213         nxt->timeout.head = req;
6214         nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
6215         req->flags |= REQ_F_LINK_TIMEOUT;
6216         return nxt;
6217 }
6218
6219 static void __io_queue_sqe(struct io_kiocb *req)
6220 {
6221         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6222         const struct cred *old_creds = NULL;
6223         int ret;
6224
6225         if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
6226             req->work.creds != current_cred())
6227                 old_creds = override_creds(req->work.creds);
6228
6229         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
6230
6231         if (old_creds)
6232                 revert_creds(old_creds);
6233
6234         /*
6235          * We async punt it if the file wasn't marked NOWAIT, or if the file
6236          * doesn't support non-blocking read/write attempts
6237          */
6238         if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
6239                 if (!io_arm_poll_handler(req)) {
6240                         /*
6241                          * Queued up for async execution, worker will release
6242                          * submit reference when the iocb is actually submitted.
6243                          */
6244                         io_queue_async_work(req);
6245                 }
6246         } else if (likely(!ret)) {
6247                 /* drop submission reference */
6248                 if (req->flags & REQ_F_COMPLETE_INLINE) {
6249                         struct io_ring_ctx *ctx = req->ctx;
6250                         struct io_comp_state *cs = &ctx->submit_state.comp;
6251
6252                         cs->reqs[cs->nr++] = req;
6253                         if (cs->nr == ARRAY_SIZE(cs->reqs))
6254                                 io_submit_flush_completions(cs, ctx);
6255                 } else {
6256                         io_put_req(req);
6257                 }
6258         } else {
6259                 req_set_fail_links(req);
6260                 io_put_req(req);
6261                 io_req_complete(req, ret);
6262         }
6263         if (linked_timeout)
6264                 io_queue_linked_timeout(linked_timeout);
6265 }
6266
6267 static void io_queue_sqe(struct io_kiocb *req)
6268 {
6269         int ret;
6270
6271         ret = io_req_defer(req);
6272         if (ret) {
6273                 if (ret != -EIOCBQUEUED) {
6274 fail_req:
6275                         req_set_fail_links(req);
6276                         io_put_req(req);
6277                         io_req_complete(req, ret);
6278                 }
6279         } else if (req->flags & REQ_F_FORCE_ASYNC) {
6280                 ret = io_req_defer_prep(req);
6281                 if (unlikely(ret))
6282                         goto fail_req;
6283                 io_queue_async_work(req);
6284         } else {
6285                 __io_queue_sqe(req);
6286         }
6287 }
6288
6289 /*
6290  * Check SQE restrictions (opcode and flags).
6291  *
6292  * Returns 'true' if SQE is allowed, 'false' otherwise.
6293  */
6294 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6295                                         struct io_kiocb *req,
6296                                         unsigned int sqe_flags)
6297 {
6298         if (!ctx->restricted)
6299                 return true;
6300
6301         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6302                 return false;
6303
6304         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6305             ctx->restrictions.sqe_flags_required)
6306                 return false;
6307
6308         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6309                           ctx->restrictions.sqe_flags_required))
6310                 return false;
6311
6312         return true;
6313 }
6314
6315 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6316                        const struct io_uring_sqe *sqe)
6317 {
6318         struct io_submit_state *state;
6319         unsigned int sqe_flags;
6320         int id, ret = 0;
6321
6322         req->opcode = READ_ONCE(sqe->opcode);
6323         /* same numerical values with corresponding REQ_F_*, safe to copy */
6324         req->flags = sqe_flags = READ_ONCE(sqe->flags);
6325         req->user_data = READ_ONCE(sqe->user_data);
6326         req->async_data = NULL;
6327         req->file = NULL;
6328         req->ctx = ctx;
6329         req->link = NULL;
6330         req->fixed_rsrc_refs = NULL;
6331         /* one is dropped after submission, the other at completion */
6332         refcount_set(&req->refs, 2);
6333         req->task = current;
6334         req->result = 0;
6335
6336         /* enforce forwards compatibility on users */
6337         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
6338                 req->flags = 0;
6339                 return -EINVAL;
6340         }
6341
6342         if (unlikely(req->opcode >= IORING_OP_LAST))
6343                 return -EINVAL;
6344
6345         if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6346                 return -EACCES;
6347
6348         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6349             !io_op_defs[req->opcode].buffer_select)
6350                 return -EOPNOTSUPP;
6351
6352         id = READ_ONCE(sqe->personality);
6353         if (id) {
6354                 __io_req_init_async(req);
6355                 req->work.creds = idr_find(&ctx->personality_idr, id);
6356                 if (unlikely(!req->work.creds))
6357                         return -EINVAL;
6358                 get_cred(req->work.creds);
6359         }
6360
6361         state = &ctx->submit_state;
6362
6363         /*
6364          * Plug now if we have more than 1 IO left after this, and the target
6365          * is potentially a read/write to block based storage.
6366          */
6367         if (!state->plug_started && state->ios_left > 1 &&
6368             io_op_defs[req->opcode].plug) {
6369                 blk_start_plug(&state->plug);
6370                 state->plug_started = true;
6371         }
6372
6373         if (io_op_defs[req->opcode].needs_file) {
6374                 bool fixed = req->flags & REQ_F_FIXED_FILE;
6375
6376                 req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
6377                 if (unlikely(!req->file))
6378                         ret = -EBADF;
6379         }
6380
6381         state->ios_left--;
6382         return ret;
6383 }
6384
6385 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
6386                          const struct io_uring_sqe *sqe)
6387 {
6388         struct io_submit_link *link = &ctx->submit_state.link;
6389         int ret;
6390
6391         ret = io_init_req(ctx, req, sqe);
6392         if (unlikely(ret)) {
6393 fail_req:
6394                 io_put_req(req);
6395                 io_req_complete(req, ret);
6396                 if (link->head) {
6397                         /* fail even hard links since we don't submit */
6398                         link->head->flags |= REQ_F_FAIL_LINK;
6399                         io_put_req(link->head);
6400                         io_req_complete(link->head, -ECANCELED);
6401                         link->head = NULL;
6402                 }
6403                 return ret;
6404         }
6405         ret = io_req_prep(req, sqe);
6406         if (unlikely(ret))
6407                 goto fail_req;
6408
6409         /* don't need @sqe from now on */
6410         trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6411                                 true, ctx->flags & IORING_SETUP_SQPOLL);
6412
6413         /*
6414          * If we already have a head request, queue this one for async
6415          * submittal once the head completes. If we don't have a head but
6416          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6417          * submitted sync once the chain is complete. If none of those
6418          * conditions are true (normal request), then just queue it.
6419          */
6420         if (link->head) {
6421                 struct io_kiocb *head = link->head;
6422
6423                 /*
6424                  * Taking sequential execution of a link, draining both sides
6425                  * of the link also fullfils IOSQE_IO_DRAIN semantics for all
6426                  * requests in the link. So, it drains the head and the
6427                  * next after the link request. The last one is done via
6428                  * drain_next flag to persist the effect across calls.
6429                  */
6430                 if (req->flags & REQ_F_IO_DRAIN) {
6431                         head->flags |= REQ_F_IO_DRAIN;
6432                         ctx->drain_next = 1;
6433                 }
6434                 ret = io_req_defer_prep(req);
6435                 if (unlikely(ret))
6436                         goto fail_req;
6437                 trace_io_uring_link(ctx, req, head);
6438                 link->last->link = req;
6439                 link->last = req;
6440
6441                 /* last request of a link, enqueue the link */
6442                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6443                         io_queue_sqe(head);
6444                         link->head = NULL;
6445                 }
6446         } else {
6447                 if (unlikely(ctx->drain_next)) {
6448                         req->flags |= REQ_F_IO_DRAIN;
6449                         ctx->drain_next = 0;
6450                 }
6451                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6452                         link->head = req;
6453                         link->last = req;
6454                 } else {
6455                         io_queue_sqe(req);
6456                 }
6457         }
6458
6459         return 0;
6460 }
6461
6462 /*
6463  * Batched submission is done, ensure local IO is flushed out.
6464  */
6465 static void io_submit_state_end(struct io_submit_state *state,
6466                                 struct io_ring_ctx *ctx)
6467 {
6468         if (state->link.head)
6469                 io_queue_sqe(state->link.head);
6470         if (state->comp.nr)
6471                 io_submit_flush_completions(&state->comp, ctx);
6472         if (state->plug_started)
6473                 blk_finish_plug(&state->plug);
6474         io_state_file_put(state);
6475 }
6476
6477 /*
6478  * Start submission side cache.
6479  */
6480 static void io_submit_state_start(struct io_submit_state *state,
6481                                   unsigned int max_ios)
6482 {
6483         state->plug_started = false;
6484         state->ios_left = max_ios;
6485         /* set only head, no need to init link_last in advance */
6486         state->link.head = NULL;
6487 }
6488
6489 static void io_commit_sqring(struct io_ring_ctx *ctx)
6490 {
6491         struct io_rings *rings = ctx->rings;
6492
6493         /*
6494          * Ensure any loads from the SQEs are done at this point,
6495          * since once we write the new head, the application could
6496          * write new data to them.
6497          */
6498         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6499 }
6500
6501 /*
6502  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
6503  * that is mapped by userspace. This means that care needs to be taken to
6504  * ensure that reads are stable, as we cannot rely on userspace always
6505  * being a good citizen. If members of the sqe are validated and then later
6506  * used, it's important that those reads are done through READ_ONCE() to
6507  * prevent a re-load down the line.
6508  */
6509 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6510 {
6511         u32 *sq_array = ctx->sq_array;
6512         unsigned head;
6513
6514         /*
6515          * The cached sq head (or cq tail) serves two purposes:
6516          *
6517          * 1) allows us to batch the cost of updating the user visible
6518          *    head updates.
6519          * 2) allows the kernel side to track the head on its own, even
6520          *    though the application is the one updating it.
6521          */
6522         head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
6523         if (likely(head < ctx->sq_entries))
6524                 return &ctx->sq_sqes[head];
6525
6526         /* drop invalid entries */
6527         ctx->cached_sq_dropped++;
6528         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6529         return NULL;
6530 }
6531
6532 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6533 {
6534         int submitted = 0;
6535
6536         /* if we have a backlog and couldn't flush it all, return BUSY */
6537         if (test_bit(0, &ctx->sq_check_overflow)) {
6538                 if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL))
6539                         return -EBUSY;
6540         }
6541
6542         /* make sure SQ entry isn't read before tail */
6543         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6544
6545         if (!percpu_ref_tryget_many(&ctx->refs, nr))
6546                 return -EAGAIN;
6547
6548         percpu_counter_add(&current->io_uring->inflight, nr);
6549         refcount_add(nr, &current->usage);
6550         io_submit_state_start(&ctx->submit_state, nr);
6551
6552         while (submitted < nr) {
6553                 const struct io_uring_sqe *sqe;
6554                 struct io_kiocb *req;
6555
6556                 req = io_alloc_req(ctx);
6557                 if (unlikely(!req)) {
6558                         if (!submitted)
6559                                 submitted = -EAGAIN;
6560                         break;
6561                 }
6562                 sqe = io_get_sqe(ctx);
6563                 if (unlikely(!sqe)) {
6564                         kmem_cache_free(req_cachep, req);
6565                         break;
6566                 }
6567                 /* will complete beyond this point, count as submitted */
6568                 submitted++;
6569                 if (io_submit_sqe(ctx, req, sqe))
6570                         break;
6571         }
6572
6573         if (unlikely(submitted != nr)) {
6574                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6575                 struct io_uring_task *tctx = current->io_uring;
6576                 int unused = nr - ref_used;
6577
6578                 percpu_ref_put_many(&ctx->refs, unused);
6579                 percpu_counter_sub(&tctx->inflight, unused);
6580                 put_task_struct_many(current, unused);
6581         }
6582
6583         io_submit_state_end(&ctx->submit_state, ctx);
6584          /* Commit SQ ring head once we've consumed and submitted all SQEs */
6585         io_commit_sqring(ctx);
6586
6587         return submitted;
6588 }
6589
6590 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6591 {
6592         /* Tell userspace we may need a wakeup call */
6593         spin_lock_irq(&ctx->completion_lock);
6594         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6595         spin_unlock_irq(&ctx->completion_lock);
6596 }
6597
6598 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6599 {
6600         spin_lock_irq(&ctx->completion_lock);
6601         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6602         spin_unlock_irq(&ctx->completion_lock);
6603 }
6604
6605 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6606 {
6607         unsigned int to_submit;
6608         int ret = 0;
6609
6610         to_submit = io_sqring_entries(ctx);
6611         /* if we're handling multiple rings, cap submit size for fairness */
6612         if (cap_entries && to_submit > 8)
6613                 to_submit = 8;
6614
6615         if (!list_empty(&ctx->iopoll_list) || to_submit) {
6616                 unsigned nr_events = 0;
6617
6618                 mutex_lock(&ctx->uring_lock);
6619                 if (!list_empty(&ctx->iopoll_list))
6620                         io_do_iopoll(ctx, &nr_events, 0);
6621
6622                 if (to_submit && !ctx->sqo_dead &&
6623                     likely(!percpu_ref_is_dying(&ctx->refs)))
6624                         ret = io_submit_sqes(ctx, to_submit);
6625                 mutex_unlock(&ctx->uring_lock);
6626         }
6627
6628         if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
6629                 wake_up(&ctx->sqo_sq_wait);
6630
6631         return ret;
6632 }
6633
6634 static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
6635 {
6636         struct io_ring_ctx *ctx;
6637         unsigned sq_thread_idle = 0;
6638
6639         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6640                 if (sq_thread_idle < ctx->sq_thread_idle)
6641                         sq_thread_idle = ctx->sq_thread_idle;
6642         }
6643
6644         sqd->sq_thread_idle = sq_thread_idle;
6645 }
6646
6647 static void io_sqd_init_new(struct io_sq_data *sqd)
6648 {
6649         struct io_ring_ctx *ctx;
6650
6651         while (!list_empty(&sqd->ctx_new_list)) {
6652                 ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
6653                 list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
6654                 complete(&ctx->sq_thread_comp);
6655         }
6656
6657         io_sqd_update_thread_idle(sqd);
6658 }
6659
6660 static bool io_sq_thread_should_stop(struct io_sq_data *sqd)
6661 {
6662         return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
6663 }
6664
6665 static bool io_sq_thread_should_park(struct io_sq_data *sqd)
6666 {
6667         return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
6668 }
6669
6670 static void io_sq_thread_parkme(struct io_sq_data *sqd)
6671 {
6672         for (;;) {
6673                 /*
6674                  * TASK_PARKED is a special state; we must serialize against
6675                  * possible pending wakeups to avoid store-store collisions on
6676                  * task->state.
6677                  *
6678                  * Such a collision might possibly result in the task state
6679                  * changin from TASK_PARKED and us failing the
6680                  * wait_task_inactive() in kthread_park().
6681                  */
6682                 set_special_state(TASK_PARKED);
6683                 if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state))
6684                         break;
6685
6686                 /*
6687                  * Thread is going to call schedule(), do not preempt it,
6688                  * or the caller of kthread_park() may spend more time in
6689                  * wait_task_inactive().
6690                  */
6691                 preempt_disable();
6692                 complete(&sqd->completion);
6693                 schedule_preempt_disabled();
6694                 preempt_enable();
6695         }
6696         __set_current_state(TASK_RUNNING);
6697 }
6698
6699 static int io_sq_thread(void *data)
6700 {
6701         struct io_sq_data *sqd = data;
6702         struct io_ring_ctx *ctx;
6703         unsigned long timeout = 0;
6704         char buf[TASK_COMM_LEN];
6705         DEFINE_WAIT(wait);
6706
6707         sprintf(buf, "iou-sqp-%d", sqd->task_pid);
6708         set_task_comm(current, buf);
6709         sqd->thread = current;
6710         current->pf_io_worker = NULL;
6711
6712         if (sqd->sq_cpu != -1)
6713                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
6714         else
6715                 set_cpus_allowed_ptr(current, cpu_online_mask);
6716         current->flags |= PF_NO_SETAFFINITY;
6717
6718         complete(&sqd->completion);
6719
6720         wait_for_completion(&sqd->startup);
6721
6722         while (!io_sq_thread_should_stop(sqd)) {
6723                 int ret;
6724                 bool cap_entries, sqt_spin, needs_sched;
6725
6726                 /*
6727                  * Any changes to the sqd lists are synchronized through the
6728                  * thread parking. This synchronizes the thread vs users,
6729                  * the users are synchronized on the sqd->ctx_lock.
6730                  */
6731                 if (io_sq_thread_should_park(sqd)) {
6732                         io_sq_thread_parkme(sqd);
6733                         continue;
6734                 }
6735                 if (unlikely(!list_empty(&sqd->ctx_new_list))) {
6736                         io_sqd_init_new(sqd);
6737                         timeout = jiffies + sqd->sq_thread_idle;
6738                 }
6739                 if (fatal_signal_pending(current))
6740                         break;
6741                 sqt_spin = false;
6742                 cap_entries = !list_is_singular(&sqd->ctx_list);
6743                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6744                         ret = __io_sq_thread(ctx, cap_entries);
6745                         if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
6746                                 sqt_spin = true;
6747                 }
6748
6749                 if (sqt_spin || !time_after(jiffies, timeout)) {
6750                         io_run_task_work();
6751                         cond_resched();
6752                         if (sqt_spin)
6753                                 timeout = jiffies + sqd->sq_thread_idle;
6754                         continue;
6755                 }
6756
6757                 needs_sched = true;
6758                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
6759                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6760                         if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6761                             !list_empty_careful(&ctx->iopoll_list)) {
6762                                 needs_sched = false;
6763                                 break;
6764                         }
6765                         if (io_sqring_entries(ctx)) {
6766                                 needs_sched = false;
6767                                 break;
6768                         }
6769                 }
6770
6771                 if (needs_sched && !io_sq_thread_should_park(sqd)) {
6772                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6773                                 io_ring_set_wakeup_flag(ctx);
6774
6775                         schedule();
6776                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6777                                 io_ring_clear_wakeup_flag(ctx);
6778                 }
6779
6780                 finish_wait(&sqd->wait, &wait);
6781                 timeout = jiffies + sqd->sq_thread_idle;
6782         }
6783
6784         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6785                 io_uring_cancel_sqpoll(ctx);
6786
6787         io_run_task_work();
6788
6789         /*
6790          * Clear thread under lock so that concurrent parks work correctly
6791          */
6792         complete_all(&sqd->completion);
6793         mutex_lock(&sqd->lock);
6794         sqd->thread = NULL;
6795         mutex_unlock(&sqd->lock);
6796
6797         complete(&sqd->exited);
6798         do_exit(0);
6799 }
6800
6801 struct io_wait_queue {
6802         struct wait_queue_entry wq;
6803         struct io_ring_ctx *ctx;
6804         unsigned to_wait;
6805         unsigned nr_timeouts;
6806 };
6807
6808 static inline bool io_should_wake(struct io_wait_queue *iowq)
6809 {
6810         struct io_ring_ctx *ctx = iowq->ctx;
6811
6812         /*
6813          * Wake up if we have enough events, or if a timeout occurred since we
6814          * started waiting. For timeouts, we always want to return to userspace,
6815          * regardless of event count.
6816          */
6817         return io_cqring_events(ctx) >= iowq->to_wait ||
6818                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6819 }
6820
6821 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6822                             int wake_flags, void *key)
6823 {
6824         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6825                                                         wq);
6826
6827         /*
6828          * Cannot safely flush overflowed CQEs from here, ensure we wake up
6829          * the task, and the next invocation will do it.
6830          */
6831         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
6832                 return autoremove_wake_function(curr, mode, wake_flags, key);
6833         return -1;
6834 }
6835
6836 static int io_run_task_work_sig(void)
6837 {
6838         if (io_run_task_work())
6839                 return 1;
6840         if (!signal_pending(current))
6841                 return 0;
6842         if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))
6843                 return -ERESTARTSYS;
6844         return -EINTR;
6845 }
6846
6847 /* when returns >0, the caller should retry */
6848 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
6849                                           struct io_wait_queue *iowq,
6850                                           signed long *timeout)
6851 {
6852         int ret;
6853
6854         /* make sure we run task_work before checking for signals */
6855         ret = io_run_task_work_sig();
6856         if (ret || io_should_wake(iowq))
6857                 return ret;
6858         /* let the caller flush overflows, retry */
6859         if (test_bit(0, &ctx->cq_check_overflow))
6860                 return 1;
6861
6862         *timeout = schedule_timeout(*timeout);
6863         return !*timeout ? -ETIME : 1;
6864 }
6865
6866 /*
6867  * Wait until events become available, if we don't already have some. The
6868  * application must reap them itself, as they reside on the shared cq ring.
6869  */
6870 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6871                           const sigset_t __user *sig, size_t sigsz,
6872                           struct __kernel_timespec __user *uts)
6873 {
6874         struct io_wait_queue iowq = {
6875                 .wq = {
6876                         .private        = current,
6877                         .func           = io_wake_function,
6878                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
6879                 },
6880                 .ctx            = ctx,
6881                 .to_wait        = min_events,
6882         };
6883         struct io_rings *rings = ctx->rings;
6884         signed long timeout = MAX_SCHEDULE_TIMEOUT;
6885         int ret;
6886
6887         do {
6888                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
6889                 if (io_cqring_events(ctx) >= min_events)
6890                         return 0;
6891                 if (!io_run_task_work())
6892                         break;
6893         } while (1);
6894
6895         if (sig) {
6896 #ifdef CONFIG_COMPAT
6897                 if (in_compat_syscall())
6898                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
6899                                                       sigsz);
6900                 else
6901 #endif
6902                         ret = set_user_sigmask(sig, sigsz);
6903
6904                 if (ret)
6905                         return ret;
6906         }
6907
6908         if (uts) {
6909                 struct timespec64 ts;
6910
6911                 if (get_timespec64(&ts, uts))
6912                         return -EFAULT;
6913                 timeout = timespec64_to_jiffies(&ts);
6914         }
6915
6916         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
6917         trace_io_uring_cqring_wait(ctx, min_events);
6918         do {
6919                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
6920                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6921                                                 TASK_INTERRUPTIBLE);
6922                 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
6923                 finish_wait(&ctx->wait, &iowq.wq);
6924         } while (ret > 0);
6925
6926         restore_saved_sigmask_unless(ret == -EINTR);
6927
6928         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
6929 }
6930
6931 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6932 {
6933 #if defined(CONFIG_UNIX)
6934         if (ctx->ring_sock) {
6935                 struct sock *sock = ctx->ring_sock->sk;
6936                 struct sk_buff *skb;
6937
6938                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6939                         kfree_skb(skb);
6940         }
6941 #else
6942         int i;
6943
6944         for (i = 0; i < ctx->nr_user_files; i++) {
6945                 struct file *file;
6946
6947                 file = io_file_from_index(ctx, i);
6948                 if (file)
6949                         fput(file);
6950         }
6951 #endif
6952 }
6953
6954 static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
6955 {
6956         struct fixed_rsrc_data *data;
6957
6958         data = container_of(ref, struct fixed_rsrc_data, refs);
6959         complete(&data->done);
6960 }
6961
6962 static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
6963 {
6964         spin_lock_bh(&ctx->rsrc_ref_lock);
6965 }
6966
6967 static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
6968 {
6969         spin_unlock_bh(&ctx->rsrc_ref_lock);
6970 }
6971
6972 static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
6973                                  struct fixed_rsrc_data *rsrc_data,
6974                                  struct fixed_rsrc_ref_node *ref_node)
6975 {
6976         io_rsrc_ref_lock(ctx);
6977         rsrc_data->node = ref_node;
6978         list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
6979         io_rsrc_ref_unlock(ctx);
6980         percpu_ref_get(&rsrc_data->refs);
6981 }
6982
6983 static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data)
6984 {
6985         struct fixed_rsrc_ref_node *ref_node = NULL;
6986
6987         io_rsrc_ref_lock(ctx);
6988         ref_node = data->node;
6989         data->node = NULL;
6990         io_rsrc_ref_unlock(ctx);
6991         if (ref_node)
6992                 percpu_ref_kill(&ref_node->refs);
6993 }
6994
6995 static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
6996                                struct io_ring_ctx *ctx,
6997                                void (*rsrc_put)(struct io_ring_ctx *ctx,
6998                                                 struct io_rsrc_put *prsrc))
6999 {
7000         struct fixed_rsrc_ref_node *backup_node;
7001         int ret;
7002
7003         if (data->quiesce)
7004                 return -ENXIO;
7005
7006         data->quiesce = true;
7007         do {
7008                 ret = -ENOMEM;
7009                 backup_node = alloc_fixed_rsrc_ref_node(ctx);
7010                 if (!backup_node)
7011                         break;
7012                 backup_node->rsrc_data = data;
7013                 backup_node->rsrc_put = rsrc_put;
7014
7015                 io_sqe_rsrc_kill_node(ctx, data);
7016                 percpu_ref_kill(&data->refs);
7017                 flush_delayed_work(&ctx->rsrc_put_work);
7018
7019                 ret = wait_for_completion_interruptible(&data->done);
7020                 if (!ret || !io_refs_resurrect(&data->refs, &data->done))
7021                         break;
7022
7023                 io_sqe_rsrc_set_node(ctx, data, backup_node);
7024                 backup_node = NULL;
7025                 mutex_unlock(&ctx->uring_lock);
7026                 ret = io_run_task_work_sig();
7027                 mutex_lock(&ctx->uring_lock);
7028         } while (ret >= 0);
7029         data->quiesce = false;
7030
7031         if (backup_node)
7032                 destroy_fixed_rsrc_ref_node(backup_node);
7033         return ret;
7034 }
7035
7036 static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
7037 {
7038         struct fixed_rsrc_data *data;
7039
7040         data = kzalloc(sizeof(*data), GFP_KERNEL);
7041         if (!data)
7042                 return NULL;
7043
7044         if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
7045                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
7046                 kfree(data);
7047                 return NULL;
7048         }
7049         data->ctx = ctx;
7050         init_completion(&data->done);
7051         return data;
7052 }
7053
7054 static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
7055 {
7056         percpu_ref_exit(&data->refs);
7057         kfree(data->table);
7058         kfree(data);
7059 }
7060
7061 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7062 {
7063         struct fixed_rsrc_data *data = ctx->file_data;
7064         unsigned nr_tables, i;
7065         int ret;
7066
7067         /*
7068          * percpu_ref_is_dying() is to stop parallel files unregister
7069          * Since we possibly drop uring lock later in this function to
7070          * run task work.
7071          */
7072         if (!data || percpu_ref_is_dying(&data->refs))
7073                 return -ENXIO;
7074         ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put);
7075         if (ret)
7076                 return ret;
7077
7078         __io_sqe_files_unregister(ctx);
7079         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
7080         for (i = 0; i < nr_tables; i++)
7081                 kfree(data->table[i].files);
7082         free_fixed_rsrc_data(data);
7083         ctx->file_data = NULL;
7084         ctx->nr_user_files = 0;
7085         return 0;
7086 }
7087
7088 static void io_sq_thread_unpark(struct io_sq_data *sqd)
7089         __releases(&sqd->lock)
7090 {
7091         if (!sqd->thread)
7092                 return;
7093         if (sqd->thread == current)
7094                 return;
7095         clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7096         wake_up_state(sqd->thread, TASK_PARKED);
7097         mutex_unlock(&sqd->lock);
7098 }
7099
7100 static bool io_sq_thread_park(struct io_sq_data *sqd)
7101         __acquires(&sqd->lock)
7102 {
7103         if (sqd->thread == current)
7104                 return true;
7105         mutex_lock(&sqd->lock);
7106         if (!sqd->thread) {
7107                 mutex_unlock(&sqd->lock);
7108                 return false;
7109         }
7110         set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7111         wake_up_process(sqd->thread);
7112         wait_for_completion(&sqd->completion);
7113         return true;
7114 }
7115
7116 static void io_sq_thread_stop(struct io_sq_data *sqd)
7117 {
7118         if (!sqd->thread)
7119                 return;
7120
7121         set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7122         WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state));
7123         wake_up_process(sqd->thread);
7124         wait_for_completion(&sqd->exited);
7125 }
7126
7127 static void io_put_sq_data(struct io_sq_data *sqd)
7128 {
7129         if (refcount_dec_and_test(&sqd->refs)) {
7130                 io_sq_thread_stop(sqd);
7131                 kfree(sqd);
7132         }
7133 }
7134
7135 static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7136 {
7137         struct io_sq_data *sqd = ctx->sq_data;
7138
7139         if (sqd) {
7140                 if (sqd->thread) {
7141                         wait_for_completion(&ctx->sq_thread_comp);
7142                         io_sq_thread_park(sqd);
7143                 }
7144
7145                 mutex_lock(&sqd->ctx_lock);
7146                 list_del(&ctx->sqd_list);
7147                 io_sqd_update_thread_idle(sqd);
7148                 mutex_unlock(&sqd->ctx_lock);
7149
7150                 if (sqd->thread)
7151                         io_sq_thread_unpark(sqd);
7152
7153                 io_put_sq_data(sqd);
7154                 ctx->sq_data = NULL;
7155         }
7156 }
7157
7158 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7159 {
7160         struct io_ring_ctx *ctx_attach;
7161         struct io_sq_data *sqd;
7162         struct fd f;
7163
7164         f = fdget(p->wq_fd);
7165         if (!f.file)
7166                 return ERR_PTR(-ENXIO);
7167         if (f.file->f_op != &io_uring_fops) {
7168                 fdput(f);
7169                 return ERR_PTR(-EINVAL);
7170         }
7171
7172         ctx_attach = f.file->private_data;
7173         sqd = ctx_attach->sq_data;
7174         if (!sqd) {
7175                 fdput(f);
7176                 return ERR_PTR(-EINVAL);
7177         }
7178
7179         refcount_inc(&sqd->refs);
7180         fdput(f);
7181         return sqd;
7182 }
7183
7184 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
7185 {
7186         struct io_sq_data *sqd;
7187
7188         if (p->flags & IORING_SETUP_ATTACH_WQ)
7189                 return io_attach_sq_data(p);
7190
7191         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7192         if (!sqd)
7193                 return ERR_PTR(-ENOMEM);
7194
7195         refcount_set(&sqd->refs, 1);
7196         INIT_LIST_HEAD(&sqd->ctx_list);
7197         INIT_LIST_HEAD(&sqd->ctx_new_list);
7198         mutex_init(&sqd->ctx_lock);
7199         mutex_init(&sqd->lock);
7200         init_waitqueue_head(&sqd->wait);
7201         init_completion(&sqd->startup);
7202         init_completion(&sqd->completion);
7203         init_completion(&sqd->exited);
7204         return sqd;
7205 }
7206
7207 #if defined(CONFIG_UNIX)
7208 /*
7209  * Ensure the UNIX gc is aware of our file set, so we are certain that
7210  * the io_uring can be safely unregistered on process exit, even if we have
7211  * loops in the file referencing.
7212  */
7213 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7214 {
7215         struct sock *sk = ctx->ring_sock->sk;
7216         struct scm_fp_list *fpl;
7217         struct sk_buff *skb;
7218         int i, nr_files;
7219
7220         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7221         if (!fpl)
7222                 return -ENOMEM;
7223
7224         skb = alloc_skb(0, GFP_KERNEL);
7225         if (!skb) {
7226                 kfree(fpl);
7227                 return -ENOMEM;
7228         }
7229
7230         skb->sk = sk;
7231
7232         nr_files = 0;
7233         fpl->user = get_uid(ctx->user);
7234         for (i = 0; i < nr; i++) {
7235                 struct file *file = io_file_from_index(ctx, i + offset);
7236
7237                 if (!file)
7238                         continue;
7239                 fpl->fp[nr_files] = get_file(file);
7240                 unix_inflight(fpl->user, fpl->fp[nr_files]);
7241                 nr_files++;
7242         }
7243
7244         if (nr_files) {
7245                 fpl->max = SCM_MAX_FD;
7246                 fpl->count = nr_files;
7247                 UNIXCB(skb).fp = fpl;
7248                 skb->destructor = unix_destruct_scm;
7249                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7250                 skb_queue_head(&sk->sk_receive_queue, skb);
7251
7252                 for (i = 0; i < nr_files; i++)
7253                         fput(fpl->fp[i]);
7254         } else {
7255                 kfree_skb(skb);
7256                 kfree(fpl);
7257         }
7258
7259         return 0;
7260 }
7261
7262 /*
7263  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7264  * causes regular reference counting to break down. We rely on the UNIX
7265  * garbage collection to take care of this problem for us.
7266  */
7267 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7268 {
7269         unsigned left, total;
7270         int ret = 0;
7271
7272         total = 0;
7273         left = ctx->nr_user_files;
7274         while (left) {
7275                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
7276
7277                 ret = __io_sqe_files_scm(ctx, this_files, total);
7278                 if (ret)
7279                         break;
7280                 left -= this_files;
7281                 total += this_files;
7282         }
7283
7284         if (!ret)
7285                 return 0;
7286
7287         while (total < ctx->nr_user_files) {
7288                 struct file *file = io_file_from_index(ctx, total);
7289
7290                 if (file)
7291                         fput(file);
7292                 total++;
7293         }
7294
7295         return ret;
7296 }
7297 #else
/* Without CONFIG_UNIX there is nothing to account; always succeeds. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	return 0;
}
7302 #endif
7303
7304 static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
7305                                     unsigned nr_tables, unsigned nr_files)
7306 {
7307         int i;
7308
7309         for (i = 0; i < nr_tables; i++) {
7310                 struct fixed_rsrc_table *table = &file_data->table[i];
7311                 unsigned this_files;
7312
7313                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7314                 table->files = kcalloc(this_files, sizeof(struct file *),
7315                                         GFP_KERNEL);
7316                 if (!table->files)
7317                         break;
7318                 nr_files -= this_files;
7319         }
7320
7321         if (i == nr_tables)
7322                 return 0;
7323
7324         for (i = 0; i < nr_tables; i++) {
7325                 struct fixed_rsrc_table *table = &file_data->table[i];
7326                 kfree(table->files);
7327         }
7328         return 1;
7329 }
7330
7331 static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
7332 {
7333         struct file *file = prsrc->file;
7334 #if defined(CONFIG_UNIX)
7335         struct sock *sock = ctx->ring_sock->sk;
7336         struct sk_buff_head list, *head = &sock->sk_receive_queue;
7337         struct sk_buff *skb;
7338         int i;
7339
7340         __skb_queue_head_init(&list);
7341
7342         /*
7343          * Find the skb that holds this file in its SCM_RIGHTS. When found,
7344          * remove this entry and rearrange the file array.
7345          */
7346         skb = skb_dequeue(head);
7347         while (skb) {
7348                 struct scm_fp_list *fp;
7349
7350                 fp = UNIXCB(skb).fp;
7351                 for (i = 0; i < fp->count; i++) {
7352                         int left;
7353
7354                         if (fp->fp[i] != file)
7355                                 continue;
7356
7357                         unix_notinflight(fp->user, fp->fp[i]);
7358                         left = fp->count - 1 - i;
7359                         if (left) {
7360                                 memmove(&fp->fp[i], &fp->fp[i + 1],
7361                                                 left * sizeof(struct file *));
7362                         }
7363                         fp->count--;
7364                         if (!fp->count) {
7365                                 kfree_skb(skb);
7366                                 skb = NULL;
7367                         } else {
7368                                 __skb_queue_tail(&list, skb);
7369                         }
7370                         fput(file);
7371                         file = NULL;
7372                         break;
7373                 }
7374
7375                 if (!file)
7376                         break;
7377
7378                 __skb_queue_tail(&list, skb);
7379
7380                 skb = skb_dequeue(head);
7381         }
7382
7383         if (skb_peek(&list)) {
7384                 spin_lock_irq(&head->lock);
7385                 while ((skb = __skb_dequeue(&list)) != NULL)
7386                         __skb_queue_tail(head, skb);
7387                 spin_unlock_irq(&head->lock);
7388         }
7389 #else
7390         fput(file);
7391 #endif
7392 }
7393
7394 static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
7395 {
7396         struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
7397         struct io_ring_ctx *ctx = rsrc_data->ctx;
7398         struct io_rsrc_put *prsrc, *tmp;
7399
7400         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
7401                 list_del(&prsrc->list);
7402                 ref_node->rsrc_put(ctx, prsrc);
7403                 kfree(prsrc);
7404         }
7405
7406         percpu_ref_exit(&ref_node->refs);
7407         kfree(ref_node);
7408         percpu_ref_put(&rsrc_data->refs);
7409 }
7410
7411 static void io_rsrc_put_work(struct work_struct *work)
7412 {
7413         struct io_ring_ctx *ctx;
7414         struct llist_node *node;
7415
7416         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7417         node = llist_del_all(&ctx->rsrc_put_llist);
7418
7419         while (node) {
7420                 struct fixed_rsrc_ref_node *ref_node;
7421                 struct llist_node *next = node->next;
7422
7423                 ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
7424                 __io_rsrc_put_work(ref_node);
7425                 node = next;
7426         }
7427 }
7428
7429 static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
7430                                         unsigned i)
7431 {
7432         struct fixed_rsrc_table *table;
7433
7434         table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7435         return &table->files[i & IORING_FILE_TABLE_MASK];
7436 }
7437
7438 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7439 {
7440         struct fixed_rsrc_ref_node *ref_node;
7441         struct fixed_rsrc_data *data;
7442         struct io_ring_ctx *ctx;
7443         bool first_add = false;
7444         int delay = HZ;
7445
7446         ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
7447         data = ref_node->rsrc_data;
7448         ctx = data->ctx;
7449
7450         io_rsrc_ref_lock(ctx);
7451         ref_node->done = true;
7452
7453         while (!list_empty(&ctx->rsrc_ref_list)) {
7454                 ref_node = list_first_entry(&ctx->rsrc_ref_list,
7455                                         struct fixed_rsrc_ref_node, node);
7456                 /* recycle ref nodes in order */
7457                 if (!ref_node->done)
7458                         break;
7459                 list_del(&ref_node->node);
7460                 first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
7461         }
7462         io_rsrc_ref_unlock(ctx);
7463
7464         if (percpu_ref_is_dying(&data->refs))
7465                 delay = 0;
7466
7467         if (!delay)
7468                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
7469         else if (first_add)
7470                 queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
7471 }
7472
7473 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
7474                         struct io_ring_ctx *ctx)
7475 {
7476         struct fixed_rsrc_ref_node *ref_node;
7477
7478         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7479         if (!ref_node)
7480                 return NULL;
7481
7482         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7483                             0, GFP_KERNEL)) {
7484                 kfree(ref_node);
7485                 return NULL;
7486         }
7487         INIT_LIST_HEAD(&ref_node->node);
7488         INIT_LIST_HEAD(&ref_node->rsrc_list);
7489         ref_node->done = false;
7490         return ref_node;
7491 }
7492
7493 static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
7494                                      struct fixed_rsrc_ref_node *ref_node)
7495 {
7496         ref_node->rsrc_data = ctx->file_data;
7497         ref_node->rsrc_put = io_ring_file_put;
7498 }
7499
7500 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
7501 {
7502         percpu_ref_exit(&ref_node->refs);
7503         kfree(ref_node);
7504 }
7505
7506
7507 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
7508                                  unsigned nr_args)
7509 {
7510         __s32 __user *fds = (__s32 __user *) arg;
7511         unsigned nr_tables, i;
7512         struct file *file;
7513         int fd, ret = -ENOMEM;
7514         struct fixed_rsrc_ref_node *ref_node;
7515         struct fixed_rsrc_data *file_data;
7516
7517         if (ctx->file_data)
7518                 return -EBUSY;
7519         if (!nr_args)
7520                 return -EINVAL;
7521         if (nr_args > IORING_MAX_FIXED_FILES)
7522                 return -EMFILE;
7523
7524         file_data = alloc_fixed_rsrc_data(ctx);
7525         if (!file_data)
7526                 return -ENOMEM;
7527         ctx->file_data = file_data;
7528
7529         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
7530         file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
7531                                    GFP_KERNEL);
7532         if (!file_data->table)
7533                 goto out_free;
7534
7535         if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
7536                 goto out_free;
7537
7538         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
7539                 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
7540                         ret = -EFAULT;
7541                         goto out_fput;
7542                 }
7543                 /* allow sparse sets */
7544                 if (fd == -1)
7545                         continue;
7546
7547                 file = fget(fd);
7548                 ret = -EBADF;
7549                 if (!file)
7550                         goto out_fput;
7551
7552                 /*
7553                  * Don't allow io_uring instances to be registered. If UNIX
7554                  * isn't enabled, then this causes a reference cycle and this
7555                  * instance can never get freed. If UNIX is enabled we'll
7556                  * handle it just fine, but there's still no point in allowing
7557                  * a ring fd as it doesn't support regular read/write anyway.
7558                  */
7559                 if (file->f_op == &io_uring_fops) {
7560                         fput(file);
7561                         goto out_fput;
7562                 }
7563                 *io_fixed_file_slot(file_data, i) = file;
7564         }
7565
7566         ret = io_sqe_files_scm(ctx);
7567         if (ret) {
7568                 io_sqe_files_unregister(ctx);
7569                 return ret;
7570         }
7571
7572         ref_node = alloc_fixed_rsrc_ref_node(ctx);
7573         if (!ref_node) {
7574                 io_sqe_files_unregister(ctx);
7575                 return -ENOMEM;
7576         }
7577         init_fixed_file_ref_node(ctx, ref_node);
7578
7579         io_sqe_rsrc_set_node(ctx, file_data, ref_node);
7580         return ret;
7581 out_fput:
7582         for (i = 0; i < ctx->nr_user_files; i++) {
7583                 file = io_file_from_index(ctx, i);
7584                 if (file)
7585                         fput(file);
7586         }
7587         for (i = 0; i < nr_tables; i++)
7588                 kfree(file_data->table[i].files);
7589         ctx->nr_user_files = 0;
7590 out_free:
7591         free_fixed_rsrc_data(ctx->file_data);
7592         ctx->file_data = NULL;
7593         return ret;
7594 }
7595
/*
 * Account a single newly registered @file (stored at slot @index).
 *
 * With CONFIG_UNIX, the file is merged into the SCM_RIGHTS fp list of the
 * skb at the head of the ring socket's receive queue when that list still
 * has room. If the queue is empty or the list is full, a new skb is set up
 * through __io_sqe_files_scm(). Without CONFIG_UNIX this returns 0.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	bool merged = false;

	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			/* the skb is off the queue while its list is edited */
			__skb_unlink(skb, head);
			spin_unlock_irq(&head->lock);
			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;
			spin_lock_irq(&head->lock);
			__skb_queue_head(head, skb);
			merged = true;
		}
	}
	spin_unlock_irq(&head->lock);

	if (!merged)
		return __io_sqe_files_scm(ctx, 1, index);

	/* the fp list holds its own reference now */
	fput(file);
	return 0;
#else
	return 0;
#endif
}
7638
7639 static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
7640 {
7641         struct io_rsrc_put *prsrc;
7642         struct fixed_rsrc_ref_node *ref_node = data->node;
7643
7644         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
7645         if (!prsrc)
7646                 return -ENOMEM;
7647
7648         prsrc->rsrc = rsrc;
7649         list_add(&prsrc->list, &ref_node->rsrc_list);
7650
7651         return 0;
7652 }
7653
/* Typed wrapper: queue a fixed file for removal via io_queue_rsrc_removal(). */
static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
                                        struct file *file)
{
        void *rsrc = file;

        return io_queue_rsrc_removal(data, rsrc);
}
7659
7660 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7661                                  struct io_uring_rsrc_update *up,
7662                                  unsigned nr_args)
7663 {
7664         struct fixed_rsrc_data *data = ctx->file_data;
7665         struct fixed_rsrc_ref_node *ref_node;
7666         struct file *file, **file_slot;
7667         __s32 __user *fds;
7668         int fd, i, err;
7669         __u32 done;
7670         bool needs_switch = false;
7671
7672         if (check_add_overflow(up->offset, nr_args, &done))
7673                 return -EOVERFLOW;
7674         if (done > ctx->nr_user_files)
7675                 return -EINVAL;
7676
7677         ref_node = alloc_fixed_rsrc_ref_node(ctx);
7678         if (!ref_node)
7679                 return -ENOMEM;
7680         init_fixed_file_ref_node(ctx, ref_node);
7681
7682         fds = u64_to_user_ptr(up->data);
7683         for (done = 0; done < nr_args; done++) {
7684                 err = 0;
7685                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7686                         err = -EFAULT;
7687                         break;
7688                 }
7689                 if (fd == IORING_REGISTER_FILES_SKIP)
7690                         continue;
7691
7692                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
7693                 file_slot = io_fixed_file_slot(ctx->file_data, i);
7694
7695                 if (*file_slot) {
7696                         err = io_queue_file_removal(data, *file_slot);
7697                         if (err)
7698                                 break;
7699                         *file_slot = NULL;
7700                         needs_switch = true;
7701                 }
7702                 if (fd != -1) {
7703                         file = fget(fd);
7704                         if (!file) {
7705                                 err = -EBADF;
7706                                 break;
7707                         }
7708                         /*
7709                          * Don't allow io_uring instances to be registered. If
7710                          * UNIX isn't enabled, then this causes a reference
7711                          * cycle and this instance can never get freed. If UNIX
7712                          * is enabled we'll handle it just fine, but there's
7713                          * still no point in allowing a ring fd as it doesn't
7714                          * support regular read/write anyway.
7715                          */
7716                         if (file->f_op == &io_uring_fops) {
7717                                 fput(file);
7718                                 err = -EBADF;
7719                                 break;
7720                         }
7721                         *file_slot = file;
7722                         err = io_sqe_file_register(ctx, file, i);
7723                         if (err) {
7724                                 *file_slot = NULL;
7725                                 fput(file);
7726                                 break;
7727                         }
7728                 }
7729         }
7730
7731         if (needs_switch) {
7732                 percpu_ref_kill(&data->node->refs);
7733                 io_sqe_rsrc_set_node(ctx, data, ref_node);
7734         } else
7735                 destroy_fixed_rsrc_ref_node(ref_node);
7736
7737         return done ? done : err;
7738 }
7739
7740 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
7741                                unsigned nr_args)
7742 {
7743         struct io_uring_rsrc_update up;
7744
7745         if (!ctx->file_data)
7746                 return -ENXIO;
7747         if (!nr_args)
7748                 return -EINVAL;
7749         if (copy_from_user(&up, arg, sizeof(up)))
7750                 return -EFAULT;
7751         if (up.resv)
7752                 return -EINVAL;
7753
7754         return __io_sqe_files_update(ctx, &up, nr_args);
7755 }
7756
7757 static struct io_wq_work *io_free_work(struct io_wq_work *work)
7758 {
7759         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7760
7761         req = io_put_req_find_next(req);
7762         return req ? &req->work : NULL;
7763 }
7764
7765 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
7766 {
7767         struct io_wq_data data;
7768         unsigned int concurrency;
7769
7770         data.free_work = io_free_work;
7771         data.do_work = io_wq_submit_work;
7772
7773         /* Do QD, or 4 * CPUS, whatever is smallest */
7774         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7775
7776         return io_wq_create(concurrency, &data);
7777 }
7778
7779 static int io_uring_alloc_task_context(struct task_struct *task,
7780                                        struct io_ring_ctx *ctx)
7781 {
7782         struct io_uring_task *tctx;
7783         int ret;
7784
7785         tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
7786         if (unlikely(!tctx))
7787                 return -ENOMEM;
7788
7789         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
7790         if (unlikely(ret)) {
7791                 kfree(tctx);
7792                 return ret;
7793         }
7794
7795         tctx->io_wq = io_init_wq_offload(ctx);
7796         if (IS_ERR(tctx->io_wq)) {
7797                 ret = PTR_ERR(tctx->io_wq);
7798                 percpu_counter_destroy(&tctx->inflight);
7799                 kfree(tctx);
7800                 return ret;
7801         }
7802
7803         xa_init(&tctx->xa);
7804         init_waitqueue_head(&tctx->wait);
7805         tctx->last = NULL;
7806         atomic_set(&tctx->in_idle, 0);
7807         tctx->sqpoll = false;
7808         task->io_uring = tctx;
7809         spin_lock_init(&tctx->task_lock);
7810         INIT_WQ_LIST(&tctx->task_list);
7811         tctx->task_state = 0;
7812         init_task_work(&tctx->task_work, tctx_task_work);
7813         return 0;
7814 }
7815
7816 void __io_uring_free(struct task_struct *tsk)
7817 {
7818         struct io_uring_task *tctx = tsk->io_uring;
7819
7820         WARN_ON_ONCE(!xa_empty(&tctx->xa));
7821         percpu_counter_destroy(&tctx->inflight);
7822         kfree(tctx);
7823         tsk->io_uring = NULL;
7824 }
7825
7826 static int io_sq_offload_create(struct io_ring_ctx *ctx,
7827                                 struct io_uring_params *p)
7828 {
7829         int ret;
7830
7831         /* Retain compatibility with failing for an invalid attach attempt */
7832         if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
7833                                 IORING_SETUP_ATTACH_WQ) {
7834                 struct fd f;
7835
7836                 f = fdget(p->wq_fd);
7837                 if (!f.file)
7838                         return -ENXIO;
7839                 if (f.file->f_op != &io_uring_fops) {
7840                         fdput(f);
7841                         return -EINVAL;
7842                 }
7843                 fdput(f);
7844         }
7845         if (ctx->flags & IORING_SETUP_SQPOLL) {
7846                 struct io_sq_data *sqd;
7847
7848                 ret = -EPERM;
7849                 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
7850                         goto err;
7851
7852                 sqd = io_get_sq_data(p);
7853                 if (IS_ERR(sqd)) {
7854                         ret = PTR_ERR(sqd);
7855                         goto err;
7856                 }
7857
7858                 ctx->sq_data = sqd;
7859                 io_sq_thread_park(sqd);
7860                 mutex_lock(&sqd->ctx_lock);
7861                 list_add(&ctx->sqd_list, &sqd->ctx_new_list);
7862                 mutex_unlock(&sqd->ctx_lock);
7863                 io_sq_thread_unpark(sqd);
7864
7865                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
7866                 if (!ctx->sq_thread_idle)
7867                         ctx->sq_thread_idle = HZ;
7868
7869                 if (sqd->thread)
7870                         return 0;
7871
7872                 if (p->flags & IORING_SETUP_SQ_AFF) {
7873                         int cpu = p->sq_thread_cpu;
7874
7875                         ret = -EINVAL;
7876                         if (cpu >= nr_cpu_ids)
7877                                 goto err;
7878                         if (!cpu_online(cpu))
7879                                 goto err;
7880
7881                         sqd->sq_cpu = cpu;
7882                 } else {
7883                         sqd->sq_cpu = -1;
7884                 }
7885
7886                 sqd->task_pid = current->pid;
7887                 current->flags |= PF_IO_WORKER;
7888                 ret = io_wq_fork_thread(io_sq_thread, sqd);
7889                 current->flags &= ~PF_IO_WORKER;
7890                 if (ret < 0) {
7891                         sqd->thread = NULL;
7892                         goto err;
7893                 }
7894                 wait_for_completion(&sqd->completion);
7895                 ret = io_uring_alloc_task_context(sqd->thread, ctx);
7896                 if (ret)
7897                         goto err;
7898         } else if (p->flags & IORING_SETUP_SQ_AFF) {
7899                 /* Can't have SQ_AFF without SQPOLL */
7900                 ret = -EINVAL;
7901                 goto err;
7902         }
7903
7904         return 0;
7905 err:
7906         io_sq_thread_finish(ctx);
7907         return ret;
7908 }
7909
7910 static void io_sq_offload_start(struct io_ring_ctx *ctx)
7911 {
7912         struct io_sq_data *sqd = ctx->sq_data;
7913
7914         if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
7915                 complete(&sqd->startup);
7916 }
7917
7918 static inline void __io_unaccount_mem(struct user_struct *user,
7919                                       unsigned long nr_pages)
7920 {
7921         atomic_long_sub(nr_pages, &user->locked_vm);
7922 }
7923
7924 static inline int __io_account_mem(struct user_struct *user,
7925                                    unsigned long nr_pages)
7926 {
7927         unsigned long page_limit, cur_pages, new_pages;
7928
7929         /* Don't allow more pages than we can safely lock */
7930         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7931
7932         do {
7933                 cur_pages = atomic_long_read(&user->locked_vm);
7934                 new_pages = cur_pages + nr_pages;
7935                 if (new_pages > page_limit)
7936                         return -ENOMEM;
7937         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7938                                         new_pages) != cur_pages);
7939
7940         return 0;
7941 }
7942
7943 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
7944 {
7945         if (ctx->limit_mem)
7946                 __io_unaccount_mem(ctx->user, nr_pages);
7947
7948         if (ctx->mm_account)
7949                 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
7950 }
7951
7952 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
7953 {
7954         int ret;
7955
7956         if (ctx->limit_mem) {
7957                 ret = __io_account_mem(ctx->user, nr_pages);
7958                 if (ret)
7959                         return ret;
7960         }
7961
7962         if (ctx->mm_account)
7963                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
7964
7965         return 0;
7966 }
7967
/*
 * Drop a reference on the head page backing @ptr and free the compound
 * page once the count reaches zero. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
        if (ptr) {
                struct page *head = virt_to_head_page(ptr);

                if (put_page_testzero(head))
                        free_compound_page(head);
        }
}
7979
7980 static void *io_mem_alloc(size_t size)
7981 {
7982         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
7983                                 __GFP_NORETRY | __GFP_ACCOUNT;
7984
7985         return (void *) __get_free_pages(gfp_flags, get_order(size));
7986 }
7987
7988 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
7989                                 size_t *sq_offset)
7990 {
7991         struct io_rings *rings;
7992         size_t off, sq_array_size;
7993
7994         off = struct_size(rings, cqes, cq_entries);
7995         if (off == SIZE_MAX)
7996                 return SIZE_MAX;
7997
7998 #ifdef CONFIG_SMP
7999         off = ALIGN(off, SMP_CACHE_BYTES);
8000         if (off == 0)
8001                 return SIZE_MAX;
8002 #endif
8003
8004         if (sq_offset)
8005                 *sq_offset = off;
8006
8007         sq_array_size = array_size(sizeof(u32), sq_entries);
8008         if (sq_array_size == SIZE_MAX)
8009                 return SIZE_MAX;
8010
8011         if (check_add_overflow(off, sq_array_size, &off))
8012                 return SIZE_MAX;
8013
8014         return off;
8015 }
8016
8017 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8018 {
8019         int i, j;
8020
8021         if (!ctx->user_bufs)
8022                 return -ENXIO;
8023
8024         for (i = 0; i < ctx->nr_user_bufs; i++) {
8025                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8026
8027                 for (j = 0; j < imu->nr_bvecs; j++)
8028                         unpin_user_page(imu->bvec[j].bv_page);
8029
8030                 if (imu->acct_pages)
8031                         io_unaccount_mem(ctx, imu->acct_pages);
8032                 kvfree(imu->bvec);
8033                 imu->nr_bvecs = 0;
8034         }
8035
8036         kfree(ctx->user_bufs);
8037         ctx->user_bufs = NULL;
8038         ctx->nr_user_bufs = 0;
8039         return 0;
8040 }
8041
8042 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8043                        void __user *arg, unsigned index)
8044 {
8045         struct iovec __user *src;
8046
8047 #ifdef CONFIG_COMPAT
8048         if (ctx->compat) {
8049                 struct compat_iovec __user *ciovs;
8050                 struct compat_iovec ciov;
8051
8052                 ciovs = (struct compat_iovec __user *) arg;
8053                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8054                         return -EFAULT;
8055
8056                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8057                 dst->iov_len = ciov.iov_len;
8058                 return 0;
8059         }
8060 #endif
8061         src = (struct iovec __user *) arg;
8062         if (copy_from_user(dst, &src[index], sizeof(*dst)))
8063                 return -EFAULT;
8064         return 0;
8065 }
8066
8067 /*
8068  * Not super efficient, but this is just a registration time. And we do cache
8069  * the last compound head, so generally we'll only do a full search if we don't
8070  * match that one.
8071  *
8072  * We check if the given compound head page has already been accounted, to
8073  * avoid double accounting it. This allows us to account the full size of the
8074  * page, not just the constituent pages of a huge page.
8075  */
8076 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8077                                   int nr_pages, struct page *hpage)
8078 {
8079         int i, j;
8080
8081         /* check current page array */
8082         for (i = 0; i < nr_pages; i++) {
8083                 if (!PageCompound(pages[i]))
8084                         continue;
8085                 if (compound_head(pages[i]) == hpage)
8086                         return true;
8087         }
8088
8089         /* check previously registered pages */
8090         for (i = 0; i < ctx->nr_user_bufs; i++) {
8091                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8092
8093                 for (j = 0; j < imu->nr_bvecs; j++) {
8094                         if (!PageCompound(imu->bvec[j].bv_page))
8095                                 continue;
8096                         if (compound_head(imu->bvec[j].bv_page) == hpage)
8097                                 return true;
8098                 }
8099         }
8100
8101         return false;
8102 }
8103
8104 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8105                                  int nr_pages, struct io_mapped_ubuf *imu,
8106                                  struct page **last_hpage)
8107 {
8108         int i, ret;
8109
8110         for (i = 0; i < nr_pages; i++) {
8111                 if (!PageCompound(pages[i])) {
8112                         imu->acct_pages++;
8113                 } else {
8114                         struct page *hpage;
8115
8116                         hpage = compound_head(pages[i]);
8117                         if (hpage == *last_hpage)
8118                                 continue;
8119                         *last_hpage = hpage;
8120                         if (headpage_already_acct(ctx, pages, i, hpage))
8121                                 continue;
8122                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8123                 }
8124         }
8125
8126         if (!imu->acct_pages)
8127                 return 0;
8128
8129         ret = io_account_mem(ctx, imu->acct_pages);
8130         if (ret)
8131                 imu->acct_pages = 0;
8132         return ret;
8133 }
8134
8135 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8136                                   struct io_mapped_ubuf *imu,
8137                                   struct page **last_hpage)
8138 {
8139         struct vm_area_struct **vmas = NULL;
8140         struct page **pages = NULL;
8141         unsigned long off, start, end, ubuf;
8142         size_t size;
8143         int ret, pret, nr_pages, i;
8144
8145         ubuf = (unsigned long) iov->iov_base;
8146         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8147         start = ubuf >> PAGE_SHIFT;
8148         nr_pages = end - start;
8149
8150         ret = -ENOMEM;
8151
8152         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8153         if (!pages)
8154                 goto done;
8155
8156         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8157                               GFP_KERNEL);
8158         if (!vmas)
8159                 goto done;
8160
8161         imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
8162                                    GFP_KERNEL);
8163         if (!imu->bvec)
8164                 goto done;
8165
8166         ret = 0;
8167         mmap_read_lock(current->mm);
8168         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8169                               pages, vmas);
8170         if (pret == nr_pages) {
8171                 /* don't support file backed memory */
8172                 for (i = 0; i < nr_pages; i++) {
8173                         struct vm_area_struct *vma = vmas[i];
8174
8175                         if (vma->vm_file &&
8176                             !is_file_hugepages(vma->vm_file)) {
8177                                 ret = -EOPNOTSUPP;
8178                                 break;
8179                         }
8180                 }
8181         } else {
8182                 ret = pret < 0 ? pret : -EFAULT;
8183         }
8184         mmap_read_unlock(current->mm);
8185         if (ret) {
8186                 /*
8187                  * if we did partial map, or found file backed vmas,
8188                  * release any pages we did get
8189                  */
8190                 if (pret > 0)
8191                         unpin_user_pages(pages, pret);
8192                 kvfree(imu->bvec);
8193                 goto done;
8194         }
8195
8196         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8197         if (ret) {
8198                 unpin_user_pages(pages, pret);
8199                 kvfree(imu->bvec);
8200                 goto done;
8201         }
8202
8203         off = ubuf & ~PAGE_MASK;
8204         size = iov->iov_len;
8205         for (i = 0; i < nr_pages; i++) {
8206                 size_t vec_len;
8207
8208                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8209                 imu->bvec[i].bv_page = pages[i];
8210                 imu->bvec[i].bv_len = vec_len;
8211                 imu->bvec[i].bv_offset = off;
8212                 off = 0;
8213                 size -= vec_len;
8214         }
8215         /* store original address for later verification */
8216         imu->ubuf = ubuf;
8217         imu->len = iov->iov_len;
8218         imu->nr_bvecs = nr_pages;
8219         ret = 0;
8220 done:
8221         kvfree(pages);
8222         kvfree(vmas);
8223         return ret;
8224 }
8225
8226 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
8227 {
8228         if (ctx->user_bufs)
8229                 return -EBUSY;
8230         if (!nr_args || nr_args > UIO_MAXIOV)
8231                 return -EINVAL;
8232
8233         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
8234                                         GFP_KERNEL);
8235         if (!ctx->user_bufs)
8236                 return -ENOMEM;
8237
8238         return 0;
8239 }
8240
8241 static int io_buffer_validate(struct iovec *iov)
8242 {
8243         /*
8244          * Don't impose further limits on the size and buffer
8245          * constraints here, we'll -EINVAL later when IO is
8246          * submitted if they are wrong.
8247          */
8248         if (!iov->iov_base || !iov->iov_len)
8249                 return -EFAULT;
8250
8251         /* arbitrary limit, but we need something */
8252         if (iov->iov_len > SZ_1G)
8253                 return -EFAULT;
8254
8255         return 0;
8256 }
8257
8258 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
8259                                    unsigned int nr_args)
8260 {
8261         int i, ret;
8262         struct iovec iov;
8263         struct page *last_hpage = NULL;
8264
8265         ret = io_buffers_map_alloc(ctx, nr_args);
8266         if (ret)
8267                 return ret;
8268
8269         for (i = 0; i < nr_args; i++) {
8270                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8271
8272                 ret = io_copy_iov(ctx, &iov, arg, i);
8273                 if (ret)
8274                         break;
8275
8276                 ret = io_buffer_validate(&iov);
8277                 if (ret)
8278                         break;
8279
8280                 ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
8281                 if (ret)
8282                         break;
8283
8284                 ctx->nr_user_bufs++;
8285         }
8286
8287         if (ret)
8288                 io_sqe_buffers_unregister(ctx);
8289
8290         return ret;
8291 }
8292
8293 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8294 {
8295         __s32 __user *fds = arg;
8296         int fd;
8297
8298         if (ctx->cq_ev_fd)
8299                 return -EBUSY;
8300
8301         if (copy_from_user(&fd, fds, sizeof(*fds)))
8302                 return -EFAULT;
8303
8304         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8305         if (IS_ERR(ctx->cq_ev_fd)) {
8306                 int ret = PTR_ERR(ctx->cq_ev_fd);
8307                 ctx->cq_ev_fd = NULL;
8308                 return ret;
8309         }
8310
8311         return 0;
8312 }
8313
8314 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8315 {
8316         if (ctx->cq_ev_fd) {
8317                 eventfd_ctx_put(ctx->cq_ev_fd);
8318                 ctx->cq_ev_fd = NULL;
8319                 return 0;
8320         }
8321
8322         return -ENXIO;
8323 }
8324
/*
 * idr_for_each() callback: @p is the io_buffer stored under @id and @data
 * the owning ring. Removes up to -1U buffers via __io_remove_buffers() and
 * always returns 0.
 */
static int __io_destroy_buffers(int id, void *p, void *data)
{
        struct io_buffer *head = p;
        struct io_ring_ctx *ring = data;

        __io_remove_buffers(ring, head, id, -1U);
        return 0;
}
8333
8334 static void io_destroy_buffers(struct io_ring_ctx *ctx)
8335 {
8336         idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
8337         idr_destroy(&ctx->io_buffer_idr);
8338 }
8339
8340 static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
8341 {
8342         struct io_kiocb *req, *nxt;
8343
8344         list_for_each_entry_safe(req, nxt, list, compl.list) {
8345                 if (tsk && req->task != tsk)
8346                         continue;
8347                 list_del(&req->compl.list);
8348                 kmem_cache_free(req_cachep, req);
8349         }
8350 }
8351
8352 static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
8353 {
8354         struct io_submit_state *submit_state = &ctx->submit_state;
8355
8356         mutex_lock(&ctx->uring_lock);
8357
8358         if (submit_state->free_reqs)
8359                 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
8360                                      submit_state->reqs);
8361
8362         io_req_cache_free(&submit_state->comp.free_list, NULL);
8363
8364         spin_lock_irq(&ctx->completion_lock);
8365         io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
8366         spin_unlock_irq(&ctx->completion_lock);
8367
8368         mutex_unlock(&ctx->uring_lock);
8369 }
8370
8371 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8372 {
8373         /*
8374          * Some may use context even when all refs and requests have been put,
8375          * and they are free to do so while still holding uring_lock, see
8376          * __io_req_task_submit(). Wait for them to finish.
8377          */
8378         mutex_lock(&ctx->uring_lock);
8379         mutex_unlock(&ctx->uring_lock);
8380
8381         io_sq_thread_finish(ctx);
8382         io_sqe_buffers_unregister(ctx);
8383
8384         if (ctx->mm_account) {
8385                 mmdrop(ctx->mm_account);
8386                 ctx->mm_account = NULL;
8387         }
8388
8389         mutex_lock(&ctx->uring_lock);
8390         io_sqe_files_unregister(ctx);
8391         mutex_unlock(&ctx->uring_lock);
8392         io_eventfd_unregister(ctx);
8393         io_destroy_buffers(ctx);
8394         idr_destroy(&ctx->personality_idr);
8395
8396 #if defined(CONFIG_UNIX)
8397         if (ctx->ring_sock) {
8398                 ctx->ring_sock->file = NULL; /* so that iput() is called */
8399                 sock_release(ctx->ring_sock);
8400         }
8401 #endif
8402
8403         io_mem_free(ctx->rings);
8404         io_mem_free(ctx->sq_sqes);
8405
8406         percpu_ref_exit(&ctx->refs);
8407         free_uid(ctx->user);
8408         io_req_caches_free(ctx, NULL);
8409         kfree(ctx->cancel_hash);
8410         kfree(ctx);
8411 }
8412
8413 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8414 {
8415         struct io_ring_ctx *ctx = file->private_data;
8416         __poll_t mask = 0;
8417
8418         poll_wait(file, &ctx->cq_wait, wait);
8419         /*
8420          * synchronizes with barrier from wq_has_sleeper call in
8421          * io_commit_cqring
8422          */
8423         smp_rmb();
8424         if (!io_sqring_full(ctx))
8425                 mask |= EPOLLOUT | EPOLLWRNORM;
8426
8427         /*
8428          * Don't flush cqring overflow list here, just do a simple check.
8429          * Otherwise there could possible be ABBA deadlock:
8430          *      CPU0                    CPU1
8431          *      ----                    ----
8432          * lock(&ctx->uring_lock);
8433          *                              lock(&ep->mtx);
8434          *                              lock(&ctx->uring_lock);
8435          * lock(&ep->mtx);
8436          *
8437          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
8438          * pushs them to do the flush.
8439          */
8440         if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
8441                 mask |= EPOLLIN | EPOLLRDNORM;
8442
8443         return mask;
8444 }
8445
8446 static int io_uring_fasync(int fd, struct file *file, int on)
8447 {
8448         struct io_ring_ctx *ctx = file->private_data;
8449
8450         return fasync_helper(fd, file, on, &ctx->cq_fasync);
8451 }
8452
8453 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8454 {
8455         const struct cred *creds;
8456
8457         creds = idr_remove(&ctx->personality_idr, id);
8458         if (creds) {
8459                 put_cred(creds);
8460                 return 0;
8461         }
8462
8463         return -EINVAL;
8464 }
8465
/*
 * idr_for_each() callback: unregister personality @id of the ring passed
 * in @data. The result of the unregister call is ignored and 0 is always
 * returned; @p is unused.
 */
static int io_remove_personalities(int id, void *p, void *data)
{
        struct io_ring_ctx *ring = data;

        (void) io_unregister_personality(ring, id);
        return 0;
}
8473
8474 static void io_run_ctx_fallback(struct io_ring_ctx *ctx)
8475 {
8476         struct callback_head *work, *head, *next;
8477
8478         do {
8479                 do {
8480                         head = NULL;
8481                         work = READ_ONCE(ctx->exit_task_work);
8482                 } while (cmpxchg(&ctx->exit_task_work, work, head) != work);
8483
8484                 if (!work)
8485                         break;
8486
8487                 do {
8488                         next = work->next;
8489                         work->func(work);
8490                         work = next;
8491                         cond_resched();
8492                 } while (work);
8493         } while (1);
8494 }
8495
8496 static void io_ring_exit_work(struct work_struct *work)
8497 {
8498         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
8499                                                exit_work);
8500
8501         /*
8502          * If we're doing polled IO and end up having requests being
8503          * submitted async (out-of-line), then completions can come in while
8504          * we're waiting for refs to drop. We need to reap these manually,
8505          * as nobody else will be looking for them.
8506          */
8507         do {
8508                 io_uring_try_cancel_requests(ctx, NULL, NULL);
8509                 io_run_ctx_fallback(ctx);
8510         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
8511         io_ring_ctx_free(ctx);
8512 }
8513
8514 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8515 {
8516         mutex_lock(&ctx->uring_lock);
8517         percpu_ref_kill(&ctx->refs);
8518
8519         if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead))
8520                 ctx->sqo_dead = 1;
8521
8522         /* if force is set, the ring is going away. always drop after that */
8523         ctx->cq_overflow_flushed = 1;
8524         if (ctx->rings)
8525                 __io_cqring_overflow_flush(ctx, true, NULL, NULL);
8526         idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
8527         mutex_unlock(&ctx->uring_lock);
8528
8529         io_kill_timeouts(ctx, NULL, NULL);
8530         io_poll_remove_all(ctx, NULL, NULL);
8531
8532         /* if we failed setting up the ctx, we might not have any rings */
8533         io_iopoll_try_reap_events(ctx);
8534
8535         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
8536         /*
8537          * Use system_unbound_wq to avoid spawning tons of event kworkers
8538          * if we're exiting a ton of rings at the same time. It just adds
8539          * noise and overhead, there's no discernable change in runtime
8540          * over using system_wq.
8541          */
8542         queue_work(system_unbound_wq, &ctx->exit_work);
8543 }
8544
8545 static int io_uring_release(struct inode *inode, struct file *file)
8546 {
8547         struct io_ring_ctx *ctx = file->private_data;
8548
8549         file->private_data = NULL;
8550         io_ring_ctx_wait_and_kill(ctx);
8551         return 0;
8552 }
8553
/*
 * Match criteria passed to io_cancel_task_cb() through its opaque data
 * pointer; both members are forwarded unchanged to io_match_task().
 */
struct io_task_cancel {
	/* task argument for io_match_task() */
	struct task_struct	*task;
	/* files argument for io_match_task(); may be NULL */
	struct files_struct	*files;
};
8558
8559 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
8560 {
8561         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8562         struct io_task_cancel *cancel = data;
8563         bool ret;
8564
8565         if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) {
8566                 unsigned long flags;
8567                 struct io_ring_ctx *ctx = req->ctx;
8568
8569                 /* protect against races with linked timeouts */
8570                 spin_lock_irqsave(&ctx->completion_lock, flags);
8571                 ret = io_match_task(req, cancel->task, cancel->files);
8572                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8573         } else {
8574                 ret = io_match_task(req, cancel->task, cancel->files);
8575         }
8576         return ret;
8577 }
8578
8579 static void io_cancel_defer_files(struct io_ring_ctx *ctx,
8580                                   struct task_struct *task,
8581                                   struct files_struct *files)
8582 {
8583         struct io_defer_entry *de = NULL;
8584         LIST_HEAD(list);
8585
8586         spin_lock_irq(&ctx->completion_lock);
8587         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
8588                 if (io_match_task(de->req, task, files)) {
8589                         list_cut_position(&list, &ctx->defer_list, &de->list);
8590                         break;
8591                 }
8592         }
8593         spin_unlock_irq(&ctx->completion_lock);
8594
8595         while (!list_empty(&list)) {
8596                 de = list_first_entry(&list, struct io_defer_entry, list);
8597                 list_del_init(&de->list);
8598                 req_set_fail_links(de->req);
8599                 io_put_req(de->req);
8600                 io_req_complete(de->req, -ECANCELED);
8601                 kfree(de);
8602         }
8603 }
8604
8605 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
8606                                          struct task_struct *task,
8607                                          struct files_struct *files)
8608 {
8609         struct io_task_cancel cancel = { .task = task, .files = files, };
8610         struct io_uring_task *tctx = current->io_uring;
8611
8612         while (1) {
8613                 enum io_wq_cancel cret;
8614                 bool ret = false;
8615
8616                 if (tctx && tctx->io_wq) {
8617                         cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
8618                                                &cancel, true);
8619                         ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8620                 }
8621
8622                 /* SQPOLL thread does its own polling */
8623                 if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
8624                         while (!list_empty_careful(&ctx->iopoll_list)) {
8625                                 io_iopoll_try_reap_events(ctx);
8626                                 ret = true;
8627                         }
8628                 }
8629
8630                 ret |= io_poll_remove_all(ctx, task, files);
8631                 ret |= io_kill_timeouts(ctx, task, files);
8632                 ret |= io_run_task_work();
8633                 io_cqring_overflow_flush(ctx, true, task, files);
8634                 if (!ret)
8635                         break;
8636                 cond_resched();
8637         }
8638 }
8639
8640 static int io_uring_count_inflight(struct io_ring_ctx *ctx,
8641                                    struct task_struct *task,
8642                                    struct files_struct *files)
8643 {
8644         struct io_kiocb *req;
8645         int cnt = 0;
8646
8647         spin_lock_irq(&ctx->inflight_lock);
8648         list_for_each_entry(req, &ctx->inflight_list, inflight_entry)
8649                 cnt += io_match_task(req, task, files);
8650         spin_unlock_irq(&ctx->inflight_lock);
8651         return cnt;
8652 }
8653
8654 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
8655                                   struct task_struct *task,
8656                                   struct files_struct *files)
8657 {
8658         while (!list_empty_careful(&ctx->inflight_list)) {
8659                 DEFINE_WAIT(wait);
8660                 int inflight;
8661
8662                 inflight = io_uring_count_inflight(ctx, task, files);
8663                 if (!inflight)
8664                         break;
8665
8666                 io_uring_try_cancel_requests(ctx, task, files);
8667
8668                 if (ctx->sq_data)
8669                         io_sq_thread_unpark(ctx->sq_data);
8670                 prepare_to_wait(&task->io_uring->wait, &wait,
8671                                 TASK_UNINTERRUPTIBLE);
8672                 if (inflight == io_uring_count_inflight(ctx, task, files))
8673                         schedule();
8674                 finish_wait(&task->io_uring->wait, &wait);
8675                 if (ctx->sq_data)
8676                         io_sq_thread_park(ctx->sq_data);
8677         }
8678 }
8679
8680 static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
8681 {
8682         mutex_lock(&ctx->uring_lock);
8683         ctx->sqo_dead = 1;
8684         mutex_unlock(&ctx->uring_lock);
8685
8686         /* make sure callers enter the ring to get error */
8687         if (ctx->rings)
8688                 io_ring_set_wakeup_flag(ctx);
8689 }
8690
8691 /*
8692  * We need to iteratively cancel requests, in case a request has dependent
8693  * hard links. These persist even for failure of cancelations, hence keep
8694  * looping until none are found.
8695  */
8696 static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
8697                                           struct files_struct *files)
8698 {
8699         struct task_struct *task = current;
8700         bool did_park = false;
8701
8702         if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
8703                 io_disable_sqo_submit(ctx);
8704                 did_park = io_sq_thread_park(ctx->sq_data);
8705                 if (did_park) {
8706                         task = ctx->sq_data->thread;
8707                         atomic_inc(&task->io_uring->in_idle);
8708                 }
8709         }
8710
8711         io_cancel_defer_files(ctx, task, files);
8712
8713         io_uring_cancel_files(ctx, task, files);
8714         if (!files)
8715                 io_uring_try_cancel_requests(ctx, task, NULL);
8716
8717         if (did_park) {
8718                 atomic_dec(&task->io_uring->in_idle);
8719                 io_sq_thread_unpark(ctx->sq_data);
8720         }
8721 }
8722
8723 /*
8724  * Note that this task has used io_uring. We use it for cancelation purposes.
8725  */
8726 static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
8727 {
8728         struct io_uring_task *tctx = current->io_uring;
8729         int ret;
8730
8731         if (unlikely(!tctx)) {
8732                 ret = io_uring_alloc_task_context(current, ctx);
8733                 if (unlikely(ret))
8734                         return ret;
8735                 tctx = current->io_uring;
8736         }
8737         if (tctx->last != file) {
8738                 void *old = xa_load(&tctx->xa, (unsigned long)file);
8739
8740                 if (!old) {
8741                         get_file(file);
8742                         ret = xa_err(xa_store(&tctx->xa, (unsigned long)file,
8743                                                 file, GFP_KERNEL));
8744                         if (ret) {
8745                                 fput(file);
8746                                 return ret;
8747                         }
8748
8749                         /* one and only SQPOLL file note, held by sqo_task */
8750                         WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) &&
8751                                      current != ctx->sqo_task);
8752                 }
8753                 tctx->last = file;
8754         }
8755
8756         /*
8757          * This is race safe in that the task itself is doing this, hence it
8758          * cannot be going through the exit/cancel paths at the same time.
8759          * This cannot be modified while exit/cancel is running.
8760          */
8761         if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
8762                 tctx->sqpoll = true;
8763
8764         return 0;
8765 }
8766
8767 /*
8768  * Remove this io_uring_file -> task mapping.
8769  */
8770 static void io_uring_del_task_file(struct file *file)
8771 {
8772         struct io_uring_task *tctx = current->io_uring;
8773
8774         if (tctx->last == file)
8775                 tctx->last = NULL;
8776         file = xa_erase(&tctx->xa, (unsigned long)file);
8777         if (file)
8778                 fput(file);
8779 }
8780
8781 static void io_uring_remove_task_files(struct io_uring_task *tctx)
8782 {
8783         struct file *file;
8784         unsigned long index;
8785
8786         xa_for_each(&tctx->xa, index, file)
8787                 io_uring_del_task_file(file);
8788 }
8789
8790 void __io_uring_files_cancel(struct files_struct *files)
8791 {
8792         struct io_uring_task *tctx = current->io_uring;
8793         struct file *file;
8794         unsigned long index;
8795
8796         /* make sure overflow events are dropped */
8797         atomic_inc(&tctx->in_idle);
8798         xa_for_each(&tctx->xa, index, file)
8799                 io_uring_cancel_task_requests(file->private_data, files);
8800         atomic_dec(&tctx->in_idle);
8801
8802         if (files) {
8803                 io_uring_remove_task_files(tctx);
8804         } else if (tctx->io_wq && current->flags & PF_EXITING) {
8805                 io_wq_destroy(tctx->io_wq);
8806                 tctx->io_wq = NULL;
8807         }
8808 }
8809
8810 static s64 tctx_inflight(struct io_uring_task *tctx)
8811 {
8812         return percpu_counter_sum(&tctx->inflight);
8813 }
8814
8815 static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
8816 {
8817         struct io_sq_data *sqd = ctx->sq_data;
8818         struct io_uring_task *tctx;
8819         s64 inflight;
8820         DEFINE_WAIT(wait);
8821
8822         if (!sqd)
8823                 return;
8824         io_disable_sqo_submit(ctx);
8825         if (!io_sq_thread_park(sqd))
8826                 return;
8827         tctx = ctx->sq_data->thread->io_uring;
8828
8829         atomic_inc(&tctx->in_idle);
8830         do {
8831                 /* read completions before cancelations */
8832                 inflight = tctx_inflight(tctx);
8833                 if (!inflight)
8834                         break;
8835                 io_uring_cancel_task_requests(ctx, NULL);
8836
8837                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8838                 /*
8839                  * If we've seen completions, retry without waiting. This
8840                  * avoids a race where a completion comes in before we did
8841                  * prepare_to_wait().
8842                  */
8843                 if (inflight == tctx_inflight(tctx))
8844                         schedule();
8845                 finish_wait(&tctx->wait, &wait);
8846         } while (1);
8847         atomic_dec(&tctx->in_idle);
8848         io_sq_thread_unpark(sqd);
8849 }
8850
8851 /*
8852  * Find any io_uring fd that this task has registered or done IO on, and cancel
8853  * requests.
8854  */
8855 void __io_uring_task_cancel(void)
8856 {
8857         struct io_uring_task *tctx = current->io_uring;
8858         DEFINE_WAIT(wait);
8859         s64 inflight;
8860
8861         /* make sure overflow events are dropped */
8862         atomic_inc(&tctx->in_idle);
8863
8864         /* trigger io_disable_sqo_submit() */
8865         if (tctx->sqpoll) {
8866                 struct file *file;
8867                 unsigned long index;
8868
8869                 xa_for_each(&tctx->xa, index, file)
8870                         io_uring_cancel_sqpoll(file->private_data);
8871         }
8872
8873         do {
8874                 /* read completions before cancelations */
8875                 inflight = tctx_inflight(tctx);
8876                 if (!inflight)
8877                         break;
8878                 __io_uring_files_cancel(NULL);
8879
8880                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8881
8882                 /*
8883                  * If we've seen completions, retry without waiting. This
8884                  * avoids a race where a completion comes in before we did
8885                  * prepare_to_wait().
8886                  */
8887                 if (inflight == tctx_inflight(tctx))
8888                         schedule();
8889                 finish_wait(&tctx->wait, &wait);
8890         } while (1);
8891
8892         atomic_dec(&tctx->in_idle);
8893
8894         io_uring_remove_task_files(tctx);
8895 }
8896
8897 static int io_uring_flush(struct file *file, void *data)
8898 {
8899         struct io_uring_task *tctx = current->io_uring;
8900         struct io_ring_ctx *ctx = file->private_data;
8901
8902         /* Ignore helper thread files exit */
8903         if (current->flags & PF_IO_WORKER)
8904                 return 0;
8905
8906         if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
8907                 io_uring_cancel_task_requests(ctx, NULL);
8908                 io_req_caches_free(ctx, current);
8909         }
8910
8911         io_run_ctx_fallback(ctx);
8912
8913         if (!tctx)
8914                 return 0;
8915
8916         /* we should have cancelled and erased it before PF_EXITING */
8917         WARN_ON_ONCE((current->flags & PF_EXITING) &&
8918                      xa_load(&tctx->xa, (unsigned long)file));
8919
8920         /*
8921          * fput() is pending, will be 2 if the only other ref is our potential
8922          * task file note. If the task is exiting, drop regardless of count.
8923          */
8924         if (atomic_long_read(&file->f_count) != 2)
8925                 return 0;
8926
8927         if (ctx->flags & IORING_SETUP_SQPOLL) {
8928                 /* there is only one file note, which is owned by sqo_task */
8929                 WARN_ON_ONCE(ctx->sqo_task != current &&
8930                              xa_load(&tctx->xa, (unsigned long)file));
8931                 /* sqo_dead check is for when this happens after cancellation */
8932                 WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead &&
8933                              !xa_load(&tctx->xa, (unsigned long)file));
8934
8935                 io_disable_sqo_submit(ctx);
8936         }
8937
8938         if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current)
8939                 io_uring_del_task_file(file);
8940         return 0;
8941 }
8942
8943 static void *io_uring_validate_mmap_request(struct file *file,
8944                                             loff_t pgoff, size_t sz)
8945 {
8946         struct io_ring_ctx *ctx = file->private_data;
8947         loff_t offset = pgoff << PAGE_SHIFT;
8948         struct page *page;
8949         void *ptr;
8950
8951         switch (offset) {
8952         case IORING_OFF_SQ_RING:
8953         case IORING_OFF_CQ_RING:
8954                 ptr = ctx->rings;
8955                 break;
8956         case IORING_OFF_SQES:
8957                 ptr = ctx->sq_sqes;
8958                 break;
8959         default:
8960                 return ERR_PTR(-EINVAL);
8961         }
8962
8963         page = virt_to_head_page(ptr);
8964         if (sz > page_size(page))
8965                 return ERR_PTR(-EINVAL);
8966
8967         return ptr;
8968 }
8969
8970 #ifdef CONFIG_MMU
8971
8972 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8973 {
8974         size_t sz = vma->vm_end - vma->vm_start;
8975         unsigned long pfn;
8976         void *ptr;
8977
8978         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
8979         if (IS_ERR(ptr))
8980                 return PTR_ERR(ptr);
8981
8982         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
8983         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
8984 }
8985
8986 #else /* !CONFIG_MMU */
8987
8988 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8989 {
8990         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
8991 }
8992
8993 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
8994 {
8995         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
8996 }
8997
/*
 * ->get_unmapped_area() (!MMU): return the kernel address of the region
 * named by @pgoff, or the negative errno from validation cast to the return
 * type.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *region = io_uring_validate_mmap_request(file, pgoff, len);

	if (IS_ERR(region))
		return PTR_ERR(region);

	return (unsigned long) region;
}
9010
9011 #endif /* !CONFIG_MMU */
9012
9013 static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
9014 {
9015         int ret = 0;
9016         DEFINE_WAIT(wait);
9017
9018         do {
9019                 if (!io_sqring_full(ctx))
9020                         break;
9021
9022                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9023
9024                 if (unlikely(ctx->sqo_dead)) {
9025                         ret = -EOWNERDEAD;
9026                         goto out;
9027                 }
9028
9029                 if (!io_sqring_full(ctx))
9030                         break;
9031
9032                 schedule();
9033         } while (!signal_pending(current));
9034
9035         finish_wait(&ctx->sqo_sq_wait, &wait);
9036 out:
9037         return ret;
9038 }
9039
9040 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9041                           struct __kernel_timespec __user **ts,
9042                           const sigset_t __user **sig)
9043 {
9044         struct io_uring_getevents_arg arg;
9045
9046         /*
9047          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9048          * is just a pointer to the sigset_t.
9049          */
9050         if (!(flags & IORING_ENTER_EXT_ARG)) {
9051                 *sig = (const sigset_t __user *) argp;
9052                 *ts = NULL;
9053                 return 0;
9054         }
9055
9056         /*
9057          * EXT_ARG is set - ensure we agree on the size of it and copy in our
9058          * timespec and sigset_t pointers if good.
9059          */
9060         if (*argsz != sizeof(arg))
9061                 return -EINVAL;
9062         if (copy_from_user(&arg, argp, sizeof(arg)))
9063                 return -EFAULT;
9064         *sig = u64_to_user_ptr(arg.sigmask);
9065         *argsz = arg.sigmask_sz;
9066         *ts = u64_to_user_ptr(arg.ts);
9067         return 0;
9068 }
9069
9070 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
9071                 u32, min_complete, u32, flags, const void __user *, argp,
9072                 size_t, argsz)
9073 {
9074         struct io_ring_ctx *ctx;
9075         long ret = -EBADF;
9076         int submitted = 0;
9077         struct fd f;
9078
9079         io_run_task_work();
9080
9081         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
9082                         IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
9083                 return -EINVAL;
9084
9085         f = fdget(fd);
9086         if (!f.file)
9087                 return -EBADF;
9088
9089         ret = -EOPNOTSUPP;
9090         if (f.file->f_op != &io_uring_fops)
9091                 goto out_fput;
9092
9093         ret = -ENXIO;
9094         ctx = f.file->private_data;
9095         if (!percpu_ref_tryget(&ctx->refs))
9096                 goto out_fput;
9097
9098         ret = -EBADFD;
9099         if (ctx->flags & IORING_SETUP_R_DISABLED)
9100                 goto out;
9101
9102         /*
9103          * For SQ polling, the thread will do all submissions and completions.
9104          * Just return the requested submit count, and wake the thread if
9105          * we were asked to.
9106          */
9107         ret = 0;
9108         if (ctx->flags & IORING_SETUP_SQPOLL) {
9109                 io_cqring_overflow_flush(ctx, false, NULL, NULL);
9110
9111                 ret = -EOWNERDEAD;
9112                 if (unlikely(ctx->sqo_dead))
9113                         goto out;
9114                 if (flags & IORING_ENTER_SQ_WAKEUP)
9115                         wake_up(&ctx->sq_data->wait);
9116                 if (flags & IORING_ENTER_SQ_WAIT) {
9117                         ret = io_sqpoll_wait_sq(ctx);
9118                         if (ret)
9119                                 goto out;
9120                 }
9121                 submitted = to_submit;
9122         } else if (to_submit) {
9123                 ret = io_uring_add_task_file(ctx, f.file);
9124                 if (unlikely(ret))
9125                         goto out;
9126                 mutex_lock(&ctx->uring_lock);
9127                 submitted = io_submit_sqes(ctx, to_submit);
9128                 mutex_unlock(&ctx->uring_lock);
9129
9130                 if (submitted != to_submit)
9131                         goto out;
9132         }
9133         if (flags & IORING_ENTER_GETEVENTS) {
9134                 const sigset_t __user *sig;
9135                 struct __kernel_timespec __user *ts;
9136
9137                 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9138                 if (unlikely(ret))
9139                         goto out;
9140
9141                 min_complete = min(min_complete, ctx->cq_entries);
9142
9143                 /*
9144                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9145                  * space applications don't need to do io completion events
9146                  * polling again, they can rely on io_sq_thread to do polling
9147                  * work, which can reduce cpu usage and uring_lock contention.
9148                  */
9149                 if (ctx->flags & IORING_SETUP_IOPOLL &&
9150                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
9151                         ret = io_iopoll_check(ctx, min_complete);
9152                 } else {
9153                         ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
9154                 }
9155         }
9156
9157 out:
9158         percpu_ref_put(&ctx->refs);
9159 out_fput:
9160         fdput(f);
9161         return submitted ? submitted : ret;
9162 }
9163
9164 #ifdef CONFIG_PROC_FS
9165 static int io_uring_show_cred(int id, void *p, void *data)
9166 {
9167         const struct cred *cred = p;
9168         struct seq_file *m = data;
9169         struct user_namespace *uns = seq_user_ns(m);
9170         struct group_info *gi;
9171         kernel_cap_t cap;
9172         unsigned __capi;
9173         int g;
9174
9175         seq_printf(m, "%5d\n", id);
9176         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9177         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9178         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9179         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9180         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9181         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9182         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9183         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9184         seq_puts(m, "\n\tGroups:\t");
9185         gi = cred->group_info;
9186         for (g = 0; g < gi->ngroups; g++) {
9187                 seq_put_decimal_ull(m, g ? " " : "",
9188                                         from_kgid_munged(uns, gi->gid[g]));
9189         }
9190         seq_puts(m, "\n\tCapEff:\t");
9191         cap = cred->cap_effective;
9192         CAP_FOR_EACH_U32(__capi)
9193                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9194         seq_putc(m, '\n');
9195         return 0;
9196 }
9197
9198 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9199 {
9200         struct io_sq_data *sq = NULL;
9201         bool has_lock;
9202         int i;
9203
9204         /*
9205          * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9206          * since fdinfo case grabs it in the opposite direction of normal use
9207          * cases. If we fail to get the lock, we just don't iterate any
9208          * structures that could be going away outside the io_uring mutex.
9209          */
9210         has_lock = mutex_trylock(&ctx->uring_lock);
9211
9212         if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
9213                 sq = ctx->sq_data;
9214
9215         seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9216         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
9217         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
9218         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
9219                 struct file *f = *io_fixed_file_slot(ctx->file_data, i);
9220
9221                 if (f)
9222                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9223                 else
9224                         seq_printf(m, "%5u: <none>\n", i);
9225         }
9226         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
9227         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
9228                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
9229
9230                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
9231                                                 (unsigned int) buf->len);
9232         }
9233         if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
9234                 seq_printf(m, "Personalities:\n");
9235                 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
9236         }
9237         seq_printf(m, "PollList:\n");
9238         spin_lock_irq(&ctx->completion_lock);
9239         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9240                 struct hlist_head *list = &ctx->cancel_hash[i];
9241                 struct io_kiocb *req;
9242
9243                 hlist_for_each_entry(req, list, hash_node)
9244                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
9245                                         req->task->task_works != NULL);
9246         }
9247         spin_unlock_irq(&ctx->completion_lock);
9248         if (has_lock)
9249                 mutex_unlock(&ctx->uring_lock);
9250 }
9251
9252 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9253 {
9254         struct io_ring_ctx *ctx = f->private_data;
9255
9256         if (percpu_ref_tryget(&ctx->refs)) {
9257                 __io_uring_show_fdinfo(ctx, m);
9258                 percpu_ref_put(&ctx->refs);
9259         }
9260 }
9261 #endif
9262
9263 static const struct file_operations io_uring_fops = {
9264         .release        = io_uring_release,
9265         .flush          = io_uring_flush,
9266         .mmap           = io_uring_mmap,
9267 #ifndef CONFIG_MMU
9268         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9269         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9270 #endif
9271         .poll           = io_uring_poll,
9272         .fasync         = io_uring_fasync,
9273 #ifdef CONFIG_PROC_FS
9274         .show_fdinfo    = io_uring_show_fdinfo,
9275 #endif
9276 };
9277
9278 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9279                                   struct io_uring_params *p)
9280 {
9281         struct io_rings *rings;
9282         size_t size, sq_array_offset;
9283
9284         /* make sure these are sane, as we already accounted them */
9285         ctx->sq_entries = p->sq_entries;
9286         ctx->cq_entries = p->cq_entries;
9287
9288         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9289         if (size == SIZE_MAX)
9290                 return -EOVERFLOW;
9291
9292         rings = io_mem_alloc(size);
9293         if (!rings)
9294                 return -ENOMEM;
9295
9296         ctx->rings = rings;
9297         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9298         rings->sq_ring_mask = p->sq_entries - 1;
9299         rings->cq_ring_mask = p->cq_entries - 1;
9300         rings->sq_ring_entries = p->sq_entries;
9301         rings->cq_ring_entries = p->cq_entries;
9302         ctx->sq_mask = rings->sq_ring_mask;
9303         ctx->cq_mask = rings->cq_ring_mask;
9304
9305         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9306         if (size == SIZE_MAX) {
9307                 io_mem_free(ctx->rings);
9308                 ctx->rings = NULL;
9309                 return -EOVERFLOW;
9310         }
9311
9312         ctx->sq_sqes = io_mem_alloc(size);
9313         if (!ctx->sq_sqes) {
9314                 io_mem_free(ctx->rings);
9315                 ctx->rings = NULL;
9316                 return -ENOMEM;
9317         }
9318
9319         return 0;
9320 }
9321
9322 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9323 {
9324         int ret, fd;
9325
9326         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9327         if (fd < 0)
9328                 return fd;
9329
9330         ret = io_uring_add_task_file(ctx, file);
9331         if (ret) {
9332                 put_unused_fd(fd);
9333                 return ret;
9334         }
9335         fd_install(fd, file);
9336         return fd;
9337 }
9338
9339 /*
9340  * Allocate an anonymous fd, this is what constitutes the application
9341  * visible backing of an io_uring instance. The application mmaps this
9342  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9343  * we have to tie this fd to a socket for file garbage collection purposes.
9344  */
9345 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
9346 {
9347         struct file *file;
9348 #if defined(CONFIG_UNIX)
9349         int ret;
9350
9351         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9352                                 &ctx->ring_sock);
9353         if (ret)
9354                 return ERR_PTR(ret);
9355 #endif
9356
9357         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9358                                         O_RDWR | O_CLOEXEC);
9359 #if defined(CONFIG_UNIX)
9360         if (IS_ERR(file)) {
9361                 sock_release(ctx->ring_sock);
9362                 ctx->ring_sock = NULL;
9363         } else {
9364                 ctx->ring_sock->file = file;
9365         }
9366 #endif
9367         return file;
9368 }
9369
9370 static int io_uring_create(unsigned entries, struct io_uring_params *p,
9371                            struct io_uring_params __user *params)
9372 {
9373         struct user_struct *user = NULL;
9374         struct io_ring_ctx *ctx;
9375         struct file *file;
9376         int ret;
9377
9378         if (!entries)
9379                 return -EINVAL;
9380         if (entries > IORING_MAX_ENTRIES) {
9381                 if (!(p->flags & IORING_SETUP_CLAMP))
9382                         return -EINVAL;
9383                 entries = IORING_MAX_ENTRIES;
9384         }
9385
9386         /*
9387          * Use twice as many entries for the CQ ring. It's possible for the
9388          * application to drive a higher depth than the size of the SQ ring,
9389          * since the sqes are only used at submission time. This allows for
9390          * some flexibility in overcommitting a bit. If the application has
9391          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9392          * of CQ ring entries manually.
9393          */
9394         p->sq_entries = roundup_pow_of_two(entries);
9395         if (p->flags & IORING_SETUP_CQSIZE) {
9396                 /*
9397                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
9398                  * to a power-of-two, if it isn't already. We do NOT impose
9399                  * any cq vs sq ring sizing.
9400                  */
9401                 if (!p->cq_entries)
9402                         return -EINVAL;
9403                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9404                         if (!(p->flags & IORING_SETUP_CLAMP))
9405                                 return -EINVAL;
9406                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
9407                 }
9408                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
9409                 if (p->cq_entries < p->sq_entries)
9410                         return -EINVAL;
9411         } else {
9412                 p->cq_entries = 2 * p->sq_entries;
9413         }
9414
9415         user = get_uid(current_user());
9416
9417         ctx = io_ring_ctx_alloc(p);
9418         if (!ctx) {
9419                 free_uid(user);
9420                 return -ENOMEM;
9421         }
9422         ctx->compat = in_compat_syscall();
9423         ctx->limit_mem = !capable(CAP_IPC_LOCK);
9424         ctx->user = user;
9425         ctx->sqo_task = current;
9426
9427         /*
9428          * This is just grabbed for accounting purposes. When a process exits,
9429          * the mm is exited and dropped before the files, hence we need to hang
9430          * on to this mm purely for the purposes of being able to unaccount
9431          * memory (locked/pinned vm). It's not used for anything else.
9432          */
9433         mmgrab(current->mm);
9434         ctx->mm_account = current->mm;
9435
9436         ret = io_allocate_scq_urings(ctx, p);
9437         if (ret)
9438                 goto err;
9439
9440         ret = io_sq_offload_create(ctx, p);
9441         if (ret)
9442                 goto err;
9443
9444         if (!(p->flags & IORING_SETUP_R_DISABLED))
9445                 io_sq_offload_start(ctx);
9446
9447         memset(&p->sq_off, 0, sizeof(p->sq_off));
9448         p->sq_off.head = offsetof(struct io_rings, sq.head);
9449         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9450         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9451         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9452         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9453         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9454         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
9455
9456         memset(&p->cq_off, 0, sizeof(p->cq_off));
9457         p->cq_off.head = offsetof(struct io_rings, cq.head);
9458         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9459         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9460         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9461         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9462         p->cq_off.cqes = offsetof(struct io_rings, cqes);
9463         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
9464
9465         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9466                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
9467                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
9468                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9469                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS;
9470
9471         if (copy_to_user(params, p, sizeof(*p))) {
9472                 ret = -EFAULT;
9473                 goto err;
9474         }
9475
9476         file = io_uring_get_file(ctx);
9477         if (IS_ERR(file)) {
9478                 ret = PTR_ERR(file);
9479                 goto err;
9480         }
9481
9482         /*
9483          * Install ring fd as the very last thing, so we don't risk someone
9484          * having closed it before we finish setup
9485          */
9486         ret = io_uring_install_fd(ctx, file);
9487         if (ret < 0) {
9488                 io_disable_sqo_submit(ctx);
9489                 /* fput will clean it up */
9490                 fput(file);
9491                 return ret;
9492         }
9493
9494         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
9495         return ret;
9496 err:
9497         io_disable_sqo_submit(ctx);
9498         io_ring_ctx_wait_and_kill(ctx);
9499         return ret;
9500 }
9501
9502 /*
9503  * Sets up an aio uring context, and returns the fd. Applications asks for a
9504  * ring size, we return the actual sq/cq ring sizes (among other things) in the
9505  * params structure passed in.
9506  */
9507 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9508 {
9509         struct io_uring_params p;
9510         int i;
9511
9512         if (copy_from_user(&p, params, sizeof(p)))
9513                 return -EFAULT;
9514         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9515                 if (p.resv[i])
9516                         return -EINVAL;
9517         }
9518
9519         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
9520                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9521                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9522                         IORING_SETUP_R_DISABLED))
9523                 return -EINVAL;
9524
9525         return  io_uring_create(entries, &p, params);
9526 }
9527
9528 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9529                 struct io_uring_params __user *, params)
9530 {
9531         return io_uring_setup(entries, params);
9532 }
9533
9534 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9535 {
9536         struct io_uring_probe *p;
9537         size_t size;
9538         int i, ret;
9539
9540         size = struct_size(p, ops, nr_args);
9541         if (size == SIZE_MAX)
9542                 return -EOVERFLOW;
9543         p = kzalloc(size, GFP_KERNEL);
9544         if (!p)
9545                 return -ENOMEM;
9546
9547         ret = -EFAULT;
9548         if (copy_from_user(p, arg, size))
9549                 goto out;
9550         ret = -EINVAL;
9551         if (memchr_inv(p, 0, size))
9552                 goto out;
9553
9554         p->last_op = IORING_OP_LAST - 1;
9555         if (nr_args > IORING_OP_LAST)
9556                 nr_args = IORING_OP_LAST;
9557
9558         for (i = 0; i < nr_args; i++) {
9559                 p->ops[i].op = i;
9560                 if (!io_op_defs[i].not_supported)
9561                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
9562         }
9563         p->ops_len = i;
9564
9565         ret = 0;
9566         if (copy_to_user(arg, p, size))
9567                 ret = -EFAULT;
9568 out:
9569         kfree(p);
9570         return ret;
9571 }
9572
9573 static int io_register_personality(struct io_ring_ctx *ctx)
9574 {
9575         const struct cred *creds;
9576         int ret;
9577
9578         creds = get_current_cred();
9579
9580         ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
9581                                 USHRT_MAX, GFP_KERNEL);
9582         if (ret < 0)
9583                 put_cred(creds);
9584         return ret;
9585 }
9586
9587 static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9588                                     unsigned int nr_args)
9589 {
9590         struct io_uring_restriction *res;
9591         size_t size;
9592         int i, ret;
9593
9594         /* Restrictions allowed only if rings started disabled */
9595         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9596                 return -EBADFD;
9597
9598         /* We allow only a single restrictions registration */
9599         if (ctx->restrictions.registered)
9600                 return -EBUSY;
9601
9602         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9603                 return -EINVAL;
9604
9605         size = array_size(nr_args, sizeof(*res));
9606         if (size == SIZE_MAX)
9607                 return -EOVERFLOW;
9608
9609         res = memdup_user(arg, size);
9610         if (IS_ERR(res))
9611                 return PTR_ERR(res);
9612
9613         ret = 0;
9614
9615         for (i = 0; i < nr_args; i++) {
9616                 switch (res[i].opcode) {
9617                 case IORING_RESTRICTION_REGISTER_OP:
9618                         if (res[i].register_op >= IORING_REGISTER_LAST) {
9619                                 ret = -EINVAL;
9620                                 goto out;
9621                         }
9622
9623                         __set_bit(res[i].register_op,
9624                                   ctx->restrictions.register_op);
9625                         break;
9626                 case IORING_RESTRICTION_SQE_OP:
9627                         if (res[i].sqe_op >= IORING_OP_LAST) {
9628                                 ret = -EINVAL;
9629                                 goto out;
9630                         }
9631
9632                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
9633                         break;
9634                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
9635                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
9636                         break;
9637                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
9638                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
9639                         break;
9640                 default:
9641                         ret = -EINVAL;
9642                         goto out;
9643                 }
9644         }
9645
9646 out:
9647         /* Reset all restrictions if an error happened */
9648         if (ret != 0)
9649                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
9650         else
9651                 ctx->restrictions.registered = true;
9652
9653         kfree(res);
9654         return ret;
9655 }
9656
9657 static int io_register_enable_rings(struct io_ring_ctx *ctx)
9658 {
9659         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9660                 return -EBADFD;
9661
9662         if (ctx->restrictions.registered)
9663                 ctx->restricted = 1;
9664
9665         ctx->flags &= ~IORING_SETUP_R_DISABLED;
9666
9667         io_sq_offload_start(ctx);
9668
9669         return 0;
9670 }
9671
9672 static bool io_register_op_must_quiesce(int op)
9673 {
9674         switch (op) {
9675         case IORING_UNREGISTER_FILES:
9676         case IORING_REGISTER_FILES_UPDATE:
9677         case IORING_REGISTER_PROBE:
9678         case IORING_REGISTER_PERSONALITY:
9679         case IORING_UNREGISTER_PERSONALITY:
9680                 return false;
9681         default:
9682                 return true;
9683         }
9684 }
9685
9686 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
9687                                void __user *arg, unsigned nr_args)
9688         __releases(ctx->uring_lock)
9689         __acquires(ctx->uring_lock)
9690 {
9691         int ret;
9692
9693         /*
9694          * We're inside the ring mutex, if the ref is already dying, then
9695          * someone else killed the ctx or is already going through
9696          * io_uring_register().
9697          */
9698         if (percpu_ref_is_dying(&ctx->refs))
9699                 return -ENXIO;
9700
9701         if (io_register_op_must_quiesce(opcode)) {
9702                 percpu_ref_kill(&ctx->refs);
9703
9704                 /*
9705                  * Drop uring mutex before waiting for references to exit. If
9706                  * another thread is currently inside io_uring_enter() it might
9707                  * need to grab the uring_lock to make progress. If we hold it
9708                  * here across the drain wait, then we can deadlock. It's safe
9709                  * to drop the mutex here, since no new references will come in
9710                  * after we've killed the percpu ref.
9711                  */
9712                 mutex_unlock(&ctx->uring_lock);
9713                 do {
9714                         ret = wait_for_completion_interruptible(&ctx->ref_comp);
9715                         if (!ret)
9716                                 break;
9717                         ret = io_run_task_work_sig();
9718                         if (ret < 0)
9719                                 break;
9720                 } while (1);
9721
9722                 mutex_lock(&ctx->uring_lock);
9723
9724                 if (ret && io_refs_resurrect(&ctx->refs, &ctx->ref_comp))
9725                         return ret;
9726         }
9727
9728         if (ctx->restricted) {
9729                 if (opcode >= IORING_REGISTER_LAST) {
9730                         ret = -EINVAL;
9731                         goto out;
9732                 }
9733
9734                 if (!test_bit(opcode, ctx->restrictions.register_op)) {
9735                         ret = -EACCES;
9736                         goto out;
9737                 }
9738         }
9739
9740         switch (opcode) {
9741         case IORING_REGISTER_BUFFERS:
9742                 ret = io_sqe_buffers_register(ctx, arg, nr_args);
9743                 break;
9744         case IORING_UNREGISTER_BUFFERS:
9745                 ret = -EINVAL;
9746                 if (arg || nr_args)
9747                         break;
9748                 ret = io_sqe_buffers_unregister(ctx);
9749                 break;
9750         case IORING_REGISTER_FILES:
9751                 ret = io_sqe_files_register(ctx, arg, nr_args);
9752                 break;
9753         case IORING_UNREGISTER_FILES:
9754                 ret = -EINVAL;
9755                 if (arg || nr_args)
9756                         break;
9757                 ret = io_sqe_files_unregister(ctx);
9758                 break;
9759         case IORING_REGISTER_FILES_UPDATE:
9760                 ret = io_sqe_files_update(ctx, arg, nr_args);
9761                 break;
9762         case IORING_REGISTER_EVENTFD:
9763         case IORING_REGISTER_EVENTFD_ASYNC:
9764                 ret = -EINVAL;
9765                 if (nr_args != 1)
9766                         break;
9767                 ret = io_eventfd_register(ctx, arg);
9768                 if (ret)
9769                         break;
9770                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
9771                         ctx->eventfd_async = 1;
9772                 else
9773                         ctx->eventfd_async = 0;
9774                 break;
9775         case IORING_UNREGISTER_EVENTFD:
9776                 ret = -EINVAL;
9777                 if (arg || nr_args)
9778                         break;
9779                 ret = io_eventfd_unregister(ctx);
9780                 break;
9781         case IORING_REGISTER_PROBE:
9782                 ret = -EINVAL;
9783                 if (!arg || nr_args > 256)
9784                         break;
9785                 ret = io_probe(ctx, arg, nr_args);
9786                 break;
9787         case IORING_REGISTER_PERSONALITY:
9788                 ret = -EINVAL;
9789                 if (arg || nr_args)
9790                         break;
9791                 ret = io_register_personality(ctx);
9792                 break;
9793         case IORING_UNREGISTER_PERSONALITY:
9794                 ret = -EINVAL;
9795                 if (arg)
9796                         break;
9797                 ret = io_unregister_personality(ctx, nr_args);
9798                 break;
9799         case IORING_REGISTER_ENABLE_RINGS:
9800                 ret = -EINVAL;
9801                 if (arg || nr_args)
9802                         break;
9803                 ret = io_register_enable_rings(ctx);
9804                 break;
9805         case IORING_REGISTER_RESTRICTIONS:
9806                 ret = io_register_restrictions(ctx, arg, nr_args);
9807                 break;
9808         default:
9809                 ret = -EINVAL;
9810                 break;
9811         }
9812
9813 out:
9814         if (io_register_op_must_quiesce(opcode)) {
9815                 /* bring the ctx back to life */
9816                 percpu_ref_reinit(&ctx->refs);
9817                 reinit_completion(&ctx->ref_comp);
9818         }
9819         return ret;
9820 }
9821
9822 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
9823                 void __user *, arg, unsigned int, nr_args)
9824 {
9825         struct io_ring_ctx *ctx;
9826         long ret = -EBADF;
9827         struct fd f;
9828
9829         f = fdget(fd);
9830         if (!f.file)
9831                 return -EBADF;
9832
9833         ret = -EOPNOTSUPP;
9834         if (f.file->f_op != &io_uring_fops)
9835                 goto out_fput;
9836
9837         ctx = f.file->private_data;
9838
9839         io_run_task_work();
9840
9841         mutex_lock(&ctx->uring_lock);
9842         ret = __io_uring_register(ctx, opcode, arg, nr_args);
9843         mutex_unlock(&ctx->uring_lock);
9844         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
9845                                                         ctx->cq_ev_fd != NULL, ret);
9846 out_fput:
9847         fdput(f);
9848         return ret;
9849 }
9850
9851 static int __init io_uring_init(void)
9852 {
9853 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
9854         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
9855         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
9856 } while (0)
9857
9858 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
9859         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
9860         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
9861         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
9862         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
9863         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
9864         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
9865         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
9866         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
9867         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
9868         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
9869         BUILD_BUG_SQE_ELEM(24, __u32,  len);
9870         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
9871         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
9872         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
9873         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
9874         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
9875         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
9876         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
9877         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
9878         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
9879         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
9880         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
9881         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
9882         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
9883         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
9884         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
9885         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
9886         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
9887         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
9888         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
9889
9890         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
9891         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
9892         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
9893                                 SLAB_ACCOUNT);
9894         return 0;
9895 };
9896 __initcall(io_uring_init);