fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqe (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35  * from data shared between the kernel and application. This is done both
  36  * for ordering purposes and to ensure that once a value is loaded from
  37  * data that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/blkdev.h>
  61 #include <linux/bvec.h>
  62 #include <linux/net.h>
  63 #include <net/sock.h>
  64 #include <net/af_unix.h>
  65 #include <net/scm.h>
  66 #include <linux/anon_inodes.h>
  67 #include <linux/sched/mm.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/nospec.h>
  70 #include <linux/sizes.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/highmem.h>
  73 #include <linux/namei.h>
  74 #include <linux/fsnotify.h>
  75 #include <linux/fadvise.h>
  76 #include <linux/eventpoll.h>
  77 #include <linux/splice.h>
  78 #include <linux/task_work.h>
  79 #include <linux/pagemap.h>
  80 #include <linux/io_uring.h>
  81
  82 #define CREATE_TRACE_POINTS
  83 #include <trace/events/io_uring.h>
  84
  85 #include <uapi/linux/io_uring.h>
  86
  87 #include "internal.h"
  88 #include "io-wq.h"
  89
  90 #define IORING_MAX_ENTRIES      32768 /* 2^15; NOTE(review): presumably the cap on SQ ring entries -- confirm at use sites */
  91 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES) /* always twice IORING_MAX_ENTRIES, i.e. 65536 */
  92
  93 /*
  94  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  95  */
  96 #define IORING_FILE_TABLE_SHIFT 9
  97 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT) /* 512 */
  98 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1) /* 0x1ff, the low IORING_FILE_TABLE_SHIFT bits */
  99 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE) /* 64 * 512 = 32768 */
 100 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
 101                                  IORING_REGISTER_LAST + IORING_OP_LAST) /* sum of the three *_LAST counts */
 102
 103 #define IO_RSRC_TAG_TABLE_SHIFT 9 /* same shift value as IORING_FILE_TABLE_SHIFT */
 104 #define IO_RSRC_TAG_TABLE_MAX   (1U << IO_RSRC_TAG_TABLE_SHIFT) /* 512 */
 105 #define IO_RSRC_TAG_TABLE_MASK  (IO_RSRC_TAG_TABLE_MAX - 1) /* 0x1ff */
 106
 107 #define IORING_MAX_REG_BUFFERS  (1U << 14) /* 16384; NOTE(review): presumably the registered-buffer limit -- confirm */
 108
 109 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
 110                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
 111                                 IOSQE_BUFFER_SELECT) /* OR of the six IOSQE_* flags mirrored by the first six REQ_F_* bits */
 112
 113 struct io_uring { /* head/tail index pair for one ring; used for both sq and cq in struct io_rings */
 114         u32 head ____cacheline_aligned_in_smp; /* each index carries its own cacheline alignment attribute */
 115         u32 tail ____cacheline_aligned_in_smp;
 116 };
 117
 118 /*
 119  * This data is shared with the application through the mmap at offsets
 120  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 121  *
 122  * The offsets to the member fields are published through struct
 123  * io_sqring_offsets when calling io_uring_setup.
 124  */
 125 struct io_rings { /* ring indices, masks, sizes and counters, followed by the CQE array */
 126         /*
 127          * Head and tail offsets into the ring; the offsets need to be
 128          * masked to get valid indices.
 129          *
 130          * The kernel controls head of the sq ring and the tail of the cq ring,
 131          * and the application controls tail of the sq ring and the head of the
 132          * cq ring.
 133          */
 134         struct io_uring         sq, cq;
 135         /*
 136          * Bitmasks to apply to head and tail offsets (constant, equals
 137          * ring_entries - 1)
 138          */
 139         u32                     sq_ring_mask, cq_ring_mask;
 140         /* Ring sizes (constant, power of 2) */
 141         u32                     sq_ring_entries, cq_ring_entries;
 142         /*
 143          * Number of invalid entries dropped by the kernel due to
 144          * invalid index stored in array
 145          *
 146          * Written by the kernel, shouldn't be modified by the
 147          * application (i.e. get number of "new events" by comparing to
 148          * cached value).
 149          *
 150          * After a new SQ head value was read by the application this
 151          * counter includes all submissions that were dropped reaching
 152          * the new SQ head (and possibly more).
 153          */
 154         u32                     sq_dropped;
 155         /*
 156          * Runtime SQ flags
 157          *
 158          * Written by the kernel, shouldn't be modified by the
 159          * application.
 160          *
 161          * The application needs a full memory barrier before checking
 162          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 163          */
 164         u32                     sq_flags;
 165         /*
 166          * Runtime CQ flags
 167          *
 168          * Written by the application, shouldn't be modified by the
 169          * kernel.
 170          */
 171         u32                     cq_flags;
 172         /*
 173          * Number of completion events lost because the queue was full;
 174          * this should be avoided by the application by making sure
 175          * there are not more requests pending than there is space in
 176          * the completion queue.
 177          *
 178          * Written by the kernel, shouldn't be modified by the
 179          * application (i.e. get number of "new events" by comparing to
 180          * cached value).
 181          *
 182          * As completion events come in out of order this counter is not
 183          * ordered with any other data.
 184          */
 185         u32                     cq_overflow;
 186         /*
 187          * Ring buffer of completion events.
 188          *
 189          * The kernel writes completion events fresh every time they are
 190          * produced, so the application is allowed to modify pending
 191          * entries.
 192          */
 193         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp; /* flexible array member; must stay the last field */
 194 };
 195
 196 enum io_uring_cmd_flags { /* values are distinct single bits, so they can be OR-ed together */
 197         IO_URING_F_NONBLOCK             = 1, /* NOTE(review): presumably "this issue attempt must not block" -- confirm at use sites */
 198         IO_URING_F_COMPLETE_DEFER       = 2, /* NOTE(review): presumably requests deferred/batched completion -- confirm at use sites */
 199 };
 200
 201 struct io_mapped_ubuf { /* header plus trailing bio_vec array; NOTE(review): presumably one registered user buffer -- confirm */
 202         u64             ubuf; /* NOTE(review): presumably the user address where the buffer starts -- confirm */
 203         u64             ubuf_end; /* NOTE(review): presumably the end address of the buffer; inclusive/exclusive not visible here */
 204         unsigned int    nr_bvecs; /* NOTE(review): presumably the number of valid entries in bvec[] -- confirm */
 205         unsigned long   acct_pages; /* NOTE(review): presumably a page count kept for memory accounting -- confirm */
 206         struct bio_vec  bvec[]; /* flexible array member; must stay the last field */
 207 };
 208
 209 struct io_ring_ctx;
 210
 211 struct io_overflow_cqe { /* one CQE plus list linkage */
 212         struct io_uring_cqe cqe; /* the completion entry itself, stored by value */
 213         struct list_head list; /* NOTE(review): presumably linked on io_ring_ctx.cq_overflow_list -- confirm */
 214 };
 215
 216 struct io_fixed_file { /* one slot of the two-level table in struct io_file_table */
 217         /* file * with additional FFS_* flags */
 218         unsigned long file_ptr; /* an integer, not a pointer: must be decoded before use as a struct file * */
 219 };
 220
 221 struct io_rsrc_put { /* argument handed to an rsrc_put_fn callback */
 222         struct list_head list; /* NOTE(review): presumably linked on io_rsrc_node.rsrc_list -- confirm */
 223         u64 tag; /* NOTE(review): presumably the user-supplied tag of the resource -- confirm */
 224         union { /* three views of the same pointer; which one is valid depends on the resource kind */
 225                 void *rsrc;
 226                 struct file *file;
 227                 struct io_mapped_ubuf *buf;
 228         };
 229 };
 230
 231 struct io_file_table { /* table of struct io_fixed_file slots; embedded in struct io_ring_ctx as file_table */
 232         /* two level table */
 233         struct io_fixed_file **files; /* first level: array of pointers to arrays of slots */
 234 };
 235
 236 struct io_rsrc_node { /* percpu-refcounted node carrying a list of pending struct io_rsrc_put entries */
 237         struct percpu_ref               refs; /* NOTE(review): presumably what io_kiocb.fixed_rsrc_refs points at -- confirm */
 238         struct list_head                node; /* NOTE(review): presumably linked on io_ring_ctx.rsrc_ref_list -- confirm */
 239         struct list_head                rsrc_list; /* NOTE(review): presumably the head of a list of struct io_rsrc_put -- confirm */
 240         struct io_rsrc_data             *rsrc_data; /* the resource data set this node refers to */
 241         struct llist_node               llist; /* NOTE(review): presumably linked on io_ring_ctx.rsrc_put_llist -- confirm */
 242         bool                            done;
 243 };
 244
 245 typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); /* a function type, not a pointer type; stored as 'rsrc_put_fn *do_put' in struct io_rsrc_data */
 246
 247 struct io_rsrc_data { /* bookkeeping for one set of resources; io_ring_ctx keeps one each as file_data and buf_data */
 248         struct io_ring_ctx              *ctx; /* owning ring context */
 249
 250         u64                             **tags; /* NOTE(review): presumably a two-level tag table chunked by IO_RSRC_TAG_TABLE_SHIFT -- confirm */
 251         unsigned int                    nr; /* NOTE(review): presumably the number of resources/tags -- confirm */
 252         rsrc_put_fn                     *do_put; /* callback of type rsrc_put_fn */
 253         atomic_t                        refs;
 254         struct completion               done;
 255         bool                            quiesce;
 256 };
 257
 258 struct io_buffer { /* one buffer record; io_sr_msg.kbuf points at one of these */
 259         struct list_head list;
 260         __u64 addr; /* same name and type as io_provide_buf.addr; NOTE(review): presumably a user address -- confirm */
 261         __u32 len; /* same name and type as io_provide_buf.len */
 262         __u16 bid; /* same name and type as io_provide_buf.bid; NOTE(review): presumably "buffer ID" -- confirm */
 263 };
 264
 265 struct io_restriction { /* embedded in struct io_ring_ctx as 'restrictions' */
 266         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); /* one bit per register opcode */
 267         DECLARE_BITMAP(sqe_op, IORING_OP_LAST); /* one bit per SQE opcode */
 268         u8 sqe_flags_allowed; /* NOTE(review): presumably a mask of IOSQE_* flags a restricted ring may use -- confirm */
 269         u8 sqe_flags_required; /* NOTE(review): presumably a mask of IOSQE_* flags every SQE must carry -- confirm */
 270         bool registered;
 271 };
 272
 273 enum { /* NOTE(review): presumably bit numbers within io_sq_data.state -- confirm at use sites */
 274         IO_SQ_THREAD_SHOULD_STOP = 0,
 275         IO_SQ_THREAD_SHOULD_PARK, /* implicitly 1 */
 276 };
 277
 278 struct io_sq_data { /* SQ poll thread state (see io_ring_ctx.sq_data); ctx_list shows several rings can use one */
 279         refcount_t              refs;
 280         atomic_t                park_pending;
 281         struct mutex            lock;
 282
 283         /* ctx's that are using this sqd */
 284         struct list_head        ctx_list;
 285
 286         struct task_struct      *thread; /* NOTE(review): presumably the SQ poll thread's task -- confirm */
 287         struct wait_queue_head  wait;
 288
 289         unsigned                sq_thread_idle; /* io_ring_ctx has a field of the same name; relationship not visible here */
 290         int                     sq_cpu;
 291         pid_t                   task_pid;
 292         pid_t                   task_tgid;
 293
 294         unsigned long           state; /* NOTE(review): presumably holds the IO_SQ_THREAD_* bits -- confirm */
 295         struct completion       exited;
 296 };
 297
 298 #define IO_IOPOLL_BATCH                 8 /* NOTE(review): use site not visible in this part of the file */
 299 #define IO_COMPL_BATCH                  32 /* sizes io_comp_state.reqs[] */
 300 #define IO_REQ_CACHE_SIZE               32 /* sizes io_submit_state.reqs[] */
 301 #define IO_REQ_ALLOC_BATCH              8 /* NOTE(review): use site not visible in this part of the file */
 302
 303 struct io_comp_state { /* completion batching state; embedded in struct io_submit_state as 'comp' */
 304         struct io_kiocb         *reqs[IO_COMPL_BATCH]; /* fixed-capacity array of request pointers */
 305         unsigned int            nr; /* NOTE(review): presumably the number of valid entries in reqs[] -- confirm */
 306         /* inline/task_work completion list, under ->uring_lock */
 307         struct list_head        free_list;
 308 };
 309
 310 struct io_submit_link { /* head/last pointer pair; embedded in struct io_submit_state as 'link' */
 311         struct io_kiocb         *head; /* NOTE(review): presumably the first request of the link chain being built -- confirm */
 312         struct io_kiocb         *last; /* NOTE(review): presumably the most recently appended request -- confirm */
 313 };
 314
 315 struct io_submit_state { /* submission-side scratch state; embedded in struct io_ring_ctx as 'submit_state' */
 316         struct blk_plug         plug;
 317         struct io_submit_link   link;
 318
 319         /*
 320          * io_kiocb alloc cache
 321          */
 322         void                    *reqs[IO_REQ_CACHE_SIZE];
 323         unsigned int            free_reqs; /* NOTE(review): presumably the number of cached entries in reqs[] -- confirm */
 324
 325         bool                    plug_started; /* NOTE(review): presumably records whether 'plug' is active -- confirm */
 326
 327         /*
 328          * Batch completion logic
 329          */
 330         struct io_comp_state    comp;
 331
 332         /*
 333          * File reference cache
 334          */
 335         struct file             *file;
 336         unsigned int            fd;
 337         unsigned int            file_refs;
 338         unsigned int            ios_left;
 339 };
 340
 341 struct io_ring_ctx { /* per-ring kernel context; fields are grouped into anonymous structs, several cacheline-aligned */
 342         struct {
 343                 struct percpu_ref       refs;
 344         } ____cacheline_aligned_in_smp;
 345
 346         struct { /* submission-side state */
 347                 unsigned int            flags;
 348                 unsigned int            compat: 1;
 349                 unsigned int            drain_next: 1;
 350                 unsigned int            eventfd_async: 1;
 351                 unsigned int            restricted: 1;
 352
 353                 /*
 354                  * Ring buffer of indices into array of io_uring_sqe, which is
 355                  * mmapped by the application using the IORING_OFF_SQES offset.
 356                  *
 357                  * This indirection could e.g. be used to assign fixed
 358                  * io_uring_sqe entries to operations and only submit them to
 359                  * the queue when needed.
 360                  *
 361                  * The kernel modifies neither the indices array nor the entries
 362                  * array.
 363                  */
 364                 u32                     *sq_array;
 365                 unsigned                cached_sq_head;
 366                 unsigned                sq_entries;
 367                 unsigned                sq_thread_idle;
 368                 unsigned                cached_sq_dropped;
 369                 unsigned long           sq_check_overflow;
 370
 371                 struct list_head        defer_list;
 372                 struct list_head        timeout_list;
 373                 struct list_head        cq_overflow_list;
 374
 375                 struct io_uring_sqe     *sq_sqes; /* the io_uring_sqe entries array that sq_array indexes into */
 376         } ____cacheline_aligned_in_smp;
 377
 378         struct {
 379                 struct mutex            uring_lock;
 380                 wait_queue_head_t       wait;
 381         } ____cacheline_aligned_in_smp;
 382
 383         struct io_submit_state          submit_state;
 384         /* IRQ completion list, under ->completion_lock */
 385         struct list_head        locked_free_list;
 386         unsigned int            locked_free_nr;
 387
 388         struct io_rings *rings; /* the structure shared with the application (see struct io_rings) */
 389
 390         const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
 391         struct io_sq_data       *sq_data;       /* if using sq thread polling */
 392
 393         struct wait_queue_head  sqo_sq_wait;
 394         struct list_head        sqd_list; /* NOTE(review): presumably this ctx's link on io_sq_data.ctx_list -- confirm */
 395
 396         /*
 397          * Fixed resources fast path, should be accessed only under uring_lock,
 398          * and updated through io_uring_register(2)
 399          */
 400         struct io_rsrc_node     *rsrc_node;
 401
 402         struct io_file_table    file_table;
 403         unsigned                nr_user_files;
 404         unsigned                nr_user_bufs;
 405         struct io_mapped_ubuf   **user_bufs;
 406
 407         struct xarray           io_buffers;
 408         struct xarray           personalities;
 409         u32                     pers_next;
 410
 411         struct { /* completion-queue side state */
 412                 unsigned                cached_cq_tail;
 413                 unsigned                cq_entries;
 414                 atomic_t                cq_timeouts;
 415                 unsigned                cq_last_tm_flush;
 416                 unsigned                cq_extra;
 417                 unsigned long           cq_check_overflow;
 418                 struct wait_queue_head  cq_wait;
 419                 struct fasync_struct    *cq_fasync;
 420                 struct eventfd_ctx      *cq_ev_fd;
 421         } ____cacheline_aligned_in_smp;
 422
 423         struct {
 424                 spinlock_t              completion_lock;
 425
 426                 /*
 427                  * ->iopoll_list is protected by the ctx->uring_lock for
 428                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 429                  * For SQPOLL, only the single threaded io_sq_thread() will
 430                  * manipulate the list, hence no extra locking is needed there.
 431                  */
 432                 struct list_head        iopoll_list;
 433                 struct hlist_head       *cancel_hash;
 434                 unsigned                cancel_hash_bits;
 435                 bool                    poll_multi_file;
 436         } ____cacheline_aligned_in_smp;
 437
 438         struct io_restriction           restrictions;
 439
 440         /* slow path rsrc auxiliary data, used by update/register */
 441         struct {
 442                 struct io_rsrc_node             *rsrc_backup_node;
 443                 struct io_mapped_ubuf           *dummy_ubuf;
 444                 struct io_rsrc_data             *file_data;
 445                 struct io_rsrc_data             *buf_data;
 446
 447                 struct delayed_work             rsrc_put_work;
 448                 struct llist_head               rsrc_put_llist;
 449                 struct list_head                rsrc_ref_list;
 450                 spinlock_t                      rsrc_ref_lock;
 451         };
 452
 453         /* Keep this last, we don't need it for the fast path */
 454         struct {
 455                 #if defined(CONFIG_UNIX)
 456                         struct socket           *ring_sock;
 457                 #endif
 458                 /* hashed buffered write serialization */
 459                 struct io_wq_hash               *hash_map;
 460
 461                 /* Only used for accounting purposes */
 462                 struct user_struct              *user;
 463                 struct mm_struct                *mm_account;
 464
 465                 /* ctx exit and cancelation */
 466                 struct callback_head            *exit_task_work;
 467                 struct work_struct              exit_work;
 468                 struct list_head                tctx_list;
 469                 struct completion               ref_comp;
 470         };
 471 };
 472
 473 struct io_uring_task { /* NOTE(review): presumably the per-task io_uring state -- owner not visible in this part of the file */
 474         /* submission side */
 475         struct xarray           xa;
 476         struct wait_queue_head  wait;
 477         const struct io_ring_ctx *last; /* NOTE(review): presumably the most recently used ring, kept as a lookup shortcut -- confirm */
 478         struct io_wq            *io_wq;
 479         struct percpu_counter   inflight;
 480         atomic_t                inflight_tracked;
 481         atomic_t                in_idle;
 482
 483         spinlock_t              task_lock; /* NOTE(review): presumably protects task_list -- confirm */
 484         struct io_wq_work_list  task_list;
 485         unsigned long           task_state;
 486         struct callback_head    task_work;
 487 };
 488
 489 /*
 490  * First field must be the file pointer in all the
 491  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 492  */
 493 struct io_poll_iocb { /* poll state; used as io_kiocb.poll and inside struct async_poll */
 494         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 495         struct wait_queue_head          *head;
 496         __poll_t                        events;
 497         bool                            done;
 498         bool                            canceled;
 499         struct wait_queue_entry         wait;
 500 };
 501
 502 struct io_poll_update { /* member of the io_kiocb union (poll_update) */
 503         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 504         u64                             old_user_data;
 505         u64                             new_user_data;
 506         __poll_t                        events;
 507         bool                            update_events; /* NOTE(review): presumably selects whether 'events' is applied -- confirm */
 508         bool                            update_user_data; /* NOTE(review): presumably selects whether 'new_user_data' is applied -- confirm */
 509 };
 510
 511 struct io_close { /* member of the io_kiocb union (close) */
 512         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 513         int                             fd;
 514 };
 515
 516 struct io_timeout_data { /* not a member of the io_kiocb union, so it has no leading file pointer */
 517         struct io_kiocb                 *req; /* back-pointer to a request */
 518         struct hrtimer                  timer;
 519         struct timespec64               ts;
 520         enum hrtimer_mode               mode;
 521 };
 522
 523 struct io_accept { /* member of the io_kiocb union (accept) */
 524         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 525         struct sockaddr __user          *addr; /* user-space pointer (__user) */
 526         int __user                      *addr_len; /* user-space pointer (__user) */
 527         int                             flags;
 528         unsigned long                   nofile; /* NOTE(review): presumably an RLIMIT_NOFILE value captured earlier -- confirm */
 529 };
 530
 531 struct io_sync { /* member of the io_kiocb union (sync) */
 532         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 533         loff_t                          len;
 534         loff_t                          off;
 535         int                             flags;
 536         int                             mode;
 537 };
 538
 539 struct io_cancel { /* member of the io_kiocb union (cancel) */
 540         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 541         u64                             addr; /* NOTE(review): presumably the user_data of the request to cancel -- confirm */
 542 };
 543
 544 struct io_timeout { /* member of the io_kiocb union (timeout) */
 545         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 546         u32                             off;
 547         u32                             target_seq;
 548         struct list_head                list; /* NOTE(review): presumably linked on io_ring_ctx.timeout_list -- confirm */
 549         /* head of the link, used by linked timeouts only */
 550         struct io_kiocb                 *head;
 551 };
 552
 553 struct io_timeout_rem { /* member of the io_kiocb union (timeout_rem) */
 554         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 555         u64                             addr; /* NOTE(review): presumably the user_data of the targeted timeout -- confirm */
 556
 557         /* timeout update */
 558         struct timespec64               ts;
 559         u32                             flags;
 560 };
 561
 562 struct io_rw { /* member of the io_kiocb union (rw) */
 563         /* NOTE: kiocb has the file as the first member, so don't do it here */
 564         struct kiocb                    kiocb; /* must stay first so that its leading file pointer overlays io_kiocb.file */
 565         u64                             addr;
 566         u64                             len;
 567 };
 568
 569 struct io_connect { /* member of the io_kiocb union (connect) */
 570         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 571         struct sockaddr __user          *addr; /* user-space pointer (__user) */
 572         int                             addr_len; /* held by value here, unlike the pointer in struct io_accept */
 573 };
 574
 575 struct io_sr_msg { /* member of the io_kiocb union (sr_msg) */
 576         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 577         union { /* three typed views of one user-space pointer */
 578                 struct compat_msghdr __user     *umsg_compat;
 579                 struct user_msghdr __user       *umsg;
 580                 void __user                     *buf;
 581         };
 582         int                             msg_flags;
 583         int                             bgid; /* NOTE(review): presumably a buffer group ID, cf. io_provide_buf.bgid -- confirm */
 584         size_t                          len;
 585         struct io_buffer                *kbuf;
 586 };
 587
 588 struct io_open { /* member of the io_kiocb union (open) */
 589         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 590         int                             dfd;
 591         struct filename                 *filename; /* kernel struct filename, not a __user pointer */
 592         struct open_how                 how; /* stored by value */
 593         unsigned long                   nofile; /* NOTE(review): presumably an RLIMIT_NOFILE value captured earlier -- confirm */
 594 };
 595
 596 struct io_rsrc_update { /* member of the io_kiocb union (rsrc_update) */
 597         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 598         u64                             arg; /* NOTE(review): presumably a user address carried as an integer -- confirm */
 599         u32                             nr_args;
 600         u32                             offset;
 601 };
 602
 603 struct io_fadvise { /* member of the io_kiocb union (fadvise) */
 604         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 605         u64                             offset;
 606         u32                             len; /* 32-bit length */
 607         u32                             advice;
 608 };
 609
 610 struct io_madvise { /* member of the io_kiocb union (madvise); same layout as io_fadvise, with addr in place of offset */
 611         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 612         u64                             addr;
 613         u32                             len; /* 32-bit length */
 614         u32                             advice;
 615 };
 616
 617 struct io_epoll { /* member of the io_kiocb union (epoll) */
 618         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 619         int                             epfd;
 620         int                             op;
 621         int                             fd;
 622         struct epoll_event              event; /* stored by value */
 623 };
 624
 625 struct io_splice { /* member of the io_kiocb union (splice) */
 626         struct file                     *file_out; /* first member, so this is the pointer that overlays io_kiocb.file */
 627         struct file                     *file_in;
 628         loff_t                          off_out;
 629         loff_t                          off_in;
 630         u64                             len;
 631         unsigned int                    flags;
 632 };
 633
 634 struct io_provide_buf { /* member of the io_kiocb union (pbuf) */
 635         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 636         __u64                           addr;
 637         __u32                           len;
 638         __u32                           bgid; /* NOTE(review): presumably "buffer group ID" -- confirm */
 639         __u16                           nbufs; /* NOTE(review): presumably the number of buffers -- confirm */
 640         __u16                           bid; /* NOTE(review): presumably the first buffer ID -- confirm */
 641 };
 642
 643 struct io_statx { /* member of the io_kiocb union (statx) */
 644         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 645         int                             dfd;
 646         unsigned int                    mask;
 647         unsigned int                    flags;
 648         const char __user               *filename; /* user-space pointer (__user), unlike io_open.filename */
 649         struct statx __user             *buffer; /* user-space pointer (__user) */
 650 };
 651
 652 struct io_shutdown { /* member of the io_kiocb union (shutdown) */
 653         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 654         int                             how;
 655 };
 656
 657 struct io_rename { /* member of the io_kiocb union (rename) */
 658         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 659         int                             old_dfd;
 660         int                             new_dfd;
 661         struct filename                 *oldpath; /* kernel struct filename, not a __user pointer */
 662         struct filename                 *newpath; /* kernel struct filename, not a __user pointer */
 663         int                             flags;
 664 };
 665
 666 struct io_unlink { /* member of the io_kiocb union (unlink) */
 667         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 668         int                             dfd;
 669         int                             flags;
 670         struct filename                 *filename; /* kernel struct filename, not a __user pointer */
 671 };
 672
 673 struct io_completion { /* io_kiocb.compl; it overlays the per-op data, which is why it is used only after that data is cleaned */
 674         struct file                     *file; /* must stay first: overlays io_kiocb.file in the union */
 675         struct list_head                list;
 676         u32                             cflags; /* NOTE(review): presumably the flags value posted in the CQE -- confirm */
 677 };
 678
 679 struct io_async_connect { /* not an io_kiocb union member; NOTE(review): presumably reached via io_kiocb.async_data -- confirm */
 680         struct sockaddr_storage         address; /* stored by value, in kernel memory */
 681 };
 682
 683 struct io_async_msghdr { /* not an io_kiocb union member; NOTE(review): presumably reached via io_kiocb.async_data -- confirm */
 684         struct iovec                    fast_iov[UIO_FASTIOV]; /* inline iovec storage, used when free_iov is NULL */
 685         /* points to an allocated iov, if NULL we use fast_iov instead */
 686         struct iovec                    *free_iov;
 687         struct sockaddr __user          *uaddr; /* user-space pointer (__user) */
 688         struct msghdr                   msg;
 689         struct sockaddr_storage         addr; /* stored by value, in kernel memory */
 690 };
 691
 692 struct io_async_rw { /* not an io_kiocb union member; NOTE(review): presumably reached via io_kiocb.async_data -- confirm */
 693         struct iovec                    fast_iov[UIO_FASTIOV]; /* inline iovec storage */
 694         const struct iovec              *free_iovec; /* NOTE(review): presumably an allocated iovec to free later, cf. io_async_msghdr.free_iov -- confirm */
 695         struct iov_iter                 iter;
 696         size_t                          bytes_done;
 697         struct wait_page_queue          wpq;
 698 };
 699
 700 enum { /* bit numbers for io_kiocb.flags; the REQ_F_* enum that follows turns each into a mask with BIT() */
 701         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT, /* the first six reuse the IOSQE_* bit numbers unchanged */
 702         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 703         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 704         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 705         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 706         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 707
 708         /* first byte is taken by user flags, shift it to not overlap */
 709         REQ_F_FAIL_BIT          = 8, /* kernel-internal bits start here and count up implicitly */
 710         REQ_F_INFLIGHT_BIT,
 711         REQ_F_CUR_POS_BIT,
 712         REQ_F_NOWAIT_BIT,
 713         REQ_F_LINK_TIMEOUT_BIT,
 714         REQ_F_NEED_CLEANUP_BIT,
 715         REQ_F_POLLED_BIT,
 716         REQ_F_BUFFER_SELECTED_BIT,
 717         REQ_F_LTIMEOUT_ACTIVE_BIT,
 718         REQ_F_COMPLETE_INLINE_BIT,
 719         REQ_F_REISSUE_BIT,
 720         REQ_F_DONT_REISSUE_BIT,
 721         /* keep async read/write and isreg together and in order */
 722         REQ_F_ASYNC_READ_BIT,
 723         REQ_F_ASYNC_WRITE_BIT,
 724         REQ_F_ISREG_BIT,
 725
 726         /* not a real bit, just to check we're not overflowing the space */
 727         __REQ_F_LAST_BIT, /* must stay last; io_kiocb.flags is an unsigned int */
 728 };
 729
 730 enum { /* mask form of the REQ_F_*_BIT numbers: REQ_F_x == BIT(REQ_F_x_BIT) */
 731         /* ctx owns file */
 732         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 733         /* drain existing IO first */
 734         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 735         /* linked sqes */
 736         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 737         /* doesn't sever on completion < 0 */
 738         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 739         /* IOSQE_ASYNC */
 740         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 741         /* IOSQE_BUFFER_SELECT */
 742         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 743
 744         /* fail rest of links */
 745         REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
 746         /* on inflight list, should be cancelled and waited on exit reliably */
 747         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 748         /* read/write uses file position */
 749         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 750         /* must not punt to workers */
 751         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 752         /* has or had linked timeout */
 753         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 754         /* needs cleanup */
 755         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 756         /* already went through poll handler */
 757         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 758         /* buffer already selected */
 759         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 760         /* linked timeout is active, i.e. prepared by link's head */
 761         REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
 762         /* completion is deferred through io_comp_state */
 763         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 764         /* caller should reissue async */
 765         REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
 766         /* don't attempt request reissue, see io_rw_reissue() */
 767         REQ_F_DONT_REISSUE      = BIT(REQ_F_DONT_REISSUE_BIT),
 768         /* supports async reads */
 769         REQ_F_ASYNC_READ        = BIT(REQ_F_ASYNC_READ_BIT),
 770         /* supports async writes */
 771         REQ_F_ASYNC_WRITE       = BIT(REQ_F_ASYNC_WRITE_BIT),
 772         /* regular file */
 773         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 774 };
 775
 776 struct async_poll { /* pointed to by io_kiocb.apoll */
 777         struct io_poll_iocb     poll; /* embedded by value */
 778         struct io_poll_iocb     *double_poll; /* NOTE(review): presumably an optional second poll entry, NULL when unused -- confirm */
 779 };
 780
 781 struct io_task_work { /* shares storage with a struct callback_head in io_kiocb (anonymous union) */
 782         struct io_wq_work_node  node; /* NOTE(review): presumably linked on io_uring_task.task_list -- confirm */
 783         task_work_func_t        func; /* callback function pointer */
 784 };
 785
 786 enum { /* resource kind selector; NOTE(review): use sites not visible in this part of the file */
 787         IORING_RSRC_FILE                = 0,
 788         IORING_RSRC_BUFFER              = 1,
 789 };
 790
 791 /*
 792  * NOTE! Each of the iocb union members has the file pointer
 793  * as the first entry in their struct definition. So you can
 794  * access the file pointer through any of the sub-structs,
 795  * or directly as just 'file' in this struct.
 796  */
 797 struct io_kiocb { /* one io_uring request: per-opcode state in the leading union, then common bookkeeping */
 798         union { /* all members share storage; each begins with a struct file pointer */
 799                 struct file             *file;
 800                 struct io_rw            rw;
 801                 struct io_poll_iocb     poll;
 802                 struct io_poll_update   poll_update;
 803                 struct io_accept        accept;
 804                 struct io_sync          sync;
 805                 struct io_cancel        cancel;
 806                 struct io_timeout       timeout;
 807                 struct io_timeout_rem   timeout_rem;
 808                 struct io_connect       connect;
 809                 struct io_sr_msg        sr_msg;
 810                 struct io_open          open;
 811                 struct io_close         close;
 812                 struct io_rsrc_update   rsrc_update;
 813                 struct io_fadvise       fadvise;
 814                 struct io_madvise       madvise;
 815                 struct io_epoll         epoll;
 816                 struct io_splice        splice;
 817                 struct io_provide_buf   pbuf;
 818                 struct io_statx         statx;
 819                 struct io_shutdown      shutdown;
 820                 struct io_rename        rename;
 821                 struct io_unlink        unlink;
 822                 /* use only after cleaning per-op data, see io_clean_op() */
 823                 struct io_completion    compl;
 824         };
 825
 826         /* opcode allocated if it needs to store data for async defer */
 827         void                            *async_data;
 828         u8                              opcode;
 829         /* polled IO has completed */
 830         u8                              iopoll_completed;
 831
 832         u16                             buf_index;
 833         u32                             result;
 834
 835         struct io_ring_ctx              *ctx; /* the ring context of this request */
 836         unsigned int                    flags; /* REQ_F_* mask bits */
 837         atomic_t                        refs;
 838         struct task_struct              *task;
 839         u64                             user_data;
 840
 841         struct io_kiocb                 *link; /* NOTE(review): presumably the next request in a link chain -- confirm */
 842         struct percpu_ref               *fixed_rsrc_refs;
 843
 844         /* used with ctx->iopoll_list with reads/writes */
 845         struct list_head                inflight_entry;
 846         union { /* two alternative task-work representations sharing storage */
 847                 struct io_task_work     io_task_work;
 848                 struct callback_head    task_work;
 849         };
 850         /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 851         struct hlist_node               hash_node;
 852         struct async_poll               *apoll;
 853         struct io_wq_work               work;
 854         /* store used ubuf, so we can prevent reloading */
 855         struct io_mapped_ubuf           *imu;
 856 };
 857
/*
 * Associates one task with one ring.
 * NOTE(review): ctx_node is presumably linked into ctx->tctx_list - confirm.
 */
struct io_tctx_node {
        struct list_head        ctx_node;
        struct task_struct      *task;
        struct io_ring_ctx      *ctx;
};
 863
/*
 * One entry on ctx->defer_list: a request held back together with the
 * sequence number that req_need_defer() compares against CQ progress.
 */
struct io_defer_entry {
        struct list_head        list;
        struct io_kiocb         *req;
        u32                     seq;
};
 869
/* Static per-opcode properties; one entry per IORING_OP_* in io_op_defs[]. */
struct io_op_def {
        /* needs req->file assigned */
        unsigned                needs_file : 1;
        /* hash wq insertion if file is a regular file */
        unsigned                hash_reg_file : 1;
        /* unbound wq insertion if file is a non-regular file */
        unsigned                unbound_nonreg_file : 1;
        /* opcode is not supported by this kernel */
        unsigned                not_supported : 1;
        /* set if opcode supports polled "wait" */
        unsigned                pollin : 1;
        /* as pollin; in io_op_defs[] it is set on the write/send/connect opcodes */
        unsigned                pollout : 1;
        /* op supports buffer selection */
        unsigned                buffer_select : 1;
        /* do prep async if is going to be punted */
        unsigned                needs_async_setup : 1;
        /* should block plug */
        unsigned                plug : 1;
        /* size of async data needed, if any */
        unsigned short          async_size;
};
 891
 892 static const struct io_op_def io_op_defs[] = {
 893         [IORING_OP_NOP] = {},
 894         [IORING_OP_READV] = {
 895                 .needs_file             = 1,
 896                 .unbound_nonreg_file    = 1,
 897                 .pollin                 = 1,
 898                 .buffer_select          = 1,
 899                 .needs_async_setup      = 1,
 900                 .plug                   = 1,
 901                 .async_size             = sizeof(struct io_async_rw),
 902         },
 903         [IORING_OP_WRITEV] = {
 904                 .needs_file             = 1,
 905                 .hash_reg_file          = 1,
 906                 .unbound_nonreg_file    = 1,
 907                 .pollout                = 1,
 908                 .needs_async_setup      = 1,
 909                 .plug                   = 1,
 910                 .async_size             = sizeof(struct io_async_rw),
 911         },
 912         [IORING_OP_FSYNC] = {
 913                 .needs_file             = 1,
 914         },
 915         [IORING_OP_READ_FIXED] = {
 916                 .needs_file             = 1,
 917                 .unbound_nonreg_file    = 1,
 918                 .pollin                 = 1,
 919                 .plug                   = 1,
 920                 .async_size             = sizeof(struct io_async_rw),
 921         },
 922         [IORING_OP_WRITE_FIXED] = {
 923                 .needs_file             = 1,
 924                 .hash_reg_file          = 1,
 925                 .unbound_nonreg_file    = 1,
 926                 .pollout                = 1,
 927                 .plug                   = 1,
 928                 .async_size             = sizeof(struct io_async_rw),
 929         },
 930         [IORING_OP_POLL_ADD] = {
 931                 .needs_file             = 1,
 932                 .unbound_nonreg_file    = 1,
 933         },
 934         [IORING_OP_POLL_REMOVE] = {},
 935         [IORING_OP_SYNC_FILE_RANGE] = {
 936                 .needs_file             = 1,
 937         },
 938         [IORING_OP_SENDMSG] = {
 939                 .needs_file             = 1,
 940                 .unbound_nonreg_file    = 1,
 941                 .pollout                = 1,
 942                 .needs_async_setup      = 1,
 943                 .async_size             = sizeof(struct io_async_msghdr),
 944         },
 945         [IORING_OP_RECVMSG] = {
 946                 .needs_file             = 1,
 947                 .unbound_nonreg_file    = 1,
 948                 .pollin                 = 1,
 949                 .buffer_select          = 1,
 950                 .needs_async_setup      = 1,
 951                 .async_size             = sizeof(struct io_async_msghdr),
 952         },
 953         [IORING_OP_TIMEOUT] = {
 954                 .async_size             = sizeof(struct io_timeout_data),
 955         },
 956         [IORING_OP_TIMEOUT_REMOVE] = {
 957                 /* used by timeout updates' prep() */
 958         },
 959         [IORING_OP_ACCEPT] = {
 960                 .needs_file             = 1,
 961                 .unbound_nonreg_file    = 1,
 962                 .pollin                 = 1,
 963         },
 964         [IORING_OP_ASYNC_CANCEL] = {},
 965         [IORING_OP_LINK_TIMEOUT] = {
 966                 .async_size             = sizeof(struct io_timeout_data),
 967         },
 968         [IORING_OP_CONNECT] = {
 969                 .needs_file             = 1,
 970                 .unbound_nonreg_file    = 1,
 971                 .pollout                = 1,
 972                 .needs_async_setup      = 1,
 973                 .async_size             = sizeof(struct io_async_connect),
 974         },
 975         [IORING_OP_FALLOCATE] = {
 976                 .needs_file             = 1,
 977         },
 978         [IORING_OP_OPENAT] = {},
 979         [IORING_OP_CLOSE] = {},
 980         [IORING_OP_FILES_UPDATE] = {},
 981         [IORING_OP_STATX] = {},
 982         [IORING_OP_READ] = {
 983                 .needs_file             = 1,
 984                 .unbound_nonreg_file    = 1,
 985                 .pollin                 = 1,
 986                 .buffer_select          = 1,
 987                 .plug                   = 1,
 988                 .async_size             = sizeof(struct io_async_rw),
 989         },
 990         [IORING_OP_WRITE] = {
 991                 .needs_file             = 1,
 992                 .unbound_nonreg_file    = 1,
 993                 .pollout                = 1,
 994                 .plug                   = 1,
 995                 .async_size             = sizeof(struct io_async_rw),
 996         },
 997         [IORING_OP_FADVISE] = {
 998                 .needs_file             = 1,
 999         },
1000         [IORING_OP_MADVISE] = {},
1001         [IORING_OP_SEND] = {
1002                 .needs_file             = 1,
1003                 .unbound_nonreg_file    = 1,
1004                 .pollout                = 1,
1005         },
1006         [IORING_OP_RECV] = {
1007                 .needs_file             = 1,
1008                 .unbound_nonreg_file    = 1,
1009                 .pollin                 = 1,
1010                 .buffer_select          = 1,
1011         },
1012         [IORING_OP_OPENAT2] = {
1013         },
1014         [IORING_OP_EPOLL_CTL] = {
1015                 .unbound_nonreg_file    = 1,
1016         },
1017         [IORING_OP_SPLICE] = {
1018                 .needs_file             = 1,
1019                 .hash_reg_file          = 1,
1020                 .unbound_nonreg_file    = 1,
1021         },
1022         [IORING_OP_PROVIDE_BUFFERS] = {},
1023         [IORING_OP_REMOVE_BUFFERS] = {},
1024         [IORING_OP_TEE] = {
1025                 .needs_file             = 1,
1026                 .hash_reg_file          = 1,
1027                 .unbound_nonreg_file    = 1,
1028         },
1029         [IORING_OP_SHUTDOWN] = {
1030                 .needs_file             = 1,
1031         },
1032         [IORING_OP_RENAMEAT] = {},
1033         [IORING_OP_UNLINKAT] = {},
1034 };
1035
/*
 * Forward declarations for helpers that are used in this file before
 * their definitions appear.
 */
static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
                                         struct task_struct *task,
                                         bool cancel_all);
static void io_uring_cancel_sqpoll(struct io_sq_data *sqd);
static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);

static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
                                 long res, unsigned int cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_dismantle_req(struct io_kiocb *req);
static void io_put_task(struct task_struct *task, int nr);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
                                     struct io_uring_rsrc_update2 *up,
                                     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
                                struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void io_submit_flush_completions(struct io_comp_state *cs,
                                        struct io_ring_ctx *ctx);
static bool io_poll_remove_waitqs(struct io_kiocb *req);
static int io_req_prep_async(struct io_kiocb *req);

/* NOTE(review): presumably the slab cache backing struct io_kiocb - confirm */
static struct kmem_cache *req_cachep;

/* file operations of io_uring files; compared against file->f_op below */
static const struct file_operations io_uring_fops;
1070
1071 struct sock *io_uring_get_socket(struct file *file)
1072 {
1073 #if defined(CONFIG_UNIX)
1074         if (file->f_op == &io_uring_fops) {
1075                 struct io_ring_ctx *ctx = file->private_data;
1076
1077                 return ctx->ring_sock->sk;
1078         }
1079 #endif
1080         return NULL;
1081 }
1082 EXPORT_SYMBOL(io_uring_get_socket);
1083
/*
 * Walk a chain of linked requests: @pos starts at @head and follows
 * ->link until it becomes NULL.
 */
#define io_for_each_link(pos, head) \
        for (pos = (head); pos; pos = pos->link)
1086
1087 static inline void io_req_set_rsrc_node(struct io_kiocb *req)
1088 {
1089         struct io_ring_ctx *ctx = req->ctx;
1090
1091         if (!req->fixed_rsrc_refs) {
1092                 req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
1093                 percpu_ref_get(req->fixed_rsrc_refs);
1094         }
1095 }
1096
/*
 * Resurrect @ref.
 *
 * If a temporary reference can still be taken, resurrect right away and
 * drop that reference again. Otherwise the count already hit zero, so
 * wait on @compl (signalled from ->release()) before resurrecting.
 */
static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
{
        if (percpu_ref_tryget(ref)) {
                percpu_ref_resurrect(ref);
                percpu_ref_put(ref);
                return;
        }

        /* already at zero, wait for ->release() */
        wait_for_completion(compl);
        percpu_ref_resurrect(ref);
}
1108
1109 static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1110                           bool cancel_all)
1111 {
1112         struct io_kiocb *req;
1113
1114         if (task && head->task != task)
1115                 return false;
1116         if (cancel_all)
1117                 return true;
1118
1119         io_for_each_link(req, head) {
1120                 if (req->flags & REQ_F_INFLIGHT)
1121                         return true;
1122         }
1123         return false;
1124 }
1125
/* Mark @req as failed by setting REQ_F_FAIL in its flags. */
static inline void req_set_fail(struct io_kiocb *req)
{
        req->flags |= REQ_F_FAIL;
}
1130
1131 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1132 {
1133         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1134
1135         complete(&ctx->ref_comp);
1136 }
1137
1138 static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1139 {
1140         return !req->timeout.off;
1141 }
1142
1143 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1144 {
1145         struct io_ring_ctx *ctx;
1146         int hash_bits;
1147
1148         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1149         if (!ctx)
1150                 return NULL;
1151
1152         /*
1153          * Use 5 bits less than the max cq entries, that should give us around
1154          * 32 entries per hash list if totally full and uniformly spread.
1155          */
1156         hash_bits = ilog2(p->cq_entries);
1157         hash_bits -= 5;
1158         if (hash_bits <= 0)
1159                 hash_bits = 1;
1160         ctx->cancel_hash_bits = hash_bits;
1161         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1162                                         GFP_KERNEL);
1163         if (!ctx->cancel_hash)
1164                 goto err;
1165         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1166
1167         ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1168         if (!ctx->dummy_ubuf)
1169                 goto err;
1170         /* set invalid range, so io_import_fixed() fails meeting it */
1171         ctx->dummy_ubuf->ubuf = -1UL;
1172
1173         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1174                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1175                 goto err;
1176
1177         ctx->flags = p->flags;
1178         init_waitqueue_head(&ctx->sqo_sq_wait);
1179         INIT_LIST_HEAD(&ctx->sqd_list);
1180         init_waitqueue_head(&ctx->cq_wait);
1181         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1182         init_completion(&ctx->ref_comp);
1183         xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1184         xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1185         mutex_init(&ctx->uring_lock);
1186         init_waitqueue_head(&ctx->wait);
1187         spin_lock_init(&ctx->completion_lock);
1188         INIT_LIST_HEAD(&ctx->iopoll_list);
1189         INIT_LIST_HEAD(&ctx->defer_list);
1190         INIT_LIST_HEAD(&ctx->timeout_list);
1191         spin_lock_init(&ctx->rsrc_ref_lock);
1192         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1193         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1194         init_llist_head(&ctx->rsrc_put_llist);
1195         INIT_LIST_HEAD(&ctx->tctx_list);
1196         INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
1197         INIT_LIST_HEAD(&ctx->locked_free_list);
1198         return ctx;
1199 err:
1200         kfree(ctx->dummy_ubuf);
1201         kfree(ctx->cancel_hash);
1202         kfree(ctx);
1203         return NULL;
1204 }
1205
1206 static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1207 {
1208         struct io_rings *r = ctx->rings;
1209
1210         WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1211         ctx->cq_extra--;
1212 }
1213
1214 static bool req_need_defer(struct io_kiocb *req, u32 seq)
1215 {
1216         if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1217                 struct io_ring_ctx *ctx = req->ctx;
1218
1219                 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
1220         }
1221
1222         return false;
1223 }
1224
1225 static void io_req_track_inflight(struct io_kiocb *req)
1226 {
1227         if (!(req->flags & REQ_F_INFLIGHT)) {
1228                 req->flags |= REQ_F_INFLIGHT;
1229                 atomic_inc(&current->io_uring->inflight_tracked);
1230         }
1231 }
1232
1233 static void io_prep_async_work(struct io_kiocb *req)
1234 {
1235         const struct io_op_def *def = &io_op_defs[req->opcode];
1236         struct io_ring_ctx *ctx = req->ctx;
1237
1238         if (!req->work.creds)
1239                 req->work.creds = get_current_cred();
1240
1241         req->work.list.next = NULL;
1242         req->work.flags = 0;
1243         if (req->flags & REQ_F_FORCE_ASYNC)
1244                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
1245
1246         if (req->flags & REQ_F_ISREG) {
1247                 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1248                         io_wq_hash_work(&req->work, file_inode(req->file));
1249         } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1250                 if (def->unbound_nonreg_file)
1251                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1252         }
1253
1254         switch (req->opcode) {
1255         case IORING_OP_SPLICE:
1256         case IORING_OP_TEE:
1257                 if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
1258                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1259                 break;
1260         }
1261 }
1262
1263 static void io_prep_async_link(struct io_kiocb *req)
1264 {
1265         struct io_kiocb *cur;
1266
1267         io_for_each_link(cur, req)
1268                 io_prep_async_work(cur);
1269 }
1270
/*
 * Punt @req (and its link chain) to the submitting task's io-wq.
 *
 * The linked timeout, if io_prep_linked_timeout() returns one, is queued
 * only after the work item has been enqueued.
 */
static void io_queue_async_work(struct io_kiocb *req)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct io_kiocb *link = io_prep_linked_timeout(req);
        struct io_uring_task *tctx = req->task->io_uring;

        /* the owning task must have io_uring state and a workqueue here */
        BUG_ON(!tctx);
        BUG_ON(!tctx->io_wq);

        /* init ->work of the whole link before punting */
        io_prep_async_link(req);
        trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
                                        &req->work, req->flags);
        io_wq_enqueue(tctx->io_wq, &req->work);
        if (link)
                io_queue_linked_timeout(link);
}
1288
1289 static void io_kill_timeout(struct io_kiocb *req, int status)
1290         __must_hold(&req->ctx->completion_lock)
1291 {
1292         struct io_timeout_data *io = req->async_data;
1293
1294         if (hrtimer_try_to_cancel(&io->timer) != -1) {
1295                 atomic_set(&req->ctx->cq_timeouts,
1296                         atomic_read(&req->ctx->cq_timeouts) + 1);
1297                 list_del_init(&req->timeout.list);
1298                 io_cqring_fill_event(req->ctx, req->user_data, status, 0);
1299                 io_put_req_deferred(req, 1);
1300         }
1301 }
1302
1303 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1304 {
1305         do {
1306                 struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1307                                                 struct io_defer_entry, list);
1308
1309                 if (req_need_defer(de->req, de->seq))
1310                         break;
1311                 list_del_init(&de->list);
1312                 io_req_task_queue(de->req);
1313                 kfree(de);
1314         } while (!list_empty(&ctx->defer_list));
1315 }
1316
/*
 * Complete sequence-based timeouts whose target sequence has been reached.
 *
 * Walks ctx->timeout_list from the front and stops at the first entry
 * that is either a "noseq" timeout or has not yet seen enough events.
 * Records the sequence used in ctx->cq_last_tm_flush for the next call.
 */
static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
        u32 seq;

        if (list_empty(&ctx->timeout_list))
                return;

        /* CQ tail with completed timeouts (cq_timeouts) subtracted out */
        seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);

        do {
                u32 events_needed, events_got;
                struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
                                                struct io_kiocb, timeout.list);

                if (io_is_timeout_noseq(req))
                        break;

                /*
                 * Since seq can easily wrap around over time, subtract
                 * the last seq at which timeouts were flushed before comparing.
                 * Assuming not more than 2^31-1 events have happened since,
                 * these subtractions won't have wrapped, so we can check if
                 * target is in [last_seq, current_seq] by comparing the two.
                 */
                events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
                events_got = seq - ctx->cq_last_tm_flush;
                if (events_got < events_needed)
                        break;

                /*
                 * Unlink here unconditionally: io_kill_timeout() only unlinks
                 * when the timer could be cancelled, and this loop needs the
                 * head to advance on every iteration.
                 */
                list_del_init(&req->timeout.list);
                io_kill_timeout(req, 0);
        } while (!list_empty(&ctx->timeout_list));

        ctx->cq_last_tm_flush = seq;
}
1352
/*
 * Publish the cached CQ tail to the rings.
 *
 * Timeouts are flushed first, so any CQEs they fill are covered by the
 * tail store. Deferred requests are re-queued afterwards if any exist.
 */
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
        io_flush_timeouts(ctx);

        /* order cqe stores with ring update */
        smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);

        if (unlikely(!list_empty(&ctx->defer_list)))
                __io_queue_deferred(ctx);
}
1363
1364 static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1365 {
1366         struct io_rings *r = ctx->rings;
1367
1368         return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
1369 }
1370
1371 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1372 {
1373         return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1374 }
1375
1376 static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1377 {
1378         struct io_rings *rings = ctx->rings;
1379         unsigned tail, mask = ctx->cq_entries - 1;
1380
1381         /*
1382          * writes to the cq entry need to come after reading head; the
1383          * control dependency is enough as we're using WRITE_ONCE to
1384          * fill the cq entry
1385          */
1386         if (__io_cqring_events(ctx) == ctx->cq_entries)
1387                 return NULL;
1388
1389         tail = ctx->cached_cq_tail++;
1390         return &rings->cqes[tail & mask];
1391 }
1392
1393 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1394 {
1395         if (likely(!ctx->cq_ev_fd))
1396                 return false;
1397         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1398                 return false;
1399         return !ctx->eventfd_async || io_wq_current_is_worker();
1400 }
1401
/*
 * Notify everyone interested that completions were posted: waiters on
 * ctx->wait, the SQ data wait queue (if the ring has sq_data), the
 * registered eventfd, and pollers/fasync listeners on ctx->cq_wait.
 */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
        /* see waitqueue_active() comment */
        smp_mb();

        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);
        if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
                wake_up(&ctx->sq_data->wait);
        if (io_should_trigger_evfd(ctx))
                eventfd_signal(ctx->cq_ev_fd, 1);
        if (waitqueue_active(&ctx->cq_wait)) {
                wake_up_interruptible(&ctx->cq_wait);
                kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
        }
}
1418
1419 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1420 {
1421         /* see waitqueue_active() comment */
1422         smp_mb();
1423
1424         if (ctx->flags & IORING_SETUP_SQPOLL) {
1425                 if (waitqueue_active(&ctx->wait))
1426                         wake_up(&ctx->wait);
1427         }
1428         if (io_should_trigger_evfd(ctx))
1429                 eventfd_signal(ctx->cq_ev_fd, 1);
1430         if (waitqueue_active(&ctx->cq_wait)) {
1431                 wake_up_interruptible(&ctx->cq_wait);
1432                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1433         }
1434 }
1435
1436 /* Returns true if there are no backlogged entries after the flush */
1437 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1438 {
1439         unsigned long flags;
1440         bool all_flushed, posted;
1441
1442         if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
1443                 return false;
1444
1445         posted = false;
1446         spin_lock_irqsave(&ctx->completion_lock, flags);
1447         while (!list_empty(&ctx->cq_overflow_list)) {
1448                 struct io_uring_cqe *cqe = io_get_cqe(ctx);
1449                 struct io_overflow_cqe *ocqe;
1450
1451                 if (!cqe && !force)
1452                         break;
1453                 ocqe = list_first_entry(&ctx->cq_overflow_list,
1454                                         struct io_overflow_cqe, list);
1455                 if (cqe)
1456                         memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1457                 else
1458                         io_account_cq_overflow(ctx);
1459
1460                 posted = true;
1461                 list_del(&ocqe->list);
1462                 kfree(ocqe);
1463         }
1464
1465         all_flushed = list_empty(&ctx->cq_overflow_list);
1466         if (all_flushed) {
1467                 clear_bit(0, &ctx->sq_check_overflow);
1468                 clear_bit(0, &ctx->cq_check_overflow);
1469                 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1470         }
1471
1472         if (posted)
1473                 io_commit_cqring(ctx);
1474         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1475         if (posted)
1476                 io_cqring_ev_posted(ctx);
1477         return all_flushed;
1478 }
1479
1480 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1481 {
1482         bool ret = true;
1483
1484         if (test_bit(0, &ctx->cq_check_overflow)) {
1485                 /* iopoll syncs against uring_lock, not completion_lock */
1486                 if (ctx->flags & IORING_SETUP_IOPOLL)
1487                         mutex_lock(&ctx->uring_lock);
1488                 ret = __io_cqring_overflow_flush(ctx, force);
1489                 if (ctx->flags & IORING_SETUP_IOPOLL)
1490                         mutex_unlock(&ctx->uring_lock);
1491         }
1492
1493         return ret;
1494 }
1495
/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 *
 * True when the refcount, viewed as unsigned, is 0 or within the top 127
 * values: adding 127 then wraps around to a result of at most 127.
 */
#define req_ref_zero_or_close_to_overflow(req)  \
        ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
1502
/* Take a reference unless the count is already zero; returns true on success. */
static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
{
        return atomic_inc_not_zero(&req->refs);
}
1507
/*
 * Drop @refs references at once; returns true if the count reached zero.
 * Warns once if the count was already zero or close to overflow.
 */
static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs)
{
        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
        return atomic_sub_and_test(refs, &req->refs);
}
1513
/*
 * Drop one reference; returns true if that was the last one.
 * Warns once if the count was already zero or close to overflow.
 */
static inline bool req_ref_put_and_test(struct io_kiocb *req)
{
        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
        return atomic_dec_and_test(&req->refs);
}
1519
/*
 * Drop one reference that the caller expects not to be the last one;
 * warns once if the count does reach zero.
 */
static inline void req_ref_put(struct io_kiocb *req)
{
        WARN_ON_ONCE(req_ref_put_and_test(req));
}
1524
/*
 * Take one additional reference. Warns once if the count is zero or
 * close to overflow before the increment.
 */
static inline void req_ref_get(struct io_kiocb *req)
{
        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
        atomic_inc(&req->refs);
}
1530
1531 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
1532                                      long res, unsigned int cflags)
1533 {
1534         struct io_overflow_cqe *ocqe;
1535
1536         ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1537         if (!ocqe) {
1538                 /*
1539                  * If we're in ring overflow flush mode, or in task cancel mode,
1540                  * or cannot allocate an overflow entry, then we need to drop it
1541                  * on the floor.
1542                  */
1543                 io_account_cq_overflow(ctx);
1544                 return false;
1545         }
1546         if (list_empty(&ctx->cq_overflow_list)) {
1547                 set_bit(0, &ctx->sq_check_overflow);
1548                 set_bit(0, &ctx->cq_check_overflow);
1549                 ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
1550         }
1551         ocqe->cqe.user_data = user_data;
1552         ocqe->cqe.res = res;
1553         ocqe->cqe.flags = cflags;
1554         list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1555         return true;
1556 }
1557
1558 static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1559                                           long res, unsigned int cflags)
1560 {
1561         struct io_uring_cqe *cqe;
1562
1563         trace_io_uring_complete(ctx, user_data, res, cflags);
1564
1565         /*
1566          * If we can't get a cq entry, userspace overflowed the
1567          * submission (by quite a lot). Increment the overflow count in
1568          * the ring.
1569          */
1570         cqe = io_get_cqe(ctx);
1571         if (likely(cqe)) {
1572                 WRITE_ONCE(cqe->user_data, user_data);
1573                 WRITE_ONCE(cqe->res, res);
1574                 WRITE_ONCE(cqe->flags, cflags);
1575                 return true;
1576         }
1577         return io_cqring_event_overflow(ctx, user_data, res, cflags);
1578 }
1579
/*
 * Out-of-line wrapper around __io_cqring_fill_event() with the same
 * arguments and return value; not as hot to bloat with inlining.
 */
static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
                                          long res, unsigned int cflags)
{
        return __io_cqring_fill_event(ctx, user_data, res, cflags);
}
1586
/*
 * Post the completion of @req and drop one reference to it, all under
 * ctx->completion_lock.
 *
 * If that was the last reference, the request is torn down here: linked
 * requests are disarmed or queued, the request is dismantled and it is
 * parked on ctx->locked_free_list for reuse.
 */
static void io_req_complete_post(struct io_kiocb *req, long res,
                                 unsigned int cflags)
{
        struct io_ring_ctx *ctx = req->ctx;
        unsigned long flags;

        spin_lock_irqsave(&ctx->completion_lock, flags);
        __io_cqring_fill_event(ctx, req->user_data, res, cflags);
        /*
         * If we're the last reference to this request, add to our locked
         * free_list cache.
         */
        if (req_ref_put_and_test(req)) {
                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
                        if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL))
                                io_disarm_next(req);
                        /* hand the rest of the chain over to task work */
                        if (req->link) {
                                io_req_task_queue(req->link);
                                req->link = NULL;
                        }
                }
                io_dismantle_req(req);
                io_put_task(req->task, 1);
                list_add(&req->compl.list, &ctx->locked_free_list);
                ctx->locked_free_nr++;
        } else {
                /*
                 * Others still hold the request: take a ctx reference for
                 * the wakeup below, or skip the wakeup if that fails.
                 */
                if (!percpu_ref_tryget(&ctx->refs))
                        req = NULL;
        }
        io_commit_cqring(ctx);
        spin_unlock_irqrestore(&ctx->completion_lock, flags);

        /*
         * NOTE(review): on the last-reference path no ctx ref was taken
         * above, so the put below presumably drops the reference the
         * request itself was holding - confirm.
         */
        if (req) {
                io_cqring_ev_posted(ctx);
                percpu_ref_put(&ctx->refs);
        }
}
1624
1625 static inline bool io_req_needs_clean(struct io_kiocb *req)
1626 {
1627         return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP |
1628                                 REQ_F_POLLED | REQ_F_INFLIGHT);
1629 }
1630
1631 static void io_req_complete_state(struct io_kiocb *req, long res,
1632                                   unsigned int cflags)
1633 {
1634         if (io_req_needs_clean(req))
1635                 io_clean_op(req);
1636         req->result = res;
1637         req->compl.cflags = cflags;
1638         req->flags |= REQ_F_COMPLETE_INLINE;
1639 }
1640
1641 static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1642                                      long res, unsigned cflags)
1643 {
1644         if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1645                 io_req_complete_state(req, res, cflags);
1646         else
1647                 io_req_complete_post(req, res, cflags);
1648 }
1649
/*
 * Complete @req with result @res and no CQE flags. With no issue flags there
 * is never a deferred completion, so this always posts the CQE directly.
 */
static inline void io_req_complete(struct io_kiocb *req, long res)
{
	io_req_complete_post(req, res, 0);
}
1654
/*
 * Fail @req with @res: mark it failed, drop one reference via io_put_req(),
 * then post the CQE, which drops a further reference.
 */
static void io_req_complete_failed(struct io_kiocb *req, long res)
{
	req_set_fail(req);

	/* first reference goes here ... */
	io_put_req(req);
	/* ... and the posting path puts the next one */
	io_req_complete_post(req, res, 0);
}
1661
1662 static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
1663                                         struct io_comp_state *cs)
1664 {
1665         spin_lock_irq(&ctx->completion_lock);
1666         list_splice_init(&ctx->locked_free_list, &cs->free_list);
1667         ctx->locked_free_nr = 0;
1668         spin_unlock_irq(&ctx->completion_lock);
1669 }
1670
1671 /* Returns true IFF there are requests in the cache */
1672 static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1673 {
1674         struct io_submit_state *state = &ctx->submit_state;
1675         struct io_comp_state *cs = &state->comp;
1676         int nr;
1677
1678         /*
1679          * If we have more than a batch's worth of requests in our IRQ side
1680          * locked cache, grab the lock and move them over to our submission
1681          * side cache.
1682          */
1683         if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
1684                 io_flush_cached_locked_reqs(ctx, cs);
1685
1686         nr = state->free_reqs;
1687         while (!list_empty(&cs->free_list)) {
1688                 struct io_kiocb *req = list_first_entry(&cs->free_list,
1689                                                 struct io_kiocb, compl.list);
1690
1691                 list_del(&req->compl.list);
1692                 state->reqs[nr++] = req;
1693                 if (nr == ARRAY_SIZE(state->reqs))
1694                         break;
1695         }
1696
1697         state->free_reqs = nr;
1698         return nr != 0;
1699 }
1700
1701 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1702 {
1703         struct io_submit_state *state = &ctx->submit_state;
1704
1705         BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
1706
1707         if (!state->free_reqs) {
1708                 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1709                 int ret;
1710
1711                 if (io_flush_cached_reqs(ctx))
1712                         goto got_req;
1713
1714                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1715                                             state->reqs);
1716
1717                 /*
1718                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1719                  * retry single alloc to be on the safe side.
1720                  */
1721                 if (unlikely(ret <= 0)) {
1722                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1723                         if (!state->reqs[0])
1724                                 return NULL;
1725                         ret = 1;
1726                 }
1727                 state->free_reqs = ret;
1728         }
1729 got_req:
1730         state->free_reqs--;
1731         return state->reqs[state->free_reqs];
1732 }
1733
/* Drop a file reference with fput(); a NULL @file is silently ignored. */
static inline void io_put_file(struct file *file)
{
	if (!file)
		return;

	fput(file);
}
1739
1740 static void io_dismantle_req(struct io_kiocb *req)
1741 {
1742         unsigned int flags = req->flags;
1743
1744         if (io_req_needs_clean(req))
1745                 io_clean_op(req);
1746         if (!(flags & REQ_F_FIXED_FILE))
1747                 io_put_file(req->file);
1748         if (req->fixed_rsrc_refs)
1749                 percpu_ref_put(req->fixed_rsrc_refs);
1750         if (req->async_data)
1751                 kfree(req->async_data);
1752         if (req->work.creds) {
1753                 put_cred(req->work.creds);
1754                 req->work.creds = NULL;
1755         }
1756 }
1757
1758 /* must to be called somewhat shortly after putting a request */
1759 static inline void io_put_task(struct task_struct *task, int nr)
1760 {
1761         struct io_uring_task *tctx = task->io_uring;
1762
1763         percpu_counter_sub(&tctx->inflight, nr);
1764         if (unlikely(atomic_read(&tctx->in_idle)))
1765                 wake_up(&tctx->wait);
1766         put_task_struct_many(task, nr);
1767 }
1768
1769 static void __io_free_req(struct io_kiocb *req)
1770 {
1771         struct io_ring_ctx *ctx = req->ctx;
1772
1773         io_dismantle_req(req);
1774         io_put_task(req->task, 1);
1775
1776         kmem_cache_free(req_cachep, req);
1777         percpu_ref_put(&ctx->refs);
1778 }
1779
1780 static inline void io_remove_next_linked(struct io_kiocb *req)
1781 {
1782         struct io_kiocb *nxt = req->link;
1783
1784         req->link = nxt->link;
1785         nxt->link = NULL;
1786 }
1787
1788 static bool io_kill_linked_timeout(struct io_kiocb *req)
1789         __must_hold(&req->ctx->completion_lock)
1790 {
1791         struct io_kiocb *link = req->link;
1792
1793         /*
1794          * Can happen if a linked timeout fired and link had been like
1795          * req -> link t-out -> link t-out [-> ...]
1796          */
1797         if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1798                 struct io_timeout_data *io = link->async_data;
1799
1800                 io_remove_next_linked(req);
1801                 link->timeout.head = NULL;
1802                 if (hrtimer_try_to_cancel(&io->timer) != -1) {
1803                         io_cqring_fill_event(link->ctx, link->user_data,
1804                                              -ECANCELED, 0);
1805                         io_put_req_deferred(link, 1);
1806                         return true;
1807                 }
1808         }
1809         return false;
1810 }
1811
1812 static void io_fail_links(struct io_kiocb *req)
1813         __must_hold(&req->ctx->completion_lock)
1814 {
1815         struct io_kiocb *nxt, *link = req->link;
1816
1817         req->link = NULL;
1818         while (link) {
1819                 nxt = link->link;
1820                 link->link = NULL;
1821
1822                 trace_io_uring_fail_link(req, link);
1823                 io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
1824                 io_put_req_deferred(link, 2);
1825                 link = nxt;
1826         }
1827 }
1828
1829 static bool io_disarm_next(struct io_kiocb *req)
1830         __must_hold(&req->ctx->completion_lock)
1831 {
1832         bool posted = false;
1833
1834         if (likely(req->flags & REQ_F_LINK_TIMEOUT))
1835                 posted = io_kill_linked_timeout(req);
1836         if (unlikely((req->flags & REQ_F_FAIL) &&
1837                      !(req->flags & REQ_F_HARDLINK))) {
1838                 posted |= (req->link != NULL);
1839                 io_fail_links(req);
1840         }
1841         return posted;
1842 }
1843
1844 static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
1845 {
1846         struct io_kiocb *nxt;
1847
1848         /*
1849          * If LINK is set, we have dependent requests in this chain. If we
1850          * didn't fail this request, queue the first one up, moving any other
1851          * dependencies to the next request. In case of failure, fail the rest
1852          * of the chain.
1853          */
1854         if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) {
1855                 struct io_ring_ctx *ctx = req->ctx;
1856                 unsigned long flags;
1857                 bool posted;
1858
1859                 spin_lock_irqsave(&ctx->completion_lock, flags);
1860                 posted = io_disarm_next(req);
1861                 if (posted)
1862                         io_commit_cqring(req->ctx);
1863                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1864                 if (posted)
1865                         io_cqring_ev_posted(ctx);
1866         }
1867         nxt = req->link;
1868         req->link = NULL;
1869         return nxt;
1870 }
1871
1872 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1873 {
1874         if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
1875                 return NULL;
1876         return __io_req_find_next(req);
1877 }
1878
1879 static void ctx_flush_and_put(struct io_ring_ctx *ctx)
1880 {
1881         if (!ctx)
1882                 return;
1883         if (ctx->submit_state.comp.nr) {
1884                 mutex_lock(&ctx->uring_lock);
1885                 io_submit_flush_completions(&ctx->submit_state.comp, ctx);
1886                 mutex_unlock(&ctx->uring_lock);
1887         }
1888         percpu_ref_put(&ctx->refs);
1889 }
1890
1891 static bool __tctx_task_work(struct io_uring_task *tctx)
1892 {
1893         struct io_ring_ctx *ctx = NULL;
1894         struct io_wq_work_list list;
1895         struct io_wq_work_node *node;
1896
1897         if (wq_list_empty(&tctx->task_list))
1898                 return false;
1899
1900         spin_lock_irq(&tctx->task_lock);
1901         list = tctx->task_list;
1902         INIT_WQ_LIST(&tctx->task_list);
1903         spin_unlock_irq(&tctx->task_lock);
1904
1905         node = list.first;
1906         while (node) {
1907                 struct io_wq_work_node *next = node->next;
1908                 struct io_kiocb *req;
1909
1910                 req = container_of(node, struct io_kiocb, io_task_work.node);
1911                 if (req->ctx != ctx) {
1912                         ctx_flush_and_put(ctx);
1913                         ctx = req->ctx;
1914                         percpu_ref_get(&ctx->refs);
1915                 }
1916
1917                 req->task_work.func(&req->task_work);
1918                 node = next;
1919         }
1920
1921         ctx_flush_and_put(ctx);
1922         return list.first != NULL;
1923 }
1924
1925 static void tctx_task_work(struct callback_head *cb)
1926 {
1927         struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
1928
1929         clear_bit(0, &tctx->task_state);
1930
1931         while (__tctx_task_work(tctx))
1932                 cond_resched();
1933 }
1934
1935 static int io_req_task_work_add(struct io_kiocb *req)
1936 {
1937         struct task_struct *tsk = req->task;
1938         struct io_uring_task *tctx = tsk->io_uring;
1939         enum task_work_notify_mode notify;
1940         struct io_wq_work_node *node, *prev;
1941         unsigned long flags;
1942         int ret = 0;
1943
1944         if (unlikely(tsk->flags & PF_EXITING))
1945                 return -ESRCH;
1946
1947         WARN_ON_ONCE(!tctx);
1948
1949         spin_lock_irqsave(&tctx->task_lock, flags);
1950         wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
1951         spin_unlock_irqrestore(&tctx->task_lock, flags);
1952
1953         /* task_work already pending, we're done */
1954         if (test_bit(0, &tctx->task_state) ||
1955             test_and_set_bit(0, &tctx->task_state))
1956                 return 0;
1957
1958         /*
1959          * SQPOLL kernel thread doesn't need notification, just a wakeup. For
1960          * all other cases, use TWA_SIGNAL unconditionally to ensure we're
1961          * processing task_work. There's no reliable way to tell if TWA_RESUME
1962          * will do the job.
1963          */
1964         notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
1965
1966         if (!task_work_add(tsk, &tctx->task_work, notify)) {
1967                 wake_up_process(tsk);
1968                 return 0;
1969         }
1970
1971         /*
1972          * Slow path - we failed, find and delete work. if the work is not
1973          * in the list, it got run and we're fine.
1974          */
1975         spin_lock_irqsave(&tctx->task_lock, flags);
1976         wq_list_for_each(node, prev, &tctx->task_list) {
1977                 if (&req->io_task_work.node == node) {
1978                         wq_list_del(&tctx->task_list, node, prev);
1979                         ret = 1;
1980                         break;
1981                 }
1982         }
1983         spin_unlock_irqrestore(&tctx->task_lock, flags);
1984         clear_bit(0, &tctx->task_state);
1985         return ret;
1986 }
1987
1988 static bool io_run_task_work_head(struct callback_head **work_head)
1989 {
1990         struct callback_head *work, *next;
1991         bool executed = false;
1992
1993         do {
1994                 work = xchg(work_head, NULL);
1995                 if (!work)
1996                         break;
1997
1998                 do {
1999                         next = work->next;
2000                         work->func(work);
2001                         work = next;
2002                         cond_resched();
2003                 } while (work);
2004                 executed = true;
2005         } while (1);
2006
2007         return executed;
2008 }
2009
2010 static void io_task_work_add_head(struct callback_head **work_head,
2011                                   struct callback_head *task_work)
2012 {
2013         struct callback_head *head;
2014
2015         do {
2016                 head = READ_ONCE(*work_head);
2017                 task_work->next = head;
2018         } while (cmpxchg(work_head, head, task_work) != head);
2019 }
2020
2021 static void io_req_task_work_add_fallback(struct io_kiocb *req,
2022                                           task_work_func_t cb)
2023 {
2024         init_task_work(&req->task_work, cb);
2025         io_task_work_add_head(&req->ctx->exit_task_work, &req->task_work);
2026 }
2027
2028 static void io_req_task_cancel(struct callback_head *cb)
2029 {
2030         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2031         struct io_ring_ctx *ctx = req->ctx;
2032
2033         /* ctx is guaranteed to stay alive while we hold uring_lock */
2034         mutex_lock(&ctx->uring_lock);
2035         io_req_complete_failed(req, req->result);
2036         mutex_unlock(&ctx->uring_lock);
2037 }
2038
2039 static void __io_req_task_submit(struct io_kiocb *req)
2040 {
2041         struct io_ring_ctx *ctx = req->ctx;
2042
2043         /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
2044         mutex_lock(&ctx->uring_lock);
2045         if (!(current->flags & PF_EXITING) && !current->in_execve)
2046                 __io_queue_sqe(req);
2047         else
2048                 io_req_complete_failed(req, -EFAULT);
2049         mutex_unlock(&ctx->uring_lock);
2050 }
2051
2052 static void io_req_task_submit(struct callback_head *cb)
2053 {
2054         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2055
2056         __io_req_task_submit(req);
2057 }
2058
2059 static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2060 {
2061         req->result = ret;
2062         req->task_work.func = io_req_task_cancel;
2063
2064         if (unlikely(io_req_task_work_add(req)))
2065                 io_req_task_work_add_fallback(req, io_req_task_cancel);
2066 }
2067
2068 static void io_req_task_queue(struct io_kiocb *req)
2069 {
2070         req->task_work.func = io_req_task_submit;
2071
2072         if (unlikely(io_req_task_work_add(req)))
2073                 io_req_task_queue_fail(req, -ECANCELED);
2074 }
2075
/* If @req has a linked successor, detach it and queue it for submission. */
static inline void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt;

	nxt = io_req_find_next(req);
	if (!nxt)
		return;

	io_req_task_queue(nxt);
}
2083
/*
 * Free @req. Any linked successor is queued first, since the link
 * information lives in the request being freed.
 */
static void io_free_req(struct io_kiocb *req)
{
	io_queue_next(req);
	__io_free_req(req);
}
2089
/*
 * Accumulator used while freeing requests in bulk, so that task and ctx
 * references can be dropped in one go rather than once per request.
 */
struct req_batch {
	/* task the pending task_refs belong to, NULL if none collected yet */
	struct task_struct	*task;
	/* number of references to put on ->task */
	int			task_refs;
	/* number of ctx references to put */
	int			ctx_refs;
};
2095
2096 static inline void io_init_req_batch(struct req_batch *rb)
2097 {
2098         rb->task_refs = 0;
2099         rb->ctx_refs = 0;
2100         rb->task = NULL;
2101 }
2102
2103 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2104                                      struct req_batch *rb)
2105 {
2106         if (rb->task)
2107                 io_put_task(rb->task, rb->task_refs);
2108         if (rb->ctx_refs)
2109                 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2110 }
2111
2112 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2113                               struct io_submit_state *state)
2114 {
2115         io_queue_next(req);
2116         io_dismantle_req(req);
2117
2118         if (req->task != rb->task) {
2119                 if (rb->task)
2120                         io_put_task(rb->task, rb->task_refs);
2121                 rb->task = req->task;
2122                 rb->task_refs = 0;
2123         }
2124         rb->task_refs++;
2125         rb->ctx_refs++;
2126
2127         if (state->free_reqs != ARRAY_SIZE(state->reqs))
2128                 state->reqs[state->free_reqs++] = req;
2129         else
2130                 list_add(&req->compl.list, &state->comp.free_list);
2131 }
2132
2133 static void io_submit_flush_completions(struct io_comp_state *cs,
2134                                         struct io_ring_ctx *ctx)
2135 {
2136         int i, nr = cs->nr;
2137         struct io_kiocb *req;
2138         struct req_batch rb;
2139
2140         io_init_req_batch(&rb);
2141         spin_lock_irq(&ctx->completion_lock);
2142         for (i = 0; i < nr; i++) {
2143                 req = cs->reqs[i];
2144                 __io_cqring_fill_event(ctx, req->user_data, req->result,
2145                                         req->compl.cflags);
2146         }
2147         io_commit_cqring(ctx);
2148         spin_unlock_irq(&ctx->completion_lock);
2149
2150         io_cqring_ev_posted(ctx);
2151         for (i = 0; i < nr; i++) {
2152                 req = cs->reqs[i];
2153
2154                 /* submission and completion refs */
2155                 if (req_ref_sub_and_test(req, 2))
2156                         io_req_free_batch(&rb, req, &ctx->submit_state);
2157         }
2158
2159         io_req_free_batch_finish(ctx, &rb);
2160         cs->nr = 0;
2161 }
2162
2163 /*
2164  * Drop reference to request, return next in chain (if there is one) if this
2165  * was the last reference to this request.
2166  */
2167 static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2168 {
2169         struct io_kiocb *nxt = NULL;
2170
2171         if (req_ref_put_and_test(req)) {
2172                 nxt = io_req_find_next(req);
2173                 __io_free_req(req);
2174         }
2175         return nxt;
2176 }
2177
/* Put one reference to @req and free it if that was the last one. */
static inline void io_put_req(struct io_kiocb *req)
{
	if (!req_ref_put_and_test(req))
		return;

	io_free_req(req);
}
2183
2184 static void io_put_req_deferred_cb(struct callback_head *cb)
2185 {
2186         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2187
2188         io_free_req(req);
2189 }
2190
2191 static void io_free_req_deferred(struct io_kiocb *req)
2192 {
2193         req->task_work.func = io_put_req_deferred_cb;
2194         if (unlikely(io_req_task_work_add(req)))
2195                 io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
2196 }
2197
/*
 * Put @refs references to @req; if that drops the count to zero, the free
 * itself is deferred to task work.
 */
static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
{
	if (!req_ref_sub_and_test(req, refs))
		return;

	io_free_req_deferred(req);
}
2203
/*
 * Number of CQ ring events as reported by __io_cqring_events(), read after
 * a read barrier.
 */
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	unsigned events;

	/* See comment at the top of this file */
	smp_rmb();
	events = __io_cqring_events(ctx);

	return events;
}
2210
2211 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2212 {
2213         struct io_rings *rings = ctx->rings;
2214
2215         /* make sure SQ entry isn't read before tail */
2216         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2217 }
2218
2219 static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2220 {
2221         unsigned int cflags;
2222
2223         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2224         cflags |= IORING_CQE_F_BUFFER;
2225         req->flags &= ~REQ_F_BUFFER_SELECTED;
2226         kfree(kbuf);
2227         return cflags;
2228 }
2229
2230 static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2231 {
2232         struct io_buffer *kbuf;
2233
2234         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2235         return io_put_kbuf(req, kbuf);
2236 }
2237
2238 static inline bool io_run_task_work(void)
2239 {
2240         /*
2241          * Not safe to run on exiting task, and the task_work handling will
2242          * not add work to such a task.
2243          */
2244         if (unlikely(current->flags & PF_EXITING))
2245                 return false;
2246         if (current->task_works) {
2247                 __set_current_state(TASK_RUNNING);
2248                 task_work_run();
2249                 return true;
2250         }
2251
2252         return false;
2253 }
2254
2255 /*
2256  * Find and free completed poll iocbs
2257  */
2258 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2259                                struct list_head *done)
2260 {
2261         struct req_batch rb;
2262         struct io_kiocb *req;
2263
2264         /* order with ->result store in io_complete_rw_iopoll() */
2265         smp_rmb();
2266
2267         io_init_req_batch(&rb);
2268         while (!list_empty(done)) {
2269                 int cflags = 0;
2270
2271                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
2272                 list_del(&req->inflight_entry);
2273
2274                 if (READ_ONCE(req->result) == -EAGAIN &&
2275                     !(req->flags & REQ_F_DONT_REISSUE)) {
2276                         req->iopoll_completed = 0;
2277                         req_ref_get(req);
2278                         io_queue_async_work(req);
2279                         continue;
2280                 }
2281
2282                 if (req->flags & REQ_F_BUFFER_SELECTED)
2283                         cflags = io_put_rw_kbuf(req);
2284
2285                 __io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
2286                 (*nr_events)++;
2287
2288                 if (req_ref_put_and_test(req))
2289                         io_req_free_batch(&rb, req, &ctx->submit_state);
2290         }
2291
2292         io_commit_cqring(ctx);
2293         io_cqring_ev_posted_iopoll(ctx);
2294         io_req_free_batch_finish(ctx, &rb);
2295 }
2296
2297 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2298                         long min)
2299 {
2300         struct io_kiocb *req, *tmp;
2301         LIST_HEAD(done);
2302         bool spin;
2303         int ret;
2304
2305         /*
2306          * Only spin for completions if we don't have multiple devices hanging
2307          * off our complete list, and we're under the requested amount.
2308          */
2309         spin = !ctx->poll_multi_file && *nr_events < min;
2310
2311         ret = 0;
2312         list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2313                 struct kiocb *kiocb = &req->rw.kiocb;
2314
2315                 /*
2316                  * Move completed and retryable entries to our local lists.
2317                  * If we find a request that requires polling, break out
2318                  * and complete those lists first, if we have entries there.
2319                  */
2320                 if (READ_ONCE(req->iopoll_completed)) {
2321                         list_move_tail(&req->inflight_entry, &done);
2322                         continue;
2323                 }
2324                 if (!list_empty(&done))
2325                         break;
2326
2327                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2328                 if (ret < 0)
2329                         break;
2330
2331                 /* iopoll may have completed current req */
2332                 if (READ_ONCE(req->iopoll_completed))
2333                         list_move_tail(&req->inflight_entry, &done);
2334
2335                 if (ret && spin)
2336                         spin = false;
2337                 ret = 0;
2338         }
2339
2340         if (!list_empty(&done))
2341                 io_iopoll_complete(ctx, nr_events, &done);
2342
2343         return ret;
2344 }
2345
2346 /*
2347  * We can't just wait for polled events to come to us, we have to actively
2348  * find and complete them.
2349  */
2350 static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2351 {
2352         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2353                 return;
2354
2355         mutex_lock(&ctx->uring_lock);
2356         while (!list_empty(&ctx->iopoll_list)) {
2357                 unsigned int nr_events = 0;
2358
2359                 io_do_iopoll(ctx, &nr_events, 0);
2360
2361                 /* let it sleep and repeat later if can't complete a request */
2362                 if (nr_events == 0)
2363                         break;
2364                 /*
2365                  * Ensure we allow local-to-the-cpu processing to take place,
2366                  * in this case we need to ensure that we reap all events.
2367                  * Also let task_work, etc. to progress by releasing the mutex
2368                  */
2369                 if (need_resched()) {
2370                         mutex_unlock(&ctx->uring_lock);
2371                         cond_resched();
2372                         mutex_lock(&ctx->uring_lock);
2373                 }
2374         }
2375         mutex_unlock(&ctx->uring_lock);
2376 }
2377
2378 static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2379 {
2380         unsigned int nr_events = 0;
2381         int ret = 0;
2382
2383         /*
2384          * We disallow the app entering submit/complete with polling, but we
2385          * still need to lock the ring to prevent racing with polled issue
2386          * that got punted to a workqueue.
2387          */
2388         mutex_lock(&ctx->uring_lock);
2389         /*
2390          * Don't enter poll loop if we already have events pending.
2391          * If we do, we can potentially be spinning for commands that
2392          * already triggered a CQE (eg in error).
2393          */
2394         if (test_bit(0, &ctx->cq_check_overflow))
2395                 __io_cqring_overflow_flush(ctx, false);
2396         if (io_cqring_events(ctx))
2397                 goto out;
2398         do {
2399                 /*
2400                  * If a submit got punted to a workqueue, we can have the
2401                  * application entering polling for a command before it gets
2402                  * issued. That app will hold the uring_lock for the duration
2403                  * of the poll right here, so we need to take a breather every
2404                  * now and then to ensure that the issue has a chance to add
2405                  * the poll to the issued list. Otherwise we can spin here
2406                  * forever, while the workqueue is stuck trying to acquire the
2407                  * very same mutex.
2408                  */
2409                 if (list_empty(&ctx->iopoll_list)) {
2410                         mutex_unlock(&ctx->uring_lock);
2411                         io_run_task_work();
2412                         mutex_lock(&ctx->uring_lock);
2413
2414                         if (list_empty(&ctx->iopoll_list))
2415                                 break;
2416                 }
2417                 ret = io_do_iopoll(ctx, &nr_events, min);
2418         } while (!ret && nr_events < min && !need_resched());
2419 out:
2420         mutex_unlock(&ctx->uring_lock);
2421         return ret;
2422 }
2423
2424 static void kiocb_end_write(struct io_kiocb *req)
2425 {
2426         /*
2427          * Tell lockdep we inherited freeze protection from submission
2428          * thread.
2429          */
2430         if (req->flags & REQ_F_ISREG) {
2431                 struct super_block *sb = file_inode(req->file)->i_sb;
2432
2433                 __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2434                 sb_end_write(sb);
2435         }
2436 }
2437
2438 #ifdef CONFIG_BLOCK
2439 static bool io_resubmit_prep(struct io_kiocb *req)
2440 {
2441         struct io_async_rw *rw = req->async_data;
2442
2443         if (!rw)
2444                 return !io_req_prep_async(req);
2445         /* may have left rw->iter inconsistent on -EIOCBQUEUED */
2446         iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
2447         return true;
2448 }
2449
2450 static bool io_rw_should_reissue(struct io_kiocb *req)
2451 {
2452         umode_t mode = file_inode(req->file)->i_mode;
2453         struct io_ring_ctx *ctx = req->ctx;
2454
2455         if (!S_ISBLK(mode) && !S_ISREG(mode))
2456                 return false;
2457         if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2458             !(ctx->flags & IORING_SETUP_IOPOLL)))
2459                 return false;
2460         /*
2461          * If ref is dying, we might be running poll reap from the exit work.
2462          * Don't attempt to reissue from that path, just let it fail with
2463          * -EAGAIN.
2464          */
2465         if (percpu_ref_is_dying(&ctx->refs))
2466                 return false;
2467         return true;
2468 }
2469 #else
2470 static bool io_resubmit_prep(struct io_kiocb *req)
2471 {
2472         return false;
2473 }
2474 static bool io_rw_should_reissue(struct io_kiocb *req)
2475 {
2476         return false;
2477 }
2478 #endif
2479
2480 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2481                              unsigned int issue_flags)
2482 {
2483         int cflags = 0;
2484
2485         if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2486                 kiocb_end_write(req);
2487         if (res != req->result) {
2488                 if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2489                     io_rw_should_reissue(req)) {
2490                         req->flags |= REQ_F_REISSUE;
2491                         return;
2492                 }
2493                 req_set_fail(req);
2494         }
2495         if (req->flags & REQ_F_BUFFER_SELECTED)
2496                 cflags = io_put_rw_kbuf(req);
2497         __io_req_complete(req, issue_flags, res, cflags);
2498 }
2499
2500 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2501 {
2502         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2503
2504         __io_complete_rw(req, res, res2, 0);
2505 }
2506
/*
 * ->ki_complete handler that io_prep_rw() installs for IORING_SETUP_IOPOLL
 * rings. It does not post a completion itself: it stores @res in
 * req->result and then sets req->iopoll_completed. @res2 is unused.
 */
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

        if (kiocb->ki_flags & IOCB_WRITE)
                kiocb_end_write(req);
        if (unlikely(res != req->result)) {
                /*
                 * Unexpected result. Unless it is an -EAGAIN on a request
                 * that may be reissued and was successfully prepared for
                 * resubmission, fail the request and forbid any reissue.
                 */
                if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
                    io_resubmit_prep(req))) {
                        req_set_fail(req);
                        req->flags |= REQ_F_DONT_REISSUE;
                }
        }

        /* publish the result before the completed flag becomes visible */
        WRITE_ONCE(req->result, res);
        /* order with io_iopoll_complete() checking ->result */
        smp_wmb();
        WRITE_ONCE(req->iopoll_completed, 1);
}
2526
2527 /*
2528  * After the iocb has been issued, it's safe to be found on the poll list.
2529  * Adding the kiocb to the list AFTER submission ensures that we don't
2530  * find it from a io_do_iopoll() thread before the issuer is done
2531  * accessing the kiocb cookie.
2532  */
2533 static void io_iopoll_req_issued(struct io_kiocb *req)
2534 {
2535         struct io_ring_ctx *ctx = req->ctx;
2536         const bool in_async = io_wq_current_is_worker();
2537
2538         /* workqueue context doesn't hold uring_lock, grab it now */
2539         if (unlikely(in_async))
2540                 mutex_lock(&ctx->uring_lock);
2541
2542         /*
2543          * Track whether we have multiple files in our lists. This will impact
2544          * how we do polling eventually, not spinning if we're on potentially
2545          * different devices.
2546          */
2547         if (list_empty(&ctx->iopoll_list)) {
2548                 ctx->poll_multi_file = false;
2549         } else if (!ctx->poll_multi_file) {
2550                 struct io_kiocb *list_req;
2551
2552                 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2553                                                 inflight_entry);
2554                 if (list_req->file != req->file)
2555                         ctx->poll_multi_file = true;
2556         }
2557
2558         /*
2559          * For fast devices, IO may have already completed. If it has, add
2560          * it to the front so we find it first.
2561          */
2562         if (READ_ONCE(req->iopoll_completed))
2563                 list_add(&req->inflight_entry, &ctx->iopoll_list);
2564         else
2565                 list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2566
2567         if (unlikely(in_async)) {
2568                 /*
2569                  * If IORING_SETUP_SQPOLL is enabled, sqes are either handle
2570                  * in sq thread task context or in io worker task context. If
2571                  * current task context is sq thread, we don't need to check
2572                  * whether should wake up sq thread.
2573                  */
2574                 if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2575                     wq_has_sleeper(&ctx->sq_data->wait))
2576                         wake_up(&ctx->sq_data->wait);
2577
2578                 mutex_unlock(&ctx->uring_lock);
2579         }
2580 }
2581
2582 static inline void io_state_file_put(struct io_submit_state *state)
2583 {
2584         if (state->file_refs) {
2585                 fput_many(state->file, state->file_refs);
2586                 state->file_refs = 0;
2587         }
2588 }
2589
2590 /*
2591  * Get as many references to a file as we have IOs left in this submission,
2592  * assuming most submissions are for one file, or at least that each file
2593  * has more than one submission.
2594  */
2595 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2596 {
2597         if (!state)
2598                 return fget(fd);
2599
2600         if (state->file_refs) {
2601                 if (state->fd == fd) {
2602                         state->file_refs--;
2603                         return state->file;
2604                 }
2605                 io_state_file_put(state);
2606         }
2607         state->file = fget_many(fd, state->ios_left);
2608         if (unlikely(!state->file))
2609                 return NULL;
2610
2611         state->fd = fd;
2612         state->file_refs = state->ios_left - 1;
2613         return state->file;
2614 }
2615
2616 static bool io_bdev_nowait(struct block_device *bdev)
2617 {
2618         return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2619 }
2620
2621 /*
2622  * If we tracked the file through the SCM inflight mechanism, we could support
2623  * any file. For now, just ensure that anything potentially problematic is done
2624  * inline.
2625  */
2626 static bool __io_file_supports_async(struct file *file, int rw)
2627 {
2628         umode_t mode = file_inode(file)->i_mode;
2629
2630         if (S_ISBLK(mode)) {
2631                 if (IS_ENABLED(CONFIG_BLOCK) &&
2632                     io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2633                         return true;
2634                 return false;
2635         }
2636         if (S_ISSOCK(mode))
2637                 return true;
2638         if (S_ISREG(mode)) {
2639                 if (IS_ENABLED(CONFIG_BLOCK) &&
2640                     io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2641                     file->f_op != &io_uring_fops)
2642                         return true;
2643                 return false;
2644         }
2645
2646         /* any ->read/write should understand O_NONBLOCK */
2647         if (file->f_flags & O_NONBLOCK)
2648                 return true;
2649
2650         if (!(file->f_mode & FMODE_NOWAIT))
2651                 return false;
2652
2653         if (rw == READ)
2654                 return file->f_op->read_iter != NULL;
2655
2656         return file->f_op->write_iter != NULL;
2657 }
2658
2659 static bool io_file_supports_async(struct io_kiocb *req, int rw)
2660 {
2661         if (rw == READ && (req->flags & REQ_F_ASYNC_READ))
2662                 return true;
2663         else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE))
2664                 return true;
2665
2666         return __io_file_supports_async(req->file, rw);
2667 }
2668
/*
 * Common SQE preparation for the read/write opcodes: set up the kiocb
 * embedded in @req (position, write hint, flags, ioprio and completion
 * handler) and copy the buffer address, length and buffer index out of
 * @sqe.
 *
 * Returns 0 on success or a negative error code.
 */
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        struct io_ring_ctx *ctx = req->ctx;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct file *file = req->file;
        unsigned ioprio;
        int ret;

        /* note regular files on the request, unless already flagged */
        if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode))
                req->flags |= REQ_F_ISREG;

        kiocb->ki_pos = READ_ONCE(sqe->off);
        /* offset -1 on a non-stream file: use the file's current position */
        if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
                req->flags |= REQ_F_CUR_POS;
                kiocb->ki_pos = file->f_pos;
        }
        kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
        kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
        /* fold the per-request RWF_* flags from the sqe into ki_flags */
        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
        if (unlikely(ret))
                return ret;

        /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
        if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
                req->flags |= REQ_F_NOWAIT;

        /* an explicit ioprio must pass the capability check, 0 means default */
        ioprio = READ_ONCE(sqe->ioprio);
        if (ioprio) {
                ret = ioprio_check_cap(ioprio);
                if (ret)
                        return ret;

                kiocb->ki_ioprio = ioprio;
        } else
                kiocb->ki_ioprio = get_current_ioprio();

        if (ctx->flags & IORING_SETUP_IOPOLL) {
                /* polled IO needs IOCB_DIRECT and a file with ->iopoll */
                if (!(kiocb->ki_flags & IOCB_DIRECT) ||
                    !kiocb->ki_filp->f_op->iopoll)
                        return -EOPNOTSUPP;

                kiocb->ki_flags |= IOCB_HIPRI;
                kiocb->ki_complete = io_complete_rw_iopoll;
                req->iopoll_completed = 0;
        } else {
                /* IOCB_HIPRI is only accepted on IORING_SETUP_IOPOLL rings */
                if (kiocb->ki_flags & IOCB_HIPRI)
                        return -EINVAL;
                kiocb->ki_complete = io_complete_rw;
        }

        /* fixed-buffer opcodes: the buffer is resolved later, at import time */
        if (req->opcode == IORING_OP_READ_FIXED ||
            req->opcode == IORING_OP_WRITE_FIXED) {
                req->imu = NULL;
                io_req_set_rsrc_node(req);
        }

        req->rw.addr = READ_ONCE(sqe->addr);
        req->rw.len = READ_ONCE(sqe->len);
        req->buf_index = READ_ONCE(sqe->buf_index);
        return 0;
}
2730
2731 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2732 {
2733         switch (ret) {
2734         case -EIOCBQUEUED:
2735                 break;
2736         case -ERESTARTSYS:
2737         case -ERESTARTNOINTR:
2738         case -ERESTARTNOHAND:
2739         case -ERESTART_RESTARTBLOCK:
2740                 /*
2741                  * We can't just restart the syscall, since previously
2742                  * submitted sqes may already be in progress. Just fail this
2743                  * IO with EINTR.
2744                  */
2745                 ret = -EINTR;
2746                 fallthrough;
2747         default:
2748                 kiocb->ki_complete(kiocb, ret, 0);
2749         }
2750 }
2751
/*
 * Post-process the result @ret of a read/write issued on @kiocb: fold in
 * bytes transferred by earlier partial attempts, write the position back to
 * the file if the request used the current file position, and complete the
 * request. If completion flagged REQ_F_REISSUE, the request is either
 * queued for an async retry or failed.
 */
static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
                       unsigned int issue_flags)
{
        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
        struct io_async_rw *io = req->async_data;
        /* REQ_F_REISSUE is only set by the io_complete_rw completion path */
        bool check_reissue = kiocb->ki_complete == io_complete_rw;

        /* add previously done IO, if any */
        if (io && io->bytes_done > 0) {
                /* an error after partial progress reports the bytes done */
                if (ret < 0)
                        ret = io->bytes_done;
                else
                        ret += io->bytes_done;
        }

        if (req->flags & REQ_F_CUR_POS)
                req->file->f_pos = kiocb->ki_pos;
        /* successful non-polled IO completes directly, keeping issue_flags */
        if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
                __io_complete_rw(req, ret, 0, issue_flags);
        else
                io_rw_done(kiocb, ret);

        if (check_reissue && req->flags & REQ_F_REISSUE) {
                req->flags &= ~REQ_F_REISSUE;
                if (io_resubmit_prep(req)) {
                        /* take an extra reference for the queued async work */
                        req_ref_get(req);
                        io_queue_async_work(req);
                } else {
                        int cflags = 0;

                        /* could not prepare a retry: fail with the result */
                        req_set_fail(req);
                        if (req->flags & REQ_F_BUFFER_SELECTED)
                                cflags = io_put_rw_kbuf(req);
                        __io_req_complete(req, issue_flags, ret, cflags);
                }
        }
}
2789
/*
 * Set up @iter as a bvec iterator over the part of the registered buffer
 * @imu that the request addresses with req->rw.addr / req->rw.len.
 *
 * Returns 0 on success, or -EFAULT if the range overflows or is not fully
 * inside the mapped buffer.
 */
static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
                             struct io_mapped_ubuf *imu)
{
        size_t len = req->rw.len;
        u64 buf_end, buf_addr = req->rw.addr;
        size_t offset;

        if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
                return -EFAULT;
        /* not inside the mapped region */
        if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
                return -EFAULT;

        /*
         * May not be a start of buffer, set size appropriately
         * and advance us to the beginning.
         */
        offset = buf_addr - imu->ubuf;
        iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

        if (offset) {
                /*
                 * Don't use iov_iter_advance() here, as it's really slow for
                 * using the latter parts of a big fixed buffer - it iterates
                 * over each segment manually. We can cheat a bit here, because
                 * we know that:
                 *
                 * 1) it's a BVEC iter, we set it up
                 * 2) all bvecs are PAGE_SIZE in size, except potentially the
                 *    first and last bvec
                 *
                 * So just find our index, and adjust the iterator afterwards.
                 * If the offset is within the first bvec (or the whole first
                 * bvec), just use iov_iter_advance(). This makes it easier
                 * since we can just skip the first segment, which may not
                 * be PAGE_SIZE aligned.
                 */
                const struct bio_vec *bvec = imu->bvec;

                if (offset <= bvec->bv_len) {
                        iov_iter_advance(iter, offset);
                } else {
                        unsigned long seg_skip;

                        /* skip first vec */
                        offset -= bvec->bv_len;
                        /* the first bvec plus every whole page covered by offset */
                        seg_skip = 1 + (offset >> PAGE_SHIFT);

                        iter->bvec = bvec + seg_skip;
                        iter->nr_segs -= seg_skip;
                        /* drop everything skipped; count is now the IO length */
                        iter->count -= bvec->bv_len + offset;
                        /* remaining offset inside the first bvec used */
                        iter->iov_offset = offset & ~PAGE_MASK;
                }
        }

        return 0;
}
2847
2848 static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
2849 {
2850         struct io_ring_ctx *ctx = req->ctx;
2851         struct io_mapped_ubuf *imu = req->imu;
2852         u16 index, buf_index = req->buf_index;
2853
2854         if (likely(!imu)) {
2855                 if (unlikely(buf_index >= ctx->nr_user_bufs))
2856                         return -EFAULT;
2857                 index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2858                 imu = READ_ONCE(ctx->user_bufs[index]);
2859                 req->imu = imu;
2860         }
2861         return __io_import_fixed(req, rw, iter, imu);
2862 }
2863
2864 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2865 {
2866         if (needs_lock)
2867                 mutex_unlock(&ctx->uring_lock);
2868 }
2869
2870 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2871 {
2872         /*
2873          * "Normal" inline submissions always hold the uring_lock, since we
2874          * grab it from the system call. Same is true for the SQPOLL offload.
2875          * The only exception is when we've detached the request and issue it
2876          * from an async worker thread, grab the lock for that case.
2877          */
2878         if (needs_lock)
2879                 mutex_lock(&ctx->uring_lock);
2880 }
2881
2882 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2883                                           int bgid, struct io_buffer *kbuf,
2884                                           bool needs_lock)
2885 {
2886         struct io_buffer *head;
2887
2888         if (req->flags & REQ_F_BUFFER_SELECTED)
2889                 return kbuf;
2890
2891         io_ring_submit_lock(req->ctx, needs_lock);
2892
2893         lockdep_assert_held(&req->ctx->uring_lock);
2894
2895         head = xa_load(&req->ctx->io_buffers, bgid);
2896         if (head) {
2897                 if (!list_empty(&head->list)) {
2898                         kbuf = list_last_entry(&head->list, struct io_buffer,
2899                                                         list);
2900                         list_del(&kbuf->list);
2901                 } else {
2902                         kbuf = head;
2903                         xa_erase(&req->ctx->io_buffers, bgid);
2904                 }
2905                 if (*len > kbuf->len)
2906                         *len = kbuf->len;
2907         } else {
2908                 kbuf = ERR_PTR(-ENOBUFS);
2909         }
2910
2911         io_ring_submit_unlock(req->ctx, needs_lock);
2912
2913         return kbuf;
2914 }
2915
2916 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2917                                         bool needs_lock)
2918 {
2919         struct io_buffer *kbuf;
2920         u16 bgid;
2921
2922         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2923         bgid = req->buf_index;
2924         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2925         if (IS_ERR(kbuf))
2926                 return kbuf;
2927         req->rw.addr = (u64) (unsigned long) kbuf;
2928         req->flags |= REQ_F_BUFFER_SELECTED;
2929         return u64_to_user_ptr(kbuf->addr);
2930 }
2931
2932 #ifdef CONFIG_COMPAT
2933 static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2934                                 bool needs_lock)
2935 {
2936         struct compat_iovec __user *uiov;
2937         compat_ssize_t clen;
2938         void __user *buf;
2939         ssize_t len;
2940
2941         uiov = u64_to_user_ptr(req->rw.addr);
2942         if (!access_ok(uiov, sizeof(*uiov)))
2943                 return -EFAULT;
2944         if (__get_user(clen, &uiov->iov_len))
2945                 return -EFAULT;
2946         if (clen < 0)
2947                 return -EINVAL;
2948
2949         len = clen;
2950         buf = io_rw_buffer_select(req, &len, needs_lock);
2951         if (IS_ERR(buf))
2952                 return PTR_ERR(buf);
2953         iov[0].iov_base = buf;
2954         iov[0].iov_len = (compat_size_t) len;
2955         return 0;
2956 }
2957 #endif
2958
2959 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2960                                       bool needs_lock)
2961 {
2962         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2963         void __user *buf;
2964         ssize_t len;
2965
2966         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2967                 return -EFAULT;
2968
2969         len = iov[0].iov_len;
2970         if (len < 0)
2971                 return -EINVAL;
2972         buf = io_rw_buffer_select(req, &len, needs_lock);
2973         if (IS_ERR(buf))
2974                 return PTR_ERR(buf);
2975         iov[0].iov_base = buf;
2976         iov[0].iov_len = len;
2977         return 0;
2978 }
2979
2980 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2981                                     bool needs_lock)
2982 {
2983         if (req->flags & REQ_F_BUFFER_SELECTED) {
2984                 struct io_buffer *kbuf;
2985
2986                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2987                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2988                 iov[0].iov_len = kbuf->len;
2989                 return 0;
2990         }
2991         if (req->rw.len != 1)
2992                 return -EINVAL;
2993
2994 #ifdef CONFIG_COMPAT
2995         if (req->ctx->compat)
2996                 return io_compat_import(req, iov, needs_lock);
2997 #endif
2998
2999         return __io_iov_buffer_select(req, iov, needs_lock);
3000 }
3001
3002 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
3003                            struct iov_iter *iter, bool needs_lock)
3004 {
3005         void __user *buf = u64_to_user_ptr(req->rw.addr);
3006         size_t sqe_len = req->rw.len;
3007         u8 opcode = req->opcode;
3008         ssize_t ret;
3009
3010         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
3011                 *iovec = NULL;
3012                 return io_import_fixed(req, rw, iter);
3013         }
3014
3015         /* buffer index only valid with fixed read/write, or buffer select  */
3016         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
3017                 return -EINVAL;
3018
3019         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3020                 if (req->flags & REQ_F_BUFFER_SELECT) {
3021                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
3022                         if (IS_ERR(buf))
3023                                 return PTR_ERR(buf);
3024                         req->rw.len = sqe_len;
3025                 }
3026
3027                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3028                 *iovec = NULL;
3029                 return ret;
3030         }
3031
3032         if (req->flags & REQ_F_BUFFER_SELECT) {
3033                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
3034                 if (!ret)
3035                         iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
3036                 *iovec = NULL;
3037                 return ret;
3038         }
3039
3040         return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3041                               req->ctx->compat);
3042 }
3043
3044 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3045 {
3046         return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3047 }
3048
3049 /*
3050  * For files that don't have ->read_iter() and ->write_iter(), handle them
3051  * by looping over ->read() or ->write() manually.
3052  */
3053 static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3054 {
3055         struct kiocb *kiocb = &req->rw.kiocb;
3056         struct file *file = req->file;
3057         ssize_t ret = 0;
3058
3059         /*
3060          * Don't support polled IO through this interface, and we can't
3061          * support non-blocking either. For the latter, this just causes
3062          * the kiocb to be handled from an async context.
3063          */
3064         if (kiocb->ki_flags & IOCB_HIPRI)
3065                 return -EOPNOTSUPP;
3066         if (kiocb->ki_flags & IOCB_NOWAIT)
3067                 return -EAGAIN;
3068
3069         while (iov_iter_count(iter)) {
3070                 struct iovec iovec;
3071                 ssize_t nr;
3072
3073                 if (!iov_iter_is_bvec(iter)) {
3074                         iovec = iov_iter_iovec(iter);
3075                 } else {
3076                         iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3077                         iovec.iov_len = req->rw.len;
3078                 }
3079
3080                 if (rw == READ) {
3081                         nr = file->f_op->read(file, iovec.iov_base,
3082                                               iovec.iov_len, io_kiocb_ppos(kiocb));
3083                 } else {
3084                         nr = file->f_op->write(file, iovec.iov_base,
3085                                                iovec.iov_len, io_kiocb_ppos(kiocb));
3086                 }
3087
3088                 if (nr < 0) {
3089                         if (!ret)
3090                                 ret = nr;
3091                         break;
3092                 }
3093                 ret += nr;
3094                 if (nr != iovec.iov_len)
3095                         break;
3096                 req->rw.len -= nr;
3097                 req->rw.addr += nr;
3098                 iov_iter_advance(iter, nr);
3099         }
3100
3101         return ret;
3102 }
3103
/*
 * Save the iterator state of @req into its async data so the IO can be
 * continued from there later.
 *
 * @iovec is the separately provided iovec array, or NULL if the iterator
 * uses @fast_iov; it is recorded in rw->free_iovec either way.
 */
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
                          const struct iovec *fast_iov, struct iov_iter *iter)
{
        struct io_async_rw *rw = req->async_data;

        memcpy(&rw->iter, iter, sizeof(*iter));
        rw->free_iovec = iovec;
        rw->bytes_done = 0;
        /* can only be fixed buffers, no need to do anything */
        if (iov_iter_is_bvec(iter))
                return;
        if (!iovec) {
                unsigned iov_off = 0;

                /*
                 * The copied iterator still points at the caller's fast_iov
                 * array. Repoint it at rw->fast_iov, keeping the index of
                 * the segment the iterator has reached.
                 */
                rw->iter.iov = rw->fast_iov;
                if (iter->iov != fast_iov) {
                        iov_off = iter->iov - fast_iov;
                        rw->iter.iov += iov_off;
                }
                /* copy the remaining segments unless they already live there */
                if (rw->fast_iov != fast_iov)
                        memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
                               sizeof(struct iovec) * iter->nr_segs);
        } else {
                /* rw->free_iovec is set, so the request needs cleanup later */
                req->flags |= REQ_F_NEED_CLEANUP;
        }
}
3130
3131 static inline int io_alloc_async_data(struct io_kiocb *req)
3132 {
3133         WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3134         req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3135         return req->async_data == NULL;
3136 }
3137
3138 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3139                              const struct iovec *fast_iov,
3140                              struct iov_iter *iter, bool force)
3141 {
3142         if (!force && !io_op_defs[req->opcode].needs_async_setup)
3143                 return 0;
3144         if (!req->async_data) {
3145                 if (io_alloc_async_data(req)) {
3146                         kfree(iovec);
3147                         return -ENOMEM;
3148                 }
3149
3150                 io_req_map_rw(req, iovec, fast_iov, iter);
3151         }
3152         return 0;
3153 }
3154
3155 static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3156 {
3157         struct io_async_rw *iorw = req->async_data;
3158         struct iovec *iov = iorw->fast_iov;
3159         int ret;
3160
3161         ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3162         if (unlikely(ret < 0))
3163                 return ret;
3164
3165         iorw->bytes_done = 0;
3166         iorw->free_iovec = iov;
3167         if (iov)
3168                 req->flags |= REQ_F_NEED_CLEANUP;
3169         return 0;
3170 }
3171
3172 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3173 {
3174         if (unlikely(!(req->file->f_mode & FMODE_READ)))
3175                 return -EBADF;
3176         return io_prep_rw(req, sqe);
3177 }
3178
3179 /*
3180  * This is our waitqueue callback handler, registered through lock_page_async()
3181  * when we initially tried to do the IO with the iocb armed our waitqueue.
3182  * This gets called when the page is unlocked, and we generally expect that to
3183  * happen when the page IO is completed and the page is now uptodate. This will
3184  * queue a task_work based retry of the operation, attempting to copy the data
3185  * again. If the latter fails because the page was NOT uptodate, then we will
3186  * do a thread based blocking retry of the operation. That's the unexpected
3187  * slow path.
3188  */
3189 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3190                              int sync, void *arg)
3191 {
3192         struct wait_page_queue *wpq;
3193         struct io_kiocb *req = wait->private;
3194         struct wait_page_key *key = arg;
3195
3196         wpq = container_of(wait, struct wait_page_queue, wait);
3197
3198         if (!wake_page_match(wpq, key))
3199                 return 0;
3200
3201         req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3202         list_del_init(&wait->entry);
3203
3204         /* submit ref gets dropped, acquire a new one */
3205         req_ref_get(req);
3206         io_req_task_queue(req);
3207         return 1;
3208 }
3209
3210 /*
3211  * This controls whether a given IO request should be armed for async page
3212  * based retry. If we return false here, the request is handed to the async
3213  * worker threads for retry. If we're doing buffered reads on a regular file,
3214  * we prepare a private wait_page_queue entry and retry the operation. This
3215  * will either succeed because the page is now uptodate and unlocked, or it
3216  * will register a callback when the page is unlocked at IO completion. Through
3217  * that callback, io_uring uses task_work to setup a retry of the operation.
3218  * That retry will attempt the buffered read again. The retry will generally
3219  * succeed, or in rare cases where it fails, we then fall back to using the
3220  * async worker threads for a blocking retry.
3221  */
3222 static bool io_rw_should_retry(struct io_kiocb *req)
3223 {
3224         struct io_async_rw *rw = req->async_data;
3225         struct wait_page_queue *wait = &rw->wpq;
3226         struct kiocb *kiocb = &req->rw.kiocb;
3227
3228         /* never retry for NOWAIT, we just complete with -EAGAIN */
3229         if (req->flags & REQ_F_NOWAIT)
3230                 return false;
3231
3232         /* Only for buffered IO */
3233         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3234                 return false;
3235
3236         /*
3237          * just use poll if we can, and don't attempt if the fs doesn't
3238          * support callback based unlocks
3239          */
3240         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3241                 return false;
3242
3243         wait->wait.func = io_async_buf_func;
3244         wait->wait.private = req;
3245         wait->wait.flags = 0;
3246         INIT_LIST_HEAD(&wait->wait.entry);
3247         kiocb->ki_flags |= IOCB_WAITQ;
3248         kiocb->ki_flags &= ~IOCB_NOWAIT;
3249         kiocb->ki_waitq = wait;
3250         return true;
3251 }
3252
3253 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3254 {
3255         if (req->file->f_op->read_iter)
3256                 return call_read_iter(req->file, &req->rw.kiocb, iter);
3257         else if (req->file->f_op->read)
3258                 return loop_rw_iter(READ, req, iter);
3259         else
3260                 return -EINVAL;
3261 }
3262
/*
 * Issue a read request.
 *
 * The buffers are imported (or taken from existing async data), the read is
 * attempted, and the outcome is either completed through kiocb_done(), left
 * to complete later (-EIOCBQUEUED), or set up for a retry with the iterator
 * state saved in async data.
 *
 * Returns 0 when the request was completed or will complete on its own,
 * -EAGAIN when the caller has to retry it from another context, or another
 * negative error code.
 */
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
        struct kiocb *kiocb = &req->rw.kiocb;
        struct iov_iter __iter, *iter = &__iter;
        struct io_async_rw *rw = req->async_data;
        ssize_t io_size, ret, ret2;
        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

        if (rw) {
                /* async data already holds the iterator for this request */
                iter = &rw->iter;
                iovec = NULL;
        } else {
                ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
                if (ret < 0)
                        return ret;
        }
        io_size = iov_iter_count(iter);
        /* expected result; the completion handlers compare against it */
        req->result = io_size;

        /* Ensure we clear previously set non-block flag */
        if (!force_nonblock)
                kiocb->ki_flags &= ~IOCB_NOWAIT;
        else
                kiocb->ki_flags |= IOCB_NOWAIT;

        /* If the file doesn't support async, just async punt */
        if (force_nonblock && !io_file_supports_async(req, READ)) {
                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
                return ret ?: -EAGAIN;
        }

        ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
        if (unlikely(ret)) {
                kfree(iovec);
                return ret;
        }

        ret = io_iter_do_read(req, iter);

        if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
                req->flags &= ~REQ_F_REISSUE;
                /* IOPOLL retry should happen for io-wq threads */
                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
                        goto done;
                /* no retry on NONBLOCK nor RWF_NOWAIT */
                if (req->flags & REQ_F_NOWAIT)
                        goto done;
                /* some cases will consume bytes even on error returns */
                iov_iter_revert(iter, io_size - iov_iter_count(iter));
                ret = 0;
        } else if (ret == -EIOCBQUEUED) {
                goto out_free;
        } else if (ret <= 0 || ret == io_size || !force_nonblock ||
                   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
                /* read all, failed, already did sync or don't want to retry */
                goto done;
        }

        /* retry path: save the iterator state (ret holds bytes read so far) */
        ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
        if (ret2)
                return ret2;

        /* the async data now has the iovec recorded, don't free it here */
        iovec = NULL;
        rw = req->async_data;
        /* now use our persistent iterator, if we aren't already */
        iter = &rw->iter;

        do {
                /* account for the bytes read by the previous attempt */
                io_size -= ret;
                rw->bytes_done += ret;
                /* if we can retry, do so with the callbacks armed */
                if (!io_rw_should_retry(req)) {
                        kiocb->ki_flags &= ~IOCB_WAITQ;
                        return -EAGAIN;
                }

                /*
                 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
                 * we get -EIOCBQUEUED, then we'll get a notification when the
                 * desired page gets unlocked. We can also get a partial read
                 * here, and if we do, then just retry at the new offset.
                 */
                ret = io_iter_do_read(req, iter);
                if (ret == -EIOCBQUEUED)
                        return 0;
                /* we got some bytes, but not all. retry. */
                kiocb->ki_flags &= ~IOCB_WAITQ;
        } while (ret > 0 && ret < io_size);
done:
        kiocb_done(kiocb, ret, issue_flags);
out_free:
        /* it's faster to check here than delegate to kfree */
        if (iovec)
                kfree(iovec);
        return 0;
}
3360
3361 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3362 {
3363         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3364                 return -EBADF;
3365         return io_prep_rw(req, sqe);
3366 }
3367
3368 static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3369 {
3370         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3371         struct kiocb *kiocb = &req->rw.kiocb;
3372         struct iov_iter __iter, *iter = &__iter;
3373         struct io_async_rw *rw = req->async_data;
3374         ssize_t ret, ret2, io_size;
3375         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3376
3377         if (rw) {
3378                 iter = &rw->iter;
3379                 iovec = NULL;
3380         } else {
3381                 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3382                 if (ret < 0)
3383                         return ret;
3384         }
3385         io_size = iov_iter_count(iter);
3386         req->result = io_size;
3387
3388         /* Ensure we clear previously set non-block flag */
3389         if (!force_nonblock)
3390                 kiocb->ki_flags &= ~IOCB_NOWAIT;
3391         else
3392                 kiocb->ki_flags |= IOCB_NOWAIT;
3393
3394         /* If the file doesn't support async, just async punt */
3395         if (force_nonblock && !io_file_supports_async(req, WRITE))
3396                 goto copy_iov;
3397
3398         /* file path doesn't support NOWAIT for non-direct_IO */
3399         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3400             (req->flags & REQ_F_ISREG))
3401                 goto copy_iov;
3402
3403         ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
3404         if (unlikely(ret))
3405                 goto out_free;
3406
3407         /*
3408          * Open-code file_start_write here to grab freeze protection,
3409          * which will be released by another thread in
3410          * io_complete_rw().  Fool lockdep by telling it the lock got
3411          * released so that it doesn't complain about the held lock when
3412          * we return to userspace.
3413          */
3414         if (req->flags & REQ_F_ISREG) {
3415                 sb_start_write(file_inode(req->file)->i_sb);
3416                 __sb_writers_release(file_inode(req->file)->i_sb,
3417                                         SB_FREEZE_WRITE);
3418         }
3419         kiocb->ki_flags |= IOCB_WRITE;
3420
3421         if (req->file->f_op->write_iter)
3422                 ret2 = call_write_iter(req->file, kiocb, iter);
3423         else if (req->file->f_op->write)
3424                 ret2 = loop_rw_iter(WRITE, req, iter);
3425         else
3426                 ret2 = -EINVAL;
3427
3428         if (req->flags & REQ_F_REISSUE) {
3429                 req->flags &= ~REQ_F_REISSUE;
3430                 ret2 = -EAGAIN;
3431         }
3432
3433         /*
3434          * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3435          * retry them without IOCB_NOWAIT.
3436          */
3437         if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3438                 ret2 = -EAGAIN;
3439         /* no retry on NONBLOCK nor RWF_NOWAIT */
3440         if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3441                 goto done;
3442         if (!force_nonblock || ret2 != -EAGAIN) {
3443                 /* IOPOLL retry should happen for io-wq threads */
3444                 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3445                         goto copy_iov;
3446 done:
3447                 kiocb_done(kiocb, ret2, issue_flags);
3448         } else {
3449 copy_iov:
3450                 /* some cases will consume bytes even on error returns */
3451                 iov_iter_revert(iter, io_size - iov_iter_count(iter));
3452                 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3453                 return ret ?: -EAGAIN;
3454         }
3455 out_free:
3456         /* it's reportedly faster than delegating the null check to kfree() */
3457         if (iovec)
3458                 kfree(iovec);
3459         return ret;
3460 }
3461
3462 static int io_renameat_prep(struct io_kiocb *req,
3463                             const struct io_uring_sqe *sqe)
3464 {
3465         struct io_rename *ren = &req->rename;
3466         const char __user *oldf, *newf;
3467
3468         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3469                 return -EBADF;
3470
3471         ren->old_dfd = READ_ONCE(sqe->fd);
3472         oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3473         newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3474         ren->new_dfd = READ_ONCE(sqe->len);
3475         ren->flags = READ_ONCE(sqe->rename_flags);
3476
3477         ren->oldpath = getname(oldf);
3478         if (IS_ERR(ren->oldpath))
3479                 return PTR_ERR(ren->oldpath);
3480
3481         ren->newpath = getname(newf);
3482         if (IS_ERR(ren->newpath)) {
3483                 putname(ren->oldpath);
3484                 return PTR_ERR(ren->newpath);
3485         }
3486
3487         req->flags |= REQ_F_NEED_CLEANUP;
3488         return 0;
3489 }
3490
3491 static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3492 {
3493         struct io_rename *ren = &req->rename;
3494         int ret;
3495
3496         if (issue_flags & IO_URING_F_NONBLOCK)
3497                 return -EAGAIN;
3498
3499         ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3500                                 ren->newpath, ren->flags);
3501
3502         req->flags &= ~REQ_F_NEED_CLEANUP;
3503         if (ret < 0)
3504                 req_set_fail(req);
3505         io_req_complete(req, ret);
3506         return 0;
3507 }
3508
3509 static int io_unlinkat_prep(struct io_kiocb *req,
3510                             const struct io_uring_sqe *sqe)
3511 {
3512         struct io_unlink *un = &req->unlink;
3513         const char __user *fname;
3514
3515         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3516                 return -EBADF;
3517
3518         un->dfd = READ_ONCE(sqe->fd);
3519
3520         un->flags = READ_ONCE(sqe->unlink_flags);
3521         if (un->flags & ~AT_REMOVEDIR)
3522                 return -EINVAL;
3523
3524         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3525         un->filename = getname(fname);
3526         if (IS_ERR(un->filename))
3527                 return PTR_ERR(un->filename);
3528
3529         req->flags |= REQ_F_NEED_CLEANUP;
3530         return 0;
3531 }
3532
3533 static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3534 {
3535         struct io_unlink *un = &req->unlink;
3536         int ret;
3537
3538         if (issue_flags & IO_URING_F_NONBLOCK)
3539                 return -EAGAIN;
3540
3541         if (un->flags & AT_REMOVEDIR)
3542                 ret = do_rmdir(un->dfd, un->filename);
3543         else
3544                 ret = do_unlinkat(un->dfd, un->filename);
3545
3546         req->flags &= ~REQ_F_NEED_CLEANUP;
3547         if (ret < 0)
3548                 req_set_fail(req);
3549         io_req_complete(req, ret);
3550         return 0;
3551 }
3552
3553 static int io_shutdown_prep(struct io_kiocb *req,
3554                             const struct io_uring_sqe *sqe)
3555 {
3556 #if defined(CONFIG_NET)
3557         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3558                 return -EINVAL;
3559         if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3560             sqe->buf_index)
3561                 return -EINVAL;
3562
3563         req->shutdown.how = READ_ONCE(sqe->len);
3564         return 0;
3565 #else
3566         return -EOPNOTSUPP;
3567 #endif
3568 }
3569
3570 static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3571 {
3572 #if defined(CONFIG_NET)
3573         struct socket *sock;
3574         int ret;
3575
3576         if (issue_flags & IO_URING_F_NONBLOCK)
3577                 return -EAGAIN;
3578
3579         sock = sock_from_file(req->file);
3580         if (unlikely(!sock))
3581                 return -ENOTSOCK;
3582
3583         ret = __sys_shutdown_sock(sock, req->shutdown.how);
3584         if (ret < 0)
3585                 req_set_fail(req);
3586         io_req_complete(req, ret);
3587         return 0;
3588 #else
3589         return -EOPNOTSUPP;
3590 #endif
3591 }
3592
3593 static int __io_splice_prep(struct io_kiocb *req,
3594                             const struct io_uring_sqe *sqe)
3595 {
3596         struct io_splice* sp = &req->splice;
3597         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3598
3599         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3600                 return -EINVAL;
3601
3602         sp->file_in = NULL;
3603         sp->len = READ_ONCE(sqe->len);
3604         sp->flags = READ_ONCE(sqe->splice_flags);
3605
3606         if (unlikely(sp->flags & ~valid_flags))
3607                 return -EINVAL;
3608
3609         sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3610                                   (sp->flags & SPLICE_F_FD_IN_FIXED));
3611         if (!sp->file_in)
3612                 return -EBADF;
3613         req->flags |= REQ_F_NEED_CLEANUP;
3614         return 0;
3615 }
3616
3617 static int io_tee_prep(struct io_kiocb *req,
3618                        const struct io_uring_sqe *sqe)
3619 {
3620         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3621                 return -EINVAL;
3622         return __io_splice_prep(req, sqe);
3623 }
3624
3625 static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
3626 {
3627         struct io_splice *sp = &req->splice;
3628         struct file *in = sp->file_in;
3629         struct file *out = sp->file_out;
3630         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3631         long ret = 0;
3632
3633         if (issue_flags & IO_URING_F_NONBLOCK)
3634                 return -EAGAIN;
3635         if (sp->len)
3636                 ret = do_tee(in, out, sp->len, flags);
3637
3638         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3639                 io_put_file(in);
3640         req->flags &= ~REQ_F_NEED_CLEANUP;
3641
3642         if (ret != sp->len)
3643                 req_set_fail(req);
3644         io_req_complete(req, ret);
3645         return 0;
3646 }
3647
3648 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3649 {
3650         struct io_splice* sp = &req->splice;
3651
3652         sp->off_in = READ_ONCE(sqe->splice_off_in);
3653         sp->off_out = READ_ONCE(sqe->off);
3654         return __io_splice_prep(req, sqe);
3655 }
3656
3657 static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
3658 {
3659         struct io_splice *sp = &req->splice;
3660         struct file *in = sp->file_in;
3661         struct file *out = sp->file_out;
3662         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3663         loff_t *poff_in, *poff_out;
3664         long ret = 0;
3665
3666         if (issue_flags & IO_URING_F_NONBLOCK)
3667                 return -EAGAIN;
3668
3669         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3670         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3671
3672         if (sp->len)
3673                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3674
3675         if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
3676                 io_put_file(in);
3677         req->flags &= ~REQ_F_NEED_CLEANUP;
3678
3679         if (ret != sp->len)
3680                 req_set_fail(req);
3681         io_req_complete(req, ret);
3682         return 0;
3683 }
3684
3685 /*
3686  * IORING_OP_NOP just posts a completion event, nothing else.
3687  */
3688 static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
3689 {
3690         struct io_ring_ctx *ctx = req->ctx;
3691
3692         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3693                 return -EINVAL;
3694
3695         __io_req_complete(req, issue_flags, 0, 0);
3696         return 0;
3697 }
3698
3699 static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3700 {
3701         struct io_ring_ctx *ctx = req->ctx;
3702
3703         if (!req->file)
3704                 return -EBADF;
3705
3706         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3707                 return -EINVAL;
3708         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3709                 return -EINVAL;
3710
3711         req->sync.flags = READ_ONCE(sqe->fsync_flags);
3712         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3713                 return -EINVAL;
3714
3715         req->sync.off = READ_ONCE(sqe->off);
3716         req->sync.len = READ_ONCE(sqe->len);
3717         return 0;
3718 }
3719
3720 static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
3721 {
3722         loff_t end = req->sync.off + req->sync.len;
3723         int ret;
3724
3725         /* fsync always requires a blocking context */
3726         if (issue_flags & IO_URING_F_NONBLOCK)
3727                 return -EAGAIN;
3728
3729         ret = vfs_fsync_range(req->file, req->sync.off,
3730                                 end > 0 ? end : LLONG_MAX,
3731                                 req->sync.flags & IORING_FSYNC_DATASYNC);
3732         if (ret < 0)
3733                 req_set_fail(req);
3734         io_req_complete(req, ret);
3735         return 0;
3736 }
3737
3738 static int io_fallocate_prep(struct io_kiocb *req,
3739                              const struct io_uring_sqe *sqe)
3740 {
3741         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3742                 return -EINVAL;
3743         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3744                 return -EINVAL;
3745
3746         req->sync.off = READ_ONCE(sqe->off);
3747         req->sync.len = READ_ONCE(sqe->addr);
3748         req->sync.mode = READ_ONCE(sqe->len);
3749         return 0;
3750 }
3751
3752 static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
3753 {
3754         int ret;
3755
3756         /* fallocate always requiring blocking context */
3757         if (issue_flags & IO_URING_F_NONBLOCK)
3758                 return -EAGAIN;
3759         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3760                                 req->sync.len);
3761         if (ret < 0)
3762                 req_set_fail(req);
3763         io_req_complete(req, ret);
3764         return 0;
3765 }
3766
3767 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3768 {
3769         const char __user *fname;
3770         int ret;
3771
3772         if (unlikely(sqe->ioprio || sqe->buf_index))
3773                 return -EINVAL;
3774         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3775                 return -EBADF;
3776
3777         /* open.how should be already initialised */
3778         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3779                 req->open.how.flags |= O_LARGEFILE;
3780
3781         req->open.dfd = READ_ONCE(sqe->fd);
3782         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3783         req->open.filename = getname(fname);
3784         if (IS_ERR(req->open.filename)) {
3785                 ret = PTR_ERR(req->open.filename);
3786                 req->open.filename = NULL;
3787                 return ret;
3788         }
3789         req->open.nofile = rlimit(RLIMIT_NOFILE);
3790         req->flags |= REQ_F_NEED_CLEANUP;
3791         return 0;
3792 }
3793
3794 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3795 {
3796         u64 flags, mode;
3797
3798         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3799                 return -EINVAL;
3800         mode = READ_ONCE(sqe->len);
3801         flags = READ_ONCE(sqe->open_flags);
3802         req->open.how = build_open_how(flags, mode);
3803         return __io_openat_prep(req, sqe);
3804 }
3805
3806 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3807 {
3808         struct open_how __user *how;
3809         size_t len;
3810         int ret;
3811
3812         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3813                 return -EINVAL;
3814         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3815         len = READ_ONCE(sqe->len);
3816         if (len < OPEN_HOW_SIZE_VER0)
3817                 return -EINVAL;
3818
3819         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3820                                         len);
3821         if (ret)
3822                 return ret;
3823
3824         return __io_openat_prep(req, sqe);
3825 }
3826
3827 static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
3828 {
3829         struct open_flags op;
3830         struct file *file;
3831         bool nonblock_set;
3832         bool resolve_nonblock;
3833         int ret;
3834
3835         ret = build_open_flags(&req->open.how, &op);
3836         if (ret)
3837                 goto err;
3838         nonblock_set = op.open_flag & O_NONBLOCK;
3839         resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
3840         if (issue_flags & IO_URING_F_NONBLOCK) {
3841                 /*
3842                  * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
3843                  * it'll always -EAGAIN
3844                  */
3845                 if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
3846                         return -EAGAIN;
3847                 op.lookup_flags |= LOOKUP_CACHED;
3848                 op.open_flag |= O_NONBLOCK;
3849         }
3850
3851         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3852         if (ret < 0)
3853                 goto err;
3854
3855         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3856         /* only retry if RESOLVE_CACHED wasn't already set by application */
3857         if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
3858             file == ERR_PTR(-EAGAIN)) {
3859                 /*
3860                  * We could hang on to this 'fd', but seems like marginal
3861                  * gain for something that is now known to be a slower path.
3862                  * So just put it, and we'll get a new one when we retry.
3863                  */
3864                 put_unused_fd(ret);
3865                 return -EAGAIN;
3866         }
3867
3868         if (IS_ERR(file)) {
3869                 put_unused_fd(ret);
3870                 ret = PTR_ERR(file);
3871         } else {
3872                 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
3873                         file->f_flags &= ~O_NONBLOCK;
3874                 fsnotify_open(file);
3875                 fd_install(ret, file);
3876         }
3877 err:
3878         putname(req->open.filename);
3879         req->flags &= ~REQ_F_NEED_CLEANUP;
3880         if (ret < 0)
3881                 req_set_fail(req);
3882         __io_req_complete(req, issue_flags, ret, 0);
3883         return 0;
3884 }
3885
/* openat is issued through the same path as openat2. */
static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
        int ret = io_openat2(req, issue_flags);

        return ret;
}
3890
3891 static int io_remove_buffers_prep(struct io_kiocb *req,
3892                                   const struct io_uring_sqe *sqe)
3893 {
3894         struct io_provide_buf *p = &req->pbuf;
3895         u64 tmp;
3896
3897         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3898                 return -EINVAL;
3899
3900         tmp = READ_ONCE(sqe->fd);
3901         if (!tmp || tmp > USHRT_MAX)
3902                 return -EINVAL;
3903
3904         memset(p, 0, sizeof(*p));
3905         p->nbufs = tmp;
3906         p->bgid = READ_ONCE(sqe->buf_group);
3907         return 0;
3908 }
3909
3910 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3911                                int bgid, unsigned nbufs)
3912 {
3913         unsigned i = 0;
3914
3915         /* shouldn't happen */
3916         if (!nbufs)
3917                 return 0;
3918
3919         /* the head kbuf is the list itself */
3920         while (!list_empty(&buf->list)) {
3921                 struct io_buffer *nxt;
3922
3923                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3924                 list_del(&nxt->list);
3925                 kfree(nxt);
3926                 if (++i == nbufs)
3927                         return i;
3928         }
3929         i++;
3930         kfree(buf);
3931         xa_erase(&ctx->io_buffers, bgid);
3932
3933         return i;
3934 }
3935
3936 static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
3937 {
3938         struct io_provide_buf *p = &req->pbuf;
3939         struct io_ring_ctx *ctx = req->ctx;
3940         struct io_buffer *head;
3941         int ret = 0;
3942         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3943
3944         io_ring_submit_lock(ctx, !force_nonblock);
3945
3946         lockdep_assert_held(&ctx->uring_lock);
3947
3948         ret = -ENOENT;
3949         head = xa_load(&ctx->io_buffers, p->bgid);
3950         if (head)
3951                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3952         if (ret < 0)
3953                 req_set_fail(req);
3954
3955         /* complete before unlock, IOPOLL may need the lock */
3956         __io_req_complete(req, issue_flags, ret, 0);
3957         io_ring_submit_unlock(ctx, !force_nonblock);
3958         return 0;
3959 }
3960
3961 static int io_provide_buffers_prep(struct io_kiocb *req,
3962                                    const struct io_uring_sqe *sqe)
3963 {
3964         unsigned long size, tmp_check;
3965         struct io_provide_buf *p = &req->pbuf;
3966         u64 tmp;
3967
3968         if (sqe->ioprio || sqe->rw_flags)
3969                 return -EINVAL;
3970
3971         tmp = READ_ONCE(sqe->fd);
3972         if (!tmp || tmp > USHRT_MAX)
3973                 return -E2BIG;
3974         p->nbufs = tmp;
3975         p->addr = READ_ONCE(sqe->addr);
3976         p->len = READ_ONCE(sqe->len);
3977
3978         if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
3979                                 &size))
3980                 return -EOVERFLOW;
3981         if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
3982                 return -EOVERFLOW;
3983
3984         size = (unsigned long)p->len * p->nbufs;
3985         if (!access_ok(u64_to_user_ptr(p->addr), size))
3986                 return -EFAULT;
3987
3988         p->bgid = READ_ONCE(sqe->buf_group);
3989         tmp = READ_ONCE(sqe->off);
3990         if (tmp > USHRT_MAX)
3991                 return -E2BIG;
3992         p->bid = tmp;
3993         return 0;
3994 }
3995
3996 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3997 {
3998         struct io_buffer *buf;
3999         u64 addr = pbuf->addr;
4000         int i, bid = pbuf->bid;
4001
4002         for (i = 0; i < pbuf->nbufs; i++) {
4003                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
4004                 if (!buf)
4005                         break;
4006
4007                 buf->addr = addr;
4008                 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
4009                 buf->bid = bid;
4010                 addr += pbuf->len;
4011                 bid++;
4012                 if (!*head) {
4013                         INIT_LIST_HEAD(&buf->list);
4014                         *head = buf;
4015                 } else {
4016                         list_add_tail(&buf->list, &(*head)->list);
4017                 }
4018         }
4019
4020         return i ? i : -ENOMEM;
4021 }
4022
4023 static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4024 {
4025         struct io_provide_buf *p = &req->pbuf;
4026         struct io_ring_ctx *ctx = req->ctx;
4027         struct io_buffer *head, *list;
4028         int ret = 0;
4029         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4030
4031         io_ring_submit_lock(ctx, !force_nonblock);
4032
4033         lockdep_assert_held(&ctx->uring_lock);
4034
4035         list = head = xa_load(&ctx->io_buffers, p->bgid);
4036
4037         ret = io_add_buffers(p, &head);
4038         if (ret >= 0 && !list) {
4039                 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4040                 if (ret < 0)
4041                         __io_remove_buffers(ctx, head, p->bgid, -1U);
4042         }
4043         if (ret < 0)
4044                 req_set_fail(req);
4045         /* complete before unlock, IOPOLL may need the lock */
4046         __io_req_complete(req, issue_flags, ret, 0);
4047         io_ring_submit_unlock(ctx, !force_nonblock);
4048         return 0;
4049 }
4050
4051 static int io_epoll_ctl_prep(struct io_kiocb *req,
4052                              const struct io_uring_sqe *sqe)
4053 {
4054 #if defined(CONFIG_EPOLL)
4055         if (sqe->ioprio || sqe->buf_index)
4056                 return -EINVAL;
4057         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4058                 return -EINVAL;
4059
4060         req->epoll.epfd = READ_ONCE(sqe->fd);
4061         req->epoll.op = READ_ONCE(sqe->len);
4062         req->epoll.fd = READ_ONCE(sqe->off);
4063
4064         if (ep_op_has_event(req->epoll.op)) {
4065                 struct epoll_event __user *ev;
4066
4067                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4068                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4069                         return -EFAULT;
4070         }
4071
4072         return 0;
4073 #else
4074         return -EOPNOTSUPP;
4075 #endif
4076 }
4077
4078 static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4079 {
4080 #if defined(CONFIG_EPOLL)
4081         struct io_epoll *ie = &req->epoll;
4082         int ret;
4083         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4084
4085         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4086         if (force_nonblock && ret == -EAGAIN)
4087                 return -EAGAIN;
4088
4089         if (ret < 0)
4090                 req_set_fail(req);
4091         __io_req_complete(req, issue_flags, ret, 0);
4092         return 0;
4093 #else
4094         return -EOPNOTSUPP;
4095 #endif
4096 }
4097
4098 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4099 {
4100 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4101         if (sqe->ioprio || sqe->buf_index || sqe->off)
4102                 return -EINVAL;
4103         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4104                 return -EINVAL;
4105
4106         req->madvise.addr = READ_ONCE(sqe->addr);
4107         req->madvise.len = READ_ONCE(sqe->len);
4108         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4109         return 0;
4110 #else
4111         return -EOPNOTSUPP;
4112 #endif
4113 }
4114
4115 static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4116 {
4117 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4118         struct io_madvise *ma = &req->madvise;
4119         int ret;
4120
4121         if (issue_flags & IO_URING_F_NONBLOCK)
4122                 return -EAGAIN;
4123
4124         ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4125         if (ret < 0)
4126                 req_set_fail(req);
4127         io_req_complete(req, ret);
4128         return 0;
4129 #else
4130         return -EOPNOTSUPP;
4131 #endif
4132 }
4133
4134 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4135 {
4136         if (sqe->ioprio || sqe->buf_index || sqe->addr)
4137                 return -EINVAL;
4138         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4139                 return -EINVAL;
4140
4141         req->fadvise.offset = READ_ONCE(sqe->off);
4142         req->fadvise.len = READ_ONCE(sqe->len);
4143         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4144         return 0;
4145 }
4146
4147 static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4148 {
4149         struct io_fadvise *fa = &req->fadvise;
4150         int ret;
4151
4152         if (issue_flags & IO_URING_F_NONBLOCK) {
4153                 switch (fa->advice) {
4154                 case POSIX_FADV_NORMAL:
4155                 case POSIX_FADV_RANDOM:
4156                 case POSIX_FADV_SEQUENTIAL:
4157                         break;
4158                 default:
4159                         return -EAGAIN;
4160                 }
4161         }
4162
4163         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4164         if (ret < 0)
4165                 req_set_fail(req);
4166         __io_req_complete(req, issue_flags, ret, 0);
4167         return 0;
4168 }
4169
4170 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4171 {
4172         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4173                 return -EINVAL;
4174         if (sqe->ioprio || sqe->buf_index)
4175                 return -EINVAL;
4176         if (req->flags & REQ_F_FIXED_FILE)
4177                 return -EBADF;
4178
4179         req->statx.dfd = READ_ONCE(sqe->fd);
4180         req->statx.mask = READ_ONCE(sqe->len);
4181         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4182         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4183         req->statx.flags = READ_ONCE(sqe->statx_flags);
4184
4185         return 0;
4186 }
4187
/*
 * Run do_statx() with the arguments saved in req->statx and complete the
 * request with its return value.
 *
 * Returns -EAGAIN, without doing any work, whenever IO_URING_F_NONBLOCK is
 * set in @issue_flags. Otherwise returns 0 after completing the request; a
 * negative do_statx() result also marks the request as failed.
 */
4188 static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4189 {
4190         struct io_statx *ctx = &req->statx;
4191         int ret;
4192
             /* never attempted inline in non-blocking mode */
4193         if (issue_flags & IO_URING_F_NONBLOCK)
4194                 return -EAGAIN;
4195
4196         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4197                        ctx->buffer);
4198
4199         if (ret < 0)
4200                 req_set_fail(req);
4201         io_req_complete(req, ret);
4202         return 0;
4203 }
4204
/*
 * Validate a close submission and record the descriptor to close in
 * req->close.fd (taken from sqe->fd).
 *
 * Fails with -EINVAL when the ring was set up with IORING_SETUP_IOPOLL or
 * when any of sqe->ioprio, sqe->off, sqe->addr, sqe->len, sqe->rw_flags or
 * sqe->buf_index is non-zero, and with -EBADF when the request carries
 * REQ_F_FIXED_FILE.
 *
 * Returns 0 on success.
 */
4205 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4206 {
4207         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4208                 return -EINVAL;
4209         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4210             sqe->rw_flags || sqe->buf_index)
4211                 return -EINVAL;
4212         if (req->flags & REQ_F_FIXED_FILE)
4213                 return -EBADF;
4214
4215         req->close.fd = READ_ONCE(sqe->fd);
4216         return 0;
4217 }
4218
/*
 * Close the descriptor req->close.fd in current->files and complete the
 * request.
 *
 * The descriptor is looked up under files->file_lock. The request completes
 * with -EBADF when the descriptor is >= fdt->max_fds, when its table slot is
 * empty, when the file's f_op is io_uring_fops, or when
 * __close_fd_get_file() reports -ENOENT. If the file has a ->flush() method
 * and IO_URING_F_NONBLOCK is set in @issue_flags, nothing is closed and
 * -EAGAIN is returned without completing the request.
 *
 * Returns 0 after completion; the completion result is the filp_close()
 * return value or the error determined above.
 */
4219 static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4220 {
4221         struct files_struct *files = current->files;
4222         struct io_close *close = &req->close;
4223         struct fdtable *fdt;
4224         struct file *file = NULL;
4225         int ret = -EBADF;
4226
4227         spin_lock(&files->file_lock);
4228         fdt = files_fdtable(files);
4229         if (close->fd >= fdt->max_fds) {
4230                 spin_unlock(&files->file_lock);
                     /* ret still holds its initial -EBADF */
4231                 goto err;
4232         }
4233         file = fdt->fd[close->fd];
4234         if (!file || file->f_op == &io_uring_fops) {
4235                 spin_unlock(&files->file_lock);
                     /* reset so the fput() at err: is skipped */
4236                 file = NULL;
4237                 goto err;
4238         }
4239
4240         /* if the file has a flush method, be safe and punt to async */
4241         if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4242                 spin_unlock(&files->file_lock);
4243                 return -EAGAIN;
4244         }
4245
             /* still under file_lock; the lock is dropped right after the call */
4246         ret = __close_fd_get_file(close->fd, &file);
4247         spin_unlock(&files->file_lock);
4248         if (ret < 0) {
                     /* -ENOENT is reported to the submitter as -EBADF */
4249                 if (ret == -ENOENT)
4250                         ret = -EBADF;
4251                 goto err;
4252         }
4253
4254         /* No ->flush() or already async, safely close from here */
4255         ret = filp_close(file, current->files);
4256 err:
4257         if (ret < 0)
4258                 req_set_fail(req);
             /*
              * NOTE(review): presumably pairs with a reference handed back by
              * __close_fd_get_file() - confirm against that helper.
              */
4259         if (file)
4260                 fput(file);
4261         __io_req_complete(req, issue_flags, ret, 0);
4262         return 0;
4263 }
4264
/*
 * Validate a sync_file_range submission and copy its arguments into
 * req->sync: sqe->off -> off, sqe->len -> len, sqe->sync_range_flags -> flags.
 *
 * Fails with -EINVAL when the ring was set up with IORING_SETUP_IOPOLL or
 * when sqe->addr, sqe->ioprio or sqe->buf_index is non-zero.
 *
 * Returns 0 on success.
 */
4265 static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4266 {
4267         struct io_ring_ctx *ctx = req->ctx;
4268
4269         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4270                 return -EINVAL;
4271         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4272                 return -EINVAL;
4273
4274         req->sync.off = READ_ONCE(sqe->off);
4275         req->sync.len = READ_ONCE(sqe->len);
4276         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4277         return 0;
4278 }
4279
/*
 * Call sync_file_range() on req->file with the offset, length and flags
 * saved in req->sync, then complete the request with its return value.
 *
 * Returns -EAGAIN, without doing any work, whenever IO_URING_F_NONBLOCK is
 * set in @issue_flags. Otherwise returns 0 after completing the request; a
 * negative sync_file_range() result also marks the request as failed.
 */
4280 static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4281 {
4282         int ret;
4283
4284         /* sync_file_range always requires a blocking context */
4285         if (issue_flags & IO_URING_F_NONBLOCK)
4286                 return -EAGAIN;
4287
4288         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4289                                 req->sync.flags);
4290         if (ret < 0)
4291                 req_set_fail(req);
4292         io_req_complete(req, ret);
4293         return 0;
4294 }
4295
4296 #if defined(CONFIG_NET)
/*
 * Preserve the message header @kmsg in req->async_data so the request can be
 * retried later.
 *
 * If req->async_data is already set nothing is copied. Otherwise async data
 * is allocated with io_alloc_async_data(), *@kmsg is copied into it and the
 * pointers that referred to storage inside @kmsg (msg_name, and the iovec
 * pointer when the inline fast_iov array is in use) are redirected to the
 * copy. REQ_F_NEED_CLEANUP is set on the request.
 *
 * Returns -ENOMEM if the allocation fails (kmsg->free_iov is freed in that
 * case) and -EAGAIN in every other case; it never returns 0.
 */
4297 static int io_setup_async_msg(struct io_kiocb *req,
4298                               struct io_async_msghdr *kmsg)
4299 {
4300         struct io_async_msghdr *async_msg = req->async_data;
4301
             /* async data already present: nothing to copy */
4302         if (async_msg)
4303                 return -EAGAIN;
4304         if (io_alloc_async_data(req)) {
4305                 kfree(kmsg->free_iov);
4306                 return -ENOMEM;
4307         }
4308         async_msg = req->async_data;
4309         req->flags |= REQ_F_NEED_CLEANUP;
4310         memcpy(async_msg, kmsg, sizeof(*kmsg));
             /* the copied msg_name must refer to the copy's own addr member */
4311         async_msg->msg.msg_name = &async_msg->addr;
4312         /* if we're using fast_iov, set it to the new one */
4313         if (!async_msg->free_iov)
4314                 async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4315
4316         return -EAGAIN;
4317 }
4318
/*
 * Fill @iomsg for a sendmsg request: point msg_name at iomsg->addr, seed
 * free_iov with the embedded fast_iov array and call sendmsg_copy_msghdr()
 * on the user msghdr in req->sr_msg.umsg with req->sr_msg.msg_flags.
 *
 * Returns the sendmsg_copy_msghdr() result unchanged.
 */
4319 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4320                                struct io_async_msghdr *iomsg)
4321 {
4322         iomsg->msg.msg_name = &iomsg->addr;
4323         iomsg->free_iov = iomsg->fast_iov;
4324         return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4325                                    req->sr_msg.msg_flags, &iomsg->free_iov);
4326 }
4327
/*
 * Build the sendmsg header directly in req->async_data via
 * io_sendmsg_copy_hdr(). On success (return value 0) REQ_F_NEED_CLEANUP is
 * set on the request.
 *
 * Returns the io_sendmsg_copy_hdr() result.
 */
4328 static int io_sendmsg_prep_async(struct io_kiocb *req)
4329 {
4330         int ret;
4331
4332         ret = io_sendmsg_copy_hdr(req, req->async_data);
4333         if (!ret)
4334                 req->flags |= REQ_F_NEED_CLEANUP;
4335         return ret;
4336 }
4337
/*
 * Copy the send parameters from the SQE into req->sr_msg: sqe->addr -> umsg
 * (user pointer), sqe->len -> len, sqe->msg_flags -> msg_flags.
 *
 * MSG_NOSIGNAL is always OR-ed into the stored flags. If the flags contain
 * MSG_DONTWAIT the request is marked REQ_F_NOWAIT. With CONFIG_COMPAT and a
 * compat ring, MSG_CMSG_COMPAT is added as well.
 *
 * Returns -EINVAL when the ring was set up with IORING_SETUP_IOPOLL,
 * otherwise 0.
 */
4338 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4339 {
4340         struct io_sr_msg *sr = &req->sr_msg;
4341
4342         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4343                 return -EINVAL;
4344
4345         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4346         sr->len = READ_ONCE(sqe->len);
4347         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4348         if (sr->msg_flags & MSG_DONTWAIT)
4349                 req->flags |= REQ_F_NOWAIT;
4350
4351 #ifdef CONFIG_COMPAT
4352         if (req->ctx->compat)
4353                 sr->msg_flags |= MSG_CMSG_COMPAT;
4354 #endif
4355         return 0;
4356 }
4357
/*
 * Send a message on the socket behind req->file with __sys_sendmsg_sock()
 * and complete the request.
 *
 * The header in req->async_data is used when present; otherwise one is
 * built on the stack with io_sendmsg_copy_hdr(). With IO_URING_F_NONBLOCK
 * set in @issue_flags, MSG_DONTWAIT is added to the send flags and an
 * -EAGAIN result is passed to io_setup_async_msg(), whose return value is
 * returned without completing the request.
 *
 * On completion -ERESTARTSYS is reported as -EINTR, kmsg->free_iov is freed,
 * REQ_F_NEED_CLEANUP is cleared, and the request is marked failed when the
 * result is below min_ret (the full iterator byte count if MSG_WAITALL is
 * set, 0 otherwise).
 *
 * Returns -ENOTSOCK if req->file is not a socket, the io_sendmsg_copy_hdr()
 * error if header setup fails, otherwise 0 after completion.
 */
4358 static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4359 {
4360         struct io_async_msghdr iomsg, *kmsg;
4361         struct socket *sock;
4362         unsigned flags;
4363         int min_ret = 0;
4364         int ret;
4365
4366         sock = sock_from_file(req->file);
4367         if (unlikely(!sock))
4368                 return -ENOTSOCK;
4369
4370         kmsg = req->async_data;
4371         if (!kmsg) {
4372                 ret = io_sendmsg_copy_hdr(req, &iomsg);
4373                 if (ret)
4374                         return ret;
4375                 kmsg = &iomsg;
4376         }
4377
4378         flags = req->sr_msg.msg_flags;
4379         if (issue_flags & IO_URING_F_NONBLOCK)
4380                 flags |= MSG_DONTWAIT;
             /* with MSG_WAITALL anything short of the full length counts as failure */
4381         if (flags & MSG_WAITALL)
4382                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4383
4384         ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4385         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4386                 return io_setup_async_msg(req, kmsg);
4387         if (ret == -ERESTARTSYS)
4388                 ret = -EINTR;
4389
4390         /* fast path, check for non-NULL to avoid function call */
4391         if (kmsg->free_iov)
4392                 kfree(kmsg->free_iov);
4393         req->flags &= ~REQ_F_NEED_CLEANUP;
4394         if (ret < min_ret)
4395                 req_set_fail(req);
4396         __io_req_complete(req, issue_flags, ret, 0);
4397         return 0;
4398 }
4399
/*
 * Send the single user buffer sr->buf / sr->len on the socket behind
 * req->file with sock_sendmsg() and complete the request.
 *
 * The buffer is wrapped in an on-stack msghdr via import_single_range(); no
 * name or control data is attached. With IO_URING_F_NONBLOCK set in
 * @issue_flags, MSG_DONTWAIT is added to the flags and an -EAGAIN result is
 * returned to the caller without completing the request.
 *
 * On completion -ERESTARTSYS is reported as -EINTR and the request is marked
 * failed when the result is below min_ret (the full iterator byte count if
 * MSG_WAITALL is set, 0 otherwise).
 *
 * Returns -ENOTSOCK if req->file is not a socket, the import_single_range()
 * error if the buffer cannot be imported, -EAGAIN as described above,
 * otherwise 0 after completion.
 */
4400 static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4401 {
4402         struct io_sr_msg *sr = &req->sr_msg;
4403         struct msghdr msg;
4404         struct iovec iov;
4405         struct socket *sock;
4406         unsigned flags;
4407         int min_ret = 0;
4408         int ret;
4409
4410         sock = sock_from_file(req->file);
4411         if (unlikely(!sock))
4412                 return -ENOTSOCK;
4413
4414         ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4415         if (unlikely(ret))
4416                 return ret;
4417
             /* msg lives on the stack: set the remaining header fields explicitly */
4418         msg.msg_name = NULL;
4419         msg.msg_control = NULL;
4420         msg.msg_controllen = 0;
4421         msg.msg_namelen = 0;
4422
4423         flags = req->sr_msg.msg_flags;
4424         if (issue_flags & IO_URING_F_NONBLOCK)
4425                 flags |= MSG_DONTWAIT;
4426         if (flags & MSG_WAITALL)
4427                 min_ret = iov_iter_count(&msg.msg_iter);
4428
4429         msg.msg_flags = flags;
4430         ret = sock_sendmsg(sock, &msg);
4431         if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4432                 return -EAGAIN;
4433         if (ret == -ERESTARTSYS)
4434                 ret = -EINTR;
4435
4436         if (ret < min_ret)
4437                 req_set_fail(req);
4438         __io_req_complete(req, issue_flags, ret, 0);
4439         return 0;
4440 }
4441
/*
 * Native (non-compat) setup of @iomsg for a recvmsg request from the user
 * msghdr in req->sr_msg.umsg.
 *
 * __copy_msghdr_from_user() fills iomsg->msg and yields the user iovec
 * pointer and count. Then:
 *  - with REQ_F_BUFFER_SELECT: more than one iovec is rejected with -EINVAL;
 *    one struct iovec is copied from user space into iomsg->fast_iov
 *    (-EFAULT on failure), its iov_len becomes sr->len and free_iov is set
 *    to NULL;
 *  - otherwise: the iovec array is imported with __import_iovec(), using
 *    fast_iov as the inline array; a positive return value is normalised
 *    to 0.
 *
 * Returns 0 on success or a negative error.
 */
4442 static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4443                                  struct io_async_msghdr *iomsg)
4444 {
4445         struct io_sr_msg *sr = &req->sr_msg;
4446         struct iovec __user *uiov;
4447         size_t iov_len;
4448         int ret;
4449
4450         ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4451                                         &iomsg->uaddr, &uiov, &iov_len);
4452         if (ret)
4453                 return ret;
4454
4455         if (req->flags & REQ_F_BUFFER_SELECT) {
4456                 if (iov_len > 1)
4457                         return -EINVAL;
4458                 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4459                         return -EFAULT;
4460                 sr->len = iomsg->fast_iov[0].iov_len;
4461                 iomsg->free_iov = NULL;
4462         } else {
4463                 iomsg->free_iov = iomsg->fast_iov;
4464                 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4465                                      &iomsg->free_iov, &iomsg->msg.msg_iter,
4466                                      false);
                     /* callers only distinguish zero from error */
4467                 if (ret > 0)
4468                         ret = 0;
4469         }
4470
4471         return ret;
4472 }
4473
4474 #ifdef CONFIG_COMPAT
/*
 * Compat counterpart of __io_recvmsg_copy_hdr(): set up @iomsg from the
 * compat user msghdr in req->sr_msg.umsg_compat.
 *
 * __get_compat_msghdr() fills iomsg->msg and yields the compat iovec pointer
 * and count. Then:
 *  - with REQ_F_BUFFER_SELECT: more than one iovec is rejected with -EINVAL;
 *    only the iov_len of the user's compat iovec is read (-EFAULT if it is
 *    not accessible), a negative length is rejected with -EINVAL, the length
 *    becomes sr->len and free_iov is set to NULL;
 *  - otherwise: the iovec array is imported with __import_iovec() in compat
 *    mode, using fast_iov as the inline array.
 *
 * Returns 0 on success or a negative error.
 */
4475 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4476                                         struct io_async_msghdr *iomsg)
4477 {
4478         struct io_sr_msg *sr = &req->sr_msg;
4479         struct compat_iovec __user *uiov;
4480         compat_uptr_t ptr;
4481         compat_size_t len;
4482         int ret;
4483
4484         ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4485                                   &ptr, &len);
4486         if (ret)
4487                 return ret;
4488
4489         uiov = compat_ptr(ptr);
4490         if (req->flags & REQ_F_BUFFER_SELECT) {
4491                 compat_ssize_t clen;
4492
4493                 if (len > 1)
4494                         return -EINVAL;
4495                 if (!access_ok(uiov, sizeof(*uiov)))
4496                         return -EFAULT;
4497                 if (__get_user(clen, &uiov->iov_len))
4498                         return -EFAULT;
4499                 if (clen < 0)
4500                         return -EINVAL;
4501                 sr->len = clen;
4502                 iomsg->free_iov = NULL;
4503         } else {
4504                 iomsg->free_iov = iomsg->fast_iov;
4505                 ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4506                                    UIO_FASTIOV, &iomsg->free_iov,
4507                                    &iomsg->msg.msg_iter, true);
4508                 if (ret < 0)
4509                         return ret;
4510         }
4511
4512         return 0;
4513 }
4514 #endif
4515
/*
 * Set up @iomsg for a recvmsg request. msg_name is pointed at iomsg->addr,
 * then the work is delegated to __io_compat_recvmsg_copy_hdr() for compat
 * rings (CONFIG_COMPAT builds only) or to __io_recvmsg_copy_hdr() otherwise.
 *
 * Returns the result of the selected helper.
 */
4516 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4517                                struct io_async_msghdr *iomsg)
4518 {
4519         iomsg->msg.msg_name = &iomsg->addr;
4520
4521 #ifdef CONFIG_COMPAT
4522         if (req->ctx->compat)
4523                 return __io_compat_recvmsg_copy_hdr(req, iomsg);
4524 #endif
4525
4526         return __io_recvmsg_copy_hdr(req, iomsg);
4527 }
4528
/*
 * Pick a buffer for a receive request through io_buffer_select(), passing
 * the request's length (&sr->len), buffer group id (sr->bgid) and current
 * sr->kbuf.
 *
 * On success the buffer is stored in sr->kbuf and REQ_F_BUFFER_SELECTED is
 * set on the request. An ERR_PTR() result is returned unchanged and leaves
 * the request untouched by this function.
 */
4529 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4530                                                bool needs_lock)
4531 {
4532         struct io_sr_msg *sr = &req->sr_msg;
4533         struct io_buffer *kbuf;
4534
4535         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4536         if (IS_ERR(kbuf))
4537                 return kbuf;
4538
4539         sr->kbuf = kbuf;
4540         req->flags |= REQ_F_BUFFER_SELECTED;
4541         return kbuf;
4542 }
4543
/*
 * Hand the receive request's selected buffer (req->sr_msg.kbuf) to
 * io_put_kbuf() and return its result. The receive handlers in this file
 * pass that value on as the completion flags.
 */
4544 static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4545 {
4546         return io_put_kbuf(req, req->sr_msg.kbuf);
4547 }
4548
/*
 * Build the recvmsg header directly in req->async_data via
 * io_recvmsg_copy_hdr(). On success (return value 0) REQ_F_NEED_CLEANUP is
 * set on the request.
 *
 * Returns the io_recvmsg_copy_hdr() result.
 */
4549 static int io_recvmsg_prep_async(struct io_kiocb *req)
4550 {
4551         int ret;
4552
4553         ret = io_recvmsg_copy_hdr(req, req->async_data);
4554         if (!ret)
4555                 req->flags |= REQ_F_NEED_CLEANUP;
4556         return ret;
4557 }
4558
/*
 * Copy the receive parameters from the SQE into req->sr_msg: sqe->addr ->
 * umsg (user pointer), sqe->len -> len, sqe->buf_group -> bgid,
 * sqe->msg_flags -> msg_flags.
 *
 * MSG_NOSIGNAL is always OR-ed into the stored flags. If the flags contain
 * MSG_DONTWAIT the request is marked REQ_F_NOWAIT. With CONFIG_COMPAT and a
 * compat ring, MSG_CMSG_COMPAT is added as well.
 *
 * Returns -EINVAL when the ring was set up with IORING_SETUP_IOPOLL,
 * otherwise 0.
 */
4559 static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4560 {
4561         struct io_sr_msg *sr = &req->sr_msg;
4562
4563         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4564                 return -EINVAL;
4565
4566         sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4567         sr->len = READ_ONCE(sqe->len);
4568         sr->bgid = READ_ONCE(sqe->buf_group);
4569         sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4570         if (sr->msg_flags & MSG_DONTWAIT)
4571                 req->flags |= REQ_F_NOWAIT;
4572
4573 #ifdef CONFIG_COMPAT
4574         if (req->ctx->compat)
4575                 sr->msg_flags |= MSG_CMSG_COMPAT;
4576 #endif
4577         return 0;
4578 }
4579
/*
 * Receive a message on the socket behind req->file with
 * __sys_recvmsg_sock() and complete the request.
 *
 * The header in req->async_data is used when present; otherwise one is
 * built on the stack with io_recvmsg_copy_hdr(). With REQ_F_BUFFER_SELECT a
 * buffer is chosen via io_recv_buffer_select() and the message iterator is
 * re-initialised to cover that single buffer with length req->sr_msg.len.
 *
 * With IO_URING_F_NONBLOCK set in @issue_flags, MSG_DONTWAIT is added to
 * the flags and an -EAGAIN result is passed to io_setup_async_msg(), whose
 * return value is returned without completing the request.
 *
 * On completion -ERESTARTSYS is reported as -EINTR, a selected buffer is
 * released through io_put_recv_kbuf() (its return value becomes the
 * completion flags), kmsg->free_iov is freed and REQ_F_NEED_CLEANUP is
 * cleared. The request is marked failed when the result is below min_ret
 * (full iterator byte count under MSG_WAITALL, else 0), or when MSG_WAITALL
 * was requested and the returned msg_flags contain MSG_TRUNC or MSG_CTRUNC.
 *
 * Returns -ENOTSOCK if req->file is not a socket, the header-setup or
 * buffer-selection error if one occurs, otherwise 0 after completion.
 */
4580 static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
4581 {
4582         struct io_async_msghdr iomsg, *kmsg;
4583         struct socket *sock;
4584         struct io_buffer *kbuf;
4585         unsigned flags;
4586         int min_ret = 0;
4587         int ret, cflags = 0;
4588         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4589
4590         sock = sock_from_file(req->file);
4591         if (unlikely(!sock))
4592                 return -ENOTSOCK;
4593
4594         kmsg = req->async_data;
4595         if (!kmsg) {
4596                 ret = io_recvmsg_copy_hdr(req, &iomsg);
4597                 if (ret)
4598                         return ret;
4599                 kmsg = &iomsg;
4600         }
4601
4602         if (req->flags & REQ_F_BUFFER_SELECT) {
                     /* needs_lock is the inverse of the non-blocking flag */
4603                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4604                 if (IS_ERR(kbuf))
4605                         return PTR_ERR(kbuf);
4606                 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4607                 kmsg->fast_iov[0].iov_len = req->sr_msg.len;
4608                 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
4609                                 1, req->sr_msg.len);
4610         }
4611
4612         flags = req->sr_msg.msg_flags;
4613         if (force_nonblock)
4614                 flags |= MSG_DONTWAIT;
4615         if (flags & MSG_WAITALL)
4616                 min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4617
4618         ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4619                                         kmsg->uaddr, flags);
4620         if (force_nonblock && ret == -EAGAIN)
4621                 return io_setup_async_msg(req, kmsg);
4622         if (ret == -ERESTARTSYS)
4623                 ret = -EINTR;
4624
4625         if (req->flags & REQ_F_BUFFER_SELECTED)
4626                 cflags = io_put_recv_kbuf(req);
4627         /* fast path, check for non-NULL to avoid function call */
4628         if (kmsg->free_iov)
4629                 kfree(kmsg->free_iov);
4630         req->flags &= ~REQ_F_NEED_CLEANUP;
             /* short result, or truncation although MSG_WAITALL was requested */
4631         if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4632                 req_set_fail(req);
4633         __io_req_complete(req, issue_flags, ret, cflags);
4634         return 0;
4635 }
4636
/*
 * Receive into a single buffer on the socket behind req->file with
 * sock_recvmsg() and complete the request.
 *
 * The destination is sr->buf, or, with REQ_F_BUFFER_SELECT, the address of
 * a buffer chosen via io_recv_buffer_select(); the length is sr->len in
 * both cases. The buffer is wrapped in an on-stack msghdr with no name or
 * control data.
 *
 * With IO_URING_F_NONBLOCK set in @issue_flags, MSG_DONTWAIT is added to
 * the flags and an -EAGAIN result is returned without completing the
 * request.
 *
 * On completion (including an import_single_range() failure, which jumps to
 * out_free) a selected buffer is released through io_put_recv_kbuf(), whose
 * return value becomes the completion flags. -ERESTARTSYS is reported as
 * -EINTR. The request is marked failed when the result is below min_ret
 * (full iterator byte count under MSG_WAITALL, else 0), or when MSG_WAITALL
 * was requested and msg.msg_flags contain MSG_TRUNC or MSG_CTRUNC.
 *
 * Returns -ENOTSOCK if req->file is not a socket, the buffer-selection
 * error if one occurs, -EAGAIN as described above, otherwise 0 after
 * completion.
 */
4637 static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
4638 {
4639         struct io_buffer *kbuf;
4640         struct io_sr_msg *sr = &req->sr_msg;
4641         struct msghdr msg;
4642         void __user *buf = sr->buf;
4643         struct socket *sock;
4644         struct iovec iov;
4645         unsigned flags;
4646         int min_ret = 0;
4647         int ret, cflags = 0;
4648         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4649
4650         sock = sock_from_file(req->file);
4651         if (unlikely(!sock))
4652                 return -ENOTSOCK;
4653
4654         if (req->flags & REQ_F_BUFFER_SELECT) {
4655                 kbuf = io_recv_buffer_select(req, !force_nonblock);
4656                 if (IS_ERR(kbuf))
4657                         return PTR_ERR(kbuf);
4658                 buf = u64_to_user_ptr(kbuf->addr);
4659         }
4660
4661         ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
             /*
              * On this path ret is non-zero while flags and msg.msg_flags are
              * not yet set. NOTE(review): the check at out_free only avoids
              * reading them if ret is negative here - confirm
              * import_single_range() never returns a positive value.
              */
4662         if (unlikely(ret))
4663                 goto out_free;
4664
4665         msg.msg_name = NULL;
4666         msg.msg_control = NULL;
4667         msg.msg_controllen = 0;
4668         msg.msg_namelen = 0;
4669         msg.msg_iocb = NULL;
4670         msg.msg_flags = 0;
4671
4672         flags = req->sr_msg.msg_flags;
4673         if (force_nonblock)
4674                 flags |= MSG_DONTWAIT;
4675         if (flags & MSG_WAITALL)
4676                 min_ret = iov_iter_count(&msg.msg_iter);
4677
4678         ret = sock_recvmsg(sock, &msg, flags);
4679         if (force_nonblock && ret == -EAGAIN)
4680                 return -EAGAIN;
4681         if (ret == -ERESTARTSYS)
4682                 ret = -EINTR;
4683 out_free:
4684         if (req->flags & REQ_F_BUFFER_SELECTED)
4685                 cflags = io_put_recv_kbuf(req);
4686         if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
4687                 req_set_fail(req);
4688         __io_req_complete(req, issue_flags, ret, cflags);
4689         return 0;
4690 }
4691
/*
 * Validate an accept submission and copy its arguments into req->accept:
 * sqe->addr -> addr (user pointer), sqe->addr2 -> addr_len (user pointer),
 * sqe->accept_flags -> flags. The value of rlimit(RLIMIT_NOFILE) at the time
 * this function runs is stored in accept->nofile.
 *
 * Fails with -EINVAL when the ring was set up with IORING_SETUP_IOPOLL or
 * when sqe->ioprio, sqe->len or sqe->buf_index is non-zero.
 *
 * Returns 0 on success.
 */
4692 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4693 {
4694         struct io_accept *accept = &req->accept;
4695
4696         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4697                 return -EINVAL;
4698         if (sqe->ioprio || sqe->len || sqe->buf_index)
4699                 return -EINVAL;
4700
4701         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4702         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4703         accept->flags = READ_ONCE(sqe->accept_flags);
4704         accept->nofile = rlimit(RLIMIT_NOFILE);
4705         return 0;
4706 }
4707
/*
 * Accept a connection on req->file through __sys_accept4_file() using the
 * arguments saved in req->accept, and complete the request with the result.
 *
 * With IO_URING_F_NONBLOCK set in @issue_flags the call is made with
 * O_NONBLOCK as file flags, and an -EAGAIN result is returned to the caller
 * without completing the request. If req->file itself has O_NONBLOCK set,
 * the request is marked REQ_F_NOWAIT before the call.
 *
 * Any negative result marks the request failed; -ERESTARTSYS is reported as
 * -EINTR.
 *
 * Returns -EAGAIN as described above, otherwise 0 after completion.
 */
4708 static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
4709 {
4710         struct io_accept *accept = &req->accept;
4711         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4712         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4713         int ret;
4714
4715         if (req->file->f_flags & O_NONBLOCK)
4716                 req->flags |= REQ_F_NOWAIT;
4717
4718         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4719                                         accept->addr_len, accept->flags,
4720                                         accept->nofile);
4721         if (ret == -EAGAIN && force_nonblock)
4722                 return -EAGAIN;
4723         if (ret < 0) {
4724                 if (ret == -ERESTARTSYS)
4725                         ret = -EINTR;
4726                 req_set_fail(req);
4727         }
4728         __io_req_complete(req, issue_flags, ret, 0);
4729         return 0;
4730 }
4731
/*
 * Copy the user socket address described by req->connect (addr, addr_len)
 * into the address member of the io_async_connect stored in
 * req->async_data, using move_addr_to_kernel().
 *
 * Returns the move_addr_to_kernel() result.
 */
4732 static int io_connect_prep_async(struct io_kiocb *req)
4733 {
4734         struct io_async_connect *io = req->async_data;
4735         struct io_connect *conn = &req->connect;
4736
4737         return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
4738 }
4739
/*
 * Validate a connect submission and copy its arguments into req->connect:
 * sqe->addr -> addr (user pointer), sqe->addr2 -> addr_len.
 *
 * Fails with -EINVAL when the ring was set up with IORING_SETUP_IOPOLL or
 * when sqe->ioprio, sqe->len, sqe->buf_index or sqe->rw_flags is non-zero.
 *
 * Returns 0 on success.
 */
4740 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4741 {
4742         struct io_connect *conn = &req->connect;
4743
4744         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4745                 return -EINVAL;
4746         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4747                 return -EINVAL;
4748
4749         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4750         conn->addr_len =  READ_ONCE(sqe->addr2);
4751         return 0;
4752 }
4753
/*
 * Connect req->file to the address of the request through
 * __sys_connect_file() and complete the request with the result.
 *
 * The kernel copy of the address comes from req->async_data when present;
 * otherwise it is copied from user space into an on-stack io_async_connect
 * with move_addr_to_kernel(). A failure of that copy completes the request
 * with the error.
 *
 * With IO_URING_F_NONBLOCK set in @issue_flags the call is made with
 * O_NONBLOCK. If it then yields -EAGAIN or -EINPROGRESS, the address is
 * saved into freshly allocated async data (unless async data already
 * exists) and -EAGAIN is returned without completing the request; if the
 * allocation fails the request completes with -ENOMEM instead.
 *
 * -ERESTARTSYS is reported as -EINTR; any negative result marks the request
 * failed.
 *
 * Returns -EAGAIN as described above, otherwise 0 after completion.
 */
4754 static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
4755 {
4756         struct io_async_connect __io, *io;
4757         unsigned file_flags;
4758         int ret;
4759         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4760
4761         if (req->async_data) {
4762                 io = req->async_data;
4763         } else {
4764                 ret = move_addr_to_kernel(req->connect.addr,
4765                                                 req->connect.addr_len,
4766                                                 &__io.address);
4767                 if (ret)
4768                         goto out;
4769                 io = &__io;
4770         }
4771
4772         file_flags = force_nonblock ? O_NONBLOCK : 0;
4773
4774         ret = __sys_connect_file(req->file, &io->address,
4775                                         req->connect.addr_len, file_flags);
4776         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
                     /* address already lives in async data: nothing to save */
4777                 if (req->async_data)
4778                         return -EAGAIN;
4779                 if (io_alloc_async_data(req)) {
4780                         ret = -ENOMEM;
4781                         goto out;
4782                 }
                     /* keep the on-stack address copy for the later retry */
4783                 memcpy(req->async_data, &__io, sizeof(__io));
4784                 return -EAGAIN;
4785         }
4786         if (ret == -ERESTARTSYS)
4787                 ret = -EINTR;
4788 out:
4789         if (ret < 0)
4790                 req_set_fail(req);
4791         __io_req_complete(req, issue_flags, ret, 0);
4792         return 0;
4793 }
4794 #else /* !CONFIG_NET */
/*
 * Without CONFIG_NET the network handlers above are not built. The macros
 * below generate stand-ins with the same names and signatures that simply
 * return -EOPNOTSUPP.
 *
 * IO_NETOP_FN(op) defines the issue handler io_<op>().
 */
4795 #define IO_NETOP_FN(op)                                                 \
4796 static int io_##op(struct io_kiocb *req, unsigned int issue_flags)      \
4797 {                                                                       \
4798         return -EOPNOTSUPP;                                             \
4799 }
4800
/* IO_NETOP_PREP(op) defines io_<op>() and additionally io_<op>_prep(). */
4801 #define IO_NETOP_PREP(op)                                               \
4802 IO_NETOP_FN(op)                                                         \
4803 static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
4804 {                                                                       \
4805         return -EOPNOTSUPP;                                             \
4806 }                                                                       \
4807
/*
 * IO_NETOP_PREP_ASYNC(op) defines io_<op>(), io_<op>_prep() and additionally
 * io_<op>_prep_async().
 */
4808 #define IO_NETOP_PREP_ASYNC(op)                                         \
4809 IO_NETOP_PREP(op)                                                       \
4810 static int io_##op##_prep_async(struct io_kiocb *req)                   \
4811 {                                                                       \
4812         return -EOPNOTSUPP;                                             \
4813 }
4814
/*
 * Each operation gets exactly the set of stubs that the CONFIG_NET branch
 * defines for it: sendmsg, recvmsg and connect have issue, prep and
 * prep_async handlers; accept has issue and prep; send and recv only have
 * issue handlers.
 */
4815 IO_NETOP_PREP_ASYNC(sendmsg);
4816 IO_NETOP_PREP_ASYNC(recvmsg);
4817 IO_NETOP_PREP_ASYNC(connect);
4818 IO_NETOP_PREP(accept);
4819 IO_NETOP_FN(send);
4820 IO_NETOP_FN(recv);
4821 #endif /* CONFIG_NET */
4822
/*
 * Poll table wrapper handed to a file's poll handler so that the queueing
 * callback can find the request and report an error back.
 */
4823 struct io_poll_table {
             /* embedded poll table; callbacks recover this struct via container_of() */
4824         struct poll_table_struct pt;
             /* request the wait entries are being queued for */
4825         struct io_kiocb *req;
             /* set by __io_queue_proc(): 0 on success, -EINVAL or -ENOMEM on failure */
4826         int error;
4827 };
4828
/*
 * Common wakeup handling: detach @poll from its wait queue and schedule
 * @func as task work for @req.
 *
 * If @mask is non-zero and shares no bits with poll->events the wakeup is
 * ignored and 0 is returned. Otherwise the wait entry is removed from its
 * list, @mask is stored in req->result, @func is installed as the task-work
 * callback and the work is queued with io_req_task_work_add(). If queueing
 * fails, poll->canceled is set and the work is handed to
 * io_req_task_work_add_fallback() instead.
 *
 * Returns 1 whenever the wakeup was consumed, 0 when it was ignored.
 */
4829 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4830                            __poll_t mask, task_work_func_t func)
4831 {
4832         int ret;
4833
4834         /* for instances that support it check for an event match first: */
4835         if (mask && !(mask & poll->events))
4836                 return 0;
4837
4838         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4839
4840         list_del_init(&poll->wait.entry);
4841
4842         req->result = mask;
4843         req->task_work.func = func;
4844
4845         /*
4846          * If this fails, then the task is exiting. When a task exits, the
4847          * work gets canceled, so just cancel this request as well instead
4848          * of executing it. We can't safely execute it anyway, as we may not
4849          * have the state needed for it.
4850          */
4851         ret = io_req_task_work_add(req);
4852         if (unlikely(ret)) {
4853                 WRITE_ONCE(poll->canceled, true);
4854                 io_req_task_work_add_fallback(req, func);
4855         }
4856         return 1;
4857 }
4858
/*
 * Decide whether a woken poll request has to go back to waiting.
 *
 * If req->result is empty and the poll is not canceled, the file is polled
 * again with vfs_poll() and the events matching poll->events are stored in
 * req->result. ctx->completion_lock is then taken with spin_lock_irq().
 *
 * Returns true, after re-adding poll->wait to poll->head, when there is
 * still no result and no cancelation; returns false otherwise.
 *
 * In both cases the function returns with ctx->completion_lock held; the
 * caller has to release it.
 */
4859 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4860         __acquires(&req->ctx->completion_lock)
4861 {
4862         struct io_ring_ctx *ctx = req->ctx;
4863
4864         if (!req->result && !READ_ONCE(poll->canceled)) {
4865                 struct poll_table_struct pt = { ._key = poll->events };
4866
4867                 req->result = vfs_poll(req->file, &pt) & poll->events;
4868         }
4869
4870         spin_lock_irq(&ctx->completion_lock);
             /* re-checked now that the completion lock is held */
4871         if (!req->result && !READ_ONCE(poll->canceled)) {
4872                 add_wait_queue(poll->head, &poll->wait);
4873                 return true;
4874         }
4875
4876         return false;
4877 }
4878
/*
 * Return the request's second ("double") poll entry: req->async_data for an
 * IORING_OP_POLL_ADD request, req->apoll->double_poll for every other
 * opcode.
 */
4879 static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
4880 {
4881         /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
4882         if (req->opcode == IORING_OP_POLL_ADD)
4883                 return req->async_data;
4884         return req->apoll->double_poll;
4885 }
4886
/*
 * Return the request's primary poll entry: &req->poll for an
 * IORING_OP_POLL_ADD request, &req->apoll->poll for every other opcode.
 */
4887 static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4888 {
4889         if (req->opcode == IORING_OP_POLL_ADD)
4890                 return &req->poll;
4891         return &req->apoll->poll;
4892 }
4893
/*
 * Detach the request's second poll entry, if there is one and it is still
 * attached to a wait queue head.
 *
 * Under the wait queue head's lock the entry is removed from the list, the
 * request reference is dropped when wait.private is still set, and
 * poll->head is cleared. The caller must hold ctx->completion_lock, which
 * is asserted via lockdep.
 */
4894 static void io_poll_remove_double(struct io_kiocb *req)
4895         __must_hold(&req->ctx->completion_lock)
4896 {
4897         struct io_poll_iocb *poll = io_poll_get_double(req);
4898
4899         lockdep_assert_held(&req->ctx->completion_lock);
4900
4901         if (poll && poll->head) {
                     /* keep a local copy: poll->head is cleared before the unlock */
4902                 struct wait_queue_head *head = poll->head;
4903
4904                 spin_lock(&head->lock);
4905                 list_del_init(&poll->wait.entry);
                     /* io_poll_double_wake() clears ->private once it has dropped the ref */
4906                 if (poll->wait.private)
4907                         req_ref_put(req);
4908                 poll->head = NULL;
4909                 spin_unlock(&head->lock);
4910         }
4911 }
4912
/*
 * Post a completion event for a poll request; the caller must hold
 * ctx->completion_lock.
 *
 * The event result is -ECANCELED if req->poll.canceled is set (which also
 * forces EPOLLONESHOT onto req->poll.events), otherwise mangle_poll(@mask).
 * The event carries IORING_CQE_F_MORE unless the poll is one-shot. If
 * io_cqring_fill_event() fails, the wait queue entries are removed, the
 * poll is marked done and the MORE flag is dropped. While MORE remains set,
 * ctx->cq_extra is incremented. The CQ ring is committed in all cases.
 *
 * Returns true when the request is finished (no IORING_CQE_F_MORE), false
 * when further events are to be expected.
 */
4913 static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
4914         __must_hold(&req->ctx->completion_lock)
4915 {
4916         struct io_ring_ctx *ctx = req->ctx;
4917         unsigned flags = IORING_CQE_F_MORE;
4918         int error;
4919
4920         if (READ_ONCE(req->poll.canceled)) {
4921                 error = -ECANCELED;
4922                 req->poll.events |= EPOLLONESHOT;
4923         } else {
4924                 error = mangle_poll(mask);
4925         }
4926         if (req->poll.events & EPOLLONESHOT)
4927                 flags = 0;
4928         if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
4929                 io_poll_remove_waitqs(req);
4930                 req->poll.done = true;
4931                 flags = 0;
4932         }
4933         if (flags & IORING_CQE_F_MORE)
4934                 ctx->cq_extra++;
4935
4936         io_commit_cqring(ctx);
4937         return !(flags & IORING_CQE_F_MORE);
4938 }
4939
/*
 * Task-work callback for a poll request (the request is recovered from @cb
 * via its task_work member).
 *
 * If io_poll_rewait() re-armed the poll, only the completion lock it took
 * is released. Otherwise a completion is posted with io_poll_complete():
 *  - when that reports the request as done, it is removed from the hash,
 *    and after unlocking the request is put via io_put_req_find_next(); a
 *    request returned by that call is submitted with
 *    __io_req_task_submit();
 *  - when more events are expected, req->result is reset and the wait entry
 *    is queued on req->poll.head again.
 * io_cqring_ev_posted() is called after the lock has been dropped.
 */
4940 static void io_poll_task_func(struct callback_head *cb)
4941 {
4942         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4943         struct io_ring_ctx *ctx = req->ctx;
4944         struct io_kiocb *nxt;
4945
             /* io_poll_rewait() returns with ctx->completion_lock held */
4946         if (io_poll_rewait(req, &req->poll)) {
4947                 spin_unlock_irq(&ctx->completion_lock);
4948         } else {
4949                 bool done;
4950
4951                 done = io_poll_complete(req, req->result);
4952                 if (done) {
4953                         hash_del(&req->hash_node);
4954                 } else {
4955                         req->result = 0;
4956                         add_wait_queue(req->poll.head, &req->poll.wait);
4957                 }
4958                 spin_unlock_irq(&ctx->completion_lock);
4959                 io_cqring_ev_posted(ctx);
4960
4961                 if (done) {
4962                         nxt = io_put_req_find_next(req);
4963                         if (nxt)
4964                                 __io_req_task_submit(nxt);
4965                 }
4966         }
4967 }
4968
/*
 * Wake function of the second poll entry (installed by __io_queue_proc()).
 * wait->private holds the request; the primary entry is looked up with
 * io_poll_get_single().
 *
 * - A non-zero key mask that shares no bits with the primary entry's events
 *   is ignored (returns 0).
 * - If the primary entry is not one-shot, the wakeup is forwarded to the
 *   primary entry's wake function and its return value is returned.
 * - Otherwise this entry is removed from its wait queue. If the primary
 *   entry is attached to a head, then under that head's lock the primary
 *   entry is removed as well (when still queued) and wait->private is
 *   cleared; if the primary entry had still been queued, its wake function
 *   is invoked after the unlock. Finally the request reference is dropped
 *   and 1 is returned.
 */
4969 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4970                                int sync, void *key)
4971 {
4972         struct io_kiocb *req = wait->private;
4973         struct io_poll_iocb *poll = io_poll_get_single(req);
4974         __poll_t mask = key_to_poll(key);
4975
4976         /* for instances that support it check for an event match first: */
4977         if (mask && !(mask & poll->events))
4978                 return 0;
4979         if (!(poll->events & EPOLLONESHOT))
4980                 return poll->wait.func(&poll->wait, mode, sync, key);
4981
4982         list_del_init(&wait->entry);
4983
4984         if (poll && poll->head) {
4985                 bool done;
4986
4987                 spin_lock(&poll->head->lock);
                     /* "done" means the primary entry was already off its queue */
4988                 done = list_empty(&poll->wait.entry);
4989                 if (!done)
4990                         list_del_init(&poll->wait.entry);
4991                 /* make sure double remove sees this as being gone */
4992                 wait->private = NULL;
4993                 spin_unlock(&poll->head->lock);
4994                 if (!done) {
4995                         /* use wait func handler, so it matches the rq type */
4996                         poll->wait.func(&poll->wait, mode, sync, key);
4997                 }
4998         }
4999         req_ref_put(req);
5000         return 1;
5001 }
5002
/*
 * Initialise @poll: no wait queue head, not done, not canceled, an empty
 * wait list entry, and @wake_func as the wait entry's wake function.
 *
 * The stored event mask is @events with the IO_POLL_UNMASK bits (EPOLLERR,
 * EPOLLHUP, EPOLLNVAL, EPOLLRDHUP) always added.
 */
5003 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5004                               wait_queue_func_t wake_func)
5005 {
5006         poll->head = NULL;
5007         poll->done = false;
5008         poll->canceled = false;
5009 #define IO_POLL_UNMASK  (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5010         /* mask in events that we always want/need */
5011         poll->events = events | IO_POLL_UNMASK;
5012         INIT_LIST_HEAD(&poll->wait.entry);
5013         init_waitqueue_func_entry(&poll->wait, wake_func);
5014 }
5015
/*
 * Attach a poll entry of the request in @pt to the wait queue @head.
 *
 * On the first call for a request, @poll itself is queued. If @poll is
 * already attached to a head, a second entry is set up instead:
 *  - if *@poll_ptr is already set, pt->error becomes -EINVAL;
 *  - the first entry is switched to one-shot mode;
 *  - if @head is the head the first entry is already on, nothing more is
 *    done;
 *  - otherwise a new io_poll_iocb is allocated with GFP_ATOMIC (-ENOMEM in
 *    pt->error on failure), initialised with the first entry's events and
 *    io_poll_double_wake() as wake function, given a reference on the
 *    request, and published through *@poll_ptr.
 *
 * On success pt->error is set to 0, poll->head records @head, and the wait
 * entry is queued, exclusively if EPOLLEXCLUSIVE is set in its events.
 */
5016 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
5017                             struct wait_queue_head *head,
5018                             struct io_poll_iocb **poll_ptr)
5019 {
5020         struct io_kiocb *req = pt->req;
5021
5022         /*
5023          * If poll->head is already set, it's because the file being polled
5024          * uses multiple waitqueues for poll handling (eg one for read, one
5025          * for write). Setup a separate io_poll_iocb if this happens.
5026          */
5027         if (unlikely(poll->head)) {
5028                 struct io_poll_iocb *poll_one = poll;
5029
5030                 /* already have a 2nd entry, fail a third attempt */
5031                 if (*poll_ptr) {
5032                         pt->error = -EINVAL;
5033                         return;
5034                 }
5035                 /*
5036                  * Can't handle multishot for double wait for now, turn it
5037                  * into one-shot mode.
5038                  */
5039                 if (!(poll_one->events & EPOLLONESHOT))
5040                         poll_one->events |= EPOLLONESHOT;
5041                 /* double add on the same waitqueue head, ignore */
5042                 if (poll_one->head == head)
5043                         return;
5044                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5045                 if (!poll) {
5046                         pt->error = -ENOMEM;
5047                         return;
5048                 }
5049                 io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
                     /* the second entry holds its own reference on the request */
5050                 req_ref_get(req);
5051                 poll->wait.private = req;
5052                 *poll_ptr = poll;
5053         }
5054
5055         pt->error = 0;
5056         poll->head = head;
5057
5058         if (poll->events & EPOLLEXCLUSIVE)
5059                 add_wait_queue_exclusive(head, &poll->wait);
5060         else
5061                 add_wait_queue(head, &poll->wait);
5062 }
5063
5064 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5065                                struct poll_table_struct *p)
5066 {
5067         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5068         struct async_poll *apoll = pt->req->apoll;
5069
5070         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5071 }
5072
5073 static void io_async_task_func(struct callback_head *cb)
5074 {
5075         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
5076         struct async_poll *apoll = req->apoll;
5077         struct io_ring_ctx *ctx = req->ctx;
5078
5079         trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
5080
5081         if (io_poll_rewait(req, &apoll->poll)) {
5082                 spin_unlock_irq(&ctx->completion_lock);
5083                 return;
5084         }
5085
5086         hash_del(&req->hash_node);
5087         io_poll_remove_double(req);
5088         spin_unlock_irq(&ctx->completion_lock);
5089
5090         if (!READ_ONCE(apoll->poll.canceled))
5091                 __io_req_task_submit(req);
5092         else
5093                 io_req_complete_failed(req, -ECANCELED);
5094 }
5095
5096 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5097                         void *key)
5098 {
5099         struct io_kiocb *req = wait->private;
5100         struct io_poll_iocb *poll = &req->apoll->poll;
5101
5102         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5103                                         key_to_poll(key));
5104
5105         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5106 }
5107
5108 static void io_poll_req_insert(struct io_kiocb *req)
5109 {
5110         struct io_ring_ctx *ctx = req->ctx;
5111         struct hlist_head *list;
5112
5113         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5114         hlist_add_head(&req->hash_node, list);
5115 }
5116
5117 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5118                                       struct io_poll_iocb *poll,
5119                                       struct io_poll_table *ipt, __poll_t mask,
5120                                       wait_queue_func_t wake_func)
5121         __acquires(&ctx->completion_lock)
5122 {
5123         struct io_ring_ctx *ctx = req->ctx;
5124         bool cancel = false;
5125
5126         INIT_HLIST_NODE(&req->hash_node);
5127         io_init_poll_iocb(poll, mask, wake_func);
5128         poll->file = req->file;
5129         poll->wait.private = req;
5130
5131         ipt->pt._key = mask;
5132         ipt->req = req;
5133         ipt->error = -EINVAL;
5134
5135         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5136
5137         spin_lock_irq(&ctx->completion_lock);
5138         if (likely(poll->head)) {
5139                 spin_lock(&poll->head->lock);
5140                 if (unlikely(list_empty(&poll->wait.entry))) {
5141                         if (ipt->error)
5142                                 cancel = true;
5143                         ipt->error = 0;
5144                         mask = 0;
5145                 }
5146                 if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
5147                         list_del_init(&poll->wait.entry);
5148                 else if (cancel)
5149                         WRITE_ONCE(poll->canceled, true);
5150                 else if (!poll->done) /* actually waiting for an event */
5151                         io_poll_req_insert(req);
5152                 spin_unlock(&poll->head->lock);
5153         }
5154
5155         return mask;
5156 }
5157
5158 static bool io_arm_poll_handler(struct io_kiocb *req)
5159 {
5160         const struct io_op_def *def = &io_op_defs[req->opcode];
5161         struct io_ring_ctx *ctx = req->ctx;
5162         struct async_poll *apoll;
5163         struct io_poll_table ipt;
5164         __poll_t mask, ret;
5165         int rw;
5166
5167         if (!req->file || !file_can_poll(req->file))
5168                 return false;
5169         if (req->flags & REQ_F_POLLED)
5170                 return false;
5171         if (def->pollin)
5172                 rw = READ;
5173         else if (def->pollout)
5174                 rw = WRITE;
5175         else
5176                 return false;
5177         /* if we can't nonblock try, then no point in arming a poll handler */
5178         if (!io_file_supports_async(req, rw))
5179                 return false;
5180
5181         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5182         if (unlikely(!apoll))
5183                 return false;
5184         apoll->double_poll = NULL;
5185
5186         req->flags |= REQ_F_POLLED;
5187         req->apoll = apoll;
5188
5189         mask = EPOLLONESHOT;
5190         if (def->pollin)
5191                 mask |= POLLIN | POLLRDNORM;
5192         if (def->pollout)
5193                 mask |= POLLOUT | POLLWRNORM;
5194
5195         /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5196         if ((req->opcode == IORING_OP_RECVMSG) &&
5197             (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5198                 mask &= ~POLLIN;
5199
5200         mask |= POLLERR | POLLPRI;
5201
5202         ipt.pt._qproc = io_async_queue_proc;
5203
5204         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5205                                         io_async_wake);
5206         if (ret || ipt.error) {
5207                 io_poll_remove_double(req);
5208                 spin_unlock_irq(&ctx->completion_lock);
5209                 return false;
5210         }
5211         spin_unlock_irq(&ctx->completion_lock);
5212         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5213                                         apoll->poll.events);
5214         return true;
5215 }
5216
5217 static bool __io_poll_remove_one(struct io_kiocb *req,
5218                                  struct io_poll_iocb *poll, bool do_cancel)
5219         __must_hold(&req->ctx->completion_lock)
5220 {
5221         bool do_complete = false;
5222
5223         if (!poll->head)
5224                 return false;
5225         spin_lock(&poll->head->lock);
5226         if (do_cancel)
5227                 WRITE_ONCE(poll->canceled, true);
5228         if (!list_empty(&poll->wait.entry)) {
5229                 list_del_init(&poll->wait.entry);
5230                 do_complete = true;
5231         }
5232         spin_unlock(&poll->head->lock);
5233         hash_del(&req->hash_node);
5234         return do_complete;
5235 }
5236
5237 static bool io_poll_remove_waitqs(struct io_kiocb *req)
5238         __must_hold(&req->ctx->completion_lock)
5239 {
5240         bool do_complete;
5241
5242         io_poll_remove_double(req);
5243         do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
5244
5245         if (req->opcode != IORING_OP_POLL_ADD && do_complete) {
5246                 /* non-poll requests have submit ref still */
5247                 req_ref_put(req);
5248         }
5249         return do_complete;
5250 }
5251
5252 static bool io_poll_remove_one(struct io_kiocb *req)
5253         __must_hold(&req->ctx->completion_lock)
5254 {
5255         bool do_complete;
5256
5257         do_complete = io_poll_remove_waitqs(req);
5258         if (do_complete) {
5259                 io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
5260                 io_commit_cqring(req->ctx);
5261                 req_set_fail(req);
5262                 io_put_req_deferred(req, 1);
5263         }
5264
5265         return do_complete;
5266 }
5267
5268 /*
5269  * Returns true if we found and killed one or more poll requests
5270  */
5271 static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5272                                bool cancel_all)
5273 {
5274         struct hlist_node *tmp;
5275         struct io_kiocb *req;
5276         int posted = 0, i;
5277
5278         spin_lock_irq(&ctx->completion_lock);
5279         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5280                 struct hlist_head *list;
5281
5282                 list = &ctx->cancel_hash[i];
5283                 hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5284                         if (io_match_task(req, tsk, cancel_all))
5285                                 posted += io_poll_remove_one(req);
5286                 }
5287         }
5288         spin_unlock_irq(&ctx->completion_lock);
5289
5290         if (posted)
5291                 io_cqring_ev_posted(ctx);
5292
5293         return posted != 0;
5294 }
5295
5296 static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5297                                      bool poll_only)
5298         __must_hold(&ctx->completion_lock)
5299 {
5300         struct hlist_head *list;
5301         struct io_kiocb *req;
5302
5303         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5304         hlist_for_each_entry(req, list, hash_node) {
5305                 if (sqe_addr != req->user_data)
5306                         continue;
5307                 if (poll_only && req->opcode != IORING_OP_POLL_ADD)
5308                         continue;
5309                 return req;
5310         }
5311         return NULL;
5312 }
5313
5314 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5315                           bool poll_only)
5316         __must_hold(&ctx->completion_lock)
5317 {
5318         struct io_kiocb *req;
5319
5320         req = io_poll_find(ctx, sqe_addr, poll_only);
5321         if (!req)
5322                 return -ENOENT;
5323         if (io_poll_remove_one(req))
5324                 return 0;
5325
5326         return -EALREADY;
5327 }
5328
5329 static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5330                                      unsigned int flags)
5331 {
5332         u32 events;
5333
5334         events = READ_ONCE(sqe->poll32_events);
5335 #ifdef __BIG_ENDIAN
5336         events = swahw32(events);
5337 #endif
5338         if (!(flags & IORING_POLL_ADD_MULTI))
5339                 events |= EPOLLONESHOT;
5340         return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
5341 }
5342
5343 static int io_poll_update_prep(struct io_kiocb *req,
5344                                const struct io_uring_sqe *sqe)
5345 {
5346         struct io_poll_update *upd = &req->poll_update;
5347         u32 flags;
5348
5349         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5350                 return -EINVAL;
5351         if (sqe->ioprio || sqe->buf_index)
5352                 return -EINVAL;
5353         flags = READ_ONCE(sqe->len);
5354         if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5355                       IORING_POLL_ADD_MULTI))
5356                 return -EINVAL;
5357         /* meaningless without update */
5358         if (flags == IORING_POLL_ADD_MULTI)
5359                 return -EINVAL;
5360
5361         upd->old_user_data = READ_ONCE(sqe->addr);
5362         upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5363         upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
5364
5365         upd->new_user_data = READ_ONCE(sqe->off);
5366         if (!upd->update_user_data && upd->new_user_data)
5367                 return -EINVAL;
5368         if (upd->update_events)
5369                 upd->events = io_poll_parse_events(sqe, flags);
5370         else if (sqe->poll32_events)
5371                 return -EINVAL;
5372
5373         return 0;
5374 }
5375
5376 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5377                         void *key)
5378 {
5379         struct io_kiocb *req = wait->private;
5380         struct io_poll_iocb *poll = &req->poll;
5381
5382         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5383 }
5384
5385 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5386                                struct poll_table_struct *p)
5387 {
5388         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5389
5390         __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5391 }
5392
5393 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5394 {
5395         struct io_poll_iocb *poll = &req->poll;
5396         u32 flags;
5397
5398         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5399                 return -EINVAL;
5400         if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
5401                 return -EINVAL;
5402         flags = READ_ONCE(sqe->len);
5403         if (flags & ~IORING_POLL_ADD_MULTI)
5404                 return -EINVAL;
5405
5406         poll->events = io_poll_parse_events(sqe, flags);
5407         return 0;
5408 }
5409
5410 static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5411 {
5412         struct io_poll_iocb *poll = &req->poll;
5413         struct io_ring_ctx *ctx = req->ctx;
5414         struct io_poll_table ipt;
5415         __poll_t mask;
5416
5417         ipt.pt._qproc = io_poll_queue_proc;
5418
5419         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5420                                         io_poll_wake);
5421
5422         if (mask) { /* no async, we'd stolen it */
5423                 ipt.error = 0;
5424                 io_poll_complete(req, mask);
5425         }
5426         spin_unlock_irq(&ctx->completion_lock);
5427
5428         if (mask) {
5429                 io_cqring_ev_posted(ctx);
5430                 if (poll->events & EPOLLONESHOT)
5431                         io_put_req(req);
5432         }
5433         return ipt.error;
5434 }
5435
5436 static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
5437 {
5438         struct io_ring_ctx *ctx = req->ctx;
5439         struct io_kiocb *preq;
5440         bool completing;
5441         int ret;
5442
5443         spin_lock_irq(&ctx->completion_lock);
5444         preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
5445         if (!preq) {
5446                 ret = -ENOENT;
5447                 goto err;
5448         }
5449
5450         if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
5451                 completing = true;
5452                 ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
5453                 goto err;
5454         }
5455
5456         /*
5457          * Don't allow racy completion with singleshot, as we cannot safely
5458          * update those. For multishot, if we're racing with completion, just
5459          * let completion re-add it.
5460          */
5461         completing = !__io_poll_remove_one(preq, &preq->poll, false);
5462         if (completing && (preq->poll.events & EPOLLONESHOT)) {
5463                 ret = -EALREADY;
5464                 goto err;
5465         }
5466         /* we now have a detached poll request. reissue. */
5467         ret = 0;
5468 err:
5469         if (ret < 0) {
5470                 spin_unlock_irq(&ctx->completion_lock);
5471                 req_set_fail(req);
5472                 io_req_complete(req, ret);
5473                 return 0;
5474         }
5475         /* only mask one event flags, keep behavior flags */
5476         if (req->poll_update.update_events) {
5477                 preq->poll.events &= ~0xffff;
5478                 preq->poll.events |= req->poll_update.events & 0xffff;
5479                 preq->poll.events |= IO_POLL_UNMASK;
5480         }
5481         if (req->poll_update.update_user_data)
5482                 preq->user_data = req->poll_update.new_user_data;
5483         spin_unlock_irq(&ctx->completion_lock);
5484
5485         /* complete update request, we're done with it */
5486         io_req_complete(req, ret);
5487
5488         if (!completing) {
5489                 ret = io_poll_add(preq, issue_flags);
5490                 if (ret < 0) {
5491                         req_set_fail(preq);
5492                         io_req_complete(preq, ret);
5493                 }
5494         }
5495         return 0;
5496 }
5497
5498 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5499 {
5500         struct io_timeout_data *data = container_of(timer,
5501                                                 struct io_timeout_data, timer);
5502         struct io_kiocb *req = data->req;
5503         struct io_ring_ctx *ctx = req->ctx;
5504         unsigned long flags;
5505
5506         spin_lock_irqsave(&ctx->completion_lock, flags);
5507         list_del_init(&req->timeout.list);
5508         atomic_set(&req->ctx->cq_timeouts,
5509                 atomic_read(&req->ctx->cq_timeouts) + 1);
5510
5511         io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
5512         io_commit_cqring(ctx);
5513         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5514
5515         io_cqring_ev_posted(ctx);
5516         req_set_fail(req);
5517         io_put_req(req);
5518         return HRTIMER_NORESTART;
5519 }
5520
5521 static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5522                                            __u64 user_data)
5523         __must_hold(&ctx->completion_lock)
5524 {
5525         struct io_timeout_data *io;
5526         struct io_kiocb *req;
5527         bool found = false;
5528
5529         list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5530                 found = user_data == req->user_data;
5531                 if (found)
5532                         break;
5533         }
5534         if (!found)
5535                 return ERR_PTR(-ENOENT);
5536
5537         io = req->async_data;
5538         if (hrtimer_try_to_cancel(&io->timer) == -1)
5539                 return ERR_PTR(-EALREADY);
5540         list_del_init(&req->timeout.list);
5541         return req;
5542 }
5543
5544 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5545         __must_hold(&ctx->completion_lock)
5546 {
5547         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5548
5549         if (IS_ERR(req))
5550                 return PTR_ERR(req);
5551
5552         req_set_fail(req);
5553         io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
5554         io_put_req_deferred(req, 1);
5555         return 0;
5556 }
5557
5558 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
5559                              struct timespec64 *ts, enum hrtimer_mode mode)
5560         __must_hold(&ctx->completion_lock)
5561 {
5562         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
5563         struct io_timeout_data *data;
5564
5565         if (IS_ERR(req))
5566                 return PTR_ERR(req);
5567
5568         req->timeout.off = 0; /* noseq */
5569         data = req->async_data;
5570         list_add_tail(&req->timeout.list, &ctx->timeout_list);
5571         hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
5572         data->timer.function = io_timeout_fn;
5573         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
5574         return 0;
5575 }
5576
5577 static int io_timeout_remove_prep(struct io_kiocb *req,
5578                                   const struct io_uring_sqe *sqe)
5579 {
5580         struct io_timeout_rem *tr = &req->timeout_rem;
5581
5582         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5583                 return -EINVAL;
5584         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5585                 return -EINVAL;
5586         if (sqe->ioprio || sqe->buf_index || sqe->len)
5587                 return -EINVAL;
5588
5589         tr->addr = READ_ONCE(sqe->addr);
5590         tr->flags = READ_ONCE(sqe->timeout_flags);
5591         if (tr->flags & IORING_TIMEOUT_UPDATE) {
5592                 if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
5593                         return -EINVAL;
5594                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
5595                         return -EFAULT;
5596         } else if (tr->flags) {
5597                 /* timeout removal doesn't support flags */
5598                 return -EINVAL;
5599         }
5600
5601         return 0;
5602 }
5603
5604 static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
5605 {
5606         return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
5607                                             : HRTIMER_MODE_REL;
5608 }
5609
5610 /*
5611  * Remove or update an existing timeout command
5612  */
5613 static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
5614 {
5615         struct io_timeout_rem *tr = &req->timeout_rem;
5616         struct io_ring_ctx *ctx = req->ctx;
5617         int ret;
5618
5619         spin_lock_irq(&ctx->completion_lock);
5620         if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
5621                 ret = io_timeout_cancel(ctx, tr->addr);
5622         else
5623                 ret = io_timeout_update(ctx, tr->addr, &tr->ts,
5624                                         io_translate_timeout_mode(tr->flags));
5625
5626         io_cqring_fill_event(ctx, req->user_data, ret, 0);
5627         io_commit_cqring(ctx);
5628         spin_unlock_irq(&ctx->completion_lock);
5629         io_cqring_ev_posted(ctx);
5630         if (ret < 0)
5631                 req_set_fail(req);
5632         io_put_req(req);
5633         return 0;
5634 }
5635
5636 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5637                            bool is_timeout_link)
5638 {
5639         struct io_timeout_data *data;
5640         unsigned flags;
5641         u32 off = READ_ONCE(sqe->off);
5642
5643         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5644                 return -EINVAL;
5645         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5646                 return -EINVAL;
5647         if (off && is_timeout_link)
5648                 return -EINVAL;
5649         flags = READ_ONCE(sqe->timeout_flags);
5650         if (flags & ~IORING_TIMEOUT_ABS)
5651                 return -EINVAL;
5652
5653         req->timeout.off = off;
5654
5655         if (!req->async_data && io_alloc_async_data(req))
5656                 return -ENOMEM;
5657
5658         data = req->async_data;
5659         data->req = req;
5660
5661         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5662                 return -EFAULT;
5663
5664         data->mode = io_translate_timeout_mode(flags);
5665         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5666         if (is_timeout_link)
5667                 io_req_track_inflight(req);
5668         return 0;
5669 }
5670
5671 static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
5672 {
5673         struct io_ring_ctx *ctx = req->ctx;
5674         struct io_timeout_data *data = req->async_data;
5675         struct list_head *entry;
5676         u32 tail, off = req->timeout.off;
5677
5678         spin_lock_irq(&ctx->completion_lock);
5679
5680         /*
5681          * sqe->off holds how many events that need to occur for this
5682          * timeout event to be satisfied. If it isn't set, then this is
5683          * a pure timeout request, sequence isn't used.
5684          */
5685         if (io_is_timeout_noseq(req)) {
5686                 entry = ctx->timeout_list.prev;
5687                 goto add;
5688         }
5689
5690         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5691         req->timeout.target_seq = tail + off;
5692
5693         /* Update the last seq here in case io_flush_timeouts() hasn't.
5694          * This is safe because ->completion_lock is held, and submissions
5695          * and completions are never mixed in the same ->completion_lock section.
5696          */
5697         ctx->cq_last_tm_flush = tail;
5698
5699         /*
5700          * Insertion sort, ensuring the first entry in the list is always
5701          * the one we need first.
5702          */
5703         list_for_each_prev(entry, &ctx->timeout_list) {
5704                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5705                                                   timeout.list);
5706
5707                 if (io_is_timeout_noseq(nxt))
5708                         continue;
5709                 /* nxt.seq is behind @tail, otherwise would've been completed */
5710                 if (off >= nxt->timeout.target_seq - tail)
5711                         break;
5712         }
5713 add:
5714         list_add(&req->timeout.list, entry);
5715         data->timer.function = io_timeout_fn;
5716         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5717         spin_unlock_irq(&ctx->completion_lock);
5718         return 0;
5719 }
5720
/* Match criteria passed as the opaque data argument to io_cancel_cb(). */
struct io_cancel_data {
        struct io_ring_ctx *ctx;        /* ring the work's request must belong to */
        u64 user_data;                  /* user_data the work's request must carry */
};
5725
5726 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5727 {
5728         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5729         struct io_cancel_data *cd = data;
5730
5731         return req->ctx == cd->ctx && req->user_data == cd->user_data;
5732 }
5733
5734 static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
5735                                struct io_ring_ctx *ctx)
5736 {
5737         struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
5738         enum io_wq_cancel cancel_ret;
5739         int ret = 0;
5740
5741         if (!tctx || !tctx->io_wq)
5742                 return -ENOENT;
5743
5744         cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
5745         switch (cancel_ret) {
5746         case IO_WQ_CANCEL_OK:
5747                 ret = 0;
5748                 break;
5749         case IO_WQ_CANCEL_RUNNING:
5750                 ret = -EALREADY;
5751                 break;
5752         case IO_WQ_CANCEL_NOTFOUND:
5753                 ret = -ENOENT;
5754                 break;
5755         }
5756
5757         return ret;
5758 }
5759
5760 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5761                                      struct io_kiocb *req, __u64 sqe_addr,
5762                                      int success_ret)
5763 {
5764         unsigned long flags;
5765         int ret;
5766
5767         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
5768         spin_lock_irqsave(&ctx->completion_lock, flags);
5769         if (ret != -ENOENT)
5770                 goto done;
5771         ret = io_timeout_cancel(ctx, sqe_addr);
5772         if (ret != -ENOENT)
5773                 goto done;
5774         ret = io_poll_cancel(ctx, sqe_addr, false);
5775 done:
5776         if (!ret)
5777                 ret = success_ret;
5778         io_cqring_fill_event(ctx, req->user_data, ret, 0);
5779         io_commit_cqring(ctx);
5780         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5781         io_cqring_ev_posted(ctx);
5782
5783         if (ret < 0)
5784                 req_set_fail(req);
5785 }
5786
5787 static int io_async_cancel_prep(struct io_kiocb *req,
5788                                 const struct io_uring_sqe *sqe)
5789 {
5790         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5791                 return -EINVAL;
5792         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5793                 return -EINVAL;
5794         if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
5795                 return -EINVAL;
5796
5797         req->cancel.addr = READ_ONCE(sqe->addr);
5798         return 0;
5799 }
5800
5801 static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
5802 {
5803         struct io_ring_ctx *ctx = req->ctx;
5804         u64 sqe_addr = req->cancel.addr;
5805         struct io_tctx_node *node;
5806         int ret;
5807
5808         /* tasks should wait for their io-wq threads, so safe w/o sync */
5809         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
5810         spin_lock_irq(&ctx->completion_lock);
5811         if (ret != -ENOENT)
5812                 goto done;
5813         ret = io_timeout_cancel(ctx, sqe_addr);
5814         if (ret != -ENOENT)
5815                 goto done;
5816         ret = io_poll_cancel(ctx, sqe_addr, false);
5817         if (ret != -ENOENT)
5818                 goto done;
5819         spin_unlock_irq(&ctx->completion_lock);
5820
5821         /* slow path, try all io-wq's */
5822         io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
5823         ret = -ENOENT;
5824         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
5825                 struct io_uring_task *tctx = node->task->io_uring;
5826
5827                 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
5828                 if (ret != -ENOENT)
5829                         break;
5830         }
5831         io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
5832
5833         spin_lock_irq(&ctx->completion_lock);
5834 done:
5835         io_cqring_fill_event(ctx, req->user_data, ret, 0);
5836         io_commit_cqring(ctx);
5837         spin_unlock_irq(&ctx->completion_lock);
5838         io_cqring_ev_posted(ctx);
5839
5840         if (ret < 0)
5841                 req_set_fail(req);
5842         io_put_req(req);
5843         return 0;
5844 }
5845
5846 static int io_rsrc_update_prep(struct io_kiocb *req,
5847                                 const struct io_uring_sqe *sqe)
5848 {
5849         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5850                 return -EINVAL;
5851         if (sqe->ioprio || sqe->rw_flags)
5852                 return -EINVAL;
5853
5854         req->rsrc_update.offset = READ_ONCE(sqe->off);
5855         req->rsrc_update.nr_args = READ_ONCE(sqe->len);
5856         if (!req->rsrc_update.nr_args)
5857                 return -EINVAL;
5858         req->rsrc_update.arg = READ_ONCE(sqe->addr);
5859         return 0;
5860 }
5861
5862 static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
5863 {
5864         struct io_ring_ctx *ctx = req->ctx;
5865         struct io_uring_rsrc_update2 up;
5866         int ret;
5867
5868         if (issue_flags & IO_URING_F_NONBLOCK)
5869                 return -EAGAIN;
5870
5871         up.offset = req->rsrc_update.offset;
5872         up.data = req->rsrc_update.arg;
5873         up.nr = 0;
5874         up.tags = 0;
5875         up.resv = 0;
5876
5877         mutex_lock(&ctx->uring_lock);
5878         ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
5879                                         &up, req->rsrc_update.nr_args);
5880         mutex_unlock(&ctx->uring_lock);
5881
5882         if (ret < 0)
5883                 req_set_fail(req);
5884         __io_req_complete(req, issue_flags, ret, 0);
5885         return 0;
5886 }
5887
/*
 * Run the opcode-specific prep handler for @req against @sqe.
 *
 * Returns whatever the selected handler returns; IORING_OP_NOP has no
 * handler and yields 0. An opcode without a case below triggers a one-time
 * kernel warning and fails with -EINVAL.
 */
5888 static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5889 {
5890         switch (req->opcode) {
5891         case IORING_OP_NOP:
5892                 return 0;
             /* all three read flavours share one prep handler */
5893         case IORING_OP_READV:
5894         case IORING_OP_READ_FIXED:
5895         case IORING_OP_READ:
5896                 return io_read_prep(req, sqe);
             /* likewise for the three write flavours */
5897         case IORING_OP_WRITEV:
5898         case IORING_OP_WRITE_FIXED:
5899         case IORING_OP_WRITE:
5900                 return io_write_prep(req, sqe);
5901         case IORING_OP_POLL_ADD:
5902                 return io_poll_add_prep(req, sqe);
             /* POLL_REMOVE is prepared by the poll update handler */
5903         case IORING_OP_POLL_REMOVE:
5904                 return io_poll_update_prep(req, sqe);
5905         case IORING_OP_FSYNC:
5906                 return io_fsync_prep(req, sqe);
5907         case IORING_OP_SYNC_FILE_RANGE:
5908                 return io_sfr_prep(req, sqe);
             /* SEND reuses the SENDMSG prep, RECV the RECVMSG prep */
5909         case IORING_OP_SENDMSG:
5910         case IORING_OP_SEND:
5911                 return io_sendmsg_prep(req, sqe);
5912         case IORING_OP_RECVMSG:
5913         case IORING_OP_RECV:
5914                 return io_recvmsg_prep(req, sqe);
5915         case IORING_OP_CONNECT:
5916                 return io_connect_prep(req, sqe);
             /* TIMEOUT and LINK_TIMEOUT differ only in the last argument */
5917         case IORING_OP_TIMEOUT:
5918                 return io_timeout_prep(req, sqe, false);
5919         case IORING_OP_TIMEOUT_REMOVE:
5920                 return io_timeout_remove_prep(req, sqe);
5921         case IORING_OP_ASYNC_CANCEL:
5922                 return io_async_cancel_prep(req, sqe);
5923         case IORING_OP_LINK_TIMEOUT:
5924                 return io_timeout_prep(req, sqe, true);
5925         case IORING_OP_ACCEPT:
5926                 return io_accept_prep(req, sqe);
5927         case IORING_OP_FALLOCATE:
5928                 return io_fallocate_prep(req, sqe);
5929         case IORING_OP_OPENAT:
5930                 return io_openat_prep(req, sqe);
5931         case IORING_OP_CLOSE:
5932                 return io_close_prep(req, sqe);
5933         case IORING_OP_FILES_UPDATE:
5934                 return io_rsrc_update_prep(req, sqe);
5935         case IORING_OP_STATX:
5936                 return io_statx_prep(req, sqe);
5937         case IORING_OP_FADVISE:
5938                 return io_fadvise_prep(req, sqe);
5939         case IORING_OP_MADVISE:
5940                 return io_madvise_prep(req, sqe);
5941         case IORING_OP_OPENAT2:
5942                 return io_openat2_prep(req, sqe);
5943         case IORING_OP_EPOLL_CTL:
5944                 return io_epoll_ctl_prep(req, sqe);
5945         case IORING_OP_SPLICE:
5946                 return io_splice_prep(req, sqe);
5947         case IORING_OP_PROVIDE_BUFFERS:
5948                 return io_provide_buffers_prep(req, sqe);
5949         case IORING_OP_REMOVE_BUFFERS:
5950                 return io_remove_buffers_prep(req, sqe);
5951         case IORING_OP_TEE:
5952                 return io_tee_prep(req, sqe);
5953         case IORING_OP_SHUTDOWN:
5954                 return io_shutdown_prep(req, sqe);
5955         case IORING_OP_RENAMEAT:
5956                 return io_renameat_prep(req, sqe);
5957         case IORING_OP_UNLINKAT:
5958                 return io_unlinkat_prep(req, sqe);
5959         }
5960
             /* no case matched: warn once and reject the request */
5961         printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5962                         req->opcode);
5963         return -EINVAL;
5964 }
5965
5966 static int io_req_prep_async(struct io_kiocb *req)
5967 {
5968         if (!io_op_defs[req->opcode].needs_async_setup)
5969                 return 0;
5970         if (WARN_ON_ONCE(req->async_data))
5971                 return -EFAULT;
5972         if (io_alloc_async_data(req))
5973                 return -EAGAIN;
5974
5975         switch (req->opcode) {
5976         case IORING_OP_READV:
5977                 return io_rw_prep_async(req, READ);
5978         case IORING_OP_WRITEV:
5979                 return io_rw_prep_async(req, WRITE);
5980         case IORING_OP_SENDMSG:
5981                 return io_sendmsg_prep_async(req);
5982         case IORING_OP_RECVMSG:
5983                 return io_recvmsg_prep_async(req);
5984         case IORING_OP_CONNECT:
5985                 return io_connect_prep_async(req);
5986         }
5987         printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
5988                     req->opcode);
5989         return -EFAULT;
5990 }
5991
5992 static u32 io_get_sequence(struct io_kiocb *req)
5993 {
5994         struct io_kiocb *pos;
5995         struct io_ring_ctx *ctx = req->ctx;
5996         u32 total_submitted, nr_reqs = 0;
5997
5998         io_for_each_link(pos, req)
5999                 nr_reqs++;
6000
6001         total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
6002         return total_submitted - nr_reqs;
6003 }
6004
6005 static int io_req_defer(struct io_kiocb *req)
6006 {
6007         struct io_ring_ctx *ctx = req->ctx;
6008         struct io_defer_entry *de;
6009         int ret;
6010         u32 seq;
6011
6012         /* Still need defer if there is pending req in defer list. */
6013         if (likely(list_empty_careful(&ctx->defer_list) &&
6014                 !(req->flags & REQ_F_IO_DRAIN)))
6015                 return 0;
6016
6017         seq = io_get_sequence(req);
6018         /* Still a chance to pass the sequence check */
6019         if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
6020                 return 0;
6021
6022         ret = io_req_prep_async(req);
6023         if (ret)
6024                 return ret;
6025         io_prep_async_link(req);
6026         de = kmalloc(sizeof(*de), GFP_KERNEL);
6027         if (!de)
6028                 return -ENOMEM;
6029
6030         spin_lock_irq(&ctx->completion_lock);
6031         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
6032                 spin_unlock_irq(&ctx->completion_lock);
6033                 kfree(de);
6034                 io_queue_async_work(req);
6035                 return -EIOCBQUEUED;
6036         }
6037
6038         trace_io_uring_defer(ctx, req, req->user_data);
6039         de->req = req;
6040         de->seq = seq;
6041         list_add_tail(&de->list, &ctx->defer_list);
6042         spin_unlock_irq(&ctx->completion_lock);
6043         return -EIOCBQUEUED;
6044 }
6045
6046 static void io_clean_op(struct io_kiocb *req)
6047 {
6048         if (req->flags & REQ_F_BUFFER_SELECTED) {
6049                 switch (req->opcode) {
6050                 case IORING_OP_READV:
6051                 case IORING_OP_READ_FIXED:
6052                 case IORING_OP_READ:
6053                         kfree((void *)(unsigned long)req->rw.addr);
6054                         break;
6055                 case IORING_OP_RECVMSG:
6056                 case IORING_OP_RECV:
6057                         kfree(req->sr_msg.kbuf);
6058                         break;
6059                 }
6060                 req->flags &= ~REQ_F_BUFFER_SELECTED;
6061         }
6062
6063         if (req->flags & REQ_F_NEED_CLEANUP) {
6064                 switch (req->opcode) {
6065                 case IORING_OP_READV:
6066                 case IORING_OP_READ_FIXED:
6067                 case IORING_OP_READ:
6068                 case IORING_OP_WRITEV:
6069                 case IORING_OP_WRITE_FIXED:
6070                 case IORING_OP_WRITE: {
6071                         struct io_async_rw *io = req->async_data;
6072                         if (io->free_iovec)
6073                                 kfree(io->free_iovec);
6074                         break;
6075                         }
6076                 case IORING_OP_RECVMSG:
6077                 case IORING_OP_SENDMSG: {
6078                         struct io_async_msghdr *io = req->async_data;
6079
6080                         kfree(io->free_iov);
6081                         break;
6082                         }
6083                 case IORING_OP_SPLICE:
6084                 case IORING_OP_TEE:
6085                         if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
6086                                 io_put_file(req->splice.file_in);
6087                         break;
6088                 case IORING_OP_OPENAT:
6089                 case IORING_OP_OPENAT2:
6090                         if (req->open.filename)
6091                                 putname(req->open.filename);
6092                         break;
6093                 case IORING_OP_RENAMEAT:
6094                         putname(req->rename.oldpath);
6095                         putname(req->rename.newpath);
6096                         break;
6097                 case IORING_OP_UNLINKAT:
6098                         putname(req->unlink.filename);
6099                         break;
6100                 }
6101                 req->flags &= ~REQ_F_NEED_CLEANUP;
6102         }
6103         if ((req->flags & REQ_F_POLLED) && req->apoll) {
6104                 kfree(req->apoll->double_poll);
6105                 kfree(req->apoll);
6106                 req->apoll = NULL;
6107         }
6108         if (req->flags & REQ_F_INFLIGHT) {
6109                 struct io_uring_task *tctx = req->task->io_uring;
6110
6111                 atomic_dec(&tctx->inflight_tracked);
6112                 req->flags &= ~REQ_F_INFLIGHT;
6113         }
6114 }
6115
/*
 * Issue @req by dispatching to the handler for its opcode.
 *
 * If the request carries credentials (req->work.creds) that differ from
 * current_cred(), they are installed with override_creds() around the
 * handler call and reverted afterwards.
 *
 * Returns the handler's result when it is non-zero, -EINVAL for an opcode
 * with no case here, or 0. On 0, a request that has a file and belongs to
 * an IORING_SETUP_IOPOLL ring is additionally passed to
 * io_iopoll_req_issued().
 */
6116 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
6117 {
6118         struct io_ring_ctx *ctx = req->ctx;
6119         const struct cred *creds = NULL;
6120         int ret;
6121
             /* creds stays NULL unless an override actually happened */
6122         if (req->work.creds && req->work.creds != current_cred())
6123                 creds = override_creds(req->work.creds);
6124
6125         switch (req->opcode) {
6126         case IORING_OP_NOP:
6127                 ret = io_nop(req, issue_flags);
6128                 break;
6129         case IORING_OP_READV:
6130         case IORING_OP_READ_FIXED:
6131         case IORING_OP_READ:
6132                 ret = io_read(req, issue_flags);
6133                 break;
6134         case IORING_OP_WRITEV:
6135         case IORING_OP_WRITE_FIXED:
6136         case IORING_OP_WRITE:
6137                 ret = io_write(req, issue_flags);
6138                 break;
6139         case IORING_OP_FSYNC:
6140                 ret = io_fsync(req, issue_flags);
6141                 break;
6142         case IORING_OP_POLL_ADD:
6143                 ret = io_poll_add(req, issue_flags);
6144                 break;
             /* POLL_REMOVE is served by the poll update handler */
6145         case IORING_OP_POLL_REMOVE:
6146                 ret = io_poll_update(req, issue_flags);
6147                 break;
6148         case IORING_OP_SYNC_FILE_RANGE:
6149                 ret = io_sync_file_range(req, issue_flags);
6150                 break;
6151         case IORING_OP_SENDMSG:
6152                 ret = io_sendmsg(req, issue_flags);
6153                 break;
6154         case IORING_OP_SEND:
6155                 ret = io_send(req, issue_flags);
6156                 break;
6157         case IORING_OP_RECVMSG:
6158                 ret = io_recvmsg(req, issue_flags);
6159                 break;
6160         case IORING_OP_RECV:
6161                 ret = io_recv(req, issue_flags);
6162                 break;
6163         case IORING_OP_TIMEOUT:
6164                 ret = io_timeout(req, issue_flags);
6165                 break;
6166         case IORING_OP_TIMEOUT_REMOVE:
6167                 ret = io_timeout_remove(req, issue_flags);
6168                 break;
6169         case IORING_OP_ACCEPT:
6170                 ret = io_accept(req, issue_flags);
6171                 break;
6172         case IORING_OP_CONNECT:
6173                 ret = io_connect(req, issue_flags);
6174                 break;
6175         case IORING_OP_ASYNC_CANCEL:
6176                 ret = io_async_cancel(req, issue_flags);
6177                 break;
6178         case IORING_OP_FALLOCATE:
6179                 ret = io_fallocate(req, issue_flags);
6180                 break;
6181         case IORING_OP_OPENAT:
6182                 ret = io_openat(req, issue_flags);
6183                 break;
6184         case IORING_OP_CLOSE:
6185                 ret = io_close(req, issue_flags);
6186                 break;
6187         case IORING_OP_FILES_UPDATE:
6188                 ret = io_files_update(req, issue_flags);
6189                 break;
6190         case IORING_OP_STATX:
6191                 ret = io_statx(req, issue_flags);
6192                 break;
6193         case IORING_OP_FADVISE:
6194                 ret = io_fadvise(req, issue_flags);
6195                 break;
6196         case IORING_OP_MADVISE:
6197                 ret = io_madvise(req, issue_flags);
6198                 break;
6199         case IORING_OP_OPENAT2:
6200                 ret = io_openat2(req, issue_flags);
6201                 break;
6202         case IORING_OP_EPOLL_CTL:
6203                 ret = io_epoll_ctl(req, issue_flags);
6204                 break;
6205         case IORING_OP_SPLICE:
6206                 ret = io_splice(req, issue_flags);
6207                 break;
6208         case IORING_OP_PROVIDE_BUFFERS:
6209                 ret = io_provide_buffers(req, issue_flags);
6210                 break;
6211         case IORING_OP_REMOVE_BUFFERS:
6212                 ret = io_remove_buffers(req, issue_flags);
6213                 break;
6214         case IORING_OP_TEE:
6215                 ret = io_tee(req, issue_flags);
6216                 break;
6217         case IORING_OP_SHUTDOWN:
6218                 ret = io_shutdown(req, issue_flags);
6219                 break;
6220         case IORING_OP_RENAMEAT:
6221                 ret = io_renameat(req, issue_flags);
6222                 break;
6223         case IORING_OP_UNLINKAT:
6224                 ret = io_unlinkat(req, issue_flags);
6225                 break;
6226         default:
6227                 ret = -EINVAL;
6228                 break;
6229         }
6230
             /* restore the caller's credentials before looking at the result */
6231         if (creds)
6232                 revert_creds(creds);
6233         if (ret)
6234                 return ret;
6235         /* If the op doesn't have a file, we're not polling for it */
6236         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
6237                 io_iopoll_req_issued(req);
6238
6239         return 0;
6240 }
6241
6242 static void io_wq_submit_work(struct io_wq_work *work)
6243 {
6244         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6245         struct io_kiocb *timeout;
6246         int ret = 0;
6247
6248         timeout = io_prep_linked_timeout(req);
6249         if (timeout)
6250                 io_queue_linked_timeout(timeout);
6251
6252         if (work->flags & IO_WQ_WORK_CANCEL)
6253                 ret = -ECANCELED;
6254
6255         if (!ret) {
6256                 do {
6257                         ret = io_issue_sqe(req, 0);
6258                         /*
6259                          * We can get EAGAIN for polled IO even though we're
6260                          * forcing a sync submission from here, since we can't
6261                          * wait for request slots on the block side.
6262                          */
6263                         if (ret != -EAGAIN)
6264                                 break;
6265                         cond_resched();
6266                 } while (1);
6267         }
6268
6269         /* avoid locking problems by failing it from a clean context */
6270         if (ret) {
6271                 /* io-wq is going to take one down */
6272                 req_ref_get(req);
6273                 io_req_task_queue_fail(req, ret);
6274         }
6275 }
6276
/*
 * Flag bits kept in the low bits of a fixed file slot's file_ptr, next to
 * the struct file pointer itself:
 *   FFS_ASYNC_READ  - set when __io_file_supports_async(file, READ) is true
 *   FFS_ASYNC_WRITE - set when __io_file_supports_async(file, WRITE) is true
 *   FFS_ISREG       - set for regular files; defined as 0 (i.e. not stored)
 *                     unless CONFIG_64BIT
 *                     NOTE(review): presumably because only 64-bit builds
 *                     guarantee a third free low pointer bit - confirm.
 */
#define FFS_ASYNC_READ		0x1UL
#define FFS_ASYNC_WRITE		0x2UL
#ifdef CONFIG_64BIT
#define FFS_ISREG		0x4UL
#else
#define FFS_ISREG		0x0UL
#endif
/*
 * Mask selecting the pointer part of file_ptr. The expansion is fully
 * parenthesised so it stays a single operand wherever it is used.
 */
#define FFS_MASK		(~(FFS_ASYNC_READ | FFS_ASYNC_WRITE | FFS_ISREG))
6285
6286 static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
6287                                                       unsigned i)
6288 {
6289         struct io_fixed_file *table_l2;
6290
6291         table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT];
6292         return &table_l2[i & IORING_FILE_TABLE_MASK];
6293 }
6294
6295 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6296                                               int index)
6297 {
6298         struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
6299
6300         return (struct file *) (slot->file_ptr & FFS_MASK);
6301 }
6302
6303 static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
6304 {
6305         unsigned long file_ptr = (unsigned long) file;
6306
6307         if (__io_file_supports_async(file, READ))
6308                 file_ptr |= FFS_ASYNC_READ;
6309         if (__io_file_supports_async(file, WRITE))
6310                 file_ptr |= FFS_ASYNC_WRITE;
6311         if (S_ISREG(file_inode(file)->i_mode))
6312                 file_ptr |= FFS_ISREG;
6313         file_slot->file_ptr = file_ptr;
6314 }
6315
6316 static struct file *io_file_get(struct io_submit_state *state,
6317                                 struct io_kiocb *req, int fd, bool fixed)
6318 {
6319         struct io_ring_ctx *ctx = req->ctx;
6320         struct file *file;
6321
6322         if (fixed) {
6323                 unsigned long file_ptr;
6324
6325                 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6326                         return NULL;
6327                 fd = array_index_nospec(fd, ctx->nr_user_files);
6328                 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6329                 file = (struct file *) (file_ptr & FFS_MASK);
6330                 file_ptr &= ~FFS_MASK;
6331                 /* mask in overlapping REQ_F and FFS bits */
6332                 req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT);
6333                 io_req_set_rsrc_node(req);
6334         } else {
6335                 trace_io_uring_file_get(ctx, fd);
6336                 file = __io_file_get(state, fd);
6337
6338                 /* we don't allow fixed io_uring files */
6339                 if (file && unlikely(file->f_op == &io_uring_fops))
6340                         io_req_track_inflight(req);
6341         }
6342
6343         return file;
6344 }
6345
6346 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6347 {
6348         struct io_timeout_data *data = container_of(timer,
6349                                                 struct io_timeout_data, timer);
6350         struct io_kiocb *prev, *req = data->req;
6351         struct io_ring_ctx *ctx = req->ctx;
6352         unsigned long flags;
6353
6354         spin_lock_irqsave(&ctx->completion_lock, flags);
6355         prev = req->timeout.head;
6356         req->timeout.head = NULL;
6357
6358         /*
6359          * We don't expect the list to be empty, that will only happen if we
6360          * race with the completion of the linked work.
6361          */
6362         if (prev) {
6363                 io_remove_next_linked(prev);
6364                 if (!req_ref_inc_not_zero(prev))
6365                         prev = NULL;
6366         }
6367         spin_unlock_irqrestore(&ctx->completion_lock, flags);
6368
6369         if (prev) {
6370                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
6371                 io_put_req_deferred(prev, 1);
6372                 io_put_req_deferred(req, 1);
6373         } else {
6374                 io_req_complete_post(req, -ETIME, 0);
6375         }
6376         return HRTIMER_NORESTART;
6377 }
6378
6379 static void io_queue_linked_timeout(struct io_kiocb *req)
6380 {
6381         struct io_ring_ctx *ctx = req->ctx;
6382
6383         spin_lock_irq(&ctx->completion_lock);
6384         /*
6385          * If the back reference is NULL, then our linked request finished
6386          * before we got a chance to setup the timer
6387          */
6388         if (req->timeout.head) {
6389                 struct io_timeout_data *data = req->async_data;
6390
6391                 data->timer.function = io_link_timeout_fn;
6392                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6393                                 data->mode);
6394         }
6395         spin_unlock_irq(&ctx->completion_lock);
6396         /* drop submission reference */
6397         io_put_req(req);
6398 }
6399
6400 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
6401 {
6402         struct io_kiocb *nxt = req->link;
6403
6404         if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) ||
6405             nxt->opcode != IORING_OP_LINK_TIMEOUT)
6406                 return NULL;
6407
6408         nxt->timeout.head = req;
6409         nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
6410         req->flags |= REQ_F_LINK_TIMEOUT;
6411         return nxt;
6412 }
6413
6414 static void __io_queue_sqe(struct io_kiocb *req)
6415 {
6416         struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6417         int ret;
6418
6419         ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
6420
6421         /*
6422          * We async punt it if the file wasn't marked NOWAIT, or if the file
6423          * doesn't support non-blocking read/write attempts
6424          */
6425         if (likely(!ret)) {
6426                 /* drop submission reference */
6427                 if (req->flags & REQ_F_COMPLETE_INLINE) {
6428                         struct io_ring_ctx *ctx = req->ctx;
6429                         struct io_comp_state *cs = &ctx->submit_state.comp;
6430
6431                         cs->reqs[cs->nr++] = req;
6432                         if (cs->nr == ARRAY_SIZE(cs->reqs))
6433                                 io_submit_flush_completions(cs, ctx);
6434                 } else {
6435                         io_put_req(req);
6436                 }
6437         } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
6438                 if (!io_arm_poll_handler(req)) {
6439                         /*
6440                          * Queued up for async execution, worker will release
6441                          * submit reference when the iocb is actually submitted.
6442                          */
6443                         io_queue_async_work(req);
6444                 }
6445         } else {
6446                 io_req_complete_failed(req, ret);
6447         }
6448         if (linked_timeout)
6449                 io_queue_linked_timeout(linked_timeout);
6450 }
6451
6452 static void io_queue_sqe(struct io_kiocb *req)
6453 {
6454         int ret;
6455
6456         ret = io_req_defer(req);
6457         if (ret) {
6458                 if (ret != -EIOCBQUEUED) {
6459 fail_req:
6460                         io_req_complete_failed(req, ret);
6461                 }
6462         } else if (req->flags & REQ_F_FORCE_ASYNC) {
6463                 ret = io_req_prep_async(req);
6464                 if (unlikely(ret))
6465                         goto fail_req;
6466                 io_queue_async_work(req);
6467         } else {
6468                 __io_queue_sqe(req);
6469         }
6470 }
6471
6472 /*
6473  * Check SQE restrictions (opcode and flags).
6474  *
6475  * Returns 'true' if SQE is allowed, 'false' otherwise.
6476  */
6477 static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6478                                         struct io_kiocb *req,
6479                                         unsigned int sqe_flags)
6480 {
6481         if (!ctx->restricted)
6482                 return true;
6483
6484         if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6485                 return false;
6486
6487         if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6488             ctx->restrictions.sqe_flags_required)
6489                 return false;
6490
6491         if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6492                           ctx->restrictions.sqe_flags_required))
6493                 return false;
6494
6495         return true;
6496 }
6497
/*
 * Initialise @req from the submission queue entry @sqe.
 *
 * Copies opcode, flags and user_data out of the SQE with READ_ONCE(),
 * resets the per-request state, then validates flags, opcode, ring
 * restrictions and buffer-select support. After that it optionally takes a
 * reference on the credentials registered for the SQE's personality,
 * starts a block plug, and resolves the request's file.
 *
 * Returns 0 on success; -EINVAL for unknown flags, an out-of-range opcode
 * or an unregistered personality; -EACCES if the ring's restrictions
 * reject the SQE; -EOPNOTSUPP if IOSQE_BUFFER_SELECT is set for an opcode
 * that does not support it; -EBADF if the file lookup fails.
 */
6498 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6499                        const struct io_uring_sqe *sqe)
6500 {
6501         struct io_submit_state *state;
6502         unsigned int sqe_flags;
6503         int personality, ret = 0;
6504
6505         req->opcode = READ_ONCE(sqe->opcode);
6506         /* same numerical values with corresponding REQ_F_*, safe to copy */
6507         req->flags = sqe_flags = READ_ONCE(sqe->flags);
6508         req->user_data = READ_ONCE(sqe->user_data);
6509         req->async_data = NULL;
6510         req->file = NULL;
6511         req->ctx = ctx;
6512         req->link = NULL;
6513         req->fixed_rsrc_refs = NULL;
6514         /* one is dropped after submission, the other at completion */
6515         atomic_set(&req->refs, 2);
6516         req->task = current;
6517         req->result = 0;
6518         req->work.creds = NULL;
6519
6520         /* enforce forwards compatibility on users */
6521         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
6522                 return -EINVAL;
             /* must be checked before opcode is used to index io_op_defs[] */
6523         if (unlikely(req->opcode >= IORING_OP_LAST))
6524                 return -EINVAL;
6525         if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6526                 return -EACCES;
6527
6528         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6529             !io_op_defs[req->opcode].buffer_select)
6530                 return -EOPNOTSUPP;
6531
             /* a non-zero personality selects credentials registered on the ring */
6532         personality = READ_ONCE(sqe->personality);
6533         if (personality) {
6534                 req->work.creds = xa_load(&ctx->personalities, personality);
6535                 if (!req->work.creds)
6536                         return -EINVAL;
6537                 get_cred(req->work.creds);
6538         }
6539         state = &ctx->submit_state;
6540
6541         /*
6542          * Plug now if we have more than 1 IO left after this, and the target
6543          * is potentially a read/write to block based storage.
6544          */
6545         if (!state->plug_started && state->ios_left > 1 &&
6546             io_op_defs[req->opcode].plug) {
6547                 blk_start_plug(&state->plug);
6548                 state->plug_started = true;
6549         }
6550
6551         if (io_op_defs[req->opcode].needs_file) {
6552                 bool fixed = req->flags & REQ_F_FIXED_FILE;
6553
6554                 req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
6555                 if (unlikely(!req->file))
6556                         ret = -EBADF;
6557         }
6558
             /* not reached on the early-return error paths above */
6559         state->ios_left--;
6560         return ret;
6561 }
6562
/*
 * Initialise and prepare one request from @sqe, then either queue it or
 * add it to the link chain being assembled in ctx->submit_state.link.
 *
 * On an init/prep failure the request is completed with the error, and a
 * link chain under construction (link->head) is failed with -ECANCELED and
 * dropped. Returns 0 on success, or the negative error.
 */
6563 static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
6564                          const struct io_uring_sqe *sqe)
6565 {
6566         struct io_submit_link *link = &ctx->submit_state.link;
6567         int ret;
6568
6569         ret = io_init_req(ctx, req, sqe);
6570         if (unlikely(ret)) {
             /* also entered via goto from the prep failures further down */
6571 fail_req:
6572                 if (link->head) {
6573                         /* fail even hard links since we don't submit */
6574                         req_set_fail(link->head);
6575                         io_req_complete_failed(link->head, -ECANCELED);
6576                         link->head = NULL;
6577                 }
6578                 io_req_complete_failed(req, ret);
6579                 return ret;
6580         }
6581         ret = io_req_prep(req, sqe);
6582         if (unlikely(ret))
6583                 goto fail_req;
6584
6585         /* don't need @sqe from now on */
6586         trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6587                                 true, ctx->flags & IORING_SETUP_SQPOLL);
6588
6589         /*
6590          * If we already have a head request, queue this one for async
6591          * submittal once the head completes. If we don't have a head but
6592          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6593          * submitted sync once the chain is complete. If none of those
6594          * conditions are true (normal request), then just queue it.
6595          */
6596         if (link->head) {
6597                 struct io_kiocb *head = link->head;
6598
6599                 /*
6600                  * Taking sequential execution of a link, draining both sides
6601                  * of the link also fulfils IOSQE_IO_DRAIN semantics for all
6602                  * requests in the link. So, it drains the head and the
6603                  * next after the link request. The last one is done via
6604                  * drain_next flag to persist the effect across calls.
6605                  */
6606                 if (req->flags & REQ_F_IO_DRAIN) {
6607                         head->flags |= REQ_F_IO_DRAIN;
6608                         ctx->drain_next = 1;
6609                 }
6610                 ret = io_req_prep_async(req);
6611                 if (unlikely(ret))
6612                         goto fail_req;
6613                 trace_io_uring_link(ctx, req, head);
                     /* append to the tail of the chain */
6614                 link->last->link = req;
6615                 link->last = req;
6616
6617                 /* last request of a link, enqueue the link */
6618                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6619                         io_queue_sqe(head);
6620                         link->head = NULL;
6621                 }
6622         } else {
                     /* apply a drain carried over from the previous link */
6623                 if (unlikely(ctx->drain_next)) {
6624                         req->flags |= REQ_F_IO_DRAIN;
6625                         ctx->drain_next = 0;
6626                 }
6627                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6628                         link->head = req;
6629                         link->last = req;
6630                 } else {
6631                         io_queue_sqe(req);
6632                 }
6633         }
6634
6635         return 0;
6636 }
6637
6638 /*
6639  * Batched submission is done, ensure local IO is flushed out.
6640  */
6641 static void io_submit_state_end(struct io_submit_state *state,
6642                                 struct io_ring_ctx *ctx)
6643 {
6644         if (state->link.head)
6645                 io_queue_sqe(state->link.head);
6646         if (state->comp.nr)
6647                 io_submit_flush_completions(&state->comp, ctx);
6648         if (state->plug_started)
6649                 blk_finish_plug(&state->plug);
6650         io_state_file_put(state);
6651 }
6652
6653 /*
6654  * Start submission side cache.
6655  */
6656 static void io_submit_state_start(struct io_submit_state *state,
6657                                   unsigned int max_ios)
6658 {
6659         state->plug_started = false;
6660         state->ios_left = max_ios;
6661         /* set only head, no need to init link_last in advance */
6662         state->link.head = NULL;
6663 }
6664
6665 static void io_commit_sqring(struct io_ring_ctx *ctx)
6666 {
6667         struct io_rings *rings = ctx->rings;
6668
6669         /*
6670          * Ensure any loads from the SQEs are done at this point,
6671          * since once we write the new head, the application could
6672          * write new data to them.
6673          */
6674         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6675 }
6676
6677 /*
6678  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
6679  * that is mapped by userspace. This means that care needs to be taken to
6680  * ensure that reads are stable, as we cannot rely on userspace always
6681  * being a good citizen. If members of the sqe are validated and then later
6682  * used, it's important that those reads are done through READ_ONCE() to
6683  * prevent a re-load down the line.
6684  */
6685 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6686 {
6687         u32 *sq_array = ctx->sq_array;
6688         unsigned head, mask = ctx->sq_entries - 1;
6689
6690         /*
6691          * The cached sq head (or cq tail) serves two purposes:
6692          *
6693          * 1) allows us to batch the cost of updating the user visible
6694          *    head updates.
6695          * 2) allows the kernel side to track the head on its own, even
6696          *    though the application is the one updating it.
6697          */
6698         head = READ_ONCE(sq_array[ctx->cached_sq_head++ & mask]);
6699         if (likely(head < ctx->sq_entries))
6700                 return &ctx->sq_sqes[head];
6701
6702         /* drop invalid entries */
6703         ctx->cached_sq_dropped++;
6704         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6705         return NULL;
6706 }
6707
6708 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6709 {
6710         int submitted = 0;
6711
6712         /* make sure SQ entry isn't read before tail */
6713         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6714
6715         if (!percpu_ref_tryget_many(&ctx->refs, nr))
6716                 return -EAGAIN;
6717
6718         percpu_counter_add(&current->io_uring->inflight, nr);
6719         refcount_add(nr, &current->usage);
6720         io_submit_state_start(&ctx->submit_state, nr);
6721
6722         while (submitted < nr) {
6723                 const struct io_uring_sqe *sqe;
6724                 struct io_kiocb *req;
6725
6726                 req = io_alloc_req(ctx);
6727                 if (unlikely(!req)) {
6728                         if (!submitted)
6729                                 submitted = -EAGAIN;
6730                         break;
6731                 }
6732                 sqe = io_get_sqe(ctx);
6733                 if (unlikely(!sqe)) {
6734                         kmem_cache_free(req_cachep, req);
6735                         break;
6736                 }
6737                 /* will complete beyond this point, count as submitted */
6738                 submitted++;
6739                 if (io_submit_sqe(ctx, req, sqe))
6740                         break;
6741         }
6742
6743         if (unlikely(submitted != nr)) {
6744                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6745                 struct io_uring_task *tctx = current->io_uring;
6746                 int unused = nr - ref_used;
6747
6748                 percpu_ref_put_many(&ctx->refs, unused);
6749                 percpu_counter_sub(&tctx->inflight, unused);
6750                 put_task_struct_many(current, unused);
6751         }
6752
6753         io_submit_state_end(&ctx->submit_state, ctx);
6754          /* Commit SQ ring head once we've consumed and submitted all SQEs */
6755         io_commit_sqring(ctx);
6756
6757         return submitted;
6758 }
6759
6760 static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
6761 {
6762         return READ_ONCE(sqd->state);
6763 }
6764
6765 static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6766 {
6767         /* Tell userspace we may need a wakeup call */
6768         spin_lock_irq(&ctx->completion_lock);
6769         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6770         spin_unlock_irq(&ctx->completion_lock);
6771 }
6772
6773 static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6774 {
6775         spin_lock_irq(&ctx->completion_lock);
6776         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6777         spin_unlock_irq(&ctx->completion_lock);
6778 }
6779
6780 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
6781 {
6782         unsigned int to_submit;
6783         int ret = 0;
6784
6785         to_submit = io_sqring_entries(ctx);
6786         /* if we're handling multiple rings, cap submit size for fairness */
6787         if (cap_entries && to_submit > 8)
6788                 to_submit = 8;
6789
6790         if (!list_empty(&ctx->iopoll_list) || to_submit) {
6791                 unsigned nr_events = 0;
6792
6793                 mutex_lock(&ctx->uring_lock);
6794                 if (!list_empty(&ctx->iopoll_list))
6795                         io_do_iopoll(ctx, &nr_events, 0);
6796
6797                 /*
6798                  * Don't submit if refs are dying, good for io_uring_register(),
6799                  * but also it is relied upon by io_ring_exit_work()
6800                  */
6801                 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
6802                     !(ctx->flags & IORING_SETUP_R_DISABLED))
6803                         ret = io_submit_sqes(ctx, to_submit);
6804                 mutex_unlock(&ctx->uring_lock);
6805
6806                 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
6807                         wake_up(&ctx->sqo_sq_wait);
6808         }
6809
6810         return ret;
6811 }
6812
6813 static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
6814 {
6815         struct io_ring_ctx *ctx;
6816         unsigned sq_thread_idle = 0;
6817
6818         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6819                 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
6820         sqd->sq_thread_idle = sq_thread_idle;
6821 }
6822
6823 static bool io_sqd_handle_event(struct io_sq_data *sqd)
6824 {
6825         bool did_sig = false;
6826         struct ksignal ksig;
6827
6828         if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
6829             signal_pending(current)) {
6830                 mutex_unlock(&sqd->lock);
6831                 if (signal_pending(current))
6832                         did_sig = get_signal(&ksig);
6833                 cond_resched();
6834                 mutex_lock(&sqd->lock);
6835         }
6836         io_run_task_work();
6837         return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
6838 }
6839
6840 static int io_sq_thread(void *data)
6841 {
6842         struct io_sq_data *sqd = data;
6843         struct io_ring_ctx *ctx;
6844         unsigned long timeout = 0;
6845         char buf[TASK_COMM_LEN];
6846         DEFINE_WAIT(wait);
6847
6848         snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
6849         set_task_comm(current, buf);
6850
6851         if (sqd->sq_cpu != -1)
6852                 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
6853         else
6854                 set_cpus_allowed_ptr(current, cpu_online_mask);
6855         current->flags |= PF_NO_SETAFFINITY;
6856
6857         mutex_lock(&sqd->lock);
6858         while (1) {
6859                 int ret;
6860                 bool cap_entries, sqt_spin, needs_sched;
6861
6862                 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
6863                         if (io_sqd_handle_event(sqd))
6864                                 break;
6865                         timeout = jiffies + sqd->sq_thread_idle;
6866                         continue;
6867                 }
6868
6869                 sqt_spin = false;
6870                 cap_entries = !list_is_singular(&sqd->ctx_list);
6871                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6872                         const struct cred *creds = NULL;
6873
6874                         if (ctx->sq_creds != current_cred())
6875                                 creds = override_creds(ctx->sq_creds);
6876                         ret = __io_sq_thread(ctx, cap_entries);
6877                         if (creds)
6878                                 revert_creds(creds);
6879                         if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
6880                                 sqt_spin = true;
6881                 }
6882
6883                 if (sqt_spin || !time_after(jiffies, timeout)) {
6884                         io_run_task_work();
6885                         cond_resched();
6886                         if (sqt_spin)
6887                                 timeout = jiffies + sqd->sq_thread_idle;
6888                         continue;
6889                 }
6890
6891                 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
6892                 if (!io_sqd_events_pending(sqd)) {
6893                         needs_sched = true;
6894                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6895                                 io_ring_set_wakeup_flag(ctx);
6896
6897                                 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6898                                     !list_empty_careful(&ctx->iopoll_list)) {
6899                                         needs_sched = false;
6900                                         break;
6901                                 }
6902                                 if (io_sqring_entries(ctx)) {
6903                                         needs_sched = false;
6904                                         break;
6905                                 }
6906                         }
6907
6908                         if (needs_sched) {
6909                                 mutex_unlock(&sqd->lock);
6910                                 schedule();
6911                                 mutex_lock(&sqd->lock);
6912                         }
6913                         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6914                                 io_ring_clear_wakeup_flag(ctx);
6915                 }
6916
6917                 finish_wait(&sqd->wait, &wait);
6918                 timeout = jiffies + sqd->sq_thread_idle;
6919         }
6920
6921         io_uring_cancel_sqpoll(sqd);
6922         sqd->thread = NULL;
6923         list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6924                 io_ring_set_wakeup_flag(ctx);
6925         io_run_task_work();
6926         mutex_unlock(&sqd->lock);
6927
6928         complete(&sqd->exited);
6929         do_exit(0);
6930 }
6931
6932 struct io_wait_queue {
6933         struct wait_queue_entry wq;
6934         struct io_ring_ctx *ctx;
6935         unsigned to_wait;
6936         unsigned nr_timeouts;
6937 };
6938
6939 static inline bool io_should_wake(struct io_wait_queue *iowq)
6940 {
6941         struct io_ring_ctx *ctx = iowq->ctx;
6942
6943         /*
6944          * Wake up if we have enough events, or if a timeout occurred since we
6945          * started waiting. For timeouts, we always want to return to userspace,
6946          * regardless of event count.
6947          */
6948         return io_cqring_events(ctx) >= iowq->to_wait ||
6949                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6950 }
6951
6952 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6953                             int wake_flags, void *key)
6954 {
6955         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6956                                                         wq);
6957
6958         /*
6959          * Cannot safely flush overflowed CQEs from here, ensure we wake up
6960          * the task, and the next invocation will do it.
6961          */
6962         if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow))
6963                 return autoremove_wake_function(curr, mode, wake_flags, key);
6964         return -1;
6965 }
6966
6967 static int io_run_task_work_sig(void)
6968 {
6969         if (io_run_task_work())
6970                 return 1;
6971         if (!signal_pending(current))
6972                 return 0;
6973         if (test_thread_flag(TIF_NOTIFY_SIGNAL))
6974                 return -ERESTARTSYS;
6975         return -EINTR;
6976 }
6977
6978 /* when returns >0, the caller should retry */
6979 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
6980                                           struct io_wait_queue *iowq,
6981                                           signed long *timeout)
6982 {
6983         int ret;
6984
6985         /* make sure we run task_work before checking for signals */
6986         ret = io_run_task_work_sig();
6987         if (ret || io_should_wake(iowq))
6988                 return ret;
6989         /* let the caller flush overflows, retry */
6990         if (test_bit(0, &ctx->cq_check_overflow))
6991                 return 1;
6992
6993         *timeout = schedule_timeout(*timeout);
6994         return !*timeout ? -ETIME : 1;
6995 }
6996
6997 /*
6998  * Wait until events become available, if we don't already have some. The
6999  * application must reap them itself, as they reside on the shared cq ring.
7000  */
7001 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
7002                           const sigset_t __user *sig, size_t sigsz,
7003                           struct __kernel_timespec __user *uts)
7004 {
7005         struct io_wait_queue iowq = {
7006                 .wq = {
7007                         .private        = current,
7008                         .func           = io_wake_function,
7009                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
7010                 },
7011                 .ctx            = ctx,
7012                 .to_wait        = min_events,
7013         };
7014         struct io_rings *rings = ctx->rings;
7015         signed long timeout = MAX_SCHEDULE_TIMEOUT;
7016         int ret;
7017
7018         do {
7019                 io_cqring_overflow_flush(ctx, false);
7020                 if (io_cqring_events(ctx) >= min_events)
7021                         return 0;
7022                 if (!io_run_task_work())
7023                         break;
7024         } while (1);
7025
7026         if (sig) {
7027 #ifdef CONFIG_COMPAT
7028                 if (in_compat_syscall())
7029                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
7030                                                       sigsz);
7031                 else
7032 #endif
7033                         ret = set_user_sigmask(sig, sigsz);
7034
7035                 if (ret)
7036                         return ret;
7037         }
7038
7039         if (uts) {
7040                 struct timespec64 ts;
7041
7042                 if (get_timespec64(&ts, uts))
7043                         return -EFAULT;
7044                 timeout = timespec64_to_jiffies(&ts);
7045         }
7046
7047         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
7048         trace_io_uring_cqring_wait(ctx, min_events);
7049         do {
7050                 /* if we can't even flush overflow, don't wait for more */
7051                 if (!io_cqring_overflow_flush(ctx, false)) {
7052                         ret = -EBUSY;
7053                         break;
7054                 }
7055                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
7056                                                 TASK_INTERRUPTIBLE);
7057                 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
7058                 finish_wait(&ctx->wait, &iowq.wq);
7059                 cond_resched();
7060         } while (ret > 0);
7061
7062         restore_saved_sigmask_unless(ret == -EINTR);
7063
7064         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
7065 }
7066
7067 static void io_free_page_table(void **table, size_t size)
7068 {
7069         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7070
7071         for (i = 0; i < nr_tables; i++)
7072                 kfree(table[i]);
7073         kfree(table);
7074 }
7075
7076 static void **io_alloc_page_table(size_t size)
7077 {
7078         unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7079         size_t init_size = size;
7080         void **table;
7081
7082         table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
7083         if (!table)
7084                 return NULL;
7085
7086         for (i = 0; i < nr_tables; i++) {
7087                 unsigned int this_size = min(size, PAGE_SIZE);
7088
7089                 table[i] = kzalloc(this_size, GFP_KERNEL);
7090                 if (!table[i]) {
7091                         io_free_page_table(table, init_size);
7092                         return NULL;
7093                 }
7094                 size -= this_size;
7095         }
7096         return table;
7097 }
7098
7099 static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
7100 {
7101         spin_lock_bh(&ctx->rsrc_ref_lock);
7102 }
7103
7104 static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
7105 {
7106         spin_unlock_bh(&ctx->rsrc_ref_lock);
7107 }
7108
7109 static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
7110 {
7111         percpu_ref_exit(&ref_node->refs);
7112         kfree(ref_node);
7113 }
7114
7115 static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
7116                                 struct io_rsrc_data *data_to_kill)
7117 {
7118         WARN_ON_ONCE(!ctx->rsrc_backup_node);
7119         WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
7120
7121         if (data_to_kill) {
7122                 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
7123
7124                 rsrc_node->rsrc_data = data_to_kill;
7125                 io_rsrc_ref_lock(ctx);
7126                 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
7127                 io_rsrc_ref_unlock(ctx);
7128
7129                 atomic_inc(&data_to_kill->refs);
7130                 percpu_ref_kill(&rsrc_node->refs);
7131                 ctx->rsrc_node = NULL;
7132         }
7133
7134         if (!ctx->rsrc_node) {
7135                 ctx->rsrc_node = ctx->rsrc_backup_node;
7136                 ctx->rsrc_backup_node = NULL;
7137         }
7138 }
7139
7140 static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
7141 {
7142         if (ctx->rsrc_backup_node)
7143                 return 0;
7144         ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
7145         return ctx->rsrc_backup_node ? 0 : -ENOMEM;
7146 }
7147
7148 static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
7149 {
7150         int ret;
7151
7152         /* As we may drop ->uring_lock, other task may have started quiesce */
7153         if (data->quiesce)
7154                 return -ENXIO;
7155
7156         data->quiesce = true;
7157         do {
7158                 ret = io_rsrc_node_switch_start(ctx);
7159                 if (ret)
7160                         break;
7161                 io_rsrc_node_switch(ctx, data);
7162
7163                 /* kill initial ref, already quiesced if zero */
7164                 if (atomic_dec_and_test(&data->refs))
7165                         break;
7166                 flush_delayed_work(&ctx->rsrc_put_work);
7167                 ret = wait_for_completion_interruptible(&data->done);
7168                 if (!ret)
7169                         break;
7170
7171                 atomic_inc(&data->refs);
7172                 /* wait for all works potentially completing data->done */
7173                 flush_delayed_work(&ctx->rsrc_put_work);
7174                 reinit_completion(&data->done);
7175
7176                 mutex_unlock(&ctx->uring_lock);
7177                 ret = io_run_task_work_sig();
7178                 mutex_lock(&ctx->uring_lock);
7179         } while (ret >= 0);
7180         data->quiesce = false;
7181
7182         return ret;
7183 }
7184
7185 static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
7186 {
7187         unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
7188         unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
7189
7190         return &data->tags[table_idx][off];
7191 }
7192
7193 static void io_rsrc_data_free(struct io_rsrc_data *data)
7194 {
7195         size_t size = data->nr * sizeof(data->tags[0][0]);
7196
7197         if (data->tags)
7198                 io_free_page_table((void **)data->tags, size);
7199         kfree(data);
7200 }
7201
7202 static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
7203                               u64 __user *utags, unsigned nr,
7204                               struct io_rsrc_data **pdata)
7205 {
7206         struct io_rsrc_data *data;
7207         int ret = -ENOMEM;
7208         unsigned i;
7209
7210         data = kzalloc(sizeof(*data), GFP_KERNEL);
7211         if (!data)
7212                 return -ENOMEM;
7213         data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
7214         if (!data->tags) {
7215                 kfree(data);
7216                 return -ENOMEM;
7217         }
7218
7219         data->nr = nr;
7220         data->ctx = ctx;
7221         data->do_put = do_put;
7222         if (utags) {
7223                 ret = -EFAULT;
7224                 for (i = 0; i < nr; i++) {
7225                         if (copy_from_user(io_get_tag_slot(data, i), &utags[i],
7226                                            sizeof(data->tags[i])))
7227                                 goto fail;
7228                 }
7229         }
7230
7231         atomic_set(&data->refs, 1);
7232         init_completion(&data->done);
7233         *pdata = data;
7234         return 0;
7235 fail:
7236         io_rsrc_data_free(data);
7237         return ret;
7238 }
7239
7240 static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
7241 {
7242         size_t size = nr_files * sizeof(struct io_fixed_file);
7243
7244         table->files = (struct io_fixed_file **)io_alloc_page_table(size);
7245         return !!table->files;
7246 }
7247
7248 static void io_free_file_tables(struct io_file_table *table, unsigned nr_files)
7249 {
7250         size_t size = nr_files * sizeof(struct io_fixed_file);
7251
7252         io_free_page_table((void **)table->files, size);
7253         table->files = NULL;
7254 }
7255
7256 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
7257 {
7258 #if defined(CONFIG_UNIX)
7259         if (ctx->ring_sock) {
7260                 struct sock *sock = ctx->ring_sock->sk;
7261                 struct sk_buff *skb;
7262
7263                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7264                         kfree_skb(skb);
7265         }
7266 #else
7267         int i;
7268
7269         for (i = 0; i < ctx->nr_user_files; i++) {
7270                 struct file *file;
7271
7272                 file = io_file_from_index(ctx, i);
7273                 if (file)
7274                         fput(file);
7275         }
7276 #endif
7277         io_free_file_tables(&ctx->file_table, ctx->nr_user_files);
7278         io_rsrc_data_free(ctx->file_data);
7279         ctx->file_data = NULL;
7280         ctx->nr_user_files = 0;
7281 }
7282
7283 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7284 {
7285         int ret;
7286
7287         if (!ctx->file_data)
7288                 return -ENXIO;
7289         ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
7290         if (!ret)
7291                 __io_sqe_files_unregister(ctx);
7292         return ret;
7293 }
7294
7295 static void io_sq_thread_unpark(struct io_sq_data *sqd)
7296         __releases(&sqd->lock)
7297 {
7298         WARN_ON_ONCE(sqd->thread == current);
7299
7300         /*
7301          * Do the dance but not conditional clear_bit() because it'd race with
7302          * other threads incrementing park_pending and setting the bit.
7303          */
7304         clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7305         if (atomic_dec_return(&sqd->park_pending))
7306                 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7307         mutex_unlock(&sqd->lock);
7308 }
7309
7310 static void io_sq_thread_park(struct io_sq_data *sqd)
7311         __acquires(&sqd->lock)
7312 {
7313         WARN_ON_ONCE(sqd->thread == current);
7314
7315         atomic_inc(&sqd->park_pending);
7316         set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7317         mutex_lock(&sqd->lock);
7318         if (sqd->thread)
7319                 wake_up_process(sqd->thread);
7320 }
7321
7322 static void io_sq_thread_stop(struct io_sq_data *sqd)
7323 {
7324         WARN_ON_ONCE(sqd->thread == current);
7325         WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
7326
7327         set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7328         mutex_lock(&sqd->lock);
7329         if (sqd->thread)
7330                 wake_up_process(sqd->thread);
7331         mutex_unlock(&sqd->lock);
7332         wait_for_completion(&sqd->exited);
7333 }
7334
7335 static void io_put_sq_data(struct io_sq_data *sqd)
7336 {
7337         if (refcount_dec_and_test(&sqd->refs)) {
7338                 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
7339
7340                 io_sq_thread_stop(sqd);
7341                 kfree(sqd);
7342         }
7343 }
7344
7345 static void io_sq_thread_finish(struct io_ring_ctx *ctx)
7346 {
7347         struct io_sq_data *sqd = ctx->sq_data;
7348
7349         if (sqd) {
7350                 io_sq_thread_park(sqd);
7351                 list_del_init(&ctx->sqd_list);
7352                 io_sqd_update_thread_idle(sqd);
7353                 io_sq_thread_unpark(sqd);
7354
7355                 io_put_sq_data(sqd);
7356                 ctx->sq_data = NULL;
7357         }
7358 }
7359
7360 static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7361 {
7362         struct io_ring_ctx *ctx_attach;
7363         struct io_sq_data *sqd;
7364         struct fd f;
7365
7366         f = fdget(p->wq_fd);
7367         if (!f.file)
7368                 return ERR_PTR(-ENXIO);
7369         if (f.file->f_op != &io_uring_fops) {
7370                 fdput(f);
7371                 return ERR_PTR(-EINVAL);
7372         }
7373
7374         ctx_attach = f.file->private_data;
7375         sqd = ctx_attach->sq_data;
7376         if (!sqd) {
7377                 fdput(f);
7378                 return ERR_PTR(-EINVAL);
7379         }
7380         if (sqd->task_tgid != current->tgid) {
7381                 fdput(f);
7382                 return ERR_PTR(-EPERM);
7383         }
7384
7385         refcount_inc(&sqd->refs);
7386         fdput(f);
7387         return sqd;
7388 }
7389
7390 static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
7391                                          bool *attached)
7392 {
7393         struct io_sq_data *sqd;
7394
7395         *attached = false;
7396         if (p->flags & IORING_SETUP_ATTACH_WQ) {
7397                 sqd = io_attach_sq_data(p);
7398                 if (!IS_ERR(sqd)) {
7399                         *attached = true;
7400                         return sqd;
7401                 }
7402                 /* fall through for EPERM case, setup new sqd/task */
7403                 if (PTR_ERR(sqd) != -EPERM)
7404                         return sqd;
7405         }
7406
7407         sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7408         if (!sqd)
7409                 return ERR_PTR(-ENOMEM);
7410
7411         atomic_set(&sqd->park_pending, 0);
7412         refcount_set(&sqd->refs, 1);
7413         INIT_LIST_HEAD(&sqd->ctx_list);
7414         mutex_init(&sqd->lock);
7415         init_waitqueue_head(&sqd->wait);
7416         init_completion(&sqd->exited);
7417         return sqd;
7418 }
7419
7420 #if defined(CONFIG_UNIX)
7421 /*
7422  * Ensure the UNIX gc is aware of our file set, so we are certain that
7423  * the io_uring can be safely unregistered on process exit, even if we have
7424  * loops in the file referencing.
7425  */
7426 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7427 {
7428         struct sock *sk = ctx->ring_sock->sk;
7429         struct scm_fp_list *fpl;
7430         struct sk_buff *skb;
7431         int i, nr_files;
7432
7433         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7434         if (!fpl)
7435                 return -ENOMEM;
7436
7437         skb = alloc_skb(0, GFP_KERNEL);
7438         if (!skb) {
7439                 kfree(fpl);
7440                 return -ENOMEM;
7441         }
7442
7443         skb->sk = sk;
7444
7445         nr_files = 0;
7446         fpl->user = get_uid(current_user());
7447         for (i = 0; i < nr; i++) {
7448                 struct file *file = io_file_from_index(ctx, i + offset);
7449
7450                 if (!file)
7451                         continue;
7452                 fpl->fp[nr_files] = get_file(file);
7453                 unix_inflight(fpl->user, fpl->fp[nr_files]);
7454                 nr_files++;
7455         }
7456
7457         if (nr_files) {
7458                 fpl->max = SCM_MAX_FD;
7459                 fpl->count = nr_files;
7460                 UNIXCB(skb).fp = fpl;
7461                 skb->destructor = unix_destruct_scm;
7462                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7463                 skb_queue_head(&sk->sk_receive_queue, skb);
7464
7465                 for (i = 0; i < nr_files; i++)
7466                         fput(fpl->fp[i]);
7467         } else {
7468                 kfree_skb(skb);
7469                 kfree(fpl);
7470         }
7471
7472         return 0;
7473 }
7474
7475 /*
7476  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7477  * causes regular reference counting to break down. We rely on the UNIX
7478  * garbage collection to take care of this problem for us.
7479  */
7480 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7481 {
7482         unsigned left, total;
7483         int ret = 0;
7484
7485         total = 0;
7486         left = ctx->nr_user_files;
7487         while (left) {
7488                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
7489
7490                 ret = __io_sqe_files_scm(ctx, this_files, total);
7491                 if (ret)
7492                         break;
7493                 left -= this_files;
7494                 total += this_files;
7495         }
7496
7497         if (!ret)
7498                 return 0;
7499
7500         while (total < ctx->nr_user_files) {
7501                 struct file *file = io_file_from_index(ctx, total);
7502
7503                 if (file)
7504                         fput(file);
7505                 total++;
7506         }
7507
7508         return ret;
7509 }
7510 #else
/* Without CONFIG_UNIX there is nothing to register; always succeeds. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
        return 0;
}
7515 #endif
7516
/*
 * Release a registered file that was queued for removal.
 *
 * With CONFIG_UNIX, the file is searched for in the SCM_RIGHTS file lists
 * attached to the skbs sitting on the ring socket's receive queue. The skbs
 * are pulled off the queue one by one; when the file is found it is taken
 * out of that skb's file array, unix_notinflight() is called for it and the
 * file reference is dropped with fput(). Every skb that was dequeued and
 * still holds files is put back on the receive queue afterwards. If no skb
 * holds the file, no reference is dropped here.
 *
 * Without CONFIG_UNIX the file is simply fput().
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
        struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head list, *head = &sock->sk_receive_queue;
        struct sk_buff *skb;
        int i;

        /* private list collecting the skbs we take off the socket queue */
        __skb_queue_head_init(&list);

        /*
         * Find the skb that holds this file in its SCM_RIGHTS. When found,
         * remove this entry and rearrange the file array.
         */
        skb = skb_dequeue(head);
        while (skb) {
                struct scm_fp_list *fp;

                fp = UNIXCB(skb).fp;
                for (i = 0; i < fp->count; i++) {
                        int left;

                        if (fp->fp[i] != file)
                                continue;

                        unix_notinflight(fp->user, fp->fp[i]);
                        /* number of entries behind the one being removed */
                        left = fp->count - 1 - i;
                        if (left) {
                                /* close the gap in the file array */
                                memmove(&fp->fp[i], &fp->fp[i + 1],
                                                left * sizeof(struct file *));
                        }
                        fp->count--;
                        if (!fp->count) {
                                /* skb carries no more files, free it */
                                kfree_skb(skb);
                                skb = NULL;
                        } else {
                                /* still holds other files, keep it around */
                                __skb_queue_tail(&list, skb);
                        }
                        fput(file);
                        /* file == NULL doubles as the "found it" marker */
                        file = NULL;
                        break;
                }

                /* found and handled above, the skb is already dealt with */
                if (!file)
                        break;

                /* no match in this skb, stash it and try the next one */
                __skb_queue_tail(&list, skb);

                skb = skb_dequeue(head);
        }

        /* move every stashed skb back onto the socket's receive queue */
        if (skb_peek(&list)) {
                spin_lock_irq(&head->lock);
                while ((skb = __skb_dequeue(&list)) != NULL)
                        __skb_queue_tail(head, skb);
                spin_unlock_irq(&head->lock);
        }
#else
        fput(file);
#endif
}
7579
7580 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
7581 {
7582         struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
7583         struct io_ring_ctx *ctx = rsrc_data->ctx;
7584         struct io_rsrc_put *prsrc, *tmp;
7585
7586         list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
7587                 list_del(&prsrc->list);
7588
7589                 if (prsrc->tag) {
7590                         bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
7591
7592                         io_ring_submit_lock(ctx, lock_ring);
7593                         spin_lock_irq(&ctx->completion_lock);
7594                         io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
7595                         ctx->cq_extra++;
7596                         io_commit_cqring(ctx);
7597                         spin_unlock_irq(&ctx->completion_lock);
7598                         io_cqring_ev_posted(ctx);
7599                         io_ring_submit_unlock(ctx, lock_ring);
7600                 }
7601
7602                 rsrc_data->do_put(ctx, prsrc);
7603                 kfree(prsrc);
7604         }
7605
7606         io_rsrc_node_destroy(ref_node);
7607         if (atomic_dec_and_test(&rsrc_data->refs))
7608                 complete(&rsrc_data->done);
7609 }
7610
7611 static void io_rsrc_put_work(struct work_struct *work)
7612 {
7613         struct io_ring_ctx *ctx;
7614         struct llist_node *node;
7615
7616         ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
7617         node = llist_del_all(&ctx->rsrc_put_llist);
7618
7619         while (node) {
7620                 struct io_rsrc_node *ref_node;
7621                 struct llist_node *next = node->next;
7622
7623                 ref_node = llist_entry(node, struct io_rsrc_node, llist);
7624                 __io_rsrc_put_work(ref_node);
7625                 node = next;
7626         }
7627 }
7628
7629 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7630 {
7631         struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
7632         struct io_ring_ctx *ctx = node->rsrc_data->ctx;
7633         bool first_add = false;
7634
7635         io_rsrc_ref_lock(ctx);
7636         node->done = true;
7637
7638         while (!list_empty(&ctx->rsrc_ref_list)) {
7639                 node = list_first_entry(&ctx->rsrc_ref_list,
7640                                             struct io_rsrc_node, node);
7641                 /* recycle ref nodes in order */
7642                 if (!node->done)
7643                         break;
7644                 list_del(&node->node);
7645                 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
7646         }
7647         io_rsrc_ref_unlock(ctx);
7648
7649         if (first_add)
7650                 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
7651 }
7652
7653 static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
7654 {
7655         struct io_rsrc_node *ref_node;
7656
7657         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7658         if (!ref_node)
7659                 return NULL;
7660
7661         if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7662                             0, GFP_KERNEL)) {
7663                 kfree(ref_node);
7664                 return NULL;
7665         }
7666         INIT_LIST_HEAD(&ref_node->node);
7667         INIT_LIST_HEAD(&ref_node->rsrc_list);
7668         ref_node->done = false;
7669         return ref_node;
7670 }
7671
7672 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
7673                                  unsigned nr_args, u64 __user *tags)
7674 {
7675         __s32 __user *fds = (__s32 __user *) arg;
7676         struct file *file;
7677         int fd, ret;
7678         unsigned i;
7679
7680         if (ctx->file_data)
7681                 return -EBUSY;
7682         if (!nr_args)
7683                 return -EINVAL;
7684         if (nr_args > IORING_MAX_FIXED_FILES)
7685                 return -EMFILE;
7686         ret = io_rsrc_node_switch_start(ctx);
7687         if (ret)
7688                 return ret;
7689         ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
7690                                  &ctx->file_data);
7691         if (ret)
7692                 return ret;
7693
7694         ret = -ENOMEM;
7695         if (!io_alloc_file_tables(&ctx->file_table, nr_args))
7696                 goto out_free;
7697
7698         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
7699                 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
7700                         ret = -EFAULT;
7701                         goto out_fput;
7702                 }
7703                 /* allow sparse sets */
7704                 if (fd == -1) {
7705                         ret = -EINVAL;
7706                         if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
7707                                 goto out_fput;
7708                         continue;
7709                 }
7710
7711                 file = fget(fd);
7712                 ret = -EBADF;
7713                 if (unlikely(!file))
7714                         goto out_fput;
7715
7716                 /*
7717                  * Don't allow io_uring instances to be registered. If UNIX
7718                  * isn't enabled, then this causes a reference cycle and this
7719                  * instance can never get freed. If UNIX is enabled we'll
7720                  * handle it just fine, but there's still no point in allowing
7721                  * a ring fd as it doesn't support regular read/write anyway.
7722                  */
7723                 if (file->f_op == &io_uring_fops) {
7724                         fput(file);
7725                         goto out_fput;
7726                 }
7727                 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
7728         }
7729
7730         ret = io_sqe_files_scm(ctx);
7731         if (ret) {
7732                 __io_sqe_files_unregister(ctx);
7733                 return ret;
7734         }
7735
7736         io_rsrc_node_switch(ctx, NULL);
7737         return ret;
7738 out_fput:
7739         for (i = 0; i < ctx->nr_user_files; i++) {
7740                 file = io_file_from_index(ctx, i);
7741                 if (file)
7742                         fput(file);
7743         }
7744         io_free_file_tables(&ctx->file_table, nr_args);
7745         ctx->nr_user_files = 0;
7746 out_free:
7747         io_rsrc_data_free(ctx->file_data);
7748         ctx->file_data = NULL;
7749         return ret;
7750 }
7751
/*
 * Account a single newly installed fixed file with the ring socket.
 *
 * With CONFIG_UNIX, the first skb on the ring socket's receive queue is
 * inspected: if its SCM_RIGHTS file list still has room (fewer than
 * SCM_MAX_FD entries), the file is appended to it with an extra reference
 * and marked in flight, then the reference passed in by the caller is
 * dropped and 0 is returned. Otherwise the result of
 * __io_sqe_files_scm(ctx, 1, index) is returned.
 *
 * Without CONFIG_UNIX this does nothing and returns 0.
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
                                int index)
{
#if defined(CONFIG_UNIX)
        struct sock *sock = ctx->ring_sock->sk;
        struct sk_buff_head *head = &sock->sk_receive_queue;
        struct sk_buff *skb;

        /*
         * See if we can merge this file into an existing skb SCM_RIGHTS
         * file set. If there's no room, fall back to allocating a new skb
         * and filling it in.
         */
        spin_lock_irq(&head->lock);
        skb = skb_peek(head);
        if (skb) {
                struct scm_fp_list *fpl = UNIXCB(skb).fp;

                if (fpl->count < SCM_MAX_FD) {
                        /*
                         * Take the skb off the queue and modify its file
                         * list with the queue lock dropped, then put it
                         * back at the head under the lock again.
                         */
                        __skb_unlink(skb, head);
                        spin_unlock_irq(&head->lock);
                        fpl->fp[fpl->count] = get_file(file);
                        unix_inflight(fpl->user, fpl->fp[fpl->count]);
                        fpl->count++;
                        spin_lock_irq(&head->lock);
                        __skb_queue_head(head, skb);
                } else {
                        /* list is full, signal the fallback path below */
                        skb = NULL;
                }
        }
        spin_unlock_irq(&head->lock);

        if (skb) {
                /* the skb holds its own reference now, drop the caller's */
                fput(file);
                return 0;
        }

        return __io_sqe_files_scm(ctx, 1, index);
#else
        return 0;
#endif
}
7794
7795 static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
7796                                  struct io_rsrc_node *node, void *rsrc)
7797 {
7798         struct io_rsrc_put *prsrc;
7799
7800         prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
7801         if (!prsrc)
7802                 return -ENOMEM;
7803
7804         prsrc->tag = *io_get_tag_slot(data, idx);
7805         prsrc->rsrc = rsrc;
7806         list_add(&prsrc->list, &node->rsrc_list);
7807         return 0;
7808 }
7809
7810 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7811                                  struct io_uring_rsrc_update2 *up,
7812                                  unsigned nr_args)
7813 {
7814         u64 __user *tags = u64_to_user_ptr(up->tags);
7815         __s32 __user *fds = u64_to_user_ptr(up->data);
7816         struct io_rsrc_data *data = ctx->file_data;
7817         struct io_fixed_file *file_slot;
7818         struct file *file;
7819         int fd, i, err = 0;
7820         unsigned int done;
7821         bool needs_switch = false;
7822
7823         if (!ctx->file_data)
7824                 return -ENXIO;
7825         if (up->offset + nr_args > ctx->nr_user_files)
7826                 return -EINVAL;
7827
7828         for (done = 0; done < nr_args; done++) {
7829                 u64 tag = 0;
7830
7831                 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
7832                     copy_from_user(&fd, &fds[done], sizeof(fd))) {
7833                         err = -EFAULT;
7834                         break;
7835                 }
7836                 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
7837                         err = -EINVAL;
7838                         break;
7839                 }
7840                 if (fd == IORING_REGISTER_FILES_SKIP)
7841                         continue;
7842
7843                 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
7844                 file_slot = io_fixed_file_slot(&ctx->file_table, i);
7845
7846                 if (file_slot->file_ptr) {
7847                         file = (struct file *)(file_slot->file_ptr & FFS_MASK);
7848                         err = io_queue_rsrc_removal(data, up->offset + done,
7849                                                     ctx->rsrc_node, file);
7850                         if (err)
7851                                 break;
7852                         file_slot->file_ptr = 0;
7853                         needs_switch = true;
7854                 }
7855                 if (fd != -1) {
7856                         file = fget(fd);
7857                         if (!file) {
7858                                 err = -EBADF;
7859                                 break;
7860                         }
7861                         /*
7862                          * Don't allow io_uring instances to be registered. If
7863                          * UNIX isn't enabled, then this causes a reference
7864                          * cycle and this instance can never get freed. If UNIX
7865                          * is enabled we'll handle it just fine, but there's
7866                          * still no point in allowing a ring fd as it doesn't
7867                          * support regular read/write anyway.
7868                          */
7869                         if (file->f_op == &io_uring_fops) {
7870                                 fput(file);
7871                                 err = -EBADF;
7872                                 break;
7873                         }
7874                         *io_get_tag_slot(data, up->offset + done) = tag;
7875                         io_fixed_file_set(file_slot, file);
7876                         err = io_sqe_file_register(ctx, file, i);
7877                         if (err) {
7878                                 file_slot->file_ptr = 0;
7879                                 fput(file);
7880                                 break;
7881                         }
7882                 }
7883         }
7884
7885         if (needs_switch)
7886                 io_rsrc_node_switch(ctx, data);
7887         return done ? done : err;
7888 }
7889
7890 static struct io_wq_work *io_free_work(struct io_wq_work *work)
7891 {
7892         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7893
7894         req = io_put_req_find_next(req);
7895         return req ? &req->work : NULL;
7896 }
7897
7898 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
7899                                         struct task_struct *task)
7900 {
7901         struct io_wq_hash *hash;
7902         struct io_wq_data data;
7903         unsigned int concurrency;
7904
7905         hash = ctx->hash_map;
7906         if (!hash) {
7907                 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
7908                 if (!hash)
7909                         return ERR_PTR(-ENOMEM);
7910                 refcount_set(&hash->refs, 1);
7911                 init_waitqueue_head(&hash->wait);
7912                 ctx->hash_map = hash;
7913         }
7914
7915         data.hash = hash;
7916         data.task = task;
7917         data.free_work = io_free_work;
7918         data.do_work = io_wq_submit_work;
7919
7920         /* Do QD, or 4 * CPUS, whatever is smallest */
7921         concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7922
7923         return io_wq_create(concurrency, &data);
7924 }
7925
7926 static int io_uring_alloc_task_context(struct task_struct *task,
7927                                        struct io_ring_ctx *ctx)
7928 {
7929         struct io_uring_task *tctx;
7930         int ret;
7931
7932         tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
7933         if (unlikely(!tctx))
7934                 return -ENOMEM;
7935
7936         ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
7937         if (unlikely(ret)) {
7938                 kfree(tctx);
7939                 return ret;
7940         }
7941
7942         tctx->io_wq = io_init_wq_offload(ctx, task);
7943         if (IS_ERR(tctx->io_wq)) {
7944                 ret = PTR_ERR(tctx->io_wq);
7945                 percpu_counter_destroy(&tctx->inflight);
7946                 kfree(tctx);
7947                 return ret;
7948         }
7949
7950         xa_init(&tctx->xa);
7951         init_waitqueue_head(&tctx->wait);
7952         tctx->last = NULL;
7953         atomic_set(&tctx->in_idle, 0);
7954         atomic_set(&tctx->inflight_tracked, 0);
7955         task->io_uring = tctx;
7956         spin_lock_init(&tctx->task_lock);
7957         INIT_WQ_LIST(&tctx->task_list);
7958         tctx->task_state = 0;
7959         init_task_work(&tctx->task_work, tctx_task_work);
7960         return 0;
7961 }
7962
7963 void __io_uring_free(struct task_struct *tsk)
7964 {
7965         struct io_uring_task *tctx = tsk->io_uring;
7966
7967         WARN_ON_ONCE(!xa_empty(&tctx->xa));
7968         WARN_ON_ONCE(tctx->io_wq);
7969
7970         percpu_counter_destroy(&tctx->inflight);
7971         kfree(tctx);
7972         tsk->io_uring = NULL;
7973 }
7974
7975 static int io_sq_offload_create(struct io_ring_ctx *ctx,
7976                                 struct io_uring_params *p)
7977 {
7978         int ret;
7979
7980         /* Retain compatibility with failing for an invalid attach attempt */
7981         if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
7982                                 IORING_SETUP_ATTACH_WQ) {
7983                 struct fd f;
7984
7985                 f = fdget(p->wq_fd);
7986                 if (!f.file)
7987                         return -ENXIO;
7988                 fdput(f);
7989                 if (f.file->f_op != &io_uring_fops)
7990                         return -EINVAL;
7991         }
7992         if (ctx->flags & IORING_SETUP_SQPOLL) {
7993                 struct task_struct *tsk;
7994                 struct io_sq_data *sqd;
7995                 bool attached;
7996
7997                 sqd = io_get_sq_data(p, &attached);
7998                 if (IS_ERR(sqd)) {
7999                         ret = PTR_ERR(sqd);
8000                         goto err;
8001                 }
8002
8003                 ctx->sq_creds = get_current_cred();
8004                 ctx->sq_data = sqd;
8005                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8006                 if (!ctx->sq_thread_idle)
8007                         ctx->sq_thread_idle = HZ;
8008
8009                 io_sq_thread_park(sqd);
8010                 list_add(&ctx->sqd_list, &sqd->ctx_list);
8011                 io_sqd_update_thread_idle(sqd);
8012                 /* don't attach to a dying SQPOLL thread, would be racy */
8013                 ret = (attached && !sqd->thread) ? -ENXIO : 0;
8014                 io_sq_thread_unpark(sqd);
8015
8016                 if (ret < 0)
8017                         goto err;
8018                 if (attached)
8019                         return 0;
8020
8021                 if (p->flags & IORING_SETUP_SQ_AFF) {
8022                         int cpu = p->sq_thread_cpu;
8023
8024                         ret = -EINVAL;
8025                         if (cpu >= nr_cpu_ids || !cpu_online(cpu))
8026                                 goto err_sqpoll;
8027                         sqd->sq_cpu = cpu;
8028                 } else {
8029                         sqd->sq_cpu = -1;
8030                 }
8031
8032                 sqd->task_pid = current->pid;
8033                 sqd->task_tgid = current->tgid;
8034                 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
8035                 if (IS_ERR(tsk)) {
8036                         ret = PTR_ERR(tsk);
8037                         goto err_sqpoll;
8038                 }
8039
8040                 sqd->thread = tsk;
8041                 ret = io_uring_alloc_task_context(tsk, ctx);
8042                 wake_up_new_task(tsk);
8043                 if (ret)
8044                         goto err;
8045         } else if (p->flags & IORING_SETUP_SQ_AFF) {
8046                 /* Can't have SQ_AFF without SQPOLL */
8047                 ret = -EINVAL;
8048                 goto err;
8049         }
8050
8051         return 0;
8052 err_sqpoll:
8053         complete(&ctx->sq_data->exited);
8054 err:
8055         io_sq_thread_finish(ctx);
8056         return ret;
8057 }
8058
8059 static inline void __io_unaccount_mem(struct user_struct *user,
8060                                       unsigned long nr_pages)
8061 {
8062         atomic_long_sub(nr_pages, &user->locked_vm);
8063 }
8064
8065 static inline int __io_account_mem(struct user_struct *user,
8066                                    unsigned long nr_pages)
8067 {
8068         unsigned long page_limit, cur_pages, new_pages;
8069
8070         /* Don't allow more pages than we can safely lock */
8071         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8072
8073         do {
8074                 cur_pages = atomic_long_read(&user->locked_vm);
8075                 new_pages = cur_pages + nr_pages;
8076                 if (new_pages > page_limit)
8077                         return -ENOMEM;
8078         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8079                                         new_pages) != cur_pages);
8080
8081         return 0;
8082 }
8083
8084 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8085 {
8086         if (ctx->user)
8087                 __io_unaccount_mem(ctx->user, nr_pages);
8088
8089         if (ctx->mm_account)
8090                 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
8091 }
8092
8093 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8094 {
8095         int ret;
8096
8097         if (ctx->user) {
8098                 ret = __io_account_mem(ctx->user, nr_pages);
8099                 if (ret)
8100                         return ret;
8101         }
8102
8103         if (ctx->mm_account)
8104                 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
8105
8106         return 0;
8107 }
8108
/*
 * Drop a reference on the compound page backing @ptr and free the page
 * once that was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
        if (ptr) {
                struct page *head = virt_to_head_page(ptr);

                if (put_page_testzero(head))
                        free_compound_page(head);
        }
}
8120
8121 static void *io_mem_alloc(size_t size)
8122 {
8123         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
8124                                 __GFP_NORETRY | __GFP_ACCOUNT;
8125
8126         return (void *) __get_free_pages(gfp_flags, get_order(size));
8127 }
8128
8129 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8130                                 size_t *sq_offset)
8131 {
8132         struct io_rings *rings;
8133         size_t off, sq_array_size;
8134
8135         off = struct_size(rings, cqes, cq_entries);
8136         if (off == SIZE_MAX)
8137                 return SIZE_MAX;
8138
8139 #ifdef CONFIG_SMP
8140         off = ALIGN(off, SMP_CACHE_BYTES);
8141         if (off == 0)
8142                 return SIZE_MAX;
8143 #endif
8144
8145         if (sq_offset)
8146                 *sq_offset = off;
8147
8148         sq_array_size = array_size(sizeof(u32), sq_entries);
8149         if (sq_array_size == SIZE_MAX)
8150                 return SIZE_MAX;
8151
8152         if (check_add_overflow(off, sq_array_size, &off))
8153                 return SIZE_MAX;
8154
8155         return off;
8156 }
8157
8158 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
8159 {
8160         struct io_mapped_ubuf *imu = *slot;
8161         unsigned int i;
8162
8163         if (imu != ctx->dummy_ubuf) {
8164                 for (i = 0; i < imu->nr_bvecs; i++)
8165                         unpin_user_page(imu->bvec[i].bv_page);
8166                 if (imu->acct_pages)
8167                         io_unaccount_mem(ctx, imu->acct_pages);
8168                 kvfree(imu);
8169         }
8170         *slot = NULL;
8171 }
8172
8173 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8174 {
8175         io_buffer_unmap(ctx, &prsrc->buf);
8176         prsrc->buf = NULL;
8177 }
8178
8179 static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8180 {
8181         unsigned int i;
8182
8183         for (i = 0; i < ctx->nr_user_bufs; i++)
8184                 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
8185         kfree(ctx->user_bufs);
8186         io_rsrc_data_free(ctx->buf_data);
8187         ctx->user_bufs = NULL;
8188         ctx->buf_data = NULL;
8189         ctx->nr_user_bufs = 0;
8190 }
8191
8192 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8193 {
8194         int ret;
8195
8196         if (!ctx->buf_data)
8197                 return -ENXIO;
8198
8199         ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
8200         if (!ret)
8201                 __io_sqe_buffers_unregister(ctx);
8202         return ret;
8203 }
8204
8205 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8206                        void __user *arg, unsigned index)
8207 {
8208         struct iovec __user *src;
8209
8210 #ifdef CONFIG_COMPAT
8211         if (ctx->compat) {
8212                 struct compat_iovec __user *ciovs;
8213                 struct compat_iovec ciov;
8214
8215                 ciovs = (struct compat_iovec __user *) arg;
8216                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8217                         return -EFAULT;
8218
8219                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8220                 dst->iov_len = ciov.iov_len;
8221                 return 0;
8222         }
8223 #endif
8224         src = (struct iovec __user *) arg;
8225         if (copy_from_user(dst, &src[index], sizeof(*dst)))
8226                 return -EFAULT;
8227         return 0;
8228 }
8229
8230 /*
8231  * Not super efficient, but this is just a registration time. And we do cache
8232  * the last compound head, so generally we'll only do a full search if we don't
8233  * match that one.
8234  *
8235  * We check if the given compound head page has already been accounted, to
8236  * avoid double accounting it. This allows us to account the full size of the
8237  * page, not just the constituent pages of a huge page.
8238  */
8239 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8240                                   int nr_pages, struct page *hpage)
8241 {
8242         int i, j;
8243
8244         /* check current page array */
8245         for (i = 0; i < nr_pages; i++) {
8246                 if (!PageCompound(pages[i]))
8247                         continue;
8248                 if (compound_head(pages[i]) == hpage)
8249                         return true;
8250         }
8251
8252         /* check previously registered pages */
8253         for (i = 0; i < ctx->nr_user_bufs; i++) {
8254                 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
8255
8256                 for (j = 0; j < imu->nr_bvecs; j++) {
8257                         if (!PageCompound(imu->bvec[j].bv_page))
8258                                 continue;
8259                         if (compound_head(imu->bvec[j].bv_page) == hpage)
8260                                 return true;
8261                 }
8262         }
8263
8264         return false;
8265 }
8266
8267 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8268                                  int nr_pages, struct io_mapped_ubuf *imu,
8269                                  struct page **last_hpage)
8270 {
8271         int i, ret;
8272
8273         imu->acct_pages = 0;
8274         for (i = 0; i < nr_pages; i++) {
8275                 if (!PageCompound(pages[i])) {
8276                         imu->acct_pages++;
8277                 } else {
8278                         struct page *hpage;
8279
8280                         hpage = compound_head(pages[i]);
8281                         if (hpage == *last_hpage)
8282                                 continue;
8283                         *last_hpage = hpage;
8284                         if (headpage_already_acct(ctx, pages, i, hpage))
8285                                 continue;
8286                         imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8287                 }
8288         }
8289
8290         if (!imu->acct_pages)
8291                 return 0;
8292
8293         ret = io_account_mem(ctx, imu->acct_pages);
8294         if (ret)
8295                 imu->acct_pages = 0;
8296         return ret;
8297 }
8298
8299 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
8300                                   struct io_mapped_ubuf **pimu,
8301                                   struct page **last_hpage)
8302 {
8303         struct io_mapped_ubuf *imu = NULL;
8304         struct vm_area_struct **vmas = NULL;
8305         struct page **pages = NULL;
8306         unsigned long off, start, end, ubuf;
8307         size_t size;
8308         int ret, pret, nr_pages, i;
8309
8310         if (!iov->iov_base) {
8311                 *pimu = ctx->dummy_ubuf;
8312                 return 0;
8313         }
8314
8315         ubuf = (unsigned long) iov->iov_base;
8316         end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8317         start = ubuf >> PAGE_SHIFT;
8318         nr_pages = end - start;
8319
8320         *pimu = NULL;
8321         ret = -ENOMEM;
8322
8323         pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
8324         if (!pages)
8325                 goto done;
8326
8327         vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
8328                               GFP_KERNEL);
8329         if (!vmas)
8330                 goto done;
8331
8332         imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
8333         if (!imu)
8334                 goto done;
8335
8336         ret = 0;
8337         mmap_read_lock(current->mm);
8338         pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
8339                               pages, vmas);
8340         if (pret == nr_pages) {
8341                 /* don't support file backed memory */
8342                 for (i = 0; i < nr_pages; i++) {
8343                         struct vm_area_struct *vma = vmas[i];
8344
8345                         if (vma_is_shmem(vma))
8346                                 continue;
8347                         if (vma->vm_file &&
8348                             !is_file_hugepages(vma->vm_file)) {
8349                                 ret = -EOPNOTSUPP;
8350                                 break;
8351                         }
8352                 }
8353         } else {
8354                 ret = pret < 0 ? pret : -EFAULT;
8355         }
8356         mmap_read_unlock(current->mm);
8357         if (ret) {
8358                 /*
8359                  * if we did partial map, or found file backed vmas,
8360                  * release any pages we did get
8361                  */
8362                 if (pret > 0)
8363                         unpin_user_pages(pages, pret);
8364                 goto done;
8365         }
8366
8367         ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
8368         if (ret) {
8369                 unpin_user_pages(pages, pret);
8370                 goto done;
8371         }
8372
8373         off = ubuf & ~PAGE_MASK;
8374         size = iov->iov_len;
8375         for (i = 0; i < nr_pages; i++) {
8376                 size_t vec_len;
8377
8378                 vec_len = min_t(size_t, size, PAGE_SIZE - off);
8379                 imu->bvec[i].bv_page = pages[i];
8380                 imu->bvec[i].bv_len = vec_len;
8381                 imu->bvec[i].bv_offset = off;
8382                 off = 0;
8383                 size -= vec_len;
8384         }
8385         /* store original address for later verification */
8386         imu->ubuf = ubuf;
8387         imu->ubuf_end = ubuf + iov->iov_len;
8388         imu->nr_bvecs = nr_pages;
8389         *pimu = imu;
8390         ret = 0;
8391 done:
8392         if (ret)
8393                 kvfree(imu);
8394         kvfree(pages);
8395         kvfree(vmas);
8396         return ret;
8397 }
8398
8399 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
8400 {
8401         ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
8402         return ctx->user_bufs ? 0 : -ENOMEM;
8403 }
8404
8405 static int io_buffer_validate(struct iovec *iov)
8406 {
8407         unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
8408
8409         /*
8410          * Don't impose further limits on the size and buffer
8411          * constraints here, we'll -EINVAL later when IO is
8412          * submitted if they are wrong.
8413          */
8414         if (!iov->iov_base)
8415                 return iov->iov_len ? -EFAULT : 0;
8416         if (!iov->iov_len)
8417                 return -EFAULT;
8418
8419         /* arbitrary limit, but we need something */
8420         if (iov->iov_len > SZ_1G)
8421                 return -EFAULT;
8422
8423         if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
8424                 return -EOVERFLOW;
8425
8426         return 0;
8427 }
8428
8429 static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
8430                                    unsigned int nr_args, u64 __user *tags)
8431 {
8432         struct page *last_hpage = NULL;
8433         struct io_rsrc_data *data;
8434         int i, ret;
8435         struct iovec iov;
8436
8437         if (ctx->user_bufs)
8438                 return -EBUSY;
8439         if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
8440                 return -EINVAL;
8441         ret = io_rsrc_node_switch_start(ctx);
8442         if (ret)
8443                 return ret;
8444         ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
8445         if (ret)
8446                 return ret;
8447         ret = io_buffers_map_alloc(ctx, nr_args);
8448         if (ret) {
8449                 io_rsrc_data_free(data);
8450                 return ret;
8451         }
8452
8453         for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
8454                 ret = io_copy_iov(ctx, &iov, arg, i);
8455                 if (ret)
8456                         break;
8457                 ret = io_buffer_validate(&iov);
8458                 if (ret)
8459                         break;
8460                 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
8461                         ret = -EINVAL;
8462                         break;
8463                 }
8464
8465                 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
8466                                              &last_hpage);
8467                 if (ret)
8468                         break;
8469         }
8470
8471         WARN_ON_ONCE(ctx->buf_data);
8472
8473         ctx->buf_data = data;
8474         if (ret)
8475                 __io_sqe_buffers_unregister(ctx);
8476         else
8477                 io_rsrc_node_switch(ctx, NULL);
8478         return ret;
8479 }
8480
8481 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
8482                                    struct io_uring_rsrc_update2 *up,
8483                                    unsigned int nr_args)
8484 {
8485         u64 __user *tags = u64_to_user_ptr(up->tags);
8486         struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
8487         struct page *last_hpage = NULL;
8488         bool needs_switch = false;
8489         __u32 done;
8490         int i, err;
8491
8492         if (!ctx->buf_data)
8493                 return -ENXIO;
8494         if (up->offset + nr_args > ctx->nr_user_bufs)
8495                 return -EINVAL;
8496
8497         for (done = 0; done < nr_args; done++) {
8498                 struct io_mapped_ubuf *imu;
8499                 int offset = up->offset + done;
8500                 u64 tag = 0;
8501
8502                 err = io_copy_iov(ctx, &iov, iovs, done);
8503                 if (err)
8504                         break;
8505                 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
8506                         err = -EFAULT;
8507                         break;
8508                 }
8509                 err = io_buffer_validate(&iov);
8510                 if (err)
8511                         break;
8512                 if (!iov.iov_base && tag) {
8513                         err = -EINVAL;
8514                         break;
8515                 }
8516                 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
8517                 if (err)
8518                         break;
8519
8520                 i = array_index_nospec(offset, ctx->nr_user_bufs);
8521                 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
8522                         err = io_queue_rsrc_removal(ctx->buf_data, offset,
8523                                                     ctx->rsrc_node, ctx->user_bufs[i]);
8524                         if (unlikely(err)) {
8525                                 io_buffer_unmap(ctx, &imu);
8526                                 break;
8527                         }
8528                         ctx->user_bufs[i] = NULL;
8529                         needs_switch = true;
8530                 }
8531
8532                 ctx->user_bufs[i] = imu;
8533                 *io_get_tag_slot(ctx->buf_data, offset) = tag;
8534         }
8535
8536         if (needs_switch)
8537                 io_rsrc_node_switch(ctx, ctx->buf_data);
8538         return done ? done : err;
8539 }
8540
8541 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8542 {
8543         __s32 __user *fds = arg;
8544         int fd;
8545
8546         if (ctx->cq_ev_fd)
8547                 return -EBUSY;
8548
8549         if (copy_from_user(&fd, fds, sizeof(*fds)))
8550                 return -EFAULT;
8551
8552         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8553         if (IS_ERR(ctx->cq_ev_fd)) {
8554                 int ret = PTR_ERR(ctx->cq_ev_fd);
8555                 ctx->cq_ev_fd = NULL;
8556                 return ret;
8557         }
8558
8559         return 0;
8560 }
8561
8562 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8563 {
8564         if (ctx->cq_ev_fd) {
8565                 eventfd_ctx_put(ctx->cq_ev_fd);
8566                 ctx->cq_ev_fd = NULL;
8567                 return 0;
8568         }
8569
8570         return -ENXIO;
8571 }
8572
8573 static void io_destroy_buffers(struct io_ring_ctx *ctx)
8574 {
8575         struct io_buffer *buf;
8576         unsigned long index;
8577
8578         xa_for_each(&ctx->io_buffers, index, buf)
8579                 __io_remove_buffers(ctx, buf, index, -1U);
8580 }
8581
8582 static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
8583 {
8584         struct io_kiocb *req, *nxt;
8585
8586         list_for_each_entry_safe(req, nxt, list, compl.list) {
8587                 if (tsk && req->task != tsk)
8588                         continue;
8589                 list_del(&req->compl.list);
8590                 kmem_cache_free(req_cachep, req);
8591         }
8592 }
8593
8594 static void io_req_caches_free(struct io_ring_ctx *ctx)
8595 {
8596         struct io_submit_state *submit_state = &ctx->submit_state;
8597         struct io_comp_state *cs = &ctx->submit_state.comp;
8598
8599         mutex_lock(&ctx->uring_lock);
8600
8601         if (submit_state->free_reqs) {
8602                 kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
8603                                      submit_state->reqs);
8604                 submit_state->free_reqs = 0;
8605         }
8606
8607         io_flush_cached_locked_reqs(ctx, cs);
8608         io_req_cache_free(&cs->free_list, NULL);
8609         mutex_unlock(&ctx->uring_lock);
8610 }
8611
8612 static bool io_wait_rsrc_data(struct io_rsrc_data *data)
8613 {
8614         if (!data)
8615                 return false;
8616         if (!atomic_dec_and_test(&data->refs))
8617                 wait_for_completion(&data->done);
8618         return true;
8619 }
8620
8621 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8622 {
8623         io_sq_thread_finish(ctx);
8624
8625         if (ctx->mm_account) {
8626                 mmdrop(ctx->mm_account);
8627                 ctx->mm_account = NULL;
8628         }
8629
8630         mutex_lock(&ctx->uring_lock);
8631         if (io_wait_rsrc_data(ctx->buf_data))
8632                 __io_sqe_buffers_unregister(ctx);
8633         if (io_wait_rsrc_data(ctx->file_data))
8634                 __io_sqe_files_unregister(ctx);
8635         if (ctx->rings)
8636                 __io_cqring_overflow_flush(ctx, true);
8637         mutex_unlock(&ctx->uring_lock);
8638         io_eventfd_unregister(ctx);
8639         io_destroy_buffers(ctx);
8640         if (ctx->sq_creds)
8641                 put_cred(ctx->sq_creds);
8642
8643         /* there are no registered resources left, nobody uses it */
8644         if (ctx->rsrc_node)
8645                 io_rsrc_node_destroy(ctx->rsrc_node);
8646         if (ctx->rsrc_backup_node)
8647                 io_rsrc_node_destroy(ctx->rsrc_backup_node);
8648         flush_delayed_work(&ctx->rsrc_put_work);
8649
8650         WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
8651         WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
8652
8653 #if defined(CONFIG_UNIX)
8654         if (ctx->ring_sock) {
8655                 ctx->ring_sock->file = NULL; /* so that iput() is called */
8656                 sock_release(ctx->ring_sock);
8657         }
8658 #endif
8659
8660         io_mem_free(ctx->rings);
8661         io_mem_free(ctx->sq_sqes);
8662
8663         percpu_ref_exit(&ctx->refs);
8664         free_uid(ctx->user);
8665         io_req_caches_free(ctx);
8666         if (ctx->hash_map)
8667                 io_wq_put_hash(ctx->hash_map);
8668         kfree(ctx->cancel_hash);
8669         kfree(ctx->dummy_ubuf);
8670         kfree(ctx);
8671 }
8672
8673 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8674 {
8675         struct io_ring_ctx *ctx = file->private_data;
8676         __poll_t mask = 0;
8677
8678         poll_wait(file, &ctx->cq_wait, wait);
8679         /*
8680          * synchronizes with barrier from wq_has_sleeper call in
8681          * io_commit_cqring
8682          */
8683         smp_rmb();
8684         if (!io_sqring_full(ctx))
8685                 mask |= EPOLLOUT | EPOLLWRNORM;
8686
8687         /*
8688          * Don't flush cqring overflow list here, just do a simple check.
8689          * Otherwise there could possible be ABBA deadlock:
8690          *      CPU0                    CPU1
8691          *      ----                    ----
8692          * lock(&ctx->uring_lock);
8693          *                              lock(&ep->mtx);
8694          *                              lock(&ctx->uring_lock);
8695          * lock(&ep->mtx);
8696          *
8697          * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
8698          * pushs them to do the flush.
8699          */
8700         if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
8701                 mask |= EPOLLIN | EPOLLRDNORM;
8702
8703         return mask;
8704 }
8705
8706 static int io_uring_fasync(int fd, struct file *file, int on)
8707 {
8708         struct io_ring_ctx *ctx = file->private_data;
8709
8710         return fasync_helper(fd, file, on, &ctx->cq_fasync);
8711 }
8712
8713 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8714 {
8715         const struct cred *creds;
8716
8717         creds = xa_erase(&ctx->personalities, id);
8718         if (creds) {
8719                 put_cred(creds);
8720                 return 0;
8721         }
8722
8723         return -EINVAL;
8724 }
8725
8726 static inline bool io_run_ctx_fallback(struct io_ring_ctx *ctx)
8727 {
8728         return io_run_task_work_head(&ctx->exit_task_work);
8729 }
8730
8731 struct io_tctx_exit {
8732         struct callback_head            task_work;
8733         struct completion               completion;
8734         struct io_ring_ctx              *ctx;
8735 };
8736
8737 static void io_tctx_exit_cb(struct callback_head *cb)
8738 {
8739         struct io_uring_task *tctx = current->io_uring;
8740         struct io_tctx_exit *work;
8741
8742         work = container_of(cb, struct io_tctx_exit, task_work);
8743         /*
8744          * When @in_idle, we're in cancellation and it's racy to remove the
8745          * node. It'll be removed by the end of cancellation, just ignore it.
8746          */
8747         if (!atomic_read(&tctx->in_idle))
8748                 io_uring_del_tctx_node((unsigned long)work->ctx);
8749         complete(&work->completion);
8750 }
8751
8752 static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
8753 {
8754         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8755
8756         return req->ctx == data;
8757 }
8758
8759 static void io_ring_exit_work(struct work_struct *work)
8760 {
8761         struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
8762         unsigned long timeout = jiffies + HZ * 60 * 5;
8763         struct io_tctx_exit exit;
8764         struct io_tctx_node *node;
8765         int ret;
8766
8767         /*
8768          * If we're doing polled IO and end up having requests being
8769          * submitted async (out-of-line), then completions can come in while
8770          * we're waiting for refs to drop. We need to reap these manually,
8771          * as nobody else will be looking for them.
8772          */
8773         do {
8774                 io_uring_try_cancel_requests(ctx, NULL, true);
8775                 if (ctx->sq_data) {
8776                         struct io_sq_data *sqd = ctx->sq_data;
8777                         struct task_struct *tsk;
8778
8779                         io_sq_thread_park(sqd);
8780                         tsk = sqd->thread;
8781                         if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
8782                                 io_wq_cancel_cb(tsk->io_uring->io_wq,
8783                                                 io_cancel_ctx_cb, ctx, true);
8784                         io_sq_thread_unpark(sqd);
8785                 }
8786
8787                 WARN_ON_ONCE(time_after(jiffies, timeout));
8788         } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
8789
8790         init_completion(&exit.completion);
8791         init_task_work(&exit.task_work, io_tctx_exit_cb);
8792         exit.ctx = ctx;
8793         /*
8794          * Some may use context even when all refs and requests have been put,
8795          * and they are free to do so while still holding uring_lock or
8796          * completion_lock, see __io_req_task_submit(). Apart from other work,
8797          * this lock/unlock section also waits them to finish.
8798          */
8799         mutex_lock(&ctx->uring_lock);
8800         while (!list_empty(&ctx->tctx_list)) {
8801                 WARN_ON_ONCE(time_after(jiffies, timeout));
8802
8803                 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
8804                                         ctx_node);
8805                 /* don't spin on a single task if cancellation failed */
8806                 list_rotate_left(&ctx->tctx_list);
8807                 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
8808                 if (WARN_ON_ONCE(ret))
8809                         continue;
8810                 wake_up_process(node->task);
8811
8812                 mutex_unlock(&ctx->uring_lock);
8813                 wait_for_completion(&exit.completion);
8814                 mutex_lock(&ctx->uring_lock);
8815         }
8816         mutex_unlock(&ctx->uring_lock);
8817         spin_lock_irq(&ctx->completion_lock);
8818         spin_unlock_irq(&ctx->completion_lock);
8819
8820         io_ring_ctx_free(ctx);
8821 }
8822
8823 /* Returns true if we found and killed one or more timeouts */
8824 static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
8825                              bool cancel_all)
8826 {
8827         struct io_kiocb *req, *tmp;
8828         int canceled = 0;
8829
8830         spin_lock_irq(&ctx->completion_lock);
8831         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
8832                 if (io_match_task(req, tsk, cancel_all)) {
8833                         io_kill_timeout(req, -ECANCELED);
8834                         canceled++;
8835                 }
8836         }
8837         if (canceled != 0)
8838                 io_commit_cqring(ctx);
8839         spin_unlock_irq(&ctx->completion_lock);
8840         if (canceled != 0)
8841                 io_cqring_ev_posted(ctx);
8842         return canceled != 0;
8843 }
8844
8845 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8846 {
8847         unsigned long index;
8848         struct creds *creds;
8849
8850         mutex_lock(&ctx->uring_lock);
8851         percpu_ref_kill(&ctx->refs);
8852         if (ctx->rings)
8853                 __io_cqring_overflow_flush(ctx, true);
8854         xa_for_each(&ctx->personalities, index, creds)
8855                 io_unregister_personality(ctx, index);
8856         mutex_unlock(&ctx->uring_lock);
8857
8858         io_kill_timeouts(ctx, NULL, true);
8859         io_poll_remove_all(ctx, NULL, true);
8860
8861         /* if we failed setting up the ctx, we might not have any rings */
8862         io_iopoll_try_reap_events(ctx);
8863
8864         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
8865         /*
8866          * Use system_unbound_wq to avoid spawning tons of event kworkers
8867          * if we're exiting a ton of rings at the same time. It just adds
8868          * noise and overhead, there's no discernable change in runtime
8869          * over using system_wq.
8870          */
8871         queue_work(system_unbound_wq, &ctx->exit_work);
8872 }
8873
8874 static int io_uring_release(struct inode *inode, struct file *file)
8875 {
8876         struct io_ring_ctx *ctx = file->private_data;
8877
8878         file->private_data = NULL;
8879         io_ring_ctx_wait_and_kill(ctx);
8880         return 0;
8881 }
8882
8883 struct io_task_cancel {
8884         struct task_struct *task;
8885         bool all;
8886 };
8887
8888 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
8889 {
8890         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8891         struct io_task_cancel *cancel = data;
8892         bool ret;
8893
8894         if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) {
8895                 unsigned long flags;
8896                 struct io_ring_ctx *ctx = req->ctx;
8897
8898                 /* protect against races with linked timeouts */
8899                 spin_lock_irqsave(&ctx->completion_lock, flags);
8900                 ret = io_match_task(req, cancel->task, cancel->all);
8901                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
8902         } else {
8903                 ret = io_match_task(req, cancel->task, cancel->all);
8904         }
8905         return ret;
8906 }
8907
8908 static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
8909                                   struct task_struct *task, bool cancel_all)
8910 {
8911         struct io_defer_entry *de;
8912         LIST_HEAD(list);
8913
8914         spin_lock_irq(&ctx->completion_lock);
8915         list_for_each_entry_reverse(de, &ctx->defer_list, list) {
8916                 if (io_match_task(de->req, task, cancel_all)) {
8917                         list_cut_position(&list, &ctx->defer_list, &de->list);
8918                         break;
8919                 }
8920         }
8921         spin_unlock_irq(&ctx->completion_lock);
8922         if (list_empty(&list))
8923                 return false;
8924
8925         while (!list_empty(&list)) {
8926                 de = list_first_entry(&list, struct io_defer_entry, list);
8927                 list_del_init(&de->list);
8928                 io_req_complete_failed(de->req, -ECANCELED);
8929                 kfree(de);
8930         }
8931         return true;
8932 }
8933
8934 static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
8935 {
8936         struct io_tctx_node *node;
8937         enum io_wq_cancel cret;
8938         bool ret = false;
8939
8940         mutex_lock(&ctx->uring_lock);
8941         list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
8942                 struct io_uring_task *tctx = node->task->io_uring;
8943
8944                 /*
8945                  * io_wq will stay alive while we hold uring_lock, because it's
8946                  * killed after ctx nodes, which requires to take the lock.
8947                  */
8948                 if (!tctx || !tctx->io_wq)
8949                         continue;
8950                 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
8951                 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8952         }
8953         mutex_unlock(&ctx->uring_lock);
8954
8955         return ret;
8956 }
8957
8958 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
8959                                          struct task_struct *task,
8960                                          bool cancel_all)
8961 {
8962         struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
8963         struct io_uring_task *tctx = task ? task->io_uring : NULL;
8964
8965         while (1) {
8966                 enum io_wq_cancel cret;
8967                 bool ret = false;
8968
8969                 if (!task) {
8970                         ret |= io_uring_try_cancel_iowq(ctx);
8971                 } else if (tctx && tctx->io_wq) {
8972                         /*
8973                          * Cancels requests of all rings, not only @ctx, but
8974                          * it's fine as the task is in exit/exec.
8975                          */
8976                         cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
8977                                                &cancel, true);
8978                         ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
8979                 }
8980
8981                 /* SQPOLL thread does its own polling */
8982                 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
8983                     (ctx->sq_data && ctx->sq_data->thread == current)) {
8984                         while (!list_empty_careful(&ctx->iopoll_list)) {
8985                                 io_iopoll_try_reap_events(ctx);
8986                                 ret = true;
8987                         }
8988                 }
8989
8990                 ret |= io_cancel_defer_files(ctx, task, cancel_all);
8991                 ret |= io_poll_remove_all(ctx, task, cancel_all);
8992                 ret |= io_kill_timeouts(ctx, task, cancel_all);
8993                 ret |= io_run_task_work();
8994                 ret |= io_run_ctx_fallback(ctx);
8995                 if (!ret)
8996                         break;
8997                 cond_resched();
8998         }
8999 }
9000
9001 static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9002 {
9003         struct io_uring_task *tctx = current->io_uring;
9004         struct io_tctx_node *node;
9005         int ret;
9006
9007         if (unlikely(!tctx)) {
9008                 ret = io_uring_alloc_task_context(current, ctx);
9009                 if (unlikely(ret))
9010                         return ret;
9011                 tctx = current->io_uring;
9012         }
9013         if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
9014                 node = kmalloc(sizeof(*node), GFP_KERNEL);
9015                 if (!node)
9016                         return -ENOMEM;
9017                 node->ctx = ctx;
9018                 node->task = current;
9019
9020                 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
9021                                         node, GFP_KERNEL));
9022                 if (ret) {
9023                         kfree(node);
9024                         return ret;
9025                 }
9026
9027                 mutex_lock(&ctx->uring_lock);
9028                 list_add(&node->ctx_node, &ctx->tctx_list);
9029                 mutex_unlock(&ctx->uring_lock);
9030         }
9031         tctx->last = ctx;
9032         return 0;
9033 }
9034
9035 /*
9036  * Note that this task has used io_uring. We use it for cancelation purposes.
9037  */
9038 static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9039 {
9040         struct io_uring_task *tctx = current->io_uring;
9041
9042         if (likely(tctx && tctx->last == ctx))
9043                 return 0;
9044         return __io_uring_add_tctx_node(ctx);
9045 }
9046
9047 /*
9048  * Remove this io_uring_file -> task mapping.
9049  */
9050 static void io_uring_del_tctx_node(unsigned long index)
9051 {
9052         struct io_uring_task *tctx = current->io_uring;
9053         struct io_tctx_node *node;
9054
9055         if (!tctx)
9056                 return;
9057         node = xa_erase(&tctx->xa, index);
9058         if (!node)
9059                 return;
9060
9061         WARN_ON_ONCE(current != node->task);
9062         WARN_ON_ONCE(list_empty(&node->ctx_node));
9063
9064         mutex_lock(&node->ctx->uring_lock);
9065         list_del(&node->ctx_node);
9066         mutex_unlock(&node->ctx->uring_lock);
9067
9068         if (tctx->last == node->ctx)
9069                 tctx->last = NULL;
9070         kfree(node);
9071 }
9072
9073 static void io_uring_clean_tctx(struct io_uring_task *tctx)
9074 {
9075         struct io_wq *wq = tctx->io_wq;
9076         struct io_tctx_node *node;
9077         unsigned long index;
9078
9079         xa_for_each(&tctx->xa, index, node)
9080                 io_uring_del_tctx_node(index);
9081         if (wq) {
9082                 /*
9083                  * Must be after io_uring_del_task_file() (removes nodes under
9084                  * uring_lock) to avoid race with io_uring_try_cancel_iowq().
9085                  */
9086                 tctx->io_wq = NULL;
9087                 io_wq_put_and_exit(wq);
9088         }
9089 }
9090
9091 static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
9092 {
9093         if (tracked)
9094                 return atomic_read(&tctx->inflight_tracked);
9095         return percpu_counter_sum(&tctx->inflight);
9096 }
9097
9098 static void io_uring_try_cancel(bool cancel_all)
9099 {
9100         struct io_uring_task *tctx = current->io_uring;
9101         struct io_tctx_node *node;
9102         unsigned long index;
9103
9104         xa_for_each(&tctx->xa, index, node) {
9105                 struct io_ring_ctx *ctx = node->ctx;
9106
9107                 /* sqpoll task will cancel all its requests */
9108                 if (!ctx->sq_data)
9109                         io_uring_try_cancel_requests(ctx, current, cancel_all);
9110         }
9111 }
9112
9113 /* should only be called by SQPOLL task */
9114 static void io_uring_cancel_sqpoll(struct io_sq_data *sqd)
9115 {
9116         struct io_uring_task *tctx = current->io_uring;
9117         struct io_ring_ctx *ctx;
9118         s64 inflight;
9119         DEFINE_WAIT(wait);
9120
9121         if (!current->io_uring)
9122                 return;
9123         if (tctx->io_wq)
9124                 io_wq_exit_start(tctx->io_wq);
9125
9126         WARN_ON_ONCE(!sqd || sqd->thread != current);
9127
9128         atomic_inc(&tctx->in_idle);
9129         do {
9130                 /* read completions before cancelations */
9131                 inflight = tctx_inflight(tctx, false);
9132                 if (!inflight)
9133                         break;
9134                 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
9135                         io_uring_try_cancel_requests(ctx, current, true);
9136
9137                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
9138                 /*
9139                  * If we've seen completions, retry without waiting. This
9140                  * avoids a race where a completion comes in before we did
9141                  * prepare_to_wait().
9142                  */
9143                 if (inflight == tctx_inflight(tctx, false))
9144                         schedule();
9145                 finish_wait(&tctx->wait, &wait);
9146         } while (1);
9147         atomic_dec(&tctx->in_idle);
9148 }
9149
9150 /*
9151  * Find any io_uring fd that this task has registered or done IO on, and cancel
9152  * requests.
9153  */
9154 void __io_uring_cancel(struct files_struct *files)
9155 {
9156         struct io_uring_task *tctx = current->io_uring;
9157         DEFINE_WAIT(wait);
9158         s64 inflight;
9159         bool cancel_all = !files;
9160
9161         if (tctx->io_wq)
9162                 io_wq_exit_start(tctx->io_wq);
9163
9164         /* make sure overflow events are dropped */
9165         atomic_inc(&tctx->in_idle);
9166         do {
9167                 /* read completions before cancelations */
9168                 inflight = tctx_inflight(tctx, !cancel_all);
9169                 if (!inflight)
9170                         break;
9171                 io_uring_try_cancel(cancel_all);
9172                 prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
9173
9174                 /*
9175                  * If we've seen completions, retry without waiting. This
9176                  * avoids a race where a completion comes in before we did
9177                  * prepare_to_wait().
9178                  */
9179                 if (inflight == tctx_inflight(tctx, !cancel_all))
9180                         schedule();
9181                 finish_wait(&tctx->wait, &wait);
9182         } while (1);
9183         atomic_dec(&tctx->in_idle);
9184
9185         io_uring_clean_tctx(tctx);
9186         if (cancel_all) {
9187                 /* for exec all current's requests should be gone, kill tctx */
9188                 __io_uring_free(current);
9189         }
9190 }
9191
9192 static void *io_uring_validate_mmap_request(struct file *file,
9193                                             loff_t pgoff, size_t sz)
9194 {
9195         struct io_ring_ctx *ctx = file->private_data;
9196         loff_t offset = pgoff << PAGE_SHIFT;
9197         struct page *page;
9198         void *ptr;
9199
9200         switch (offset) {
9201         case IORING_OFF_SQ_RING:
9202         case IORING_OFF_CQ_RING:
9203                 ptr = ctx->rings;
9204                 break;
9205         case IORING_OFF_SQES:
9206                 ptr = ctx->sq_sqes;
9207                 break;
9208         default:
9209                 return ERR_PTR(-EINVAL);
9210         }
9211
9212         page = virt_to_head_page(ptr);
9213         if (sz > page_size(page))
9214                 return ERR_PTR(-EINVAL);
9215
9216         return ptr;
9217 }
9218
9219 #ifdef CONFIG_MMU
9220
9221 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9222 {
9223         size_t sz = vma->vm_end - vma->vm_start;
9224         unsigned long pfn;
9225         void *ptr;
9226
9227         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9228         if (IS_ERR(ptr))
9229                 return PTR_ERR(ptr);
9230
9231         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9232         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9233 }
9234
9235 #else /* !CONFIG_MMU */
9236
9237 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9238 {
9239         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9240 }
9241
9242 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9243 {
9244         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9245 }
9246
/*
 * get_unmapped_area handler for !CONFIG_MMU builds: returns the kernel
 * address of the region selected by @pgoff, or the validation error cast to
 * unsigned long. @addr and @flags are not used.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *region = io_uring_validate_mmap_request(file, pgoff, len);

	if (IS_ERR(region))
		return PTR_ERR(region);

	return (unsigned long) region;
}
9259
9260 #endif /* !CONFIG_MMU */
9261
9262 static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
9263 {
9264         DEFINE_WAIT(wait);
9265
9266         do {
9267                 if (!io_sqring_full(ctx))
9268                         break;
9269                 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9270
9271                 if (!io_sqring_full(ctx))
9272                         break;
9273                 schedule();
9274         } while (!signal_pending(current));
9275
9276         finish_wait(&ctx->sqo_sq_wait, &wait);
9277         return 0;
9278 }
9279
9280 static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9281                           struct __kernel_timespec __user **ts,
9282                           const sigset_t __user **sig)
9283 {
9284         struct io_uring_getevents_arg arg;
9285
9286         /*
9287          * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9288          * is just a pointer to the sigset_t.
9289          */
9290         if (!(flags & IORING_ENTER_EXT_ARG)) {
9291                 *sig = (const sigset_t __user *) argp;
9292                 *ts = NULL;
9293                 return 0;
9294         }
9295
9296         /*
9297          * EXT_ARG is set - ensure we agree on the size of it and copy in our
9298          * timespec and sigset_t pointers if good.
9299          */
9300         if (*argsz != sizeof(arg))
9301                 return -EINVAL;
9302         if (copy_from_user(&arg, argp, sizeof(arg)))
9303                 return -EFAULT;
9304         *sig = u64_to_user_ptr(arg.sigmask);
9305         *argsz = arg.sigmask_sz;
9306         *ts = u64_to_user_ptr(arg.ts);
9307         return 0;
9308 }
9309
9310 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
9311                 u32, min_complete, u32, flags, const void __user *, argp,
9312                 size_t, argsz)
9313 {
9314         struct io_ring_ctx *ctx;
9315         int submitted = 0;
9316         struct fd f;
9317         long ret;
9318
9319         io_run_task_work();
9320
9321         if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
9322                                IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
9323                 return -EINVAL;
9324
9325         f = fdget(fd);
9326         if (unlikely(!f.file))
9327                 return -EBADF;
9328
9329         ret = -EOPNOTSUPP;
9330         if (unlikely(f.file->f_op != &io_uring_fops))
9331                 goto out_fput;
9332
9333         ret = -ENXIO;
9334         ctx = f.file->private_data;
9335         if (unlikely(!percpu_ref_tryget(&ctx->refs)))
9336                 goto out_fput;
9337
9338         ret = -EBADFD;
9339         if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
9340                 goto out;
9341
9342         /*
9343          * For SQ polling, the thread will do all submissions and completions.
9344          * Just return the requested submit count, and wake the thread if
9345          * we were asked to.
9346          */
9347         ret = 0;
9348         if (ctx->flags & IORING_SETUP_SQPOLL) {
9349                 io_cqring_overflow_flush(ctx, false);
9350
9351                 ret = -EOWNERDEAD;
9352                 if (unlikely(ctx->sq_data->thread == NULL)) {
9353                         goto out;
9354                 }
9355                 if (flags & IORING_ENTER_SQ_WAKEUP)
9356                         wake_up(&ctx->sq_data->wait);
9357                 if (flags & IORING_ENTER_SQ_WAIT) {
9358                         ret = io_sqpoll_wait_sq(ctx);
9359                         if (ret)
9360                                 goto out;
9361                 }
9362                 submitted = to_submit;
9363         } else if (to_submit) {
9364                 ret = io_uring_add_tctx_node(ctx);
9365                 if (unlikely(ret))
9366                         goto out;
9367                 mutex_lock(&ctx->uring_lock);
9368                 submitted = io_submit_sqes(ctx, to_submit);
9369                 mutex_unlock(&ctx->uring_lock);
9370
9371                 if (submitted != to_submit)
9372                         goto out;
9373         }
9374         if (flags & IORING_ENTER_GETEVENTS) {
9375                 const sigset_t __user *sig;
9376                 struct __kernel_timespec __user *ts;
9377
9378                 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
9379                 if (unlikely(ret))
9380                         goto out;
9381
9382                 min_complete = min(min_complete, ctx->cq_entries);
9383
9384                 /*
9385                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
9386                  * space applications don't need to do io completion events
9387                  * polling again, they can rely on io_sq_thread to do polling
9388                  * work, which can reduce cpu usage and uring_lock contention.
9389                  */
9390                 if (ctx->flags & IORING_SETUP_IOPOLL &&
9391                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
9392                         ret = io_iopoll_check(ctx, min_complete);
9393                 } else {
9394                         ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
9395                 }
9396         }
9397
9398 out:
9399         percpu_ref_put(&ctx->refs);
9400 out_fput:
9401         fdput(f);
9402         return submitted ? submitted : ret;
9403 }
9404
9405 #ifdef CONFIG_PROC_FS
9406 static int io_uring_show_cred(struct seq_file *m, unsigned int id,
9407                 const struct cred *cred)
9408 {
9409         struct user_namespace *uns = seq_user_ns(m);
9410         struct group_info *gi;
9411         kernel_cap_t cap;
9412         unsigned __capi;
9413         int g;
9414
9415         seq_printf(m, "%5d\n", id);
9416         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9417         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9418         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9419         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9420         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9421         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9422         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9423         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9424         seq_puts(m, "\n\tGroups:\t");
9425         gi = cred->group_info;
9426         for (g = 0; g < gi->ngroups; g++) {
9427                 seq_put_decimal_ull(m, g ? " " : "",
9428                                         from_kgid_munged(uns, gi->gid[g]));
9429         }
9430         seq_puts(m, "\n\tCapEff:\t");
9431         cap = cred->cap_effective;
9432         CAP_FOR_EACH_U32(__capi)
9433                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9434         seq_putc(m, '\n');
9435         return 0;
9436 }
9437
9438 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9439 {
9440         struct io_sq_data *sq = NULL;
9441         bool has_lock;
9442         int i;
9443
9444         /*
9445          * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9446          * since fdinfo case grabs it in the opposite direction of normal use
9447          * cases. If we fail to get the lock, we just don't iterate any
9448          * structures that could be going away outside the io_uring mutex.
9449          */
9450         has_lock = mutex_trylock(&ctx->uring_lock);
9451
9452         if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
9453                 sq = ctx->sq_data;
9454                 if (!sq->thread)
9455                         sq = NULL;
9456         }
9457
9458         seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9459         seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
9460         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
9461         for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
9462                 struct file *f = io_file_from_index(ctx, i);
9463
9464                 if (f)
9465                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9466                 else
9467                         seq_printf(m, "%5u: <none>\n", i);
9468         }
9469         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
9470         for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
9471                 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
9472                 unsigned int len = buf->ubuf_end - buf->ubuf;
9473
9474                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
9475         }
9476         if (has_lock && !xa_empty(&ctx->personalities)) {
9477                 unsigned long index;
9478                 const struct cred *cred;
9479
9480                 seq_printf(m, "Personalities:\n");
9481                 xa_for_each(&ctx->personalities, index, cred)
9482                         io_uring_show_cred(m, index, cred);
9483         }
9484         seq_printf(m, "PollList:\n");
9485         spin_lock_irq(&ctx->completion_lock);
9486         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9487                 struct hlist_head *list = &ctx->cancel_hash[i];
9488                 struct io_kiocb *req;
9489
9490                 hlist_for_each_entry(req, list, hash_node)
9491                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
9492                                         req->task->task_works != NULL);
9493         }
9494         spin_unlock_irq(&ctx->completion_lock);
9495         if (has_lock)
9496                 mutex_unlock(&ctx->uring_lock);
9497 }
9498
9499 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9500 {
9501         struct io_ring_ctx *ctx = f->private_data;
9502
9503         if (percpu_ref_tryget(&ctx->refs)) {
9504                 __io_uring_show_fdinfo(ctx, m);
9505                 percpu_ref_put(&ctx->refs);
9506         }
9507 }
9508 #endif
9509
9510 static const struct file_operations io_uring_fops = {
9511         .release        = io_uring_release,
9512         .mmap           = io_uring_mmap,
9513 #ifndef CONFIG_MMU
9514         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9515         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9516 #endif
9517         .poll           = io_uring_poll,
9518         .fasync         = io_uring_fasync,
9519 #ifdef CONFIG_PROC_FS
9520         .show_fdinfo    = io_uring_show_fdinfo,
9521 #endif
9522 };
9523
9524 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9525                                   struct io_uring_params *p)
9526 {
9527         struct io_rings *rings;
9528         size_t size, sq_array_offset;
9529
9530         /* make sure these are sane, as we already accounted them */
9531         ctx->sq_entries = p->sq_entries;
9532         ctx->cq_entries = p->cq_entries;
9533
9534         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9535         if (size == SIZE_MAX)
9536                 return -EOVERFLOW;
9537
9538         rings = io_mem_alloc(size);
9539         if (!rings)
9540                 return -ENOMEM;
9541
9542         ctx->rings = rings;
9543         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9544         rings->sq_ring_mask = p->sq_entries - 1;
9545         rings->cq_ring_mask = p->cq_entries - 1;
9546         rings->sq_ring_entries = p->sq_entries;
9547         rings->cq_ring_entries = p->cq_entries;
9548
9549         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9550         if (size == SIZE_MAX) {
9551                 io_mem_free(ctx->rings);
9552                 ctx->rings = NULL;
9553                 return -EOVERFLOW;
9554         }
9555
9556         ctx->sq_sqes = io_mem_alloc(size);
9557         if (!ctx->sq_sqes) {
9558                 io_mem_free(ctx->rings);
9559                 ctx->rings = NULL;
9560                 return -ENOMEM;
9561         }
9562
9563         return 0;
9564 }
9565
9566 static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
9567 {
9568         int ret, fd;
9569
9570         fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9571         if (fd < 0)
9572                 return fd;
9573
9574         ret = io_uring_add_tctx_node(ctx);
9575         if (ret) {
9576                 put_unused_fd(fd);
9577                 return ret;
9578         }
9579         fd_install(fd, file);
9580         return fd;
9581 }
9582
9583 /*
9584  * Allocate an anonymous fd, this is what constitutes the application
9585  * visible backing of an io_uring instance. The application mmaps this
9586  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9587  * we have to tie this fd to a socket for file garbage collection purposes.
9588  */
9589 static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
9590 {
9591         struct file *file;
9592 #if defined(CONFIG_UNIX)
9593         int ret;
9594
9595         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9596                                 &ctx->ring_sock);
9597         if (ret)
9598                 return ERR_PTR(ret);
9599 #endif
9600
9601         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9602                                         O_RDWR | O_CLOEXEC);
9603 #if defined(CONFIG_UNIX)
9604         if (IS_ERR(file)) {
9605                 sock_release(ctx->ring_sock);
9606                 ctx->ring_sock = NULL;
9607         } else {
9608                 ctx->ring_sock->file = file;
9609         }
9610 #endif
9611         return file;
9612 }
9613
9614 static int io_uring_create(unsigned entries, struct io_uring_params *p,
9615                            struct io_uring_params __user *params)
9616 {
9617         struct io_ring_ctx *ctx;
9618         struct file *file;
9619         int ret;
9620
9621         if (!entries)
9622                 return -EINVAL;
9623         if (entries > IORING_MAX_ENTRIES) {
9624                 if (!(p->flags & IORING_SETUP_CLAMP))
9625                         return -EINVAL;
9626                 entries = IORING_MAX_ENTRIES;
9627         }
9628
9629         /*
9630          * Use twice as many entries for the CQ ring. It's possible for the
9631          * application to drive a higher depth than the size of the SQ ring,
9632          * since the sqes are only used at submission time. This allows for
9633          * some flexibility in overcommitting a bit. If the application has
9634          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9635          * of CQ ring entries manually.
9636          */
9637         p->sq_entries = roundup_pow_of_two(entries);
9638         if (p->flags & IORING_SETUP_CQSIZE) {
9639                 /*
9640                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
9641                  * to a power-of-two, if it isn't already. We do NOT impose
9642                  * any cq vs sq ring sizing.
9643                  */
9644                 if (!p->cq_entries)
9645                         return -EINVAL;
9646                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9647                         if (!(p->flags & IORING_SETUP_CLAMP))
9648                                 return -EINVAL;
9649                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
9650                 }
9651                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
9652                 if (p->cq_entries < p->sq_entries)
9653                         return -EINVAL;
9654         } else {
9655                 p->cq_entries = 2 * p->sq_entries;
9656         }
9657
9658         ctx = io_ring_ctx_alloc(p);
9659         if (!ctx)
9660                 return -ENOMEM;
9661         ctx->compat = in_compat_syscall();
9662         if (!capable(CAP_IPC_LOCK))
9663                 ctx->user = get_uid(current_user());
9664
9665         /*
9666          * This is just grabbed for accounting purposes. When a process exits,
9667          * the mm is exited and dropped before the files, hence we need to hang
9668          * on to this mm purely for the purposes of being able to unaccount
9669          * memory (locked/pinned vm). It's not used for anything else.
9670          */
9671         mmgrab(current->mm);
9672         ctx->mm_account = current->mm;
9673
9674         ret = io_allocate_scq_urings(ctx, p);
9675         if (ret)
9676                 goto err;
9677
9678         ret = io_sq_offload_create(ctx, p);
9679         if (ret)
9680                 goto err;
9681         /* always set a rsrc node */
9682         ret = io_rsrc_node_switch_start(ctx);
9683         if (ret)
9684                 goto err;
9685         io_rsrc_node_switch(ctx, NULL);
9686
9687         memset(&p->sq_off, 0, sizeof(p->sq_off));
9688         p->sq_off.head = offsetof(struct io_rings, sq.head);
9689         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9690         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9691         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9692         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9693         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9694         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
9695
9696         memset(&p->cq_off, 0, sizeof(p->cq_off));
9697         p->cq_off.head = offsetof(struct io_rings, cq.head);
9698         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9699         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9700         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9701         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9702         p->cq_off.cqes = offsetof(struct io_rings, cqes);
9703         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
9704
9705         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9706                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
9707                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
9708                         IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
9709                         IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
9710                         IORING_FEAT_RSRC_TAGS;
9711
9712         if (copy_to_user(params, p, sizeof(*p))) {
9713                 ret = -EFAULT;
9714                 goto err;
9715         }
9716
9717         file = io_uring_get_file(ctx);
9718         if (IS_ERR(file)) {
9719                 ret = PTR_ERR(file);
9720                 goto err;
9721         }
9722
9723         /*
9724          * Install ring fd as the very last thing, so we don't risk someone
9725          * having closed it before we finish setup
9726          */
9727         ret = io_uring_install_fd(ctx, file);
9728         if (ret < 0) {
9729                 /* fput will clean it up */
9730                 fput(file);
9731                 return ret;
9732         }
9733
9734         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
9735         return ret;
9736 err:
9737         io_ring_ctx_wait_and_kill(ctx);
9738         return ret;
9739 }
9740
9741 /*
9742  * Sets up an aio uring context, and returns the fd. Applications asks for a
9743  * ring size, we return the actual sq/cq ring sizes (among other things) in the
9744  * params structure passed in.
9745  */
9746 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9747 {
9748         struct io_uring_params p;
9749         int i;
9750
9751         if (copy_from_user(&p, params, sizeof(p)))
9752                 return -EFAULT;
9753         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9754                 if (p.resv[i])
9755                         return -EINVAL;
9756         }
9757
9758         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
9759                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9760                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9761                         IORING_SETUP_R_DISABLED))
9762                 return -EINVAL;
9763
9764         return  io_uring_create(entries, &p, params);
9765 }
9766
9767 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9768                 struct io_uring_params __user *, params)
9769 {
9770         return io_uring_setup(entries, params);
9771 }
9772
9773 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9774 {
9775         struct io_uring_probe *p;
9776         size_t size;
9777         int i, ret;
9778
9779         size = struct_size(p, ops, nr_args);
9780         if (size == SIZE_MAX)
9781                 return -EOVERFLOW;
9782         p = kzalloc(size, GFP_KERNEL);
9783         if (!p)
9784                 return -ENOMEM;
9785
9786         ret = -EFAULT;
9787         if (copy_from_user(p, arg, size))
9788                 goto out;
9789         ret = -EINVAL;
9790         if (memchr_inv(p, 0, size))
9791                 goto out;
9792
9793         p->last_op = IORING_OP_LAST - 1;
9794         if (nr_args > IORING_OP_LAST)
9795                 nr_args = IORING_OP_LAST;
9796
9797         for (i = 0; i < nr_args; i++) {
9798                 p->ops[i].op = i;
9799                 if (!io_op_defs[i].not_supported)
9800                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
9801         }
9802         p->ops_len = i;
9803
9804         ret = 0;
9805         if (copy_to_user(arg, p, size))
9806                 ret = -EFAULT;
9807 out:
9808         kfree(p);
9809         return ret;
9810 }
9811
9812 static int io_register_personality(struct io_ring_ctx *ctx)
9813 {
9814         const struct cred *creds;
9815         u32 id;
9816         int ret;
9817
9818         creds = get_current_cred();
9819
9820         ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
9821                         XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
9822         if (!ret)
9823                 return id;
9824         put_cred(creds);
9825         return ret;
9826 }
9827
9828 static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9829                                     unsigned int nr_args)
9830 {
9831         struct io_uring_restriction *res;
9832         size_t size;
9833         int i, ret;
9834
9835         /* Restrictions allowed only if rings started disabled */
9836         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9837                 return -EBADFD;
9838
9839         /* We allow only a single restrictions registration */
9840         if (ctx->restrictions.registered)
9841                 return -EBUSY;
9842
9843         if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9844                 return -EINVAL;
9845
9846         size = array_size(nr_args, sizeof(*res));
9847         if (size == SIZE_MAX)
9848                 return -EOVERFLOW;
9849
9850         res = memdup_user(arg, size);
9851         if (IS_ERR(res))
9852                 return PTR_ERR(res);
9853
9854         ret = 0;
9855
9856         for (i = 0; i < nr_args; i++) {
9857                 switch (res[i].opcode) {
9858                 case IORING_RESTRICTION_REGISTER_OP:
9859                         if (res[i].register_op >= IORING_REGISTER_LAST) {
9860                                 ret = -EINVAL;
9861                                 goto out;
9862                         }
9863
9864                         __set_bit(res[i].register_op,
9865                                   ctx->restrictions.register_op);
9866                         break;
9867                 case IORING_RESTRICTION_SQE_OP:
9868                         if (res[i].sqe_op >= IORING_OP_LAST) {
9869                                 ret = -EINVAL;
9870                                 goto out;
9871                         }
9872
9873                         __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
9874                         break;
9875                 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
9876                         ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
9877                         break;
9878                 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
9879                         ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
9880                         break;
9881                 default:
9882                         ret = -EINVAL;
9883                         goto out;
9884                 }
9885         }
9886
9887 out:
9888         /* Reset all restrictions if an error happened */
9889         if (ret != 0)
9890                 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
9891         else
9892                 ctx->restrictions.registered = true;
9893
9894         kfree(res);
9895         return ret;
9896 }
9897
9898 static int io_register_enable_rings(struct io_ring_ctx *ctx)
9899 {
9900         if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9901                 return -EBADFD;
9902
9903         if (ctx->restrictions.registered)
9904                 ctx->restricted = 1;
9905
9906         ctx->flags &= ~IORING_SETUP_R_DISABLED;
9907         if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
9908                 wake_up(&ctx->sq_data->wait);
9909         return 0;
9910 }
9911
9912 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
9913                                      struct io_uring_rsrc_update2 *up,
9914                                      unsigned nr_args)
9915 {
9916         __u32 tmp;
9917         int err;
9918
9919         if (up->resv)
9920                 return -EINVAL;
9921         if (check_add_overflow(up->offset, nr_args, &tmp))
9922                 return -EOVERFLOW;
9923         err = io_rsrc_node_switch_start(ctx);
9924         if (err)
9925                 return err;
9926
9927         switch (type) {
9928         case IORING_RSRC_FILE:
9929                 return __io_sqe_files_update(ctx, up, nr_args);
9930         case IORING_RSRC_BUFFER:
9931                 return __io_sqe_buffers_update(ctx, up, nr_args);
9932         }
9933         return -EINVAL;
9934 }
9935
9936 static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
9937                                     unsigned nr_args)
9938 {
9939         struct io_uring_rsrc_update2 up;
9940
9941         if (!nr_args)
9942                 return -EINVAL;
9943         memset(&up, 0, sizeof(up));
9944         if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
9945                 return -EFAULT;
9946         return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
9947 }
9948
9949 static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
9950                                    unsigned size, unsigned type)
9951 {
9952         struct io_uring_rsrc_update2 up;
9953
9954         if (size != sizeof(up))
9955                 return -EINVAL;
9956         if (copy_from_user(&up, arg, sizeof(up)))
9957                 return -EFAULT;
9958         if (!up.nr || up.resv)
9959                 return -EINVAL;
9960         return __io_register_rsrc_update(ctx, type, &up, up.nr);
9961 }
9962
9963 static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
9964                             unsigned int size, unsigned int type)
9965 {
9966         struct io_uring_rsrc_register rr;
9967
9968         /* keep it extendible */
9969         if (size != sizeof(rr))
9970                 return -EINVAL;
9971
9972         memset(&rr, 0, sizeof(rr));
9973         if (copy_from_user(&rr, arg, size))
9974                 return -EFAULT;
9975         if (!rr.nr || rr.resv || rr.resv2)
9976                 return -EINVAL;
9977
9978         switch (type) {
9979         case IORING_RSRC_FILE:
9980                 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
9981                                              rr.nr, u64_to_user_ptr(rr.tags));
9982         case IORING_RSRC_BUFFER:
9983                 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
9984                                                rr.nr, u64_to_user_ptr(rr.tags));
9985         }
9986         return -EINVAL;
9987 }
9988
9989 static bool io_register_op_must_quiesce(int op)
9990 {
9991         switch (op) {
9992         case IORING_REGISTER_BUFFERS:
9993         case IORING_UNREGISTER_BUFFERS:
9994         case IORING_REGISTER_FILES:
9995         case IORING_UNREGISTER_FILES:
9996         case IORING_REGISTER_FILES_UPDATE:
9997         case IORING_REGISTER_PROBE:
9998         case IORING_REGISTER_PERSONALITY:
9999         case IORING_UNREGISTER_PERSONALITY:
10000         case IORING_REGISTER_FILES2:
10001         case IORING_REGISTER_FILES_UPDATE2:
10002         case IORING_REGISTER_BUFFERS2:
10003         case IORING_REGISTER_BUFFERS_UPDATE:
10004                 return false;
10005         default:
10006                 return true;
10007         }
10008 }
10009
10010 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10011                                void __user *arg, unsigned nr_args)
10012         __releases(ctx->uring_lock)
10013         __acquires(ctx->uring_lock)
10014 {
10015         int ret;
10016
10017         /*
10018          * We're inside the ring mutex, if the ref is already dying, then
10019          * someone else killed the ctx or is already going through
10020          * io_uring_register().
10021          */
10022         if (percpu_ref_is_dying(&ctx->refs))
10023                 return -ENXIO;
10024
10025         if (ctx->restricted) {
10026                 if (opcode >= IORING_REGISTER_LAST)
10027                         return -EINVAL;
10028                 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10029                 if (!test_bit(opcode, ctx->restrictions.register_op))
10030                         return -EACCES;
10031         }
10032
10033         if (io_register_op_must_quiesce(opcode)) {
10034                 percpu_ref_kill(&ctx->refs);
10035
10036                 /*
10037                  * Drop uring mutex before waiting for references to exit. If
10038                  * another thread is currently inside io_uring_enter() it might
10039                  * need to grab the uring_lock to make progress. If we hold it
10040                  * here across the drain wait, then we can deadlock. It's safe
10041                  * to drop the mutex here, since no new references will come in
10042                  * after we've killed the percpu ref.
10043                  */
10044                 mutex_unlock(&ctx->uring_lock);
10045                 do {
10046                         ret = wait_for_completion_interruptible(&ctx->ref_comp);
10047                         if (!ret)
10048                                 break;
10049                         ret = io_run_task_work_sig();
10050                         if (ret < 0)
10051                                 break;
10052                 } while (1);
10053                 mutex_lock(&ctx->uring_lock);
10054
10055                 if (ret) {
10056                         io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10057                         return ret;
10058                 }
10059         }
10060
10061         switch (opcode) {
10062         case IORING_REGISTER_BUFFERS:
10063                 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
10064                 break;
10065         case IORING_UNREGISTER_BUFFERS:
10066                 ret = -EINVAL;
10067                 if (arg || nr_args)
10068                         break;
10069                 ret = io_sqe_buffers_unregister(ctx);
10070                 break;
10071         case IORING_REGISTER_FILES:
10072                 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
10073                 break;
10074         case IORING_UNREGISTER_FILES:
10075                 ret = -EINVAL;
10076                 if (arg || nr_args)
10077                         break;
10078                 ret = io_sqe_files_unregister(ctx);
10079                 break;
10080         case IORING_REGISTER_FILES_UPDATE:
10081                 ret = io_register_files_update(ctx, arg, nr_args);
10082                 break;
10083         case IORING_REGISTER_EVENTFD:
10084         case IORING_REGISTER_EVENTFD_ASYNC:
10085                 ret = -EINVAL;
10086                 if (nr_args != 1)
10087                         break;
10088                 ret = io_eventfd_register(ctx, arg);
10089                 if (ret)
10090                         break;
10091                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10092                         ctx->eventfd_async = 1;
10093                 else
10094                         ctx->eventfd_async = 0;
10095                 break;
10096         case IORING_UNREGISTER_EVENTFD:
10097                 ret = -EINVAL;
10098                 if (arg || nr_args)
10099                         break;
10100                 ret = io_eventfd_unregister(ctx);
10101                 break;
10102         case IORING_REGISTER_PROBE:
10103                 ret = -EINVAL;
10104                 if (!arg || nr_args > 256)
10105                         break;
10106                 ret = io_probe(ctx, arg, nr_args);
10107                 break;
10108         case IORING_REGISTER_PERSONALITY:
10109                 ret = -EINVAL;
10110                 if (arg || nr_args)
10111                         break;
10112                 ret = io_register_personality(ctx);
10113                 break;
10114         case IORING_UNREGISTER_PERSONALITY:
10115                 ret = -EINVAL;
10116                 if (arg)
10117                         break;
10118                 ret = io_unregister_personality(ctx, nr_args);
10119                 break;
10120         case IORING_REGISTER_ENABLE_RINGS:
10121                 ret = -EINVAL;
10122                 if (arg || nr_args)
10123                         break;
10124                 ret = io_register_enable_rings(ctx);
10125                 break;
10126         case IORING_REGISTER_RESTRICTIONS:
10127                 ret = io_register_restrictions(ctx, arg, nr_args);
10128                 break;
10129         case IORING_REGISTER_FILES2:
10130                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
10131                 break;
10132         case IORING_REGISTER_FILES_UPDATE2:
10133                 ret = io_register_rsrc_update(ctx, arg, nr_args,
10134                                               IORING_RSRC_FILE);
10135                 break;
10136         case IORING_REGISTER_BUFFERS2:
10137                 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
10138                 break;
10139         case IORING_REGISTER_BUFFERS_UPDATE:
10140                 ret = io_register_rsrc_update(ctx, arg, nr_args,
10141                                               IORING_RSRC_BUFFER);
10142                 break;
10143         default:
10144                 ret = -EINVAL;
10145                 break;
10146         }
10147
10148         if (io_register_op_must_quiesce(opcode)) {
10149                 /* bring the ctx back to life */
10150                 percpu_ref_reinit(&ctx->refs);
10151                 reinit_completion(&ctx->ref_comp);
10152         }
10153         return ret;
10154 }
10155
10156 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
10157                 void __user *, arg, unsigned int, nr_args)
10158 {
10159         struct io_ring_ctx *ctx;
10160         long ret = -EBADF;
10161         struct fd f;
10162
10163         f = fdget(fd);
10164         if (!f.file)
10165                 return -EBADF;
10166
10167         ret = -EOPNOTSUPP;
10168         if (f.file->f_op != &io_uring_fops)
10169                 goto out_fput;
10170
10171         ctx = f.file->private_data;
10172
10173         io_run_task_work();
10174
10175         mutex_lock(&ctx->uring_lock);
10176         ret = __io_uring_register(ctx, opcode, arg, nr_args);
10177         mutex_unlock(&ctx->uring_lock);
10178         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
10179                                                         ctx->cq_ev_fd != NULL, ret);
10180 out_fput:
10181         fdput(f);
10182         return ret;
10183 }
10184
10185 static int __init io_uring_init(void)
10186 {
10187 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
10188         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
10189         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
10190 } while (0)
10191
10192 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
10193         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
10194         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
10195         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
10196         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
10197         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
10198         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
10199         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
10200         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
10201         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
10202         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
10203         BUILD_BUG_SQE_ELEM(24, __u32,  len);
10204         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
10205         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
10206         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
10207         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
10208         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
10209         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
10210         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
10211         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
10212         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
10213         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
10214         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
10215         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
10216         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
10217         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
10218         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
10219         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
10220         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
10221         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
10222         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
10223
10224         BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
10225                      sizeof(struct io_uring_rsrc_update));
10226         BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
10227                      sizeof(struct io_uring_rsrc_update2));
10228         /* should fit into one byte */
10229         BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
10230
10231         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
10232         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
10233         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
10234                                 SLAB_ACCOUNT);
10235         return 0;
10236 };
10237 __initcall(io_uring_init);