fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store to or load from
  35  * data shared between the kernel and application. This is done both for
  36  * ordering purposes and to ensure that once a value is loaded from data
  37  * that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/kthread.h>
  61 #include <linux/blkdev.h>
  62 #include <linux/bvec.h>
  63 #include <linux/net.h>
  64 #include <net/sock.h>
  65 #include <net/af_unix.h>
  66 #include <net/scm.h>
  67 #include <linux/anon_inodes.h>
  68 #include <linux/sched/mm.h>
  69 #include <linux/uaccess.h>
  70 #include <linux/nospec.h>
  71 #include <linux/sizes.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/highmem.h>
  74 #include <linux/namei.h>
  75 #include <linux/fsnotify.h>
  76 #include <linux/fadvise.h>
  77 #include <linux/eventpoll.h>
  78 #include <linux/fs_struct.h>
  79 #include <linux/splice.h>
  80 #include <linux/task_work.h>
  81 #include <linux/pagemap.h>
  82
  83 #define CREATE_TRACE_POINTS
  84 #include <trace/events/io_uring.h>
  85
  86 #include <uapi/linux/io_uring.h>
  87
  88 #include "internal.h"
  89 #include "io-wq.h"
  90
  /*
   * Entry-count limits. The CQ limit is defined as twice IORING_MAX_ENTRIES.
   * NOTE(review): presumably IORING_MAX_ENTRIES bounds the SQ ring size
   * requested at setup time; the enforcing code is not in this part of the
   * file -- confirm.
   */
  91 #define IORING_MAX_ENTRIES      32768
  92 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)  /* 65536 */
  93
  94 /*
  95  * Shift of 9 is 512 entries (512 * 8-byte pointers), or exactly one page on 64-bit archs
  96  */
  97 #define IORING_FILE_TABLE_SHIFT 9
  98 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)  /* 512 */
  99 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)  /* 0x1ff */
 100 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)  /* 64 * 512 = 32768 */
 101
 /*
  * Head/tail index pair for one ring. Each index is tagged
  * ____cacheline_aligned_in_smp, so head and tail are not meant to share a
  * cacheline on SMP builds. Embedded twice in struct io_rings (sq and cq).
  */
 102 struct io_uring {
 103         u32 head ____cacheline_aligned_in_smp;
 104         u32 tail ____cacheline_aligned_in_smp;
 105 };
 106
 107 /*
 108  * This data is shared with the application through the mmap at offsets
 109  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 110  *
 111  * The offsets to the member fields are published through struct
 112  * io_sqring_offsets when calling io_uring_setup.
 113  */
 114 struct io_rings {
 115         /*
 116          * Head and tail offsets into the ring; the offsets need to be
 117          * masked (sq_ring_mask / cq_ring_mask) to get valid indices.
 118          *
 119          * The kernel controls the head of the sq ring and the tail of the cq
 120          * ring, and the application controls the tail of the sq ring and the
 121          * head of the cq ring.
 122          */
 123         struct io_uring         sq, cq;
 124         /*
 125          * Bitmasks to apply to head and tail offsets (constant, equals
 126          * ring_entries - 1)
 127          */
 128         u32                     sq_ring_mask, cq_ring_mask;
 129         /* Ring sizes (constant, power of 2) */
 130         u32                     sq_ring_entries, cq_ring_entries;
 131         /*
 132          * Number of invalid entries dropped by the kernel due to
 133          * an invalid index stored in the SQ index array
 134          *
 135          * Written by the kernel, shouldn't be modified by the
 136          * application (i.e. get number of "new events" by comparing to
 137          * cached value).
 138          *
 139          * After a new SQ head value was read by the application this
 140          * counter includes all submissions that were dropped reaching
 141          * the new SQ head (and possibly more).
 142          */
 143         u32                     sq_dropped;
 144         /*
 145          * Runtime SQ flags
 146          *
 147          * Written by the kernel, shouldn't be modified by the
 148          * application.
 149          *
 150          * The application needs a full memory barrier before checking
 151          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 152          */
 153         u32                     sq_flags;
 154         /*
 155          * Runtime CQ flags
 156          *
 157          * Written by the application, shouldn't be modified by the
 158          * kernel.
 159          */
 160         u32                     cq_flags;
 161         /*
 162          * Number of completion events lost because the queue was full;
 163          * this should be avoided by the application by making sure
 164          * there are not more requests pending than there is space in
 165          * the completion queue.
 166          *
 167          * Written by the kernel, shouldn't be modified by the
 168          * application (i.e. get number of "new events" by comparing to
 169          * cached value).
 170          *
 171          * As completion events come in out of order this counter is not
 172          * ordered with any other data.
 173          */
 174         u32                     cq_overflow;
 175         /*
 176          * Ring buffer of completion events (flexible array member, so it
 177          * must stay the last field).
 178          *
 179          * The kernel writes completion events fresh every time they are
 180          * produced, so the application is allowed to modify pending entries.
 181          */
 182         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 183 };
 184
 /*
  * Descriptor for one fixed mapped user buffer; struct io_ring_ctx keeps
  * these behind ->user_bufs / ->nr_user_bufs.
  * NOTE(review): the per-field notes below are inferred from names and types
  * only; the registration code is outside this part of the file.
  */
 185 struct io_mapped_ubuf {
 186         u64             ubuf;  /* presumably the buffer's user address */
 187         size_t          len;  /* presumably its length in bytes */
 188         struct          bio_vec *bvec;  /* bio_vec array for the buffer */
 189         unsigned int    nr_bvecs;  /* presumably the entry count of ->bvec */
 190 };
 191
 /*
  * One table of struct file pointers; struct fixed_file_data points at these
  * through ->table.
  * NOTE(review): presumably each table holds IORING_MAX_FILES_TABLE slots,
  * selected with IORING_FILE_TABLE_SHIFT / IORING_FILE_TABLE_MASK -- confirm
  * against the lookup code.
  */
 192 struct fixed_file_table {
 193         struct file             **files;
 194 };
 195
 /*
  * Reference node belonging to a struct fixed_file_data (back-pointer in
  * ->file_data). Carries its own percpu_ref plus list and llist linkage.
  * NOTE(review): presumably ->node is linked on fixed_file_data->ref_list and
  * ->llist on io_ring_ctx->file_put_llist; confirm where nodes are queued.
  */
 196 struct fixed_file_ref_node {
 197         struct percpu_ref               refs;
 198         struct list_head                node;
 199         struct list_head                file_list;
 200         struct fixed_file_data          *file_data;
 201         struct llist_node               llist;
 202 };
 203
 /*
  * State of the fixed file set, reached through io_ring_ctx->file_data.
  * The comment on that field states the rule for ->refs: writers need it
  * dead, readers need it alive while a file pointer is in use.
  */
 204 struct fixed_file_data {
 205         struct fixed_file_table         *table;
 206         struct io_ring_ctx              *ctx;  /* back-pointer to the ring context */
 207
 208         struct percpu_ref               *cur_refs;  /* NOTE(review): presumably the current ref node's refs */
 209         struct percpu_ref               refs;
 210         struct completion               done;
 211         struct list_head                ref_list;
 212         spinlock_t                      lock;
 213 };
 214
 /*
  * Descriptor for one buffer: address, length and 16-bit buffer ID, plus list
  * linkage. struct io_sr_msg refers to one through ->kbuf.
  * NOTE(review): the addr/len/bid types match struct io_provide_buf, so these
  * are presumably the buffers supplied that way and found via
  * io_ring_ctx->io_buffer_idr -- confirm.
  */
 215 struct io_buffer {
 216         struct list_head list;
 217         __u64 addr;
 218         __s32 len;
 219         __u16 bid;
 220 };
 221
 /*
  * Kernel-side context of an io_uring instance. Several member groups are
  * wrapped in anonymous structs tagged ____cacheline_aligned_in_smp: the
  * context reference count, the submission-side state, the completion-side
  * state, uring_lock plus its wait queue, and completion_lock together with
  * the poll, cancel-hash and inflight tracking.
  */
 222 struct io_ring_ctx {
 223         struct {
 224                 struct percpu_ref       refs;
 225         } ____cacheline_aligned_in_smp;
 226
 227         struct {
 228                 unsigned int            flags;
 229                 unsigned int            compat: 1;
 230                 unsigned int            limit_mem: 1;
 231                 unsigned int            cq_overflow_flushed: 1;
 232                 unsigned int            drain_next: 1;
 233                 unsigned int            eventfd_async: 1;
 234
 235                 /*
 236                  * Ring buffer of indices into array of io_uring_sqe, which is
 237                  * mmapped by the application using the IORING_OFF_SQES offset.
 238                  *
 239                  * This indirection could e.g. be used to assign fixed
 240                  * io_uring_sqe entries to operations and only submit them to
 241                  * the queue when needed.
 242                  *
 243                  * The kernel modifies neither the indices array nor the entries
 244                  * array.
 245                  */
 246                 u32                     *sq_array;
 247                 unsigned                cached_sq_head;
 248                 unsigned                sq_entries;
 249                 unsigned                sq_mask;
 250                 unsigned                sq_thread_idle;
 251                 unsigned                cached_sq_dropped;
 252                 atomic_t                cached_cq_overflow;
 253                 unsigned long           sq_check_overflow;
 254
 255                 struct list_head        defer_list;
 256                 struct list_head        timeout_list;
 257                 struct list_head        cq_overflow_list;
 258
 259                 wait_queue_head_t       inflight_wait;
 260                 struct io_uring_sqe     *sq_sqes;  /* presumably the SQE array that ->sq_array indexes */
 261         } ____cacheline_aligned_in_smp;
 262
 263         struct io_rings *rings;  /* ring state shared with the application, see struct io_rings */
 264
 265         /* IO offload */
 266         struct io_wq            *io_wq;
 267         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 268         struct mm_struct        *sqo_mm;
 269         wait_queue_head_t       sqo_wait;
 270
 271         /*
 272          * If used, fixed file set. Writers must ensure that ->refs is dead,
 273          * readers must ensure that ->refs is alive as long as the file* is
 274          * used. Only updated through io_uring_register(2).
 275          */
 276         struct fixed_file_data  *file_data;
 277         unsigned                nr_user_files;
 278         int                     ring_fd;
 279         struct file             *ring_file;
 280
 281         /* if used, fixed mapped user buffers */
 282         unsigned                nr_user_bufs;
 283         struct io_mapped_ubuf   *user_bufs;
 284
 285         struct user_struct      *user;
 286
 287         const struct cred       *creds;
 288
 289         struct completion       ref_comp;
 290         struct completion       sq_thread_comp;
 291
 292         /* if all else fails... */
 293         struct io_kiocb         *fallback_req;
 294
 295 #if defined(CONFIG_UNIX)
 296         struct socket           *ring_sock;
 297 #endif
 298
 299         struct idr              io_buffer_idr;
 300
 301         struct idr              personality_idr;
 302
 303         struct {
 304                 unsigned                cached_cq_tail;
 305                 unsigned                cq_entries;
 306                 unsigned                cq_mask;
 307                 atomic_t                cq_timeouts;
 308                 unsigned long           cq_check_overflow;
 309                 struct wait_queue_head  cq_wait;
 310                 struct fasync_struct    *cq_fasync;
 311                 struct eventfd_ctx      *cq_ev_fd;
 312         } ____cacheline_aligned_in_smp;
 313
 314         struct {
 315                 struct mutex            uring_lock;
 316                 wait_queue_head_t       wait;
 317         } ____cacheline_aligned_in_smp;
 318
 319         struct {
 320                 spinlock_t              completion_lock;
 321
 322                 /*
 323                  * ->poll_list is protected by the ctx->uring_lock for
 324                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 325                  * For SQPOLL, only the single threaded io_sq_thread() will
 326                  * manipulate the list, hence no extra locking is needed there.
 327                  */
 328                 struct list_head        poll_list;
 329                 struct hlist_head       *cancel_hash;
 330                 unsigned                cancel_hash_bits;
 331                 bool                    poll_multi_file;
 332
 333                 spinlock_t              inflight_lock;
 334                 struct list_head        inflight_list;
 335         } ____cacheline_aligned_in_smp;
 336
 337         struct delayed_work             file_put_work;
 338         struct llist_head               file_put_llist;
 339
 340         struct work_struct              exit_work;
 341 };
 342
 343 /*
 344  * First field must be the file pointer in every struct that is placed in
 345  * the io_kiocb union! See also 'struct kiocb' in <linux/fs.h>
 346  */
 /*
  * Poll request data: stored in io_kiocb as ->poll and embedded in
  * struct async_poll. ->head and ->addr share storage.
  */
 347 struct io_poll_iocb {
 348         struct file                     *file;
 349         union {
 350                 struct wait_queue_head  *head;
 351                 u64                     addr;
 352         };
 353         __poll_t                        events;
 354         bool                            done;
 355         bool                            canceled;
 356         struct wait_queue_entry         wait;
 357 };
 358
 /*
  * Request data stored in io_kiocb as ->close.
  * NOTE(review): presumably used by IORING_OP_CLOSE, with ->fd the descriptor
  * being closed and ->put_file the file to release -- confirm in the handler.
  */
 359 struct io_close {
 360         struct file                     *file;
 361         struct file                     *put_file;
 362         int                             fd;
 363 };
 364
 /*
  * Timer state for a timeout; it is the ->timeout member of the
  * struct io_async_ctx union. Holds a back-pointer to the request, the
  * hrtimer itself, a timespec64 and the hrtimer mode.
  */
 365 struct io_timeout_data {
 366         struct io_kiocb                 *req;
 367         struct hrtimer                  timer;
 368         struct timespec64               ts;
 369         enum hrtimer_mode               mode;
 370 };
 371
 /*
  * Request data stored in io_kiocb as ->accept. ->addr and ->addr_len are
  * user-space pointers.
  * NOTE(review): ->nofile is presumably a saved RLIMIT_NOFILE value used when
  * installing the new fd -- confirm in the prep/issue code.
  */
 372 struct io_accept {
 373         struct file                     *file;
 374         struct sockaddr __user          *addr;
 375         int __user                      *addr_len;
 376         int                             flags;
 377         unsigned long                   nofile;
 378 };
 379
 /*
  * Request data stored in io_kiocb as ->sync: a byte range (->off, ->len)
  * plus ->flags and ->mode.
  * NOTE(review): which opcodes share this struct is not visible here;
  * presumably the fsync / sync_file_range / fallocate family -- confirm.
  */
 380 struct io_sync {
 381         struct file                     *file;
 382         loff_t                          len;
 383         loff_t                          off;
 384         int                             flags;
 385         int                             mode;
 386 };
 387
 /*
  * Request data stored in io_kiocb as ->cancel.
  * NOTE(review): ->addr presumably identifies the request to cancel (e.g. its
  * user_data) -- confirm in the cancel handler.
  */
 388 struct io_cancel {
 389         struct file                     *file;
 390         u64                             addr;
 391 };
 392
 /*
  * Request data stored in io_kiocb as ->timeout. The timer itself lives in
  * struct io_timeout_data.
  * NOTE(review): ->off / ->target_seq presumably express a completion-count
  * based trigger (cf. REQ_F_TIMEOUT_NOSEQ) -- confirm in the timeout code.
  */
 393 struct io_timeout {
 394         struct file                     *file;
 395         u64                             addr;
 396         int                             flags;
 397         u32                             off;
 398         u32                             target_seq;
 399 };
 400
 /*
  * Read/write request data stored in io_kiocb as ->rw. The embedded kiocb
  * comes first and supplies the leading file pointer that the io_kiocb union
  * relies on.
  */
 401 struct io_rw {
 402         /* NOTE: kiocb has the file as the first member, so don't do it here */
 403         struct kiocb                    kiocb;
 404         u64                             addr;
 405         u64                             len;
 406 };
 407
 /*
  * Request data stored in io_kiocb as ->connect: a user-space sockaddr
  * pointer and its length.
  */
 408 struct io_connect {
 409         struct file                     *file;
 410         struct sockaddr __user          *addr;
 411         int                             addr_len;
 412 };
 413
 /*
  * Send/receive request data stored in io_kiocb as ->sr_msg. ->msg and ->buf
  * are user-space pointers sharing the same storage.
  * NOTE(review): ->bgid / ->kbuf presumably carry the buffer group ID and the
  * selected buffer for buffer-select requests -- confirm.
  */
 414 struct io_sr_msg {
 415         struct file                     *file;
 416         union {
 417                 struct user_msghdr __user *msg;
 418                 void __user             *buf;
 419         };
 420         int                             msg_flags;
 421         int                             bgid;
 422         size_t                          len;
 423         struct io_buffer                *kbuf;
 424 };
 425
 /*
  * Open request data stored in io_kiocb as ->open: directory fd, filename
  * and struct open_how.
  * NOTE(review): ->nofile is presumably a saved RLIMIT_NOFILE value --
  * confirm in the prep code.
  */
 426 struct io_open {
 427         struct file                     *file;
 428         int                             dfd;
 429         struct filename                 *filename;
 430         struct open_how                 how;
 431         unsigned long                   nofile;
 432 };
 433
 /*
  * Request data stored in io_kiocb as ->files_update.
  * NOTE(review): presumably ->arg is a user pointer to ->nr_args descriptors
  * to apply starting at slot ->offset of the fixed file set -- confirm.
  */
 434 struct io_files_update {
 435         struct file                     *file;
 436         u64                             arg;
 437         u32                             nr_args;
 438         u32                             offset;
 439 };
 440
 /*
  * Request data stored in io_kiocb as ->fadvise: file offset, length and
  * advice value.
  */
 441 struct io_fadvise {
 442         struct file                     *file;
 443         u64                             offset;
 444         u32                             len;
 445         u32                             advice;
 446 };
 447
 /*
  * Request data stored in io_kiocb as ->madvise: address, length and advice
  * value. Same layout as struct io_fadvise with ->addr in place of ->offset.
  */
 448 struct io_madvise {
 449         struct file                     *file;
 450         u64                             addr;
 451         u32                             len;
 452         u32                             advice;
 453 };
 454
 /*
  * Request data stored in io_kiocb as ->epoll: epoll fd, operation, target
  * fd and a struct epoll_event held by value.
  */
 455 struct io_epoll {
 456         struct file                     *file;
 457         int                             epfd;
 458         int                             op;
 459         int                             fd;
 460         struct epoll_event              event;
 461 };
 462
 /*
  * Splice request data stored in io_kiocb as ->splice. ->file_out is the
  * first member, so it is the one that overlays io_kiocb->file.
  */
 463 struct io_splice {
 464         struct file                     *file_out;
 465         struct file                     *file_in;
 466         loff_t                          off_out;
 467         loff_t                          off_in;
 468         u64                             len;
 469         unsigned int                    flags;
 470 };
 471
 /*
  * Request data stored in io_kiocb as ->pbuf.
  * NOTE(review): presumably describes ->nbufs buffers of ->len bytes starting
  * at ->addr, for buffer group ->bgid with IDs starting at ->bid -- confirm
  * in the provide-buffers handler.
  */
 472 struct io_provide_buf {
 473         struct file                     *file;
 474         __u64                           addr;
 475         __s32                           len;
 476         __u32                           bgid;
 477         __u16                           nbufs;
 478         __u16                           bid;
 479 };
 480
 /*
  * Request data stored in io_kiocb as ->statx: directory fd, mask, flags,
  * and user-space pointers to the filename and the struct statx result
  * buffer.
  */
 481 struct io_statx {
 482         struct file                     *file;
 483         int                             dfd;
 484         unsigned int                    mask;
 485         unsigned int                    flags;
 486         const char __user               *filename;
 487         struct statx __user             *buffer;
 488 };
 489
 /*
  * Async state for connect; the ->connect member of the struct io_async_ctx
  * union. Holds a sockaddr_storage by value.
  */
 490 struct io_async_connect {
 491         struct sockaddr_storage         address;
 492 };
 493
 /*
  * Async state for message send/receive; the ->msg member of the
  * struct io_async_ctx union.
  * NOTE(review): ->iov presumably points either at ->fast_iov or at a larger
  * separately allocated array -- confirm in the import/cleanup code.
  */
 494 struct io_async_msghdr {
 495         struct iovec                    fast_iov[UIO_FASTIOV];
 496         struct iovec                    *iov;
 497         struct sockaddr __user          *uaddr;
 498         struct msghdr                   msg;
 499         struct sockaddr_storage         addr;
 500 };
 501
 /*
  * Async state for read/write; the ->rw member of the struct io_async_ctx
  * union.
  * NOTE(review): ->iov presumably points either at ->fast_iov or at a larger
  * separately allocated array; ->nr_segs / ->size presumably describe it --
  * confirm in the import/cleanup code.
  */
 502 struct io_async_rw {
 503         struct iovec                    fast_iov[UIO_FASTIOV];
 504         struct iovec                    *iov;
 505         ssize_t                         nr_segs;
 506         ssize_t                         size;
 507         struct wait_page_queue          wpq;
 508         struct callback_head            task_work;
 509 };
 510
 /*
  * Per-request async state, reached through io_kiocb->io. Only one member of
  * the union is in use at a time. struct io_op_def's async_ctx bit marks the
  * opcodes that need this allocated for deferral/async.
  */
 511 struct io_async_ctx {
 512         union {
 513                 struct io_async_rw      rw;
 514                 struct io_async_msghdr  msg;
 515                 struct io_async_connect connect;
 516                 struct io_timeout_data  timeout;
 517         };
 518 };
 519
 /*
  * Bit numbers for the REQ_F_* request flags. The first six take their value
  * from the matching IOSQE_*_BIT constant; the remaining ones have no
  * initializer and therefore continue from REQ_F_BUFFER_SELECT_BIT + 1.
  */
 520 enum {
 521         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 522         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 523         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 524         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 525         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 526         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 527
 528         REQ_F_LINK_HEAD_BIT,  /* first bit not taken from an IOSQE_* constant */
 529         REQ_F_FAIL_LINK_BIT,
 530         REQ_F_INFLIGHT_BIT,
 531         REQ_F_CUR_POS_BIT,
 532         REQ_F_NOWAIT_BIT,
 533         REQ_F_LINK_TIMEOUT_BIT,
 534         REQ_F_TIMEOUT_BIT,
 535         REQ_F_ISREG_BIT,
 536         REQ_F_TIMEOUT_NOSEQ_BIT,
 537         REQ_F_COMP_LOCKED_BIT,
 538         REQ_F_NEED_CLEANUP_BIT,
 539         REQ_F_OVERFLOW_BIT,
 540         REQ_F_POLLED_BIT,
 541         REQ_F_BUFFER_SELECTED_BIT,
 542         REQ_F_NO_FILE_TABLE_BIT,
 543         REQ_F_QUEUE_TIMEOUT_BIT,
 544         REQ_F_WORK_INITIALIZED_BIT,
 545         REQ_F_TASK_PINNED_BIT,
 546
 547         /* not a real bit, just to check we're not overflowing the space */
 548         __REQ_F_LAST_BIT,
 549 };
 550
 /*
  * Request flag masks: each REQ_F_x is BIT(REQ_F_x_BIT).
  * NOTE(review): presumably these are the values kept in io_kiocb->flags --
  * confirm at the use sites.
  */
 551 enum {
 552         /* ctx owns file */
 553         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 554         /* drain existing IO first */
 555         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 556         /* linked sqes */
 557         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 558         /* doesn't sever on completion < 0 */
 559         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 560         /* IOSQE_ASYNC */
 561         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 562         /* IOSQE_BUFFER_SELECT */
 563         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 564
 565         /* head of a link */
 566         REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
 567         /* fail rest of links */
 568         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 569         /* on inflight list */
 570         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 571         /* read/write uses file position */
 572         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 573         /* must not punt to workers */
 574         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 575         /* has linked timeout */
 576         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 577         /* timeout request */
 578         REQ_F_TIMEOUT           = BIT(REQ_F_TIMEOUT_BIT),
 579         /* regular file */
 580         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 581         /* no timeout sequence */
 582         REQ_F_TIMEOUT_NOSEQ     = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
 583         /* completion under lock */
 584         REQ_F_COMP_LOCKED       = BIT(REQ_F_COMP_LOCKED_BIT),
 585         /* needs cleanup */
 586         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 587         /* in overflow list */
 588         REQ_F_OVERFLOW          = BIT(REQ_F_OVERFLOW_BIT),
 589         /* already went through poll handler */
 590         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 591         /* buffer already selected */
 592         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 593         /* doesn't need file table for this request */
 594         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 595         /* needs to queue linked timeout */
 596         REQ_F_QUEUE_TIMEOUT     = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 597         /* io_wq_work is initialized */
 598         REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
 599         /* req->task is refcounted */
 600         REQ_F_TASK_PINNED       = BIT(REQ_F_TASK_PINNED_BIT),
 601 };
 602
 /*
  * Poll state for an async armed poll handler; reached through
  * io_kiocb->apoll.
  * NOTE(review): io_kiocb's ->apoll shares a union with its ->work, so
  * ->work here is presumably the saved copy that gets restored afterwards --
  * confirm in the apoll arm/complete paths.
  */
 603 struct async_poll {
 604         struct io_poll_iocb     poll;
 605         struct io_wq_work       work;
 606 };
 607
 608 /*
 609  * NOTE! Each of the iocb union members has the file pointer
 610  * as the first entry in their struct definition. So you can
 611  * access the file pointer through any of the sub-structs,
 612  * or directly as just 'file' in this struct.
 613  */
 614 struct io_kiocb {
 615         union {
 616                 struct file             *file;
 617                 struct io_rw            rw;
 618                 struct io_poll_iocb     poll;
 619                 struct io_accept        accept;
 620                 struct io_sync          sync;
 621                 struct io_cancel        cancel;
 622                 struct io_timeout       timeout;
 623                 struct io_connect       connect;
 624                 struct io_sr_msg        sr_msg;
 625                 struct io_open          open;
 626                 struct io_close         close;
 627                 struct io_files_update  files_update;
 628                 struct io_fadvise       fadvise;
 629                 struct io_madvise       madvise;
 630                 struct io_epoll         epoll;
 631                 struct io_splice        splice;
 632                 struct io_provide_buf   pbuf;
 633                 struct io_statx         statx;
 634         };
 635
 636         struct io_async_ctx             *io;  /* async state, see struct io_async_ctx */
 637         int                             cflags;
 638         u8                              opcode;
 639         /* polled IO has completed */
 640         u8                              iopoll_completed;
 641
 642         u16                             buf_index;
 643
 644         struct io_ring_ctx      *ctx;
 645         struct list_head        list;
 646         unsigned int            flags;  /* NOTE(review): presumably REQ_F_* bits */
 647         refcount_t              refs;
 648         struct task_struct      *task;  /* refcounted when REQ_F_TASK_PINNED is set */
 649         unsigned long           fsize;
 650         u64                     user_data;
 651         u32                     result;
 652         u32                     sequence;
 653
 654         struct list_head        link_list;
 655
 656         struct list_head        inflight_entry;
 657
 658         struct percpu_ref       *fixed_file_refs;
 659
 660         union {
 661                 /*
 662                  * Only commands that never go async can use the below fields,
 663                  * obviously. Right now only IORING_OP_POLL_ADD uses them, and
 664                  * async armed poll handlers for regular commands. The latter
 665                  * restore the work, if needed.
 666                  */
 667                 struct {
 668                         struct hlist_node       hash_node;
 669                         struct async_poll       *apoll;
 670                 };
 671                 struct io_wq_work       work;
 672         };
 673         struct callback_head    task_work;
 674 };
 675
 /*
  * Batch size constant. In this part of the file it sizes the request
  * allocation cache io_submit_state->reqs[].
  */
 676 #define IO_IOPOLL_BATCH                 8
 677
 /*
  * Batched completion state; embedded in struct io_submit_state as ->comp.
  * NOTE(review): ->nr is presumably the number of entries queued on ->list
  * -- confirm in the completion flush code.
  */
 678 struct io_comp_state {
 679         unsigned int            nr;
 680         struct list_head        list;
 681         struct io_ring_ctx      *ctx;
 682 };
 683
 /*
  * Submission-side scratch state: a block plug, a small cache of
  * preallocated requests, batched completion state and a cached file
  * reference.
  */
 684 struct io_submit_state {
 685         struct blk_plug         plug;
 686
 687         /*
 688          * io_kiocb alloc cache
 689          */
 690         void                    *reqs[IO_IOPOLL_BATCH];
 691         unsigned int            free_reqs;
 692
 693         /*
 694          * Batch completion logic
 695          */
 696         struct io_comp_state    comp;
 697
 698         /*
 699          * File reference cache
 700          */
 701         struct file             *file;
 702         unsigned int            fd;
 703         unsigned int            has_refs;
 704         unsigned int            used_refs;
 705         unsigned int            ios_left;
 706 };
 707
 /*
  * Static per-opcode properties, one single-bit flag each. io_op_defs[] holds
  * one of these per opcode, indexed by the IORING_OP_* value; flags left out
  * of an initializer are zero.
  */
 708 struct io_op_def {
 709         /* needs req->io allocated for deferral/async */
 710         unsigned                async_ctx : 1;
 711         /* needs current->mm setup, does mm access */
 712         unsigned                needs_mm : 1;
 713         /* needs req->file assigned */
 714         unsigned                needs_file : 1;
 715         /* don't fail if file grab fails */
 716         unsigned                needs_file_no_error : 1;
 717         /* hash wq insertion if file is a regular file */
 718         unsigned                hash_reg_file : 1;
 719         /* unbound wq insertion if file is a non-regular file */
 720         unsigned                unbound_nonreg_file : 1;
 721         /* opcode is not supported by this kernel */
 722         unsigned                not_supported : 1;
 723         /* needs file table */
 724         unsigned                file_table : 1;
 725         /* needs ->fs */
 726         unsigned                needs_fs : 1;
 727         /* set if opcode supports polled "wait" */
 728         unsigned                pollin : 1;
 729         unsigned                pollout : 1;
 730         /* op supports buffer selection */
 731         unsigned                buffer_select : 1;
 732 };
 733
 734 static const struct io_op_def io_op_defs[] = {
 735         [IORING_OP_NOP] = {},
 736         [IORING_OP_READV] = {
 737                 .async_ctx              = 1,
 738                 .needs_mm               = 1,
 739                 .needs_file             = 1,
 740                 .unbound_nonreg_file    = 1,
 741                 .pollin                 = 1,
 742                 .buffer_select          = 1,
 743         },
 744         [IORING_OP_WRITEV] = {
 745                 .async_ctx              = 1,
 746                 .needs_mm               = 1,
 747                 .needs_file             = 1,
 748                 .hash_reg_file          = 1,
 749                 .unbound_nonreg_file    = 1,
 750                 .pollout                = 1,
 751         },
 752         [IORING_OP_FSYNC] = {
 753                 .needs_file             = 1,
 754         },
 755         [IORING_OP_READ_FIXED] = {
 756                 .needs_file             = 1,
 757                 .unbound_nonreg_file    = 1,
 758                 .pollin                 = 1,
 759         },
 760         [IORING_OP_WRITE_FIXED] = {
 761                 .needs_file             = 1,
 762                 .hash_reg_file          = 1,
 763                 .unbound_nonreg_file    = 1,
 764                 .pollout                = 1,
 765         },
 766         [IORING_OP_POLL_ADD] = {
 767                 .needs_file             = 1,
 768                 .unbound_nonreg_file    = 1,
 769         },
 770         [IORING_OP_POLL_REMOVE] = {},
 771         [IORING_OP_SYNC_FILE_RANGE] = {
 772                 .needs_file             = 1,
 773         },
 774         [IORING_OP_SENDMSG] = {
 775                 .async_ctx              = 1,
 776                 .needs_mm               = 1,
 777                 .needs_file             = 1,
 778                 .unbound_nonreg_file    = 1,
 779                 .needs_fs               = 1,
 780                 .pollout                = 1,
 781         },
 782         [IORING_OP_RECVMSG] = {
 783                 .async_ctx              = 1,
 784                 .needs_mm               = 1,
 785                 .needs_file             = 1,
 786                 .unbound_nonreg_file    = 1,
 787                 .needs_fs               = 1,
 788                 .pollin                 = 1,
 789                 .buffer_select          = 1,
 790         },
 791         [IORING_OP_TIMEOUT] = {
 792                 .async_ctx              = 1,
 793                 .needs_mm               = 1,
 794         },
 795         [IORING_OP_TIMEOUT_REMOVE] = {},
 796         [IORING_OP_ACCEPT] = {
 797                 .needs_mm               = 1,
 798                 .needs_file             = 1,
 799                 .unbound_nonreg_file    = 1,
 800                 .file_table             = 1,
 801                 .pollin                 = 1,
 802         },
 803         [IORING_OP_ASYNC_CANCEL] = {},
 804         [IORING_OP_LINK_TIMEOUT] = {
 805                 .async_ctx              = 1,
 806                 .needs_mm               = 1,
 807         },
 808         [IORING_OP_CONNECT] = {
 809                 .async_ctx              = 1,
 810                 .needs_mm               = 1,
 811                 .needs_file             = 1,
 812                 .unbound_nonreg_file    = 1,
 813                 .pollout                = 1,
 814         },
 815         [IORING_OP_FALLOCATE] = {
 816                 .needs_file             = 1,
 817         },
 818         [IORING_OP_OPENAT] = {
 819                 .file_table             = 1,
 820                 .needs_fs               = 1,
 821         },
 822         [IORING_OP_CLOSE] = {
 823                 .needs_file             = 1,
 824                 .needs_file_no_error    = 1,
 825                 .file_table             = 1,
 826         },
 827         [IORING_OP_FILES_UPDATE] = {
 828                 .needs_mm               = 1,
 829                 .file_table             = 1,
 830         },
 831         [IORING_OP_STATX] = {
 832                 .needs_mm               = 1,
 833                 .needs_fs               = 1,
 834                 .file_table             = 1,
 835         },
 836         [IORING_OP_READ] = {
 837                 .needs_mm               = 1,
 838                 .needs_file             = 1,
 839                 .unbound_nonreg_file    = 1,
 840                 .pollin                 = 1,
 841                 .buffer_select          = 1,
 842         },
 843         [IORING_OP_WRITE] = {
 844                 .needs_mm               = 1,
 845                 .needs_file             = 1,
 846                 .unbound_nonreg_file    = 1,
 847                 .pollout                = 1,
 848         },
 849         [IORING_OP_FADVISE] = {
 850                 .needs_file             = 1,
 851         },
 852         [IORING_OP_MADVISE] = {
 853                 .needs_mm               = 1,
 854         },
 855         [IORING_OP_SEND] = {
 856                 .needs_mm               = 1,
 857                 .needs_file             = 1,
 858                 .unbound_nonreg_file    = 1,
 859                 .pollout                = 1,
 860         },
 861         [IORING_OP_RECV] = {
 862                 .needs_mm               = 1,
 863                 .needs_file             = 1,
 864                 .unbound_nonreg_file    = 1,
 865                 .pollin                 = 1,
 866                 .buffer_select          = 1,
 867         },
 868         [IORING_OP_OPENAT2] = {
 869                 .file_table             = 1,
 870                 .needs_fs               = 1,
 871         },
 872         [IORING_OP_EPOLL_CTL] = {
 873                 .unbound_nonreg_file    = 1,
 874                 .file_table             = 1,
 875         },
 876         [IORING_OP_SPLICE] = {
 877                 .needs_file             = 1,
 878                 .hash_reg_file          = 1,
 879                 .unbound_nonreg_file    = 1,
 880         },
 881         [IORING_OP_PROVIDE_BUFFERS] = {},
 882         [IORING_OP_REMOVE_BUFFERS] = {},
 883         [IORING_OP_TEE] = {
 884                 .needs_file             = 1,
 885                 .hash_reg_file          = 1,
 886                 .unbound_nonreg_file    = 1,
 887         },
 888 };
 889
/*
 * Memory accounting category selector.
 * NOTE(review): the users of these values are outside this excerpt;
 * presumably they distinguish locked from pinned page accounting -- confirm.
 */
enum io_mem_account {
	ACCT_LOCKED,
	ACCT_PINNED,
};
 894
/* Forward declarations for functions that are referenced before they are defined. */
static bool io_rw_reissue(struct io_kiocb *req, long res);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_files_update *ip,
				 unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
static void io_complete_rw_common(struct kiocb *kiocb, long res,
				  struct io_comp_state *cs);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
		       int fd, struct file **out_file, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req,
			   const struct io_uring_sqe *sqe,
			   struct io_comp_state *cs);

static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
			       struct iovec **iovec, struct iov_iter *iter,
			       bool needs_lock);
static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
			     struct iovec *iovec, struct iovec *fast_iov,
			     struct iov_iter *iter);

/* slab cache that struct io_kiocb requests are allocated from and freed to */
static struct kmem_cache *req_cachep;

/* declared early so io_uring_get_socket() can compare file->f_op against it */
static const struct file_operations io_uring_fops;
 925
 926 struct sock *io_uring_get_socket(struct file *file)
 927 {
 928 #if defined(CONFIG_UNIX)
 929         if (file->f_op == &io_uring_fops) {
 930                 struct io_ring_ctx *ctx = file->private_data;
 931
 932                 return ctx->ring_sock->sk;
 933         }
 934 #endif
 935         return NULL;
 936 }
 937 EXPORT_SYMBOL(io_uring_get_socket);
 938
 939 static void io_get_req_task(struct io_kiocb *req)
 940 {
 941         if (req->flags & REQ_F_TASK_PINNED)
 942                 return;
 943         get_task_struct(req->task);
 944         req->flags |= REQ_F_TASK_PINNED;
 945 }
 946
 947 /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
 948 static void __io_put_req_task(struct io_kiocb *req)
 949 {
 950         if (req->flags & REQ_F_TASK_PINNED)
 951                 put_task_struct(req->task);
 952 }
 953
 954 static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
 955 {
 956         struct mm_struct *mm = current->mm;
 957
 958         if (mm) {
 959                 kthread_unuse_mm(mm);
 960                 mmput(mm);
 961         }
 962 }
 963
 964 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
 965 {
 966         if (!current->mm) {
 967                 if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
 968                         return -EFAULT;
 969                 kthread_use_mm(ctx->sqo_mm);
 970         }
 971
 972         return 0;
 973 }
 974
 975 static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
 976                                    struct io_kiocb *req)
 977 {
 978         if (!io_op_defs[req->opcode].needs_mm)
 979                 return 0;
 980         return __io_sq_thread_acquire_mm(ctx);
 981 }
 982
 983 static inline void req_set_fail_links(struct io_kiocb *req)
 984 {
 985         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
 986                 req->flags |= REQ_F_FAIL_LINK;
 987 }
 988
/* forward declaration: installed as the ctx->file_put_work handler in io_ring_ctx_alloc() */
static void io_file_put_work(struct work_struct *work);
 990
 991 /*
 992  * Note: must call io_req_init_async() for the first time you
 993  * touch any members of io_wq_work.
 994  */
 995 static inline void io_req_init_async(struct io_kiocb *req)
 996 {
 997         if (req->flags & REQ_F_WORK_INITIALIZED)
 998                 return;
 999
1000         memset(&req->work, 0, sizeof(req->work));
1001         req->flags |= REQ_F_WORK_INITIALIZED;
1002 }
1003
1004 static inline bool io_async_submit(struct io_ring_ctx *ctx)
1005 {
1006         return ctx->flags & IORING_SETUP_SQPOLL;
1007 }
1008
1009 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1010 {
1011         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1012
1013         complete(&ctx->ref_comp);
1014 }
1015
1016 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1017 {
1018         struct io_ring_ctx *ctx;
1019         int hash_bits;
1020
1021         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1022         if (!ctx)
1023                 return NULL;
1024
1025         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
1026         if (!ctx->fallback_req)
1027                 goto err;
1028
1029         /*
1030          * Use 5 bits less than the max cq entries, that should give us around
1031          * 32 entries per hash list if totally full and uniformly spread.
1032          */
1033         hash_bits = ilog2(p->cq_entries);
1034         hash_bits -= 5;
1035         if (hash_bits <= 0)
1036                 hash_bits = 1;
1037         ctx->cancel_hash_bits = hash_bits;
1038         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1039                                         GFP_KERNEL);
1040         if (!ctx->cancel_hash)
1041                 goto err;
1042         __hash_init(ctx->cancel_hash, 1U << hash_bits);
1043
1044         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1045                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1046                 goto err;
1047
1048         ctx->flags = p->flags;
1049         init_waitqueue_head(&ctx->sqo_wait);
1050         init_waitqueue_head(&ctx->cq_wait);
1051         INIT_LIST_HEAD(&ctx->cq_overflow_list);
1052         init_completion(&ctx->ref_comp);
1053         init_completion(&ctx->sq_thread_comp);
1054         idr_init(&ctx->io_buffer_idr);
1055         idr_init(&ctx->personality_idr);
1056         mutex_init(&ctx->uring_lock);
1057         init_waitqueue_head(&ctx->wait);
1058         spin_lock_init(&ctx->completion_lock);
1059         INIT_LIST_HEAD(&ctx->poll_list);
1060         INIT_LIST_HEAD(&ctx->defer_list);
1061         INIT_LIST_HEAD(&ctx->timeout_list);
1062         init_waitqueue_head(&ctx->inflight_wait);
1063         spin_lock_init(&ctx->inflight_lock);
1064         INIT_LIST_HEAD(&ctx->inflight_list);
1065         INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
1066         init_llist_head(&ctx->file_put_llist);
1067         return ctx;
1068 err:
1069         if (ctx->fallback_req)
1070                 kmem_cache_free(req_cachep, ctx->fallback_req);
1071         kfree(ctx->cancel_hash);
1072         kfree(ctx);
1073         return NULL;
1074 }
1075
1076 static inline bool __req_need_defer(struct io_kiocb *req)
1077 {
1078         struct io_ring_ctx *ctx = req->ctx;
1079
1080         return req->sequence != ctx->cached_cq_tail
1081                                 + atomic_read(&ctx->cached_cq_overflow);
1082 }
1083
1084 static inline bool req_need_defer(struct io_kiocb *req)
1085 {
1086         if (unlikely(req->flags & REQ_F_IO_DRAIN))
1087                 return __req_need_defer(req);
1088
1089         return false;
1090 }
1091
1092 static void __io_commit_cqring(struct io_ring_ctx *ctx)
1093 {
1094         struct io_rings *rings = ctx->rings;
1095
1096         /* order cqe stores with ring update */
1097         smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
1098
1099         if (wq_has_sleeper(&ctx->cq_wait)) {
1100                 wake_up_interruptible(&ctx->cq_wait);
1101                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1102         }
1103 }
1104
1105 static inline void io_req_work_grab_env(struct io_kiocb *req,
1106                                         const struct io_op_def *def)
1107 {
1108         if (!req->work.mm && def->needs_mm) {
1109                 mmgrab(current->mm);
1110                 req->work.mm = current->mm;
1111         }
1112         if (!req->work.creds)
1113                 req->work.creds = get_current_cred();
1114         if (!req->work.fs && def->needs_fs) {
1115                 spin_lock(&current->fs->lock);
1116                 if (!current->fs->in_exec) {
1117                         req->work.fs = current->fs;
1118                         req->work.fs->users++;
1119                 } else {
1120                         req->work.flags |= IO_WQ_WORK_CANCEL;
1121                 }
1122                 spin_unlock(&current->fs->lock);
1123         }
1124 }
1125
1126 static inline void io_req_work_drop_env(struct io_kiocb *req)
1127 {
1128         if (!(req->flags & REQ_F_WORK_INITIALIZED))
1129                 return;
1130
1131         if (req->work.mm) {
1132                 mmdrop(req->work.mm);
1133                 req->work.mm = NULL;
1134         }
1135         if (req->work.creds) {
1136                 put_cred(req->work.creds);
1137                 req->work.creds = NULL;
1138         }
1139         if (req->work.fs) {
1140                 struct fs_struct *fs = req->work.fs;
1141
1142                 spin_lock(&req->work.fs->lock);
1143                 if (--fs->users)
1144                         fs = NULL;
1145                 spin_unlock(&req->work.fs->lock);
1146                 if (fs)
1147                         free_fs_struct(fs);
1148         }
1149 }
1150
1151 static inline void io_prep_async_work(struct io_kiocb *req,
1152                                       struct io_kiocb **link)
1153 {
1154         const struct io_op_def *def = &io_op_defs[req->opcode];
1155
1156         if (req->flags & REQ_F_ISREG) {
1157                 if (def->hash_reg_file)
1158                         io_wq_hash_work(&req->work, file_inode(req->file));
1159         } else {
1160                 if (def->unbound_nonreg_file)
1161                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1162         }
1163
1164         io_req_init_async(req);
1165         io_req_work_grab_env(req, def);
1166
1167         *link = io_prep_linked_timeout(req);
1168 }
1169
1170 static inline void io_queue_async_work(struct io_kiocb *req)
1171 {
1172         struct io_ring_ctx *ctx = req->ctx;
1173         struct io_kiocb *link;
1174
1175         io_prep_async_work(req, &link);
1176
1177         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1178                                         &req->work, req->flags);
1179         io_wq_enqueue(ctx->io_wq, &req->work);
1180
1181         if (link)
1182                 io_queue_linked_timeout(link);
1183 }
1184
1185 static void io_kill_timeout(struct io_kiocb *req)
1186 {
1187         int ret;
1188
1189         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1190         if (ret != -1) {
1191                 atomic_inc(&req->ctx->cq_timeouts);
1192                 list_del_init(&req->list);
1193                 req->flags |= REQ_F_COMP_LOCKED;
1194                 io_cqring_fill_event(req, 0);
1195                 io_put_req(req);
1196         }
1197 }
1198
1199 static void io_kill_timeouts(struct io_ring_ctx *ctx)
1200 {
1201         struct io_kiocb *req, *tmp;
1202
1203         spin_lock_irq(&ctx->completion_lock);
1204         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
1205                 io_kill_timeout(req);
1206         spin_unlock_irq(&ctx->completion_lock);
1207 }
1208
1209 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1210 {
1211         do {
1212                 struct io_kiocb *req = list_first_entry(&ctx->defer_list,
1213                                                         struct io_kiocb, list);
1214
1215                 if (req_need_defer(req))
1216                         break;
1217                 list_del_init(&req->list);
1218                 io_queue_async_work(req);
1219         } while (!list_empty(&ctx->defer_list));
1220 }
1221
1222 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1223 {
1224         while (!list_empty(&ctx->timeout_list)) {
1225                 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1226                                                         struct io_kiocb, list);
1227
1228                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
1229                         break;
1230                 if (req->timeout.target_seq != ctx->cached_cq_tail
1231                                         - atomic_read(&ctx->cq_timeouts))
1232                         break;
1233
1234                 list_del_init(&req->list);
1235                 io_kill_timeout(req);
1236         }
1237 }
1238
1239 static void io_commit_cqring(struct io_ring_ctx *ctx)
1240 {
1241         io_flush_timeouts(ctx);
1242         __io_commit_cqring(ctx);
1243
1244         if (unlikely(!list_empty(&ctx->defer_list)))
1245                 __io_queue_deferred(ctx);
1246 }
1247
1248 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1249 {
1250         struct io_rings *rings = ctx->rings;
1251         unsigned tail;
1252
1253         tail = ctx->cached_cq_tail;
1254         /*
1255          * writes to the cq entry need to come after reading head; the
1256          * control dependency is enough as we're using WRITE_ONCE to
1257          * fill the cq entry
1258          */
1259         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
1260                 return NULL;
1261
1262         ctx->cached_cq_tail++;
1263         return &rings->cqes[tail & ctx->cq_mask];
1264 }
1265
1266 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1267 {
1268         if (!ctx->cq_ev_fd)
1269                 return false;
1270         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1271                 return false;
1272         if (!ctx->eventfd_async)
1273                 return true;
1274         return io_wq_current_is_worker();
1275 }
1276
/*
 * Notify everyone who may care that completions were posted: waiters on
 * ctx->wait, waiters on ctx->sqo_wait, and the registered eventfd (subject
 * to io_should_trigger_evfd()).
 */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	if (waitqueue_active(&ctx->wait))
		wake_up(&ctx->wait);
	if (waitqueue_active(&ctx->sqo_wait))
		wake_up(&ctx->sqo_wait);
	if (io_should_trigger_evfd(ctx))
		eventfd_signal(ctx->cq_ev_fd, 1);
}
1286
/*
 * Move backlogged completions from ctx->cq_overflow_list into the CQ ring.
 *
 * @force: flush unconditionally; entries that do not fit are dropped and
 *         counted in the ring's cq_overflow counter instead.
 *
 * Returns true if there are no backlogged entries after the flush.
 * NOTE(review): past the !force fast path the return value is simply
 * "the last io_get_cqring() call succeeded", so a forced flush of an
 * already-empty list returns false -- confirm callers do not rely on it.
 */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	struct io_rings *rings = ctx->rings;
	struct io_uring_cqe *cqe;
	struct io_kiocb *req;
	unsigned long flags;
	LIST_HEAD(list);

	/* lockless fast path: nothing backlogged, or no room in the ring */
	if (!force) {
		if (list_empty_careful(&ctx->cq_overflow_list))
			return true;
		if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
		    rings->cq_ring_entries))
			return false;
	}

	spin_lock_irqsave(&ctx->completion_lock, flags);

	/* if force is set, the ring is going away. always drop after that */
	if (force)
		ctx->cq_overflow_flushed = 1;

	cqe = NULL;
	while (!list_empty(&ctx->cq_overflow_list)) {
		cqe = io_get_cqring(ctx);
		if (!cqe && !force)
			break;

		/* park the request on a private list; it is put after unlock */
		req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
						list);
		list_move(&req->list, &list);
		req->flags &= ~REQ_F_OVERFLOW;
		if (cqe) {
			WRITE_ONCE(cqe->user_data, req->user_data);
			WRITE_ONCE(cqe->res, req->result);
			WRITE_ONCE(cqe->flags, req->cflags);
		} else {
			/* forced and no slot left: count the event as lost */
			WRITE_ONCE(ctx->rings->cq_overflow,
				atomic_inc_return(&ctx->cached_cq_overflow));
		}
	}

	io_commit_cqring(ctx);
	/* the "check overflow" bits are cleared only if the last slot grab succeeded */
	if (cqe) {
		clear_bit(0, &ctx->sq_check_overflow);
		clear_bit(0, &ctx->cq_check_overflow);
	}
	spin_unlock_irqrestore(&ctx->completion_lock, flags);
	io_cqring_ev_posted(ctx);

	/* drop the reference taken when each request was put on the overflow list */
	while (!list_empty(&list)) {
		req = list_first_entry(&list, struct io_kiocb, list);
		list_del(&req->list);
		io_put_req(req);
	}

	return cqe != NULL;
}
1346
1347 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1348 {
1349         struct io_ring_ctx *ctx = req->ctx;
1350         struct io_uring_cqe *cqe;
1351
1352         trace_io_uring_complete(ctx, req->user_data, res);
1353
1354         /*
1355          * If we can't get a cq entry, userspace overflowed the
1356          * submission (by quite a lot). Increment the overflow count in
1357          * the ring.
1358          */
1359         cqe = io_get_cqring(ctx);
1360         if (likely(cqe)) {
1361                 WRITE_ONCE(cqe->user_data, req->user_data);
1362                 WRITE_ONCE(cqe->res, res);
1363                 WRITE_ONCE(cqe->flags, cflags);
1364         } else if (ctx->cq_overflow_flushed) {
1365                 WRITE_ONCE(ctx->rings->cq_overflow,
1366                                 atomic_inc_return(&ctx->cached_cq_overflow));
1367         } else {
1368                 if (list_empty(&ctx->cq_overflow_list)) {
1369                         set_bit(0, &ctx->sq_check_overflow);
1370                         set_bit(0, &ctx->cq_check_overflow);
1371                 }
1372                 req->flags |= REQ_F_OVERFLOW;
1373                 refcount_inc(&req->refs);
1374                 req->result = res;
1375                 req->cflags = cflags;
1376                 list_add_tail(&req->list, &ctx->cq_overflow_list);
1377         }
1378 }
1379
/* Post a completion for @req with result @res and no CQE flags. */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
	__io_cqring_fill_event(req, res, 0);
}
1384
1385 static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
1386 {
1387         struct io_ring_ctx *ctx = req->ctx;
1388         unsigned long flags;
1389
1390         spin_lock_irqsave(&ctx->completion_lock, flags);
1391         __io_cqring_fill_event(req, res, cflags);
1392         io_commit_cqring(ctx);
1393         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1394
1395         io_cqring_ev_posted(ctx);
1396 }
1397
1398 static void io_submit_flush_completions(struct io_comp_state *cs)
1399 {
1400         struct io_ring_ctx *ctx = cs->ctx;
1401
1402         spin_lock_irq(&ctx->completion_lock);
1403         while (!list_empty(&cs->list)) {
1404                 struct io_kiocb *req;
1405
1406                 req = list_first_entry(&cs->list, struct io_kiocb, list);
1407                 list_del(&req->list);
1408                 io_cqring_fill_event(req, req->result);
1409                 if (!(req->flags & REQ_F_LINK_HEAD)) {
1410                         req->flags |= REQ_F_COMP_LOCKED;
1411                         io_put_req(req);
1412                 } else {
1413                         spin_unlock_irq(&ctx->completion_lock);
1414                         io_put_req(req);
1415                         spin_lock_irq(&ctx->completion_lock);
1416                 }
1417         }
1418         io_commit_cqring(ctx);
1419         spin_unlock_irq(&ctx->completion_lock);
1420
1421         io_cqring_ev_posted(ctx);
1422         cs->nr = 0;
1423 }
1424
1425 static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
1426                               struct io_comp_state *cs)
1427 {
1428         if (!cs) {
1429                 io_cqring_add_event(req, res, cflags);
1430                 io_put_req(req);
1431         } else {
1432                 req->result = res;
1433                 list_add_tail(&req->list, &cs->list);
1434                 if (++cs->nr >= 32)
1435                         io_submit_flush_completions(cs);
1436         }
1437 }
1438
/* Complete @req immediately with result @res: no CQE flags, no batching. */
static void io_req_complete(struct io_kiocb *req, long res)
{
	__io_req_complete(req, res, 0, NULL);
}
1443
1444 static inline bool io_is_fallback_req(struct io_kiocb *req)
1445 {
1446         return req == (struct io_kiocb *)
1447                         ((unsigned long) req->ctx->fallback_req & ~1UL);
1448 }
1449
1450 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1451 {
1452         struct io_kiocb *req;
1453
1454         req = ctx->fallback_req;
1455         if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
1456                 return req;
1457
1458         return NULL;
1459 }
1460
1461 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1462                                      struct io_submit_state *state)
1463 {
1464         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1465         struct io_kiocb *req;
1466
1467         if (!state->free_reqs) {
1468                 size_t sz;
1469                 int ret;
1470
1471                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1472                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1473
1474                 /*
1475                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1476                  * retry single alloc to be on the safe side.
1477                  */
1478                 if (unlikely(ret <= 0)) {
1479                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1480                         if (!state->reqs[0])
1481                                 goto fallback;
1482                         ret = 1;
1483                 }
1484                 state->free_reqs = ret - 1;
1485                 req = state->reqs[ret - 1];
1486         } else {
1487                 state->free_reqs--;
1488                 req = state->reqs[state->free_reqs];
1489         }
1490
1491         return req;
1492 fallback:
1493         return io_get_fallback_req(ctx);
1494 }
1495
1496 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1497                           bool fixed)
1498 {
1499         if (fixed)
1500                 percpu_ref_put(req->fixed_file_refs);
1501         else
1502                 fput(file);
1503 }
1504
1505 static void io_dismantle_req(struct io_kiocb *req)
1506 {
1507         if (req->flags & REQ_F_NEED_CLEANUP)
1508                 io_cleanup_req(req);
1509
1510         kfree(req->io);
1511         if (req->file)
1512                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1513         __io_put_req_task(req);
1514         io_req_work_drop_env(req);
1515
1516         if (req->flags & REQ_F_INFLIGHT) {
1517                 struct io_ring_ctx *ctx = req->ctx;
1518                 unsigned long flags;
1519
1520                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1521                 list_del(&req->inflight_entry);
1522                 if (waitqueue_active(&ctx->inflight_wait))
1523                         wake_up(&ctx->inflight_wait);
1524                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1525         }
1526 }
1527
1528 static void __io_free_req(struct io_kiocb *req)
1529 {
1530         io_dismantle_req(req);
1531         percpu_ref_put(&req->ctx->refs);
1532         if (likely(!io_is_fallback_req(req)))
1533                 kmem_cache_free(req_cachep, req);
1534         else
1535                 clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
1536 }
1537
1538 static bool io_link_cancel_timeout(struct io_kiocb *req)
1539 {
1540         struct io_ring_ctx *ctx = req->ctx;
1541         int ret;
1542
1543         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1544         if (ret != -1) {
1545                 io_cqring_fill_event(req, -ECANCELED);
1546                 io_commit_cqring(ctx);
1547                 req->flags &= ~REQ_F_LINK_HEAD;
1548                 io_put_req(req);
1549                 return true;
1550         }
1551
1552         return false;
1553 }
1554
/*
 * Detach the next request to run from @req's link chain.
 *
 * A linked timeout sitting directly behind @req is cancelled first. The
 * first remaining link is stored in *@nxtptr and, if further requests are
 * chained behind it, promoted to link head. *@nxtptr is left untouched when
 * nothing is left to run.
 */
static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool wake_ev = false;

	/*
	 * The list should never be empty when we are called here. But could
	 * potentially happen if the chain is messed up, check to be on the
	 * safe side.
	 */
	while (!list_empty(&req->link_list)) {
		struct io_kiocb *nxt = list_first_entry(&req->link_list,
						struct io_kiocb, link_list);

		/* the next entry is this request's linked timeout: cancel it */
		if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
			     (nxt->flags & REQ_F_TIMEOUT))) {
			list_del_init(&nxt->link_list);
			wake_ev |= io_link_cancel_timeout(nxt);
			req->flags &= ~REQ_F_LINK_TIMEOUT;
			continue;
		}

		/* unlink @req itself; whatever remains hangs off nxt */
		list_del_init(&req->link_list);
		if (!list_empty(&nxt->link_list))
			nxt->flags |= REQ_F_LINK_HEAD;
		*nxtptr = nxt;
		break;
	}

	/* a cancelled timeout posted a completion, notify waiters */
	if (wake_ev)
		io_cqring_ev_posted(ctx);
}
1587
1588 /*
1589  * Called if REQ_F_LINK_HEAD is set, and we fail the head request
1590  */
1591 static void io_fail_links(struct io_kiocb *req)
1592 {
1593         struct io_ring_ctx *ctx = req->ctx;
1594         unsigned long flags;
1595
1596         spin_lock_irqsave(&ctx->completion_lock, flags);
1597
1598         while (!list_empty(&req->link_list)) {
1599                 struct io_kiocb *link = list_first_entry(&req->link_list,
1600                                                 struct io_kiocb, link_list);
1601
1602                 list_del_init(&link->link_list);
1603                 trace_io_uring_fail_link(req, link);
1604
1605                 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
1606                     link->opcode == IORING_OP_LINK_TIMEOUT) {
1607                         io_link_cancel_timeout(link);
1608                 } else {
1609                         io_cqring_fill_event(link, -ECANCELED);
1610                         __io_double_put_req(link);
1611                 }
1612                 req->flags &= ~REQ_F_LINK_TIMEOUT;
1613         }
1614
1615         io_commit_cqring(ctx);
1616         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1617         io_cqring_ev_posted(ctx);
1618 }
1619
1620 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
1621 {
1622         if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1623                 return;
1624         req->flags &= ~REQ_F_LINK_HEAD;
1625
1626         /*
1627          * If LINK is set, we have dependent requests in this chain. If we
1628          * didn't fail this request, queue the first one up, moving any other
1629          * dependencies to the next request. In case of failure, fail the rest
1630          * of the chain.
1631          */
1632         if (req->flags & REQ_F_FAIL_LINK) {
1633                 io_fail_links(req);
1634         } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1635                         REQ_F_LINK_TIMEOUT) {
1636                 struct io_ring_ctx *ctx = req->ctx;
1637                 unsigned long flags;
1638
1639                 /*
1640                  * If this is a timeout link, we could be racing with the
1641                  * timeout timer. Grab the completion lock for this case to
1642                  * protect against that.
1643                  */
1644                 spin_lock_irqsave(&ctx->completion_lock, flags);
1645                 io_req_link_next(req, nxt);
1646                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1647         } else {
1648                 io_req_link_next(req, nxt);
1649         }
1650 }
1651
/*
 * Complete @req with @error instead of running it: post the CQE under the
 * completion lock, notify waiters, mark the link chain as failed (when
 * req_set_fail_links() applies) and drop two request references.
 */
static void __io_req_task_cancel(struct io_kiocb *req, int error)
{
	struct io_ring_ctx *ctx = req->ctx;

	spin_lock_irq(&ctx->completion_lock);
	io_cqring_fill_event(req, error);
	io_commit_cqring(ctx);
	spin_unlock_irq(&ctx->completion_lock);

	io_cqring_ev_posted(ctx);
	req_set_fail_links(req);
	io_double_put_req(req);
}
1665
1666 static void io_req_task_cancel(struct callback_head *cb)
1667 {
1668         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1669
1670         __io_req_task_cancel(req, -ECANCELED);
1671 }
1672
1673 static void __io_req_task_submit(struct io_kiocb *req)
1674 {
1675         struct io_ring_ctx *ctx = req->ctx;
1676
1677         __set_current_state(TASK_RUNNING);
1678         if (!__io_sq_thread_acquire_mm(ctx)) {
1679                 mutex_lock(&ctx->uring_lock);
1680                 __io_queue_sqe(req, NULL, NULL);
1681                 mutex_unlock(&ctx->uring_lock);
1682         } else {
1683                 __io_req_task_cancel(req, -EFAULT);
1684         }
1685 }
1686
1687 static void io_req_task_submit(struct callback_head *cb)
1688 {
1689         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
1690
1691         __io_req_task_submit(req);
1692 }
1693
1694 static void io_req_task_queue(struct io_kiocb *req)
1695 {
1696         struct task_struct *tsk = req->task;
1697         int ret;
1698
1699         init_task_work(&req->task_work, io_req_task_submit);
1700
1701         ret = task_work_add(tsk, &req->task_work, true);
1702         if (unlikely(ret)) {
1703                 init_task_work(&req->task_work, io_req_task_cancel);
1704                 tsk = io_wq_get_task(req->ctx->io_wq);
1705                 task_work_add(tsk, &req->task_work, true);
1706         }
1707         wake_up_process(tsk);
1708 }
1709
1710 static void io_queue_next(struct io_kiocb *req)
1711 {
1712         struct io_kiocb *nxt = NULL;
1713
1714         io_req_find_next(req, &nxt);
1715         if (nxt)
1716                 io_req_task_queue(nxt);
1717 }
1718
/* Kick off whatever is linked behind @req, then release @req itself. */
static void io_free_req(struct io_kiocb *req)
{
        io_queue_next(req);
        __io_free_req(req);
}
1724
1725 struct req_batch {
1726         void *reqs[IO_IOPOLL_BATCH];
1727         int to_free;
1728 };
1729
1730 static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
1731                                       struct req_batch *rb)
1732 {
1733         kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
1734         percpu_ref_put_many(&ctx->refs, rb->to_free);
1735         rb->to_free = 0;
1736 }
1737
1738 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
1739                                      struct req_batch *rb)
1740 {
1741         if (rb->to_free)
1742                 __io_req_free_batch_flush(ctx, rb);
1743 }
1744
1745 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
1746 {
1747         if (unlikely(io_is_fallback_req(req))) {
1748                 io_free_req(req);
1749                 return;
1750         }
1751         if (req->flags & REQ_F_LINK_HEAD)
1752                 io_queue_next(req);
1753
1754         io_dismantle_req(req);
1755         rb->reqs[rb->to_free++] = req;
1756         if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1757                 __io_req_free_batch_flush(req->ctx, rb);
1758 }
1759
1760 /*
1761  * Drop reference to request, return next in chain (if there is one) if this
1762  * was the last reference to this request.
1763  */
1764 __attribute__((nonnull))
1765 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1766 {
1767         if (refcount_dec_and_test(&req->refs)) {
1768                 io_req_find_next(req, nxtptr);
1769                 __io_free_req(req);
1770         }
1771 }
1772
1773 static void io_put_req(struct io_kiocb *req)
1774 {
1775         if (refcount_dec_and_test(&req->refs))
1776                 io_free_req(req);
1777 }
1778
1779 static struct io_wq_work *io_steal_work(struct io_kiocb *req)
1780 {
1781         struct io_kiocb *nxt = NULL;
1782
1783         /*
1784          * A ref is owned by io-wq in which context we're. So, if that's the
1785          * last one, it's safe to steal next work. False negatives are Ok,
1786          * it just will be re-punted async in io_put_work()
1787          */
1788         if (refcount_read(&req->refs) != 1)
1789                 return NULL;
1790
1791         io_req_find_next(req, &nxt);
1792         if (!nxt)
1793                 return NULL;
1794
1795         if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file)
1796                 io_wq_hash_work(&nxt->work, file_inode(nxt->file));
1797
1798         io_req_task_queue(nxt);
1799         /*
1800          * If we're going to return actual work, here should be timeout prep:
1801          *
1802          * link = io_prep_linked_timeout(nxt);
1803          * if (link)
1804          *      nxt->flags |= REQ_F_QUEUE_TIMEOUT;
1805          */
1806         return NULL;
1807 }
1808
1809 /*
1810  * Must only be used if we don't need to care about links, usually from
1811  * within the completion handling itself.
1812  */
1813 static void __io_double_put_req(struct io_kiocb *req)
1814 {
1815         /* drop both submit and complete references */
1816         if (refcount_sub_and_test(2, &req->refs))
1817                 __io_free_req(req);
1818 }
1819
1820 static void io_double_put_req(struct io_kiocb *req)
1821 {
1822         /* drop both submit and complete references */
1823         if (refcount_sub_and_test(2, &req->refs))
1824                 io_free_req(req);
1825 }
1826
1827 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
1828 {
1829         struct io_rings *rings = ctx->rings;
1830
1831         if (test_bit(0, &ctx->cq_check_overflow)) {
1832                 /*
1833                  * noflush == true is from the waitqueue handler, just ensure
1834                  * we wake up the task, and the next invocation will flush the
1835                  * entries. We cannot safely to it from here.
1836                  */
1837                 if (noflush && !list_empty(&ctx->cq_overflow_list))
1838                         return -1U;
1839
1840                 io_cqring_overflow_flush(ctx, false);
1841         }
1842
1843         /* See comment at the top of this file */
1844         smp_rmb();
1845         return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
1846 }
1847
1848 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1849 {
1850         struct io_rings *rings = ctx->rings;
1851
1852         /* make sure SQ entry isn't read before tail */
1853         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1854 }
1855
1856 static int io_put_kbuf(struct io_kiocb *req)
1857 {
1858         struct io_buffer *kbuf;
1859         int cflags;
1860
1861         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
1862         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
1863         cflags |= IORING_CQE_F_BUFFER;
1864         req->rw.addr = 0;
1865         kfree(kbuf);
1866         return cflags;
1867 }
1868
1869 static void io_iopoll_queue(struct list_head *again)
1870 {
1871         struct io_kiocb *req;
1872
1873         do {
1874                 req = list_first_entry(again, struct io_kiocb, list);
1875                 list_del(&req->list);
1876
1877                 /* should have ->mm unless io_uring is dying, kill reqs then */
1878                 if (unlikely(!current->mm) || !io_rw_reissue(req, -EAGAIN))
1879                         io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL);
1880         } while (!list_empty(again));
1881 }
1882
1883 /*
1884  * Find and free completed poll iocbs
1885  */
1886 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1887                                struct list_head *done)
1888 {
1889         struct req_batch rb;
1890         struct io_kiocb *req;
1891         LIST_HEAD(again);
1892
1893         /* order with ->result store in io_complete_rw_iopoll() */
1894         smp_rmb();
1895
1896         rb.to_free = 0;
1897         while (!list_empty(done)) {
1898                 int cflags = 0;
1899
1900                 req = list_first_entry(done, struct io_kiocb, list);
1901                 if (READ_ONCE(req->result) == -EAGAIN) {
1902                         req->iopoll_completed = 0;
1903                         list_move_tail(&req->list, &again);
1904                         continue;
1905                 }
1906                 list_del(&req->list);
1907
1908                 if (req->flags & REQ_F_BUFFER_SELECTED)
1909                         cflags = io_put_kbuf(req);
1910
1911                 __io_cqring_fill_event(req, req->result, cflags);
1912                 (*nr_events)++;
1913
1914                 if (refcount_dec_and_test(&req->refs))
1915                         io_req_free_batch(&rb, req);
1916         }
1917
1918         io_commit_cqring(ctx);
1919         if (ctx->flags & IORING_SETUP_SQPOLL)
1920                 io_cqring_ev_posted(ctx);
1921         io_req_free_batch_finish(ctx, &rb);
1922
1923         if (!list_empty(&again))
1924                 io_iopoll_queue(&again);
1925 }
1926
1927 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1928                         long min)
1929 {
1930         struct io_kiocb *req, *tmp;
1931         LIST_HEAD(done);
1932         bool spin;
1933         int ret;
1934
1935         /*
1936          * Only spin for completions if we don't have multiple devices hanging
1937          * off our complete list, and we're under the requested amount.
1938          */
1939         spin = !ctx->poll_multi_file && *nr_events < min;
1940
1941         ret = 0;
1942         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
1943                 struct kiocb *kiocb = &req->rw.kiocb;
1944
1945                 /*
1946                  * Move completed and retryable entries to our local lists.
1947                  * If we find a request that requires polling, break out
1948                  * and complete those lists first, if we have entries there.
1949                  */
1950                 if (READ_ONCE(req->iopoll_completed)) {
1951                         list_move_tail(&req->list, &done);
1952                         continue;
1953                 }
1954                 if (!list_empty(&done))
1955                         break;
1956
1957                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1958                 if (ret < 0)
1959                         break;
1960
1961                 if (ret && spin)
1962                         spin = false;
1963                 ret = 0;
1964         }
1965
1966         if (!list_empty(&done))
1967                 io_iopoll_complete(ctx, nr_events, &done);
1968
1969         return ret;
1970 }
1971
1972 /*
1973  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1974  * non-spinning poll check - we'll still enter the driver poll loop, but only
1975  * as a non-spinning completion check.
1976  */
1977 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1978                                 long min)
1979 {
1980         while (!list_empty(&ctx->poll_list) && !need_resched()) {
1981                 int ret;
1982
1983                 ret = io_do_iopoll(ctx, nr_events, min);
1984                 if (ret < 0)
1985                         return ret;
1986                 if (!min || *nr_events >= min)
1987                         return 0;
1988         }
1989
1990         return 1;
1991 }
1992
1993 /*
1994  * We can't just wait for polled events to come to us, we have to actively
1995  * find and complete them.
1996  */
1997 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1998 {
1999         if (!(ctx->flags & IORING_SETUP_IOPOLL))
2000                 return;
2001
2002         mutex_lock(&ctx->uring_lock);
2003         while (!list_empty(&ctx->poll_list)) {
2004                 unsigned int nr_events = 0;
2005
2006                 io_iopoll_getevents(ctx, &nr_events, 1);
2007
2008                 /*
2009                  * Ensure we allow local-to-the-cpu processing to take place,
2010                  * in this case we need to ensure that we reap all events.
2011                  */
2012                 cond_resched();
2013         }
2014         mutex_unlock(&ctx->uring_lock);
2015 }
2016
2017 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
2018                            long min)
2019 {
2020         int iters = 0, ret = 0;
2021
2022         /*
2023          * We disallow the app entering submit/complete with polling, but we
2024          * still need to lock the ring to prevent racing with polled issue
2025          * that got punted to a workqueue.
2026          */
2027         mutex_lock(&ctx->uring_lock);
2028         do {
2029                 int tmin = 0;
2030
2031                 /*
2032                  * Don't enter poll loop if we already have events pending.
2033                  * If we do, we can potentially be spinning for commands that
2034                  * already triggered a CQE (eg in error).
2035                  */
2036                 if (io_cqring_events(ctx, false))
2037                         break;
2038
2039                 /*
2040                  * If a submit got punted to a workqueue, we can have the
2041                  * application entering polling for a command before it gets
2042                  * issued. That app will hold the uring_lock for the duration
2043                  * of the poll right here, so we need to take a breather every
2044                  * now and then to ensure that the issue has a chance to add
2045                  * the poll to the issued list. Otherwise we can spin here
2046                  * forever, while the workqueue is stuck trying to acquire the
2047                  * very same mutex.
2048                  */
2049                 if (!(++iters & 7)) {
2050                         mutex_unlock(&ctx->uring_lock);
2051                         if (current->task_works)
2052                                 task_work_run();
2053                         mutex_lock(&ctx->uring_lock);
2054                 }
2055
2056                 if (*nr_events < min)
2057                         tmin = min - *nr_events;
2058
2059                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
2060                 if (ret <= 0)
2061                         break;
2062                 ret = 0;
2063         } while (min && !*nr_events && !need_resched());
2064
2065         mutex_unlock(&ctx->uring_lock);
2066         return ret;
2067 }
2068
2069 static void kiocb_end_write(struct io_kiocb *req)
2070 {
2071         /*
2072          * Tell lockdep we inherited freeze protection from submission
2073          * thread.
2074          */
2075         if (req->flags & REQ_F_ISREG) {
2076                 struct inode *inode = file_inode(req->file);
2077
2078                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2079         }
2080         file_end_write(req->file);
2081 }
2082
2083 static void io_complete_rw_common(struct kiocb *kiocb, long res,
2084                                   struct io_comp_state *cs)
2085 {
2086         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2087         int cflags = 0;
2088
2089         if (kiocb->ki_flags & IOCB_WRITE)
2090                 kiocb_end_write(req);
2091
2092         if (res != req->result)
2093                 req_set_fail_links(req);
2094         if (req->flags & REQ_F_BUFFER_SELECTED)
2095                 cflags = io_put_kbuf(req);
2096         __io_req_complete(req, res, cflags, cs);
2097 }
2098
2099 #ifdef CONFIG_BLOCK
2100 static bool io_resubmit_prep(struct io_kiocb *req, int error)
2101 {
2102         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2103         ssize_t ret = -ECANCELED;
2104         struct iov_iter iter;
2105         int rw;
2106
2107         if (error) {
2108                 ret = error;
2109                 goto end_req;
2110         }
2111
2112         switch (req->opcode) {
2113         case IORING_OP_READV:
2114         case IORING_OP_READ_FIXED:
2115         case IORING_OP_READ:
2116                 rw = READ;
2117                 break;
2118         case IORING_OP_WRITEV:
2119         case IORING_OP_WRITE_FIXED:
2120         case IORING_OP_WRITE:
2121                 rw = WRITE;
2122                 break;
2123         default:
2124                 printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2125                                 req->opcode);
2126                 goto end_req;
2127         }
2128
2129         ret = io_import_iovec(rw, req, &iovec, &iter, false);
2130         if (ret < 0)
2131                 goto end_req;
2132         ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
2133         if (!ret)
2134                 return true;
2135         kfree(iovec);
2136 end_req:
2137         req_set_fail_links(req);
2138         io_req_complete(req, ret);
2139         return false;
2140 }
2141
2142 static void io_rw_resubmit(struct callback_head *cb)
2143 {
2144         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2145         struct io_ring_ctx *ctx = req->ctx;
2146         int err;
2147
2148         __set_current_state(TASK_RUNNING);
2149
2150         err = io_sq_thread_acquire_mm(ctx, req);
2151
2152         if (io_resubmit_prep(req, err)) {
2153                 refcount_inc(&req->refs);
2154                 io_queue_async_work(req);
2155         }
2156 }
2157 #endif
2158
2159 static bool io_rw_reissue(struct io_kiocb *req, long res)
2160 {
2161 #ifdef CONFIG_BLOCK
2162         struct task_struct *tsk;
2163         int ret;
2164
2165         if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
2166                 return false;
2167
2168         tsk = req->task;
2169         init_task_work(&req->task_work, io_rw_resubmit);
2170         ret = task_work_add(tsk, &req->task_work, true);
2171         if (!ret)
2172                 return true;
2173 #endif
2174         return false;
2175 }
2176
2177 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2178                              struct io_comp_state *cs)
2179 {
2180         if (!io_rw_reissue(req, res))
2181                 io_complete_rw_common(&req->rw.kiocb, res, cs);
2182 }
2183
2184 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2185 {
2186         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2187
2188         __io_complete_rw(req, res, res2, NULL);
2189 }
2190
2191 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2192 {
2193         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2194
2195         if (kiocb->ki_flags & IOCB_WRITE)
2196                 kiocb_end_write(req);
2197
2198         if (res != -EAGAIN && res != req->result)
2199                 req_set_fail_links(req);
2200
2201         WRITE_ONCE(req->result, res);
2202         /* order with io_poll_complete() checking ->result */
2203         smp_wmb();
2204         WRITE_ONCE(req->iopoll_completed, 1);
2205 }
2206
2207 /*
2208  * After the iocb has been issued, it's safe to be found on the poll list.
2209  * Adding the kiocb to the list AFTER submission ensures that we don't
2210  * find it from a io_iopoll_getevents() thread before the issuer is done
2211  * accessing the kiocb cookie.
2212  */
2213 static void io_iopoll_req_issued(struct io_kiocb *req)
2214 {
2215         struct io_ring_ctx *ctx = req->ctx;
2216
2217         /*
2218          * Track whether we have multiple files in our lists. This will impact
2219          * how we do polling eventually, not spinning if we're on potentially
2220          * different devices.
2221          */
2222         if (list_empty(&ctx->poll_list)) {
2223                 ctx->poll_multi_file = false;
2224         } else if (!ctx->poll_multi_file) {
2225                 struct io_kiocb *list_req;
2226
2227                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
2228                                                 list);
2229                 if (list_req->file != req->file)
2230                         ctx->poll_multi_file = true;
2231         }
2232
2233         /*
2234          * For fast devices, IO may have already completed. If it has, add
2235          * it to the front so we find it first.
2236          */
2237         if (READ_ONCE(req->iopoll_completed))
2238                 list_add(&req->list, &ctx->poll_list);
2239         else
2240                 list_add_tail(&req->list, &ctx->poll_list);
2241
2242         if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2243             wq_has_sleeper(&ctx->sqo_wait))
2244                 wake_up(&ctx->sqo_wait);
2245 }
2246
2247 static void __io_state_file_put(struct io_submit_state *state)
2248 {
2249         int diff = state->has_refs - state->used_refs;
2250
2251         if (diff)
2252                 fput_many(state->file, diff);
2253         state->file = NULL;
2254 }
2255
2256 static inline void io_state_file_put(struct io_submit_state *state)
2257 {
2258         if (state->file)
2259                 __io_state_file_put(state);
2260 }
2261
2262 /*
2263  * Get as many references to a file as we have IOs left in this submission,
2264  * assuming most submissions are for one file, or at least that each file
2265  * has more than one submission.
2266  */
2267 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2268 {
2269         if (!state)
2270                 return fget(fd);
2271
2272         if (state->file) {
2273                 if (state->fd == fd) {
2274                         state->used_refs++;
2275                         state->ios_left--;
2276                         return state->file;
2277                 }
2278                 __io_state_file_put(state);
2279         }
2280         state->file = fget_many(fd, state->ios_left);
2281         if (!state->file)
2282                 return NULL;
2283
2284         state->fd = fd;
2285         state->has_refs = state->ios_left;
2286         state->used_refs = 1;
2287         state->ios_left--;
2288         return state->file;
2289 }
2290
2291 static bool io_bdev_nowait(struct block_device *bdev)
2292 {
2293 #ifdef CONFIG_BLOCK
2294         return !bdev || queue_is_mq(bdev_get_queue(bdev));
2295 #else
2296         return true;
2297 #endif
2298 }
2299
2300 /*
2301  * If we tracked the file through the SCM inflight mechanism, we could support
2302  * any file. For now, just ensure that anything potentially problematic is done
2303  * inline.
2304  */
2305 static bool io_file_supports_async(struct file *file, int rw)
2306 {
2307         umode_t mode = file_inode(file)->i_mode;
2308
2309         if (S_ISBLK(mode)) {
2310                 if (io_bdev_nowait(file->f_inode->i_bdev))
2311                         return true;
2312                 return false;
2313         }
2314         if (S_ISCHR(mode) || S_ISSOCK(mode))
2315                 return true;
2316         if (S_ISREG(mode)) {
2317                 if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2318                     file->f_op != &io_uring_fops)
2319                         return true;
2320                 return false;
2321         }
2322
2323         /* any ->read/write should understand O_NONBLOCK */
2324         if (file->f_flags & O_NONBLOCK)
2325                 return true;
2326
2327         if (!(file->f_mode & FMODE_NOWAIT))
2328                 return false;
2329
2330         if (rw == READ)
2331                 return file->f_op->read_iter != NULL;
2332
2333         return file->f_op->write_iter != NULL;
2334 }
2335
2336 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2337                       bool force_nonblock)
2338 {
2339         struct io_ring_ctx *ctx = req->ctx;
2340         struct kiocb *kiocb = &req->rw.kiocb;
2341         unsigned ioprio;
2342         int ret;
2343
2344         if (S_ISREG(file_inode(req->file)->i_mode))
2345                 req->flags |= REQ_F_ISREG;
2346
2347         kiocb->ki_pos = READ_ONCE(sqe->off);
2348         if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
2349                 req->flags |= REQ_F_CUR_POS;
2350                 kiocb->ki_pos = req->file->f_pos;
2351         }
2352         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2353         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2354         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2355         if (unlikely(ret))
2356                 return ret;
2357
2358         ioprio = READ_ONCE(sqe->ioprio);
2359         if (ioprio) {
2360                 ret = ioprio_check_cap(ioprio);
2361                 if (ret)
2362                         return ret;
2363
2364                 kiocb->ki_ioprio = ioprio;
2365         } else
2366                 kiocb->ki_ioprio = get_current_ioprio();
2367
2368         /* don't allow async punt if RWF_NOWAIT was requested */
2369         if (kiocb->ki_flags & IOCB_NOWAIT)
2370                 req->flags |= REQ_F_NOWAIT;
2371
2372         if (kiocb->ki_flags & IOCB_DIRECT)
2373                 io_get_req_task(req);
2374
2375         if (force_nonblock)
2376                 kiocb->ki_flags |= IOCB_NOWAIT;
2377
2378         if (ctx->flags & IORING_SETUP_IOPOLL) {
2379                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2380                     !kiocb->ki_filp->f_op->iopoll)
2381                         return -EOPNOTSUPP;
2382
2383                 kiocb->ki_flags |= IOCB_HIPRI;
2384                 kiocb->ki_complete = io_complete_rw_iopoll;
2385                 req->iopoll_completed = 0;
2386                 io_get_req_task(req);
2387         } else {
2388                 if (kiocb->ki_flags & IOCB_HIPRI)
2389                         return -EINVAL;
2390                 kiocb->ki_complete = io_complete_rw;
2391         }
2392
2393         req->rw.addr = READ_ONCE(sqe->addr);
2394         req->rw.len = READ_ONCE(sqe->len);
2395         req->buf_index = READ_ONCE(sqe->buf_index);
2396         return 0;
2397 }
2398
2399 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2400 {
2401         switch (ret) {
2402         case -EIOCBQUEUED:
2403                 break;
2404         case -ERESTARTSYS:
2405         case -ERESTARTNOINTR:
2406         case -ERESTARTNOHAND:
2407         case -ERESTART_RESTARTBLOCK:
2408                 /*
2409                  * We can't just restart the syscall, since previously
2410                  * submitted sqes may already be in progress. Just fail this
2411                  * IO with EINTR.
2412                  */
2413                 ret = -EINTR;
2414                 /* fall through */
2415         default:
2416                 kiocb->ki_complete(kiocb, ret, 0);
2417         }
2418 }
2419
2420 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2421                        struct io_comp_state *cs)
2422 {
2423         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2424
2425         if (req->flags & REQ_F_CUR_POS)
2426                 req->file->f_pos = kiocb->ki_pos;
2427         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2428                 __io_complete_rw(req, ret, 0, cs);
2429         else
2430                 io_rw_done(kiocb, ret);
2431 }
2432
2433 static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
2434                                struct iov_iter *iter)
2435 {
2436         struct io_ring_ctx *ctx = req->ctx;
2437         size_t len = req->rw.len;
2438         struct io_mapped_ubuf *imu;
2439         u16 index, buf_index;
2440         size_t offset;
2441         u64 buf_addr;
2442
2443         /* attempt to use fixed buffers without having provided iovecs */
2444         if (unlikely(!ctx->user_bufs))
2445                 return -EFAULT;
2446
2447         buf_index = req->buf_index;
2448         if (unlikely(buf_index >= ctx->nr_user_bufs))
2449                 return -EFAULT;
2450
2451         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2452         imu = &ctx->user_bufs[index];
2453         buf_addr = req->rw.addr;
2454
2455         /* overflow */
2456         if (buf_addr + len < buf_addr)
2457                 return -EFAULT;
2458         /* not inside the mapped region */
2459         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2460                 return -EFAULT;
2461
2462         /*
2463          * May not be a start of buffer, set size appropriately
2464          * and advance us to the beginning.
2465          */
2466         offset = buf_addr - imu->ubuf;
2467         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2468
2469         if (offset) {
2470                 /*
2471                  * Don't use iov_iter_advance() here, as it's really slow for
2472                  * using the latter parts of a big fixed buffer - it iterates
2473                  * over each segment manually. We can cheat a bit here, because
2474                  * we know that:
2475                  *
2476                  * 1) it's a BVEC iter, we set it up
2477                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
2478                  *    first and last bvec
2479                  *
2480                  * So just find our index, and adjust the iterator afterwards.
2481                  * If the offset is within the first bvec (or the whole first
2482                  * bvec, just use iov_iter_advance(). This makes it easier
2483                  * since we can just skip the first segment, which may not
2484                  * be PAGE_SIZE aligned.
2485                  */
2486                 const struct bio_vec *bvec = imu->bvec;
2487
2488                 if (offset <= bvec->bv_len) {
2489                         iov_iter_advance(iter, offset);
2490                 } else {
2491                         unsigned long seg_skip;
2492
2493                         /* skip first vec */
2494                         offset -= bvec->bv_len;
2495                         seg_skip = 1 + (offset >> PAGE_SHIFT);
2496
2497                         iter->bvec = bvec + seg_skip;
2498                         iter->nr_segs -= seg_skip;
2499                         iter->count -= bvec->bv_len + offset;
2500                         iter->iov_offset = offset & ~PAGE_MASK;
2501                 }
2502         }
2503
2504         return len;
2505 }
2506
2507 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2508 {
2509         if (needs_lock)
2510                 mutex_unlock(&ctx->uring_lock);
2511 }
2512
2513 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2514 {
2515         /*
2516          * "Normal" inline submissions always hold the uring_lock, since we
2517          * grab it from the system call. Same is true for the SQPOLL offload.
2518          * The only exception is when we've detached the request and issue it
2519          * from an async worker thread, grab the lock for that case.
2520          */
2521         if (needs_lock)
2522                 mutex_lock(&ctx->uring_lock);
2523 }
2524
2525 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2526                                           int bgid, struct io_buffer *kbuf,
2527                                           bool needs_lock)
2528 {
2529         struct io_buffer *head;
2530
2531         if (req->flags & REQ_F_BUFFER_SELECTED)
2532                 return kbuf;
2533
2534         io_ring_submit_lock(req->ctx, needs_lock);
2535
2536         lockdep_assert_held(&req->ctx->uring_lock);
2537
2538         head = idr_find(&req->ctx->io_buffer_idr, bgid);
2539         if (head) {
2540                 if (!list_empty(&head->list)) {
2541                         kbuf = list_last_entry(&head->list, struct io_buffer,
2542                                                         list);
2543                         list_del(&kbuf->list);
2544                 } else {
2545                         kbuf = head;
2546                         idr_remove(&req->ctx->io_buffer_idr, bgid);
2547                 }
2548                 if (*len > kbuf->len)
2549                         *len = kbuf->len;
2550         } else {
2551                 kbuf = ERR_PTR(-ENOBUFS);
2552         }
2553
2554         io_ring_submit_unlock(req->ctx, needs_lock);
2555
2556         return kbuf;
2557 }
2558
2559 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2560                                         bool needs_lock)
2561 {
2562         struct io_buffer *kbuf;
2563         u16 bgid;
2564
2565         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2566         bgid = req->buf_index;
2567         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2568         if (IS_ERR(kbuf))
2569                 return kbuf;
2570         req->rw.addr = (u64) (unsigned long) kbuf;
2571         req->flags |= REQ_F_BUFFER_SELECTED;
2572         return u64_to_user_ptr(kbuf->addr);
2573 }
2574
#ifdef CONFIG_COMPAT
/*
 * Compat variant of the single-iovec buffer-select import.
 *
 * Reads only iov_len from the user's compat_iovec at req->rw.addr, selects a
 * provided buffer of at most that length and fills iov[0] with the result.
 *
 * Returns 0 on success, -EFAULT if the user iovec cannot be read, -EINVAL
 * for a negative length, or the error from io_rw_buffer_select().
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
				bool needs_lock)
{
	struct compat_iovec __user *cvec = u64_to_user_ptr(req->rw.addr);
	compat_ssize_t user_len;
	void __user *sel;
	ssize_t len;

	if (!access_ok(cvec, sizeof(*cvec)) ||
	    __get_user(user_len, &cvec->iov_len))
		return -EFAULT;
	if (user_len < 0)
		return -EINVAL;

	len = user_len;
	sel = io_rw_buffer_select(req, &len, needs_lock);
	if (IS_ERR(sel))
		return PTR_ERR(sel);

	iov->iov_base = sel;
	iov->iov_len = (compat_size_t) len;
	return 0;
}
#endif
2601
2602 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2603                                       bool needs_lock)
2604 {
2605         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2606         void __user *buf;
2607         ssize_t len;
2608
2609         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2610                 return -EFAULT;
2611
2612         len = iov[0].iov_len;
2613         if (len < 0)
2614                 return -EINVAL;
2615         buf = io_rw_buffer_select(req, &len, needs_lock);
2616         if (IS_ERR(buf))
2617                 return PTR_ERR(buf);
2618         iov[0].iov_base = buf;
2619         iov[0].iov_len = len;
2620         return 0;
2621 }
2622
2623 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2624                                     bool needs_lock)
2625 {
2626         if (req->flags & REQ_F_BUFFER_SELECTED) {
2627                 struct io_buffer *kbuf;
2628
2629                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2630                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2631                 iov[0].iov_len = kbuf->len;
2632                 return 0;
2633         }
2634         if (!req->rw.len)
2635                 return 0;
2636         else if (req->rw.len > 1)
2637                 return -EINVAL;
2638
2639 #ifdef CONFIG_COMPAT
2640         if (req->ctx->compat)
2641                 return io_compat_import(req, iov, needs_lock);
2642 #endif
2643
2644         return __io_iov_buffer_select(req, iov, needs_lock);
2645 }
2646
2647 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2648                                struct iovec **iovec, struct iov_iter *iter,
2649                                bool needs_lock)
2650 {
2651         void __user *buf = u64_to_user_ptr(req->rw.addr);
2652         size_t sqe_len = req->rw.len;
2653         ssize_t ret;
2654         u8 opcode;
2655
2656         opcode = req->opcode;
2657         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2658                 *iovec = NULL;
2659                 return io_import_fixed(req, rw, iter);
2660         }
2661
2662         /* buffer index only valid with fixed read/write, or buffer select  */
2663         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2664                 return -EINVAL;
2665
2666         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2667                 if (req->flags & REQ_F_BUFFER_SELECT) {
2668                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2669                         if (IS_ERR(buf)) {
2670                                 *iovec = NULL;
2671                                 return PTR_ERR(buf);
2672                         }
2673                         req->rw.len = sqe_len;
2674                 }
2675
2676                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2677                 *iovec = NULL;
2678                 return ret < 0 ? ret : sqe_len;
2679         }
2680
2681         if (req->io) {
2682                 struct io_async_rw *iorw = &req->io->rw;
2683
2684                 *iovec = iorw->iov;
2685                 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2686                 if (iorw->iov == iorw->fast_iov)
2687                         *iovec = NULL;
2688                 return iorw->size;
2689         }
2690
2691         if (req->flags & REQ_F_BUFFER_SELECT) {
2692                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2693                 if (!ret) {
2694                         ret = (*iovec)->iov_len;
2695                         iov_iter_init(iter, rw, *iovec, 1, ret);
2696                 }
2697                 *iovec = NULL;
2698                 return ret;
2699         }
2700
2701 #ifdef CONFIG_COMPAT
2702         if (req->ctx->compat)
2703                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2704                                                 iovec, iter);
2705 #endif
2706
2707         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2708 }
2709
2710 /*
2711  * For files that don't have ->read_iter() and ->write_iter(), handle them
2712  * by looping over ->read() or ->write() manually.
2713  */
2714 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2715                            struct iov_iter *iter)
2716 {
2717         ssize_t ret = 0;
2718
2719         /*
2720          * Don't support polled IO through this interface, and we can't
2721          * support non-blocking either. For the latter, this just causes
2722          * the kiocb to be handled from an async context.
2723          */
2724         if (kiocb->ki_flags & IOCB_HIPRI)
2725                 return -EOPNOTSUPP;
2726         if (kiocb->ki_flags & IOCB_NOWAIT)
2727                 return -EAGAIN;
2728
2729         while (iov_iter_count(iter)) {
2730                 struct iovec iovec;
2731                 ssize_t nr;
2732
2733                 if (!iov_iter_is_bvec(iter)) {
2734                         iovec = iov_iter_iovec(iter);
2735                 } else {
2736                         /* fixed buffers import bvec */
2737                         iovec.iov_base = kmap(iter->bvec->bv_page)
2738                                                 + iter->iov_offset;
2739                         iovec.iov_len = min(iter->count,
2740                                         iter->bvec->bv_len - iter->iov_offset);
2741                 }
2742
2743                 if (rw == READ) {
2744                         nr = file->f_op->read(file, iovec.iov_base,
2745                                               iovec.iov_len, &kiocb->ki_pos);
2746                 } else {
2747                         nr = file->f_op->write(file, iovec.iov_base,
2748                                                iovec.iov_len, &kiocb->ki_pos);
2749                 }
2750
2751                 if (iov_iter_is_bvec(iter))
2752                         kunmap(iter->bvec->bv_page);
2753
2754                 if (nr < 0) {
2755                         if (!ret)
2756                                 ret = nr;
2757                         break;
2758                 }
2759                 ret += nr;
2760                 if (nr != iovec.iov_len)
2761                         break;
2762                 iov_iter_advance(iter, nr);
2763         }
2764
2765         return ret;
2766 }
2767
2768 static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
2769                           struct iovec *iovec, struct iovec *fast_iov,
2770                           struct iov_iter *iter)
2771 {
2772         req->io->rw.nr_segs = iter->nr_segs;
2773         req->io->rw.size = io_size;
2774         req->io->rw.iov = iovec;
2775         if (!req->io->rw.iov) {
2776                 req->io->rw.iov = req->io->rw.fast_iov;
2777                 if (req->io->rw.iov != fast_iov)
2778                         memcpy(req->io->rw.iov, fast_iov,
2779                                sizeof(struct iovec) * iter->nr_segs);
2780         } else {
2781                 req->flags |= REQ_F_NEED_CLEANUP;
2782         }
2783 }
2784
2785 static inline int __io_alloc_async_ctx(struct io_kiocb *req)
2786 {
2787         req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
2788         return req->io == NULL;
2789 }
2790
2791 static int io_alloc_async_ctx(struct io_kiocb *req)
2792 {
2793         if (!io_op_defs[req->opcode].async_ctx)
2794                 return 0;
2795
2796         return  __io_alloc_async_ctx(req);
2797 }
2798
2799 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2800                              struct iovec *iovec, struct iovec *fast_iov,
2801                              struct iov_iter *iter)
2802 {
2803         if (!io_op_defs[req->opcode].async_ctx)
2804                 return 0;
2805         if (!req->io) {
2806                 if (__io_alloc_async_ctx(req))
2807                         return -ENOMEM;
2808
2809                 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2810         }
2811         return 0;
2812 }
2813
2814 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2815                         bool force_nonblock)
2816 {
2817         struct io_async_ctx *io;
2818         struct iov_iter iter;
2819         ssize_t ret;
2820
2821         ret = io_prep_rw(req, sqe, force_nonblock);
2822         if (ret)
2823                 return ret;
2824
2825         if (unlikely(!(req->file->f_mode & FMODE_READ)))
2826                 return -EBADF;
2827
2828         /* either don't need iovec imported or already have it */
2829         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2830                 return 0;
2831
2832         io = req->io;
2833         io->rw.iov = io->rw.fast_iov;
2834         req->io = NULL;
2835         ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
2836         req->io = io;
2837         if (ret < 0)
2838                 return ret;
2839
2840         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2841         return 0;
2842 }
2843
2844 static void io_async_buf_cancel(struct callback_head *cb)
2845 {
2846         struct io_async_rw *rw;
2847         struct io_kiocb *req;
2848
2849         rw = container_of(cb, struct io_async_rw, task_work);
2850         req = rw->wpq.wait.private;
2851         __io_req_task_cancel(req, -ECANCELED);
2852 }
2853
2854 static void io_async_buf_retry(struct callback_head *cb)
2855 {
2856         struct io_async_rw *rw;
2857         struct io_kiocb *req;
2858
2859         rw = container_of(cb, struct io_async_rw, task_work);
2860         req = rw->wpq.wait.private;
2861
2862         __io_req_task_submit(req);
2863 }
2864
2865 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
2866                              int sync, void *arg)
2867 {
2868         struct wait_page_queue *wpq;
2869         struct io_kiocb *req = wait->private;
2870         struct io_async_rw *rw = &req->io->rw;
2871         struct wait_page_key *key = arg;
2872         struct task_struct *tsk;
2873         int ret;
2874
2875         wpq = container_of(wait, struct wait_page_queue, wait);
2876
2877         ret = wake_page_match(wpq, key);
2878         if (ret != 1)
2879                 return ret;
2880
2881         list_del_init(&wait->entry);
2882
2883         init_task_work(&rw->task_work, io_async_buf_retry);
2884         /* submit ref gets dropped, acquire a new one */
2885         refcount_inc(&req->refs);
2886         tsk = req->task;
2887         ret = task_work_add(tsk, &rw->task_work, true);
2888         if (unlikely(ret)) {
2889                 /* queue just for cancelation */
2890                 init_task_work(&rw->task_work, io_async_buf_cancel);
2891                 tsk = io_wq_get_task(req->ctx->io_wq);
2892                 task_work_add(tsk, &rw->task_work, true);
2893         }
2894         wake_up_process(tsk);
2895         return 1;
2896 }
2897
2898 static bool io_rw_should_retry(struct io_kiocb *req)
2899 {
2900         struct kiocb *kiocb = &req->rw.kiocb;
2901         int ret;
2902
2903         /* never retry for NOWAIT, we just complete with -EAGAIN */
2904         if (req->flags & REQ_F_NOWAIT)
2905                 return false;
2906
2907         /* already tried, or we're doing O_DIRECT */
2908         if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
2909                 return false;
2910         /*
2911          * just use poll if we can, and don't attempt if the fs doesn't
2912          * support callback based unlocks
2913          */
2914         if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
2915                 return false;
2916
2917         /*
2918          * If request type doesn't require req->io to defer in general,
2919          * we need to allocate it here
2920          */
2921         if (!req->io && __io_alloc_async_ctx(req))
2922                 return false;
2923
2924         ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
2925                                                 io_async_buf_func, req);
2926         if (!ret) {
2927                 io_get_req_task(req);
2928                 return true;
2929         }
2930
2931         return false;
2932 }
2933
2934 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
2935 {
2936         if (req->file->f_op->read_iter)
2937                 return call_read_iter(req->file, &req->rw.kiocb, iter);
2938         return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
2939 }
2940
/*
 * Issue a read for @req.
 *
 * @req:            request to issue
 * @force_nonblock: true when the attempt must not block
 * @cs:             completion state handed through to kiocb_done()
 *
 * Flow:
 *  1. Import the iovec and remember the total size in req->result.
 *  2. For a non-blocking attempt on a file without async support, skip the
 *     read and jump straight to the "copy_iov" punt path.
 *  3. Otherwise verify the area and try the read.  The result is completed
 *     via kiocb_done() unless this is a non-blocking attempt that came back
 *     with -EAGAIN or -EIO.
 *  4. In that case the iterator's count and nr_segs are restored and the
 *     "copy_iov" path saves the iovec state with io_setup_async_rw().  If
 *     io_rw_should_retry() arms a wait-queue callback, the read is tried
 *     once more; otherwise -EAGAIN is returned.
 *
 * Returns a negative error from io_import_iovec() or io_setup_async_rw(), a
 * non-zero rw_verify_area() result, -EAGAIN when the request has to be
 * retried, or 0 once the request was completed or queued (-EIOCBQUEUED from
 * the armed retry).
 */
2941 static int io_read(struct io_kiocb *req, bool force_nonblock,
2942                    struct io_comp_state *cs)
2943 {
2944         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2945         struct kiocb *kiocb = &req->rw.kiocb;
2946         struct iov_iter iter;
2947         size_t iov_count;
2948         ssize_t io_size, ret;
2949
2950         ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
2951         if (ret < 0)
2952                 return ret;
2953
2954         /* Ensure we clear previously set non-block flag */
2955         if (!force_nonblock)
2956                 kiocb->ki_flags &= ~IOCB_NOWAIT;
2957
2958         io_size = ret;
2959         req->result = io_size;
2960
2961         /* If the file doesn't support async, just async punt */
2962         if (force_nonblock && !io_file_supports_async(req->file, READ))
2963                 goto copy_iov;
2964
2965         iov_count = iov_iter_count(&iter);
2966         ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
2967         if (!ret) {
             /* saved so the iterator can be restored before a retry */
2968                 unsigned long nr_segs = iter.nr_segs;
2969                 ssize_t ret2 = 0;
2970
2971                 ret2 = io_iter_do_read(req, &iter);
2972
2973                 /* Catch -EAGAIN return for forced non-blocking submission */
2974                 if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
2975                         kiocb_done(kiocb, ret2, cs);
2976                 } else {
                     /* undo whatever the failed attempt did to the iterator */
2977                         iter.count = iov_count;
2978                         iter.nr_segs = nr_segs;
                     /*
                      * Also entered by the goto above, before the locals
                      * iov_count and nr_segs are set; nothing below reads them.
                      */
2979 copy_iov:
2980                         ret = io_setup_async_rw(req, io_size, iovec,
2981                                                 inline_vecs, &iter);
2982                         if (ret)
2983                                 goto out_free;
2984                         /* if we can retry, do so with the callbacks armed */
2985                         if (io_rw_should_retry(req)) {
2986                                 ret2 = io_iter_do_read(req, &iter);
2987                                 if (ret2 == -EIOCBQUEUED) {
2988                                         goto out_free;
2989                                 } else if (ret2 != -EAGAIN) {
2990                                         kiocb_done(kiocb, ret2, cs);
2991                                         goto out_free;
2992                                 }
2993                         }
2994                         kiocb->ki_flags &= ~IOCB_WAITQ;
                     /*
                      * iovec is not freed on this path.
                      * NOTE(review): presumably it is now owned by req->io
                      * (io_req_map_rw() sets REQ_F_NEED_CLEANUP when it
                      * stores a non-NULL iovec) - confirm for the case where
                      * req->io already existed.
                      */
2995                         return -EAGAIN;
2996                 }
2997         }
2998 out_free:
         /* with REQ_F_NEED_CLEANUP set the iovec is left for later cleanup */
2999         if (!(req->flags & REQ_F_NEED_CLEANUP))
3000                 kfree(iovec);
3001         return ret;
3002 }
3003
3004 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
3005                          bool force_nonblock)
3006 {
3007         struct io_async_ctx *io;
3008         struct iov_iter iter;
3009         ssize_t ret;
3010
3011         ret = io_prep_rw(req, sqe, force_nonblock);
3012         if (ret)
3013                 return ret;
3014
3015         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3016                 return -EBADF;
3017
3018         req->fsize = rlimit(RLIMIT_FSIZE);
3019
3020         /* either don't need iovec imported or already have it */
3021         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
3022                 return 0;
3023
3024         io = req->io;
3025         io->rw.iov = io->rw.fast_iov;
3026         req->io = NULL;
3027         ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
3028         req->io = io;
3029         if (ret < 0)
3030                 return ret;
3031
3032         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
3033         return 0;
3034 }
3035
/*
 * Issue a write for @req.
 *
 * @req:            request to issue
 * @force_nonblock: true when the attempt must not block
 * @cs:             completion state handed through to kiocb_done()
 *
 * Flow:
 *  1. Import the iovec and remember the total size in req->result.
 *  2. A non-blocking attempt is punted immediately (the "copy_iov" path)
 *     when the file lacks async support, or when it is a regular file and
 *     the write is not O_DIRECT.
 *  3. Otherwise verify the area, take freeze protection for regular files,
 *     and perform the write through ->write_iter() or loop_rw_iter().  For
 *     a blocking attempt the FSIZE rlimit captured in req->fsize is applied
 *     to current around the call.
 *  4. The result is completed via kiocb_done() unless this is a
 *     non-blocking attempt that came back with -EAGAIN (or -EOPNOTSUPP with
 *     IOCB_NOWAIT set, which is treated the same); then the iterator is
 *     restored, the iovec state is saved and -EAGAIN is returned.
 *
 * Returns a negative error from io_import_iovec() or io_setup_async_rw(), a
 * non-zero rw_verify_area() result, -EAGAIN when the request has to be
 * retried, or 0 once the request was completed.
 */
3036 static int io_write(struct io_kiocb *req, bool force_nonblock,
3037                     struct io_comp_state *cs)
3038 {
3039         struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3040         struct kiocb *kiocb = &req->rw.kiocb;
3041         struct iov_iter iter;
3042         size_t iov_count;
3043         ssize_t ret, io_size;
3044
3045         ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
3046         if (ret < 0)
3047                 return ret;
3048
3049         /* Ensure we clear previously set non-block flag */
3050         if (!force_nonblock)
3051                 req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
3052
3053         io_size = ret;
3054         req->result = io_size;
3055
3056         /* If the file doesn't support async, just async punt */
3057         if (force_nonblock && !io_file_supports_async(req->file, WRITE))
3058                 goto copy_iov;
3059
3060         /* file path doesn't support NOWAIT for non-direct_IO */
3061         if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3062             (req->flags & REQ_F_ISREG))
3063                 goto copy_iov;
3064
3065         iov_count = iov_iter_count(&iter);
3066         ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
3067         if (!ret) {
             /* saved so the iterator can be restored before a retry */
3068                 unsigned long nr_segs = iter.nr_segs;
3069                 ssize_t ret2;
3070
3071                 /*
3072                  * Open-code file_start_write here to grab freeze protection,
3073                  * which will be released by another thread in
3074                  * io_complete_rw().  Fool lockdep by telling it the lock got
3075                  * released so that it doesn't complain about the held lock when
3076                  * we return to userspace.
3077                  */
3078                 if (req->flags & REQ_F_ISREG) {
3079                         __sb_start_write(file_inode(req->file)->i_sb,
3080                                                 SB_FREEZE_WRITE, true);
3081                         __sb_writers_release(file_inode(req->file)->i_sb,
3082                                                 SB_FREEZE_WRITE);
3083                 }
3084                 kiocb->ki_flags |= IOCB_WRITE;
3085
             /*
              * Blocking attempt: apply the file size limit captured at prep
              * time in req->fsize for the duration of the write.
              */
3086                 if (!force_nonblock)
3087                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
3088
3089                 if (req->file->f_op->write_iter)
3090                         ret2 = call_write_iter(req->file, kiocb, &iter);
3091                 else
3092                         ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
3093
             /* and set the limit to RLIM_INFINITY again afterwards */
3094                 if (!force_nonblock)
3095                         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
3096
3097                 /*
3098                  * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3099                  * retry them without IOCB_NOWAIT.
3100                  */
3101                 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3102                         ret2 = -EAGAIN;
3103                 if (!force_nonblock || ret2 != -EAGAIN) {
3104                         kiocb_done(kiocb, ret2, cs);
3105                 } else {
                     /* undo whatever the failed attempt did to the iterator */
3106                         iter.count = iov_count;
3107                         iter.nr_segs = nr_segs;
                     /*
                      * Also entered by the gotos above, before the locals
                      * iov_count and nr_segs are set; nothing below reads them.
                      */
3108 copy_iov:
3109                         ret = io_setup_async_rw(req, io_size, iovec,
3110                                                 inline_vecs, &iter);
3111                         if (ret)
3112                                 goto out_free;
                     /*
                      * iovec is not freed on this path.
                      * NOTE(review): presumably it is now owned by req->io
                      * (io_req_map_rw() sets REQ_F_NEED_CLEANUP when it
                      * stores a non-NULL iovec) - confirm for the case where
                      * req->io already existed.
                      */
3113                         return -EAGAIN;
3114                 }
3115         }
3116 out_free:
         /* with REQ_F_NEED_CLEANUP set the iovec is left for later cleanup */
3117         if (!(req->flags & REQ_F_NEED_CLEANUP))
3118                 kfree(iovec);
3119         return ret;
3120 }
3121
3122 static int __io_splice_prep(struct io_kiocb *req,
3123                             const struct io_uring_sqe *sqe)
3124 {
3125         struct io_splice* sp = &req->splice;
3126         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3127         int ret;
3128
3129         if (req->flags & REQ_F_NEED_CLEANUP)
3130                 return 0;
3131         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3132                 return -EINVAL;
3133
3134         sp->file_in = NULL;
3135         sp->len = READ_ONCE(sqe->len);
3136         sp->flags = READ_ONCE(sqe->splice_flags);
3137
3138         if (unlikely(sp->flags & ~valid_flags))
3139                 return -EINVAL;
3140
3141         ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
3142                           (sp->flags & SPLICE_F_FD_IN_FIXED));
3143         if (ret)
3144                 return ret;
3145         req->flags |= REQ_F_NEED_CLEANUP;
3146
3147         if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3148                 /*
3149                  * Splice operation will be punted aync, and here need to
3150                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
3151                  */
3152                 io_req_init_async(req);
3153                 req->work.flags |= IO_WQ_WORK_UNBOUND;
3154         }
3155
3156         return 0;
3157 }
3158
3159 static int io_tee_prep(struct io_kiocb *req,
3160                        const struct io_uring_sqe *sqe)
3161 {
3162         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3163                 return -EINVAL;
3164         return __io_splice_prep(req, sqe);
3165 }
3166
3167 static int io_tee(struct io_kiocb *req, bool force_nonblock)
3168 {
3169         struct io_splice *sp = &req->splice;
3170         struct file *in = sp->file_in;
3171         struct file *out = sp->file_out;
3172         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3173         long ret = 0;
3174
3175         if (force_nonblock)
3176                 return -EAGAIN;
3177         if (sp->len)
3178                 ret = do_tee(in, out, sp->len, flags);
3179
3180         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3181         req->flags &= ~REQ_F_NEED_CLEANUP;
3182
3183         if (ret != sp->len)
3184                 req_set_fail_links(req);
3185         io_req_complete(req, ret);
3186         return 0;
3187 }
3188
3189 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3190 {
3191         struct io_splice* sp = &req->splice;
3192
3193         sp->off_in = READ_ONCE(sqe->splice_off_in);
3194         sp->off_out = READ_ONCE(sqe->off);
3195         return __io_splice_prep(req, sqe);
3196 }
3197
3198 static int io_splice(struct io_kiocb *req, bool force_nonblock)
3199 {
3200         struct io_splice *sp = &req->splice;
3201         struct file *in = sp->file_in;
3202         struct file *out = sp->file_out;
3203         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3204         loff_t *poff_in, *poff_out;
3205         long ret = 0;
3206
3207         if (force_nonblock)
3208                 return -EAGAIN;
3209
3210         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3211         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3212
3213         if (sp->len)
3214                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3215
3216         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3217         req->flags &= ~REQ_F_NEED_CLEANUP;
3218
3219         if (ret != sp->len)
3220                 req_set_fail_links(req);
3221         io_req_complete(req, ret);
3222         return 0;
3223 }
3224
3225 /*
3226  * IORING_OP_NOP just posts a completion event, nothing else.
3227  */
3228 static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
3229 {
3230         struct io_ring_ctx *ctx = req->ctx;
3231
3232         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3233                 return -EINVAL;
3234
3235         __io_req_complete(req, 0, 0, cs);
3236         return 0;
3237 }
3238
3239 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3240 {
3241         struct io_ring_ctx *ctx = req->ctx;
3242
3243         if (!req->file)
3244                 return -EBADF;
3245
3246         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3247                 return -EINVAL;
3248         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3249                 return -EINVAL;
3250
3251         req->sync.flags = READ_ONCE(sqe->fsync_flags);
3252         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3253                 return -EINVAL;
3254
3255         req->sync.off = READ_ONCE(sqe->off);
3256         req->sync.len = READ_ONCE(sqe->len);
3257         return 0;
3258 }
3259
3260 static int io_fsync(struct io_kiocb *req, bool force_nonblock)
3261 {
3262         loff_t end = req->sync.off + req->sync.len;
3263         int ret;
3264
3265         /* fsync always requires a blocking context */
3266         if (force_nonblock)
3267                 return -EAGAIN;
3268
3269         ret = vfs_fsync_range(req->file, req->sync.off,
3270                                 end > 0 ? end : LLONG_MAX,
3271                                 req->sync.flags & IORING_FSYNC_DATASYNC);
3272         if (ret < 0)
3273                 req_set_fail_links(req);
3274         io_req_complete(req, ret);
3275         return 0;
3276 }
3277
3278 static int io_fallocate_prep(struct io_kiocb *req,
3279                              const struct io_uring_sqe *sqe)
3280 {
3281         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3282                 return -EINVAL;
3283         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3284                 return -EINVAL;
3285
3286         req->sync.off = READ_ONCE(sqe->off);
3287         req->sync.len = READ_ONCE(sqe->addr);
3288         req->sync.mode = READ_ONCE(sqe->len);
3289         req->fsize = rlimit(RLIMIT_FSIZE);
3290         return 0;
3291 }
3292
3293 static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
3294 {
3295         int ret;
3296
3297         /* fallocate always requiring blocking context */
3298         if (force_nonblock)
3299                 return -EAGAIN;
3300
3301         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
3302         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3303                                 req->sync.len);
3304         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
3305         if (ret < 0)
3306                 req_set_fail_links(req);
3307         io_req_complete(req, ret);
3308         return 0;
3309 }
3310
3311 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3312 {
3313         const char __user *fname;
3314         int ret;
3315
3316         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3317                 return -EINVAL;
3318         if (unlikely(sqe->ioprio || sqe->buf_index))
3319                 return -EINVAL;
3320         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3321                 return -EBADF;
3322
3323         /* open.how should be already initialised */
3324         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3325                 req->open.how.flags |= O_LARGEFILE;
3326
3327         req->open.dfd = READ_ONCE(sqe->fd);
3328         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3329         req->open.filename = getname(fname);
3330         if (IS_ERR(req->open.filename)) {
3331                 ret = PTR_ERR(req->open.filename);
3332                 req->open.filename = NULL;
3333                 return ret;
3334         }
3335         req->open.nofile = rlimit(RLIMIT_NOFILE);
3336         req->flags |= REQ_F_NEED_CLEANUP;
3337         return 0;
3338 }
3339
3340 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3341 {
3342         u64 flags, mode;
3343
3344         if (req->flags & REQ_F_NEED_CLEANUP)
3345                 return 0;
3346         mode = READ_ONCE(sqe->len);
3347         flags = READ_ONCE(sqe->open_flags);
3348         req->open.how = build_open_how(flags, mode);
3349         return __io_openat_prep(req, sqe);
3350 }
3351
3352 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3353 {
3354         struct open_how __user *how;
3355         size_t len;
3356         int ret;
3357
3358         if (req->flags & REQ_F_NEED_CLEANUP)
3359                 return 0;
3360         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3361         len = READ_ONCE(sqe->len);
3362         if (len < OPEN_HOW_SIZE_VER0)
3363                 return -EINVAL;
3364
3365         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3366                                         len);
3367         if (ret)
3368                 return ret;
3369
3370         return __io_openat_prep(req, sqe);
3371 }
3372
3373 static int io_openat2(struct io_kiocb *req, bool force_nonblock)
3374 {
3375         struct open_flags op;
3376         struct file *file;
3377         int ret;
3378
3379         if (force_nonblock)
3380                 return -EAGAIN;
3381
3382         ret = build_open_flags(&req->open.how, &op);
3383         if (ret)
3384                 goto err;
3385
3386         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3387         if (ret < 0)
3388                 goto err;
3389
3390         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3391         if (IS_ERR(file)) {
3392                 put_unused_fd(ret);
3393                 ret = PTR_ERR(file);
3394         } else {
3395                 fsnotify_open(file);
3396                 fd_install(ret, file);
3397         }
3398 err:
3399         putname(req->open.filename);
3400         req->flags &= ~REQ_F_NEED_CLEANUP;
3401         if (ret < 0)
3402                 req_set_fail_links(req);
3403         io_req_complete(req, ret);
3404         return 0;
3405 }
3406
3407 static int io_openat(struct io_kiocb *req, bool force_nonblock)
3408 {
3409         return io_openat2(req, force_nonblock);
3410 }
3411
3412 static int io_remove_buffers_prep(struct io_kiocb *req,
3413                                   const struct io_uring_sqe *sqe)
3414 {
3415         struct io_provide_buf *p = &req->pbuf;
3416         u64 tmp;
3417
3418         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3419                 return -EINVAL;
3420
3421         tmp = READ_ONCE(sqe->fd);
3422         if (!tmp || tmp > USHRT_MAX)
3423                 return -EINVAL;
3424
3425         memset(p, 0, sizeof(*p));
3426         p->nbufs = tmp;
3427         p->bgid = READ_ONCE(sqe->buf_group);
3428         return 0;
3429 }
3430
3431 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3432                                int bgid, unsigned nbufs)
3433 {
3434         unsigned i = 0;
3435
3436         /* shouldn't happen */
3437         if (!nbufs)
3438                 return 0;
3439
3440         /* the head kbuf is the list itself */
3441         while (!list_empty(&buf->list)) {
3442                 struct io_buffer *nxt;
3443
3444                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3445                 list_del(&nxt->list);
3446                 kfree(nxt);
3447                 if (++i == nbufs)
3448                         return i;
3449         }
3450         i++;
3451         kfree(buf);
3452         idr_remove(&ctx->io_buffer_idr, bgid);
3453
3454         return i;
3455 }
3456
3457 static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
3458                              struct io_comp_state *cs)
3459 {
3460         struct io_provide_buf *p = &req->pbuf;
3461         struct io_ring_ctx *ctx = req->ctx;
3462         struct io_buffer *head;
3463         int ret = 0;
3464
3465         io_ring_submit_lock(ctx, !force_nonblock);
3466
3467         lockdep_assert_held(&ctx->uring_lock);
3468
3469         ret = -ENOENT;
3470         head = idr_find(&ctx->io_buffer_idr, p->bgid);
3471         if (head)
3472                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3473
3474         io_ring_submit_lock(ctx, !force_nonblock);
3475         if (ret < 0)
3476                 req_set_fail_links(req);
3477         __io_req_complete(req, ret, 0, cs);
3478         return 0;
3479 }
3480
3481 static int io_provide_buffers_prep(struct io_kiocb *req,
3482                                    const struct io_uring_sqe *sqe)
3483 {
3484         struct io_provide_buf *p = &req->pbuf;
3485         u64 tmp;
3486
3487         if (sqe->ioprio || sqe->rw_flags)
3488                 return -EINVAL;
3489
3490         tmp = READ_ONCE(sqe->fd);
3491         if (!tmp || tmp > USHRT_MAX)
3492                 return -E2BIG;
3493         p->nbufs = tmp;
3494         p->addr = READ_ONCE(sqe->addr);
3495         p->len = READ_ONCE(sqe->len);
3496
3497         if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3498                 return -EFAULT;
3499
3500         p->bgid = READ_ONCE(sqe->buf_group);
3501         tmp = READ_ONCE(sqe->off);
3502         if (tmp > USHRT_MAX)
3503                 return -E2BIG;
3504         p->bid = tmp;
3505         return 0;
3506 }
3507
3508 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3509 {
3510         struct io_buffer *buf;
3511         u64 addr = pbuf->addr;
3512         int i, bid = pbuf->bid;
3513
3514         for (i = 0; i < pbuf->nbufs; i++) {
3515                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3516                 if (!buf)
3517                         break;
3518
3519                 buf->addr = addr;
3520                 buf->len = pbuf->len;
3521                 buf->bid = bid;
3522                 addr += pbuf->len;
3523                 bid++;
3524                 if (!*head) {
3525                         INIT_LIST_HEAD(&buf->list);
3526                         *head = buf;
3527                 } else {
3528                         list_add_tail(&buf->list, &(*head)->list);
3529                 }
3530         }
3531
3532         return i ? i : -ENOMEM;
3533 }
3534
3535 static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
3536                               struct io_comp_state *cs)
3537 {
3538         struct io_provide_buf *p = &req->pbuf;
3539         struct io_ring_ctx *ctx = req->ctx;
3540         struct io_buffer *head, *list;
3541         int ret = 0;
3542
3543         io_ring_submit_lock(ctx, !force_nonblock);
3544
3545         lockdep_assert_held(&ctx->uring_lock);
3546
3547         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3548
3549         ret = io_add_buffers(p, &head);
3550         if (ret < 0)
3551                 goto out;
3552
3553         if (!list) {
3554                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3555                                         GFP_KERNEL);
3556                 if (ret < 0) {
3557                         __io_remove_buffers(ctx, head, p->bgid, -1U);
3558                         goto out;
3559                 }
3560         }
3561 out:
3562         io_ring_submit_unlock(ctx, !force_nonblock);
3563         if (ret < 0)
3564                 req_set_fail_links(req);
3565         __io_req_complete(req, ret, 0, cs);
3566         return 0;
3567 }
3568
3569 static int io_epoll_ctl_prep(struct io_kiocb *req,
3570                              const struct io_uring_sqe *sqe)
3571 {
3572 #if defined(CONFIG_EPOLL)
3573         if (sqe->ioprio || sqe->buf_index)
3574                 return -EINVAL;
3575         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3576                 return -EINVAL;
3577
3578         req->epoll.epfd = READ_ONCE(sqe->fd);
3579         req->epoll.op = READ_ONCE(sqe->len);
3580         req->epoll.fd = READ_ONCE(sqe->off);
3581
3582         if (ep_op_has_event(req->epoll.op)) {
3583                 struct epoll_event __user *ev;
3584
3585                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
3586                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
3587                         return -EFAULT;
3588         }
3589
3590         return 0;
3591 #else
3592         return -EOPNOTSUPP;
3593 #endif
3594 }
3595
3596 static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
3597                         struct io_comp_state *cs)
3598 {
3599 #if defined(CONFIG_EPOLL)
3600         struct io_epoll *ie = &req->epoll;
3601         int ret;
3602
3603         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
3604         if (force_nonblock && ret == -EAGAIN)
3605                 return -EAGAIN;
3606
3607         if (ret < 0)
3608                 req_set_fail_links(req);
3609         __io_req_complete(req, ret, 0, cs);
3610         return 0;
3611 #else
3612         return -EOPNOTSUPP;
3613 #endif
3614 }
3615
3616 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3617 {
3618 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3619         if (sqe->ioprio || sqe->buf_index || sqe->off)
3620                 return -EINVAL;
3621         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3622                 return -EINVAL;
3623
3624         req->madvise.addr = READ_ONCE(sqe->addr);
3625         req->madvise.len = READ_ONCE(sqe->len);
3626         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
3627         return 0;
3628 #else
3629         return -EOPNOTSUPP;
3630 #endif
3631 }
3632
3633 static int io_madvise(struct io_kiocb *req, bool force_nonblock)
3634 {
3635 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3636         struct io_madvise *ma = &req->madvise;
3637         int ret;
3638
3639         if (force_nonblock)
3640                 return -EAGAIN;
3641
3642         ret = do_madvise(ma->addr, ma->len, ma->advice);
3643         if (ret < 0)
3644                 req_set_fail_links(req);
3645         io_req_complete(req, ret);
3646         return 0;
3647 #else
3648         return -EOPNOTSUPP;
3649 #endif
3650 }
3651
3652 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3653 {
3654         if (sqe->ioprio || sqe->buf_index || sqe->addr)
3655                 return -EINVAL;
3656         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3657                 return -EINVAL;
3658
3659         req->fadvise.offset = READ_ONCE(sqe->off);
3660         req->fadvise.len = READ_ONCE(sqe->len);
3661         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3662         return 0;
3663 }
3664
3665 static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
3666 {
3667         struct io_fadvise *fa = &req->fadvise;
3668         int ret;
3669
3670         if (force_nonblock) {
3671                 switch (fa->advice) {
3672                 case POSIX_FADV_NORMAL:
3673                 case POSIX_FADV_RANDOM:
3674                 case POSIX_FADV_SEQUENTIAL:
3675                         break;
3676                 default:
3677                         return -EAGAIN;
3678                 }
3679         }
3680
3681         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3682         if (ret < 0)
3683                 req_set_fail_links(req);
3684         io_req_complete(req, ret);
3685         return 0;
3686 }
3687
3688 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3689 {
3690         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3691                 return -EINVAL;
3692         if (sqe->ioprio || sqe->buf_index)
3693                 return -EINVAL;
3694         if (req->flags & REQ_F_FIXED_FILE)
3695                 return -EBADF;
3696
3697         req->statx.dfd = READ_ONCE(sqe->fd);
3698         req->statx.mask = READ_ONCE(sqe->len);
3699         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
3700         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3701         req->statx.flags = READ_ONCE(sqe->statx_flags);
3702
3703         return 0;
3704 }
3705
3706 static int io_statx(struct io_kiocb *req, bool force_nonblock)
3707 {
3708         struct io_statx *ctx = &req->statx;
3709         int ret;
3710
3711         if (force_nonblock) {
3712                 /* only need file table for an actual valid fd */
3713                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
3714                         req->flags |= REQ_F_NO_FILE_TABLE;
3715                 return -EAGAIN;
3716         }
3717
3718         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
3719                        ctx->buffer);
3720
3721         if (ret < 0)
3722                 req_set_fail_links(req);
3723         io_req_complete(req, ret);
3724         return 0;
3725 }
3726
3727 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3728 {
3729         /*
3730          * If we queue this for async, it must not be cancellable. That would
3731          * leave the 'file' in an undeterminate state, and here need to modify
3732          * io_wq_work.flags, so initialize io_wq_work firstly.
3733          */
3734         io_req_init_async(req);
3735         req->work.flags |= IO_WQ_WORK_NO_CANCEL;
3736
3737         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3738                 return -EINVAL;
3739         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
3740             sqe->rw_flags || sqe->buf_index)
3741                 return -EINVAL;
3742         if (req->flags & REQ_F_FIXED_FILE)
3743                 return -EBADF;
3744
3745         req->close.fd = READ_ONCE(sqe->fd);
3746         if ((req->file && req->file->f_op == &io_uring_fops) ||
3747             req->close.fd == req->ctx->ring_fd)
3748                 return -EBADF;
3749
3750         req->close.put_file = NULL;
3751         return 0;
3752 }
3753
3754 static int io_close(struct io_kiocb *req, bool force_nonblock,
3755                     struct io_comp_state *cs)
3756 {
3757         struct io_close *close = &req->close;
3758         int ret;
3759
3760         /* might be already done during nonblock submission */
3761         if (!close->put_file) {
3762                 ret = __close_fd_get_file(close->fd, &close->put_file);
3763                 if (ret < 0)
3764                         return (ret == -ENOENT) ? -EBADF : ret;
3765         }
3766
3767         /* if the file has a flush method, be safe and punt to async */
3768         if (close->put_file->f_op->flush && force_nonblock) {
3769                 /* was never set, but play safe */
3770                 req->flags &= ~REQ_F_NOWAIT;
3771                 /* avoid grabbing files - we don't need the files */
3772                 req->flags |= REQ_F_NO_FILE_TABLE;
3773                 return -EAGAIN;
3774         }
3775
3776         /* No ->flush() or already async, safely close from here */
3777         ret = filp_close(close->put_file, req->work.files);
3778         if (ret < 0)
3779                 req_set_fail_links(req);
3780         fput(close->put_file);
3781         close->put_file = NULL;
3782         __io_req_complete(req, ret, 0, cs);
3783         return 0;
3784 }
3785
3786 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3787 {
3788         struct io_ring_ctx *ctx = req->ctx;
3789
3790         if (!req->file)
3791                 return -EBADF;
3792
3793         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3794                 return -EINVAL;
3795         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3796                 return -EINVAL;
3797
3798         req->sync.off = READ_ONCE(sqe->off);
3799         req->sync.len = READ_ONCE(sqe->len);
3800         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
3801         return 0;
3802 }
3803
3804 static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
3805 {
3806         int ret;
3807
3808         /* sync_file_range always requires a blocking context */
3809         if (force_nonblock)
3810                 return -EAGAIN;
3811
3812         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
3813                                 req->sync.flags);
3814         if (ret < 0)
3815                 req_set_fail_links(req);
3816         io_req_complete(req, ret);
3817         return 0;
3818 }
3819
3820 #if defined(CONFIG_NET)
3821 static int io_setup_async_msg(struct io_kiocb *req,
3822                               struct io_async_msghdr *kmsg)
3823 {
3824         if (req->io)
3825                 return -EAGAIN;
3826         if (io_alloc_async_ctx(req)) {
3827                 if (kmsg->iov != kmsg->fast_iov)
3828                         kfree(kmsg->iov);
3829                 return -ENOMEM;
3830         }
3831         req->flags |= REQ_F_NEED_CLEANUP;
3832         memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
3833         return -EAGAIN;
3834 }
3835
3836 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3837 {
3838         struct io_sr_msg *sr = &req->sr_msg;
3839         struct io_async_ctx *io = req->io;
3840         int ret;
3841
3842         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3843                 return -EINVAL;
3844
3845         sr->msg_flags = READ_ONCE(sqe->msg_flags);
3846         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3847         sr->len = READ_ONCE(sqe->len);
3848
3849 #ifdef CONFIG_COMPAT
3850         if (req->ctx->compat)
3851                 sr->msg_flags |= MSG_CMSG_COMPAT;
3852 #endif
3853
3854         if (!io || req->opcode == IORING_OP_SEND)
3855                 return 0;
3856         /* iovec is already imported */
3857         if (req->flags & REQ_F_NEED_CLEANUP)
3858                 return 0;
3859
3860         io->msg.iov = io->msg.fast_iov;
3861         ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
3862                                         &io->msg.iov);
3863         if (!ret)
3864                 req->flags |= REQ_F_NEED_CLEANUP;
3865         return ret;
3866 }
3867
3868 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
3869                       struct io_comp_state *cs)
3870 {
3871         struct io_async_msghdr *kmsg = NULL;
3872         struct socket *sock;
3873         int ret;
3874
3875         sock = sock_from_file(req->file, &ret);
3876         if (sock) {
3877                 struct io_async_ctx io;
3878                 unsigned flags;
3879
3880                 if (req->io) {
3881                         kmsg = &req->io->msg;
3882                         kmsg->msg.msg_name = &req->io->msg.addr;
3883                         /* if iov is set, it's allocated already */
3884                         if (!kmsg->iov)
3885                                 kmsg->iov = kmsg->fast_iov;
3886                         kmsg->msg.msg_iter.iov = kmsg->iov;
3887                 } else {
3888                         struct io_sr_msg *sr = &req->sr_msg;
3889
3890                         kmsg = &io.msg;
3891                         kmsg->msg.msg_name = &io.msg.addr;
3892
3893                         io.msg.iov = io.msg.fast_iov;
3894                         ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3895                                         sr->msg_flags, &io.msg.iov);
3896                         if (ret)
3897                                 return ret;
3898                 }
3899
3900                 flags = req->sr_msg.msg_flags;
3901                 if (flags & MSG_DONTWAIT)
3902                         req->flags |= REQ_F_NOWAIT;
3903                 else if (force_nonblock)
3904                         flags |= MSG_DONTWAIT;
3905
3906                 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
3907                 if (force_nonblock && ret == -EAGAIN)
3908                         return io_setup_async_msg(req, kmsg);
3909                 if (ret == -ERESTARTSYS)
3910                         ret = -EINTR;
3911         }
3912
3913         if (kmsg && kmsg->iov != kmsg->fast_iov)
3914                 kfree(kmsg->iov);
3915         req->flags &= ~REQ_F_NEED_CLEANUP;
3916         if (ret < 0)
3917                 req_set_fail_links(req);
3918         __io_req_complete(req, ret, 0, cs);
3919         return 0;
3920 }
3921
3922 static int io_send(struct io_kiocb *req, bool force_nonblock,
3923                    struct io_comp_state *cs)
3924 {
3925         struct socket *sock;
3926         int ret;
3927
3928         sock = sock_from_file(req->file, &ret);
3929         if (sock) {
3930                 struct io_sr_msg *sr = &req->sr_msg;
3931                 struct msghdr msg;
3932                 struct iovec iov;
3933                 unsigned flags;
3934
3935                 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3936                                                 &msg.msg_iter);
3937                 if (ret)
3938                         return ret;
3939
3940                 msg.msg_name = NULL;
3941                 msg.msg_control = NULL;
3942                 msg.msg_controllen = 0;
3943                 msg.msg_namelen = 0;
3944
3945                 flags = req->sr_msg.msg_flags;
3946                 if (flags & MSG_DONTWAIT)
3947                         req->flags |= REQ_F_NOWAIT;
3948                 else if (force_nonblock)
3949                         flags |= MSG_DONTWAIT;
3950
3951                 msg.msg_flags = flags;
3952                 ret = sock_sendmsg(sock, &msg);
3953                 if (force_nonblock && ret == -EAGAIN)
3954                         return -EAGAIN;
3955                 if (ret == -ERESTARTSYS)
3956                         ret = -EINTR;
3957         }
3958
3959         if (ret < 0)
3960                 req_set_fail_links(req);
3961         __io_req_complete(req, ret, 0, cs);
3962         return 0;
3963 }
3964
3965 static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3966 {
3967         struct io_sr_msg *sr = &req->sr_msg;
3968         struct iovec __user *uiov;
3969         size_t iov_len;
3970         int ret;
3971
3972         ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
3973                                         &uiov, &iov_len);
3974         if (ret)
3975                 return ret;
3976
3977         if (req->flags & REQ_F_BUFFER_SELECT) {
3978                 if (iov_len > 1)
3979                         return -EINVAL;
3980                 if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
3981                         return -EFAULT;
3982                 sr->len = io->msg.iov[0].iov_len;
3983                 iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
3984                                 sr->len);
3985                 io->msg.iov = NULL;
3986         } else {
3987                 ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
3988                                         &io->msg.iov, &io->msg.msg.msg_iter);
3989                 if (ret > 0)
3990                         ret = 0;
3991         }
3992
3993         return ret;
3994 }
3995
3996 #ifdef CONFIG_COMPAT
3997 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
3998                                         struct io_async_ctx *io)
3999 {
4000         struct compat_msghdr __user *msg_compat;
4001         struct io_sr_msg *sr = &req->sr_msg;
4002         struct compat_iovec __user *uiov;
4003         compat_uptr_t ptr;
4004         compat_size_t len;
4005         int ret;
4006
4007         msg_compat = (struct compat_msghdr __user *) sr->msg;
4008         ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
4009                                         &ptr, &len);
4010         if (ret)
4011                 return ret;
4012
4013         uiov = compat_ptr(ptr);
4014         if (req->flags & REQ_F_BUFFER_SELECT) {
4015                 compat_ssize_t clen;
4016
4017                 if (len > 1)
4018                         return -EINVAL;
4019                 if (!access_ok(uiov, sizeof(*uiov)))
4020                         return -EFAULT;
4021                 if (__get_user(clen, &uiov->iov_len))
4022                         return -EFAULT;
4023                 if (clen < 0)
4024                         return -EINVAL;
4025                 sr->len = io->msg.iov[0].iov_len;
4026                 io->msg.iov = NULL;
4027         } else {
4028                 ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
4029                                                 &io->msg.iov,
4030                                                 &io->msg.msg.msg_iter);
4031                 if (ret < 0)
4032                         return ret;
4033         }
4034
4035         return 0;
4036 }
4037 #endif
4038
4039 static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
4040 {
4041         io->msg.iov = io->msg.fast_iov;
4042
4043 #ifdef CONFIG_COMPAT
4044         if (req->ctx->compat)
4045                 return __io_compat_recvmsg_copy_hdr(req, io);
4046 #endif
4047
4048         return __io_recvmsg_copy_hdr(req, io);
4049 }
4050
4051 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4052                                                int *cflags, bool needs_lock)
4053 {
4054         struct io_sr_msg *sr = &req->sr_msg;
4055         struct io_buffer *kbuf;
4056
4057         if (!(req->flags & REQ_F_BUFFER_SELECT))
4058                 return NULL;
4059
4060         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4061         if (IS_ERR(kbuf))
4062                 return kbuf;
4063
4064         sr->kbuf = kbuf;
4065         req->flags |= REQ_F_BUFFER_SELECTED;
4066
4067         *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
4068         *cflags |= IORING_CQE_F_BUFFER;
4069         return kbuf;
4070 }
4071
4072 static int io_recvmsg_prep(struct io_kiocb *req,
4073                            const struct io_uring_sqe *sqe)
4074 {
4075         struct io_sr_msg *sr = &req->sr_msg;
4076         struct io_async_ctx *io = req->io;
4077         int ret;
4078
4079         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4080                 return -EINVAL;
4081
4082         sr->msg_flags = READ_ONCE(sqe->msg_flags);
4083         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4084         sr->len = READ_ONCE(sqe->len);
4085         sr->bgid = READ_ONCE(sqe->buf_group);
4086
4087 #ifdef CONFIG_COMPAT
4088         if (req->ctx->compat)
4089                 sr->msg_flags |= MSG_CMSG_COMPAT;
4090 #endif
4091
4092         if (!io || req->opcode == IORING_OP_RECV)
4093                 return 0;
4094         /* iovec is already imported */
4095         if (req->flags & REQ_F_NEED_CLEANUP)
4096                 return 0;
4097
4098         ret = io_recvmsg_copy_hdr(req, io);
4099         if (!ret)
4100                 req->flags |= REQ_F_NEED_CLEANUP;
4101         return ret;
4102 }
4103
4104 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
4105                       struct io_comp_state *cs)
4106 {
4107         struct io_async_msghdr *kmsg = NULL;
4108         struct socket *sock;
4109         int ret, cflags = 0;
4110
4111         sock = sock_from_file(req->file, &ret);
4112         if (sock) {
4113                 struct io_buffer *kbuf;
4114                 struct io_async_ctx io;
4115                 unsigned flags;
4116
4117                 if (req->io) {
4118                         kmsg = &req->io->msg;
4119                         kmsg->msg.msg_name = &req->io->msg.addr;
4120                         /* if iov is set, it's allocated already */
4121                         if (!kmsg->iov)
4122                                 kmsg->iov = kmsg->fast_iov;
4123                         kmsg->msg.msg_iter.iov = kmsg->iov;
4124                 } else {
4125                         kmsg = &io.msg;
4126                         kmsg->msg.msg_name = &io.msg.addr;
4127
4128                         ret = io_recvmsg_copy_hdr(req, &io);
4129                         if (ret)
4130                                 return ret;
4131                 }
4132
4133                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
4134                 if (IS_ERR(kbuf)) {
4135                         return PTR_ERR(kbuf);
4136                 } else if (kbuf) {
4137                         kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4138                         iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
4139                                         1, req->sr_msg.len);
4140                 }
4141
4142                 flags = req->sr_msg.msg_flags;
4143                 if (flags & MSG_DONTWAIT)
4144                         req->flags |= REQ_F_NOWAIT;
4145                 else if (force_nonblock)
4146                         flags |= MSG_DONTWAIT;
4147
4148                 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
4149                                                 kmsg->uaddr, flags);
4150                 if (force_nonblock && ret == -EAGAIN)
4151                         return io_setup_async_msg(req, kmsg);
4152                 if (ret == -ERESTARTSYS)
4153                         ret = -EINTR;
4154         }
4155
4156         if (kmsg && kmsg->iov != kmsg->fast_iov)
4157                 kfree(kmsg->iov);
4158         req->flags &= ~REQ_F_NEED_CLEANUP;
4159         if (ret < 0)
4160                 req_set_fail_links(req);
4161         __io_req_complete(req, ret, cflags, cs);
4162         return 0;
4163 }
4164
4165 static int io_recv(struct io_kiocb *req, bool force_nonblock,
4166                    struct io_comp_state *cs)
4167 {
4168         struct io_buffer *kbuf = NULL;
4169         struct socket *sock;
4170         int ret, cflags = 0;
4171
4172         sock = sock_from_file(req->file, &ret);
4173         if (sock) {
4174                 struct io_sr_msg *sr = &req->sr_msg;
4175                 void __user *buf = sr->buf;
4176                 struct msghdr msg;
4177                 struct iovec iov;
4178                 unsigned flags;
4179
4180                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
4181                 if (IS_ERR(kbuf))
4182                         return PTR_ERR(kbuf);
4183                 else if (kbuf)
4184                         buf = u64_to_user_ptr(kbuf->addr);
4185
4186                 ret = import_single_range(READ, buf, sr->len, &iov,
4187                                                 &msg.msg_iter);
4188                 if (ret) {
4189                         kfree(kbuf);
4190                         return ret;
4191                 }
4192
4193                 req->flags |= REQ_F_NEED_CLEANUP;
4194                 msg.msg_name = NULL;
4195                 msg.msg_control = NULL;
4196                 msg.msg_controllen = 0;
4197                 msg.msg_namelen = 0;
4198                 msg.msg_iocb = NULL;
4199                 msg.msg_flags = 0;
4200
4201                 flags = req->sr_msg.msg_flags;
4202                 if (flags & MSG_DONTWAIT)
4203                         req->flags |= REQ_F_NOWAIT;
4204                 else if (force_nonblock)
4205                         flags |= MSG_DONTWAIT;
4206
4207                 ret = sock_recvmsg(sock, &msg, flags);
4208                 if (force_nonblock && ret == -EAGAIN)
4209                         return -EAGAIN;
4210                 if (ret == -ERESTARTSYS)
4211                         ret = -EINTR;
4212         }
4213
4214         kfree(kbuf);
4215         req->flags &= ~REQ_F_NEED_CLEANUP;
4216         if (ret < 0)
4217                 req_set_fail_links(req);
4218         __io_req_complete(req, ret, cflags, cs);
4219         return 0;
4220 }
4221
4222 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4223 {
4224         struct io_accept *accept = &req->accept;
4225
4226         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4227                 return -EINVAL;
4228         if (sqe->ioprio || sqe->len || sqe->buf_index)
4229                 return -EINVAL;
4230
4231         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4232         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4233         accept->flags = READ_ONCE(sqe->accept_flags);
4234         accept->nofile = rlimit(RLIMIT_NOFILE);
4235         return 0;
4236 }
4237
4238 static int io_accept(struct io_kiocb *req, bool force_nonblock,
4239                      struct io_comp_state *cs)
4240 {
4241         struct io_accept *accept = &req->accept;
4242         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4243         int ret;
4244
4245         if (req->file->f_flags & O_NONBLOCK)
4246                 req->flags |= REQ_F_NOWAIT;
4247
4248         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4249                                         accept->addr_len, accept->flags,
4250                                         accept->nofile);
4251         if (ret == -EAGAIN && force_nonblock)
4252                 return -EAGAIN;
4253         if (ret < 0) {
4254                 if (ret == -ERESTARTSYS)
4255                         ret = -EINTR;
4256                 req_set_fail_links(req);
4257         }
4258         __io_req_complete(req, ret, 0, cs);
4259         return 0;
4260 }
4261
4262 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4263 {
4264         struct io_connect *conn = &req->connect;
4265         struct io_async_ctx *io = req->io;
4266
4267         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4268                 return -EINVAL;
4269         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4270                 return -EINVAL;
4271
4272         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4273         conn->addr_len =  READ_ONCE(sqe->addr2);
4274
4275         if (!io)
4276                 return 0;
4277
4278         return move_addr_to_kernel(conn->addr, conn->addr_len,
4279                                         &io->connect.address);
4280 }
4281
4282 static int io_connect(struct io_kiocb *req, bool force_nonblock,
4283                       struct io_comp_state *cs)
4284 {
4285         struct io_async_ctx __io, *io;
4286         unsigned file_flags;
4287         int ret;
4288
4289         if (req->io) {
4290                 io = req->io;
4291         } else {
4292                 ret = move_addr_to_kernel(req->connect.addr,
4293                                                 req->connect.addr_len,
4294                                                 &__io.connect.address);
4295                 if (ret)
4296                         goto out;
4297                 io = &__io;
4298         }
4299
4300         file_flags = force_nonblock ? O_NONBLOCK : 0;
4301
4302         ret = __sys_connect_file(req->file, &io->connect.address,
4303                                         req->connect.addr_len, file_flags);
4304         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4305                 if (req->io)
4306                         return -EAGAIN;
4307                 if (io_alloc_async_ctx(req)) {
4308                         ret = -ENOMEM;
4309                         goto out;
4310                 }
4311                 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
4312                 return -EAGAIN;
4313         }
4314         if (ret == -ERESTARTSYS)
4315                 ret = -EINTR;
4316 out:
4317         if (ret < 0)
4318                 req_set_fail_links(req);
4319         __io_req_complete(req, ret, 0, cs);
4320         return 0;
4321 }
4322 #else /* !CONFIG_NET */
/* Built without CONFIG_NET: sendmsg requests are not supported. */
static int io_sendmsg_prep(struct io_kiocb *req,
                           const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
4327
/* Built without CONFIG_NET: always fails with -EOPNOTSUPP. */
static int io_sendmsg(struct io_kiocb *req,
                      bool force_nonblock,
                      struct io_comp_state *cs)
{
        return -EOPNOTSUPP;
}
4333
/* Built without CONFIG_NET: always fails with -EOPNOTSUPP. */
static int io_send(struct io_kiocb *req,
                   bool force_nonblock,
                   struct io_comp_state *cs)
{
        return -EOPNOTSUPP;
}
4339
/* Built without CONFIG_NET: recvmsg requests are not supported. */
static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
4345
/* Built without CONFIG_NET: always fails with -EOPNOTSUPP. */
static int io_recvmsg(struct io_kiocb *req,
                      bool force_nonblock,
                      struct io_comp_state *cs)
{
        return -EOPNOTSUPP;
}
4351
/* Built without CONFIG_NET: always fails with -EOPNOTSUPP. */
static int io_recv(struct io_kiocb *req,
                   bool force_nonblock,
                   struct io_comp_state *cs)
{
        return -EOPNOTSUPP;
}
4357
/* Built without CONFIG_NET: accept requests are not supported. */
static int io_accept_prep(struct io_kiocb *req,
                          const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
4362
/* Built without CONFIG_NET: always fails with -EOPNOTSUPP. */
static int io_accept(struct io_kiocb *req,
                     bool force_nonblock,
                     struct io_comp_state *cs)
{
        return -EOPNOTSUPP;
}
4368
/* Built without CONFIG_NET: connect requests are not supported. */
static int io_connect_prep(struct io_kiocb *req,
                           const struct io_uring_sqe *sqe)
{
        return -EOPNOTSUPP;
}
4373
/* Built without CONFIG_NET: always fails with -EOPNOTSUPP. */
static int io_connect(struct io_kiocb *req,
                      bool force_nonblock,
                      struct io_comp_state *cs)
{
        return -EOPNOTSUPP;
}
4379 #endif /* CONFIG_NET */
4380
4381 struct io_poll_table {
4382         struct poll_table_struct pt;
4383         struct io_kiocb *req;
4384         int error;
4385 };
4386
4387 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4388                            __poll_t mask, task_work_func_t func)
4389 {
4390         struct task_struct *tsk;
4391         int ret;
4392
4393         /* for instances that support it check for an event match first: */
4394         if (mask && !(mask & poll->events))
4395                 return 0;
4396
4397         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4398
4399         list_del_init(&poll->wait.entry);
4400
4401         tsk = req->task;
4402         req->result = mask;
4403         init_task_work(&req->task_work, func);
4404         /*
4405          * If this fails, then the task is exiting. When a task exits, the
4406          * work gets canceled, so just cancel this request as well instead
4407          * of executing it. We can't safely execute it anyway, as we may not
4408          * have the needed state needed for it anyway.
4409          */
4410         ret = task_work_add(tsk, &req->task_work, true);
4411         if (unlikely(ret)) {
4412                 WRITE_ONCE(poll->canceled, true);
4413                 tsk = io_wq_get_task(req->ctx->io_wq);
4414                 task_work_add(tsk, &req->task_work, true);
4415         }
4416         wake_up_process(tsk);
4417         return 1;
4418 }
4419
4420 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4421         __acquires(&req->ctx->completion_lock)
4422 {
4423         struct io_ring_ctx *ctx = req->ctx;
4424
4425         if (!req->result && !READ_ONCE(poll->canceled)) {
4426                 struct poll_table_struct pt = { ._key = poll->events };
4427
4428                 req->result = vfs_poll(req->file, &pt) & poll->events;
4429         }
4430
4431         spin_lock_irq(&ctx->completion_lock);
4432         if (!req->result && !READ_ONCE(poll->canceled)) {
4433                 add_wait_queue(poll->head, &poll->wait);
4434                 return true;
4435         }
4436
4437         return false;
4438 }
4439
4440 static void io_poll_remove_double(struct io_kiocb *req)
4441 {
4442         struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
4443
4444         lockdep_assert_held(&req->ctx->completion_lock);
4445
4446         if (poll && poll->head) {
4447                 struct wait_queue_head *head = poll->head;
4448
4449                 spin_lock(&head->lock);
4450                 list_del_init(&poll->wait.entry);
4451                 if (poll->wait.private)
4452                         refcount_dec(&req->refs);
4453                 poll->head = NULL;
4454                 spin_unlock(&head->lock);
4455         }
4456 }
4457
4458 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4459 {
4460         struct io_ring_ctx *ctx = req->ctx;
4461
4462         io_poll_remove_double(req);
4463         req->poll.done = true;
4464         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4465         io_commit_cqring(ctx);
4466 }
4467
4468 static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
4469 {
4470         struct io_ring_ctx *ctx = req->ctx;
4471
4472         if (io_poll_rewait(req, &req->poll)) {
4473                 spin_unlock_irq(&ctx->completion_lock);
4474                 return;
4475         }
4476
4477         hash_del(&req->hash_node);
4478         io_poll_complete(req, req->result, 0);
4479         req->flags |= REQ_F_COMP_LOCKED;
4480         io_put_req_find_next(req, nxt);
4481         spin_unlock_irq(&ctx->completion_lock);
4482
4483         io_cqring_ev_posted(ctx);
4484 }
4485
4486 static void io_poll_task_func(struct callback_head *cb)
4487 {
4488         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4489         struct io_kiocb *nxt = NULL;
4490
4491         io_poll_task_handler(req, &nxt);
4492         if (nxt) {
4493                 struct io_ring_ctx *ctx = nxt->ctx;
4494
4495                 mutex_lock(&ctx->uring_lock);
4496                 __io_queue_sqe(nxt, NULL, NULL);
4497                 mutex_unlock(&ctx->uring_lock);
4498         }
4499 }
4500
4501 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4502                                int sync, void *key)
4503 {
4504         struct io_kiocb *req = wait->private;
4505         struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
4506         __poll_t mask = key_to_poll(key);
4507
4508         /* for instances that support it check for an event match first: */
4509         if (mask && !(mask & poll->events))
4510                 return 0;
4511
4512         if (req->poll.head) {
4513                 bool done;
4514
4515                 spin_lock(&req->poll.head->lock);
4516                 done = list_empty(&req->poll.wait.entry);
4517                 if (!done)
4518                         list_del_init(&req->poll.wait.entry);
4519                 spin_unlock(&req->poll.head->lock);
4520                 if (!done)
4521                         __io_async_wake(req, poll, mask, io_poll_task_func);
4522         }
4523         refcount_dec(&req->refs);
4524         return 1;
4525 }
4526
4527 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4528                               wait_queue_func_t wake_func)
4529 {
4530         poll->head = NULL;
4531         poll->done = false;
4532         poll->canceled = false;
4533         poll->events = events;
4534         INIT_LIST_HEAD(&poll->wait.entry);
4535         init_waitqueue_func_entry(&poll->wait, wake_func);
4536 }
4537
4538 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4539                             struct wait_queue_head *head)
4540 {
4541         struct io_kiocb *req = pt->req;
4542
4543         /*
4544          * If poll->head is already set, it's because the file being polled
4545          * uses multiple waitqueues for poll handling (eg one for read, one
4546          * for write). Setup a separate io_poll_iocb if this happens.
4547          */
4548         if (unlikely(poll->head)) {
4549                 /* already have a 2nd entry, fail a third attempt */
4550                 if (req->io) {
4551                         pt->error = -EINVAL;
4552                         return;
4553                 }
4554                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4555                 if (!poll) {
4556                         pt->error = -ENOMEM;
4557                         return;
4558                 }
4559                 io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
4560                 refcount_inc(&req->refs);
4561                 poll->wait.private = req;
4562                 req->io = (void *) poll;
4563         }
4564
4565         pt->error = 0;
4566         poll->head = head;
4567
4568         if (poll->events & EPOLLEXCLUSIVE)
4569                 add_wait_queue_exclusive(head, &poll->wait);
4570         else
4571                 add_wait_queue(head, &poll->wait);
4572 }
4573
4574 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4575                                struct poll_table_struct *p)
4576 {
4577         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4578
4579         __io_queue_proc(&pt->req->apoll->poll, pt, head);
4580 }
4581
4582 static void io_async_task_func(struct callback_head *cb)
4583 {
4584         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4585         struct async_poll *apoll = req->apoll;
4586         struct io_ring_ctx *ctx = req->ctx;
4587         bool canceled = false;
4588
4589         trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
4590
4591         if (io_poll_rewait(req, &apoll->poll)) {
4592                 spin_unlock_irq(&ctx->completion_lock);
4593                 return;
4594         }
4595
4596         /* If req is still hashed, it cannot have been canceled. Don't check. */
4597         if (hash_hashed(&req->hash_node)) {
4598                 hash_del(&req->hash_node);
4599         } else {
4600                 canceled = READ_ONCE(apoll->poll.canceled);
4601                 if (canceled) {
4602                         io_cqring_fill_event(req, -ECANCELED);
4603                         io_commit_cqring(ctx);
4604                 }
4605         }
4606
4607         spin_unlock_irq(&ctx->completion_lock);
4608
4609         /* restore ->work in case we need to retry again */
4610         if (req->flags & REQ_F_WORK_INITIALIZED)
4611                 memcpy(&req->work, &apoll->work, sizeof(req->work));
4612         kfree(apoll);
4613
4614         if (!canceled) {
4615                 __set_current_state(TASK_RUNNING);
4616                 if (io_sq_thread_acquire_mm(ctx, req)) {
4617                         io_cqring_add_event(req, -EFAULT, 0);
4618                         goto end_req;
4619                 }
4620                 mutex_lock(&ctx->uring_lock);
4621                 __io_queue_sqe(req, NULL, NULL);
4622                 mutex_unlock(&ctx->uring_lock);
4623         } else {
4624                 io_cqring_ev_posted(ctx);
4625 end_req:
4626                 req_set_fail_links(req);
4627                 io_double_put_req(req);
4628         }
4629 }
4630
4631 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4632                         void *key)
4633 {
4634         struct io_kiocb *req = wait->private;
4635         struct io_poll_iocb *poll = &req->apoll->poll;
4636
4637         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
4638                                         key_to_poll(key));
4639
4640         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
4641 }
4642
4643 static void io_poll_req_insert(struct io_kiocb *req)
4644 {
4645         struct io_ring_ctx *ctx = req->ctx;
4646         struct hlist_head *list;
4647
4648         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
4649         hlist_add_head(&req->hash_node, list);
4650 }
4651
4652 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
4653                                       struct io_poll_iocb *poll,
4654                                       struct io_poll_table *ipt, __poll_t mask,
4655                                       wait_queue_func_t wake_func)
4656         __acquires(&ctx->completion_lock)
4657 {
4658         struct io_ring_ctx *ctx = req->ctx;
4659         bool cancel = false;
4660
4661         io_init_poll_iocb(poll, mask, wake_func);
4662         poll->file = req->file;
4663         poll->wait.private = req;
4664
4665         ipt->pt._key = mask;
4666         ipt->req = req;
4667         ipt->error = -EINVAL;
4668
4669         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
4670
4671         spin_lock_irq(&ctx->completion_lock);
4672         if (likely(poll->head)) {
4673                 spin_lock(&poll->head->lock);
4674                 if (unlikely(list_empty(&poll->wait.entry))) {
4675                         if (ipt->error)
4676                                 cancel = true;
4677                         ipt->error = 0;
4678                         mask = 0;
4679                 }
4680                 if (mask || ipt->error)
4681                         list_del_init(&poll->wait.entry);
4682                 else if (cancel)
4683                         WRITE_ONCE(poll->canceled, true);
4684                 else if (!poll->done) /* actually waiting for an event */
4685                         io_poll_req_insert(req);
4686                 spin_unlock(&poll->head->lock);
4687         }
4688
4689         return mask;
4690 }
4691
4692 static bool io_arm_poll_handler(struct io_kiocb *req)
4693 {
4694         const struct io_op_def *def = &io_op_defs[req->opcode];
4695         struct io_ring_ctx *ctx = req->ctx;
4696         struct async_poll *apoll;
4697         struct io_poll_table ipt;
4698         __poll_t mask, ret;
4699         bool had_io;
4700
4701         if (!req->file || !file_can_poll(req->file))
4702                 return false;
4703         if (req->flags & REQ_F_POLLED)
4704                 return false;
4705         if (!def->pollin && !def->pollout)
4706                 return false;
4707
4708         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
4709         if (unlikely(!apoll))
4710                 return false;
4711
4712         req->flags |= REQ_F_POLLED;
4713         if (req->flags & REQ_F_WORK_INITIALIZED)
4714                 memcpy(&apoll->work, &req->work, sizeof(req->work));
4715         had_io = req->io != NULL;
4716
4717         io_get_req_task(req);
4718         req->apoll = apoll;
4719         INIT_HLIST_NODE(&req->hash_node);
4720
4721         mask = 0;
4722         if (def->pollin)
4723                 mask |= POLLIN | POLLRDNORM;
4724         if (def->pollout)
4725                 mask |= POLLOUT | POLLWRNORM;
4726         mask |= POLLERR | POLLPRI;
4727
4728         ipt.pt._qproc = io_async_queue_proc;
4729
4730         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
4731                                         io_async_wake);
4732         if (ret) {
4733                 ipt.error = 0;
4734                 /* only remove double add if we did it here */
4735                 if (!had_io)
4736                         io_poll_remove_double(req);
4737                 spin_unlock_irq(&ctx->completion_lock);
4738                 if (req->flags & REQ_F_WORK_INITIALIZED)
4739                         memcpy(&req->work, &apoll->work, sizeof(req->work));
4740                 kfree(apoll);
4741                 return false;
4742         }
4743         spin_unlock_irq(&ctx->completion_lock);
4744         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
4745                                         apoll->poll.events);
4746         return true;
4747 }
4748
4749 static bool __io_poll_remove_one(struct io_kiocb *req,
4750                                  struct io_poll_iocb *poll)
4751 {
4752         bool do_complete = false;
4753
4754         spin_lock(&poll->head->lock);
4755         WRITE_ONCE(poll->canceled, true);
4756         if (!list_empty(&poll->wait.entry)) {
4757                 list_del_init(&poll->wait.entry);
4758                 do_complete = true;
4759         }
4760         spin_unlock(&poll->head->lock);
4761         hash_del(&req->hash_node);
4762         return do_complete;
4763 }
4764
4765 static bool io_poll_remove_one(struct io_kiocb *req)
4766 {
4767         bool do_complete;
4768
4769         if (req->opcode == IORING_OP_POLL_ADD) {
4770                 io_poll_remove_double(req);
4771                 do_complete = __io_poll_remove_one(req, &req->poll);
4772         } else {
4773                 struct async_poll *apoll = req->apoll;
4774
4775                 /* non-poll requests have submit ref still */
4776                 do_complete = __io_poll_remove_one(req, &apoll->poll);
4777                 if (do_complete) {
4778                         io_put_req(req);
4779                         /*
4780                          * restore ->work because we will call
4781                          * io_req_work_drop_env below when dropping the
4782                          * final reference.
4783                          */
4784                         if (req->flags & REQ_F_WORK_INITIALIZED)
4785                                 memcpy(&req->work, &apoll->work,
4786                                        sizeof(req->work));
4787                         kfree(apoll);
4788                 }
4789         }
4790
4791         if (do_complete) {
4792                 io_cqring_fill_event(req, -ECANCELED);
4793                 io_commit_cqring(req->ctx);
4794                 req->flags |= REQ_F_COMP_LOCKED;
4795                 io_put_req(req);
4796         }
4797
4798         return do_complete;
4799 }
4800
4801 static void io_poll_remove_all(struct io_ring_ctx *ctx)
4802 {
4803         struct hlist_node *tmp;
4804         struct io_kiocb *req;
4805         int posted = 0, i;
4806
4807         spin_lock_irq(&ctx->completion_lock);
4808         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
4809                 struct hlist_head *list;
4810
4811                 list = &ctx->cancel_hash[i];
4812                 hlist_for_each_entry_safe(req, tmp, list, hash_node)
4813                         posted += io_poll_remove_one(req);
4814         }
4815         spin_unlock_irq(&ctx->completion_lock);
4816
4817         if (posted)
4818                 io_cqring_ev_posted(ctx);
4819 }
4820
4821 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
4822 {
4823         struct hlist_head *list;
4824         struct io_kiocb *req;
4825
4826         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
4827         hlist_for_each_entry(req, list, hash_node) {
4828                 if (sqe_addr != req->user_data)
4829                         continue;
4830                 if (io_poll_remove_one(req))
4831                         return 0;
4832                 return -EALREADY;
4833         }
4834
4835         return -ENOENT;
4836 }
4837
4838 static int io_poll_remove_prep(struct io_kiocb *req,
4839                                const struct io_uring_sqe *sqe)
4840 {
4841         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4842                 return -EINVAL;
4843         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4844             sqe->poll_events)
4845                 return -EINVAL;
4846
4847         req->poll.addr = READ_ONCE(sqe->addr);
4848         return 0;
4849 }
4850
4851 /*
4852  * Find a running poll command that matches one specified in sqe->addr,
4853  * and remove it if found.
4854  */
4855 static int io_poll_remove(struct io_kiocb *req)
4856 {
4857         struct io_ring_ctx *ctx = req->ctx;
4858         u64 addr;
4859         int ret;
4860
4861         addr = req->poll.addr;
4862         spin_lock_irq(&ctx->completion_lock);
4863         ret = io_poll_cancel(ctx, addr);
4864         spin_unlock_irq(&ctx->completion_lock);
4865
4866         if (ret < 0)
4867                 req_set_fail_links(req);
4868         io_req_complete(req, ret);
4869         return 0;
4870 }
4871
4872 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4873                         void *key)
4874 {
4875         struct io_kiocb *req = wait->private;
4876         struct io_poll_iocb *poll = &req->poll;
4877
4878         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
4879 }
4880
4881 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
4882                                struct poll_table_struct *p)
4883 {
4884         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4885
4886         __io_queue_proc(&pt->req->poll, pt, head);
4887 }
4888
4889 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4890 {
4891         struct io_poll_iocb *poll = &req->poll;
4892         u32 events;
4893
4894         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4895                 return -EINVAL;
4896         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
4897                 return -EINVAL;
4898         if (!poll->file)
4899                 return -EBADF;
4900
4901         events = READ_ONCE(sqe->poll32_events);
4902 #ifdef __BIG_ENDIAN
4903         events = swahw32(events);
4904 #endif
4905         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
4906                        (events & EPOLLEXCLUSIVE);
4907
4908         io_get_req_task(req);
4909         return 0;
4910 }
4911
4912 static int io_poll_add(struct io_kiocb *req)
4913 {
4914         struct io_poll_iocb *poll = &req->poll;
4915         struct io_ring_ctx *ctx = req->ctx;
4916         struct io_poll_table ipt;
4917         __poll_t mask;
4918
4919         INIT_HLIST_NODE(&req->hash_node);
4920         INIT_LIST_HEAD(&req->list);
4921         ipt.pt._qproc = io_poll_queue_proc;
4922
4923         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
4924                                         io_poll_wake);
4925
4926         if (mask) { /* no async, we'd stolen it */
4927                 ipt.error = 0;
4928                 io_poll_complete(req, mask, 0);
4929         }
4930         spin_unlock_irq(&ctx->completion_lock);
4931
4932         if (mask) {
4933                 io_cqring_ev_posted(ctx);
4934                 io_put_req(req);
4935         }
4936         return ipt.error;
4937 }
4938
4939 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
4940 {
4941         struct io_timeout_data *data = container_of(timer,
4942                                                 struct io_timeout_data, timer);
4943         struct io_kiocb *req = data->req;
4944         struct io_ring_ctx *ctx = req->ctx;
4945         unsigned long flags;
4946
4947         atomic_inc(&ctx->cq_timeouts);
4948
4949         spin_lock_irqsave(&ctx->completion_lock, flags);
4950         /*
4951          * We could be racing with timeout deletion. If the list is empty,
4952          * then timeout lookup already found it and will be handling it.
4953          */
4954         if (!list_empty(&req->list))
4955                 list_del_init(&req->list);
4956
4957         io_cqring_fill_event(req, -ETIME);
4958         io_commit_cqring(ctx);
4959         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4960
4961         io_cqring_ev_posted(ctx);
4962         req_set_fail_links(req);
4963         io_put_req(req);
4964         return HRTIMER_NORESTART;
4965 }
4966
4967 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
4968 {
4969         struct io_kiocb *req;
4970         int ret = -ENOENT;
4971
4972         list_for_each_entry(req, &ctx->timeout_list, list) {
4973                 if (user_data == req->user_data) {
4974                         list_del_init(&req->list);
4975                         ret = 0;
4976                         break;
4977                 }
4978         }
4979
4980         if (ret == -ENOENT)
4981                 return ret;
4982
4983         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
4984         if (ret == -1)
4985                 return -EALREADY;
4986
4987         req_set_fail_links(req);
4988         io_cqring_fill_event(req, -ECANCELED);
4989         io_put_req(req);
4990         return 0;
4991 }
4992
4993 static int io_timeout_remove_prep(struct io_kiocb *req,
4994                                   const struct io_uring_sqe *sqe)
4995 {
4996         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4997                 return -EINVAL;
4998         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
4999                 return -EINVAL;
5000
5001         req->timeout.addr = READ_ONCE(sqe->addr);
5002         req->timeout.flags = READ_ONCE(sqe->timeout_flags);
5003         if (req->timeout.flags)
5004                 return -EINVAL;
5005
5006         return 0;
5007 }
5008
5009 /*
5010  * Remove or update an existing timeout command
5011  */
5012 static int io_timeout_remove(struct io_kiocb *req)
5013 {
5014         struct io_ring_ctx *ctx = req->ctx;
5015         int ret;
5016
5017         spin_lock_irq(&ctx->completion_lock);
5018         ret = io_timeout_cancel(ctx, req->timeout.addr);
5019
5020         io_cqring_fill_event(req, ret);
5021         io_commit_cqring(ctx);
5022         spin_unlock_irq(&ctx->completion_lock);
5023         io_cqring_ev_posted(ctx);
5024         if (ret < 0)
5025                 req_set_fail_links(req);
5026         io_put_req(req);
5027         return 0;
5028 }
5029
5030 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5031                            bool is_timeout_link)
5032 {
5033         struct io_timeout_data *data;
5034         unsigned flags;
5035         u32 off = READ_ONCE(sqe->off);
5036
5037         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5038                 return -EINVAL;
5039         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5040                 return -EINVAL;
5041         if (off && is_timeout_link)
5042                 return -EINVAL;
5043         flags = READ_ONCE(sqe->timeout_flags);
5044         if (flags & ~IORING_TIMEOUT_ABS)
5045                 return -EINVAL;
5046
5047         req->timeout.off = off;
5048
5049         if (!req->io && io_alloc_async_ctx(req))
5050                 return -ENOMEM;
5051
5052         data = &req->io->timeout;
5053         data->req = req;
5054         req->flags |= REQ_F_TIMEOUT;
5055
5056         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5057                 return -EFAULT;
5058
5059         if (flags & IORING_TIMEOUT_ABS)
5060                 data->mode = HRTIMER_MODE_ABS;
5061         else
5062                 data->mode = HRTIMER_MODE_REL;
5063
5064         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5065         return 0;
5066 }
5067
5068 static int io_timeout(struct io_kiocb *req)
5069 {
5070         struct io_ring_ctx *ctx = req->ctx;
5071         struct io_timeout_data *data = &req->io->timeout;
5072         struct list_head *entry;
5073         u32 tail, off = req->timeout.off;
5074
5075         spin_lock_irq(&ctx->completion_lock);
5076
5077         /*
5078          * sqe->off holds how many events that need to occur for this
5079          * timeout event to be satisfied. If it isn't set, then this is
5080          * a pure timeout request, sequence isn't used.
5081          */
5082         if (!off) {
5083                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
5084                 entry = ctx->timeout_list.prev;
5085                 goto add;
5086         }
5087
5088         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5089         req->timeout.target_seq = tail + off;
5090
5091         /*
5092          * Insertion sort, ensuring the first entry in the list is always
5093          * the one we need first.
5094          */
5095         list_for_each_prev(entry, &ctx->timeout_list) {
5096                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
5097
5098                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
5099                         continue;
5100                 /* nxt.seq is behind @tail, otherwise would've been completed */
5101                 if (off >= nxt->timeout.target_seq - tail)
5102                         break;
5103         }
5104 add:
5105         list_add(&req->list, entry);
5106         data->timer.function = io_timeout_fn;
5107         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5108         spin_unlock_irq(&ctx->completion_lock);
5109         return 0;
5110 }
5111
5112 static bool io_cancel_cb(struct io_wq_work *work, void *data)
5113 {
5114         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5115
5116         return req->user_data == (unsigned long) data;
5117 }
5118
5119 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
5120 {
5121         enum io_wq_cancel cancel_ret;
5122         int ret = 0;
5123
5124         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
5125         switch (cancel_ret) {
5126         case IO_WQ_CANCEL_OK:
5127                 ret = 0;
5128                 break;
5129         case IO_WQ_CANCEL_RUNNING:
5130                 ret = -EALREADY;
5131                 break;
5132         case IO_WQ_CANCEL_NOTFOUND:
5133                 ret = -ENOENT;
5134                 break;
5135         }
5136
5137         return ret;
5138 }
5139
5140 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5141                                      struct io_kiocb *req, __u64 sqe_addr,
5142                                      int success_ret)
5143 {
5144         unsigned long flags;
5145         int ret;
5146
5147         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
5148         if (ret != -ENOENT) {
5149                 spin_lock_irqsave(&ctx->completion_lock, flags);
5150                 goto done;
5151         }
5152
5153         spin_lock_irqsave(&ctx->completion_lock, flags);
5154         ret = io_timeout_cancel(ctx, sqe_addr);
5155         if (ret != -ENOENT)
5156                 goto done;
5157         ret = io_poll_cancel(ctx, sqe_addr);
5158 done:
5159         if (!ret)
5160                 ret = success_ret;
5161         io_cqring_fill_event(req, ret);
5162         io_commit_cqring(ctx);
5163         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5164         io_cqring_ev_posted(ctx);
5165
5166         if (ret < 0)
5167                 req_set_fail_links(req);
5168         io_put_req(req);
5169 }
5170
5171 static int io_async_cancel_prep(struct io_kiocb *req,
5172                                 const struct io_uring_sqe *sqe)
5173 {
5174         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5175                 return -EINVAL;
5176         if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
5177             sqe->cancel_flags)
5178                 return -EINVAL;
5179
5180         req->cancel.addr = READ_ONCE(sqe->addr);
5181         return 0;
5182 }
5183
5184 static int io_async_cancel(struct io_kiocb *req)
5185 {
5186         struct io_ring_ctx *ctx = req->ctx;
5187
5188         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5189         return 0;
5190 }
5191
5192 static int io_files_update_prep(struct io_kiocb *req,
5193                                 const struct io_uring_sqe *sqe)
5194 {
5195         if (sqe->flags || sqe->ioprio || sqe->rw_flags)
5196                 return -EINVAL;
5197
5198         req->files_update.offset = READ_ONCE(sqe->off);
5199         req->files_update.nr_args = READ_ONCE(sqe->len);
5200         if (!req->files_update.nr_args)
5201                 return -EINVAL;
5202         req->files_update.arg = READ_ONCE(sqe->addr);
5203         return 0;
5204 }
5205
5206 static int io_files_update(struct io_kiocb *req, bool force_nonblock,
5207                            struct io_comp_state *cs)
5208 {
5209         struct io_ring_ctx *ctx = req->ctx;
5210         struct io_uring_files_update up;
5211         int ret;
5212
5213         if (force_nonblock)
5214                 return -EAGAIN;
5215
5216         up.offset = req->files_update.offset;
5217         up.fds = req->files_update.arg;
5218
5219         mutex_lock(&ctx->uring_lock);
5220         ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
5221         mutex_unlock(&ctx->uring_lock);
5222
5223         if (ret < 0)
5224                 req_set_fail_links(req);
5225         __io_req_complete(req, ret, 0, cs);
5226         return 0;
5227 }
5228
5229 static int io_req_defer_prep(struct io_kiocb *req,
5230                              const struct io_uring_sqe *sqe, bool for_async)
5231 {
5232         ssize_t ret = 0;
5233
5234         if (!sqe)
5235                 return 0;
5236
5237         if (io_op_defs[req->opcode].file_table) {
5238                 io_req_init_async(req);
5239                 ret = io_grab_files(req);
5240                 if (unlikely(ret))
5241                         return ret;
5242         }
5243
5244         if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) {
5245                 io_req_init_async(req);
5246                 io_req_work_grab_env(req, &io_op_defs[req->opcode]);
5247         }
5248
5249         switch (req->opcode) {
5250         case IORING_OP_NOP:
5251                 break;
5252         case IORING_OP_READV:
5253         case IORING_OP_READ_FIXED:
5254         case IORING_OP_READ:
5255                 ret = io_read_prep(req, sqe, true);
5256                 break;
5257         case IORING_OP_WRITEV:
5258         case IORING_OP_WRITE_FIXED:
5259         case IORING_OP_WRITE:
5260                 ret = io_write_prep(req, sqe, true);
5261                 break;
5262         case IORING_OP_POLL_ADD:
5263                 ret = io_poll_add_prep(req, sqe);
5264                 break;
5265         case IORING_OP_POLL_REMOVE:
5266                 ret = io_poll_remove_prep(req, sqe);
5267                 break;
5268         case IORING_OP_FSYNC:
5269                 ret = io_prep_fsync(req, sqe);
5270                 break;
5271         case IORING_OP_SYNC_FILE_RANGE:
5272                 ret = io_prep_sfr(req, sqe);
5273                 break;
5274         case IORING_OP_SENDMSG:
5275         case IORING_OP_SEND:
5276                 ret = io_sendmsg_prep(req, sqe);
5277                 break;
5278         case IORING_OP_RECVMSG:
5279         case IORING_OP_RECV:
5280                 ret = io_recvmsg_prep(req, sqe);
5281                 break;
5282         case IORING_OP_CONNECT:
5283                 ret = io_connect_prep(req, sqe);
5284                 break;
5285         case IORING_OP_TIMEOUT:
5286                 ret = io_timeout_prep(req, sqe, false);
5287                 break;
5288         case IORING_OP_TIMEOUT_REMOVE:
5289                 ret = io_timeout_remove_prep(req, sqe);
5290                 break;
5291         case IORING_OP_ASYNC_CANCEL:
5292                 ret = io_async_cancel_prep(req, sqe);
5293                 break;
5294         case IORING_OP_LINK_TIMEOUT:
5295                 ret = io_timeout_prep(req, sqe, true);
5296                 break;
5297         case IORING_OP_ACCEPT:
5298                 ret = io_accept_prep(req, sqe);
5299                 break;
5300         case IORING_OP_FALLOCATE:
5301                 ret = io_fallocate_prep(req, sqe);
5302                 break;
5303         case IORING_OP_OPENAT:
5304                 ret = io_openat_prep(req, sqe);
5305                 break;
5306         case IORING_OP_CLOSE:
5307                 ret = io_close_prep(req, sqe);
5308                 break;
5309         case IORING_OP_FILES_UPDATE:
5310                 ret = io_files_update_prep(req, sqe);
5311                 break;
5312         case IORING_OP_STATX:
5313                 ret = io_statx_prep(req, sqe);
5314                 break;
5315         case IORING_OP_FADVISE:
5316                 ret = io_fadvise_prep(req, sqe);
5317                 break;
5318         case IORING_OP_MADVISE:
5319                 ret = io_madvise_prep(req, sqe);
5320                 break;
5321         case IORING_OP_OPENAT2:
5322                 ret = io_openat2_prep(req, sqe);
5323                 break;
5324         case IORING_OP_EPOLL_CTL:
5325                 ret = io_epoll_ctl_prep(req, sqe);
5326                 break;
5327         case IORING_OP_SPLICE:
5328                 ret = io_splice_prep(req, sqe);
5329                 break;
5330         case IORING_OP_PROVIDE_BUFFERS:
5331                 ret = io_provide_buffers_prep(req, sqe);
5332                 break;
5333         case IORING_OP_REMOVE_BUFFERS:
5334                 ret = io_remove_buffers_prep(req, sqe);
5335                 break;
5336         case IORING_OP_TEE:
5337                 ret = io_tee_prep(req, sqe);
5338                 break;
5339         default:
5340                 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5341                                 req->opcode);
5342                 ret = -EINVAL;
5343                 break;
5344         }
5345
5346         return ret;
5347 }
5348
5349 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5350 {
5351         struct io_ring_ctx *ctx = req->ctx;
5352         int ret;
5353
5354         /* Still need defer if there is pending req in defer list. */
5355         if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
5356                 return 0;
5357
5358         if (!req->io) {
5359                 if (io_alloc_async_ctx(req))
5360                         return -EAGAIN;
5361                 ret = io_req_defer_prep(req, sqe, true);
5362                 if (ret < 0)
5363                         return ret;
5364         }
5365
5366         spin_lock_irq(&ctx->completion_lock);
5367         if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
5368                 spin_unlock_irq(&ctx->completion_lock);
5369                 return 0;
5370         }
5371
5372         trace_io_uring_defer(ctx, req, req->user_data);
5373         list_add_tail(&req->list, &ctx->defer_list);
5374         spin_unlock_irq(&ctx->completion_lock);
5375         return -EIOCBQUEUED;
5376 }
5377
/*
 * Release the per-opcode resources still attached to @req and clear
 * REQ_F_NEED_CLEANUP. Opcodes without a case below only get the flag
 * cleared.
 *
 * NOTE(review): the rw and msg cases dereference req->io without a NULL
 * check; presumably callers only get here once req->io has been allocated
 * for those opcodes - confirm.
 */
static void io_cleanup_req(struct io_kiocb *req)
{
	struct io_async_ctx *io = req->io;

	switch (req->opcode) {
	case IORING_OP_READV:
	case IORING_OP_READ_FIXED:
	case IORING_OP_READ:
		/* with a selected buffer, rw.addr holds a pointer that is freed here */
		if (req->flags & REQ_F_BUFFER_SELECTED)
			kfree((void *)(unsigned long)req->rw.addr);
		/* fallthrough */
	case IORING_OP_WRITEV:
	case IORING_OP_WRITE_FIXED:
	case IORING_OP_WRITE:
		/* free the iovec unless it points at fast_iov */
		if (io->rw.iov != io->rw.fast_iov)
			kfree(io->rw.iov);
		break;
	case IORING_OP_RECVMSG:
		if (req->flags & REQ_F_BUFFER_SELECTED)
			kfree(req->sr_msg.kbuf);
		/* fallthrough */
	case IORING_OP_SENDMSG:
		/* same rule as for rw: fast_iov itself is never freed */
		if (io->msg.iov != io->msg.fast_iov)
			kfree(io->msg.iov);
		break;
	case IORING_OP_RECV:
		if (req->flags & REQ_F_BUFFER_SELECTED)
			kfree(req->sr_msg.kbuf);
		break;
	case IORING_OP_OPENAT:
	case IORING_OP_OPENAT2:
		/* nothing is released for the open opcodes here */
		break;
	case IORING_OP_SPLICE:
	case IORING_OP_TEE:
		/* put the input file; the last argument says whether it was a fixed file */
		io_put_file(req, req->splice.file_in,
			    (req->splice.flags & SPLICE_F_FD_IN_FIXED));
		break;
	}

	req->flags &= ~REQ_F_NEED_CLEANUP;
}
5419
/*
 * Dispatch @req to the handler for its opcode.
 *
 * When @sqe is non-NULL, the opcode's prep routine (every opcode below
 * except IORING_OP_NOP has one) runs first and a prep failure is returned
 * without calling the handler. With a NULL @sqe the handler is called
 * directly. @force_nonblock and @cs are handed through to the handlers that
 * take them.
 *
 * Returns 0, or the non-zero value produced by prep or by the handler
 * (-EINVAL for an opcode that has no case here). After a 0 result on a ring
 * set up with IORING_SETUP_IOPOLL, a request that has a file is also passed
 * to io_iopoll_req_issued().
 *
 * NOTE(review): the prep result is tested as "ret < 0" for some opcodes and
 * as "ret" for others; whether any prep routine can return a positive value
 * is not visible from here.
 */
static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			bool force_nonblock, struct io_comp_state *cs)
{
	struct io_ring_ctx *ctx = req->ctx;
	int ret;

	switch (req->opcode) {
	case IORING_OP_NOP:
		ret = io_nop(req, cs);
		break;
	case IORING_OP_READV:
	case IORING_OP_READ_FIXED:
	case IORING_OP_READ:
		if (sqe) {
			/* the rw prep routines also receive force_nonblock */
			ret = io_read_prep(req, sqe, force_nonblock);
			if (ret < 0)
				break;
		}
		ret = io_read(req, force_nonblock, cs);
		break;
	case IORING_OP_WRITEV:
	case IORING_OP_WRITE_FIXED:
	case IORING_OP_WRITE:
		if (sqe) {
			ret = io_write_prep(req, sqe, force_nonblock);
			if (ret < 0)
				break;
		}
		ret = io_write(req, force_nonblock, cs);
		break;
	case IORING_OP_FSYNC:
		if (sqe) {
			ret = io_prep_fsync(req, sqe);
			if (ret < 0)
				break;
		}
		ret = io_fsync(req, force_nonblock);
		break;
	case IORING_OP_POLL_ADD:
		if (sqe) {
			ret = io_poll_add_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_poll_add(req);
		break;
	case IORING_OP_POLL_REMOVE:
		if (sqe) {
			ret = io_poll_remove_prep(req, sqe);
			if (ret < 0)
				break;
		}
		ret = io_poll_remove(req);
		break;
	case IORING_OP_SYNC_FILE_RANGE:
		if (sqe) {
			ret = io_prep_sfr(req, sqe);
			if (ret < 0)
				break;
		}
		ret = io_sync_file_range(req, force_nonblock);
		break;
	case IORING_OP_SENDMSG:
	case IORING_OP_SEND:
		/* one prep routine serves both opcodes, the handlers differ */
		if (sqe) {
			ret = io_sendmsg_prep(req, sqe);
			if (ret < 0)
				break;
		}
		if (req->opcode == IORING_OP_SENDMSG)
			ret = io_sendmsg(req, force_nonblock, cs);
		else
			ret = io_send(req, force_nonblock, cs);
		break;
	case IORING_OP_RECVMSG:
	case IORING_OP_RECV:
		/* one prep routine serves both opcodes, the handlers differ */
		if (sqe) {
			ret = io_recvmsg_prep(req, sqe);
			if (ret)
				break;
		}
		if (req->opcode == IORING_OP_RECVMSG)
			ret = io_recvmsg(req, force_nonblock, cs);
		else
			ret = io_recv(req, force_nonblock, cs);
		break;
	case IORING_OP_TIMEOUT:
		if (sqe) {
			ret = io_timeout_prep(req, sqe, false);
			if (ret)
				break;
		}
		ret = io_timeout(req);
		break;
	case IORING_OP_TIMEOUT_REMOVE:
		if (sqe) {
			ret = io_timeout_remove_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_timeout_remove(req);
		break;
	case IORING_OP_ACCEPT:
		if (sqe) {
			ret = io_accept_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_accept(req, force_nonblock, cs);
		break;
	case IORING_OP_CONNECT:
		if (sqe) {
			ret = io_connect_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_connect(req, force_nonblock, cs);
		break;
	case IORING_OP_ASYNC_CANCEL:
		if (sqe) {
			ret = io_async_cancel_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_async_cancel(req);
		break;
	case IORING_OP_FALLOCATE:
		if (sqe) {
			ret = io_fallocate_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_fallocate(req, force_nonblock);
		break;
	case IORING_OP_OPENAT:
		if (sqe) {
			ret = io_openat_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_openat(req, force_nonblock);
		break;
	case IORING_OP_CLOSE:
		if (sqe) {
			ret = io_close_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_close(req, force_nonblock, cs);
		break;
	case IORING_OP_FILES_UPDATE:
		if (sqe) {
			ret = io_files_update_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_files_update(req, force_nonblock, cs);
		break;
	case IORING_OP_STATX:
		if (sqe) {
			ret = io_statx_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_statx(req, force_nonblock);
		break;
	case IORING_OP_FADVISE:
		if (sqe) {
			ret = io_fadvise_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_fadvise(req, force_nonblock);
		break;
	case IORING_OP_MADVISE:
		if (sqe) {
			ret = io_madvise_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_madvise(req, force_nonblock);
		break;
	case IORING_OP_OPENAT2:
		if (sqe) {
			ret = io_openat2_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_openat2(req, force_nonblock);
		break;
	case IORING_OP_EPOLL_CTL:
		if (sqe) {
			ret = io_epoll_ctl_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_epoll_ctl(req, force_nonblock, cs);
		break;
	case IORING_OP_SPLICE:
		if (sqe) {
			ret = io_splice_prep(req, sqe);
			if (ret < 0)
				break;
		}
		ret = io_splice(req, force_nonblock);
		break;
	case IORING_OP_PROVIDE_BUFFERS:
		if (sqe) {
			ret = io_provide_buffers_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_provide_buffers(req, force_nonblock, cs);
		break;
	case IORING_OP_REMOVE_BUFFERS:
		if (sqe) {
			ret = io_remove_buffers_prep(req, sqe);
			if (ret)
				break;
		}
		ret = io_remove_buffers(req, force_nonblock, cs);
		break;
	case IORING_OP_TEE:
		if (sqe) {
			ret = io_tee_prep(req, sqe);
			if (ret < 0)
				break;
		}
		ret = io_tee(req, force_nonblock);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	/* any non-zero result, from prep or from the handler, ends here */
	if (ret)
		return ret;

	/* If the op doesn't have a file, we're not polling for it */
	if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
		const bool in_async = io_wq_current_is_worker();

		/* workqueue context doesn't hold uring_lock, grab it now */
		if (in_async)
			mutex_lock(&ctx->uring_lock);

		io_iopoll_req_issued(req);

		if (in_async)
			mutex_unlock(&ctx->uring_lock);
	}

	return 0;
}
5674
5675 static void io_arm_async_linked_timeout(struct io_kiocb *req)
5676 {
5677         struct io_kiocb *link;
5678
5679         /* link head's timeout is queued in io_queue_async_work() */
5680         if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
5681                 return;
5682
5683         link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
5684         io_queue_linked_timeout(link);
5685 }
5686
5687 static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
5688 {
5689         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5690         int ret = 0;
5691
5692         io_arm_async_linked_timeout(req);
5693
5694         /* if NO_CANCEL is set, we must still run the work */
5695         if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
5696                                 IO_WQ_WORK_CANCEL) {
5697                 ret = -ECANCELED;
5698         }
5699
5700         if (!ret) {
5701                 do {
5702                         ret = io_issue_sqe(req, NULL, false, NULL);
5703                         /*
5704                          * We can get EAGAIN for polled IO even though we're
5705                          * forcing a sync submission from here, since we can't
5706                          * wait for request slots on the block side.
5707                          */
5708                         if (ret != -EAGAIN)
5709                                 break;
5710                         cond_resched();
5711                 } while (1);
5712         }
5713
5714         if (ret) {
5715                 req_set_fail_links(req);
5716                 io_req_complete(req, ret);
5717         }
5718
5719         return io_steal_work(req);
5720 }
5721
5722 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
5723                                               int index)
5724 {
5725         struct fixed_file_table *table;
5726
5727         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
5728         return table->files[index & IORING_FILE_TABLE_MASK];
5729 }
5730
5731 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
5732                         int fd, struct file **out_file, bool fixed)
5733 {
5734         struct io_ring_ctx *ctx = req->ctx;
5735         struct file *file;
5736
5737         if (fixed) {
5738                 if (unlikely(!ctx->file_data ||
5739                     (unsigned) fd >= ctx->nr_user_files))
5740                         return -EBADF;
5741                 fd = array_index_nospec(fd, ctx->nr_user_files);
5742                 file = io_file_from_index(ctx, fd);
5743                 if (file) {
5744                         req->fixed_file_refs = ctx->file_data->cur_refs;
5745                         percpu_ref_get(req->fixed_file_refs);
5746                 }
5747         } else {
5748                 trace_io_uring_file_get(ctx, fd);
5749                 file = __io_file_get(state, fd);
5750         }
5751
5752         if (file || io_op_defs[req->opcode].needs_file_no_error) {
5753                 *out_file = file;
5754                 return 0;
5755         }
5756         return -EBADF;
5757 }
5758
5759 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
5760                            int fd)
5761 {
5762         bool fixed;
5763
5764         fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
5765         if (unlikely(!fixed && io_async_submit(req->ctx)))
5766                 return -EBADF;
5767
5768         return io_file_get(state, req, fd, &req->file, fixed);
5769 }
5770
5771 static int io_grab_files(struct io_kiocb *req)
5772 {
5773         int ret = -EBADF;
5774         struct io_ring_ctx *ctx = req->ctx;
5775
5776         if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
5777                 return 0;
5778         if (!ctx->ring_file)
5779                 return -EBADF;
5780
5781         rcu_read_lock();
5782         spin_lock_irq(&ctx->inflight_lock);
5783         /*
5784          * We use the f_ops->flush() handler to ensure that we can flush
5785          * out work accessing these files if the fd is closed. Check if
5786          * the fd has changed since we started down this path, and disallow
5787          * this operation if it has.
5788          */
5789         if (fcheck(ctx->ring_fd) == ctx->ring_file) {
5790                 list_add(&req->inflight_entry, &ctx->inflight_list);
5791                 req->flags |= REQ_F_INFLIGHT;
5792                 req->work.files = current->files;
5793                 ret = 0;
5794         }
5795         spin_unlock_irq(&ctx->inflight_lock);
5796         rcu_read_unlock();
5797
5798         return ret;
5799 }
5800
/*
 * hrtimer callback installed by io_queue_linked_timeout() for a linked
 * timeout request.
 *
 * Under completion_lock, the entry just before the timeout on its link_list
 * is looked up and a reference on it is attempted. If that works, the
 * timeout is unlinked, the previous request is marked failed and a cancel by
 * its user_data is attempted, with the timeout request completing as -ETIME
 * when the cancel succeeds. Otherwise only the timeout request itself is
 * completed with -ETIME. Always returns HRTIMER_NORESTART.
 */
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
{
	struct io_timeout_data *data = container_of(timer,
						struct io_timeout_data, timer);
	struct io_kiocb *req = data->req;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *prev = NULL;
	unsigned long flags;

	spin_lock_irqsave(&ctx->completion_lock, flags);

	/*
	 * We don't expect the list to be empty, that will only happen if we
	 * race with the completion of the linked work.
	 */
	if (!list_empty(&req->link_list)) {
		prev = list_entry(req->link_list.prev, struct io_kiocb,
				  link_list);
		/* only act on prev if its refcount has not already hit zero */
		if (refcount_inc_not_zero(&prev->refs)) {
			list_del_init(&req->link_list);
			prev->flags &= ~REQ_F_LINK_TIMEOUT;
		} else
			prev = NULL;
	}

	spin_unlock_irqrestore(&ctx->completion_lock, flags);

	if (prev) {
		req_set_fail_links(prev);
		/* posts the completion for req; a successful cancel reports -ETIME */
		io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
		/* drop the reference taken under the lock above */
		io_put_req(prev);
	} else {
		io_req_complete(req, -ETIME);
	}
	return HRTIMER_NORESTART;
}
5837
5838 static void io_queue_linked_timeout(struct io_kiocb *req)
5839 {
5840         struct io_ring_ctx *ctx = req->ctx;
5841
5842         /*
5843          * If the list is now empty, then our linked request finished before
5844          * we got a chance to setup the timer
5845          */
5846         spin_lock_irq(&ctx->completion_lock);
5847         if (!list_empty(&req->link_list)) {
5848                 struct io_timeout_data *data = &req->io->timeout;
5849
5850                 data->timer.function = io_link_timeout_fn;
5851                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
5852                                 data->mode);
5853         }
5854         spin_unlock_irq(&ctx->completion_lock);
5855
5856         /* drop submission reference */
5857         io_put_req(req);
5858 }
5859
5860 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
5861 {
5862         struct io_kiocb *nxt;
5863
5864         if (!(req->flags & REQ_F_LINK_HEAD))
5865                 return NULL;
5866         /* for polled retry, if flag is set, we already went through here */
5867         if (req->flags & REQ_F_POLLED)
5868                 return NULL;
5869
5870         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
5871                                         link_list);
5872         if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
5873                 return NULL;
5874
5875         req->flags |= REQ_F_LINK_TIMEOUT;
5876         return nxt;
5877 }
5878
5879 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5880                            struct io_comp_state *cs)
5881 {
5882         struct io_kiocb *linked_timeout;
5883         struct io_kiocb *nxt;
5884         const struct cred *old_creds = NULL;
5885         int ret;
5886
5887 again:
5888         linked_timeout = io_prep_linked_timeout(req);
5889
5890         if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
5891             req->work.creds != current_cred()) {
5892                 if (old_creds)
5893                         revert_creds(old_creds);
5894                 if (old_creds == req->work.creds)
5895                         old_creds = NULL; /* restored original creds */
5896                 else
5897                         old_creds = override_creds(req->work.creds);
5898         }
5899
5900         ret = io_issue_sqe(req, sqe, true, cs);
5901
5902         /*
5903          * We async punt it if the file wasn't marked NOWAIT, or if the file
5904          * doesn't support non-blocking read/write attempts
5905          */
5906         if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
5907                 if (io_arm_poll_handler(req)) {
5908                         if (linked_timeout)
5909                                 io_queue_linked_timeout(linked_timeout);
5910                         goto exit;
5911                 }
5912 punt:
5913                 io_req_init_async(req);
5914
5915                 if (io_op_defs[req->opcode].file_table) {
5916                         ret = io_grab_files(req);
5917                         if (ret)
5918                                 goto err;
5919                 }
5920
5921                 /*
5922                  * Queued up for async execution, worker will release
5923                  * submit reference when the iocb is actually submitted.
5924                  */
5925                 io_queue_async_work(req);
5926                 goto exit;
5927         }
5928
5929 err:
5930         nxt = NULL;
5931         /* drop submission reference */
5932         io_put_req_find_next(req, &nxt);
5933
5934         if (linked_timeout) {
5935                 if (!ret)
5936                         io_queue_linked_timeout(linked_timeout);
5937                 else
5938                         io_put_req(linked_timeout);
5939         }
5940
5941         /* and drop final reference, if we failed */
5942         if (ret) {
5943                 req_set_fail_links(req);
5944                 io_req_complete(req, ret);
5945         }
5946         if (nxt) {
5947                 req = nxt;
5948
5949                 if (req->flags & REQ_F_FORCE_ASYNC)
5950                         goto punt;
5951                 goto again;
5952         }
5953 exit:
5954         if (old_creds)
5955                 revert_creds(old_creds);
5956 }
5957
5958 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5959                          struct io_comp_state *cs)
5960 {
5961         int ret;
5962
5963         ret = io_req_defer(req, sqe);
5964         if (ret) {
5965                 if (ret != -EIOCBQUEUED) {
5966 fail_req:
5967                         req_set_fail_links(req);
5968                         io_put_req(req);
5969                         io_req_complete(req, ret);
5970                 }
5971         } else if (req->flags & REQ_F_FORCE_ASYNC) {
5972                 if (!req->io) {
5973                         ret = -EAGAIN;
5974                         if (io_alloc_async_ctx(req))
5975                                 goto fail_req;
5976                         ret = io_req_defer_prep(req, sqe, true);
5977                         if (unlikely(ret < 0))
5978                                 goto fail_req;
5979                 }
5980
5981                 /*
5982                  * Never try inline submit of IOSQE_ASYNC is set, go straight
5983                  * to async execution.
5984                  */
5985                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
5986                 io_queue_async_work(req);
5987         } else {
5988                 __io_queue_sqe(req, sqe, cs);
5989         }
5990 }
5991
5992 static inline void io_queue_link_head(struct io_kiocb *req,
5993                                       struct io_comp_state *cs)
5994 {
5995         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
5996                 io_put_req(req);
5997                 io_req_complete(req, -ECANCELED);
5998         } else
5999                 io_queue_sqe(req, NULL, cs);
6000 }
6001
6002 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6003                          struct io_kiocb **link, struct io_comp_state *cs)
6004 {
6005         struct io_ring_ctx *ctx = req->ctx;
6006         int ret;
6007
6008         /*
6009          * If we already have a head request, queue this one for async
6010          * submittal once the head completes. If we don't have a head but
6011          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6012          * submitted sync once the chain is complete. If none of those
6013          * conditions are true (normal request), then just queue it.
6014          */
6015         if (*link) {
6016                 struct io_kiocb *head = *link;
6017
6018                 /*
6019                  * Taking sequential execution of a link, draining both sides
6020                  * of the link also fullfils IOSQE_IO_DRAIN semantics for all
6021                  * requests in the link. So, it drains the head and the
6022                  * next after the link request. The last one is done via
6023                  * drain_next flag to persist the effect across calls.
6024                  */
6025                 if (req->flags & REQ_F_IO_DRAIN) {
6026                         head->flags |= REQ_F_IO_DRAIN;
6027                         ctx->drain_next = 1;
6028                 }
6029                 if (io_alloc_async_ctx(req))
6030                         return -EAGAIN;
6031
6032                 ret = io_req_defer_prep(req, sqe, false);
6033                 if (ret) {
6034                         /* fail even hard links since we don't submit */
6035                         head->flags |= REQ_F_FAIL_LINK;
6036                         return ret;
6037                 }
6038                 trace_io_uring_link(ctx, req, head);
6039                 io_get_req_task(req);
6040                 list_add_tail(&req->link_list, &head->link_list);
6041
6042                 /* last request of a link, enqueue the link */
6043                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6044                         io_queue_link_head(head, cs);
6045                         *link = NULL;
6046                 }
6047         } else {
6048                 if (unlikely(ctx->drain_next)) {
6049                         req->flags |= REQ_F_IO_DRAIN;
6050                         ctx->drain_next = 0;
6051                 }
6052                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6053                         req->flags |= REQ_F_LINK_HEAD;
6054                         INIT_LIST_HEAD(&req->link_list);
6055
6056                         if (io_alloc_async_ctx(req))
6057                                 return -EAGAIN;
6058
6059                         ret = io_req_defer_prep(req, sqe, false);
6060                         if (ret)
6061                                 req->flags |= REQ_F_FAIL_LINK;
6062                         *link = req;
6063                 } else {
6064                         io_queue_sqe(req, sqe, cs);
6065                 }
6066         }
6067
6068         return 0;
6069 }
6070
6071 /*
6072  * Batched submission is done, ensure local IO is flushed out.
6073  */
6074 static void io_submit_state_end(struct io_submit_state *state)
6075 {
6076         if (!list_empty(&state->comp.list))
6077                 io_submit_flush_completions(&state->comp);
6078         blk_finish_plug(&state->plug);
6079         io_state_file_put(state);
6080         if (state->free_reqs)
6081                 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
6082 }
6083
6084 /*
6085  * Start submission side cache.
6086  */
6087 static void io_submit_state_start(struct io_submit_state *state,
6088                                   struct io_ring_ctx *ctx, unsigned int max_ios)
6089 {
6090         blk_start_plug(&state->plug);
6091 #ifdef CONFIG_BLOCK
6092         state->plug.nowait = true;
6093 #endif
6094         state->comp.nr = 0;
6095         INIT_LIST_HEAD(&state->comp.list);
6096         state->comp.ctx = ctx;
6097         state->free_reqs = 0;
6098         state->file = NULL;
6099         state->ios_left = max_ios;
6100 }
6101
6102 static void io_commit_sqring(struct io_ring_ctx *ctx)
6103 {
6104         struct io_rings *rings = ctx->rings;
6105
6106         /*
6107          * Ensure any loads from the SQEs are done at this point,
6108          * since once we write the new head, the application could
6109          * write new data to them.
6110          */
6111         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6112 }
6113
6114 /*
6115  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
6116  * that is mapped by userspace. This means that care needs to be taken to
6117  * ensure that reads are stable, as we cannot rely on userspace always
6118  * being a good citizen. If members of the sqe are validated and then later
6119  * used, it's important that those reads are done through READ_ONCE() to
6120  * prevent a re-load down the line.
6121  */
6122 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6123 {
6124         u32 *sq_array = ctx->sq_array;
6125         unsigned head;
6126
6127         /*
6128          * The cached sq head (or cq tail) serves two purposes:
6129          *
6130          * 1) allows us to batch the cost of updating the user visible
6131          *    head updates.
6132          * 2) allows the kernel side to track the head on its own, even
6133          *    though the application is the one updating it.
6134          */
6135         head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
6136         if (likely(head < ctx->sq_entries))
6137                 return &ctx->sq_sqes[head];
6138
6139         /* drop invalid entries */
6140         ctx->cached_sq_dropped++;
6141         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6142         return NULL;
6143 }
6144
6145 static inline void io_consume_sqe(struct io_ring_ctx *ctx)
6146 {
6147         ctx->cached_sq_head++;
6148 }
6149
/* Every sqe->flags bit accepted by io_init_req(); anything else is -EINVAL. */
#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE	| \
			 IOSQE_IO_DRAIN		| \
			 IOSQE_IO_LINK		| \
			 IOSQE_IO_HARDLINK	| \
			 IOSQE_ASYNC		| \
			 IOSQE_BUFFER_SELECT)
6153
6154 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6155                        const struct io_uring_sqe *sqe,
6156                        struct io_submit_state *state)
6157 {
6158         unsigned int sqe_flags;
6159         int id;
6160
6161         /*
6162          * All io need record the previous position, if LINK vs DARIN,
6163          * it can be used to mark the position of the first IO in the
6164          * link list.
6165          */
6166         req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
6167         req->opcode = READ_ONCE(sqe->opcode);
6168         req->user_data = READ_ONCE(sqe->user_data);
6169         req->io = NULL;
6170         req->file = NULL;
6171         req->ctx = ctx;
6172         req->flags = 0;
6173         /* one is dropped after submission, the other at completion */
6174         refcount_set(&req->refs, 2);
6175         req->task = current;
6176         req->result = 0;
6177
6178         if (unlikely(req->opcode >= IORING_OP_LAST))
6179                 return -EINVAL;
6180
6181         if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
6182                 return -EFAULT;
6183
6184         sqe_flags = READ_ONCE(sqe->flags);
6185         /* enforce forwards compatibility on users */
6186         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
6187                 return -EINVAL;
6188
6189         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6190             !io_op_defs[req->opcode].buffer_select)
6191                 return -EOPNOTSUPP;
6192
6193         id = READ_ONCE(sqe->personality);
6194         if (id) {
6195                 io_req_init_async(req);
6196                 req->work.creds = idr_find(&ctx->personality_idr, id);
6197                 if (unlikely(!req->work.creds))
6198                         return -EINVAL;
6199                 get_cred(req->work.creds);
6200         }
6201
6202         /* same numerical values with corresponding REQ_F_*, safe to copy */
6203         req->flags |= sqe_flags;
6204
6205         if (!io_op_defs[req->opcode].needs_file)
6206                 return 0;
6207
6208         return io_req_set_file(state, req, READ_ONCE(sqe->fd));
6209 }
6210
6211 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
6212                           struct file *ring_file, int ring_fd)
6213 {
6214         struct io_submit_state state;
6215         struct io_kiocb *link = NULL;
6216         int i, submitted = 0;
6217
6218         /* if we have a backlog and couldn't flush it all, return BUSY */
6219         if (test_bit(0, &ctx->sq_check_overflow)) {
6220                 if (!list_empty(&ctx->cq_overflow_list) &&
6221                     !io_cqring_overflow_flush(ctx, false))
6222                         return -EBUSY;
6223         }
6224
6225         /* make sure SQ entry isn't read before tail */
6226         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6227
6228         if (!percpu_ref_tryget_many(&ctx->refs, nr))
6229                 return -EAGAIN;
6230
6231         io_submit_state_start(&state, ctx, nr);
6232
6233         ctx->ring_fd = ring_fd;
6234         ctx->ring_file = ring_file;
6235
6236         for (i = 0; i < nr; i++) {
6237                 const struct io_uring_sqe *sqe;
6238                 struct io_kiocb *req;
6239                 int err;
6240
6241                 sqe = io_get_sqe(ctx);
6242                 if (unlikely(!sqe)) {
6243                         io_consume_sqe(ctx);
6244                         break;
6245                 }
6246                 req = io_alloc_req(ctx, &state);
6247                 if (unlikely(!req)) {
6248                         if (!submitted)
6249                                 submitted = -EAGAIN;
6250                         break;
6251                 }
6252
6253                 err = io_init_req(ctx, req, sqe, &state);
6254                 io_consume_sqe(ctx);
6255                 /* will complete beyond this point, count as submitted */
6256                 submitted++;
6257
6258                 if (unlikely(err)) {
6259 fail_req:
6260                         io_put_req(req);
6261                         io_req_complete(req, err);
6262                         break;
6263                 }
6264
6265                 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6266                                                 true, io_async_submit(ctx));
6267                 err = io_submit_sqe(req, sqe, &link, &state.comp);
6268                 if (err)
6269                         goto fail_req;
6270         }
6271
6272         if (unlikely(submitted != nr)) {
6273                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6274
6275                 percpu_ref_put_many(&ctx->refs, nr - ref_used);
6276         }
6277         if (link)
6278                 io_queue_link_head(link, &state.comp);
6279         io_submit_state_end(&state);
6280
6281          /* Commit SQ ring head once we've consumed and submitted all SQEs */
6282         io_commit_sqring(ctx);
6283
6284         return submitted;
6285 }
6286
6287 static int io_sq_thread(void *data)
6288 {
6289         struct io_ring_ctx *ctx = data;
6290         const struct cred *old_cred;
6291         DEFINE_WAIT(wait);
6292         unsigned long timeout;
6293         int ret = 0;
6294
6295         complete(&ctx->sq_thread_comp);
6296
6297         old_cred = override_creds(ctx->creds);
6298
6299         timeout = jiffies + ctx->sq_thread_idle;
6300         while (!kthread_should_park()) {
6301                 unsigned int to_submit;
6302
6303                 if (!list_empty(&ctx->poll_list)) {
6304                         unsigned nr_events = 0;
6305
6306                         mutex_lock(&ctx->uring_lock);
6307                         if (!list_empty(&ctx->poll_list))
6308                                 io_iopoll_getevents(ctx, &nr_events, 0);
6309                         else
6310                                 timeout = jiffies + ctx->sq_thread_idle;
6311                         mutex_unlock(&ctx->uring_lock);
6312                 }
6313
6314                 to_submit = io_sqring_entries(ctx);
6315
6316                 /*
6317                  * If submit got -EBUSY, flag us as needing the application
6318                  * to enter the kernel to reap and flush events.
6319                  */
6320                 if (!to_submit || ret == -EBUSY || need_resched()) {
6321                         /*
6322                          * Drop cur_mm before scheduling, we can't hold it for
6323                          * long periods (or over schedule()). Do this before
6324                          * adding ourselves to the waitqueue, as the unuse/drop
6325                          * may sleep.
6326                          */
6327                         io_sq_thread_drop_mm(ctx);
6328
6329                         /*
6330                          * We're polling. If we're within the defined idle
6331                          * period, then let us spin without work before going
6332                          * to sleep. The exception is if we got EBUSY doing
6333                          * more IO, we should wait for the application to
6334                          * reap events and wake us up.
6335                          */
6336                         if (!list_empty(&ctx->poll_list) || need_resched() ||
6337                             (!time_after(jiffies, timeout) && ret != -EBUSY &&
6338                             !percpu_ref_is_dying(&ctx->refs))) {
6339                                 if (current->task_works)
6340                                         task_work_run();
6341                                 cond_resched();
6342                                 continue;
6343                         }
6344
6345                         prepare_to_wait(&ctx->sqo_wait, &wait,
6346                                                 TASK_INTERRUPTIBLE);
6347
6348                         /*
6349                          * While doing polled IO, before going to sleep, we need
6350                          * to check if there are new reqs added to poll_list, it
6351                          * is because reqs may have been punted to io worker and
6352                          * will be added to poll_list later, hence check the
6353                          * poll_list again.
6354                          */
6355                         if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6356                             !list_empty_careful(&ctx->poll_list)) {
6357                                 finish_wait(&ctx->sqo_wait, &wait);
6358                                 continue;
6359                         }
6360
6361                         /* Tell userspace we may need a wakeup call */
6362                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6363                         /* make sure to read SQ tail after writing flags */
6364                         smp_mb();
6365
6366                         to_submit = io_sqring_entries(ctx);
6367                         if (!to_submit || ret == -EBUSY) {
6368                                 if (kthread_should_park()) {
6369                                         finish_wait(&ctx->sqo_wait, &wait);
6370                                         break;
6371                                 }
6372                                 if (current->task_works) {
6373                                         task_work_run();
6374                                         finish_wait(&ctx->sqo_wait, &wait);
6375                                         continue;
6376                                 }
6377                                 if (signal_pending(current))
6378                                         flush_signals(current);
6379                                 schedule();
6380                                 finish_wait(&ctx->sqo_wait, &wait);
6381
6382                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6383                                 ret = 0;
6384                                 continue;
6385                         }
6386                         finish_wait(&ctx->sqo_wait, &wait);
6387
6388                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6389                 }
6390
6391                 mutex_lock(&ctx->uring_lock);
6392                 if (likely(!percpu_ref_is_dying(&ctx->refs)))
6393                         ret = io_submit_sqes(ctx, to_submit, NULL, -1);
6394                 mutex_unlock(&ctx->uring_lock);
6395                 timeout = jiffies + ctx->sq_thread_idle;
6396         }
6397
6398         if (current->task_works)
6399                 task_work_run();
6400
6401         io_sq_thread_drop_mm(ctx);
6402         revert_creds(old_cred);
6403
6404         kthread_parkme();
6405
6406         return 0;
6407 }
6408
6409 struct io_wait_queue {
6410         struct wait_queue_entry wq;
6411         struct io_ring_ctx *ctx;
6412         unsigned to_wait;
6413         unsigned nr_timeouts;
6414 };
6415
6416 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
6417 {
6418         struct io_ring_ctx *ctx = iowq->ctx;
6419
6420         /*
6421          * Wake up if we have enough events, or if a timeout occurred since we
6422          * started waiting. For timeouts, we always want to return to userspace,
6423          * regardless of event count.
6424          */
6425         return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
6426                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6427 }
6428
6429 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6430                             int wake_flags, void *key)
6431 {
6432         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6433                                                         wq);
6434
6435         /* use noflush == true, as we can't safely rely on locking context */
6436         if (!io_should_wake(iowq, true))
6437                 return -1;
6438
6439         return autoremove_wake_function(curr, mode, wake_flags, key);
6440 }
6441
6442 /*
6443  * Wait until events become available, if we don't already have some. The
6444  * application must reap them itself, as they reside on the shared cq ring.
6445  */
6446 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6447                           const sigset_t __user *sig, size_t sigsz)
6448 {
6449         struct io_wait_queue iowq = {
6450                 .wq = {
6451                         .private        = current,
6452                         .func           = io_wake_function,
6453                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
6454                 },
6455                 .ctx            = ctx,
6456                 .to_wait        = min_events,
6457         };
6458         struct io_rings *rings = ctx->rings;
6459         int ret = 0;
6460
6461         do {
6462                 if (io_cqring_events(ctx, false) >= min_events)
6463                         return 0;
6464                 if (!current->task_works)
6465                         break;
6466                 task_work_run();
6467         } while (1);
6468
6469         if (sig) {
6470 #ifdef CONFIG_COMPAT
6471                 if (in_compat_syscall())
6472                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
6473                                                       sigsz);
6474                 else
6475 #endif
6476                         ret = set_user_sigmask(sig, sigsz);
6477
6478                 if (ret)
6479                         return ret;
6480         }
6481
6482         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
6483         trace_io_uring_cqring_wait(ctx, min_events);
6484         do {
6485                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6486                                                 TASK_INTERRUPTIBLE);
6487                 if (current->task_works)
6488                         task_work_run();
6489                 if (io_should_wake(&iowq, false))
6490                         break;
6491                 schedule();
6492                 if (signal_pending(current)) {
6493                         ret = -EINTR;
6494                         break;
6495                 }
6496         } while (1);
6497         finish_wait(&ctx->wait, &iowq.wq);
6498
6499         restore_saved_sigmask_unless(ret == -EINTR);
6500
6501         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
6502 }
6503
6504 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6505 {
6506 #if defined(CONFIG_UNIX)
6507         if (ctx->ring_sock) {
6508                 struct sock *sock = ctx->ring_sock->sk;
6509                 struct sk_buff *skb;
6510
6511                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6512                         kfree_skb(skb);
6513         }
6514 #else
6515         int i;
6516
6517         for (i = 0; i < ctx->nr_user_files; i++) {
6518                 struct file *file;
6519
6520                 file = io_file_from_index(ctx, i);
6521                 if (file)
6522                         fput(file);
6523         }
6524 #endif
6525 }
6526
6527 static void io_file_ref_kill(struct percpu_ref *ref)
6528 {
6529         struct fixed_file_data *data;
6530
6531         data = container_of(ref, struct fixed_file_data, refs);
6532         complete(&data->done);
6533 }
6534
6535 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
6536 {
6537         struct fixed_file_data *data = ctx->file_data;
6538         struct fixed_file_ref_node *ref_node = NULL;
6539         unsigned nr_tables, i;
6540
6541         if (!data)
6542                 return -ENXIO;
6543
6544         spin_lock(&data->lock);
6545         if (!list_empty(&data->ref_list))
6546                 ref_node = list_first_entry(&data->ref_list,
6547                                 struct fixed_file_ref_node, node);
6548         spin_unlock(&data->lock);
6549         if (ref_node)
6550                 percpu_ref_kill(&ref_node->refs);
6551
6552         percpu_ref_kill(&data->refs);
6553
6554         /* wait for all refs nodes to complete */
6555         flush_delayed_work(&ctx->file_put_work);
6556         wait_for_completion(&data->done);
6557
6558         __io_sqe_files_unregister(ctx);
6559         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
6560         for (i = 0; i < nr_tables; i++)
6561                 kfree(data->table[i].files);
6562         kfree(data->table);
6563         percpu_ref_exit(&data->refs);
6564         kfree(data);
6565         ctx->file_data = NULL;
6566         ctx->nr_user_files = 0;
6567         return 0;
6568 }
6569
6570 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
6571 {
6572         if (ctx->sqo_thread) {
6573                 wait_for_completion(&ctx->sq_thread_comp);
6574                 /*
6575                  * The park is a bit of a work-around, without it we get
6576                  * warning spews on shutdown with SQPOLL set and affinity
6577                  * set to a single CPU.
6578                  */
6579                 kthread_park(ctx->sqo_thread);
6580                 kthread_stop(ctx->sqo_thread);
6581                 ctx->sqo_thread = NULL;
6582         }
6583 }
6584
6585 static void io_finish_async(struct io_ring_ctx *ctx)
6586 {
6587         io_sq_thread_stop(ctx);
6588
6589         if (ctx->io_wq) {
6590                 io_wq_destroy(ctx->io_wq);
6591                 ctx->io_wq = NULL;
6592         }
6593 }
6594
6595 #if defined(CONFIG_UNIX)
6596 /*
6597  * Ensure the UNIX gc is aware of our file set, so we are certain that
6598  * the io_uring can be safely unregistered on process exit, even if we have
6599  * loops in the file referencing.
6600  */
6601 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
6602 {
6603         struct sock *sk = ctx->ring_sock->sk;
6604         struct scm_fp_list *fpl;
6605         struct sk_buff *skb;
6606         int i, nr_files;
6607
6608         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
6609         if (!fpl)
6610                 return -ENOMEM;
6611
6612         skb = alloc_skb(0, GFP_KERNEL);
6613         if (!skb) {
6614                 kfree(fpl);
6615                 return -ENOMEM;
6616         }
6617
6618         skb->sk = sk;
6619
6620         nr_files = 0;
6621         fpl->user = get_uid(ctx->user);
6622         for (i = 0; i < nr; i++) {
6623                 struct file *file = io_file_from_index(ctx, i + offset);
6624
6625                 if (!file)
6626                         continue;
6627                 fpl->fp[nr_files] = get_file(file);
6628                 unix_inflight(fpl->user, fpl->fp[nr_files]);
6629                 nr_files++;
6630         }
6631
6632         if (nr_files) {
6633                 fpl->max = SCM_MAX_FD;
6634                 fpl->count = nr_files;
6635                 UNIXCB(skb).fp = fpl;
6636                 skb->destructor = unix_destruct_scm;
6637                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
6638                 skb_queue_head(&sk->sk_receive_queue, skb);
6639
6640                 for (i = 0; i < nr_files; i++)
6641                         fput(fpl->fp[i]);
6642         } else {
6643                 kfree_skb(skb);
6644                 kfree(fpl);
6645         }
6646
6647         return 0;
6648 }
6649
6650 /*
6651  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
6652  * causes regular reference counting to break down. We rely on the UNIX
6653  * garbage collection to take care of this problem for us.
6654  */
6655 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6656 {
6657         unsigned left, total;
6658         int ret = 0;
6659
6660         total = 0;
6661         left = ctx->nr_user_files;
6662         while (left) {
6663                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6664
6665                 ret = __io_sqe_files_scm(ctx, this_files, total);
6666                 if (ret)
6667                         break;
6668                 left -= this_files;
6669                 total += this_files;
6670         }
6671
6672         if (!ret)
6673                 return 0;
6674
6675         while (total < ctx->nr_user_files) {
6676                 struct file *file = io_file_from_index(ctx, total);
6677
6678                 if (file)
6679                         fput(file);
6680                 total++;
6681         }
6682
6683         return ret;
6684 }
6685 #else
/* Without CONFIG_UNIX there is nothing to account: always succeeds. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	return 0;
}
6690 #endif
6691
6692 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
6693                                     unsigned nr_files)
6694 {
6695         int i;
6696
6697         for (i = 0; i < nr_tables; i++) {
6698                 struct fixed_file_table *table = &ctx->file_data->table[i];
6699                 unsigned this_files;
6700
6701                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
6702                 table->files = kcalloc(this_files, sizeof(struct file *),
6703                                         GFP_KERNEL);
6704                 if (!table->files)
6705                         break;
6706                 nr_files -= this_files;
6707         }
6708
6709         if (i == nr_tables)
6710                 return 0;
6711
6712         for (i = 0; i < nr_tables; i++) {
6713                 struct fixed_file_table *table = &ctx->file_data->table[i];
6714                 kfree(table->files);
6715         }
6716         return 1;
6717 }
6718
/*
 * Drop the ring's reference to a registered @file.
 *
 * With CONFIG_UNIX the file lives in the fp list of one of the skbs on the
 * ring socket's receive queue: find it, take it out of in-flight
 * accounting, remove it from the fp list and put it. Without CONFIG_UNIX
 * the file is simply put.
 */
static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head list, *head = &sock->sk_receive_queue;
	struct sk_buff *skb;
	int i;

	/* skbs that were inspected are parked here until we are done */
	__skb_queue_head_init(&list);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	for (skb = skb_dequeue(head); skb; skb = skb_dequeue(head)) {
		struct scm_fp_list *fp = UNIXCB(skb).fp;

		for (i = 0; i < fp->count; i++) {
			int tail;

			if (fp->fp[i] != file)
				continue;

			unix_notinflight(fp->user, fp->fp[i]);

			/* close the gap left by the removed entry */
			tail = fp->count - 1 - i;
			if (tail)
				memmove(&fp->fp[i], &fp->fp[i + 1],
					tail * sizeof(struct file *));
			fp->count--;

			/* an skb with no files left is freed, not requeued */
			if (fp->count)
				__skb_queue_tail(&list, skb);
			else
				kfree_skb(skb);

			fput(file);
			/* a NULL file marks "found" for the outer loop */
			file = NULL;
			break;
		}

		if (!file)
			break;

		__skb_queue_tail(&list, skb);
	}

	/* move the parked skbs back onto the socket's receive queue */
	if (skb_peek(&list)) {
		spin_lock_irq(&head->lock);
		while ((skb = __skb_dequeue(&list)) != NULL)
			__skb_queue_tail(head, skb);
		spin_unlock_irq(&head->lock);
	}
#else
	fput(file);
#endif
}
6780
6781 struct io_file_put {
6782         struct list_head list;
6783         struct file *file;
6784 };
6785
6786 static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
6787 {
6788         struct fixed_file_data *file_data = ref_node->file_data;
6789         struct io_ring_ctx *ctx = file_data->ctx;
6790         struct io_file_put *pfile, *tmp;
6791
6792         list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
6793                 list_del(&pfile->list);
6794                 io_ring_file_put(ctx, pfile->file);
6795                 kfree(pfile);
6796         }
6797
6798         spin_lock(&file_data->lock);
6799         list_del(&ref_node->node);
6800         spin_unlock(&file_data->lock);
6801
6802         percpu_ref_exit(&ref_node->refs);
6803         kfree(ref_node);
6804         percpu_ref_put(&file_data->refs);
6805 }
6806
6807 static void io_file_put_work(struct work_struct *work)
6808 {
6809         struct io_ring_ctx *ctx;
6810         struct llist_node *node;
6811
6812         ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
6813         node = llist_del_all(&ctx->file_put_llist);
6814
6815         while (node) {
6816                 struct fixed_file_ref_node *ref_node;
6817                 struct llist_node *next = node->next;
6818
6819                 ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
6820                 __io_file_put_work(ref_node);
6821                 node = next;
6822         }
6823 }
6824
6825 static void io_file_data_ref_zero(struct percpu_ref *ref)
6826 {
6827         struct fixed_file_ref_node *ref_node;
6828         struct io_ring_ctx *ctx;
6829         bool first_add;
6830         int delay = HZ;
6831
6832         ref_node = container_of(ref, struct fixed_file_ref_node, refs);
6833         ctx = ref_node->file_data->ctx;
6834
6835         if (percpu_ref_is_dying(&ctx->file_data->refs))
6836                 delay = 0;
6837
6838         first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
6839         if (!delay)
6840                 mod_delayed_work(system_wq, &ctx->file_put_work, 0);
6841         else if (first_add)
6842                 queue_delayed_work(system_wq, &ctx->file_put_work, delay);
6843 }
6844
6845 static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
6846                         struct io_ring_ctx *ctx)
6847 {
6848         struct fixed_file_ref_node *ref_node;
6849
6850         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
6851         if (!ref_node)
6852                 return ERR_PTR(-ENOMEM);
6853
6854         if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
6855                             0, GFP_KERNEL)) {
6856                 kfree(ref_node);
6857                 return ERR_PTR(-ENOMEM);
6858         }
6859         INIT_LIST_HEAD(&ref_node->node);
6860         INIT_LIST_HEAD(&ref_node->file_list);
6861         ref_node->file_data = ctx->file_data;
6862         return ref_node;
6863 }
6864
/*
 * Free a ref node obtained from alloc_fixed_file_ref_node(): release its
 * percpu ref, then the node itself. Does not touch the node's list linkage.
 */
static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
{
	percpu_ref_exit(&ref_node->refs);
	kfree(ref_node);
}
6870
6871 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
6872                                  unsigned nr_args)
6873 {
6874         __s32 __user *fds = (__s32 __user *) arg;
6875         unsigned nr_tables;
6876         struct file *file;
6877         int fd, ret = 0;
6878         unsigned i;
6879         struct fixed_file_ref_node *ref_node;
6880
6881         if (ctx->file_data)
6882                 return -EBUSY;
6883         if (!nr_args)
6884                 return -EINVAL;
6885         if (nr_args > IORING_MAX_FIXED_FILES)
6886                 return -EMFILE;
6887
6888         ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
6889         if (!ctx->file_data)
6890                 return -ENOMEM;
6891         ctx->file_data->ctx = ctx;
6892         init_completion(&ctx->file_data->done);
6893         INIT_LIST_HEAD(&ctx->file_data->ref_list);
6894         spin_lock_init(&ctx->file_data->lock);
6895
6896         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
6897         ctx->file_data->table = kcalloc(nr_tables,
6898                                         sizeof(struct fixed_file_table),
6899                                         GFP_KERNEL);
6900         if (!ctx->file_data->table) {
6901                 kfree(ctx->file_data);
6902                 ctx->file_data = NULL;
6903                 return -ENOMEM;
6904         }
6905
6906         if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
6907                                 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6908                 kfree(ctx->file_data->table);
6909                 kfree(ctx->file_data);
6910                 ctx->file_data = NULL;
6911                 return -ENOMEM;
6912         }
6913
6914         if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
6915                 percpu_ref_exit(&ctx->file_data->refs);
6916                 kfree(ctx->file_data->table);
6917                 kfree(ctx->file_data);
6918                 ctx->file_data = NULL;
6919                 return -ENOMEM;
6920         }
6921
6922         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
6923                 struct fixed_file_table *table;
6924                 unsigned index;
6925
6926                 ret = -EFAULT;
6927                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
6928                         break;
6929                 /* allow sparse sets */
6930                 if (fd == -1) {
6931                         ret = 0;
6932                         continue;
6933                 }
6934
6935                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6936                 index = i & IORING_FILE_TABLE_MASK;
6937                 file = fget(fd);
6938
6939                 ret = -EBADF;
6940                 if (!file)
6941                         break;
6942
6943                 /*
6944                  * Don't allow io_uring instances to be registered. If UNIX
6945                  * isn't enabled, then this causes a reference cycle and this
6946                  * instance can never get freed. If UNIX is enabled we'll
6947                  * handle it just fine, but there's still no point in allowing
6948                  * a ring fd as it doesn't support regular read/write anyway.
6949                  */
6950                 if (file->f_op == &io_uring_fops) {
6951                         fput(file);
6952                         break;
6953                 }
6954                 ret = 0;
6955                 table->files[index] = file;
6956         }
6957
6958         if (ret) {
6959                 for (i = 0; i < ctx->nr_user_files; i++) {
6960                         file = io_file_from_index(ctx, i);
6961                         if (file)
6962                                 fput(file);
6963                 }
6964                 for (i = 0; i < nr_tables; i++)
6965                         kfree(ctx->file_data->table[i].files);
6966
6967                 kfree(ctx->file_data->table);
6968                 kfree(ctx->file_data);
6969                 ctx->file_data = NULL;
6970                 ctx->nr_user_files = 0;
6971                 return ret;
6972         }
6973
6974         ret = io_sqe_files_scm(ctx);
6975         if (ret) {
6976                 io_sqe_files_unregister(ctx);
6977                 return ret;
6978         }
6979
6980         ref_node = alloc_fixed_file_ref_node(ctx);
6981         if (IS_ERR(ref_node)) {
6982                 io_sqe_files_unregister(ctx);
6983                 return PTR_ERR(ref_node);
6984         }
6985
6986         ctx->file_data->cur_refs = &ref_node->refs;
6987         spin_lock(&ctx->file_data->lock);
6988         list_add(&ref_node->node, &ctx->file_data->ref_list);
6989         spin_unlock(&ctx->file_data->lock);
6990         percpu_ref_get(&ctx->file_data->refs);
6991         return ret;
6992 }
6993
/*
 * Account a single fixed file with the ring's UNIX socket.
 *
 * With CONFIG_UNIX the file is either appended to the SCM_RIGHTS file list
 * of the skb at the head of the ring socket's receive queue, or, if there is
 * no such skb or its list is full, handed to __io_sqe_files_scm() for slot
 * @index. Without CONFIG_UNIX this is a no-op.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *head = &sock->sk_receive_queue;
	struct sk_buff *skb;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&head->lock);
	skb = skb_peek(head);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			/*
			 * Take the skb off the queue and update its file list
			 * with the queue lock dropped, then put it back at the
			 * head.
			 */
			__skb_unlink(skb, head);
			spin_unlock_irq(&head->lock);
			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;
			spin_lock_irq(&head->lock);
			__skb_queue_head(head, skb);
		} else {
			skb = NULL;
		}
	}
	spin_unlock_irq(&head->lock);

	if (skb) {
		/* the file list took its own reference via get_file() */
		fput(file);
		return 0;
	}

	return __io_sqe_files_scm(ctx, 1, index);
#else
	return 0;
#endif
}
7036
7037 static int io_queue_file_removal(struct fixed_file_data *data,
7038                                  struct file *file)
7039 {
7040         struct io_file_put *pfile;
7041         struct percpu_ref *refs = data->cur_refs;
7042         struct fixed_file_ref_node *ref_node;
7043
7044         pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
7045         if (!pfile)
7046                 return -ENOMEM;
7047
7048         ref_node = container_of(refs, struct fixed_file_ref_node, refs);
7049         pfile->file = file;
7050         list_add(&pfile->list, &ref_node->file_list);
7051
7052         return 0;
7053 }
7054
7055 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7056                                  struct io_uring_files_update *up,
7057                                  unsigned nr_args)
7058 {
7059         struct fixed_file_data *data = ctx->file_data;
7060         struct fixed_file_ref_node *ref_node;
7061         struct file *file;
7062         __s32 __user *fds;
7063         int fd, i, err;
7064         __u32 done;
7065         bool needs_switch = false;
7066
7067         if (check_add_overflow(up->offset, nr_args, &done))
7068                 return -EOVERFLOW;
7069         if (done > ctx->nr_user_files)
7070                 return -EINVAL;
7071
7072         ref_node = alloc_fixed_file_ref_node(ctx);
7073         if (IS_ERR(ref_node))
7074                 return PTR_ERR(ref_node);
7075
7076         done = 0;
7077         fds = u64_to_user_ptr(up->fds);
7078         while (nr_args) {
7079                 struct fixed_file_table *table;
7080                 unsigned index;
7081
7082                 err = 0;
7083                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7084                         err = -EFAULT;
7085                         break;
7086                 }
7087                 i = array_index_nospec(up->offset, ctx->nr_user_files);
7088                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7089                 index = i & IORING_FILE_TABLE_MASK;
7090                 if (table->files[index]) {
7091                         file = io_file_from_index(ctx, index);
7092                         err = io_queue_file_removal(data, file);
7093                         if (err)
7094                                 break;
7095                         table->files[index] = NULL;
7096                         needs_switch = true;
7097                 }
7098                 if (fd != -1) {
7099                         file = fget(fd);
7100                         if (!file) {
7101                                 err = -EBADF;
7102                                 break;
7103                         }
7104                         /*
7105                          * Don't allow io_uring instances to be registered. If
7106                          * UNIX isn't enabled, then this causes a reference
7107                          * cycle and this instance can never get freed. If UNIX
7108                          * is enabled we'll handle it just fine, but there's
7109                          * still no point in allowing a ring fd as it doesn't
7110                          * support regular read/write anyway.
7111                          */
7112                         if (file->f_op == &io_uring_fops) {
7113                                 fput(file);
7114                                 err = -EBADF;
7115                                 break;
7116                         }
7117                         table->files[index] = file;
7118                         err = io_sqe_file_register(ctx, file, i);
7119                         if (err)
7120                                 break;
7121                 }
7122                 nr_args--;
7123                 done++;
7124                 up->offset++;
7125         }
7126
7127         if (needs_switch) {
7128                 percpu_ref_kill(data->cur_refs);
7129                 spin_lock(&data->lock);
7130                 list_add(&ref_node->node, &data->ref_list);
7131                 data->cur_refs = &ref_node->refs;
7132                 spin_unlock(&data->lock);
7133                 percpu_ref_get(&ctx->file_data->refs);
7134         } else
7135                 destroy_fixed_file_ref_node(ref_node);
7136
7137         return done ? done : err;
7138 }
7139
7140 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
7141                                unsigned nr_args)
7142 {
7143         struct io_uring_files_update up;
7144
7145         if (!ctx->file_data)
7146                 return -ENXIO;
7147         if (!nr_args)
7148                 return -EINVAL;
7149         if (copy_from_user(&up, arg, sizeof(up)))
7150                 return -EFAULT;
7151         if (up.resv)
7152                 return -EINVAL;
7153
7154         return __io_sqe_files_update(ctx, &up, nr_args);
7155 }
7156
7157 static void io_free_work(struct io_wq_work *work)
7158 {
7159         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7160
7161         /* Consider that io_steal_work() relies on this ref */
7162         io_put_req(req);
7163 }
7164
7165 static int io_init_wq_offload(struct io_ring_ctx *ctx,
7166                               struct io_uring_params *p)
7167 {
7168         struct io_wq_data data;
7169         struct fd f;
7170         struct io_ring_ctx *ctx_attach;
7171         unsigned int concurrency;
7172         int ret = 0;
7173
7174         data.user = ctx->user;
7175         data.free_work = io_free_work;
7176         data.do_work = io_wq_submit_work;
7177
7178         if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
7179                 /* Do QD, or 4 * CPUS, whatever is smallest */
7180                 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7181
7182                 ctx->io_wq = io_wq_create(concurrency, &data);
7183                 if (IS_ERR(ctx->io_wq)) {
7184                         ret = PTR_ERR(ctx->io_wq);
7185                         ctx->io_wq = NULL;
7186                 }
7187                 return ret;
7188         }
7189
7190         f = fdget(p->wq_fd);
7191         if (!f.file)
7192                 return -EBADF;
7193
7194         if (f.file->f_op != &io_uring_fops) {
7195                 ret = -EINVAL;
7196                 goto out_fput;
7197         }
7198
7199         ctx_attach = f.file->private_data;
7200         /* @io_wq is protected by holding the fd */
7201         if (!io_wq_get(ctx_attach->io_wq, &data)) {
7202                 ret = -EINVAL;
7203                 goto out_fput;
7204         }
7205
7206         ctx->io_wq = ctx_attach->io_wq;
7207 out_fput:
7208         fdput(f);
7209         return ret;
7210 }
7211
7212 static int io_sq_offload_start(struct io_ring_ctx *ctx,
7213                                struct io_uring_params *p)
7214 {
7215         int ret;
7216
7217         mmgrab(current->mm);
7218         ctx->sqo_mm = current->mm;
7219
7220         if (ctx->flags & IORING_SETUP_SQPOLL) {
7221                 ret = -EPERM;
7222                 if (!capable(CAP_SYS_ADMIN))
7223                         goto err;
7224
7225                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
7226                 if (!ctx->sq_thread_idle)
7227                         ctx->sq_thread_idle = HZ;
7228
7229                 if (p->flags & IORING_SETUP_SQ_AFF) {
7230                         int cpu = p->sq_thread_cpu;
7231
7232                         ret = -EINVAL;
7233                         if (cpu >= nr_cpu_ids)
7234                                 goto err;
7235                         if (!cpu_online(cpu))
7236                                 goto err;
7237
7238                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
7239                                                         ctx, cpu,
7240                                                         "io_uring-sq");
7241                 } else {
7242                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
7243                                                         "io_uring-sq");
7244                 }
7245                 if (IS_ERR(ctx->sqo_thread)) {
7246                         ret = PTR_ERR(ctx->sqo_thread);
7247                         ctx->sqo_thread = NULL;
7248                         goto err;
7249                 }
7250                 wake_up_process(ctx->sqo_thread);
7251         } else if (p->flags & IORING_SETUP_SQ_AFF) {
7252                 /* Can't have SQ_AFF without SQPOLL */
7253                 ret = -EINVAL;
7254                 goto err;
7255         }
7256
7257         ret = io_init_wq_offload(ctx, p);
7258         if (ret)
7259                 goto err;
7260
7261         return 0;
7262 err:
7263         io_finish_async(ctx);
7264         mmdrop(ctx->sqo_mm);
7265         ctx->sqo_mm = NULL;
7266         return ret;
7267 }
7268
/* Subtract @nr_pages from @user's locked_vm page count. */
static inline void __io_unaccount_mem(struct user_struct *user,
				      unsigned long nr_pages)
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}
7274
7275 static inline int __io_account_mem(struct user_struct *user,
7276                                    unsigned long nr_pages)
7277 {
7278         unsigned long page_limit, cur_pages, new_pages;
7279
7280         /* Don't allow more pages than we can safely lock */
7281         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7282
7283         do {
7284                 cur_pages = atomic_long_read(&user->locked_vm);
7285                 new_pages = cur_pages + nr_pages;
7286                 if (new_pages > page_limit)
7287                         return -ENOMEM;
7288         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7289                                         new_pages) != cur_pages);
7290
7291         return 0;
7292 }
7293
7294 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7295                              enum io_mem_account acct)
7296 {
7297         if (ctx->limit_mem)
7298                 __io_unaccount_mem(ctx->user, nr_pages);
7299
7300         if (ctx->sqo_mm) {
7301                 if (acct == ACCT_LOCKED)
7302                         ctx->sqo_mm->locked_vm -= nr_pages;
7303                 else if (acct == ACCT_PINNED)
7304                         atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
7305         }
7306 }
7307
7308 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7309                           enum io_mem_account acct)
7310 {
7311         int ret;
7312
7313         if (ctx->limit_mem) {
7314                 ret = __io_account_mem(ctx->user, nr_pages);
7315                 if (ret)
7316                         return ret;
7317         }
7318
7319         if (ctx->sqo_mm) {
7320                 if (acct == ACCT_LOCKED)
7321                         ctx->sqo_mm->locked_vm += nr_pages;
7322                 else if (acct == ACCT_PINNED)
7323                         atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
7324         }
7325
7326         return 0;
7327 }
7328
/*
 * Drop a reference on the compound page backing @ptr and free the page when
 * that was the last reference. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	if (ptr) {
		struct page *head = virt_to_head_page(ptr);

		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
7340
7341 static void *io_mem_alloc(size_t size)
7342 {
7343         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
7344                                 __GFP_NORETRY;
7345
7346         return (void *) __get_free_pages(gfp_flags, get_order(size));
7347 }
7348
7349 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
7350                                 size_t *sq_offset)
7351 {
7352         struct io_rings *rings;
7353         size_t off, sq_array_size;
7354
7355         off = struct_size(rings, cqes, cq_entries);
7356         if (off == SIZE_MAX)
7357                 return SIZE_MAX;
7358
7359 #ifdef CONFIG_SMP
7360         off = ALIGN(off, SMP_CACHE_BYTES);
7361         if (off == 0)
7362                 return SIZE_MAX;
7363 #endif
7364
7365         sq_array_size = array_size(sizeof(u32), sq_entries);
7366         if (sq_array_size == SIZE_MAX)
7367                 return SIZE_MAX;
7368
7369         if (check_add_overflow(off, sq_array_size, &off))
7370                 return SIZE_MAX;
7371
7372         if (sq_offset)
7373                 *sq_offset = off;
7374
7375         return off;
7376 }
7377
7378 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
7379 {
7380         size_t pages;
7381
7382         pages = (size_t)1 << get_order(
7383                 rings_size(sq_entries, cq_entries, NULL));
7384         pages += (size_t)1 << get_order(
7385                 array_size(sizeof(struct io_uring_sqe), sq_entries));
7386
7387         return pages;
7388 }
7389
7390 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
7391 {
7392         int i, j;
7393
7394         if (!ctx->user_bufs)
7395                 return -ENXIO;
7396
7397         for (i = 0; i < ctx->nr_user_bufs; i++) {
7398                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7399
7400                 for (j = 0; j < imu->nr_bvecs; j++)
7401                         unpin_user_page(imu->bvec[j].bv_page);
7402
7403                 io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
7404                 kvfree(imu->bvec);
7405                 imu->nr_bvecs = 0;
7406         }
7407
7408         kfree(ctx->user_bufs);
7409         ctx->user_bufs = NULL;
7410         ctx->nr_user_bufs = 0;
7411         return 0;
7412 }
7413
7414 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
7415                        void __user *arg, unsigned index)
7416 {
7417         struct iovec __user *src;
7418
7419 #ifdef CONFIG_COMPAT
7420         if (ctx->compat) {
7421                 struct compat_iovec __user *ciovs;
7422                 struct compat_iovec ciov;
7423
7424                 ciovs = (struct compat_iovec __user *) arg;
7425                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
7426                         return -EFAULT;
7427
7428                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
7429                 dst->iov_len = ciov.iov_len;
7430                 return 0;
7431         }
7432 #endif
7433         src = (struct iovec __user *) arg;
7434         if (copy_from_user(dst, &src[index], sizeof(*dst)))
7435                 return -EFAULT;
7436         return 0;
7437 }
7438
/*
 * Register @nr_args fixed buffers described by the user iovec array at @arg.
 *
 * For each iovec the covered pages are accounted as pinned, pinned with
 * FOLL_WRITE | FOLL_LONGTERM, and described by a bio_vec array stored in
 * ctx->user_bufs[i]. File-backed mappings other than hugetlbfs are refused.
 *
 * Returns 0 on success. Fails with -EBUSY if buffers are already
 * registered, -EINVAL for a zero or too large @nr_args, -EFAULT for an
 * unreadable, NULL-based, empty or over-1G iovec or a short pin,
 * -EOPNOTSUPP for file-backed memory, -ENOMEM on allocation failure, or
 * the error from io_account_mem() / pin_user_pages(). On any failure all
 * buffers registered so far are torn down again.
 */
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
				  unsigned nr_args)
{
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	int i, j, got_pages = 0;
	int ret = -EINVAL;

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > UIO_MAXIOV)
		return -EINVAL;

	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
					GFP_KERNEL);
	if (!ctx->user_bufs)
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
		unsigned long off, start, end, ubuf;
		int pret, nr_pages;
		struct iovec iov;
		size_t size;

		ret = io_copy_iov(ctx, &iov, arg, i);
		if (ret)
			goto err;

		/*
		 * Don't impose further limits on the size and buffer
		 * constraints here, we'll -EINVAL later when IO is
		 * submitted if they are wrong.
		 */
		ret = -EFAULT;
		if (!iov.iov_base || !iov.iov_len)
			goto err;

		/* arbitrary limit, but we need something */
		if (iov.iov_len > SZ_1G)
			goto err;

		/*
		 * Page span of the buffer, rounding the end up.
		 * NOTE(review): ubuf + iov_len is not checked for wrap-around
		 * here; presumably a wrapped range is rejected when pinning
		 * fails - confirm.
		 */
		ubuf = (unsigned long) iov.iov_base;
		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		start = ubuf >> PAGE_SHIFT;
		nr_pages = end - start;

		ret = io_account_mem(ctx, nr_pages, ACCT_PINNED);
		if (ret)
			goto err;

		ret = 0;
		/* scratch arrays are reused across buffers, grown on demand */
		if (!pages || nr_pages > got_pages) {
			kvfree(vmas);
			kvfree(pages);
			pages = kvmalloc_array(nr_pages, sizeof(struct page *),
						GFP_KERNEL);
			vmas = kvmalloc_array(nr_pages,
					sizeof(struct vm_area_struct *),
					GFP_KERNEL);
			if (!pages || !vmas) {
				ret = -ENOMEM;
				io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
				goto err;
			}
			got_pages = nr_pages;
		}

		imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
						GFP_KERNEL);
		ret = -ENOMEM;
		if (!imu->bvec) {
			io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
			goto err;
		}

		ret = 0;
		mmap_read_lock(current->mm);
		pret = pin_user_pages(ubuf, nr_pages,
				      FOLL_WRITE | FOLL_LONGTERM,
				      pages, vmas);
		if (pret == nr_pages) {
			/* don't support file backed memory */
			for (j = 0; j < nr_pages; j++) {
				struct vm_area_struct *vma = vmas[j];

				if (vma->vm_file &&
				    !is_file_hugepages(vma->vm_file)) {
					ret = -EOPNOTSUPP;
					break;
				}
			}
		} else {
			ret = pret < 0 ? pret : -EFAULT;
		}
		mmap_read_unlock(current->mm);
		if (ret) {
			/*
			 * if we did partial map, or found file backed vmas,
			 * release any pages we did get
			 */
			if (pret > 0)
				unpin_user_pages(pages, pret);
			io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
			/*
			 * This buffer is not counted in nr_user_bufs yet, so
			 * the unregister call on the error path will not
			 * touch its bvec; free it here.
			 */
			kvfree(imu->bvec);
			goto err;
		}

		/* first segment starts at the in-page offset of ubuf */
		off = ubuf & ~PAGE_MASK;
		size = iov.iov_len;
		for (j = 0; j < nr_pages; j++) {
			size_t vec_len;

			vec_len = min_t(size_t, size, PAGE_SIZE - off);
			imu->bvec[j].bv_page = pages[j];
			imu->bvec[j].bv_len = vec_len;
			imu->bvec[j].bv_offset = off;
			off = 0;
			size -= vec_len;
		}
		/* store original address for later verification */
		imu->ubuf = ubuf;
		imu->len = iov.iov_len;
		imu->nr_bvecs = nr_pages;

		ctx->nr_user_bufs++;
	}
	kvfree(pages);
	kvfree(vmas);
	return 0;
err:
	kvfree(pages);
	kvfree(vmas);
	/* tears down every buffer counted in nr_user_bufs, and the table */
	io_sqe_buffer_unregister(ctx);
	return ret;
}
7575
7576 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
7577 {
7578         __s32 __user *fds = arg;
7579         int fd;
7580
7581         if (ctx->cq_ev_fd)
7582                 return -EBUSY;
7583
7584         if (copy_from_user(&fd, fds, sizeof(*fds)))
7585                 return -EFAULT;
7586
7587         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
7588         if (IS_ERR(ctx->cq_ev_fd)) {
7589                 int ret = PTR_ERR(ctx->cq_ev_fd);
7590                 ctx->cq_ev_fd = NULL;
7591                 return ret;
7592         }
7593
7594         return 0;
7595 }
7596
7597 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
7598 {
7599         if (ctx->cq_ev_fd) {
7600                 eventfd_ctx_put(ctx->cq_ev_fd);
7601                 ctx->cq_ev_fd = NULL;
7602                 return 0;
7603         }
7604
7605         return -ENXIO;
7606 }
7607
/*
 * idr_for_each() callback: remove the buffers of the entry stored under
 * @id. @p is the io_buffer stored in the idr, @data the owning ring
 * context. Always returns 0 so the iteration continues.
 */
static int __io_destroy_buffers(int id, void *p, void *data)
{
	struct io_buffer *buf = p;
	struct io_ring_ctx *ctx = data;

	/* -1U is passed as the count argument, i.e. the maximum value */
	__io_remove_buffers(ctx, buf, id, -1U);
	return 0;
}
7616
/*
 * Remove the buffers of every entry in ctx->io_buffer_idr, then destroy the
 * idr itself.
 */
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
	idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
	idr_destroy(&ctx->io_buffer_idr);
}
7622
/*
 * Final teardown of a ring context: stop async helpers, drop the saved mm,
 * unregister buffers, files and the eventfd, release the ring memory and
 * accounting, and free the context itself.
 */
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_finish_async(ctx);
	if (ctx->sqo_mm) {
		mmdrop(ctx->sqo_mm);
		ctx->sqo_mm = NULL;
	}

	io_iopoll_reap_events(ctx);
	/* the unregister helpers report -ENXIO when nothing is registered;
	 * that result is deliberately ignored here */
	io_sqe_buffer_unregister(ctx);
	io_sqe_files_unregister(ctx);
	io_eventfd_unregister(ctx);
	io_destroy_buffers(ctx);
	idr_destroy(&ctx->personality_idr);

#if defined(CONFIG_UNIX)
	if (ctx->ring_sock) {
		ctx->ring_sock->file = NULL; /* so that iput() is called */
		sock_release(ctx->ring_sock);
	}
#endif

	io_mem_free(ctx->rings);
	io_mem_free(ctx->sq_sqes);

	percpu_ref_exit(&ctx->refs);
	/* return the pages charged for the rings and the SQE array */
	io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
			 ACCT_LOCKED);
	free_uid(ctx->user);
	put_cred(ctx->creds);
	kfree(ctx->cancel_hash);
	kmem_cache_free(req_cachep, ctx->fallback_req);
	kfree(ctx);
}
7657
/*
 * poll() handler for the ring fd.
 *
 * Reports EPOLLOUT | EPOLLWRNORM while the SQ ring is not full (the
 * distance between the application's SQ tail and the kernel's cached SQ
 * head differs from the ring size) and EPOLLIN | EPOLLRDNORM when
 * io_cqring_events() reports pending completions.
 */
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

	poll_wait(file, &ctx->cq_wait, wait);
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
	smp_rmb();
	/* sq.tail is written by the application, hence READ_ONCE() */
	if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
	    ctx->rings->sq_ring_entries)
		mask |= EPOLLOUT | EPOLLWRNORM;
	if (io_cqring_events(ctx, false))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}
7677
7678 static int io_uring_fasync(int fd, struct file *file, int on)
7679 {
7680         struct io_ring_ctx *ctx = file->private_data;
7681
7682         return fasync_helper(fd, file, on, &ctx->cq_fasync);
7683 }
7684
7685 static int io_remove_personalities(int id, void *p, void *data)
7686 {
7687         struct io_ring_ctx *ctx = data;
7688         const struct cred *cred;
7689
7690         cred = idr_remove(&ctx->personality_idr, id);
7691         if (cred)
7692                 put_cred(cred);
7693         return 0;
7694 }
7695
7696 static void io_ring_exit_work(struct work_struct *work)
7697 {
7698         struct io_ring_ctx *ctx;
7699
7700         ctx = container_of(work, struct io_ring_ctx, exit_work);
7701         if (ctx->rings)
7702                 io_cqring_overflow_flush(ctx, true);
7703
7704         /*
7705          * If we're doing polled IO and end up having requests being
7706          * submitted async (out-of-line), then completions can come in while
7707          * we're waiting for refs to drop. We need to reap these manually,
7708          * as nobody else will be looking for them.
7709          */
7710         while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
7711                 io_iopoll_reap_events(ctx);
7712                 if (ctx->rings)
7713                         io_cqring_overflow_flush(ctx, true);
7714         }
7715         io_ring_ctx_free(ctx);
7716 }
7717
/*
 * Start tearing down a ring: kill the ctx percpu ref so no new references
 * can be taken, cancel outstanding timeouts, polls and io-wq work, reap
 * what has already completed, release the registered personalities, and
 * queue io_ring_exit_work() to wait for the remaining references and free
 * the ctx.
 *
 * Used both when the ring fd is released and on the io_uring_create()
 * error path.
 */
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
        /* kill the ref under uring_lock */
        mutex_lock(&ctx->uring_lock);
        percpu_ref_kill(&ctx->refs);
        mutex_unlock(&ctx->uring_lock);

        io_kill_timeouts(ctx);
        io_poll_remove_all(ctx);

        if (ctx->io_wq)
                io_wq_cancel_all(ctx->io_wq);

        io_iopoll_reap_events(ctx);
        /* if we failed setting up the ctx, we might not have any rings */
        if (ctx->rings)
                io_cqring_overflow_flush(ctx, true);
        idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
        /* the final wait-and-free happens asynchronously on system_wq */
        INIT_WORK(&ctx->exit_work, io_ring_exit_work);
        queue_work(system_wq, &ctx->exit_work);
}
7738
7739 static int io_uring_release(struct inode *inode, struct file *file)
7740 {
7741         struct io_ring_ctx *ctx = file->private_data;
7742
7743         file->private_data = NULL;
7744         io_ring_ctx_wait_and_kill(ctx);
7745         return 0;
7746 }
7747
7748 static bool io_wq_files_match(struct io_wq_work *work, void *data)
7749 {
7750         struct files_struct *files = data;
7751
7752         return work->files == files;
7753 }
7754
/*
 * Cancel every inflight request on @ctx that was issued with @files, and
 * wait until none remain on ctx->inflight_list.
 *
 * First asks io-wq to cancel all matching work in one go, then loops:
 * pick one matching request from the inflight list (taking a reference),
 * cancel it, and sleep on ctx->inflight_wait until woken, repeating until
 * no matching request is found.
 */
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
                                  struct files_struct *files)
{
        if (list_empty_careful(&ctx->inflight_list))
                return;

        /* cancel all at once, should be faster than doing it one by one */
        io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);

        while (!list_empty_careful(&ctx->inflight_list)) {
                struct io_kiocb *cancel_req = NULL, *req;
                DEFINE_WAIT(wait);

                spin_lock_irq(&ctx->inflight_lock);
                list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
                        if (req->work.files != files)
                                continue;
                        /* req is being completed, ignore */
                        if (!refcount_inc_not_zero(&req->refs))
                                continue;
                        cancel_req = req;
                        break;
                }
                /*
                 * Queue ourselves on the waitqueue while still holding
                 * inflight_lock, before the request is cancelled below.
                 */
                if (cancel_req)
                        prepare_to_wait(&ctx->inflight_wait, &wait,
                                                TASK_UNINTERRUPTIBLE);
                spin_unlock_irq(&ctx->inflight_lock);

                /* We need to keep going until we don't find a matching req */
                if (!cancel_req)
                        break;

                if (cancel_req->flags & REQ_F_OVERFLOW) {
                        /*
                         * The request sits on the CQ overflow list: unlink it
                         * and, if that emptied the list, clear the overflow
                         * check bits.
                         */
                        spin_lock_irq(&ctx->completion_lock);
                        list_del(&cancel_req->list);
                        cancel_req->flags &= ~REQ_F_OVERFLOW;
                        if (list_empty(&ctx->cq_overflow_list)) {
                                clear_bit(0, &ctx->sq_check_overflow);
                                clear_bit(0, &ctx->cq_check_overflow);
                        }
                        spin_unlock_irq(&ctx->completion_lock);

                        /* the completion is dropped, bump the overflow count */
                        WRITE_ONCE(ctx->rings->cq_overflow,
                                atomic_inc_return(&ctx->cached_cq_overflow));

                        /*
                         * Put inflight ref and overflow ref. If that's
                         * all we had, then we're done with this request.
                         */
                        if (refcount_sub_and_test(2, &cancel_req->refs)) {
                                io_free_req(cancel_req);
                                finish_wait(&ctx->inflight_wait, &wait);
                                continue;
                        }
                } else {
                        io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
                        /* drop the reference taken in the lookup above */
                        io_put_req(cancel_req);
                }

                schedule();
                finish_wait(&ctx->inflight_wait, &wait);
        }
}
7818
7819 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
7820 {
7821         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7822         struct task_struct *task = data;
7823
7824         return req->task == task;
7825 }
7826
7827 static int io_uring_flush(struct file *file, void *data)
7828 {
7829         struct io_ring_ctx *ctx = file->private_data;
7830
7831         io_uring_cancel_files(ctx, data);
7832
7833         /*
7834          * If the task is going away, cancel work it may have pending
7835          */
7836         if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
7837                 io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
7838
7839         return 0;
7840 }
7841
7842 static void *io_uring_validate_mmap_request(struct file *file,
7843                                             loff_t pgoff, size_t sz)
7844 {
7845         struct io_ring_ctx *ctx = file->private_data;
7846         loff_t offset = pgoff << PAGE_SHIFT;
7847         struct page *page;
7848         void *ptr;
7849
7850         switch (offset) {
7851         case IORING_OFF_SQ_RING:
7852         case IORING_OFF_CQ_RING:
7853                 ptr = ctx->rings;
7854                 break;
7855         case IORING_OFF_SQES:
7856                 ptr = ctx->sq_sqes;
7857                 break;
7858         default:
7859                 return ERR_PTR(-EINVAL);
7860         }
7861
7862         page = virt_to_head_page(ptr);
7863         if (sz > page_size(page))
7864                 return ERR_PTR(-EINVAL);
7865
7866         return ptr;
7867 }
7868
7869 #ifdef CONFIG_MMU
7870
7871 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7872 {
7873         size_t sz = vma->vm_end - vma->vm_start;
7874         unsigned long pfn;
7875         void *ptr;
7876
7877         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
7878         if (IS_ERR(ptr))
7879                 return PTR_ERR(ptr);
7880
7881         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
7882         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
7883 }
7884
7885 #else /* !CONFIG_MMU */
7886
7887 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7888 {
7889         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
7890 }
7891
7892 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
7893 {
7894         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
7895 }
7896
/*
 * ->get_unmapped_area() for !MMU kernels: the mapping address is simply the
 * kernel address of the requested ring region. Returns the address, or the
 * negative errno from validation cast to unsigned long.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
        unsigned long addr, unsigned long len,
        unsigned long pgoff, unsigned long flags)
{
        void *region = io_uring_validate_mmap_request(file, pgoff, len);

        if (!IS_ERR(region))
                return (unsigned long) region;

        return PTR_ERR(region);
}
7909
7910 #endif /* !CONFIG_MMU */
7911
/*
 * io_uring_enter(2): submit SQEs and/or wait for completions on ring @fd.
 *
 * @to_submit:    number of SQEs to submit
 * @min_complete: with IORING_ENTER_GETEVENTS, number of completions to wait
 *                for (clamped to the CQ ring size)
 * @flags:        IORING_ENTER_GETEVENTS and/or IORING_ENTER_SQ_WAKEUP
 * @sig/@sigsz:   signal mask handed to io_cqring_wait()
 *
 * Returns the number of SQEs submitted if that is non-zero, otherwise the
 * result of the wait (0 or a negative errno). -EINVAL for unknown flags,
 * -EBADF for a bad fd, -EOPNOTSUPP if @fd is not an io_uring fd and -ENXIO
 * if no ctx reference could be taken.
 */
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                u32, min_complete, u32, flags, const sigset_t __user *, sig,
                size_t, sigsz)
{
        struct io_ring_ctx *ctx;
        long ret = -EBADF;
        int submitted = 0;
        struct fd f;

        /* run any task work queued for this task before doing anything else */
        if (current->task_works)
                task_work_run();

        if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
                return -EINVAL;

        f = fdget(fd);
        if (!f.file)
                return -EBADF;

        ret = -EOPNOTSUPP;
        if (f.file->f_op != &io_uring_fops)
                goto out_fput;

        ret = -ENXIO;
        ctx = f.file->private_data;
        /* fails once the ref has been killed */
        if (!percpu_ref_tryget(&ctx->refs))
                goto out_fput;

        /*
         * For SQ polling, the thread will do all submissions and completions.
         * Just return the requested submit count, and wake the thread if
         * we were asked to.
         */
        ret = 0;
        if (ctx->flags & IORING_SETUP_SQPOLL) {
                if (!list_empty_careful(&ctx->cq_overflow_list))
                        io_cqring_overflow_flush(ctx, false);
                if (flags & IORING_ENTER_SQ_WAKEUP)
                        wake_up(&ctx->sqo_wait);
                submitted = to_submit;
        } else if (to_submit) {
                mutex_lock(&ctx->uring_lock);
                submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
                mutex_unlock(&ctx->uring_lock);

                /* short or failed submission: skip waiting for events */
                if (submitted != to_submit)
                        goto out;
        }
        if (flags & IORING_ENTER_GETEVENTS) {
                unsigned nr_events = 0;

                min_complete = min(min_complete, ctx->cq_entries);

                /*
                 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
                 * space applications don't need to do io completion events
                 * polling again, they can rely on io_sq_thread to do polling
                 * work, which can reduce cpu usage and uring_lock contention.
                 */
                if (ctx->flags & IORING_SETUP_IOPOLL &&
                    !(ctx->flags & IORING_SETUP_SQPOLL)) {
                        ret = io_iopoll_check(ctx, &nr_events, min_complete);
                } else {
                        ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
                }
        }

out:
        percpu_ref_put(&ctx->refs);
out_fput:
        fdput(f);
        /* a non-zero submit count takes precedence over the wait result */
        return submitted ? submitted : ret;
}
7985
7986 #ifdef CONFIG_PROC_FS
7987 static int io_uring_show_cred(int id, void *p, void *data)
7988 {
7989         const struct cred *cred = p;
7990         struct seq_file *m = data;
7991         struct user_namespace *uns = seq_user_ns(m);
7992         struct group_info *gi;
7993         kernel_cap_t cap;
7994         unsigned __capi;
7995         int g;
7996
7997         seq_printf(m, "%5d\n", id);
7998         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
7999         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
8000         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
8001         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
8002         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
8003         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
8004         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
8005         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
8006         seq_puts(m, "\n\tGroups:\t");
8007         gi = cred->group_info;
8008         for (g = 0; g < gi->ngroups; g++) {
8009                 seq_put_decimal_ull(m, g ? " " : "",
8010                                         from_kgid_munged(uns, gi->gid[g]));
8011         }
8012         seq_puts(m, "\n\tCapEff:\t");
8013         cap = cred->cap_effective;
8014         CAP_FOR_EACH_U32(__capi)
8015                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
8016         seq_putc(m, '\n');
8017         return 0;
8018 }
8019
8020 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
8021 {
8022         int i;
8023
8024         mutex_lock(&ctx->uring_lock);
8025         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
8026         for (i = 0; i < ctx->nr_user_files; i++) {
8027                 struct fixed_file_table *table;
8028                 struct file *f;
8029
8030                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
8031                 f = table->files[i & IORING_FILE_TABLE_MASK];
8032                 if (f)
8033                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
8034                 else
8035                         seq_printf(m, "%5u: <none>\n", i);
8036         }
8037         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
8038         for (i = 0; i < ctx->nr_user_bufs; i++) {
8039                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
8040
8041                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
8042                                                 (unsigned int) buf->len);
8043         }
8044         if (!idr_is_empty(&ctx->personality_idr)) {
8045                 seq_printf(m, "Personalities:\n");
8046                 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
8047         }
8048         seq_printf(m, "PollList:\n");
8049         spin_lock_irq(&ctx->completion_lock);
8050         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
8051                 struct hlist_head *list = &ctx->cancel_hash[i];
8052                 struct io_kiocb *req;
8053
8054                 hlist_for_each_entry(req, list, hash_node)
8055                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
8056                                         req->task->task_works != NULL);
8057         }
8058         spin_unlock_irq(&ctx->completion_lock);
8059         mutex_unlock(&ctx->uring_lock);
8060 }
8061
8062 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
8063 {
8064         struct io_ring_ctx *ctx = f->private_data;
8065
8066         if (percpu_ref_tryget(&ctx->refs)) {
8067                 __io_uring_show_fdinfo(ctx, m);
8068                 percpu_ref_put(&ctx->refs);
8069         }
8070 }
8071 #endif
8072
/*
 * File operations backing the anonymous io_uring fd. The io_uring_enter()
 * and io_uring_register() syscalls compare a file's f_op against this table
 * to verify that an fd really is an io_uring instance.
 */
static const struct file_operations io_uring_fops = {
        .release        = io_uring_release,
        .flush          = io_uring_flush,
        .mmap           = io_uring_mmap,
#ifndef CONFIG_MMU
        /* !MMU kernels map the ring memory directly */
        .get_unmapped_area = io_uring_nommu_get_unmapped_area,
        .mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
        .poll           = io_uring_poll,
        .fasync         = io_uring_fasync,
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = io_uring_show_fdinfo,
#endif
};
8087
8088 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
8089                                   struct io_uring_params *p)
8090 {
8091         struct io_rings *rings;
8092         size_t size, sq_array_offset;
8093
8094         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
8095         if (size == SIZE_MAX)
8096                 return -EOVERFLOW;
8097
8098         rings = io_mem_alloc(size);
8099         if (!rings)
8100                 return -ENOMEM;
8101
8102         ctx->rings = rings;
8103         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
8104         rings->sq_ring_mask = p->sq_entries - 1;
8105         rings->cq_ring_mask = p->cq_entries - 1;
8106         rings->sq_ring_entries = p->sq_entries;
8107         rings->cq_ring_entries = p->cq_entries;
8108         ctx->sq_mask = rings->sq_ring_mask;
8109         ctx->cq_mask = rings->cq_ring_mask;
8110         ctx->sq_entries = rings->sq_ring_entries;
8111         ctx->cq_entries = rings->cq_ring_entries;
8112
8113         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
8114         if (size == SIZE_MAX) {
8115                 io_mem_free(ctx->rings);
8116                 ctx->rings = NULL;
8117                 return -EOVERFLOW;
8118         }
8119
8120         ctx->sq_sqes = io_mem_alloc(size);
8121         if (!ctx->sq_sqes) {
8122                 io_mem_free(ctx->rings);
8123                 ctx->rings = NULL;
8124                 return -ENOMEM;
8125         }
8126
8127         return 0;
8128 }
8129
8130 /*
8131  * Allocate an anonymous fd, this is what constitutes the application
8132  * visible backing of an io_uring instance. The application mmaps this
8133  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
8134  * we have to tie this fd to a socket for file garbage collection purposes.
8135  */
8136 static int io_uring_get_fd(struct io_ring_ctx *ctx)
8137 {
8138         struct file *file;
8139         int ret;
8140
8141 #if defined(CONFIG_UNIX)
8142         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
8143                                 &ctx->ring_sock);
8144         if (ret)
8145                 return ret;
8146 #endif
8147
8148         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
8149         if (ret < 0)
8150                 goto err;
8151
8152         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
8153                                         O_RDWR | O_CLOEXEC);
8154         if (IS_ERR(file)) {
8155                 put_unused_fd(ret);
8156                 ret = PTR_ERR(file);
8157                 goto err;
8158         }
8159
8160 #if defined(CONFIG_UNIX)
8161         ctx->ring_sock->file = file;
8162 #endif
8163         fd_install(ret, file);
8164         return ret;
8165 err:
8166 #if defined(CONFIG_UNIX)
8167         sock_release(ctx->ring_sock);
8168         ctx->ring_sock = NULL;
8169 #endif
8170         return ret;
8171 }
8172
8173 static int io_uring_create(unsigned entries, struct io_uring_params *p,
8174                            struct io_uring_params __user *params)
8175 {
8176         struct user_struct *user = NULL;
8177         struct io_ring_ctx *ctx;
8178         bool limit_mem;
8179         int ret;
8180
8181         if (!entries)
8182                 return -EINVAL;
8183         if (entries > IORING_MAX_ENTRIES) {
8184                 if (!(p->flags & IORING_SETUP_CLAMP))
8185                         return -EINVAL;
8186                 entries = IORING_MAX_ENTRIES;
8187         }
8188
8189         /*
8190          * Use twice as many entries for the CQ ring. It's possible for the
8191          * application to drive a higher depth than the size of the SQ ring,
8192          * since the sqes are only used at submission time. This allows for
8193          * some flexibility in overcommitting a bit. If the application has
8194          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
8195          * of CQ ring entries manually.
8196          */
8197         p->sq_entries = roundup_pow_of_two(entries);
8198         if (p->flags & IORING_SETUP_CQSIZE) {
8199                 /*
8200                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
8201                  * to a power-of-two, if it isn't already. We do NOT impose
8202                  * any cq vs sq ring sizing.
8203                  */
8204                 if (p->cq_entries < p->sq_entries)
8205                         return -EINVAL;
8206                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
8207                         if (!(p->flags & IORING_SETUP_CLAMP))
8208                                 return -EINVAL;
8209                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
8210                 }
8211                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
8212         } else {
8213                 p->cq_entries = 2 * p->sq_entries;
8214         }
8215
8216         user = get_uid(current_user());
8217         limit_mem = !capable(CAP_IPC_LOCK);
8218
8219         if (limit_mem) {
8220                 ret = __io_account_mem(user,
8221                                 ring_pages(p->sq_entries, p->cq_entries));
8222                 if (ret) {
8223                         free_uid(user);
8224                         return ret;
8225                 }
8226         }
8227
8228         ctx = io_ring_ctx_alloc(p);
8229         if (!ctx) {
8230                 if (limit_mem)
8231                         __io_unaccount_mem(user, ring_pages(p->sq_entries,
8232                                                                 p->cq_entries));
8233                 free_uid(user);
8234                 return -ENOMEM;
8235         }
8236         ctx->compat = in_compat_syscall();
8237         ctx->user = user;
8238         ctx->creds = get_current_cred();
8239
8240         ret = io_allocate_scq_urings(ctx, p);
8241         if (ret)
8242                 goto err;
8243
8244         ret = io_sq_offload_start(ctx, p);
8245         if (ret)
8246                 goto err;
8247
8248         memset(&p->sq_off, 0, sizeof(p->sq_off));
8249         p->sq_off.head = offsetof(struct io_rings, sq.head);
8250         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
8251         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
8252         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
8253         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
8254         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
8255         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
8256
8257         memset(&p->cq_off, 0, sizeof(p->cq_off));
8258         p->cq_off.head = offsetof(struct io_rings, cq.head);
8259         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
8260         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
8261         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
8262         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
8263         p->cq_off.cqes = offsetof(struct io_rings, cqes);
8264         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
8265
8266         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
8267                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
8268                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
8269                         IORING_FEAT_POLL_32BITS;
8270
8271         if (copy_to_user(params, p, sizeof(*p))) {
8272                 ret = -EFAULT;
8273                 goto err;
8274         }
8275         /*
8276          * Install ring fd as the very last thing, so we don't risk someone
8277          * having closed it before we finish setup
8278          */
8279         ret = io_uring_get_fd(ctx);
8280         if (ret < 0)
8281                 goto err;
8282
8283         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
8284         io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
8285                        ACCT_LOCKED);
8286         ctx->limit_mem = limit_mem;
8287         return ret;
8288 err:
8289         io_ring_ctx_wait_and_kill(ctx);
8290         return ret;
8291 }
8292
8293 /*
8294  * Sets up an aio uring context, and returns the fd. Applications asks for a
8295  * ring size, we return the actual sq/cq ring sizes (among other things) in the
8296  * params structure passed in.
8297  */
8298 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
8299 {
8300         struct io_uring_params p;
8301         int i;
8302
8303         if (copy_from_user(&p, params, sizeof(p)))
8304                 return -EFAULT;
8305         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
8306                 if (p.resv[i])
8307                         return -EINVAL;
8308         }
8309
8310         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8311                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
8312                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
8313                 return -EINVAL;
8314
8315         return  io_uring_create(entries, &p, params);
8316 }
8317
8318 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
8319                 struct io_uring_params __user *, params)
8320 {
8321         return io_uring_setup(entries, params);
8322 }
8323
8324 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
8325 {
8326         struct io_uring_probe *p;
8327         size_t size;
8328         int i, ret;
8329
8330         size = struct_size(p, ops, nr_args);
8331         if (size == SIZE_MAX)
8332                 return -EOVERFLOW;
8333         p = kzalloc(size, GFP_KERNEL);
8334         if (!p)
8335                 return -ENOMEM;
8336
8337         ret = -EFAULT;
8338         if (copy_from_user(p, arg, size))
8339                 goto out;
8340         ret = -EINVAL;
8341         if (memchr_inv(p, 0, size))
8342                 goto out;
8343
8344         p->last_op = IORING_OP_LAST - 1;
8345         if (nr_args > IORING_OP_LAST)
8346                 nr_args = IORING_OP_LAST;
8347
8348         for (i = 0; i < nr_args; i++) {
8349                 p->ops[i].op = i;
8350                 if (!io_op_defs[i].not_supported)
8351                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
8352         }
8353         p->ops_len = i;
8354
8355         ret = 0;
8356         if (copy_to_user(arg, p, size))
8357                 ret = -EFAULT;
8358 out:
8359         kfree(p);
8360         return ret;
8361 }
8362
8363 static int io_register_personality(struct io_ring_ctx *ctx)
8364 {
8365         const struct cred *creds = get_current_cred();
8366         int id;
8367
8368         id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
8369                                 USHRT_MAX, GFP_KERNEL);
8370         if (id < 0)
8371                 put_cred(creds);
8372         return id;
8373 }
8374
8375 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8376 {
8377         const struct cred *old_creds;
8378
8379         old_creds = idr_remove(&ctx->personality_idr, id);
8380         if (old_creds) {
8381                 put_cred(old_creds);
8382                 return 0;
8383         }
8384
8385         return -EINVAL;
8386 }
8387
8388 static bool io_register_op_must_quiesce(int op)
8389 {
8390         switch (op) {
8391         case IORING_UNREGISTER_FILES:
8392         case IORING_REGISTER_FILES_UPDATE:
8393         case IORING_REGISTER_PROBE:
8394         case IORING_REGISTER_PERSONALITY:
8395         case IORING_UNREGISTER_PERSONALITY:
8396                 return false;
8397         default:
8398                 return true;
8399         }
8400 }
8401
/*
 * Core of io_uring_register(2): dispatch register/unregister @opcode on
 * @ctx. Called with ctx->uring_lock held and returns with it held, but may
 * drop and retake it while waiting for the ring to quiesce.
 *
 * For opcodes that io_register_op_must_quiesce() flags, the ctx ref is
 * killed and all outstanding references are waited for before the
 * operation runs; the ref is brought back to life afterwards.
 *
 * Returns the handler's result, -ENXIO if the ctx ref is already dying,
 * -EINTR if the quiesce wait was interrupted, or -EINVAL for bad arguments
 * or an unknown opcode.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                               void __user *arg, unsigned nr_args)
        __releases(ctx->uring_lock)
        __acquires(ctx->uring_lock)
{
        int ret;

        /*
         * We're inside the ring mutex, if the ref is already dying, then
         * someone else killed the ctx or is already going through
         * io_uring_register().
         */
        if (percpu_ref_is_dying(&ctx->refs))
                return -ENXIO;

        if (io_register_op_must_quiesce(opcode)) {
                percpu_ref_kill(&ctx->refs);

                /*
                 * Drop uring mutex before waiting for references to exit. If
                 * another thread is currently inside io_uring_enter() it might
                 * need to grab the uring_lock to make progress. If we hold it
                 * here across the drain wait, then we can deadlock. It's safe
                 * to drop the mutex here, since no new references will come in
                 * after we've killed the percpu ref.
                 */
                mutex_unlock(&ctx->uring_lock);
                ret = wait_for_completion_interruptible(&ctx->ref_comp);
                mutex_lock(&ctx->uring_lock);
                if (ret) {
                        /* interrupted: undo the kill and bail out */
                        percpu_ref_resurrect(&ctx->refs);
                        ret = -EINTR;
                        goto out;
                }
        }

        switch (opcode) {
        case IORING_REGISTER_BUFFERS:
                ret = io_sqe_buffer_register(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_BUFFERS:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_buffer_unregister(ctx);
                break;
        case IORING_REGISTER_FILES:
                ret = io_sqe_files_register(ctx, arg, nr_args);
                break;
        case IORING_UNREGISTER_FILES:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_sqe_files_unregister(ctx);
                break;
        case IORING_REGISTER_FILES_UPDATE:
                ret = io_sqe_files_update(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_EVENTFD:
        case IORING_REGISTER_EVENTFD_ASYNC:
                ret = -EINVAL;
                if (nr_args != 1)
                        break;
                ret = io_eventfd_register(ctx, arg);
                if (ret)
                        break;
                /* both opcodes share registration, differing only in this flag */
                if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
                        ctx->eventfd_async = 1;
                else
                        ctx->eventfd_async = 0;
                break;
        case IORING_UNREGISTER_EVENTFD:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_eventfd_unregister(ctx);
                break;
        case IORING_REGISTER_PROBE:
                ret = -EINVAL;
                if (!arg || nr_args > 256)
                        break;
                ret = io_probe(ctx, arg, nr_args);
                break;
        case IORING_REGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg || nr_args)
                        break;
                ret = io_register_personality(ctx);
                break;
        case IORING_UNREGISTER_PERSONALITY:
                ret = -EINVAL;
                if (arg)
                        break;
                /* the personality id is passed in nr_args */
                ret = io_unregister_personality(ctx, nr_args);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        if (io_register_op_must_quiesce(opcode)) {
                /* bring the ctx back to life */
                percpu_ref_reinit(&ctx->refs);
out:
                /* also reached via goto from the interrupted wait above */
                reinit_completion(&ctx->ref_comp);
        }
        return ret;
}
8510
8511 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
8512                 void __user *, arg, unsigned int, nr_args)
8513 {
8514         struct io_ring_ctx *ctx;
8515         long ret = -EBADF;
8516         struct fd f;
8517
8518         f = fdget(fd);
8519         if (!f.file)
8520                 return -EBADF;
8521
8522         ret = -EOPNOTSUPP;
8523         if (f.file->f_op != &io_uring_fops)
8524                 goto out_fput;
8525
8526         ctx = f.file->private_data;
8527
8528         mutex_lock(&ctx->uring_lock);
8529         ret = __io_uring_register(ctx, opcode, arg, nr_args);
8530         mutex_unlock(&ctx->uring_lock);
8531         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
8532                                                         ctx->cq_ev_fd != NULL, ret);
8533 out_fput:
8534         fdput(f);
8535         return ret;
8536 }
8537
8538 static int __init io_uring_init(void)
8539 {
8540 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
8541         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
8542         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
8543 } while (0)
8544
8545 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
8546         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
8547         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
8548         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
8549         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
8550         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
8551         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
8552         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
8553         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
8554         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
8555         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
8556         BUILD_BUG_SQE_ELEM(24, __u32,  len);
8557         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
8558         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
8559         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
8560         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
8561         BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
8562         BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
8563         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
8564         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
8565         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
8566         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
8567         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
8568         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
8569         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
8570         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
8571         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
8572         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
8573         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
8574         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
8575         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
8576
8577         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
8578         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
8579         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
8580         return 0;
8581 };
8582 __initcall(io_uring_init);