fs/io_uring.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Shared application/kernel submission and completion ring pairs, for
   4  * supporting fast/efficient IO.
   5  *
   6  * A note on the read/write ordering memory barriers that are matched between
   7  * the application and kernel side.
   8  *
   9  * After the application reads the CQ ring tail, it must use an
  10  * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11  * before writing the tail (using smp_load_acquire to read the tail will
  12  * do). It also needs a smp_mb() before updating CQ head (ordering the
  13  * entry load(s) with the head store), pairing with an implicit barrier
  14  * through a control-dependency in io_get_cqring (smp_store_release to
  15  * store head will do). Failure to do so could lead to reading invalid
  16  * CQ entries.
  17  *
  18  * Likewise, the application must use an appropriate smp_wmb() before
  19  * writing the SQ tail (ordering SQ entry stores with the tail store),
  20  * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21  * to store the tail will do). And it needs a barrier ordering the SQ
  22  * head load before writing new SQ entries (smp_load_acquire to read
  23  * head will do).
  24  *
  25  * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26  * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27  * updating the SQ tail; a full memory barrier smp_mb() is needed
  28  * between.
  29  *
  30  * Also see the examples in the liburing library:
  31  *
  32  *      git://git.kernel.dk/liburing
  33  *
  34  * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that touches
  35  * data shared between the kernel and application. This is done both for
  36  * ordering purposes and to ensure that once a value is loaded from data
  37  * that the application could potentially modify, it remains stable.
  38  *
  39  * Copyright (C) 2018-2019 Jens Axboe
  40  * Copyright (c) 2018-2019 Christoph Hellwig
  41  */
  42 #include <linux/kernel.h>
  43 #include <linux/init.h>
  44 #include <linux/errno.h>
  45 #include <linux/syscalls.h>
  46 #include <linux/compat.h>
  47 #include <net/compat.h>
  48 #include <linux/refcount.h>
  49 #include <linux/uio.h>
  50 #include <linux/bits.h>
  51
  52 #include <linux/sched/signal.h>
  53 #include <linux/fs.h>
  54 #include <linux/file.h>
  55 #include <linux/fdtable.h>
  56 #include <linux/mm.h>
  57 #include <linux/mman.h>
  58 #include <linux/percpu.h>
  59 #include <linux/slab.h>
  60 #include <linux/kthread.h>
  61 #include <linux/blkdev.h>
  62 #include <linux/bvec.h>
  63 #include <linux/net.h>
  64 #include <net/sock.h>
  65 #include <net/af_unix.h>
  66 #include <net/scm.h>
  67 #include <linux/anon_inodes.h>
  68 #include <linux/sched/mm.h>
  69 #include <linux/uaccess.h>
  70 #include <linux/nospec.h>
  71 #include <linux/sizes.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/highmem.h>
  74 #include <linux/namei.h>
  75 #include <linux/fsnotify.h>
  76 #include <linux/fadvise.h>
  77 #include <linux/eventpoll.h>
  78 #include <linux/fs_struct.h>
  79 #include <linux/splice.h>
  80 #include <linux/task_work.h>
  81
  82 #define CREATE_TRACE_POINTS
  83 #include <trace/events/io_uring.h>
  84
  85 #include <uapi/linux/io_uring.h>
  86
  87 #include "internal.h"
  88 #include "io-wq.h"
  89
  90 #define IORING_MAX_ENTRIES      32768   /* 2^15 */
  91 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)        /* 65536 */
  92
  93 /*
  94  * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  95  */
  96 #define IORING_FILE_TABLE_SHIFT 9
  97 #define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT) /* 512 */
  98 #define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)    /* 0x1ff: low 9 bits */
  99 #define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)   /* 64 * 512 = 32768 */
 100
 101 struct io_uring {       /* head/tail pair of one ring (see io_rings::sq, ::cq) */
 102         u32 head ____cacheline_aligned_in_smp;  /* cacheline-aligned on SMP builds */
 103         u32 tail ____cacheline_aligned_in_smp;  /* ditto, so it does not share head's line */
 104 };
 105
 106 /*
 107  * This data is shared with the application through the mmap at offsets
 108  * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 109  *
 110  * The offsets to the member fields are published through struct
 111  * io_sqring_offsets when calling io_uring_setup.
 112  */
 113 struct io_rings {
 114         /*
 115          * Head and tail offsets into the ring; the offsets need to be
 116          * masked to get valid indices.
 117          *
 118          * The kernel controls head of the sq ring and the tail of the cq ring,
 119          * and the application controls tail of the sq ring and the head of the
 120          * cq ring.
 121          */
 122         struct io_uring         sq, cq;
 123         /*
 124          * Bitmasks to apply to head and tail offsets (constant, equals
 125          * ring_entries - 1)
 126          */
 127         u32                     sq_ring_mask, cq_ring_mask;
 128         /* Ring sizes (constant, power of 2) */
 129         u32                     sq_ring_entries, cq_ring_entries;
 130         /*
 131          * Number of invalid entries dropped by the kernel due to
 132          * invalid index stored in array
 133          *
 134          * Written by the kernel, shouldn't be modified by the
 135          * application (i.e. get number of "new events" by comparing to
 136          * cached value).
 137          *
 138          * After a new SQ head value was read by the application this
 139          * counter includes all submissions that were dropped reaching
 140          * the new SQ head (and possibly more).
 141          */
 142         u32                     sq_dropped;
 143         /*
 144          * Runtime SQ flags
 145          *
 146          * Written by the kernel, shouldn't be modified by the
 147          * application.
 148          *
 149          * The application needs a full memory barrier before checking
 150          * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 151          */
 152         u32                     sq_flags;
 153         /*
 154          * Runtime CQ flags
 155          *
 156          * Written by the application, shouldn't be modified by the
 157          * kernel.
 158          */
 159         u32                     cq_flags;
 160         /*
 161          * Number of completion events lost because the queue was full;
 162          * this should be avoided by the application by making sure
 163          * there are not more requests pending than there is space in
 164          * the completion queue.
 165          *
 166          * Written by the kernel, shouldn't be modified by the
 167          * application (i.e. get number of "new events" by comparing to
 168          * cached value).
 169          *
 170          * As completion events come in out of order this counter is not
 171          * ordered with any other data.
 172          */
 173         u32                     cq_overflow;
 174         /*
 175          * Ring buffer of completion events (flexible array, must stay last).
 176          *
 177          * The kernel writes completion events fresh every time they are
 178          * produced, so the application is allowed to modify pending
 179          * entries.
 180          */
 181         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 182 };
 183
 184 struct io_mapped_ubuf { /* element type of io_ring_ctx::user_bufs */
 185         u64             ubuf;   /* presumably the user address of the buffer - confirm */
 186         size_t          len;    /* presumably the buffer length in bytes - confirm */
 187         struct          bio_vec *bvec;  /* bio_vec array, nr_bvecs entries (presumably) */
 188         unsigned int    nr_bvecs;
 189 };
 190
 191 struct fixed_file_table {       /* element type of fixed_file_data::table */
 192         struct file             **files;        /* array of file pointers; size not visible here */
 193 };
 194
 195 struct fixed_file_ref_node {
 196         struct percpu_ref               refs;   /* per-node percpu reference count */
 197         struct list_head                node;   /* presumably on fixed_file_data::ref_list - confirm */
 198         struct list_head                file_list;
 199         struct fixed_file_data          *file_data;     /* fixed file set this node refers to */
 200         struct llist_node               llist;  /* presumably for io_ring_ctx::file_put_llist - confirm */
 201 };
 202
 203 struct fixed_file_data {        /* fixed file set, see io_ring_ctx::file_data */
 204         struct fixed_file_table         *table; /* array of file tables */
 205         struct io_ring_ctx              *ctx;   /* ring context this set is attached to */
 206
 207         struct percpu_ref               *cur_refs;      /* NOTE(review): looks like the current ref node's refs - confirm */
 208         struct percpu_ref               refs;
 209         struct completion               done;
 210         struct list_head                ref_list;
 211         spinlock_t                      lock;   /* presumably protects ref_list - confirm */
 212 };
 213
 214 struct io_buffer {      /* one buffer record; referenced by io_sr_msg::kbuf */
 215         struct list_head list;
 216         __u64 addr;     /* same type as io_provide_buf::addr */
 217         __s32 len;      /* same type as io_provide_buf::len */
 218         __u16 bid;      /* buffer ID, cf. io_provide_buf::bid (presumably) */
 219 };
 220
 221 struct io_ring_ctx {    /* kernel-side state of one ring instance; members grouped into cacheline-aligned sections */
 222         struct {
 223                 struct percpu_ref       refs;
 224         } ____cacheline_aligned_in_smp;
 225
 226         struct {
 227                 unsigned int            flags;  /* presumably the IORING_SETUP_* flags - confirm */
 228                 unsigned int            compat: 1;
 229                 unsigned int            account_mem: 1;
 230                 unsigned int            cq_overflow_flushed: 1;
 231                 unsigned int            drain_next: 1;
 232                 unsigned int            eventfd_async: 1;
 233
 234                 /*
 235                  * Ring buffer of indices into array of io_uring_sqe, which is
 236                  * mmapped by the application using the IORING_OFF_SQES offset.
 237                  *
 238                  * This indirection could e.g. be used to assign fixed
 239                  * io_uring_sqe entries to operations and only submit them to
 240                  * the queue when needed.
 241                  *
 242                  * The kernel modifies neither the indices array nor the entries
 243                  * array.
 244                  */
 245                 u32                     *sq_array;
 246                 unsigned                cached_sq_head;
 247                 unsigned                sq_entries;
 248                 unsigned                sq_mask;        /* presumably sq_entries - 1, cf. io_rings::sq_ring_mask */
 249                 unsigned                sq_thread_idle;
 250                 unsigned                cached_sq_dropped;
 251                 atomic_t                cached_cq_overflow;
 252                 unsigned long           sq_check_overflow;
 253
 254                 struct list_head        defer_list;
 255                 struct list_head        timeout_list;
 256                 struct list_head        cq_overflow_list;
 257
 258                 wait_queue_head_t       inflight_wait;
 259                 struct io_uring_sqe     *sq_sqes;       /* presumably the sqe array mapped at IORING_OFF_SQES - confirm */
 260         } ____cacheline_aligned_in_smp;
 261
 262         struct io_rings *rings; /* see struct io_rings: that data is shared with the application */
 263
 264         /* IO offload */
 265         struct io_wq            *io_wq;
 266         struct task_struct      *sqo_thread;    /* if using sq thread polling */
 267         struct mm_struct        *sqo_mm;
 268         wait_queue_head_t       sqo_wait;
 269
 270         /*
 271          * If used, fixed file set. Writers must ensure that ->refs is dead,
 272          * readers must ensure that ->refs is alive as long as the file* is
 273          * used. Only updated through io_uring_register(2).
 274          */
 275         struct fixed_file_data  *file_data;
 276         unsigned                nr_user_files;
 277         int                     ring_fd;
 278         struct file             *ring_file;
 279
 280         /* if used, fixed mapped user buffers */
 281         unsigned                nr_user_bufs;
 282         struct io_mapped_ubuf   *user_bufs;     /* presumably nr_user_bufs entries - confirm */
 283
 284         struct user_struct      *user;
 285
 286         const struct cred       *creds;
 287
 288         struct completion       ref_comp;
 289         struct completion       sq_thread_comp;
 290
 291         /* if all else fails... */
 292         struct io_kiocb         *fallback_req;
 293
 294 #if defined(CONFIG_UNIX)
 295         struct socket           *ring_sock;     /* only present when CONFIG_UNIX is enabled */
 296 #endif
 297
 298         struct idr              io_buffer_idr;
 299
 300         struct idr              personality_idr;
 301
 302         struct {
 303                 unsigned                cached_cq_tail;
 304                 unsigned                cq_entries;
 305                 unsigned                cq_mask;        /* presumably cq_entries - 1, cf. io_rings::cq_ring_mask */
 306                 atomic_t                cq_timeouts;
 307                 unsigned long           cq_check_overflow;
 308                 struct wait_queue_head  cq_wait;
 309                 struct fasync_struct    *cq_fasync;
 310                 struct eventfd_ctx      *cq_ev_fd;
 311         } ____cacheline_aligned_in_smp;
 312
 313         struct {
 314                 struct mutex            uring_lock;
 315                 wait_queue_head_t       wait;
 316         } ____cacheline_aligned_in_smp;
 317
 318         struct {
 319                 spinlock_t              completion_lock;
 320
 321                 /*
 322                  * ->poll_list is protected by the ctx->uring_lock for
 323                  * io_uring instances that don't use IORING_SETUP_SQPOLL.
 324                  * For SQPOLL, only the single threaded io_sq_thread() will
 325                  * manipulate the list, hence no extra locking is needed there.
 326                  */
 327                 struct list_head        poll_list;
 328                 struct hlist_head       *cancel_hash;
 329                 unsigned                cancel_hash_bits;
 330                 bool                    poll_multi_file;
 331
 332                 spinlock_t              inflight_lock;  /* presumably protects inflight_list - confirm */
 333                 struct list_head        inflight_list;
 334         } ____cacheline_aligned_in_smp;
 335
 336         struct delayed_work             file_put_work;
 337         struct llist_head               file_put_llist;
 338
 339         struct work_struct              exit_work;
 340 };
 341
 342 /*
 343  * First field must be the file pointer in all the
 344  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 345  */
 346 struct io_poll_iocb {   /* io_kiocb::poll; also embedded in struct async_poll */
 347         struct file                     *file;  /* must stay the first member */
 348         union {         /* head and addr share the same storage */
 349                 struct wait_queue_head  *head;
 350                 u64                     addr;
 351         };
 352         __poll_t                        events; /* poll event mask */
 353         bool                            done;
 354         bool                            canceled;
 355         struct wait_queue_entry         wait;
 356 };
 357
 358 struct io_close {       /* io_kiocb::close */
 359         struct file                     *file;  /* first member: overlays io_kiocb::file */
 360         struct file                     *put_file;      /* NOTE(review): looks like the file being closed - confirm */
 361         int                             fd;
 362 };
 363
 364 struct io_timeout_data {        /* io_async_ctx::timeout */
 365         struct io_kiocb                 *req;   /* associated request */
 366         struct hrtimer                  timer;
 367         struct timespec64               ts;     /* presumably the timeout value - confirm */
 368         enum hrtimer_mode               mode;   /* hrtimer mode, presumably used when arming timer */
 369 };
 370
 371 struct io_accept {      /* io_kiocb::accept */
 372         struct file                     *file;  /* first member: overlays io_kiocb::file */
 373         struct sockaddr __user          *addr;  /* userspace pointer */
 374         int __user                      *addr_len;      /* userspace pointer */
 375         int                             flags;
 376         unsigned long                   nofile; /* presumably the RLIMIT_NOFILE value to apply - confirm */
 377 };
 378
 379 struct io_sync {        /* io_kiocb::sync */
 380         struct file                     *file;  /* first member: overlays io_kiocb::file */
 381         loff_t                          len;
 382         loff_t                          off;
 383         int                             flags;
 384         int                             mode;
 385 };
 386
 387 struct io_cancel {      /* io_kiocb::cancel */
 388         struct file                     *file;  /* first member: overlays io_kiocb::file */
 389         u64                             addr;   /* presumably identifies the request to cancel - confirm */
 390 };
 391
 392 struct io_timeout {     /* io_kiocb::timeout */
 393         struct file                     *file;  /* first member: overlays io_kiocb::file */
 394         u64                             addr;
 395         int                             flags;
 396         u32                             off;
 397         u32                             target_seq;     /* NOTE(review): looks like a sequence to compare against - confirm */
 398 };
 399
 400 struct io_rw {  /* io_kiocb::rw */
 401         /* NOTE: kiocb has the file as the first member, so don't do it here */
 402         struct kiocb                    kiocb;  /* must stay first so its file pointer overlays io_kiocb::file */
 403         u64                             addr;
 404         u64                             len;
 405 };
 406
 407 struct io_connect {     /* io_kiocb::connect */
 408         struct file                     *file;  /* first member: overlays io_kiocb::file */
 409         struct sockaddr __user          *addr;  /* userspace pointer */
 410         int                             addr_len;
 411 };
 412
 413 struct io_sr_msg {      /* io_kiocb::sr_msg */
 414         struct file                     *file;  /* first member: overlays io_kiocb::file */
 415         union {         /* both are userspace pointers sharing one slot */
 416                 struct user_msghdr __user *msg;
 417                 void __user             *buf;
 418         };
 419         int                             msg_flags;
 420         int                             bgid;   /* presumably a buffer group ID, cf. io_provide_buf::bgid */
 421         size_t                          len;
 422         struct io_buffer                *kbuf;
 423 };
 424
 425 struct io_open {        /* io_kiocb::open */
 426         struct file                     *file;  /* first member: overlays io_kiocb::file */
 427         int                             dfd;
 428         struct filename                 *filename;
 429         struct open_how                 how;
 430         unsigned long                   nofile; /* presumably the RLIMIT_NOFILE value to apply - confirm */
 431 };
 432
 433 struct io_files_update {        /* io_kiocb::files_update */
 434         struct file                     *file;  /* first member: overlays io_kiocb::file */
 435         u64                             arg;
 436         u32                             nr_args;
 437         u32                             offset;
 438 };
 439
 440 struct io_fadvise {     /* io_kiocb::fadvise */
 441         struct file                     *file;  /* first member: overlays io_kiocb::file */
 442         u64                             offset;
 443         u32                             len;
 444         u32                             advice;
 445 };
 446
 447 struct io_madvise {     /* io_kiocb::madvise */
 448         struct file                     *file;  /* first member: overlays io_kiocb::file */
 449         u64                             addr;
 450         u32                             len;
 451         u32                             advice;
 452 };
 453
 454 struct io_epoll {       /* io_kiocb::epoll */
 455         struct file                     *file;  /* first member: overlays io_kiocb::file */
 456         int                             epfd;
 457         int                             op;
 458         int                             fd;
 459         struct epoll_event              event;
 460 };
 461
 462 struct io_splice {      /* io_kiocb::splice */
 463         struct file                     *file_out;      /* first member, so this is what overlays io_kiocb::file */
 464         struct file                     *file_in;
 465         loff_t                          off_out;
 466         loff_t                          off_in;
 467         u64                             len;
 468         unsigned int                    flags;
 469 };
 470
 471 struct io_provide_buf { /* io_kiocb::pbuf */
 472         struct file                     *file;  /* first member: overlays io_kiocb::file */
 473         __u64                           addr;
 474         __s32                           len;
 475         __u32                           bgid;   /* presumably buffer group ID - confirm */
 476         __u16                           nbufs;  /* presumably number of buffers - confirm */
 477         __u16                           bid;    /* presumably (starting) buffer ID - confirm */
 478 };
 479
 480 struct io_statx {       /* io_kiocb::statx */
 481         struct file                     *file;  /* first member: overlays io_kiocb::file */
 482         int                             dfd;
 483         unsigned int                    mask;
 484         unsigned int                    flags;
 485         const char __user               *filename;      /* userspace pointer */
 486         struct statx __user             *buffer;        /* userspace pointer */
 487 };
 488
 489 struct io_async_connect {       /* io_async_ctx::connect */
 490         struct sockaddr_storage         address;
 491 };
 492
 493 struct io_async_msghdr {        /* io_async_ctx::msg */
 494         struct iovec                    fast_iov[UIO_FASTIOV];  /* inline iovec storage */
 495         struct iovec                    *iov;   /* presumably fast_iov or a separate allocation - confirm */
 496         struct sockaddr __user          *uaddr; /* userspace pointer */
 497         struct msghdr                   msg;
 498         struct sockaddr_storage         addr;
 499 };
 500
 501 struct io_async_rw {    /* io_async_ctx::rw */
 502         struct iovec                    fast_iov[UIO_FASTIOV];  /* inline iovec storage */
 503         struct iovec                    *iov;   /* presumably fast_iov or a separate allocation - confirm */
 504         ssize_t                         nr_segs;
 505         ssize_t                         size;
 506 };
 507
 508 struct io_async_ctx {   /* pointed to by io_kiocb::io */
 509         union {         /* all four members share the same storage */
 510                 struct io_async_rw      rw;
 511                 struct io_async_msghdr  msg;
 512                 struct io_async_connect connect;
 513                 struct io_timeout_data  timeout;
 514         };
 515 };
 516
 517 enum {  /* bit numbers for the REQ_F_* flags */
 518         REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT, /* first six reuse the IOSQE_*_BIT numbers */
 519         REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 520         REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 521         REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 522         REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 523         REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 524
 525         REQ_F_LINK_HEAD_BIT,    /* from here on: numbered consecutively after the entry above */
 526         REQ_F_LINK_NEXT_BIT,
 527         REQ_F_FAIL_LINK_BIT,
 528         REQ_F_INFLIGHT_BIT,
 529         REQ_F_CUR_POS_BIT,
 530         REQ_F_NOWAIT_BIT,
 531         REQ_F_LINK_TIMEOUT_BIT,
 532         REQ_F_TIMEOUT_BIT,
 533         REQ_F_ISREG_BIT,
 534         REQ_F_MUST_PUNT_BIT,
 535         REQ_F_TIMEOUT_NOSEQ_BIT,
 536         REQ_F_COMP_LOCKED_BIT,
 537         REQ_F_NEED_CLEANUP_BIT,
 538         REQ_F_OVERFLOW_BIT,
 539         REQ_F_POLLED_BIT,
 540         REQ_F_BUFFER_SELECTED_BIT,
 541         REQ_F_NO_FILE_TABLE_BIT,
 542         REQ_F_QUEUE_TIMEOUT_BIT,
 543         REQ_F_WORK_INITIALIZED_BIT,
 544         REQ_F_TASK_PINNED_BIT,
 545
 546         /* not a real bit, just to check we're not overflowing the space */
 547         __REQ_F_LAST_BIT,
 548 };
 549
 550 enum {  /* REQ_F_* masks: BIT() of the matching REQ_F_*_BIT number */
 551         /* ctx owns file */
 552         REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 553         /* drain existing IO first */
 554         REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 555         /* linked sqes */
 556         REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 557         /* doesn't sever on completion < 0 */
 558         REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 559         /* IOSQE_ASYNC */
 560         REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 561         /* IOSQE_BUFFER_SELECT */
 562         REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 563
 564         /* head of a link */
 565         REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
 566         /* already grabbed next link */
 567         REQ_F_LINK_NEXT         = BIT(REQ_F_LINK_NEXT_BIT),
 568         /* fail rest of links */
 569         REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 570         /* on inflight list */
 571         REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 572         /* read/write uses file position */
 573         REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 574         /* must not punt to workers */
 575         REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 576         /* has linked timeout */
 577         REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 578         /* timeout request */
 579         REQ_F_TIMEOUT           = BIT(REQ_F_TIMEOUT_BIT),
 580         /* regular file */
 581         REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 582         /* must be punted even for NONBLOCK */
 583         REQ_F_MUST_PUNT         = BIT(REQ_F_MUST_PUNT_BIT),
 584         /* no timeout sequence */
 585         REQ_F_TIMEOUT_NOSEQ     = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
 586         /* completion under lock */
 587         REQ_F_COMP_LOCKED       = BIT(REQ_F_COMP_LOCKED_BIT),
 588         /* needs cleanup */
 589         REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 590         /* in overflow list */
 591         REQ_F_OVERFLOW          = BIT(REQ_F_OVERFLOW_BIT),
 592         /* already went through poll handler */
 593         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 594         /* buffer already selected */
 595         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 596         /* doesn't need file table for this request */
 597         REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 598         /* needs to queue linked timeout */
 599         REQ_F_QUEUE_TIMEOUT     = BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 600         /* io_wq_work is initialized */
 601         REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
 602         /* req->task is refcounted */
 603         REQ_F_TASK_PINNED       = BIT(REQ_F_TASK_PINNED_BIT),
 604 };
 605
 606 struct async_poll {     /* pointed to by io_kiocb::apoll */
 607         struct io_poll_iocb     poll;
 608         struct io_wq_work       work;   /* presumably a saved copy of io_kiocb::work - confirm */
 609 };
 610
 611 /*
 612  * NOTE! Each of the iocb union members has the file pointer
 613  * as the first entry in their struct definition. So you can
 614  * access the file pointer through any of the sub-structs,
 615  * or directly as just 'file' in this struct.
 616  */
 617 struct io_kiocb {
 618         union {         /* per-command data; every member begins with the file pointer */
 619                 struct file             *file;
 620                 struct io_rw            rw;
 621                 struct io_poll_iocb     poll;
 622                 struct io_accept        accept;
 623                 struct io_sync          sync;
 624                 struct io_cancel        cancel;
 625                 struct io_timeout       timeout;
 626                 struct io_connect       connect;
 627                 struct io_sr_msg        sr_msg;
 628                 struct io_open          open;
 629                 struct io_close         close;
 630                 struct io_files_update  files_update;
 631                 struct io_fadvise       fadvise;
 632                 struct io_madvise       madvise;
 633                 struct io_epoll         epoll;
 634                 struct io_splice        splice;
 635                 struct io_provide_buf   pbuf;
 636                 struct io_statx         statx;
 637         };
 638
 639         struct io_async_ctx             *io;    /* async context, see struct io_async_ctx */
 640         int                             cflags;
 641         u8                              opcode; /* presumably an IORING_OP_* value - confirm */
 642         /* polled IO has completed */
 643         u8                              iopoll_completed;
 644
 645         u16                             buf_index;
 646
 647         struct io_ring_ctx      *ctx;
 648         struct list_head        list;
 649         unsigned int            flags;  /* presumably REQ_F_* bits - confirm */
 650         refcount_t              refs;
 651         struct task_struct      *task;  /* refcounted when REQ_F_TASK_PINNED is set */
 652         unsigned long           fsize;
 653         u64                     user_data;
 654         u32                     result;
 655         u32                     sequence;
 656
 657         struct list_head        link_list;
 658
 659         struct list_head        inflight_entry; /* presumably linkage on ctx->inflight_list - confirm */
 660
 661         struct percpu_ref       *fixed_file_refs;
 662
 663         union {         /* the anonymous struct and 'work' share the same storage */
 664                 /*
 665                  * Only commands that never go async can use the below fields,
 666                  * obviously. Right now only IORING_OP_POLL_ADD uses them, and
 667                  * async armed poll handlers for regular commands. The latter
 668                  * restore the work, if needed.
 669                  */
 670                 struct {
 671                         struct callback_head    task_work;
 672                         struct hlist_node       hash_node;
 673                         struct async_poll       *apoll;
 674                 };
 675                 struct io_wq_work       work;   /* valid once REQ_F_WORK_INITIALIZED is set */
 676         };
 677 };
 678
 679 #define IO_PLUG_THRESHOLD               2
 680 #define IO_IOPOLL_BATCH                 8       /* also sizes io_submit_state::reqs[] */
 681
 682 struct io_submit_state {
 683         struct blk_plug         plug;
 684
 685         /*
 686          * io_kiocb alloc cache
 687          */
 688         void                    *reqs[IO_IOPOLL_BATCH];
 689         unsigned int            free_reqs;      /* presumably the number of unused entries in reqs[] - confirm */
 690
 691         /*
 692          * File reference cache
 693          */
 694         struct file             *file;
 695         unsigned int            fd;
 696         unsigned int            has_refs;
 697         unsigned int            used_refs;
 698         unsigned int            ios_left;
 699 };
 700
 701 struct io_op_def {      /* per-opcode properties; element type of io_op_defs[] */
 702         /* needs req->io allocated for deferral/async */
 703         unsigned                async_ctx : 1;
 704         /* needs current->mm setup, does mm access */
 705         unsigned                needs_mm : 1;
 706         /* needs req->file assigned */
 707         unsigned                needs_file : 1;
 708         /* don't fail if file grab fails */
 709         unsigned                needs_file_no_error : 1;
 710         /* hash wq insertion if file is a regular file */
 711         unsigned                hash_reg_file : 1;
 712         /* unbound wq insertion if file is a non-regular file */
 713         unsigned                unbound_nonreg_file : 1;
 714         /* opcode is not supported by this kernel */
 715         unsigned                not_supported : 1;
 716         /* needs file table */
 717         unsigned                file_table : 1;
 718         /* needs ->fs */
 719         unsigned                needs_fs : 1;
 720         /* set if opcode supports polled "wait" (applies to pollin and pollout) */
 721         unsigned                pollin : 1;
 722         unsigned                pollout : 1;
 723         /* op supports buffer selection */
 724         unsigned                buffer_select : 1;
 725 };
 726
 727 static const struct io_op_def io_op_defs[] = {
 728         [IORING_OP_NOP] = {},
 729         [IORING_OP_READV] = {
 730                 .async_ctx              = 1,
 731                 .needs_mm               = 1,
 732                 .needs_file             = 1,
 733                 .unbound_nonreg_file    = 1,
 734                 .pollin                 = 1,
 735                 .buffer_select          = 1,
 736         },
 737         [IORING_OP_WRITEV] = {
 738                 .async_ctx              = 1,
 739                 .needs_mm               = 1,
 740                 .needs_file             = 1,
 741                 .hash_reg_file          = 1,
 742                 .unbound_nonreg_file    = 1,
 743                 .pollout                = 1,
 744         },
 745         [IORING_OP_FSYNC] = {
 746                 .needs_file             = 1,
 747         },
 748         [IORING_OP_READ_FIXED] = {
 749                 .needs_file             = 1,
 750                 .unbound_nonreg_file    = 1,
 751                 .pollin                 = 1,
 752         },
 753         [IORING_OP_WRITE_FIXED] = {
 754                 .needs_file             = 1,
 755                 .hash_reg_file          = 1,
 756                 .unbound_nonreg_file    = 1,
 757                 .pollout                = 1,
 758         },
 759         [IORING_OP_POLL_ADD] = {
 760                 .needs_file             = 1,
 761                 .unbound_nonreg_file    = 1,
 762         },
 763         [IORING_OP_POLL_REMOVE] = {},
 764         [IORING_OP_SYNC_FILE_RANGE] = {
 765                 .needs_file             = 1,
 766         },
 767         [IORING_OP_SENDMSG] = {
 768                 .async_ctx              = 1,
 769                 .needs_mm               = 1,
 770                 .needs_file             = 1,
 771                 .unbound_nonreg_file    = 1,
 772                 .needs_fs               = 1,
 773                 .pollout                = 1,
 774         },
 775         [IORING_OP_RECVMSG] = {
 776                 .async_ctx              = 1,
 777                 .needs_mm               = 1,
 778                 .needs_file             = 1,
 779                 .unbound_nonreg_file    = 1,
 780                 .needs_fs               = 1,
 781                 .pollin                 = 1,
 782                 .buffer_select          = 1,
 783         },
 784         [IORING_OP_TIMEOUT] = {
 785                 .async_ctx              = 1,
 786                 .needs_mm               = 1,
 787         },
 788         [IORING_OP_TIMEOUT_REMOVE] = {},
 789         [IORING_OP_ACCEPT] = {
 790                 .needs_mm               = 1,
 791                 .needs_file             = 1,
 792                 .unbound_nonreg_file    = 1,
 793                 .file_table             = 1,
 794                 .pollin                 = 1,
 795         },
 796         [IORING_OP_ASYNC_CANCEL] = {},
 797         [IORING_OP_LINK_TIMEOUT] = {
 798                 .async_ctx              = 1,
 799                 .needs_mm               = 1,
 800         },
 801         [IORING_OP_CONNECT] = {
 802                 .async_ctx              = 1,
 803                 .needs_mm               = 1,
 804                 .needs_file             = 1,
 805                 .unbound_nonreg_file    = 1,
 806                 .pollout                = 1,
 807         },
 808         [IORING_OP_FALLOCATE] = {
 809                 .needs_file             = 1,
 810         },
 811         [IORING_OP_OPENAT] = {
 812                 .file_table             = 1,
 813                 .needs_fs               = 1,
 814         },
 815         [IORING_OP_CLOSE] = {
 816                 .needs_file             = 1,
 817                 .needs_file_no_error    = 1,
 818                 .file_table             = 1,
 819         },
 820         [IORING_OP_FILES_UPDATE] = {
 821                 .needs_mm               = 1,
 822                 .file_table             = 1,
 823         },
 824         [IORING_OP_STATX] = {
 825                 .needs_mm               = 1,
 826                 .needs_fs               = 1,
 827                 .file_table             = 1,
 828         },
 829         [IORING_OP_READ] = {
 830                 .needs_mm               = 1,
 831                 .needs_file             = 1,
 832                 .unbound_nonreg_file    = 1,
 833                 .pollin                 = 1,
 834                 .buffer_select          = 1,
 835         },
 836         [IORING_OP_WRITE] = {
 837                 .needs_mm               = 1,
 838                 .needs_file             = 1,
 839                 .unbound_nonreg_file    = 1,
 840                 .pollout                = 1,
 841         },
 842         [IORING_OP_FADVISE] = {
 843                 .needs_file             = 1,
 844         },
 845         [IORING_OP_MADVISE] = {
 846                 .needs_mm               = 1,
 847         },
 848         [IORING_OP_SEND] = {
 849                 .needs_mm               = 1,
 850                 .needs_file             = 1,
 851                 .unbound_nonreg_file    = 1,
 852                 .pollout                = 1,
 853         },
 854         [IORING_OP_RECV] = {
 855                 .needs_mm               = 1,
 856                 .needs_file             = 1,
 857                 .unbound_nonreg_file    = 1,
 858                 .pollin                 = 1,
 859                 .buffer_select          = 1,
 860         },
 861         [IORING_OP_OPENAT2] = {
 862                 .file_table             = 1,
 863                 .needs_fs               = 1,
 864         },
 865         [IORING_OP_EPOLL_CTL] = {
 866                 .unbound_nonreg_file    = 1,
 867                 .file_table             = 1,
 868         },
 869         [IORING_OP_SPLICE] = {
 870                 .needs_file             = 1,
 871                 .hash_reg_file          = 1,
 872                 .unbound_nonreg_file    = 1,
 873         },
 874         [IORING_OP_PROVIDE_BUFFERS] = {},
 875         [IORING_OP_REMOVE_BUFFERS] = {},
 876         [IORING_OP_TEE] = {
 877                 .needs_file             = 1,
 878                 .hash_reg_file          = 1,
 879                 .unbound_nonreg_file    = 1,
 880         },
 881 };
 882
 883 static void io_wq_submit_work(struct io_wq_work **workptr);
 884 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 885 static void io_put_req(struct io_kiocb *req);
 886 static void __io_double_put_req(struct io_kiocb *req);
 887 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 888 static void io_queue_linked_timeout(struct io_kiocb *req);
 889 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 890                                  struct io_uring_files_update *ip,
 891                                  unsigned nr_args);
 892 static int io_grab_files(struct io_kiocb *req);
 893 static void io_complete_rw_common(struct kiocb *kiocb, long res);
 894 static void io_cleanup_req(struct io_kiocb *req);
 895 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 896                        int fd, struct file **out_file, bool fixed);
 897 static void __io_queue_sqe(struct io_kiocb *req,
 898                            const struct io_uring_sqe *sqe);
 899
 900 static struct kmem_cache *req_cachep;
 901
 902 static const struct file_operations io_uring_fops;
 903
 904 struct sock *io_uring_get_socket(struct file *file)
 905 {
 906 #if defined(CONFIG_UNIX)
 907         if (file->f_op == &io_uring_fops) {
 908                 struct io_ring_ctx *ctx = file->private_data;
 909
 910                 return ctx->ring_sock->sk;
 911         }
 912 #endif
 913         return NULL;
 914 }
 915 EXPORT_SYMBOL(io_uring_get_socket);
 916
 917 static void io_get_req_task(struct io_kiocb *req)
 918 {
 919         if (req->flags & REQ_F_TASK_PINNED)
 920                 return;
 921         get_task_struct(req->task);
 922         req->flags |= REQ_F_TASK_PINNED;
 923 }
 924
 925 /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
 926 static void __io_put_req_task(struct io_kiocb *req)
 927 {
 928         if (req->flags & REQ_F_TASK_PINNED)
 929                 put_task_struct(req->task);
 930 }
 931
 932 static void io_file_put_work(struct work_struct *work);
 933
 934 /*
 935  * Note: must call io_req_init_async() for the first time you
 936  * touch any members of io_wq_work.
 937  */
 938 static inline void io_req_init_async(struct io_kiocb *req)
 939 {
 940         if (req->flags & REQ_F_WORK_INITIALIZED)
 941                 return;
 942
 943         memset(&req->work, 0, sizeof(req->work));
 944         req->flags |= REQ_F_WORK_INITIALIZED;
 945 }
 946
 947 static inline bool io_async_submit(struct io_ring_ctx *ctx)
 948 {
 949         return ctx->flags & IORING_SETUP_SQPOLL;
 950 }
 951
 952 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 953 {
 954         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
 955
 956         complete(&ctx->ref_comp);
 957 }
 958
 959 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 960 {
 961         struct io_ring_ctx *ctx;
 962         int hash_bits;
 963
 964         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 965         if (!ctx)
 966                 return NULL;
 967
 968         ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
 969         if (!ctx->fallback_req)
 970                 goto err;
 971
 972         /*
 973          * Use 5 bits less than the max cq entries, that should give us around
 974          * 32 entries per hash list if totally full and uniformly spread.
 975          */
 976         hash_bits = ilog2(p->cq_entries);
 977         hash_bits -= 5;
 978         if (hash_bits <= 0)
 979                 hash_bits = 1;
 980         ctx->cancel_hash_bits = hash_bits;
 981         ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
 982                                         GFP_KERNEL);
 983         if (!ctx->cancel_hash)
 984                 goto err;
 985         __hash_init(ctx->cancel_hash, 1U << hash_bits);
 986
 987         if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 988                             PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 989                 goto err;
 990
 991         ctx->flags = p->flags;
 992         init_waitqueue_head(&ctx->sqo_wait);
 993         init_waitqueue_head(&ctx->cq_wait);
 994         INIT_LIST_HEAD(&ctx->cq_overflow_list);
 995         init_completion(&ctx->ref_comp);
 996         init_completion(&ctx->sq_thread_comp);
 997         idr_init(&ctx->io_buffer_idr);
 998         idr_init(&ctx->personality_idr);
 999         mutex_init(&ctx->uring_lock);
1000         init_waitqueue_head(&ctx->wait);
1001         spin_lock_init(&ctx->completion_lock);
1002         INIT_LIST_HEAD(&ctx->poll_list);
1003         INIT_LIST_HEAD(&ctx->defer_list);
1004         INIT_LIST_HEAD(&ctx->timeout_list);
1005         init_waitqueue_head(&ctx->inflight_wait);
1006         spin_lock_init(&ctx->inflight_lock);
1007         INIT_LIST_HEAD(&ctx->inflight_list);
1008         INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
1009         init_llist_head(&ctx->file_put_llist);
1010         return ctx;
1011 err:
1012         if (ctx->fallback_req)
1013                 kmem_cache_free(req_cachep, ctx->fallback_req);
1014         kfree(ctx->cancel_hash);
1015         kfree(ctx);
1016         return NULL;
1017 }
1018
1019 static inline bool __req_need_defer(struct io_kiocb *req)
1020 {
1021         struct io_ring_ctx *ctx = req->ctx;
1022
1023         return req->sequence != ctx->cached_cq_tail
1024                                 + atomic_read(&ctx->cached_cq_overflow);
1025 }
1026
1027 static inline bool req_need_defer(struct io_kiocb *req)
1028 {
1029         if (unlikely(req->flags & REQ_F_IO_DRAIN))
1030                 return __req_need_defer(req);
1031
1032         return false;
1033 }
1034
1035 static void __io_commit_cqring(struct io_ring_ctx *ctx)
1036 {
1037         struct io_rings *rings = ctx->rings;
1038
1039         /* order cqe stores with ring update */
1040         smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
1041
1042         if (wq_has_sleeper(&ctx->cq_wait)) {
1043                 wake_up_interruptible(&ctx->cq_wait);
1044                 kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1045         }
1046 }
1047
1048 static inline void io_req_work_grab_env(struct io_kiocb *req,
1049                                         const struct io_op_def *def)
1050 {
1051         if (!req->work.mm && def->needs_mm) {
1052                 mmgrab(current->mm);
1053                 req->work.mm = current->mm;
1054         }
1055         if (!req->work.creds)
1056                 req->work.creds = get_current_cred();
1057         if (!req->work.fs && def->needs_fs) {
1058                 spin_lock(&current->fs->lock);
1059                 if (!current->fs->in_exec) {
1060                         req->work.fs = current->fs;
1061                         req->work.fs->users++;
1062                 } else {
1063                         req->work.flags |= IO_WQ_WORK_CANCEL;
1064                 }
1065                 spin_unlock(&current->fs->lock);
1066         }
1067 }
1068
1069 static inline void io_req_work_drop_env(struct io_kiocb *req)
1070 {
1071         if (!(req->flags & REQ_F_WORK_INITIALIZED))
1072                 return;
1073
1074         if (req->work.mm) {
1075                 mmdrop(req->work.mm);
1076                 req->work.mm = NULL;
1077         }
1078         if (req->work.creds) {
1079                 put_cred(req->work.creds);
1080                 req->work.creds = NULL;
1081         }
1082         if (req->work.fs) {
1083                 struct fs_struct *fs = req->work.fs;
1084
1085                 spin_lock(&req->work.fs->lock);
1086                 if (--fs->users)
1087                         fs = NULL;
1088                 spin_unlock(&req->work.fs->lock);
1089                 if (fs)
1090                         free_fs_struct(fs);
1091         }
1092 }
1093
1094 static inline void io_prep_async_work(struct io_kiocb *req,
1095                                       struct io_kiocb **link)
1096 {
1097         const struct io_op_def *def = &io_op_defs[req->opcode];
1098
1099         if (req->flags & REQ_F_ISREG) {
1100                 if (def->hash_reg_file)
1101                         io_wq_hash_work(&req->work, file_inode(req->file));
1102         } else {
1103                 if (def->unbound_nonreg_file)
1104                         req->work.flags |= IO_WQ_WORK_UNBOUND;
1105         }
1106
1107         io_req_init_async(req);
1108         io_req_work_grab_env(req, def);
1109
1110         *link = io_prep_linked_timeout(req);
1111 }
1112
1113 static inline void io_queue_async_work(struct io_kiocb *req)
1114 {
1115         struct io_ring_ctx *ctx = req->ctx;
1116         struct io_kiocb *link;
1117
1118         io_prep_async_work(req, &link);
1119
1120         trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1121                                         &req->work, req->flags);
1122         io_wq_enqueue(ctx->io_wq, &req->work);
1123
1124         if (link)
1125                 io_queue_linked_timeout(link);
1126 }
1127
1128 static void io_kill_timeout(struct io_kiocb *req)
1129 {
1130         int ret;
1131
1132         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1133         if (ret != -1) {
1134                 atomic_inc(&req->ctx->cq_timeouts);
1135                 list_del_init(&req->list);
1136                 req->flags |= REQ_F_COMP_LOCKED;
1137                 io_cqring_fill_event(req, 0);
1138                 io_put_req(req);
1139         }
1140 }
1141
1142 static void io_kill_timeouts(struct io_ring_ctx *ctx)
1143 {
1144         struct io_kiocb *req, *tmp;
1145
1146         spin_lock_irq(&ctx->completion_lock);
1147         list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
1148                 io_kill_timeout(req);
1149         spin_unlock_irq(&ctx->completion_lock);
1150 }
1151
1152 static void __io_queue_deferred(struct io_ring_ctx *ctx)
1153 {
1154         do {
1155                 struct io_kiocb *req = list_first_entry(&ctx->defer_list,
1156                                                         struct io_kiocb, list);
1157
1158                 if (req_need_defer(req))
1159                         break;
1160                 list_del_init(&req->list);
1161                 io_queue_async_work(req);
1162         } while (!list_empty(&ctx->defer_list));
1163 }
1164
1165 static void io_flush_timeouts(struct io_ring_ctx *ctx)
1166 {
1167         while (!list_empty(&ctx->timeout_list)) {
1168                 struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1169                                                         struct io_kiocb, list);
1170
1171                 if (req->flags & REQ_F_TIMEOUT_NOSEQ)
1172                         break;
1173                 if (req->timeout.target_seq != ctx->cached_cq_tail
1174                                         - atomic_read(&ctx->cq_timeouts))
1175                         break;
1176
1177                 list_del_init(&req->list);
1178                 io_kill_timeout(req);
1179         }
1180 }
1181
1182 static void io_commit_cqring(struct io_ring_ctx *ctx)
1183 {
1184         io_flush_timeouts(ctx);
1185         __io_commit_cqring(ctx);
1186
1187         if (unlikely(!list_empty(&ctx->defer_list)))
1188                 __io_queue_deferred(ctx);
1189 }
1190
1191 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1192 {
1193         struct io_rings *rings = ctx->rings;
1194         unsigned tail;
1195
1196         tail = ctx->cached_cq_tail;
1197         /*
1198          * writes to the cq entry need to come after reading head; the
1199          * control dependency is enough as we're using WRITE_ONCE to
1200          * fill the cq entry
1201          */
1202         if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
1203                 return NULL;
1204
1205         ctx->cached_cq_tail++;
1206         return &rings->cqes[tail & ctx->cq_mask];
1207 }
1208
1209 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1210 {
1211         if (!ctx->cq_ev_fd)
1212                 return false;
1213         if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1214                 return false;
1215         if (!ctx->eventfd_async)
1216                 return true;
1217         return io_wq_current_is_worker();
1218 }
1219
1220 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1221 {
1222         if (waitqueue_active(&ctx->wait))
1223                 wake_up(&ctx->wait);
1224         if (waitqueue_active(&ctx->sqo_wait))
1225                 wake_up(&ctx->sqo_wait);
1226         if (io_should_trigger_evfd(ctx))
1227                 eventfd_signal(ctx->cq_ev_fd, 1);
1228 }
1229
1230 /* Returns true if there are no backlogged entries after the flush */
1231 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1232 {
1233         struct io_rings *rings = ctx->rings;
1234         struct io_uring_cqe *cqe;
1235         struct io_kiocb *req;
1236         unsigned long flags;
1237         LIST_HEAD(list);
1238
1239         if (!force) {
1240                 if (list_empty_careful(&ctx->cq_overflow_list))
1241                         return true;
1242                 if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1243                     rings->cq_ring_entries))
1244                         return false;
1245         }
1246
1247         spin_lock_irqsave(&ctx->completion_lock, flags);
1248
1249         /* if force is set, the ring is going away. always drop after that */
1250         if (force)
1251                 ctx->cq_overflow_flushed = 1;
1252
1253         cqe = NULL;
1254         while (!list_empty(&ctx->cq_overflow_list)) {
1255                 cqe = io_get_cqring(ctx);
1256                 if (!cqe && !force)
1257                         break;
1258
1259                 req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
1260                                                 list);
1261                 list_move(&req->list, &list);
1262                 req->flags &= ~REQ_F_OVERFLOW;
1263                 if (cqe) {
1264                         WRITE_ONCE(cqe->user_data, req->user_data);
1265                         WRITE_ONCE(cqe->res, req->result);
1266                         WRITE_ONCE(cqe->flags, req->cflags);
1267                 } else {
1268                         WRITE_ONCE(ctx->rings->cq_overflow,
1269                                 atomic_inc_return(&ctx->cached_cq_overflow));
1270                 }
1271         }
1272
1273         io_commit_cqring(ctx);
1274         if (cqe) {
1275                 clear_bit(0, &ctx->sq_check_overflow);
1276                 clear_bit(0, &ctx->cq_check_overflow);
1277                 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1278         }
1279         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1280         io_cqring_ev_posted(ctx);
1281
1282         while (!list_empty(&list)) {
1283                 req = list_first_entry(&list, struct io_kiocb, list);
1284                 list_del(&req->list);
1285                 io_put_req(req);
1286         }
1287
1288         return cqe != NULL;
1289 }
1290
1291 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1292 {
1293         struct io_ring_ctx *ctx = req->ctx;
1294         struct io_uring_cqe *cqe;
1295
1296         trace_io_uring_complete(ctx, req->user_data, res);
1297
1298         /*
1299          * If we can't get a cq entry, userspace overflowed the
1300          * submission (by quite a lot). Increment the overflow count in
1301          * the ring.
1302          */
1303         cqe = io_get_cqring(ctx);
1304         if (likely(cqe)) {
1305                 WRITE_ONCE(cqe->user_data, req->user_data);
1306                 WRITE_ONCE(cqe->res, res);
1307                 WRITE_ONCE(cqe->flags, cflags);
1308         } else if (ctx->cq_overflow_flushed) {
1309                 WRITE_ONCE(ctx->rings->cq_overflow,
1310                                 atomic_inc_return(&ctx->cached_cq_overflow));
1311         } else {
1312                 if (list_empty(&ctx->cq_overflow_list)) {
1313                         set_bit(0, &ctx->sq_check_overflow);
1314                         set_bit(0, &ctx->cq_check_overflow);
1315                         ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
1316                 }
1317                 req->flags |= REQ_F_OVERFLOW;
1318                 refcount_inc(&req->refs);
1319                 req->result = res;
1320                 req->cflags = cflags;
1321                 list_add_tail(&req->list, &ctx->cq_overflow_list);
1322         }
1323 }
1324
/* Fill a completion for @req with result @res and no CQE flags. */
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
        const long no_cflags = 0;

        __io_cqring_fill_event(req, res, no_cflags);
}
1329
1330 static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
1331 {
1332         struct io_ring_ctx *ctx = req->ctx;
1333         unsigned long flags;
1334
1335         spin_lock_irqsave(&ctx->completion_lock, flags);
1336         __io_cqring_fill_event(req, res, cflags);
1337         io_commit_cqring(ctx);
1338         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1339
1340         io_cqring_ev_posted(ctx);
1341 }
1342
/* Post a completion for @req with result @res and no CQE flags. */
static void io_cqring_add_event(struct io_kiocb *req, long res)
{
        const long no_cflags = 0;

        __io_cqring_add_event(req, res, no_cflags);
}
1347
1348 static inline bool io_is_fallback_req(struct io_kiocb *req)
1349 {
1350         return req == (struct io_kiocb *)
1351                         ((unsigned long) req->ctx->fallback_req & ~1UL);
1352 }
1353
1354 static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1355 {
1356         struct io_kiocb *req;
1357
1358         req = ctx->fallback_req;
1359         if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
1360                 return req;
1361
1362         return NULL;
1363 }
1364
1365 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1366                                      struct io_submit_state *state)
1367 {
1368         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1369         struct io_kiocb *req;
1370
1371         if (!state) {
1372                 req = kmem_cache_alloc(req_cachep, gfp);
1373                 if (unlikely(!req))
1374                         goto fallback;
1375         } else if (!state->free_reqs) {
1376                 size_t sz;
1377                 int ret;
1378
1379                 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1380                 ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1381
1382                 /*
1383                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
1384                  * retry single alloc to be on the safe side.
1385                  */
1386                 if (unlikely(ret <= 0)) {
1387                         state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1388                         if (!state->reqs[0])
1389                                 goto fallback;
1390                         ret = 1;
1391                 }
1392                 state->free_reqs = ret - 1;
1393                 req = state->reqs[ret - 1];
1394         } else {
1395                 state->free_reqs--;
1396                 req = state->reqs[state->free_reqs];
1397         }
1398
1399         return req;
1400 fallback:
1401         return io_get_fallback_req(ctx);
1402 }
1403
1404 static inline void io_put_file(struct io_kiocb *req, struct file *file,
1405                           bool fixed)
1406 {
1407         if (fixed)
1408                 percpu_ref_put(req->fixed_file_refs);
1409         else
1410                 fput(file);
1411 }
1412
1413 static void __io_req_aux_free(struct io_kiocb *req)
1414 {
1415         if (req->flags & REQ_F_NEED_CLEANUP)
1416                 io_cleanup_req(req);
1417
1418         kfree(req->io);
1419         if (req->file)
1420                 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1421         __io_put_req_task(req);
1422         io_req_work_drop_env(req);
1423 }
1424
1425 static void __io_free_req(struct io_kiocb *req)
1426 {
1427         __io_req_aux_free(req);
1428
1429         if (req->flags & REQ_F_INFLIGHT) {
1430                 struct io_ring_ctx *ctx = req->ctx;
1431                 unsigned long flags;
1432
1433                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1434                 list_del(&req->inflight_entry);
1435                 if (waitqueue_active(&ctx->inflight_wait))
1436                         wake_up(&ctx->inflight_wait);
1437                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1438         }
1439
1440         percpu_ref_put(&req->ctx->refs);
1441         if (likely(!io_is_fallback_req(req)))
1442                 kmem_cache_free(req_cachep, req);
1443         else
1444                 clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
1445 }
1446
1447 struct req_batch {
1448         void *reqs[IO_IOPOLL_BATCH];
1449         int to_free;
1450         int need_iter;
1451 };
1452
1453 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
1454 {
1455         if (!rb->to_free)
1456                 return;
1457         if (rb->need_iter) {
1458                 int i, inflight = 0;
1459                 unsigned long flags;
1460
1461                 for (i = 0; i < rb->to_free; i++) {
1462                         struct io_kiocb *req = rb->reqs[i];
1463
1464                         if (req->flags & REQ_F_INFLIGHT)
1465                                 inflight++;
1466                         __io_req_aux_free(req);
1467                 }
1468                 if (!inflight)
1469                         goto do_free;
1470
1471                 spin_lock_irqsave(&ctx->inflight_lock, flags);
1472                 for (i = 0; i < rb->to_free; i++) {
1473                         struct io_kiocb *req = rb->reqs[i];
1474
1475                         if (req->flags & REQ_F_INFLIGHT) {
1476                                 list_del(&req->inflight_entry);
1477                                 if (!--inflight)
1478                                         break;
1479                         }
1480                 }
1481                 spin_unlock_irqrestore(&ctx->inflight_lock, flags);
1482
1483                 if (waitqueue_active(&ctx->inflight_wait))
1484                         wake_up(&ctx->inflight_wait);
1485         }
1486 do_free:
1487         kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
1488         percpu_ref_put_many(&ctx->refs, rb->to_free);
1489         rb->to_free = rb->need_iter = 0;
1490 }
1491
1492 static bool io_link_cancel_timeout(struct io_kiocb *req)
1493 {
1494         struct io_ring_ctx *ctx = req->ctx;
1495         int ret;
1496
1497         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
1498         if (ret != -1) {
1499                 io_cqring_fill_event(req, -ECANCELED);
1500                 io_commit_cqring(ctx);
1501                 req->flags &= ~REQ_F_LINK_HEAD;
1502                 io_put_req(req);
1503                 return true;
1504         }
1505
1506         return false;
1507 }
1508
1509 static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1510 {
1511         struct io_ring_ctx *ctx = req->ctx;
1512         bool wake_ev = false;
1513
1514         /* Already got next link */
1515         if (req->flags & REQ_F_LINK_NEXT)
1516                 return;
1517
1518         /*
1519          * The list should never be empty when we are called here. But could
1520          * potentially happen if the chain is messed up, check to be on the
1521          * safe side.
1522          */
1523         while (!list_empty(&req->link_list)) {
1524                 struct io_kiocb *nxt = list_first_entry(&req->link_list,
1525                                                 struct io_kiocb, link_list);
1526
1527                 if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
1528                              (nxt->flags & REQ_F_TIMEOUT))) {
1529                         list_del_init(&nxt->link_list);
1530                         wake_ev |= io_link_cancel_timeout(nxt);
1531                         req->flags &= ~REQ_F_LINK_TIMEOUT;
1532                         continue;
1533                 }
1534
1535                 list_del_init(&req->link_list);
1536                 if (!list_empty(&nxt->link_list))
1537                         nxt->flags |= REQ_F_LINK_HEAD;
1538                 *nxtptr = nxt;
1539                 break;
1540         }
1541
1542         req->flags |= REQ_F_LINK_NEXT;
1543         if (wake_ev)
1544                 io_cqring_ev_posted(ctx);
1545 }
1546
1547 /*
1548  * Called if REQ_F_LINK_HEAD is set, and we fail the head request
1549  */
1550 static void io_fail_links(struct io_kiocb *req)
1551 {
1552         struct io_ring_ctx *ctx = req->ctx;
1553         unsigned long flags;
1554
1555         spin_lock_irqsave(&ctx->completion_lock, flags);
1556
1557         while (!list_empty(&req->link_list)) {
1558                 struct io_kiocb *link = list_first_entry(&req->link_list,
1559                                                 struct io_kiocb, link_list);
1560
1561                 list_del_init(&link->link_list);
1562                 trace_io_uring_fail_link(req, link);
1563
1564                 if ((req->flags & REQ_F_LINK_TIMEOUT) &&
1565                     link->opcode == IORING_OP_LINK_TIMEOUT) {
1566                         io_link_cancel_timeout(link);
1567                 } else {
1568                         io_cqring_fill_event(link, -ECANCELED);
1569                         __io_double_put_req(link);
1570                 }
1571                 req->flags &= ~REQ_F_LINK_TIMEOUT;
1572         }
1573
1574         io_commit_cqring(ctx);
1575         spin_unlock_irqrestore(&ctx->completion_lock, flags);
1576         io_cqring_ev_posted(ctx);
1577 }
1578
1579 static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
1580 {
1581         if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1582                 return;
1583
1584         /*
1585          * If LINK is set, we have dependent requests in this chain. If we
1586          * didn't fail this request, queue the first one up, moving any other
1587          * dependencies to the next request. In case of failure, fail the rest
1588          * of the chain.
1589          */
1590         if (req->flags & REQ_F_FAIL_LINK) {
1591                 io_fail_links(req);
1592         } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
1593                         REQ_F_LINK_TIMEOUT) {
1594                 struct io_ring_ctx *ctx = req->ctx;
1595                 unsigned long flags;
1596
1597                 /*
1598                  * If this is a timeout link, we could be racing with the
1599                  * timeout timer. Grab the completion lock for this case to
1600                  * protect against that.
1601                  */
1602                 spin_lock_irqsave(&ctx->completion_lock, flags);
1603                 io_req_link_next(req, nxt);
1604                 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1605         } else {
1606                 io_req_link_next(req, nxt);
1607         }
1608 }
1609
1610 static void io_free_req(struct io_kiocb *req)
1611 {
1612         struct io_kiocb *nxt = NULL;
1613
1614         io_req_find_next(req, &nxt);
1615         __io_free_req(req);
1616
1617         if (nxt)
1618                 io_queue_async_work(nxt);
1619 }
1620
1621 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
1622 {
1623         struct io_kiocb *link;
1624         const struct io_op_def *def = &io_op_defs[nxt->opcode];
1625
1626         if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
1627                 io_wq_hash_work(&nxt->work, file_inode(nxt->file));
1628
1629         *workptr = &nxt->work;
1630         link = io_prep_linked_timeout(nxt);
1631         if (link)
1632                 nxt->flags |= REQ_F_QUEUE_TIMEOUT;
1633 }
1634
1635 /*
1636  * Drop reference to request, return next in chain (if there is one) if this
1637  * was the last reference to this request.
1638  */
1639 __attribute__((nonnull))
1640 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
1641 {
1642         if (refcount_dec_and_test(&req->refs)) {
1643                 io_req_find_next(req, nxtptr);
1644                 __io_free_req(req);
1645         }
1646 }
1647
1648 static void io_put_req(struct io_kiocb *req)
1649 {
1650         if (refcount_dec_and_test(&req->refs))
1651                 io_free_req(req);
1652 }
1653
1654 static void io_steal_work(struct io_kiocb *req,
1655                           struct io_wq_work **workptr)
1656 {
1657         /*
1658          * It's in an io-wq worker, so there always should be at least
1659          * one reference, which will be dropped in io_put_work() just
1660          * after the current handler returns.
1661          *
1662          * It also means, that if the counter dropped to 1, then there is
1663          * no asynchronous users left, so it's safe to steal the next work.
1664          */
1665         if (refcount_read(&req->refs) == 1) {
1666                 struct io_kiocb *nxt = NULL;
1667
1668                 io_req_find_next(req, &nxt);
1669                 if (nxt)
1670                         io_wq_assign_next(workptr, nxt);
1671         }
1672 }
1673
1674 /*
1675  * Must only be used if we don't need to care about links, usually from
1676  * within the completion handling itself.
1677  */
1678 static void __io_double_put_req(struct io_kiocb *req)
1679 {
1680         /* drop both submit and complete references */
1681         if (refcount_sub_and_test(2, &req->refs))
1682                 __io_free_req(req);
1683 }
1684
1685 static void io_double_put_req(struct io_kiocb *req)
1686 {
1687         /* drop both submit and complete references */
1688         if (refcount_sub_and_test(2, &req->refs))
1689                 io_free_req(req);
1690 }
1691
1692 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
1693 {
1694         struct io_rings *rings = ctx->rings;
1695
1696         if (test_bit(0, &ctx->cq_check_overflow)) {
1697                 /*
1698                  * noflush == true is from the waitqueue handler, just ensure
1699                  * we wake up the task, and the next invocation will flush the
1700                  * entries. We cannot safely to it from here.
1701                  */
1702                 if (noflush && !list_empty(&ctx->cq_overflow_list))
1703                         return -1U;
1704
1705                 io_cqring_overflow_flush(ctx, false);
1706         }
1707
1708         /* See comment at the top of this file */
1709         smp_rmb();
1710         return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
1711 }
1712
1713 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
1714 {
1715         struct io_rings *rings = ctx->rings;
1716
1717         /* make sure SQ entry isn't read before tail */
1718         return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
1719 }
1720
1721 static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
1722 {
1723         if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
1724                 return false;
1725
1726         if (req->file || req->io)
1727                 rb->need_iter++;
1728
1729         rb->reqs[rb->to_free++] = req;
1730         if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
1731                 io_free_req_many(req->ctx, rb);
1732         return true;
1733 }
1734
1735 static int io_put_kbuf(struct io_kiocb *req)
1736 {
1737         struct io_buffer *kbuf;
1738         int cflags;
1739
1740         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
1741         cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
1742         cflags |= IORING_CQE_F_BUFFER;
1743         req->rw.addr = 0;
1744         kfree(kbuf);
1745         return cflags;
1746 }
1747
1748 static void io_iopoll_queue(struct list_head *again)
1749 {
1750         struct io_kiocb *req;
1751
1752         do {
1753                 req = list_first_entry(again, struct io_kiocb, list);
1754                 list_del(&req->list);
1755
1756                 /* shouldn't happen unless io_uring is dying, cancel reqs */
1757                 if (unlikely(!current->mm)) {
1758                         io_complete_rw_common(&req->rw.kiocb, -EAGAIN);
1759                         io_put_req(req);
1760                         continue;
1761                 }
1762
1763                 refcount_inc(&req->refs);
1764                 io_queue_async_work(req);
1765         } while (!list_empty(again));
1766 }
1767
1768 /*
1769  * Find and free completed poll iocbs
1770  */
1771 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
1772                                struct list_head *done)
1773 {
1774         struct req_batch rb;
1775         struct io_kiocb *req;
1776         LIST_HEAD(again);
1777
1778         /* order with ->result store in io_complete_rw_iopoll() */
1779         smp_rmb();
1780
1781         rb.to_free = rb.need_iter = 0;
1782         while (!list_empty(done)) {
1783                 int cflags = 0;
1784
1785                 req = list_first_entry(done, struct io_kiocb, list);
1786                 if (READ_ONCE(req->result) == -EAGAIN) {
1787                         req->iopoll_completed = 0;
1788                         list_move_tail(&req->list, &again);
1789                         continue;
1790                 }
1791                 list_del(&req->list);
1792
1793                 if (req->flags & REQ_F_BUFFER_SELECTED)
1794                         cflags = io_put_kbuf(req);
1795
1796                 __io_cqring_fill_event(req, req->result, cflags);
1797                 (*nr_events)++;
1798
1799                 if (refcount_dec_and_test(&req->refs) &&
1800                     !io_req_multi_free(&rb, req))
1801                         io_free_req(req);
1802         }
1803
1804         io_commit_cqring(ctx);
1805         if (ctx->flags & IORING_SETUP_SQPOLL)
1806                 io_cqring_ev_posted(ctx);
1807         io_free_req_many(ctx, &rb);
1808
1809         if (!list_empty(&again))
1810                 io_iopoll_queue(&again);
1811 }
1812
1813 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
1814                         long min)
1815 {
1816         struct io_kiocb *req, *tmp;
1817         LIST_HEAD(done);
1818         bool spin;
1819         int ret;
1820
1821         /*
1822          * Only spin for completions if we don't have multiple devices hanging
1823          * off our complete list, and we're under the requested amount.
1824          */
1825         spin = !ctx->poll_multi_file && *nr_events < min;
1826
1827         ret = 0;
1828         list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
1829                 struct kiocb *kiocb = &req->rw.kiocb;
1830
1831                 /*
1832                  * Move completed and retryable entries to our local lists.
1833                  * If we find a request that requires polling, break out
1834                  * and complete those lists first, if we have entries there.
1835                  */
1836                 if (READ_ONCE(req->iopoll_completed)) {
1837                         list_move_tail(&req->list, &done);
1838                         continue;
1839                 }
1840                 if (!list_empty(&done))
1841                         break;
1842
1843                 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
1844                 if (ret < 0)
1845                         break;
1846
1847                 if (ret && spin)
1848                         spin = false;
1849                 ret = 0;
1850         }
1851
1852         if (!list_empty(&done))
1853                 io_iopoll_complete(ctx, nr_events, &done);
1854
1855         return ret;
1856 }
1857
1858 /*
1859  * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
1860  * non-spinning poll check - we'll still enter the driver poll loop, but only
1861  * as a non-spinning completion check.
1862  */
1863 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
1864                                 long min)
1865 {
1866         while (!list_empty(&ctx->poll_list) && !need_resched()) {
1867                 int ret;
1868
1869                 ret = io_do_iopoll(ctx, nr_events, min);
1870                 if (ret < 0)
1871                         return ret;
1872                 if (!min || *nr_events >= min)
1873                         return 0;
1874         }
1875
1876         return 1;
1877 }
1878
1879 /*
1880  * We can't just wait for polled events to come to us, we have to actively
1881  * find and complete them.
1882  */
1883 static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
1884 {
1885         if (!(ctx->flags & IORING_SETUP_IOPOLL))
1886                 return;
1887
1888         mutex_lock(&ctx->uring_lock);
1889         while (!list_empty(&ctx->poll_list)) {
1890                 unsigned int nr_events = 0;
1891
1892                 io_iopoll_getevents(ctx, &nr_events, 1);
1893
1894                 /*
1895                  * Ensure we allow local-to-the-cpu processing to take place,
1896                  * in this case we need to ensure that we reap all events.
1897                  */
1898                 cond_resched();
1899         }
1900         mutex_unlock(&ctx->uring_lock);
1901 }
1902
1903 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
1904                            long min)
1905 {
1906         int iters = 0, ret = 0;
1907
1908         /*
1909          * We disallow the app entering submit/complete with polling, but we
1910          * still need to lock the ring to prevent racing with polled issue
1911          * that got punted to a workqueue.
1912          */
1913         mutex_lock(&ctx->uring_lock);
1914         do {
1915                 int tmin = 0;
1916
1917                 /*
1918                  * Don't enter poll loop if we already have events pending.
1919                  * If we do, we can potentially be spinning for commands that
1920                  * already triggered a CQE (eg in error).
1921                  */
1922                 if (io_cqring_events(ctx, false))
1923                         break;
1924
1925                 /*
1926                  * If a submit got punted to a workqueue, we can have the
1927                  * application entering polling for a command before it gets
1928                  * issued. That app will hold the uring_lock for the duration
1929                  * of the poll right here, so we need to take a breather every
1930                  * now and then to ensure that the issue has a chance to add
1931                  * the poll to the issued list. Otherwise we can spin here
1932                  * forever, while the workqueue is stuck trying to acquire the
1933                  * very same mutex.
1934                  */
1935                 if (!(++iters & 7)) {
1936                         mutex_unlock(&ctx->uring_lock);
1937                         mutex_lock(&ctx->uring_lock);
1938                 }
1939
1940                 if (*nr_events < min)
1941                         tmin = min - *nr_events;
1942
1943                 ret = io_iopoll_getevents(ctx, nr_events, tmin);
1944                 if (ret <= 0)
1945                         break;
1946                 ret = 0;
1947         } while (min && !*nr_events && !need_resched());
1948
1949         mutex_unlock(&ctx->uring_lock);
1950         return ret;
1951 }
1952
1953 static void kiocb_end_write(struct io_kiocb *req)
1954 {
1955         /*
1956          * Tell lockdep we inherited freeze protection from submission
1957          * thread.
1958          */
1959         if (req->flags & REQ_F_ISREG) {
1960                 struct inode *inode = file_inode(req->file);
1961
1962                 __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
1963         }
1964         file_end_write(req->file);
1965 }
1966
1967 static inline void req_set_fail_links(struct io_kiocb *req)
1968 {
1969         if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1970                 req->flags |= REQ_F_FAIL_LINK;
1971 }
1972
1973 static void io_complete_rw_common(struct kiocb *kiocb, long res)
1974 {
1975         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1976         int cflags = 0;
1977
1978         if (kiocb->ki_flags & IOCB_WRITE)
1979                 kiocb_end_write(req);
1980
1981         if (res != req->result)
1982                 req_set_fail_links(req);
1983         if (req->flags & REQ_F_BUFFER_SELECTED)
1984                 cflags = io_put_kbuf(req);
1985         __io_cqring_add_event(req, res, cflags);
1986 }
1987
1988 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
1989 {
1990         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1991
1992         io_complete_rw_common(kiocb, res);
1993         io_put_req(req);
1994 }
1995
1996 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
1997 {
1998         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
1999
2000         if (kiocb->ki_flags & IOCB_WRITE)
2001                 kiocb_end_write(req);
2002
2003         if (res != -EAGAIN && res != req->result)
2004                 req_set_fail_links(req);
2005
2006         WRITE_ONCE(req->result, res);
2007         /* order with io_poll_complete() checking ->result */
2008         smp_wmb();
2009         WRITE_ONCE(req->iopoll_completed, 1);
2010 }
2011
2012 /*
2013  * After the iocb has been issued, it's safe to be found on the poll list.
2014  * Adding the kiocb to the list AFTER submission ensures that we don't
2015  * find it from a io_iopoll_getevents() thread before the issuer is done
2016  * accessing the kiocb cookie.
2017  */
2018 static void io_iopoll_req_issued(struct io_kiocb *req)
2019 {
2020         struct io_ring_ctx *ctx = req->ctx;
2021
2022         /*
2023          * Track whether we have multiple files in our lists. This will impact
2024          * how we do polling eventually, not spinning if we're on potentially
2025          * different devices.
2026          */
2027         if (list_empty(&ctx->poll_list)) {
2028                 ctx->poll_multi_file = false;
2029         } else if (!ctx->poll_multi_file) {
2030                 struct io_kiocb *list_req;
2031
2032                 list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
2033                                                 list);
2034                 if (list_req->file != req->file)
2035                         ctx->poll_multi_file = true;
2036         }
2037
2038         /*
2039          * For fast devices, IO may have already completed. If it has, add
2040          * it to the front so we find it first.
2041          */
2042         if (READ_ONCE(req->iopoll_completed))
2043                 list_add(&req->list, &ctx->poll_list);
2044         else
2045                 list_add_tail(&req->list, &ctx->poll_list);
2046
2047         if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2048             wq_has_sleeper(&ctx->sqo_wait))
2049                 wake_up(&ctx->sqo_wait);
2050 }
2051
2052 static void __io_state_file_put(struct io_submit_state *state)
2053 {
2054         int diff = state->has_refs - state->used_refs;
2055
2056         if (diff)
2057                 fput_many(state->file, diff);
2058         state->file = NULL;
2059 }
2060
2061 static inline void io_state_file_put(struct io_submit_state *state)
2062 {
2063         if (state->file)
2064                 __io_state_file_put(state);
2065 }
2066
2067 /*
2068  * Get as many references to a file as we have IOs left in this submission,
2069  * assuming most submissions are for one file, or at least that each file
2070  * has more than one submission.
2071  */
2072 static struct file *__io_file_get(struct io_submit_state *state, int fd)
2073 {
2074         if (!state)
2075                 return fget(fd);
2076
2077         if (state->file) {
2078                 if (state->fd == fd) {
2079                         state->used_refs++;
2080                         state->ios_left--;
2081                         return state->file;
2082                 }
2083                 __io_state_file_put(state);
2084         }
2085         state->file = fget_many(fd, state->ios_left);
2086         if (!state->file)
2087                 return NULL;
2088
2089         state->fd = fd;
2090         state->has_refs = state->ios_left;
2091         state->used_refs = 1;
2092         state->ios_left--;
2093         return state->file;
2094 }
2095
2096 /*
2097  * If we tracked the file through the SCM inflight mechanism, we could support
2098  * any file. For now, just ensure that anything potentially problematic is done
2099  * inline.
2100  */
2101 static bool io_file_supports_async(struct file *file, int rw)
2102 {
2103         umode_t mode = file_inode(file)->i_mode;
2104
2105         if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
2106                 return true;
2107         if (S_ISREG(mode) && file->f_op != &io_uring_fops)
2108                 return true;
2109
2110         /* any ->read/write should understand O_NONBLOCK */
2111         if (file->f_flags & O_NONBLOCK)
2112                 return true;
2113
2114         if (!(file->f_mode & FMODE_NOWAIT))
2115                 return false;
2116
2117         if (rw == READ)
2118                 return file->f_op->read_iter != NULL;
2119
2120         return file->f_op->write_iter != NULL;
2121 }
2122
2123 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2124                       bool force_nonblock)
2125 {
2126         struct io_ring_ctx *ctx = req->ctx;
2127         struct kiocb *kiocb = &req->rw.kiocb;
2128         unsigned ioprio;
2129         int ret;
2130
2131         if (S_ISREG(file_inode(req->file)->i_mode))
2132                 req->flags |= REQ_F_ISREG;
2133
2134         kiocb->ki_pos = READ_ONCE(sqe->off);
2135         if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
2136                 req->flags |= REQ_F_CUR_POS;
2137                 kiocb->ki_pos = req->file->f_pos;
2138         }
2139         kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2140         kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2141         ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2142         if (unlikely(ret))
2143                 return ret;
2144
2145         ioprio = READ_ONCE(sqe->ioprio);
2146         if (ioprio) {
2147                 ret = ioprio_check_cap(ioprio);
2148                 if (ret)
2149                         return ret;
2150
2151                 kiocb->ki_ioprio = ioprio;
2152         } else
2153                 kiocb->ki_ioprio = get_current_ioprio();
2154
2155         /* don't allow async punt if RWF_NOWAIT was requested */
2156         if (kiocb->ki_flags & IOCB_NOWAIT)
2157                 req->flags |= REQ_F_NOWAIT;
2158
2159         if (force_nonblock)
2160                 kiocb->ki_flags |= IOCB_NOWAIT;
2161
2162         if (ctx->flags & IORING_SETUP_IOPOLL) {
2163                 if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2164                     !kiocb->ki_filp->f_op->iopoll)
2165                         return -EOPNOTSUPP;
2166
2167                 kiocb->ki_flags |= IOCB_HIPRI;
2168                 kiocb->ki_complete = io_complete_rw_iopoll;
2169                 req->result = 0;
2170                 req->iopoll_completed = 0;
2171         } else {
2172                 if (kiocb->ki_flags & IOCB_HIPRI)
2173                         return -EINVAL;
2174                 kiocb->ki_complete = io_complete_rw;
2175         }
2176
2177         req->rw.addr = READ_ONCE(sqe->addr);
2178         req->rw.len = READ_ONCE(sqe->len);
2179         req->buf_index = READ_ONCE(sqe->buf_index);
2180         return 0;
2181 }
2182
2183 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2184 {
2185         switch (ret) {
2186         case -EIOCBQUEUED:
2187                 break;
2188         case -ERESTARTSYS:
2189         case -ERESTARTNOINTR:
2190         case -ERESTARTNOHAND:
2191         case -ERESTART_RESTARTBLOCK:
2192                 /*
2193                  * We can't just restart the syscall, since previously
2194                  * submitted sqes may already be in progress. Just fail this
2195                  * IO with EINTR.
2196                  */
2197                 ret = -EINTR;
2198                 /* fall through */
2199         default:
2200                 kiocb->ki_complete(kiocb, ret, 0);
2201         }
2202 }
2203
2204 static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
2205 {
2206         struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2207
2208         if (req->flags & REQ_F_CUR_POS)
2209                 req->file->f_pos = kiocb->ki_pos;
2210         if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2211                 io_complete_rw(kiocb, ret, 0);
2212         else
2213                 io_rw_done(kiocb, ret);
2214 }
2215
2216 static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
2217                                struct iov_iter *iter)
2218 {
2219         struct io_ring_ctx *ctx = req->ctx;
2220         size_t len = req->rw.len;
2221         struct io_mapped_ubuf *imu;
2222         u16 index, buf_index;
2223         size_t offset;
2224         u64 buf_addr;
2225
2226         /* attempt to use fixed buffers without having provided iovecs */
2227         if (unlikely(!ctx->user_bufs))
2228                 return -EFAULT;
2229
2230         buf_index = req->buf_index;
2231         if (unlikely(buf_index >= ctx->nr_user_bufs))
2232                 return -EFAULT;
2233
2234         index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2235         imu = &ctx->user_bufs[index];
2236         buf_addr = req->rw.addr;
2237
2238         /* overflow */
2239         if (buf_addr + len < buf_addr)
2240                 return -EFAULT;
2241         /* not inside the mapped region */
2242         if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2243                 return -EFAULT;
2244
2245         /*
2246          * May not be a start of buffer, set size appropriately
2247          * and advance us to the beginning.
2248          */
2249         offset = buf_addr - imu->ubuf;
2250         iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2251
2252         if (offset) {
2253                 /*
2254                  * Don't use iov_iter_advance() here, as it's really slow for
2255                  * using the latter parts of a big fixed buffer - it iterates
2256                  * over each segment manually. We can cheat a bit here, because
2257                  * we know that:
2258                  *
2259                  * 1) it's a BVEC iter, we set it up
2260                  * 2) all bvecs are PAGE_SIZE in size, except potentially the
2261                  *    first and last bvec
2262                  *
2263                  * So just find our index, and adjust the iterator afterwards.
2264                  * If the offset is within the first bvec (or the whole first
2265                  * bvec, just use iov_iter_advance(). This makes it easier
2266                  * since we can just skip the first segment, which may not
2267                  * be PAGE_SIZE aligned.
2268                  */
2269                 const struct bio_vec *bvec = imu->bvec;
2270
2271                 if (offset <= bvec->bv_len) {
2272                         iov_iter_advance(iter, offset);
2273                 } else {
2274                         unsigned long seg_skip;
2275
2276                         /* skip first vec */
2277                         offset -= bvec->bv_len;
2278                         seg_skip = 1 + (offset >> PAGE_SHIFT);
2279
2280                         iter->bvec = bvec + seg_skip;
2281                         iter->nr_segs -= seg_skip;
2282                         iter->count -= bvec->bv_len + offset;
2283                         iter->iov_offset = offset & ~PAGE_MASK;
2284                 }
2285         }
2286
2287         return len;
2288 }
2289
2290 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2291 {
2292         if (needs_lock)
2293                 mutex_unlock(&ctx->uring_lock);
2294 }
2295
2296 static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2297 {
2298         /*
2299          * "Normal" inline submissions always hold the uring_lock, since we
2300          * grab it from the system call. Same is true for the SQPOLL offload.
2301          * The only exception is when we've detached the request and issue it
2302          * from an async worker thread, grab the lock for that case.
2303          */
2304         if (needs_lock)
2305                 mutex_lock(&ctx->uring_lock);
2306 }
2307
2308 static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2309                                           int bgid, struct io_buffer *kbuf,
2310                                           bool needs_lock)
2311 {
2312         struct io_buffer *head;
2313
2314         if (req->flags & REQ_F_BUFFER_SELECTED)
2315                 return kbuf;
2316
2317         io_ring_submit_lock(req->ctx, needs_lock);
2318
2319         lockdep_assert_held(&req->ctx->uring_lock);
2320
2321         head = idr_find(&req->ctx->io_buffer_idr, bgid);
2322         if (head) {
2323                 if (!list_empty(&head->list)) {
2324                         kbuf = list_last_entry(&head->list, struct io_buffer,
2325                                                         list);
2326                         list_del(&kbuf->list);
2327                 } else {
2328                         kbuf = head;
2329                         idr_remove(&req->ctx->io_buffer_idr, bgid);
2330                 }
2331                 if (*len > kbuf->len)
2332                         *len = kbuf->len;
2333         } else {
2334                 kbuf = ERR_PTR(-ENOBUFS);
2335         }
2336
2337         io_ring_submit_unlock(req->ctx, needs_lock);
2338
2339         return kbuf;
2340 }
2341
2342 static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2343                                         bool needs_lock)
2344 {
2345         struct io_buffer *kbuf;
2346         u16 bgid;
2347
2348         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2349         bgid = req->buf_index;
2350         kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2351         if (IS_ERR(kbuf))
2352                 return kbuf;
2353         req->rw.addr = (u64) (unsigned long) kbuf;
2354         req->flags |= REQ_F_BUFFER_SELECTED;
2355         return u64_to_user_ptr(kbuf->addr);
2356 }
2357
#ifdef CONFIG_COMPAT
/*
 * Compat flavour of the single-iovec buffer select: read iov_len from the
 * compat iovec at req->rw.addr, select a buffer for at most that many bytes
 * and fill in iov[0] with it.
 *
 * Returns 0 on success, -EFAULT if the user iovec cannot be read, -EINVAL
 * for a negative length, or the error from io_rw_buffer_select().
 */
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
                                bool needs_lock)
{
        struct compat_iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
        compat_ssize_t clen;
        void __user *buf;
        ssize_t len;

        if (!access_ok(uiov, sizeof(*uiov)))
                return -EFAULT;
        if (__get_user(clen, &uiov->iov_len))
                return -EFAULT;
        if (clen < 0)
                return -EINVAL;

        len = clen;
        buf = io_rw_buffer_select(req, &len, needs_lock);
        if (IS_ERR(buf))
                return PTR_ERR(buf);

        iov[0].iov_len = (compat_size_t) len;
        iov[0].iov_base = buf;
        return 0;
}
#endif
2384
2385 static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2386                                       bool needs_lock)
2387 {
2388         struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
2389         void __user *buf;
2390         ssize_t len;
2391
2392         if (copy_from_user(iov, uiov, sizeof(*uiov)))
2393                 return -EFAULT;
2394
2395         len = iov[0].iov_len;
2396         if (len < 0)
2397                 return -EINVAL;
2398         buf = io_rw_buffer_select(req, &len, needs_lock);
2399         if (IS_ERR(buf))
2400                 return PTR_ERR(buf);
2401         iov[0].iov_base = buf;
2402         iov[0].iov_len = len;
2403         return 0;
2404 }
2405
2406 static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
2407                                     bool needs_lock)
2408 {
2409         if (req->flags & REQ_F_BUFFER_SELECTED) {
2410                 struct io_buffer *kbuf;
2411
2412                 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2413                 iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
2414                 iov[0].iov_len = kbuf->len;
2415                 return 0;
2416         }
2417         if (!req->rw.len)
2418                 return 0;
2419         else if (req->rw.len > 1)
2420                 return -EINVAL;
2421
2422 #ifdef CONFIG_COMPAT
2423         if (req->ctx->compat)
2424                 return io_compat_import(req, iov, needs_lock);
2425 #endif
2426
2427         return __io_iov_buffer_select(req, iov, needs_lock);
2428 }
2429
2430 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
2431                                struct iovec **iovec, struct iov_iter *iter,
2432                                bool needs_lock)
2433 {
2434         void __user *buf = u64_to_user_ptr(req->rw.addr);
2435         size_t sqe_len = req->rw.len;
2436         ssize_t ret;
2437         u8 opcode;
2438
2439         opcode = req->opcode;
2440         if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
2441                 *iovec = NULL;
2442                 return io_import_fixed(req, rw, iter);
2443         }
2444
2445         /* buffer index only valid with fixed read/write, or buffer select  */
2446         if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
2447                 return -EINVAL;
2448
2449         if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
2450                 if (req->flags & REQ_F_BUFFER_SELECT) {
2451                         buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
2452                         if (IS_ERR(buf)) {
2453                                 *iovec = NULL;
2454                                 return PTR_ERR(buf);
2455                         }
2456                         req->rw.len = sqe_len;
2457                 }
2458
2459                 ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
2460                 *iovec = NULL;
2461                 return ret < 0 ? ret : sqe_len;
2462         }
2463
2464         if (req->io) {
2465                 struct io_async_rw *iorw = &req->io->rw;
2466
2467                 *iovec = iorw->iov;
2468                 iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
2469                 if (iorw->iov == iorw->fast_iov)
2470                         *iovec = NULL;
2471                 return iorw->size;
2472         }
2473
2474         if (req->flags & REQ_F_BUFFER_SELECT) {
2475                 ret = io_iov_buffer_select(req, *iovec, needs_lock);
2476                 if (!ret) {
2477                         ret = (*iovec)->iov_len;
2478                         iov_iter_init(iter, rw, *iovec, 1, ret);
2479                 }
2480                 *iovec = NULL;
2481                 return ret;
2482         }
2483
2484 #ifdef CONFIG_COMPAT
2485         if (req->ctx->compat)
2486                 return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
2487                                                 iovec, iter);
2488 #endif
2489
2490         return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
2491 }
2492
2493 /*
2494  * For files that don't have ->read_iter() and ->write_iter(), handle them
2495  * by looping over ->read() or ->write() manually.
2496  */
2497 static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
2498                            struct iov_iter *iter)
2499 {
2500         ssize_t ret = 0;
2501
2502         /*
2503          * Don't support polled IO through this interface, and we can't
2504          * support non-blocking either. For the latter, this just causes
2505          * the kiocb to be handled from an async context.
2506          */
2507         if (kiocb->ki_flags & IOCB_HIPRI)
2508                 return -EOPNOTSUPP;
2509         if (kiocb->ki_flags & IOCB_NOWAIT)
2510                 return -EAGAIN;
2511
2512         while (iov_iter_count(iter)) {
2513                 struct iovec iovec;
2514                 ssize_t nr;
2515
2516                 if (!iov_iter_is_bvec(iter)) {
2517                         iovec = iov_iter_iovec(iter);
2518                 } else {
2519                         /* fixed buffers import bvec */
2520                         iovec.iov_base = kmap(iter->bvec->bv_page)
2521                                                 + iter->iov_offset;
2522                         iovec.iov_len = min(iter->count,
2523                                         iter->bvec->bv_len - iter->iov_offset);
2524                 }
2525
2526                 if (rw == READ) {
2527                         nr = file->f_op->read(file, iovec.iov_base,
2528                                               iovec.iov_len, &kiocb->ki_pos);
2529                 } else {
2530                         nr = file->f_op->write(file, iovec.iov_base,
2531                                                iovec.iov_len, &kiocb->ki_pos);
2532                 }
2533
2534                 if (iov_iter_is_bvec(iter))
2535                         kunmap(iter->bvec->bv_page);
2536
2537                 if (nr < 0) {
2538                         if (!ret)
2539                                 ret = nr;
2540                         break;
2541                 }
2542                 ret += nr;
2543                 if (nr != iovec.iov_len)
2544                         break;
2545                 iov_iter_advance(iter, nr);
2546         }
2547
2548         return ret;
2549 }
2550
2551 static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
2552                           struct iovec *iovec, struct iovec *fast_iov,
2553                           struct iov_iter *iter)
2554 {
2555         req->io->rw.nr_segs = iter->nr_segs;
2556         req->io->rw.size = io_size;
2557         req->io->rw.iov = iovec;
2558         if (!req->io->rw.iov) {
2559                 req->io->rw.iov = req->io->rw.fast_iov;
2560                 if (req->io->rw.iov != fast_iov)
2561                         memcpy(req->io->rw.iov, fast_iov,
2562                                sizeof(struct iovec) * iter->nr_segs);
2563         } else {
2564                 req->flags |= REQ_F_NEED_CLEANUP;
2565         }
2566 }
2567
2568 static inline int __io_alloc_async_ctx(struct io_kiocb *req)
2569 {
2570         req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
2571         return req->io == NULL;
2572 }
2573
2574 static int io_alloc_async_ctx(struct io_kiocb *req)
2575 {
2576         if (!io_op_defs[req->opcode].async_ctx)
2577                 return 0;
2578
2579         return  __io_alloc_async_ctx(req);
2580 }
2581
2582 static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
2583                              struct iovec *iovec, struct iovec *fast_iov,
2584                              struct iov_iter *iter)
2585 {
2586         if (!io_op_defs[req->opcode].async_ctx)
2587                 return 0;
2588         if (!req->io) {
2589                 if (__io_alloc_async_ctx(req))
2590                         return -ENOMEM;
2591
2592                 io_req_map_rw(req, io_size, iovec, fast_iov, iter);
2593         }
2594         return 0;
2595 }
2596
2597 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2598                         bool force_nonblock)
2599 {
2600         struct io_async_ctx *io;
2601         struct iov_iter iter;
2602         ssize_t ret;
2603
2604         ret = io_prep_rw(req, sqe, force_nonblock);
2605         if (ret)
2606                 return ret;
2607
2608         if (unlikely(!(req->file->f_mode & FMODE_READ)))
2609                 return -EBADF;
2610
2611         /* either don't need iovec imported or already have it */
2612         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2613                 return 0;
2614
2615         io = req->io;
2616         io->rw.iov = io->rw.fast_iov;
2617         req->io = NULL;
2618         ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
2619         req->io = io;
2620         if (ret < 0)
2621                 return ret;
2622
2623         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2624         return 0;
2625 }
2626
/*
 * Issue a read for @req.
 *
 * Imports the iovec, then either performs the read (via ->read_iter() or the
 * loop_rw_iter() fallback) and completes it through kiocb_done(), or, for a
 * forced non-blocking attempt that cannot proceed, saves the iovec state via
 * io_setup_async_rw() and returns -EAGAIN so the request can be retried.
 *
 * Returns 0 once the read has been handed to kiocb_done(), -EAGAIN when a
 * retry is required, or a negative error from import/verification/setup.
 */
static int io_read(struct io_kiocb *req, bool force_nonblock)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct iov_iter iter;
	size_t iov_count;
	ssize_t io_size, ret;

	ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
	if (ret < 0)
		return ret;

	/* Ensure we clear previously set non-block flag */
	if (!force_nonblock)
		kiocb->ki_flags &= ~IOCB_NOWAIT;

	/* io_size keeps the imported length; link heads also record it */
	req->result = 0;
	io_size = ret;
	if (req->flags & REQ_F_LINK_HEAD)
		req->result = io_size;

	/*
	 * If the file doesn't support async, skip the inline attempt and go
	 * straight to saving state for a retry; REQ_F_MUST_PUNT may be set
	 * below so we know to async punt it even if it was opened O_NONBLOCK
	 */
	if (force_nonblock && !io_file_supports_async(req->file, READ))
		goto copy_iov;

	iov_count = iov_iter_count(&iter);
	ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
	if (!ret) {
		ssize_t ret2;

		/* prefer ->read_iter(); otherwise loop over ->read() */
		if (req->file->f_op->read_iter)
			ret2 = call_read_iter(req->file, kiocb, &iter);
		else
			ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);

		/* Catch -EAGAIN return for forced non-blocking submission */
		if (!force_nonblock || ret2 != -EAGAIN) {
			kiocb_done(kiocb, ret2);
		} else {
			/* also entered directly from the goto above */
copy_iov:
			ret = io_setup_async_rw(req, io_size, iovec,
						inline_vecs, &iter);
			if (ret)
				goto out_free;
			/* any defer here is final, must blocking retry */
			if (!(req->flags & REQ_F_NOWAIT) &&
			    !file_can_poll(req->file))
				req->flags |= REQ_F_MUST_PUNT;
			return -EAGAIN;
		}
	}
out_free:
	/* with REQ_F_NEED_CLEANUP set the iovec is not freed here */
	if (!(req->flags & REQ_F_NEED_CLEANUP))
		kfree(iovec);
	return ret;
}
2686
2687 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2688                          bool force_nonblock)
2689 {
2690         struct io_async_ctx *io;
2691         struct iov_iter iter;
2692         ssize_t ret;
2693
2694         ret = io_prep_rw(req, sqe, force_nonblock);
2695         if (ret)
2696                 return ret;
2697
2698         if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
2699                 return -EBADF;
2700
2701         req->fsize = rlimit(RLIMIT_FSIZE);
2702
2703         /* either don't need iovec imported or already have it */
2704         if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
2705                 return 0;
2706
2707         io = req->io;
2708         io->rw.iov = io->rw.fast_iov;
2709         req->io = NULL;
2710         ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
2711         req->io = io;
2712         if (ret < 0)
2713                 return ret;
2714
2715         io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
2716         return 0;
2717 }
2718
/*
 * Issue a write for @req.
 *
 * Imports the iovec, then either performs the write (via ->write_iter() or
 * the loop_rw_iter() fallback) and completes it through kiocb_done(), or,
 * for a forced non-blocking attempt that cannot proceed, saves the iovec
 * state via io_setup_async_rw() and returns -EAGAIN so the request can be
 * retried.
 *
 * Returns 0 once the write has been handed to kiocb_done(), -EAGAIN when a
 * retry is required, or a negative error from import/verification/setup.
 */
static int io_write(struct io_kiocb *req, bool force_nonblock)
{
	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct iov_iter iter;
	size_t iov_count;
	ssize_t ret, io_size;

	ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
	if (ret < 0)
		return ret;

	/* Ensure we clear previously set non-block flag */
	if (!force_nonblock)
		req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;

	/* io_size keeps the imported length; link heads also record it */
	req->result = 0;
	io_size = ret;
	if (req->flags & REQ_F_LINK_HEAD)
		req->result = io_size;

	/*
	 * If the file doesn't support async, skip the inline attempt and go
	 * straight to saving state for a retry; REQ_F_MUST_PUNT may be set
	 * below so we know to async punt it even if it was opened O_NONBLOCK
	 */
	if (force_nonblock && !io_file_supports_async(req->file, WRITE))
		goto copy_iov;

	/* file path doesn't support NOWAIT for non-direct_IO */
	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
	    (req->flags & REQ_F_ISREG))
		goto copy_iov;

	iov_count = iov_iter_count(&iter);
	ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
	if (!ret) {
		ssize_t ret2;

		/*
		 * Open-code file_start_write here to grab freeze protection,
		 * which will be released by another thread in
		 * io_complete_rw().  Fool lockdep by telling it the lock got
		 * released so that it doesn't complain about the held lock when
		 * we return to userspace.
		 */
		if (req->flags & REQ_F_ISREG) {
			__sb_start_write(file_inode(req->file)->i_sb,
						SB_FREEZE_WRITE, true);
			__sb_writers_release(file_inode(req->file)->i_sb,
						SB_FREEZE_WRITE);
		}
		kiocb->ki_flags |= IOCB_WRITE;

		/*
		 * On the blocking path, apply the file size limit saved in
		 * req->fsize for the duration of the write; it is set back to
		 * RLIM_INFINITY right after the write below.
		 */
		if (!force_nonblock)
			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;

		/* prefer ->write_iter(); otherwise loop over ->write() */
		if (req->file->f_op->write_iter)
			ret2 = call_write_iter(req->file, kiocb, &iter);
		else
			ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);

		if (!force_nonblock)
			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;

		/*
		 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
		 * retry them without IOCB_NOWAIT.
		 */
		if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
			ret2 = -EAGAIN;
		/* Catch -EAGAIN return for forced non-blocking submission */
		if (!force_nonblock || ret2 != -EAGAIN) {
			kiocb_done(kiocb, ret2);
		} else {
			/* also entered directly from the gotos above */
copy_iov:
			ret = io_setup_async_rw(req, io_size, iovec,
						inline_vecs, &iter);
			if (ret)
				goto out_free;
			/* any defer here is final, must blocking retry */
			if (!(req->flags & REQ_F_NOWAIT) &&
			    !file_can_poll(req->file))
				req->flags |= REQ_F_MUST_PUNT;
			return -EAGAIN;
		}
	}
out_free:
	/* with REQ_F_NEED_CLEANUP set the iovec is not freed here */
	if (!(req->flags & REQ_F_NEED_CLEANUP))
		kfree(iovec);
	return ret;
}
2809
2810 static int __io_splice_prep(struct io_kiocb *req,
2811                             const struct io_uring_sqe *sqe)
2812 {
2813         struct io_splice* sp = &req->splice;
2814         unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
2815         int ret;
2816
2817         if (req->flags & REQ_F_NEED_CLEANUP)
2818                 return 0;
2819         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2820                 return -EINVAL;
2821
2822         sp->file_in = NULL;
2823         sp->len = READ_ONCE(sqe->len);
2824         sp->flags = READ_ONCE(sqe->splice_flags);
2825
2826         if (unlikely(sp->flags & ~valid_flags))
2827                 return -EINVAL;
2828
2829         ret = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), &sp->file_in,
2830                           (sp->flags & SPLICE_F_FD_IN_FIXED));
2831         if (ret)
2832                 return ret;
2833         req->flags |= REQ_F_NEED_CLEANUP;
2834
2835         if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
2836                 /*
2837                  * Splice operation will be punted aync, and here need to
2838                  * modify io_wq_work.flags, so initialize io_wq_work firstly.
2839                  */
2840                 io_req_init_async(req);
2841                 req->work.flags |= IO_WQ_WORK_UNBOUND;
2842         }
2843
2844         return 0;
2845 }
2846
2847 static int io_tee_prep(struct io_kiocb *req,
2848                        const struct io_uring_sqe *sqe)
2849 {
2850         if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
2851                 return -EINVAL;
2852         return __io_splice_prep(req, sqe);
2853 }
2854
2855 static int io_tee(struct io_kiocb *req, bool force_nonblock)
2856 {
2857         struct io_splice *sp = &req->splice;
2858         struct file *in = sp->file_in;
2859         struct file *out = sp->file_out;
2860         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
2861         long ret = 0;
2862
2863         if (force_nonblock)
2864                 return -EAGAIN;
2865         if (sp->len)
2866                 ret = do_tee(in, out, sp->len, flags);
2867
2868         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
2869         req->flags &= ~REQ_F_NEED_CLEANUP;
2870
2871         io_cqring_add_event(req, ret);
2872         if (ret != sp->len)
2873                 req_set_fail_links(req);
2874         io_put_req(req);
2875         return 0;
2876 }
2877
2878 static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2879 {
2880         struct io_splice* sp = &req->splice;
2881
2882         sp->off_in = READ_ONCE(sqe->splice_off_in);
2883         sp->off_out = READ_ONCE(sqe->off);
2884         return __io_splice_prep(req, sqe);
2885 }
2886
2887 static int io_splice(struct io_kiocb *req, bool force_nonblock)
2888 {
2889         struct io_splice *sp = &req->splice;
2890         struct file *in = sp->file_in;
2891         struct file *out = sp->file_out;
2892         unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
2893         loff_t *poff_in, *poff_out;
2894         long ret = 0;
2895
2896         if (force_nonblock)
2897                 return -EAGAIN;
2898
2899         poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
2900         poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
2901
2902         if (sp->len)
2903                 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
2904
2905         io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
2906         req->flags &= ~REQ_F_NEED_CLEANUP;
2907
2908         io_cqring_add_event(req, ret);
2909         if (ret != sp->len)
2910                 req_set_fail_links(req);
2911         io_put_req(req);
2912         return 0;
2913 }
2914
2915 /*
2916  * IORING_OP_NOP just posts a completion event, nothing else.
2917  */
2918 static int io_nop(struct io_kiocb *req)
2919 {
2920         struct io_ring_ctx *ctx = req->ctx;
2921
2922         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2923                 return -EINVAL;
2924
2925         io_cqring_add_event(req, 0);
2926         io_put_req(req);
2927         return 0;
2928 }
2929
2930 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2931 {
2932         struct io_ring_ctx *ctx = req->ctx;
2933
2934         if (!req->file)
2935                 return -EBADF;
2936
2937         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
2938                 return -EINVAL;
2939         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
2940                 return -EINVAL;
2941
2942         req->sync.flags = READ_ONCE(sqe->fsync_flags);
2943         if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
2944                 return -EINVAL;
2945
2946         req->sync.off = READ_ONCE(sqe->off);
2947         req->sync.len = READ_ONCE(sqe->len);
2948         return 0;
2949 }
2950
2951 static int io_fsync(struct io_kiocb *req, bool force_nonblock)
2952 {
2953         loff_t end = req->sync.off + req->sync.len;
2954         int ret;
2955
2956         /* fsync always requires a blocking context */
2957         if (force_nonblock)
2958                 return -EAGAIN;
2959
2960         ret = vfs_fsync_range(req->file, req->sync.off,
2961                                 end > 0 ? end : LLONG_MAX,
2962                                 req->sync.flags & IORING_FSYNC_DATASYNC);
2963         if (ret < 0)
2964                 req_set_fail_links(req);
2965         io_cqring_add_event(req, ret);
2966         io_put_req(req);
2967         return 0;
2968 }
2969
2970 static int io_fallocate_prep(struct io_kiocb *req,
2971                              const struct io_uring_sqe *sqe)
2972 {
2973         if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
2974                 return -EINVAL;
2975         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
2976                 return -EINVAL;
2977
2978         req->sync.off = READ_ONCE(sqe->off);
2979         req->sync.len = READ_ONCE(sqe->addr);
2980         req->sync.mode = READ_ONCE(sqe->len);
2981         req->fsize = rlimit(RLIMIT_FSIZE);
2982         return 0;
2983 }
2984
2985 static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
2986 {
2987         int ret;
2988
2989         /* fallocate always requiring blocking context */
2990         if (force_nonblock)
2991                 return -EAGAIN;
2992
2993         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
2994         ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
2995                                 req->sync.len);
2996         current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
2997         if (ret < 0)
2998                 req_set_fail_links(req);
2999         io_cqring_add_event(req, ret);
3000         io_put_req(req);
3001         return 0;
3002 }
3003
3004 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3005 {
3006         const char __user *fname;
3007         int ret;
3008
3009         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3010                 return -EINVAL;
3011         if (unlikely(sqe->ioprio || sqe->buf_index))
3012                 return -EINVAL;
3013         if (unlikely(req->flags & REQ_F_FIXED_FILE))
3014                 return -EBADF;
3015
3016         /* open.how should be already initialised */
3017         if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3018                 req->open.how.flags |= O_LARGEFILE;
3019
3020         req->open.dfd = READ_ONCE(sqe->fd);
3021         fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3022         req->open.filename = getname(fname);
3023         if (IS_ERR(req->open.filename)) {
3024                 ret = PTR_ERR(req->open.filename);
3025                 req->open.filename = NULL;
3026                 return ret;
3027         }
3028         req->open.nofile = rlimit(RLIMIT_NOFILE);
3029         req->flags |= REQ_F_NEED_CLEANUP;
3030         return 0;
3031 }
3032
3033 static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3034 {
3035         u64 flags, mode;
3036
3037         if (req->flags & REQ_F_NEED_CLEANUP)
3038                 return 0;
3039         mode = READ_ONCE(sqe->len);
3040         flags = READ_ONCE(sqe->open_flags);
3041         req->open.how = build_open_how(flags, mode);
3042         return __io_openat_prep(req, sqe);
3043 }
3044
3045 static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3046 {
3047         struct open_how __user *how;
3048         size_t len;
3049         int ret;
3050
3051         if (req->flags & REQ_F_NEED_CLEANUP)
3052                 return 0;
3053         how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3054         len = READ_ONCE(sqe->len);
3055         if (len < OPEN_HOW_SIZE_VER0)
3056                 return -EINVAL;
3057
3058         ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3059                                         len);
3060         if (ret)
3061                 return ret;
3062
3063         return __io_openat_prep(req, sqe);
3064 }
3065
3066 static int io_openat2(struct io_kiocb *req, bool force_nonblock)
3067 {
3068         struct open_flags op;
3069         struct file *file;
3070         int ret;
3071
3072         if (force_nonblock)
3073                 return -EAGAIN;
3074
3075         ret = build_open_flags(&req->open.how, &op);
3076         if (ret)
3077                 goto err;
3078
3079         ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3080         if (ret < 0)
3081                 goto err;
3082
3083         file = do_filp_open(req->open.dfd, req->open.filename, &op);
3084         if (IS_ERR(file)) {
3085                 put_unused_fd(ret);
3086                 ret = PTR_ERR(file);
3087         } else {
3088                 fsnotify_open(file);
3089                 fd_install(ret, file);
3090         }
3091 err:
3092         putname(req->open.filename);
3093         req->flags &= ~REQ_F_NEED_CLEANUP;
3094         if (ret < 0)
3095                 req_set_fail_links(req);
3096         io_cqring_add_event(req, ret);
3097         io_put_req(req);
3098         return 0;
3099 }
3100
3101 static int io_openat(struct io_kiocb *req, bool force_nonblock)
3102 {
3103         return io_openat2(req, force_nonblock);
3104 }
3105
3106 static int io_remove_buffers_prep(struct io_kiocb *req,
3107                                   const struct io_uring_sqe *sqe)
3108 {
3109         struct io_provide_buf *p = &req->pbuf;
3110         u64 tmp;
3111
3112         if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3113                 return -EINVAL;
3114
3115         tmp = READ_ONCE(sqe->fd);
3116         if (!tmp || tmp > USHRT_MAX)
3117                 return -EINVAL;
3118
3119         memset(p, 0, sizeof(*p));
3120         p->nbufs = tmp;
3121         p->bgid = READ_ONCE(sqe->buf_group);
3122         return 0;
3123 }
3124
3125 static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3126                                int bgid, unsigned nbufs)
3127 {
3128         unsigned i = 0;
3129
3130         /* shouldn't happen */
3131         if (!nbufs)
3132                 return 0;
3133
3134         /* the head kbuf is the list itself */
3135         while (!list_empty(&buf->list)) {
3136                 struct io_buffer *nxt;
3137
3138                 nxt = list_first_entry(&buf->list, struct io_buffer, list);
3139                 list_del(&nxt->list);
3140                 kfree(nxt);
3141                 if (++i == nbufs)
3142                         return i;
3143         }
3144         i++;
3145         kfree(buf);
3146         idr_remove(&ctx->io_buffer_idr, bgid);
3147
3148         return i;
3149 }
3150
3151 static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
3152 {
3153         struct io_provide_buf *p = &req->pbuf;
3154         struct io_ring_ctx *ctx = req->ctx;
3155         struct io_buffer *head;
3156         int ret = 0;
3157
3158         io_ring_submit_lock(ctx, !force_nonblock);
3159
3160         lockdep_assert_held(&ctx->uring_lock);
3161
3162         ret = -ENOENT;
3163         head = idr_find(&ctx->io_buffer_idr, p->bgid);
3164         if (head)
3165                 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3166
3167         io_ring_submit_lock(ctx, !force_nonblock);
3168         if (ret < 0)
3169                 req_set_fail_links(req);
3170         io_cqring_add_event(req, ret);
3171         io_put_req(req);
3172         return 0;
3173 }
3174
3175 static int io_provide_buffers_prep(struct io_kiocb *req,
3176                                    const struct io_uring_sqe *sqe)
3177 {
3178         struct io_provide_buf *p = &req->pbuf;
3179         u64 tmp;
3180
3181         if (sqe->ioprio || sqe->rw_flags)
3182                 return -EINVAL;
3183
3184         tmp = READ_ONCE(sqe->fd);
3185         if (!tmp || tmp > USHRT_MAX)
3186                 return -E2BIG;
3187         p->nbufs = tmp;
3188         p->addr = READ_ONCE(sqe->addr);
3189         p->len = READ_ONCE(sqe->len);
3190
3191         if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3192                 return -EFAULT;
3193
3194         p->bgid = READ_ONCE(sqe->buf_group);
3195         tmp = READ_ONCE(sqe->off);
3196         if (tmp > USHRT_MAX)
3197                 return -E2BIG;
3198         p->bid = tmp;
3199         return 0;
3200 }
3201
3202 static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3203 {
3204         struct io_buffer *buf;
3205         u64 addr = pbuf->addr;
3206         int i, bid = pbuf->bid;
3207
3208         for (i = 0; i < pbuf->nbufs; i++) {
3209                 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3210                 if (!buf)
3211                         break;
3212
3213                 buf->addr = addr;
3214                 buf->len = pbuf->len;
3215                 buf->bid = bid;
3216                 addr += pbuf->len;
3217                 bid++;
3218                 if (!*head) {
3219                         INIT_LIST_HEAD(&buf->list);
3220                         *head = buf;
3221                 } else {
3222                         list_add_tail(&buf->list, &(*head)->list);
3223                 }
3224         }
3225
3226         return i ? i : -ENOMEM;
3227 }
3228
3229 static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
3230 {
3231         struct io_provide_buf *p = &req->pbuf;
3232         struct io_ring_ctx *ctx = req->ctx;
3233         struct io_buffer *head, *list;
3234         int ret = 0;
3235
3236         io_ring_submit_lock(ctx, !force_nonblock);
3237
3238         lockdep_assert_held(&ctx->uring_lock);
3239
3240         list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
3241
3242         ret = io_add_buffers(p, &head);
3243         if (ret < 0)
3244                 goto out;
3245
3246         if (!list) {
3247                 ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
3248                                         GFP_KERNEL);
3249                 if (ret < 0) {
3250                         __io_remove_buffers(ctx, head, p->bgid, -1U);
3251                         goto out;
3252                 }
3253         }
3254 out:
3255         io_ring_submit_unlock(ctx, !force_nonblock);
3256         if (ret < 0)
3257                 req_set_fail_links(req);
3258         io_cqring_add_event(req, ret);
3259         io_put_req(req);
3260         return 0;
3261 }
3262
3263 static int io_epoll_ctl_prep(struct io_kiocb *req,
3264                              const struct io_uring_sqe *sqe)
3265 {
3266 #if defined(CONFIG_EPOLL)
3267         if (sqe->ioprio || sqe->buf_index)
3268                 return -EINVAL;
3269         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3270                 return -EINVAL;
3271
3272         req->epoll.epfd = READ_ONCE(sqe->fd);
3273         req->epoll.op = READ_ONCE(sqe->len);
3274         req->epoll.fd = READ_ONCE(sqe->off);
3275
3276         if (ep_op_has_event(req->epoll.op)) {
3277                 struct epoll_event __user *ev;
3278
3279                 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
3280                 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
3281                         return -EFAULT;
3282         }
3283
3284         return 0;
3285 #else
3286         return -EOPNOTSUPP;
3287 #endif
3288 }
3289
3290 static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
3291 {
3292 #if defined(CONFIG_EPOLL)
3293         struct io_epoll *ie = &req->epoll;
3294         int ret;
3295
3296         ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
3297         if (force_nonblock && ret == -EAGAIN)
3298                 return -EAGAIN;
3299
3300         if (ret < 0)
3301                 req_set_fail_links(req);
3302         io_cqring_add_event(req, ret);
3303         io_put_req(req);
3304         return 0;
3305 #else
3306         return -EOPNOTSUPP;
3307 #endif
3308 }
3309
3310 static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3311 {
3312 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3313         if (sqe->ioprio || sqe->buf_index || sqe->off)
3314                 return -EINVAL;
3315         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3316                 return -EINVAL;
3317
3318         req->madvise.addr = READ_ONCE(sqe->addr);
3319         req->madvise.len = READ_ONCE(sqe->len);
3320         req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
3321         return 0;
3322 #else
3323         return -EOPNOTSUPP;
3324 #endif
3325 }
3326
3327 static int io_madvise(struct io_kiocb *req, bool force_nonblock)
3328 {
3329 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
3330         struct io_madvise *ma = &req->madvise;
3331         int ret;
3332
3333         if (force_nonblock)
3334                 return -EAGAIN;
3335
3336         ret = do_madvise(ma->addr, ma->len, ma->advice);
3337         if (ret < 0)
3338                 req_set_fail_links(req);
3339         io_cqring_add_event(req, ret);
3340         io_put_req(req);
3341         return 0;
3342 #else
3343         return -EOPNOTSUPP;
3344 #endif
3345 }
3346
3347 static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3348 {
3349         if (sqe->ioprio || sqe->buf_index || sqe->addr)
3350                 return -EINVAL;
3351         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3352                 return -EINVAL;
3353
3354         req->fadvise.offset = READ_ONCE(sqe->off);
3355         req->fadvise.len = READ_ONCE(sqe->len);
3356         req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
3357         return 0;
3358 }
3359
3360 static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
3361 {
3362         struct io_fadvise *fa = &req->fadvise;
3363         int ret;
3364
3365         if (force_nonblock) {
3366                 switch (fa->advice) {
3367                 case POSIX_FADV_NORMAL:
3368                 case POSIX_FADV_RANDOM:
3369                 case POSIX_FADV_SEQUENTIAL:
3370                         break;
3371                 default:
3372                         return -EAGAIN;
3373                 }
3374         }
3375
3376         ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
3377         if (ret < 0)
3378                 req_set_fail_links(req);
3379         io_cqring_add_event(req, ret);
3380         io_put_req(req);
3381         return 0;
3382 }
3383
3384 static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3385 {
3386         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3387                 return -EINVAL;
3388         if (sqe->ioprio || sqe->buf_index)
3389                 return -EINVAL;
3390         if (req->flags & REQ_F_FIXED_FILE)
3391                 return -EBADF;
3392
3393         req->statx.dfd = READ_ONCE(sqe->fd);
3394         req->statx.mask = READ_ONCE(sqe->len);
3395         req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
3396         req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3397         req->statx.flags = READ_ONCE(sqe->statx_flags);
3398
3399         return 0;
3400 }
3401
3402 static int io_statx(struct io_kiocb *req, bool force_nonblock)
3403 {
3404         struct io_statx *ctx = &req->statx;
3405         int ret;
3406
3407         if (force_nonblock) {
3408                 /* only need file table for an actual valid fd */
3409                 if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
3410                         req->flags |= REQ_F_NO_FILE_TABLE;
3411                 return -EAGAIN;
3412         }
3413
3414         ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
3415                        ctx->buffer);
3416
3417         if (ret < 0)
3418                 req_set_fail_links(req);
3419         io_cqring_add_event(req, ret);
3420         io_put_req(req);
3421         return 0;
3422 }
3423
3424 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3425 {
3426         /*
3427          * If we queue this for async, it must not be cancellable. That would
3428          * leave the 'file' in an undeterminate state, and here need to modify
3429          * io_wq_work.flags, so initialize io_wq_work firstly.
3430          */
3431         io_req_init_async(req);
3432         req->work.flags |= IO_WQ_WORK_NO_CANCEL;
3433
3434         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3435                 return -EINVAL;
3436         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
3437             sqe->rw_flags || sqe->buf_index)
3438                 return -EINVAL;
3439         if (req->flags & REQ_F_FIXED_FILE)
3440                 return -EBADF;
3441
3442         req->close.fd = READ_ONCE(sqe->fd);
3443         if ((req->file && req->file->f_op == &io_uring_fops) ||
3444             req->close.fd == req->ctx->ring_fd)
3445                 return -EBADF;
3446
3447         req->close.put_file = NULL;
3448         return 0;
3449 }
3450
3451 static int io_close(struct io_kiocb *req, bool force_nonblock)
3452 {
3453         struct io_close *close = &req->close;
3454         int ret;
3455
3456         /* might be already done during nonblock submission */
3457         if (!close->put_file) {
3458                 ret = __close_fd_get_file(close->fd, &close->put_file);
3459                 if (ret < 0)
3460                         return (ret == -ENOENT) ? -EBADF : ret;
3461         }
3462
3463         /* if the file has a flush method, be safe and punt to async */
3464         if (close->put_file->f_op->flush && force_nonblock) {
3465                 /* avoid grabbing files - we don't need the files */
3466                 req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
3467                 return -EAGAIN;
3468         }
3469
3470         /* No ->flush() or already async, safely close from here */
3471         ret = filp_close(close->put_file, req->work.files);
3472         if (ret < 0)
3473                 req_set_fail_links(req);
3474         io_cqring_add_event(req, ret);
3475         fput(close->put_file);
3476         close->put_file = NULL;
3477         io_put_req(req);
3478         return 0;
3479 }
3480
3481 static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3482 {
3483         struct io_ring_ctx *ctx = req->ctx;
3484
3485         if (!req->file)
3486                 return -EBADF;
3487
3488         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3489                 return -EINVAL;
3490         if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3491                 return -EINVAL;
3492
3493         req->sync.off = READ_ONCE(sqe->off);
3494         req->sync.len = READ_ONCE(sqe->len);
3495         req->sync.flags = READ_ONCE(sqe->sync_range_flags);
3496         return 0;
3497 }
3498
3499 static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
3500 {
3501         int ret;
3502
3503         /* sync_file_range always requires a blocking context */
3504         if (force_nonblock)
3505                 return -EAGAIN;
3506
3507         ret = sync_file_range(req->file, req->sync.off, req->sync.len,
3508                                 req->sync.flags);
3509         if (ret < 0)
3510                 req_set_fail_links(req);
3511         io_cqring_add_event(req, ret);
3512         io_put_req(req);
3513         return 0;
3514 }
3515
3516 #if defined(CONFIG_NET)
3517 static int io_setup_async_msg(struct io_kiocb *req,
3518                               struct io_async_msghdr *kmsg)
3519 {
3520         if (req->io)
3521                 return -EAGAIN;
3522         if (io_alloc_async_ctx(req)) {
3523                 if (kmsg->iov != kmsg->fast_iov)
3524                         kfree(kmsg->iov);
3525                 return -ENOMEM;
3526         }
3527         req->flags |= REQ_F_NEED_CLEANUP;
3528         memcpy(&req->io->msg, kmsg, sizeof(*kmsg));
3529         return -EAGAIN;
3530 }
3531
3532 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3533 {
3534         struct io_sr_msg *sr = &req->sr_msg;
3535         struct io_async_ctx *io = req->io;
3536         int ret;
3537
3538         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3539                 return -EINVAL;
3540
3541         sr->msg_flags = READ_ONCE(sqe->msg_flags);
3542         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3543         sr->len = READ_ONCE(sqe->len);
3544
3545 #ifdef CONFIG_COMPAT
3546         if (req->ctx->compat)
3547                 sr->msg_flags |= MSG_CMSG_COMPAT;
3548 #endif
3549
3550         if (!io || req->opcode == IORING_OP_SEND)
3551                 return 0;
3552         /* iovec is already imported */
3553         if (req->flags & REQ_F_NEED_CLEANUP)
3554                 return 0;
3555
3556         io->msg.iov = io->msg.fast_iov;
3557         ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
3558                                         &io->msg.iov);
3559         if (!ret)
3560                 req->flags |= REQ_F_NEED_CLEANUP;
3561         return ret;
3562 }
3563
3564 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
3565 {
3566         struct io_async_msghdr *kmsg = NULL;
3567         struct socket *sock;
3568         int ret;
3569
3570         sock = sock_from_file(req->file, &ret);
3571         if (sock) {
3572                 struct io_async_ctx io;
3573                 unsigned flags;
3574
3575                 if (req->io) {
3576                         kmsg = &req->io->msg;
3577                         kmsg->msg.msg_name = &req->io->msg.addr;
3578                         /* if iov is set, it's allocated already */
3579                         if (!kmsg->iov)
3580                                 kmsg->iov = kmsg->fast_iov;
3581                         kmsg->msg.msg_iter.iov = kmsg->iov;
3582                 } else {
3583                         struct io_sr_msg *sr = &req->sr_msg;
3584
3585                         kmsg = &io.msg;
3586                         kmsg->msg.msg_name = &io.msg.addr;
3587
3588                         io.msg.iov = io.msg.fast_iov;
3589                         ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
3590                                         sr->msg_flags, &io.msg.iov);
3591                         if (ret)
3592                                 return ret;
3593                 }
3594
3595                 flags = req->sr_msg.msg_flags;
3596                 if (flags & MSG_DONTWAIT)
3597                         req->flags |= REQ_F_NOWAIT;
3598                 else if (force_nonblock)
3599                         flags |= MSG_DONTWAIT;
3600
3601                 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
3602                 if (force_nonblock && ret == -EAGAIN)
3603                         return io_setup_async_msg(req, kmsg);
3604                 if (ret == -ERESTARTSYS)
3605                         ret = -EINTR;
3606         }
3607
3608         if (kmsg && kmsg->iov != kmsg->fast_iov)
3609                 kfree(kmsg->iov);
3610         req->flags &= ~REQ_F_NEED_CLEANUP;
3611         io_cqring_add_event(req, ret);
3612         if (ret < 0)
3613                 req_set_fail_links(req);
3614         io_put_req(req);
3615         return 0;
3616 }
3617
3618 static int io_send(struct io_kiocb *req, bool force_nonblock)
3619 {
3620         struct socket *sock;
3621         int ret;
3622
3623         sock = sock_from_file(req->file, &ret);
3624         if (sock) {
3625                 struct io_sr_msg *sr = &req->sr_msg;
3626                 struct msghdr msg;
3627                 struct iovec iov;
3628                 unsigned flags;
3629
3630                 ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
3631                                                 &msg.msg_iter);
3632                 if (ret)
3633                         return ret;
3634
3635                 msg.msg_name = NULL;
3636                 msg.msg_control = NULL;
3637                 msg.msg_controllen = 0;
3638                 msg.msg_namelen = 0;
3639
3640                 flags = req->sr_msg.msg_flags;
3641                 if (flags & MSG_DONTWAIT)
3642                         req->flags |= REQ_F_NOWAIT;
3643                 else if (force_nonblock)
3644                         flags |= MSG_DONTWAIT;
3645
3646                 msg.msg_flags = flags;
3647                 ret = sock_sendmsg(sock, &msg);
3648                 if (force_nonblock && ret == -EAGAIN)
3649                         return -EAGAIN;
3650                 if (ret == -ERESTARTSYS)
3651                         ret = -EINTR;
3652         }
3653
3654         io_cqring_add_event(req, ret);
3655         if (ret < 0)
3656                 req_set_fail_links(req);
3657         io_put_req(req);
3658         return 0;
3659 }
3660
3661 static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3662 {
3663         struct io_sr_msg *sr = &req->sr_msg;
3664         struct iovec __user *uiov;
3665         size_t iov_len;
3666         int ret;
3667
3668         ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
3669                                         &uiov, &iov_len);
3670         if (ret)
3671                 return ret;
3672
3673         if (req->flags & REQ_F_BUFFER_SELECT) {
3674                 if (iov_len > 1)
3675                         return -EINVAL;
3676                 if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
3677                         return -EFAULT;
3678                 sr->len = io->msg.iov[0].iov_len;
3679                 iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
3680                                 sr->len);
3681                 io->msg.iov = NULL;
3682         } else {
3683                 ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
3684                                         &io->msg.iov, &io->msg.msg.msg_iter);
3685                 if (ret > 0)
3686                         ret = 0;
3687         }
3688
3689         return ret;
3690 }
3691
3692 #ifdef CONFIG_COMPAT
3693 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
3694                                         struct io_async_ctx *io)
3695 {
3696         struct compat_msghdr __user *msg_compat;
3697         struct io_sr_msg *sr = &req->sr_msg;
3698         struct compat_iovec __user *uiov;
3699         compat_uptr_t ptr;
3700         compat_size_t len;
3701         int ret;
3702
3703         msg_compat = (struct compat_msghdr __user *) sr->msg;
3704         ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
3705                                         &ptr, &len);
3706         if (ret)
3707                 return ret;
3708
3709         uiov = compat_ptr(ptr);
3710         if (req->flags & REQ_F_BUFFER_SELECT) {
3711                 compat_ssize_t clen;
3712
3713                 if (len > 1)
3714                         return -EINVAL;
3715                 if (!access_ok(uiov, sizeof(*uiov)))
3716                         return -EFAULT;
3717                 if (__get_user(clen, &uiov->iov_len))
3718                         return -EFAULT;
3719                 if (clen < 0)
3720                         return -EINVAL;
3721                 sr->len = io->msg.iov[0].iov_len;
3722                 io->msg.iov = NULL;
3723         } else {
3724                 ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
3725                                                 &io->msg.iov,
3726                                                 &io->msg.msg.msg_iter);
3727                 if (ret < 0)
3728                         return ret;
3729         }
3730
3731         return 0;
3732 }
3733 #endif
3734
3735 static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
3736 {
3737         io->msg.iov = io->msg.fast_iov;
3738
3739 #ifdef CONFIG_COMPAT
3740         if (req->ctx->compat)
3741                 return __io_compat_recvmsg_copy_hdr(req, io);
3742 #endif
3743
3744         return __io_recvmsg_copy_hdr(req, io);
3745 }
3746
3747 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
3748                                                int *cflags, bool needs_lock)
3749 {
3750         struct io_sr_msg *sr = &req->sr_msg;
3751         struct io_buffer *kbuf;
3752
3753         if (!(req->flags & REQ_F_BUFFER_SELECT))
3754                 return NULL;
3755
3756         kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
3757         if (IS_ERR(kbuf))
3758                 return kbuf;
3759
3760         sr->kbuf = kbuf;
3761         req->flags |= REQ_F_BUFFER_SELECTED;
3762
3763         *cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
3764         *cflags |= IORING_CQE_F_BUFFER;
3765         return kbuf;
3766 }
3767
3768 static int io_recvmsg_prep(struct io_kiocb *req,
3769                            const struct io_uring_sqe *sqe)
3770 {
3771         struct io_sr_msg *sr = &req->sr_msg;
3772         struct io_async_ctx *io = req->io;
3773         int ret;
3774
3775         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3776                 return -EINVAL;
3777
3778         sr->msg_flags = READ_ONCE(sqe->msg_flags);
3779         sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
3780         sr->len = READ_ONCE(sqe->len);
3781         sr->bgid = READ_ONCE(sqe->buf_group);
3782
3783 #ifdef CONFIG_COMPAT
3784         if (req->ctx->compat)
3785                 sr->msg_flags |= MSG_CMSG_COMPAT;
3786 #endif
3787
3788         if (!io || req->opcode == IORING_OP_RECV)
3789                 return 0;
3790         /* iovec is already imported */
3791         if (req->flags & REQ_F_NEED_CLEANUP)
3792                 return 0;
3793
3794         ret = io_recvmsg_copy_hdr(req, io);
3795         if (!ret)
3796                 req->flags |= REQ_F_NEED_CLEANUP;
3797         return ret;
3798 }
3799
3800 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
3801 {
3802         struct io_async_msghdr *kmsg = NULL;
3803         struct socket *sock;
3804         int ret, cflags = 0;
3805
3806         sock = sock_from_file(req->file, &ret);
3807         if (sock) {
3808                 struct io_buffer *kbuf;
3809                 struct io_async_ctx io;
3810                 unsigned flags;
3811
3812                 if (req->io) {
3813                         kmsg = &req->io->msg;
3814                         kmsg->msg.msg_name = &req->io->msg.addr;
3815                         /* if iov is set, it's allocated already */
3816                         if (!kmsg->iov)
3817                                 kmsg->iov = kmsg->fast_iov;
3818                         kmsg->msg.msg_iter.iov = kmsg->iov;
3819                 } else {
3820                         kmsg = &io.msg;
3821                         kmsg->msg.msg_name = &io.msg.addr;
3822
3823                         ret = io_recvmsg_copy_hdr(req, &io);
3824                         if (ret)
3825                                 return ret;
3826                 }
3827
3828                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3829                 if (IS_ERR(kbuf)) {
3830                         return PTR_ERR(kbuf);
3831                 } else if (kbuf) {
3832                         kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3833                         iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
3834                                         1, req->sr_msg.len);
3835                 }
3836
3837                 flags = req->sr_msg.msg_flags;
3838                 if (flags & MSG_DONTWAIT)
3839                         req->flags |= REQ_F_NOWAIT;
3840                 else if (force_nonblock)
3841                         flags |= MSG_DONTWAIT;
3842
3843                 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
3844                                                 kmsg->uaddr, flags);
3845                 if (force_nonblock && ret == -EAGAIN)
3846                         return io_setup_async_msg(req, kmsg);
3847                 if (ret == -ERESTARTSYS)
3848                         ret = -EINTR;
3849         }
3850
3851         if (kmsg && kmsg->iov != kmsg->fast_iov)
3852                 kfree(kmsg->iov);
3853         req->flags &= ~REQ_F_NEED_CLEANUP;
3854         __io_cqring_add_event(req, ret, cflags);
3855         if (ret < 0)
3856                 req_set_fail_links(req);
3857         io_put_req(req);
3858         return 0;
3859 }
3860
3861 static int io_recv(struct io_kiocb *req, bool force_nonblock)
3862 {
3863         struct io_buffer *kbuf = NULL;
3864         struct socket *sock;
3865         int ret, cflags = 0;
3866
3867         sock = sock_from_file(req->file, &ret);
3868         if (sock) {
3869                 struct io_sr_msg *sr = &req->sr_msg;
3870                 void __user *buf = sr->buf;
3871                 struct msghdr msg;
3872                 struct iovec iov;
3873                 unsigned flags;
3874
3875                 kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
3876                 if (IS_ERR(kbuf))
3877                         return PTR_ERR(kbuf);
3878                 else if (kbuf)
3879                         buf = u64_to_user_ptr(kbuf->addr);
3880
3881                 ret = import_single_range(READ, buf, sr->len, &iov,
3882                                                 &msg.msg_iter);
3883                 if (ret) {
3884                         kfree(kbuf);
3885                         return ret;
3886                 }
3887
3888                 req->flags |= REQ_F_NEED_CLEANUP;
3889                 msg.msg_name = NULL;
3890                 msg.msg_control = NULL;
3891                 msg.msg_controllen = 0;
3892                 msg.msg_namelen = 0;
3893                 msg.msg_iocb = NULL;
3894                 msg.msg_flags = 0;
3895
3896                 flags = req->sr_msg.msg_flags;
3897                 if (flags & MSG_DONTWAIT)
3898                         req->flags |= REQ_F_NOWAIT;
3899                 else if (force_nonblock)
3900                         flags |= MSG_DONTWAIT;
3901
3902                 ret = sock_recvmsg(sock, &msg, flags);
3903                 if (force_nonblock && ret == -EAGAIN)
3904                         return -EAGAIN;
3905                 if (ret == -ERESTARTSYS)
3906                         ret = -EINTR;
3907         }
3908
3909         kfree(kbuf);
3910         req->flags &= ~REQ_F_NEED_CLEANUP;
3911         __io_cqring_add_event(req, ret, cflags);
3912         if (ret < 0)
3913                 req_set_fail_links(req);
3914         io_put_req(req);
3915         return 0;
3916 }
3917
3918 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3919 {
3920         struct io_accept *accept = &req->accept;
3921
3922         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3923                 return -EINVAL;
3924         if (sqe->ioprio || sqe->len || sqe->buf_index)
3925                 return -EINVAL;
3926
3927         accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3928         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3929         accept->flags = READ_ONCE(sqe->accept_flags);
3930         accept->nofile = rlimit(RLIMIT_NOFILE);
3931         return 0;
3932 }
3933
3934 static int io_accept(struct io_kiocb *req, bool force_nonblock)
3935 {
3936         struct io_accept *accept = &req->accept;
3937         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
3938         int ret;
3939
3940         if (req->file->f_flags & O_NONBLOCK)
3941                 req->flags |= REQ_F_NOWAIT;
3942
3943         ret = __sys_accept4_file(req->file, file_flags, accept->addr,
3944                                         accept->addr_len, accept->flags,
3945                                         accept->nofile);
3946         if (ret == -EAGAIN && force_nonblock)
3947                 return -EAGAIN;
3948         if (ret < 0) {
3949                 if (ret == -ERESTARTSYS)
3950                         ret = -EINTR;
3951                 req_set_fail_links(req);
3952         }
3953         io_cqring_add_event(req, ret);
3954         io_put_req(req);
3955         return 0;
3956 }
3957
3958 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3959 {
3960         struct io_connect *conn = &req->connect;
3961         struct io_async_ctx *io = req->io;
3962
3963         if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3964                 return -EINVAL;
3965         if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
3966                 return -EINVAL;
3967
3968         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
3969         conn->addr_len =  READ_ONCE(sqe->addr2);
3970
3971         if (!io)
3972                 return 0;
3973
3974         return move_addr_to_kernel(conn->addr, conn->addr_len,
3975                                         &io->connect.address);
3976 }
3977
3978 static int io_connect(struct io_kiocb *req, bool force_nonblock)
3979 {
3980         struct io_async_ctx __io, *io;
3981         unsigned file_flags;
3982         int ret;
3983
3984         if (req->io) {
3985                 io = req->io;
3986         } else {
3987                 ret = move_addr_to_kernel(req->connect.addr,
3988                                                 req->connect.addr_len,
3989                                                 &__io.connect.address);
3990                 if (ret)
3991                         goto out;
3992                 io = &__io;
3993         }
3994
3995         file_flags = force_nonblock ? O_NONBLOCK : 0;
3996
3997         ret = __sys_connect_file(req->file, &io->connect.address,
3998                                         req->connect.addr_len, file_flags);
3999         if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4000                 if (req->io)
4001                         return -EAGAIN;
4002                 if (io_alloc_async_ctx(req)) {
4003                         ret = -ENOMEM;
4004                         goto out;
4005                 }
4006                 memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect));
4007                 return -EAGAIN;
4008         }
4009         if (ret == -ERESTARTSYS)
4010                 ret = -EINTR;
4011 out:
4012         if (ret < 0)
4013                 req_set_fail_links(req);
4014         io_cqring_add_event(req, ret);
4015         io_put_req(req);
4016         return 0;
4017 }
4018 #else /* !CONFIG_NET */
4019 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4020 {
4021         return -EOPNOTSUPP;
4022 }
4023
4024 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
4025 {
4026         return -EOPNOTSUPP;
4027 }
4028
4029 static int io_send(struct io_kiocb *req, bool force_nonblock)
4030 {
4031         return -EOPNOTSUPP;
4032 }
4033
4034 static int io_recvmsg_prep(struct io_kiocb *req,
4035                            const struct io_uring_sqe *sqe)
4036 {
4037         return -EOPNOTSUPP;
4038 }
4039
4040 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
4041 {
4042         return -EOPNOTSUPP;
4043 }
4044
4045 static int io_recv(struct io_kiocb *req, bool force_nonblock)
4046 {
4047         return -EOPNOTSUPP;
4048 }
4049
4050 static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4051 {
4052         return -EOPNOTSUPP;
4053 }
4054
4055 static int io_accept(struct io_kiocb *req, bool force_nonblock)
4056 {
4057         return -EOPNOTSUPP;
4058 }
4059
4060 static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4061 {
4062         return -EOPNOTSUPP;
4063 }
4064
4065 static int io_connect(struct io_kiocb *req, bool force_nonblock)
4066 {
4067         return -EOPNOTSUPP;
4068 }
4069 #endif /* CONFIG_NET */
4070
4071 struct io_poll_table {
4072         struct poll_table_struct pt;
4073         struct io_kiocb *req;
4074         int error;
4075 };
4076
4077 static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
4078 {
4079         struct task_struct *tsk = req->task;
4080         struct io_ring_ctx *ctx = req->ctx;
4081         int ret, notify = TWA_RESUME;
4082
4083         /*
4084          * SQPOLL kernel thread doesn't need notification, just a wakeup.
4085          * If we're not using an eventfd, then TWA_RESUME is always fine,
4086          * as we won't have dependencies between request completions for
4087          * other kernel wait conditions.
4088          */
4089         if (ctx->flags & IORING_SETUP_SQPOLL)
4090                 notify = 0;
4091         else if (ctx->cq_ev_fd)
4092                 notify = TWA_SIGNAL;
4093
4094         ret = task_work_add(tsk, cb, notify);
4095         if (!ret)
4096                 wake_up_process(tsk);
4097         return ret;
4098 }
4099
4100 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4101                            __poll_t mask, task_work_func_t func)
4102 {
4103         struct task_struct *tsk;
4104         int ret;
4105
4106         /* for instances that support it check for an event match first: */
4107         if (mask && !(mask & poll->events))
4108                 return 0;
4109
4110         trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4111
4112         list_del_init(&poll->wait.entry);
4113
4114         tsk = req->task;
4115         req->result = mask;
4116         init_task_work(&req->task_work, func);
4117         /*
4118          * If this fails, then the task is exiting. When a task exits, the
4119          * work gets canceled, so just cancel this request as well instead
4120          * of executing it. We can't safely execute it anyway, as we may not
4121          * have the needed state needed for it anyway.
4122          */
4123         ret = io_req_task_work_add(req, &req->task_work);
4124         if (unlikely(ret)) {
4125                 WRITE_ONCE(poll->canceled, true);
4126                 tsk = io_wq_get_task(req->ctx->io_wq);
4127                 task_work_add(tsk, &req->task_work, 0);
4128                 wake_up_process(tsk);
4129         }
4130         return 1;
4131 }
4132
4133 static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4134         __acquires(&req->ctx->completion_lock)
4135 {
4136         struct io_ring_ctx *ctx = req->ctx;
4137
4138         if (!req->result && !READ_ONCE(poll->canceled)) {
4139                 struct poll_table_struct pt = { ._key = poll->events };
4140
4141                 req->result = vfs_poll(req->file, &pt) & poll->events;
4142         }
4143
4144         spin_lock_irq(&ctx->completion_lock);
4145         if (!req->result && !READ_ONCE(poll->canceled)) {
4146                 add_wait_queue(poll->head, &poll->wait);
4147                 return true;
4148         }
4149
4150         return false;
4151 }
4152
4153 static void io_poll_remove_double(struct io_kiocb *req)
4154 {
4155         struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
4156
4157         lockdep_assert_held(&req->ctx->completion_lock);
4158
4159         if (poll && poll->head) {
4160                 struct wait_queue_head *head = poll->head;
4161
4162                 spin_lock(&head->lock);
4163                 list_del_init(&poll->wait.entry);
4164                 if (poll->wait.private)
4165                         refcount_dec(&req->refs);
4166                 poll->head = NULL;
4167                 spin_unlock(&head->lock);
4168         }
4169 }
4170
4171 static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4172 {
4173         struct io_ring_ctx *ctx = req->ctx;
4174
4175         io_poll_remove_double(req);
4176         req->poll.done = true;
4177         io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4178         io_commit_cqring(ctx);
4179 }
4180
4181 static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
4182 {
4183         struct io_ring_ctx *ctx = req->ctx;
4184
4185         if (io_poll_rewait(req, &req->poll)) {
4186                 spin_unlock_irq(&ctx->completion_lock);
4187                 return;
4188         }
4189
4190         hash_del(&req->hash_node);
4191         io_poll_complete(req, req->result, 0);
4192         req->flags |= REQ_F_COMP_LOCKED;
4193         io_put_req_find_next(req, nxt);
4194         spin_unlock_irq(&ctx->completion_lock);
4195
4196         io_cqring_ev_posted(ctx);
4197 }
4198
4199 static void io_poll_task_func(struct callback_head *cb)
4200 {
4201         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4202         struct io_kiocb *nxt = NULL;
4203
4204         io_poll_task_handler(req, &nxt);
4205         if (nxt) {
4206                 struct io_ring_ctx *ctx = nxt->ctx;
4207
4208                 mutex_lock(&ctx->uring_lock);
4209                 __io_queue_sqe(nxt, NULL);
4210                 mutex_unlock(&ctx->uring_lock);
4211         }
4212 }
4213
4214 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4215                                int sync, void *key)
4216 {
4217         struct io_kiocb *req = wait->private;
4218         struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io;
4219         __poll_t mask = key_to_poll(key);
4220
4221         /* for instances that support it check for an event match first: */
4222         if (mask && !(mask & poll->events))
4223                 return 0;
4224
4225         if (req->poll.head) {
4226                 bool done;
4227
4228                 spin_lock(&req->poll.head->lock);
4229                 done = list_empty(&req->poll.wait.entry);
4230                 if (!done)
4231                         list_del_init(&req->poll.wait.entry);
4232                 spin_unlock(&req->poll.head->lock);
4233                 if (!done)
4234                         __io_async_wake(req, poll, mask, io_poll_task_func);
4235         }
4236         refcount_dec(&req->refs);
4237         return 1;
4238 }
4239
4240 static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
4241                               wait_queue_func_t wake_func)
4242 {
4243         poll->head = NULL;
4244         poll->done = false;
4245         poll->canceled = false;
4246         poll->events = events;
4247         INIT_LIST_HEAD(&poll->wait.entry);
4248         init_waitqueue_func_entry(&poll->wait, wake_func);
4249 }
4250
4251 static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
4252                             struct wait_queue_head *head)
4253 {
4254         struct io_kiocb *req = pt->req;
4255
4256         /*
4257          * If poll->head is already set, it's because the file being polled
4258          * uses multiple waitqueues for poll handling (eg one for read, one
4259          * for write). Setup a separate io_poll_iocb if this happens.
4260          */
4261         if (unlikely(poll->head)) {
4262                 /* already have a 2nd entry, fail a third attempt */
4263                 if (req->io) {
4264                         pt->error = -EINVAL;
4265                         return;
4266                 }
4267                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
4268                 if (!poll) {
4269                         pt->error = -ENOMEM;
4270                         return;
4271                 }
4272                 io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake);
4273                 refcount_inc(&req->refs);
4274                 poll->wait.private = req;
4275                 req->io = (void *) poll;
4276         }
4277
4278         pt->error = 0;
4279         poll->head = head;
4280         add_wait_queue(head, &poll->wait);
4281 }
4282
4283 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
4284                                struct poll_table_struct *p)
4285 {
4286         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4287
4288         __io_queue_proc(&pt->req->apoll->poll, pt, head);
4289 }
4290
4291 static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
4292 {
4293         struct mm_struct *mm = current->mm;
4294
4295         if (mm) {
4296                 kthread_unuse_mm(mm);
4297                 mmput(mm);
4298         }
4299 }
4300
4301 static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
4302                                    struct io_kiocb *req)
4303 {
4304         if (io_op_defs[req->opcode].needs_mm && !current->mm) {
4305                 if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
4306                         return -EFAULT;
4307                 kthread_use_mm(ctx->sqo_mm);
4308         }
4309
4310         return 0;
4311 }
4312
4313 static void io_async_task_func(struct callback_head *cb)
4314 {
4315         struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4316         struct async_poll *apoll = req->apoll;
4317         struct io_ring_ctx *ctx = req->ctx;
4318         bool canceled = false;
4319
4320         trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
4321
4322         if (io_poll_rewait(req, &apoll->poll)) {
4323                 spin_unlock_irq(&ctx->completion_lock);
4324                 return;
4325         }
4326
4327         /* If req is still hashed, it cannot have been canceled. Don't check. */
4328         if (hash_hashed(&req->hash_node)) {
4329                 hash_del(&req->hash_node);
4330         } else {
4331                 canceled = READ_ONCE(apoll->poll.canceled);
4332                 if (canceled) {
4333                         io_cqring_fill_event(req, -ECANCELED);
4334                         io_commit_cqring(ctx);
4335                 }
4336         }
4337
4338         spin_unlock_irq(&ctx->completion_lock);
4339
4340         /* restore ->work in case we need to retry again */
4341         if (req->flags & REQ_F_WORK_INITIALIZED)
4342                 memcpy(&req->work, &apoll->work, sizeof(req->work));
4343         kfree(apoll);
4344
4345         if (!canceled) {
4346                 __set_current_state(TASK_RUNNING);
4347                 if (io_sq_thread_acquire_mm(ctx, req)) {
4348                         io_cqring_add_event(req, -EFAULT);
4349                         goto end_req;
4350                 }
4351                 mutex_lock(&ctx->uring_lock);
4352                 __io_queue_sqe(req, NULL);
4353                 mutex_unlock(&ctx->uring_lock);
4354         } else {
4355                 io_cqring_ev_posted(ctx);
4356 end_req:
4357                 req_set_fail_links(req);
4358                 io_double_put_req(req);
4359         }
4360 }
4361
4362 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4363                         void *key)
4364 {
4365         struct io_kiocb *req = wait->private;
4366         struct io_poll_iocb *poll = &req->apoll->poll;
4367
4368         trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
4369                                         key_to_poll(key));
4370
4371         return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
4372 }
4373
4374 static void io_poll_req_insert(struct io_kiocb *req)
4375 {
4376         struct io_ring_ctx *ctx = req->ctx;
4377         struct hlist_head *list;
4378
4379         list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
4380         hlist_add_head(&req->hash_node, list);
4381 }
4382
4383 static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
4384                                       struct io_poll_iocb *poll,
4385                                       struct io_poll_table *ipt, __poll_t mask,
4386                                       wait_queue_func_t wake_func)
4387         __acquires(&ctx->completion_lock)
4388 {
4389         struct io_ring_ctx *ctx = req->ctx;
4390         bool cancel = false;
4391
4392         poll->file = req->file;
4393         io_init_poll_iocb(poll, mask, wake_func);
4394         poll->wait.private = req;
4395
4396         ipt->pt._key = mask;
4397         ipt->req = req;
4398         ipt->error = -EINVAL;
4399
4400         mask = vfs_poll(req->file, &ipt->pt) & poll->events;
4401
4402         spin_lock_irq(&ctx->completion_lock);
4403         if (likely(poll->head)) {
4404                 spin_lock(&poll->head->lock);
4405                 if (unlikely(list_empty(&poll->wait.entry))) {
4406                         if (ipt->error)
4407                                 cancel = true;
4408                         ipt->error = 0;
4409                         mask = 0;
4410                 }
4411                 if (mask || ipt->error)
4412                         list_del_init(&poll->wait.entry);
4413                 else if (cancel)
4414                         WRITE_ONCE(poll->canceled, true);
4415                 else if (!poll->done) /* actually waiting for an event */
4416                         io_poll_req_insert(req);
4417                 spin_unlock(&poll->head->lock);
4418         }
4419
4420         return mask;
4421 }
4422
4423 static bool io_arm_poll_handler(struct io_kiocb *req)
4424 {
4425         const struct io_op_def *def = &io_op_defs[req->opcode];
4426         struct io_ring_ctx *ctx = req->ctx;
4427         struct async_poll *apoll;
4428         struct io_poll_table ipt;
4429         __poll_t mask, ret;
4430         bool had_io;
4431
4432         if (!req->file || !file_can_poll(req->file))
4433                 return false;
4434         if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
4435                 return false;
4436         if (!def->pollin && !def->pollout)
4437                 return false;
4438
4439         apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
4440         if (unlikely(!apoll))
4441                 return false;
4442
4443         req->flags |= REQ_F_POLLED;
4444         if (req->flags & REQ_F_WORK_INITIALIZED)
4445                 memcpy(&apoll->work, &req->work, sizeof(req->work));
4446         had_io = req->io != NULL;
4447
4448         io_get_req_task(req);
4449         req->apoll = apoll;
4450         INIT_HLIST_NODE(&req->hash_node);
4451
4452         mask = 0;
4453         if (def->pollin)
4454                 mask |= POLLIN | POLLRDNORM;
4455         if (def->pollout)
4456                 mask |= POLLOUT | POLLWRNORM;
4457         mask |= POLLERR | POLLPRI;
4458
4459         ipt.pt._qproc = io_async_queue_proc;
4460
4461         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
4462                                         io_async_wake);
4463         if (ret) {
4464                 ipt.error = 0;
4465                 /* only remove double add if we did it here */
4466                 if (!had_io)
4467                         io_poll_remove_double(req);
4468                 spin_unlock_irq(&ctx->completion_lock);
4469                 if (req->flags & REQ_F_WORK_INITIALIZED)
4470                         memcpy(&req->work, &apoll->work, sizeof(req->work));
4471                 kfree(apoll);
4472                 return false;
4473         }
4474         spin_unlock_irq(&ctx->completion_lock);
4475         trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
4476                                         apoll->poll.events);
4477         return true;
4478 }
4479
4480 static bool __io_poll_remove_one(struct io_kiocb *req,
4481                                  struct io_poll_iocb *poll)
4482 {
4483         bool do_complete = false;
4484
4485         spin_lock(&poll->head->lock);
4486         WRITE_ONCE(poll->canceled, true);
4487         if (!list_empty(&poll->wait.entry)) {
4488                 list_del_init(&poll->wait.entry);
4489                 do_complete = true;
4490         }
4491         spin_unlock(&poll->head->lock);
4492         hash_del(&req->hash_node);
4493         return do_complete;
4494 }
4495
4496 static bool io_poll_remove_one(struct io_kiocb *req)
4497 {
4498         bool do_complete;
4499
4500         if (req->opcode == IORING_OP_POLL_ADD) {
4501                 io_poll_remove_double(req);
4502                 do_complete = __io_poll_remove_one(req, &req->poll);
4503         } else {
4504                 struct async_poll *apoll = req->apoll;
4505
4506                 /* non-poll requests have submit ref still */
4507                 do_complete = __io_poll_remove_one(req, &apoll->poll);
4508                 if (do_complete) {
4509                         io_put_req(req);
4510                         /*
4511                          * restore ->work because we will call
4512                          * io_req_work_drop_env below when dropping the
4513                          * final reference.
4514                          */
4515                         if (req->flags & REQ_F_WORK_INITIALIZED)
4516                                 memcpy(&req->work, &apoll->work,
4517                                        sizeof(req->work));
4518                         kfree(apoll);
4519                 }
4520         }
4521
4522         if (do_complete) {
4523                 io_cqring_fill_event(req, -ECANCELED);
4524                 io_commit_cqring(req->ctx);
4525                 req->flags |= REQ_F_COMP_LOCKED;
4526                 io_put_req(req);
4527         }
4528
4529         return do_complete;
4530 }
4531
4532 static void io_poll_remove_all(struct io_ring_ctx *ctx)
4533 {
4534         struct hlist_node *tmp;
4535         struct io_kiocb *req;
4536         int posted = 0, i;
4537
4538         spin_lock_irq(&ctx->completion_lock);
4539         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
4540                 struct hlist_head *list;
4541
4542                 list = &ctx->cancel_hash[i];
4543                 hlist_for_each_entry_safe(req, tmp, list, hash_node)
4544                         posted += io_poll_remove_one(req);
4545         }
4546         spin_unlock_irq(&ctx->completion_lock);
4547
4548         if (posted)
4549                 io_cqring_ev_posted(ctx);
4550 }
4551
4552 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
4553 {
4554         struct hlist_head *list;
4555         struct io_kiocb *req;
4556
4557         list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
4558         hlist_for_each_entry(req, list, hash_node) {
4559                 if (sqe_addr != req->user_data)
4560                         continue;
4561                 if (io_poll_remove_one(req))
4562                         return 0;
4563                 return -EALREADY;
4564         }
4565
4566         return -ENOENT;
4567 }
4568
4569 static int io_poll_remove_prep(struct io_kiocb *req,
4570                                const struct io_uring_sqe *sqe)
4571 {
4572         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4573                 return -EINVAL;
4574         if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
4575             sqe->poll_events)
4576                 return -EINVAL;
4577
4578         req->poll.addr = READ_ONCE(sqe->addr);
4579         return 0;
4580 }
4581
4582 /*
4583  * Find a running poll command that matches one specified in sqe->addr,
4584  * and remove it if found.
4585  */
4586 static int io_poll_remove(struct io_kiocb *req)
4587 {
4588         struct io_ring_ctx *ctx = req->ctx;
4589         u64 addr;
4590         int ret;
4591
4592         addr = req->poll.addr;
4593         spin_lock_irq(&ctx->completion_lock);
4594         ret = io_poll_cancel(ctx, addr);
4595         spin_unlock_irq(&ctx->completion_lock);
4596
4597         io_cqring_add_event(req, ret);
4598         if (ret < 0)
4599                 req_set_fail_links(req);
4600         io_put_req(req);
4601         return 0;
4602 }
4603
4604 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
4605                         void *key)
4606 {
4607         struct io_kiocb *req = wait->private;
4608         struct io_poll_iocb *poll = &req->poll;
4609
4610         return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
4611 }
4612
4613 static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
4614                                struct poll_table_struct *p)
4615 {
4616         struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
4617
4618         __io_queue_proc(&pt->req->poll, pt, head);
4619 }
4620
4621 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4622 {
4623         struct io_poll_iocb *poll = &req->poll;
4624         u16 events;
4625
4626         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4627                 return -EINVAL;
4628         if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
4629                 return -EINVAL;
4630         if (!poll->file)
4631                 return -EBADF;
4632
4633         events = READ_ONCE(sqe->poll_events);
4634         poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
4635
4636         io_get_req_task(req);
4637         return 0;
4638 }
4639
4640 static int io_poll_add(struct io_kiocb *req)
4641 {
4642         struct io_poll_iocb *poll = &req->poll;
4643         struct io_ring_ctx *ctx = req->ctx;
4644         struct io_poll_table ipt;
4645         __poll_t mask;
4646
4647         INIT_HLIST_NODE(&req->hash_node);
4648         INIT_LIST_HEAD(&req->list);
4649         ipt.pt._qproc = io_poll_queue_proc;
4650
4651         mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
4652                                         io_poll_wake);
4653
4654         if (mask) { /* no async, we'd stolen it */
4655                 ipt.error = 0;
4656                 io_poll_complete(req, mask, 0);
4657         }
4658         spin_unlock_irq(&ctx->completion_lock);
4659
4660         if (mask) {
4661                 io_cqring_ev_posted(ctx);
4662                 io_put_req(req);
4663         }
4664         return ipt.error;
4665 }
4666
4667 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
4668 {
4669         struct io_timeout_data *data = container_of(timer,
4670                                                 struct io_timeout_data, timer);
4671         struct io_kiocb *req = data->req;
4672         struct io_ring_ctx *ctx = req->ctx;
4673         unsigned long flags;
4674
4675         atomic_inc(&ctx->cq_timeouts);
4676
4677         spin_lock_irqsave(&ctx->completion_lock, flags);
4678         /*
4679          * We could be racing with timeout deletion. If the list is empty,
4680          * then timeout lookup already found it and will be handling it.
4681          */
4682         if (!list_empty(&req->list))
4683                 list_del_init(&req->list);
4684
4685         io_cqring_fill_event(req, -ETIME);
4686         io_commit_cqring(ctx);
4687         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4688
4689         io_cqring_ev_posted(ctx);
4690         req_set_fail_links(req);
4691         io_put_req(req);
4692         return HRTIMER_NORESTART;
4693 }
4694
4695 static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
4696 {
4697         struct io_kiocb *req;
4698         int ret = -ENOENT;
4699
4700         list_for_each_entry(req, &ctx->timeout_list, list) {
4701                 if (user_data == req->user_data) {
4702                         list_del_init(&req->list);
4703                         ret = 0;
4704                         break;
4705                 }
4706         }
4707
4708         if (ret == -ENOENT)
4709                 return ret;
4710
4711         ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
4712         if (ret == -1)
4713                 return -EALREADY;
4714
4715         req_set_fail_links(req);
4716         io_cqring_fill_event(req, -ECANCELED);
4717         io_put_req(req);
4718         return 0;
4719 }
4720
4721 static int io_timeout_remove_prep(struct io_kiocb *req,
4722                                   const struct io_uring_sqe *sqe)
4723 {
4724         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4725                 return -EINVAL;
4726         if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
4727                 return -EINVAL;
4728
4729         req->timeout.addr = READ_ONCE(sqe->addr);
4730         req->timeout.flags = READ_ONCE(sqe->timeout_flags);
4731         if (req->timeout.flags)
4732                 return -EINVAL;
4733
4734         return 0;
4735 }
4736
4737 /*
4738  * Remove or update an existing timeout command
4739  */
4740 static int io_timeout_remove(struct io_kiocb *req)
4741 {
4742         struct io_ring_ctx *ctx = req->ctx;
4743         int ret;
4744
4745         spin_lock_irq(&ctx->completion_lock);
4746         ret = io_timeout_cancel(ctx, req->timeout.addr);
4747
4748         io_cqring_fill_event(req, ret);
4749         io_commit_cqring(ctx);
4750         spin_unlock_irq(&ctx->completion_lock);
4751         io_cqring_ev_posted(ctx);
4752         if (ret < 0)
4753                 req_set_fail_links(req);
4754         io_put_req(req);
4755         return 0;
4756 }
4757
4758 static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
4759                            bool is_timeout_link)
4760 {
4761         struct io_timeout_data *data;
4762         unsigned flags;
4763         u32 off = READ_ONCE(sqe->off);
4764
4765         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4766                 return -EINVAL;
4767         if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
4768                 return -EINVAL;
4769         if (off && is_timeout_link)
4770                 return -EINVAL;
4771         flags = READ_ONCE(sqe->timeout_flags);
4772         if (flags & ~IORING_TIMEOUT_ABS)
4773                 return -EINVAL;
4774
4775         req->timeout.off = off;
4776
4777         if (!req->io && io_alloc_async_ctx(req))
4778                 return -ENOMEM;
4779
4780         data = &req->io->timeout;
4781         data->req = req;
4782         req->flags |= REQ_F_TIMEOUT;
4783
4784         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
4785                 return -EFAULT;
4786
4787         if (flags & IORING_TIMEOUT_ABS)
4788                 data->mode = HRTIMER_MODE_ABS;
4789         else
4790                 data->mode = HRTIMER_MODE_REL;
4791
4792         hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
4793         return 0;
4794 }
4795
4796 static int io_timeout(struct io_kiocb *req)
4797 {
4798         struct io_ring_ctx *ctx = req->ctx;
4799         struct io_timeout_data *data = &req->io->timeout;
4800         struct list_head *entry;
4801         u32 tail, off = req->timeout.off;
4802
4803         spin_lock_irq(&ctx->completion_lock);
4804
4805         /*
4806          * sqe->off holds how many events that need to occur for this
4807          * timeout event to be satisfied. If it isn't set, then this is
4808          * a pure timeout request, sequence isn't used.
4809          */
4810         if (!off) {
4811                 req->flags |= REQ_F_TIMEOUT_NOSEQ;
4812                 entry = ctx->timeout_list.prev;
4813                 goto add;
4814         }
4815
4816         tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
4817         req->timeout.target_seq = tail + off;
4818
4819         /*
4820          * Insertion sort, ensuring the first entry in the list is always
4821          * the one we need first.
4822          */
4823         list_for_each_prev(entry, &ctx->timeout_list) {
4824                 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
4825
4826                 if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
4827                         continue;
4828                 /* nxt.seq is behind @tail, otherwise would've been completed */
4829                 if (off >= nxt->timeout.target_seq - tail)
4830                         break;
4831         }
4832 add:
4833         list_add(&req->list, entry);
4834         data->timer.function = io_timeout_fn;
4835         hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
4836         spin_unlock_irq(&ctx->completion_lock);
4837         return 0;
4838 }
4839
4840 static bool io_cancel_cb(struct io_wq_work *work, void *data)
4841 {
4842         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
4843
4844         return req->user_data == (unsigned long) data;
4845 }
4846
4847 static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
4848 {
4849         enum io_wq_cancel cancel_ret;
4850         int ret = 0;
4851
4852         cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
4853         switch (cancel_ret) {
4854         case IO_WQ_CANCEL_OK:
4855                 ret = 0;
4856                 break;
4857         case IO_WQ_CANCEL_RUNNING:
4858                 ret = -EALREADY;
4859                 break;
4860         case IO_WQ_CANCEL_NOTFOUND:
4861                 ret = -ENOENT;
4862                 break;
4863         }
4864
4865         return ret;
4866 }
4867
4868 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
4869                                      struct io_kiocb *req, __u64 sqe_addr,
4870                                      int success_ret)
4871 {
4872         unsigned long flags;
4873         int ret;
4874
4875         ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
4876         if (ret != -ENOENT) {
4877                 spin_lock_irqsave(&ctx->completion_lock, flags);
4878                 goto done;
4879         }
4880
4881         spin_lock_irqsave(&ctx->completion_lock, flags);
4882         ret = io_timeout_cancel(ctx, sqe_addr);
4883         if (ret != -ENOENT)
4884                 goto done;
4885         ret = io_poll_cancel(ctx, sqe_addr);
4886 done:
4887         if (!ret)
4888                 ret = success_ret;
4889         io_cqring_fill_event(req, ret);
4890         io_commit_cqring(ctx);
4891         spin_unlock_irqrestore(&ctx->completion_lock, flags);
4892         io_cqring_ev_posted(ctx);
4893
4894         if (ret < 0)
4895                 req_set_fail_links(req);
4896         io_put_req(req);
4897 }
4898
4899 static int io_async_cancel_prep(struct io_kiocb *req,
4900                                 const struct io_uring_sqe *sqe)
4901 {
4902         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4903                 return -EINVAL;
4904         if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
4905             sqe->cancel_flags)
4906                 return -EINVAL;
4907
4908         req->cancel.addr = READ_ONCE(sqe->addr);
4909         return 0;
4910 }
4911
4912 static int io_async_cancel(struct io_kiocb *req)
4913 {
4914         struct io_ring_ctx *ctx = req->ctx;
4915
4916         io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
4917         return 0;
4918 }
4919
4920 static int io_files_update_prep(struct io_kiocb *req,
4921                                 const struct io_uring_sqe *sqe)
4922 {
4923         if (sqe->flags || sqe->ioprio || sqe->rw_flags)
4924                 return -EINVAL;
4925
4926         req->files_update.offset = READ_ONCE(sqe->off);
4927         req->files_update.nr_args = READ_ONCE(sqe->len);
4928         if (!req->files_update.nr_args)
4929                 return -EINVAL;
4930         req->files_update.arg = READ_ONCE(sqe->addr);
4931         return 0;
4932 }
4933
4934 static int io_files_update(struct io_kiocb *req, bool force_nonblock)
4935 {
4936         struct io_ring_ctx *ctx = req->ctx;
4937         struct io_uring_files_update up;
4938         int ret;
4939
4940         if (force_nonblock)
4941                 return -EAGAIN;
4942
4943         up.offset = req->files_update.offset;
4944         up.fds = req->files_update.arg;
4945
4946         mutex_lock(&ctx->uring_lock);
4947         ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
4948         mutex_unlock(&ctx->uring_lock);
4949
4950         if (ret < 0)
4951                 req_set_fail_links(req);
4952         io_cqring_add_event(req, ret);
4953         io_put_req(req);
4954         return 0;
4955 }
4956
4957 static int io_req_defer_prep(struct io_kiocb *req,
4958                              const struct io_uring_sqe *sqe)
4959 {
4960         ssize_t ret = 0;
4961
4962         if (!sqe)
4963                 return 0;
4964
4965         io_req_init_async(req);
4966
4967         if (io_op_defs[req->opcode].file_table) {
4968                 ret = io_grab_files(req);
4969                 if (unlikely(ret))
4970                         return ret;
4971         }
4972
4973         io_req_work_grab_env(req, &io_op_defs[req->opcode]);
4974
4975         switch (req->opcode) {
4976         case IORING_OP_NOP:
4977                 break;
4978         case IORING_OP_READV:
4979         case IORING_OP_READ_FIXED:
4980         case IORING_OP_READ:
4981                 ret = io_read_prep(req, sqe, true);
4982                 break;
4983         case IORING_OP_WRITEV:
4984         case IORING_OP_WRITE_FIXED:
4985         case IORING_OP_WRITE:
4986                 ret = io_write_prep(req, sqe, true);
4987                 break;
4988         case IORING_OP_POLL_ADD:
4989                 ret = io_poll_add_prep(req, sqe);
4990                 break;
4991         case IORING_OP_POLL_REMOVE:
4992                 ret = io_poll_remove_prep(req, sqe);
4993                 break;
4994         case IORING_OP_FSYNC:
4995                 ret = io_prep_fsync(req, sqe);
4996                 break;
4997         case IORING_OP_SYNC_FILE_RANGE:
4998                 ret = io_prep_sfr(req, sqe);
4999                 break;
5000         case IORING_OP_SENDMSG:
5001         case IORING_OP_SEND:
5002                 ret = io_sendmsg_prep(req, sqe);
5003                 break;
5004         case IORING_OP_RECVMSG:
5005         case IORING_OP_RECV:
5006                 ret = io_recvmsg_prep(req, sqe);
5007                 break;
5008         case IORING_OP_CONNECT:
5009                 ret = io_connect_prep(req, sqe);
5010                 break;
5011         case IORING_OP_TIMEOUT:
5012                 ret = io_timeout_prep(req, sqe, false);
5013                 break;
5014         case IORING_OP_TIMEOUT_REMOVE:
5015                 ret = io_timeout_remove_prep(req, sqe);
5016                 break;
5017         case IORING_OP_ASYNC_CANCEL:
5018                 ret = io_async_cancel_prep(req, sqe);
5019                 break;
5020         case IORING_OP_LINK_TIMEOUT:
5021                 ret = io_timeout_prep(req, sqe, true);
5022                 break;
5023         case IORING_OP_ACCEPT:
5024                 ret = io_accept_prep(req, sqe);
5025                 break;
5026         case IORING_OP_FALLOCATE:
5027                 ret = io_fallocate_prep(req, sqe);
5028                 break;
5029         case IORING_OP_OPENAT:
5030                 ret = io_openat_prep(req, sqe);
5031                 break;
5032         case IORING_OP_CLOSE:
5033                 ret = io_close_prep(req, sqe);
5034                 break;
5035         case IORING_OP_FILES_UPDATE:
5036                 ret = io_files_update_prep(req, sqe);
5037                 break;
5038         case IORING_OP_STATX:
5039                 ret = io_statx_prep(req, sqe);
5040                 break;
5041         case IORING_OP_FADVISE:
5042                 ret = io_fadvise_prep(req, sqe);
5043                 break;
5044         case IORING_OP_MADVISE:
5045                 ret = io_madvise_prep(req, sqe);
5046                 break;
5047         case IORING_OP_OPENAT2:
5048                 ret = io_openat2_prep(req, sqe);
5049                 break;
5050         case IORING_OP_EPOLL_CTL:
5051                 ret = io_epoll_ctl_prep(req, sqe);
5052                 break;
5053         case IORING_OP_SPLICE:
5054                 ret = io_splice_prep(req, sqe);
5055                 break;
5056         case IORING_OP_PROVIDE_BUFFERS:
5057                 ret = io_provide_buffers_prep(req, sqe);
5058                 break;
5059         case IORING_OP_REMOVE_BUFFERS:
5060                 ret = io_remove_buffers_prep(req, sqe);
5061                 break;
5062         case IORING_OP_TEE:
5063                 ret = io_tee_prep(req, sqe);
5064                 break;
5065         default:
5066                 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5067                                 req->opcode);
5068                 ret = -EINVAL;
5069                 break;
5070         }
5071
5072         return ret;
5073 }
5074
5075 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5076 {
5077         struct io_ring_ctx *ctx = req->ctx;
5078         int ret;
5079
5080         /* Still need defer if there is pending req in defer list. */
5081         if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
5082                 return 0;
5083
5084         if (!req->io) {
5085                 if (io_alloc_async_ctx(req))
5086                         return -EAGAIN;
5087                 ret = io_req_defer_prep(req, sqe);
5088                 if (ret < 0)
5089                         return ret;
5090         }
5091
5092         spin_lock_irq(&ctx->completion_lock);
5093         if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
5094                 spin_unlock_irq(&ctx->completion_lock);
5095                 return 0;
5096         }
5097
5098         trace_io_uring_defer(ctx, req, req->user_data);
5099         list_add_tail(&req->list, &ctx->defer_list);
5100         spin_unlock_irq(&ctx->completion_lock);
5101         return -EIOCBQUEUED;
5102 }
5103
5104 static void io_cleanup_req(struct io_kiocb *req)
5105 {
5106         struct io_async_ctx *io = req->io;
5107
5108         switch (req->opcode) {
5109         case IORING_OP_READV:
5110         case IORING_OP_READ_FIXED:
5111         case IORING_OP_READ:
5112                 if (req->flags & REQ_F_BUFFER_SELECTED)
5113                         kfree((void *)(unsigned long)req->rw.addr);
5114                 /* fallthrough */
5115         case IORING_OP_WRITEV:
5116         case IORING_OP_WRITE_FIXED:
5117         case IORING_OP_WRITE:
5118                 if (io->rw.iov != io->rw.fast_iov)
5119                         kfree(io->rw.iov);
5120                 break;
5121         case IORING_OP_RECVMSG:
5122                 if (req->flags & REQ_F_BUFFER_SELECTED)
5123                         kfree(req->sr_msg.kbuf);
5124                 /* fallthrough */
5125         case IORING_OP_SENDMSG:
5126                 if (io->msg.iov != io->msg.fast_iov)
5127                         kfree(io->msg.iov);
5128                 break;
5129         case IORING_OP_RECV:
5130                 if (req->flags & REQ_F_BUFFER_SELECTED)
5131                         kfree(req->sr_msg.kbuf);
5132                 break;
5133         case IORING_OP_OPENAT:
5134         case IORING_OP_OPENAT2:
5135                 break;
5136         case IORING_OP_SPLICE:
5137         case IORING_OP_TEE:
5138                 io_put_file(req, req->splice.file_in,
5139                             (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5140                 break;
5141         }
5142
5143         req->flags &= ~REQ_F_NEED_CLEANUP;
5144 }
5145
/*
 * Dispatch @req to the handler that matches req->opcode.
 *
 * When @sqe is non-NULL, the opcode's prep routine runs first; if the case
 * treats the prep result as a failure (some cases test "ret < 0", others
 * any non-zero value) the handler is skipped and that result is kept. When
 * @sqe is NULL the handler is called directly. @force_nonblock is passed
 * through to the handlers that accept it.
 *
 * A non-zero result from the switch (prep failure, handler return value, or
 * -EINVAL for an opcode without a case) is returned as is. On a zero result,
 * and only if the ring has IORING_SETUP_IOPOLL set and the request has a
 * file, the request is passed to io_iopoll_req_issued(); ctx->uring_lock is
 * taken around that call when running in an io-wq worker.
 */
5146 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5147                         bool force_nonblock)
5148 {
5149         struct io_ring_ctx *ctx = req->ctx;
5150         int ret;
5151
5152         switch (req->opcode) {
5153         case IORING_OP_NOP:
5154                 ret = io_nop(req);
5155                 break;
5156         case IORING_OP_READV:
5157         case IORING_OP_READ_FIXED:
5158         case IORING_OP_READ:
5159                 if (sqe) {
5160                         ret = io_read_prep(req, sqe, force_nonblock);
5161                         if (ret < 0)
5162                                 break;
5163                 }
5164                 ret = io_read(req, force_nonblock);
5165                 break;
5166         case IORING_OP_WRITEV:
5167         case IORING_OP_WRITE_FIXED:
5168         case IORING_OP_WRITE:
5169                 if (sqe) {
5170                         ret = io_write_prep(req, sqe, force_nonblock);
5171                         if (ret < 0)
5172                                 break;
5173                 }
5174                 ret = io_write(req, force_nonblock);
5175                 break;
5176         case IORING_OP_FSYNC:
5177                 if (sqe) {
5178                         ret = io_prep_fsync(req, sqe);
5179                         if (ret < 0)
5180                                 break;
5181                 }
5182                 ret = io_fsync(req, force_nonblock);
5183                 break;
5184         case IORING_OP_POLL_ADD:
5185                 if (sqe) {
5186                         ret = io_poll_add_prep(req, sqe);
5187                         if (ret)
5188                                 break;
5189                 }
5190                 ret = io_poll_add(req);
5191                 break;
5192         case IORING_OP_POLL_REMOVE:
5193                 if (sqe) {
5194                         ret = io_poll_remove_prep(req, sqe);
5195                         if (ret < 0)
5196                                 break;
5197                 }
5198                 ret = io_poll_remove(req);
5199                 break;
5200         case IORING_OP_SYNC_FILE_RANGE:
5201                 if (sqe) {
5202                         ret = io_prep_sfr(req, sqe);
5203                         if (ret < 0)
5204                                 break;
5205                 }
5206                 ret = io_sync_file_range(req, force_nonblock);
5207                 break;
5208         case IORING_OP_SENDMSG:
5209         case IORING_OP_SEND:
                     /* both opcodes share one prep; only the handler differs */
5210                 if (sqe) {
5211                         ret = io_sendmsg_prep(req, sqe);
5212                         if (ret < 0)
5213                                 break;
5214                 }
5215                 if (req->opcode == IORING_OP_SENDMSG)
5216                         ret = io_sendmsg(req, force_nonblock);
5217                 else
5218                         ret = io_send(req, force_nonblock);
5219                 break;
5220         case IORING_OP_RECVMSG:
5221         case IORING_OP_RECV:
                     /* both opcodes share one prep; only the handler differs */
5222                 if (sqe) {
5223                         ret = io_recvmsg_prep(req, sqe);
5224                         if (ret)
5225                                 break;
5226                 }
5227                 if (req->opcode == IORING_OP_RECVMSG)
5228                         ret = io_recvmsg(req, force_nonblock);
5229                 else
5230                         ret = io_recv(req, force_nonblock);
5231                 break;
5232         case IORING_OP_TIMEOUT:
5233                 if (sqe) {
5234                         ret = io_timeout_prep(req, sqe, false);
5235                         if (ret)
5236                                 break;
5237                 }
5238                 ret = io_timeout(req);
5239                 break;
5240         case IORING_OP_TIMEOUT_REMOVE:
5241                 if (sqe) {
5242                         ret = io_timeout_remove_prep(req, sqe);
5243                         if (ret)
5244                                 break;
5245                 }
5246                 ret = io_timeout_remove(req);
5247                 break;
5248         case IORING_OP_ACCEPT:
5249                 if (sqe) {
5250                         ret = io_accept_prep(req, sqe);
5251                         if (ret)
5252                                 break;
5253                 }
5254                 ret = io_accept(req, force_nonblock);
5255                 break;
5256         case IORING_OP_CONNECT:
5257                 if (sqe) {
5258                         ret = io_connect_prep(req, sqe);
5259                         if (ret)
5260                                 break;
5261                 }
5262                 ret = io_connect(req, force_nonblock);
5263                 break;
5264         case IORING_OP_ASYNC_CANCEL:
5265                 if (sqe) {
5266                         ret = io_async_cancel_prep(req, sqe);
5267                         if (ret)
5268                                 break;
5269                 }
5270                 ret = io_async_cancel(req);
5271                 break;
5272         case IORING_OP_FALLOCATE:
5273                 if (sqe) {
5274                         ret = io_fallocate_prep(req, sqe);
5275                         if (ret)
5276                                 break;
5277                 }
5278                 ret = io_fallocate(req, force_nonblock);
5279                 break;
5280         case IORING_OP_OPENAT:
5281                 if (sqe) {
5282                         ret = io_openat_prep(req, sqe);
5283                         if (ret)
5284                                 break;
5285                 }
5286                 ret = io_openat(req, force_nonblock);
5287                 break;
5288         case IORING_OP_CLOSE:
5289                 if (sqe) {
5290                         ret = io_close_prep(req, sqe);
5291                         if (ret)
5292                                 break;
5293                 }
5294                 ret = io_close(req, force_nonblock);
5295                 break;
5296         case IORING_OP_FILES_UPDATE:
5297                 if (sqe) {
5298                         ret = io_files_update_prep(req, sqe);
5299                         if (ret)
5300                                 break;
5301                 }
5302                 ret = io_files_update(req, force_nonblock);
5303                 break;
5304         case IORING_OP_STATX:
5305                 if (sqe) {
5306                         ret = io_statx_prep(req, sqe);
5307                         if (ret)
5308                                 break;
5309                 }
5310                 ret = io_statx(req, force_nonblock);
5311                 break;
5312         case IORING_OP_FADVISE:
5313                 if (sqe) {
5314                         ret = io_fadvise_prep(req, sqe);
5315                         if (ret)
5316                                 break;
5317                 }
5318                 ret = io_fadvise(req, force_nonblock);
5319                 break;
5320         case IORING_OP_MADVISE:
5321                 if (sqe) {
5322                         ret = io_madvise_prep(req, sqe);
5323                         if (ret)
5324                                 break;
5325                 }
5326                 ret = io_madvise(req, force_nonblock);
5327                 break;
5328         case IORING_OP_OPENAT2:
5329                 if (sqe) {
5330                         ret = io_openat2_prep(req, sqe);
5331                         if (ret)
5332                                 break;
5333                 }
5334                 ret = io_openat2(req, force_nonblock);
5335                 break;
5336         case IORING_OP_EPOLL_CTL:
5337                 if (sqe) {
5338                         ret = io_epoll_ctl_prep(req, sqe);
5339                         if (ret)
5340                                 break;
5341                 }
5342                 ret = io_epoll_ctl(req, force_nonblock);
5343                 break;
5344         case IORING_OP_SPLICE:
5345                 if (sqe) {
5346                         ret = io_splice_prep(req, sqe);
5347                         if (ret < 0)
5348                                 break;
5349                 }
5350                 ret = io_splice(req, force_nonblock);
5351                 break;
5352         case IORING_OP_PROVIDE_BUFFERS:
5353                 if (sqe) {
5354                         ret = io_provide_buffers_prep(req, sqe);
5355                         if (ret)
5356                                 break;
5357                 }
5358                 ret = io_provide_buffers(req, force_nonblock);
5359                 break;
5360         case IORING_OP_REMOVE_BUFFERS:
5361                 if (sqe) {
5362                         ret = io_remove_buffers_prep(req, sqe);
5363                         if (ret)
5364                                 break;
5365                 }
5366                 ret = io_remove_buffers(req, force_nonblock);
5367                 break;
5368         case IORING_OP_TEE:
5369                 if (sqe) {
5370                         ret = io_tee_prep(req, sqe);
5371                         if (ret < 0)
5372                                 break;
5373                 }
5374                 ret = io_tee(req, force_nonblock);
5375                 break;
5376         default:
5377                 ret = -EINVAL;
5378                 break;
5379         }
5380
             /* any non-zero result (including -EAGAIN) goes straight back to the caller */
5381         if (ret)
5382                 return ret;
5383
5384         /* If the op doesn't have a file, we're not polling for it */
5385         if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
5386                 const bool in_async = io_wq_current_is_worker();
5387
5388                 /* workqueue context doesn't hold uring_lock, grab it now */
5389                 if (in_async)
5390                         mutex_lock(&ctx->uring_lock);
5391
5392                 io_iopoll_req_issued(req);
5393
5394                 if (in_async)
5395                         mutex_unlock(&ctx->uring_lock);
5396         }
5397
5398         return 0;
5399 }
5400
5401 static void io_arm_async_linked_timeout(struct io_kiocb *req)
5402 {
5403         struct io_kiocb *link;
5404
5405         /* link head's timeout is queued in io_queue_async_work() */
5406         if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
5407                 return;
5408
5409         link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
5410         io_queue_linked_timeout(link);
5411 }
5412
5413 static void io_wq_submit_work(struct io_wq_work **workptr)
5414 {
5415         struct io_wq_work *work = *workptr;
5416         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5417         int ret = 0;
5418
5419         io_arm_async_linked_timeout(req);
5420
5421         /* if NO_CANCEL is set, we must still run the work */
5422         if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
5423                                 IO_WQ_WORK_CANCEL) {
5424                 ret = -ECANCELED;
5425         }
5426
5427         if (!ret) {
5428                 do {
5429                         ret = io_issue_sqe(req, NULL, false);
5430                         /*
5431                          * We can get EAGAIN for polled IO even though we're
5432                          * forcing a sync submission from here, since we can't
5433                          * wait for request slots on the block side.
5434                          */
5435                         if (ret != -EAGAIN)
5436                                 break;
5437                         cond_resched();
5438                 } while (1);
5439         }
5440
5441         if (ret) {
5442                 req_set_fail_links(req);
5443                 io_cqring_add_event(req, ret);
5444                 io_put_req(req);
5445         }
5446
5447         io_steal_work(req, workptr);
5448 }
5449
5450 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
5451                                               int index)
5452 {
5453         struct fixed_file_table *table;
5454
5455         table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
5456         return table->files[index & IORING_FILE_TABLE_MASK];
5457 }
5458
5459 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
5460                         int fd, struct file **out_file, bool fixed)
5461 {
5462         struct io_ring_ctx *ctx = req->ctx;
5463         struct file *file;
5464
5465         if (fixed) {
5466                 if (unlikely(!ctx->file_data ||
5467                     (unsigned) fd >= ctx->nr_user_files))
5468                         return -EBADF;
5469                 fd = array_index_nospec(fd, ctx->nr_user_files);
5470                 file = io_file_from_index(ctx, fd);
5471                 if (file) {
5472                         req->fixed_file_refs = ctx->file_data->cur_refs;
5473                         percpu_ref_get(req->fixed_file_refs);
5474                 }
5475         } else {
5476                 trace_io_uring_file_get(ctx, fd);
5477                 file = __io_file_get(state, fd);
5478         }
5479
5480         if (file || io_op_defs[req->opcode].needs_file_no_error) {
5481                 *out_file = file;
5482                 return 0;
5483         }
5484         return -EBADF;
5485 }
5486
5487 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
5488                            int fd)
5489 {
5490         bool fixed;
5491
5492         fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
5493         if (unlikely(!fixed && io_async_submit(req->ctx)))
5494                 return -EBADF;
5495
5496         return io_file_get(state, req, fd, &req->file, fixed);
5497 }
5498
5499 static int io_grab_files(struct io_kiocb *req)
5500 {
5501         int ret = -EBADF;
5502         struct io_ring_ctx *ctx = req->ctx;
5503
5504         if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
5505                 return 0;
5506         if (!ctx->ring_file)
5507                 return -EBADF;
5508
5509         rcu_read_lock();
5510         spin_lock_irq(&ctx->inflight_lock);
5511         /*
5512          * We use the f_ops->flush() handler to ensure that we can flush
5513          * out work accessing these files if the fd is closed. Check if
5514          * the fd has changed since we started down this path, and disallow
5515          * this operation if it has.
5516          */
5517         if (fcheck(ctx->ring_fd) == ctx->ring_file) {
5518                 list_add(&req->inflight_entry, &ctx->inflight_list);
5519                 req->flags |= REQ_F_INFLIGHT;
5520                 req->work.files = current->files;
5521                 ret = 0;
5522         }
5523         spin_unlock_irq(&ctx->inflight_lock);
5524         rcu_read_unlock();
5525
5526         return ret;
5527 }
5528
/*
 * hrtimer callback for a linked timeout request (installed as the timer
 * function by io_queue_linked_timeout()).
 *
 * Under ctx->completion_lock: if the timeout request is still on a link
 * list, look at the entry before it (presumably the request this timeout
 * guards). If a reference on that entry can be taken, the timeout is
 * unlinked and REQ_F_LINK_TIMEOUT is cleared on the entry; if not, the
 * entry is ignored.
 *
 * After dropping the lock: with such an entry in hand, its links are marked
 * failed and a cancel is attempted by user_data with -ETIME; otherwise the
 * timeout request itself is completed with -ETIME. Always returns
 * HRTIMER_NORESTART.
 */
5529 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
5530 {
5531         struct io_timeout_data *data = container_of(timer,
5532                                                 struct io_timeout_data, timer);
5533         struct io_kiocb *req = data->req;
5534         struct io_ring_ctx *ctx = req->ctx;
5535         struct io_kiocb *prev = NULL;
5536         unsigned long flags;
5537
5538         spin_lock_irqsave(&ctx->completion_lock, flags);
5539
5540         /*
5541          * We don't expect the list to be empty, that will only happen if we
5542          * race with the completion of the linked work.
5543          */
5544         if (!list_empty(&req->link_list)) {
5545                 prev = list_entry(req->link_list.prev, struct io_kiocb,
5546                                   link_list);
                     /* only act on prev if it still has a live reference */
5547                 if (refcount_inc_not_zero(&prev->refs)) {
5548                         list_del_init(&req->link_list);
5549                         prev->flags &= ~REQ_F_LINK_TIMEOUT;
5550                 } else
5551                         prev = NULL;
5552         }
5553
5554         spin_unlock_irqrestore(&ctx->completion_lock, flags);
5555
5556         if (prev) {
5557                 req_set_fail_links(prev);
5558                 io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
                     /* drop the reference taken under the lock above */
5559                 io_put_req(prev);
5560         } else {
5561                 io_cqring_add_event(req, -ETIME);
5562                 io_put_req(req);
5563         }
5564         return HRTIMER_NORESTART;
5565 }
5566
5567 static void io_queue_linked_timeout(struct io_kiocb *req)
5568 {
5569         struct io_ring_ctx *ctx = req->ctx;
5570
5571         /*
5572          * If the list is now empty, then our linked request finished before
5573          * we got a chance to setup the timer
5574          */
5575         spin_lock_irq(&ctx->completion_lock);
5576         if (!list_empty(&req->link_list)) {
5577                 struct io_timeout_data *data = &req->io->timeout;
5578
5579                 data->timer.function = io_link_timeout_fn;
5580                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
5581                                 data->mode);
5582         }
5583         spin_unlock_irq(&ctx->completion_lock);
5584
5585         /* drop submission reference */
5586         io_put_req(req);
5587 }
5588
5589 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
5590 {
5591         struct io_kiocb *nxt;
5592
5593         if (!(req->flags & REQ_F_LINK_HEAD))
5594                 return NULL;
5595         /* for polled retry, if flag is set, we already went through here */
5596         if (req->flags & REQ_F_POLLED)
5597                 return NULL;
5598
5599         nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
5600                                         link_list);
5601         if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
5602                 return NULL;
5603
5604         req->flags |= REQ_F_LINK_TIMEOUT;
5605         return nxt;
5606 }
5607
/*
 * Issue @req inline in non-blocking mode and deal with the outcome.
 *
 * - If the issue returns -EAGAIN and the request may be punted (it is not
 *   REQ_F_NOWAIT, or it is REQ_F_MUST_PUNT), first try to arm a poll
 *   handler; failing that, queue the request for async execution. Both
 *   paths leave the function through "exit".
 * - Otherwise the submission reference is dropped, a failure is posted as
 *   a completion, and if io_put_req_find_next() handed back a next request
 *   the function loops on it (directly to "punt" when that request has
 *   REQ_F_FORCE_ASYNC).
 *
 * If a request carries work.creds different from current_cred(), the creds
 * are overridden for the issue and reverted before returning.
 *
 * @sqe is passed to io_issue_sqe() unchanged on every iteration.
 */
5608 static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5609 {
5610         struct io_kiocb *linked_timeout;
5611         struct io_kiocb *nxt;
5612         const struct cred *old_creds = NULL;
5613         int ret;
5614
5615 again:
5616         linked_timeout = io_prep_linked_timeout(req);
5617
5618         if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds &&
5619             req->work.creds != current_cred()) {
5620                 if (old_creds)
5621                         revert_creds(old_creds);
5622                 if (old_creds == req->work.creds)
5623                         old_creds = NULL; /* restored original creds */
5624                 else
5625                         old_creds = override_creds(req->work.creds);
5626         }
5627
5628         ret = io_issue_sqe(req, sqe, true);
5629
5630         /*
5631          * We async punt it if the file wasn't marked NOWAIT, or if the file
5632          * doesn't support non-blocking read/write attempts
5633          */
5634         if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
5635             (req->flags & REQ_F_MUST_PUNT))) {
5636                 if (io_arm_poll_handler(req)) {
5637                         if (linked_timeout)
5638                                 io_queue_linked_timeout(linked_timeout);
5639                         goto exit;
5640                 }
5641 punt:
5642                 io_req_init_async(req);
5643
5644                 if (io_op_defs[req->opcode].file_table) {
5645                         ret = io_grab_files(req);
5646                         if (ret)
5647                                 goto err;
5648                 }
5649
5650                 /*
5651                  * Queued up for async execution, worker will release
5652                  * submit reference when the iocb is actually submitted.
5653                  */
5654                 io_queue_async_work(req);
5655                 goto exit;
5656         }
5657
5658 err:
5659         nxt = NULL;
5660         /* drop submission reference */
5661         io_put_req_find_next(req, &nxt);
5662
             /* arm the linked timeout only if the issue succeeded, else drop it */
5663         if (linked_timeout) {
5664                 if (!ret)
5665                         io_queue_linked_timeout(linked_timeout);
5666                 else
5667                         io_put_req(linked_timeout);
5668         }
5669
5670         /* and drop final reference, if we failed */
5671         if (ret) {
5672                 io_cqring_add_event(req, ret);
5673                 req_set_fail_links(req);
5674                 io_put_req(req);
5675         }
             /* continue with the request io_put_req_find_next() handed back, if any */
5676         if (nxt) {
5677                 req = nxt;
5678
5679                 if (req->flags & REQ_F_FORCE_ASYNC)
5680                         goto punt;
5681                 goto again;
5682         }
5683 exit:
5684         if (old_creds)
5685                 revert_creds(old_creds);
5686 }
5687
/*
 * Queue @req for execution.
 *
 * The request first goes through io_req_defer(). -EIOCBQUEUED from there
 * means nothing more is done here; any other non-zero value fails the
 * request with that error. If it was not deferred, a REQ_F_FORCE_ASYNC
 * request gets its async context allocated and prepared (when req->io is
 * not set yet) and is handed to the async workers; every other request is
 * issued through __io_queue_sqe().
 *
 * Failures are reported by posting a completion with the error code and
 * dropping both references on the request.
 */
5688 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5689 {
5690         int ret;
5691
5692         ret = io_req_defer(req, sqe);
5693         if (ret) {
5694                 if (ret != -EIOCBQUEUED) {
5695 fail_req:
5696                         io_cqring_add_event(req, ret);
5697                         req_set_fail_links(req);
5698                         io_double_put_req(req);
5699                 }
5700         } else if (req->flags & REQ_F_FORCE_ASYNC) {
5701                 if (!req->io) {
                             /* -EAGAIN is what an allocation failure reports */
5702                         ret = -EAGAIN;
5703                         if (io_alloc_async_ctx(req))
5704                                 goto fail_req;
5705                         ret = io_req_defer_prep(req, sqe);
5706                         if (unlikely(ret < 0))
5707                                 goto fail_req;
5708                 }
5709
5710                 /*
5711                  * Never try inline submit if IOSQE_ASYNC is set, go straight
5712                  * to async execution.
5713                  */
5714                 req->work.flags |= IO_WQ_WORK_CONCURRENT;
5715                 io_queue_async_work(req);
5716         } else {
5717                 __io_queue_sqe(req, sqe);
5718         }
5719 }
5720
5721 static inline void io_queue_link_head(struct io_kiocb *req)
5722 {
5723         if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
5724                 io_cqring_add_event(req, -ECANCELED);
5725                 io_double_put_req(req);
5726         } else
5727                 io_queue_sqe(req, NULL);
5728 }
5729
/*
 * Submit one initialised request, taking link chains into account.
 *
 * @link points at the caller's "current link head" slot. If a head is
 * pending there, @req is prepared and appended to it, and the chain is
 * queued once a request without REQ_F_LINK/REQ_F_HARDLINK closes it. If no
 * head is pending, a request with one of those flags becomes the new head
 * (stored in *@link), and any other request is queued right away.
 *
 * Returns 0 on success. Returns -EAGAIN if the async context cannot be
 * allocated, or the io_req_defer_prep() error when appending to an existing
 * head (the head is then flagged REQ_F_FAIL_LINK). A prep failure on a new
 * head only flags that head REQ_F_FAIL_LINK and still returns 0.
 */
5730 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5731                          struct io_kiocb **link)
5732 {
5733         struct io_ring_ctx *ctx = req->ctx;
5734         int ret;
5735
5736         /*
5737          * If we already have a head request, queue this one for async
5738          * submittal once the head completes. If we don't have a head but
5739          * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
5740          * submitted sync once the chain is complete. If none of those
5741          * conditions are true (normal request), then just queue it.
5742          */
5743         if (*link) {
5744                 struct io_kiocb *head = *link;
5745
5746                 /*
5747                  * Taking sequential execution of a link, draining both sides
5748                  * of the link also fulfils IOSQE_IO_DRAIN semantics for all
5749                  * requests in the link. So, it drains the head and the
5750                  * next after the link request. The last one is done via
5751                  * drain_next flag to persist the effect across calls.
5752                  */
5753                 if (req->flags & REQ_F_IO_DRAIN) {
5754                         head->flags |= REQ_F_IO_DRAIN;
5755                         ctx->drain_next = 1;
5756                 }
5757                 if (io_alloc_async_ctx(req))
5758                         return -EAGAIN;
5759
5760                 ret = io_req_defer_prep(req, sqe);
5761                 if (ret) {
5762                         /* fail even hard links since we don't submit */
5763                         head->flags |= REQ_F_FAIL_LINK;
5764                         return ret;
5765                 }
5766                 trace_io_uring_link(ctx, req, head);
5767                 list_add_tail(&req->link_list, &head->link_list);
5768
5769                 /* last request of a link, enqueue the link */
5770                 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
5771                         io_queue_link_head(head);
5772                         *link = NULL;
5773                 }
5774         } else {
                     /* a drain requested inside the previous link carries over to this request */
5775                 if (unlikely(ctx->drain_next)) {
5776                         req->flags |= REQ_F_IO_DRAIN;
5777                         ctx->drain_next = 0;
5778                 }
5779                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
5780                         req->flags |= REQ_F_LINK_HEAD;
5781                         INIT_LIST_HEAD(&req->link_list);
5782
5783                         if (io_alloc_async_ctx(req))
5784                                 return -EAGAIN;
5785
5786                         ret = io_req_defer_prep(req, sqe);
5787                         if (ret)
5788                                 req->flags |= REQ_F_FAIL_LINK;
5789                         *link = req;
5790                 } else {
5791                         io_queue_sqe(req, sqe);
5792                 }
5793         }
5794
5795         return 0;
5796 }
5797
5798 /*
5799  * Batched submission is done, ensure local IO is flushed out.
5800  */
5801 static void io_submit_state_end(struct io_submit_state *state)
5802 {
5803         blk_finish_plug(&state->plug);
5804         io_state_file_put(state);
5805         if (state->free_reqs)
5806                 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
5807 }
5808
5809 /*
5810  * Start submission side cache.
5811  */
5812 static void io_submit_state_start(struct io_submit_state *state,
5813                                   unsigned int max_ios)
5814 {
5815         blk_start_plug(&state->plug);
5816         state->free_reqs = 0;
5817         state->file = NULL;
5818         state->ios_left = max_ios;
5819 }
5820
5821 static void io_commit_sqring(struct io_ring_ctx *ctx)
5822 {
5823         struct io_rings *rings = ctx->rings;
5824
5825         /*
5826          * Ensure any loads from the SQEs are done at this point,
5827          * since once we write the new head, the application could
5828          * write new data to them.
5829          */
5830         smp_store_release(&rings->sq.head, ctx->cached_sq_head);
5831 }
5832
5833 /*
5834  * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
5835  * that is mapped by userspace. This means that care needs to be taken to
5836  * ensure that reads are stable, as we cannot rely on userspace always
5837  * being a good citizen. If members of the sqe are validated and then later
5838  * used, it's important that those reads are done through READ_ONCE() to
5839  * prevent a re-load down the line.
5840  */
5841 static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
5842 {
5843         u32 *sq_array = ctx->sq_array;
5844         unsigned head;
5845
5846         /*
5847          * The cached sq head (or cq tail) serves two purposes:
5848          *
5849          * 1) allows us to batch the cost of updating the user visible
5850          *    head updates.
5851          * 2) allows the kernel side to track the head on its own, even
5852          *    though the application is the one updating it.
5853          */
5854         head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
5855         if (likely(head < ctx->sq_entries))
5856                 return &ctx->sq_sqes[head];
5857
5858         /* drop invalid entries */
5859         ctx->cached_sq_dropped++;
5860         WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
5861         return NULL;
5862 }
5863
5864 static inline void io_consume_sqe(struct io_ring_ctx *ctx)
5865 {
5866         ctx->cached_sq_head++;
5867 }
5868
/*
 * Flag bits accepted in sqe->flags. io_init_req() fails a request that
 * sets any bit outside this mask with -EINVAL.
 */
5869 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
5870                                 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
5871                                 IOSQE_BUFFER_SELECT)
5872
/*
 * Initialise @req from the submission queue entry @sqe.
 *
 * Fields read from @sqe (opcode, user_data, flags, personality, fd) are
 * loaded with READ_ONCE(). The request starts with two references.
 *
 * Returns 0 on success, or:
 *   -EINVAL      opcode >= IORING_OP_LAST, a flag outside SQE_VALID_FLAGS,
 *                or a personality id not present in ctx->personality_idr
 *   -EFAULT      io_sq_thread_acquire_mm() failed
 *   -EOPNOTSUPP  IOSQE_BUFFER_SELECT on an opcode without buffer_select
 *   otherwise    the result of io_req_set_file() for opcodes that need a
 *                file
 */
5873 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
5874                        const struct io_uring_sqe *sqe,
5875                        struct io_submit_state *state)
5876 {
5877         unsigned int sqe_flags;
5878         int id;
5879
5880         /*
5881          * All io need record the previous position, if LINK vs DRAIN,
5882          * it can be used to mark the position of the first IO in the
5883          * link list.
5884          */
5885         req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
5886         req->opcode = READ_ONCE(sqe->opcode);
5887         req->user_data = READ_ONCE(sqe->user_data);
5888         req->io = NULL;
5889         req->file = NULL;
5890         req->ctx = ctx;
5891         req->flags = 0;
5892         /* one is dropped after submission, the other at completion */
5893         refcount_set(&req->refs, 2);
5894         req->task = current;
5895         req->result = 0;
5896
5897         if (unlikely(req->opcode >= IORING_OP_LAST))
5898                 return -EINVAL;
5899
5900         if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
5901                 return -EFAULT;
5902
5903         sqe_flags = READ_ONCE(sqe->flags);
5904         /* enforce forwards compatibility on users */
5905         if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
5906                 return -EINVAL;
5907
5908         if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
5909             !io_op_defs[req->opcode].buffer_select)
5910                 return -EOPNOTSUPP;
5911
             /* a non-zero personality selects registered creds for this request */
5912         id = READ_ONCE(sqe->personality);
5913         if (id) {
5914                 io_req_init_async(req);
5915                 req->work.creds = idr_find(&ctx->personality_idr, id);
5916                 if (unlikely(!req->work.creds))
5917                         return -EINVAL;
5918                 get_cred(req->work.creds);
5919         }
5920
5921         /* same numerical values with corresponding REQ_F_*, safe to copy */
5922         req->flags |= sqe_flags;
5923
5924         if (!io_op_defs[req->opcode].needs_file)
5925                 return 0;
5926
5927         return io_req_set_file(state, req, READ_ONCE(sqe->fd));
5928 }
5929
5930 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
5931                           struct file *ring_file, int ring_fd)
5932 {
5933         struct io_submit_state state, *statep = NULL;
5934         struct io_kiocb *link = NULL;
5935         int i, submitted = 0;
5936
5937         /* if we have a backlog and couldn't flush it all, return BUSY */
5938         if (test_bit(0, &ctx->sq_check_overflow)) {
5939                 if (!list_empty(&ctx->cq_overflow_list) &&
5940                     !io_cqring_overflow_flush(ctx, false))
5941                         return -EBUSY;
5942         }
5943
5944         /* make sure SQ entry isn't read before tail */
5945         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
5946
5947         if (!percpu_ref_tryget_many(&ctx->refs, nr))
5948                 return -EAGAIN;
5949
5950         if (nr > IO_PLUG_THRESHOLD) {
5951                 io_submit_state_start(&state, nr);
5952                 statep = &state;
5953         }
5954
5955         ctx->ring_fd = ring_fd;
5956         ctx->ring_file = ring_file;
5957
5958         for (i = 0; i < nr; i++) {
5959                 const struct io_uring_sqe *sqe;
5960                 struct io_kiocb *req;
5961                 int err;
5962
5963                 sqe = io_get_sqe(ctx);
5964                 if (unlikely(!sqe)) {
5965                         io_consume_sqe(ctx);
5966                         break;
5967                 }
5968                 req = io_alloc_req(ctx, statep);
5969                 if (unlikely(!req)) {
5970                         if (!submitted)
5971                                 submitted = -EAGAIN;
5972                         break;
5973                 }
5974
5975                 err = io_init_req(ctx, req, sqe, statep);
5976                 io_consume_sqe(ctx);
5977                 /* will complete beyond this point, count as submitted */
5978                 submitted++;
5979
5980                 if (unlikely(err)) {
5981 fail_req:
5982                         io_cqring_add_event(req, err);
5983                         io_double_put_req(req);
5984                         break;
5985                 }
5986
5987                 trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
5988                                                 true, io_async_submit(ctx));
5989                 err = io_submit_sqe(req, sqe, &link);
5990                 if (err)
5991                         goto fail_req;
5992         }
5993
5994         if (unlikely(submitted != nr)) {
5995                 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
5996
5997                 percpu_ref_put_many(&ctx->refs, nr - ref_used);
5998         }
5999         if (link)
6000                 io_queue_link_head(link);
6001         if (statep)
6002                 io_submit_state_end(&state);
6003
6004          /* Commit SQ ring head once we've consumed and submitted all SQEs */
6005         io_commit_sqring(ctx);
6006
6007         return submitted;
6008 }
6009
6010 static int io_sq_thread(void *data)
6011 {
6012         struct io_ring_ctx *ctx = data;
6013         const struct cred *old_cred;
6014         DEFINE_WAIT(wait);
6015         unsigned long timeout;
6016         int ret = 0;
6017
6018         complete(&ctx->sq_thread_comp);
6019
6020         old_cred = override_creds(ctx->creds);
6021
6022         timeout = jiffies + ctx->sq_thread_idle;
6023         while (!kthread_should_park()) {
6024                 unsigned int to_submit;
6025
6026                 if (!list_empty(&ctx->poll_list)) {
6027                         unsigned nr_events = 0;
6028
6029                         mutex_lock(&ctx->uring_lock);
6030                         if (!list_empty(&ctx->poll_list))
6031                                 io_iopoll_getevents(ctx, &nr_events, 0);
6032                         else
6033                                 timeout = jiffies + ctx->sq_thread_idle;
6034                         mutex_unlock(&ctx->uring_lock);
6035                 }
6036
6037                 to_submit = io_sqring_entries(ctx);
6038
6039                 /*
6040                  * If submit got -EBUSY, flag us as needing the application
6041                  * to enter the kernel to reap and flush events.
6042                  */
6043                 if (!to_submit || ret == -EBUSY || need_resched()) {
6044                         /*
6045                          * Drop cur_mm before scheduling, we can't hold it for
6046                          * long periods (or over schedule()). Do this before
6047                          * adding ourselves to the waitqueue, as the unuse/drop
6048                          * may sleep.
6049                          */
6050                         io_sq_thread_drop_mm(ctx);
6051
6052                         /*
6053                          * We're polling. If we're within the defined idle
6054                          * period, then let us spin without work before going
6055                          * to sleep. The exception is if we got EBUSY doing
6056                          * more IO, we should wait for the application to
6057                          * reap events and wake us up.
6058                          */
6059                         if (!list_empty(&ctx->poll_list) || need_resched() ||
6060                             (!time_after(jiffies, timeout) && ret != -EBUSY &&
6061                             !percpu_ref_is_dying(&ctx->refs))) {
6062                                 if (current->task_works)
6063                                         task_work_run();
6064                                 cond_resched();
6065                                 continue;
6066                         }
6067
6068                         prepare_to_wait(&ctx->sqo_wait, &wait,
6069                                                 TASK_INTERRUPTIBLE);
6070
6071                         /*
6072                          * While doing polled IO, before going to sleep, we need
6073                          * to check if there are new reqs added to poll_list, it
6074                          * is because reqs may have been punted to io worker and
6075                          * will be added to poll_list later, hence check the
6076                          * poll_list again.
6077                          */
6078                         if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6079                             !list_empty_careful(&ctx->poll_list)) {
6080                                 finish_wait(&ctx->sqo_wait, &wait);
6081                                 continue;
6082                         }
6083
6084                         /* Tell userspace we may need a wakeup call */
6085                         spin_lock_irq(&ctx->completion_lock);
6086                         ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6087                         spin_unlock_irq(&ctx->completion_lock);
6088
6089                         to_submit = io_sqring_entries(ctx);
6090                         if (!to_submit || ret == -EBUSY) {
6091                                 if (kthread_should_park()) {
6092                                         finish_wait(&ctx->sqo_wait, &wait);
6093                                         break;
6094                                 }
6095                                 if (current->task_works) {
6096                                         task_work_run();
6097                                         finish_wait(&ctx->sqo_wait, &wait);
6098                                         continue;
6099                                 }
6100                                 if (signal_pending(current))
6101                                         flush_signals(current);
6102                                 schedule();
6103                                 finish_wait(&ctx->sqo_wait, &wait);
6104
6105                                 spin_lock_irq(&ctx->completion_lock);
6106                                 ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6107                                 spin_unlock_irq(&ctx->completion_lock);
6108                                 ret = 0;
6109                                 continue;
6110                         }
6111                         finish_wait(&ctx->sqo_wait, &wait);
6112
6113                         spin_lock_irq(&ctx->completion_lock);
6114                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6115                         spin_unlock_irq(&ctx->completion_lock);
6116                 }
6117
6118                 mutex_lock(&ctx->uring_lock);
6119                 if (likely(!percpu_ref_is_dying(&ctx->refs)))
6120                         ret = io_submit_sqes(ctx, to_submit, NULL, -1);
6121                 mutex_unlock(&ctx->uring_lock);
6122                 timeout = jiffies + ctx->sq_thread_idle;
6123         }
6124
6125         if (current->task_works)
6126                 task_work_run();
6127
6128         io_sq_thread_drop_mm(ctx);
6129         revert_creds(old_cred);
6130
6131         kthread_parkme();
6132
6133         return 0;
6134 }
6135
6136 struct io_wait_queue {
6137         struct wait_queue_entry wq;
6138         struct io_ring_ctx *ctx;
6139         unsigned to_wait;
6140         unsigned nr_timeouts;
6141 };
6142
6143 static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
6144 {
6145         struct io_ring_ctx *ctx = iowq->ctx;
6146
6147         /*
6148          * Wake up if we have enough events, or if a timeout occurred since we
6149          * started waiting. For timeouts, we always want to return to userspace,
6150          * regardless of event count.
6151          */
6152         return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
6153                         atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6154 }
6155
6156 static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6157                             int wake_flags, void *key)
6158 {
6159         struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6160                                                         wq);
6161
6162         /* use noflush == true, as we can't safely rely on locking context */
6163         if (!io_should_wake(iowq, true))
6164                 return -1;
6165
6166         return autoremove_wake_function(curr, mode, wake_flags, key);
6167 }
6168
6169 /*
6170  * Wait until events become available, if we don't already have some. The
6171  * application must reap them itself, as they reside on the shared cq ring.
6172  */
6173 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6174                           const sigset_t __user *sig, size_t sigsz)
6175 {
6176         struct io_wait_queue iowq = {
6177                 .wq = {
6178                         .private        = current,
6179                         .func           = io_wake_function,
6180                         .entry          = LIST_HEAD_INIT(iowq.wq.entry),
6181                 },
6182                 .ctx            = ctx,
6183                 .to_wait        = min_events,
6184         };
6185         struct io_rings *rings = ctx->rings;
6186         int ret = 0;
6187
6188         do {
6189                 if (io_cqring_events(ctx, false) >= min_events)
6190                         return 0;
6191                 if (!current->task_works)
6192                         break;
6193                 task_work_run();
6194         } while (1);
6195
6196         if (sig) {
6197 #ifdef CONFIG_COMPAT
6198                 if (in_compat_syscall())
6199                         ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
6200                                                       sigsz);
6201                 else
6202 #endif
6203                         ret = set_user_sigmask(sig, sigsz);
6204
6205                 if (ret)
6206                         return ret;
6207         }
6208
6209         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
6210         trace_io_uring_cqring_wait(ctx, min_events);
6211         do {
6212                 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6213                                                 TASK_INTERRUPTIBLE);
6214                 /* make sure we run task_work before checking for signals */
6215                 if (current->task_works)
6216                         task_work_run();
6217                 if (signal_pending(current)) {
6218                         if (current->jobctl & JOBCTL_TASK_WORK) {
6219                                 spin_lock_irq(&current->sighand->siglock);
6220                                 current->jobctl &= ~JOBCTL_TASK_WORK;
6221                                 recalc_sigpending();
6222                                 spin_unlock_irq(&current->sighand->siglock);
6223                                 continue;
6224                         }
6225                         ret = -EINTR;
6226                         break;
6227                 }
6228                 if (io_should_wake(&iowq, false))
6229                         break;
6230                 schedule();
6231         } while (1);
6232         finish_wait(&ctx->wait, &iowq.wq);
6233
6234         restore_saved_sigmask_unless(ret == -EINTR);
6235
6236         return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
6237 }
6238
6239 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6240 {
6241 #if defined(CONFIG_UNIX)
6242         if (ctx->ring_sock) {
6243                 struct sock *sock = ctx->ring_sock->sk;
6244                 struct sk_buff *skb;
6245
6246                 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6247                         kfree_skb(skb);
6248         }
6249 #else
6250         int i;
6251
6252         for (i = 0; i < ctx->nr_user_files; i++) {
6253                 struct file *file;
6254
6255                 file = io_file_from_index(ctx, i);
6256                 if (file)
6257                         fput(file);
6258         }
6259 #endif
6260 }
6261
6262 static void io_file_ref_kill(struct percpu_ref *ref)
6263 {
6264         struct fixed_file_data *data;
6265
6266         data = container_of(ref, struct fixed_file_data, refs);
6267         complete(&data->done);
6268 }
6269
6270 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
6271 {
6272         struct fixed_file_data *data = ctx->file_data;
6273         struct fixed_file_ref_node *ref_node = NULL;
6274         unsigned nr_tables, i;
6275
6276         if (!data)
6277                 return -ENXIO;
6278
6279         spin_lock(&data->lock);
6280         if (!list_empty(&data->ref_list))
6281                 ref_node = list_first_entry(&data->ref_list,
6282                                 struct fixed_file_ref_node, node);
6283         spin_unlock(&data->lock);
6284         if (ref_node)
6285                 percpu_ref_kill(&ref_node->refs);
6286
6287         percpu_ref_kill(&data->refs);
6288
6289         /* wait for all refs nodes to complete */
6290         flush_delayed_work(&ctx->file_put_work);
6291         wait_for_completion(&data->done);
6292
6293         __io_sqe_files_unregister(ctx);
6294         nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
6295         for (i = 0; i < nr_tables; i++)
6296                 kfree(data->table[i].files);
6297         kfree(data->table);
6298         percpu_ref_exit(&data->refs);
6299         kfree(data);
6300         ctx->file_data = NULL;
6301         ctx->nr_user_files = 0;
6302         return 0;
6303 }
6304
6305 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
6306 {
6307         if (ctx->sqo_thread) {
6308                 wait_for_completion(&ctx->sq_thread_comp);
6309                 /*
6310                  * The park is a bit of a work-around, without it we get
6311                  * warning spews on shutdown with SQPOLL set and affinity
6312                  * set to a single CPU.
6313                  */
6314                 kthread_park(ctx->sqo_thread);
6315                 kthread_stop(ctx->sqo_thread);
6316                 ctx->sqo_thread = NULL;
6317         }
6318 }
6319
6320 static void io_finish_async(struct io_ring_ctx *ctx)
6321 {
6322         io_sq_thread_stop(ctx);
6323
6324         if (ctx->io_wq) {
6325                 io_wq_destroy(ctx->io_wq);
6326                 ctx->io_wq = NULL;
6327         }
6328 }
6329
6330 #if defined(CONFIG_UNIX)
6331 /*
6332  * Ensure the UNIX gc is aware of our file set, so we are certain that
6333  * the io_uring can be safely unregistered on process exit, even if we have
6334  * loops in the file referencing.
6335  */
6336 static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
6337 {
6338         struct sock *sk = ctx->ring_sock->sk;
6339         struct scm_fp_list *fpl;
6340         struct sk_buff *skb;
6341         int i, nr_files;
6342
6343         fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
6344         if (!fpl)
6345                 return -ENOMEM;
6346
6347         skb = alloc_skb(0, GFP_KERNEL);
6348         if (!skb) {
6349                 kfree(fpl);
6350                 return -ENOMEM;
6351         }
6352
6353         skb->sk = sk;
6354
6355         nr_files = 0;
6356         fpl->user = get_uid(ctx->user);
6357         for (i = 0; i < nr; i++) {
6358                 struct file *file = io_file_from_index(ctx, i + offset);
6359
6360                 if (!file)
6361                         continue;
6362                 fpl->fp[nr_files] = get_file(file);
6363                 unix_inflight(fpl->user, fpl->fp[nr_files]);
6364                 nr_files++;
6365         }
6366
6367         if (nr_files) {
6368                 fpl->max = SCM_MAX_FD;
6369                 fpl->count = nr_files;
6370                 UNIXCB(skb).fp = fpl;
6371                 skb->destructor = unix_destruct_scm;
6372                 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
6373                 skb_queue_head(&sk->sk_receive_queue, skb);
6374
6375                 for (i = 0; i < nr_files; i++)
6376                         fput(fpl->fp[i]);
6377         } else {
6378                 kfree_skb(skb);
6379                 kfree(fpl);
6380         }
6381
6382         return 0;
6383 }
6384
6385 /*
6386  * If UNIX sockets are enabled, fd passing can cause a reference cycle which
6387  * causes regular reference counting to break down. We rely on the UNIX
6388  * garbage collection to take care of this problem for us.
6389  */
6390 static int io_sqe_files_scm(struct io_ring_ctx *ctx)
6391 {
6392         unsigned left, total;
6393         int ret = 0;
6394
6395         total = 0;
6396         left = ctx->nr_user_files;
6397         while (left) {
6398                 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
6399
6400                 ret = __io_sqe_files_scm(ctx, this_files, total);
6401                 if (ret)
6402                         break;
6403                 left -= this_files;
6404                 total += this_files;
6405         }
6406
6407         if (!ret)
6408                 return 0;
6409
6410         while (total < ctx->nr_user_files) {
6411                 struct file *file = io_file_from_index(ctx, total);
6412
6413                 if (file)
6414                         fput(file);
6415                 total++;
6416         }
6417
6418         return ret;
6419 }
6420 #else
/* Variant built when CONFIG_UNIX is not defined: does nothing, returns 0. */
static int io_sqe_files_scm(struct io_ring_ctx *ctx)
{
	return 0;
}
6425 #endif
6426
6427 static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
6428                                     unsigned nr_files)
6429 {
6430         int i;
6431
6432         for (i = 0; i < nr_tables; i++) {
6433                 struct fixed_file_table *table = &ctx->file_data->table[i];
6434                 unsigned this_files;
6435
6436                 this_files = min(nr_files, IORING_MAX_FILES_TABLE);
6437                 table->files = kcalloc(this_files, sizeof(struct file *),
6438                                         GFP_KERNEL);
6439                 if (!table->files)
6440                         break;
6441                 nr_files -= this_files;
6442         }
6443
6444         if (i == nr_tables)
6445                 return 0;
6446
6447         for (i = 0; i < nr_tables; i++) {
6448                 struct fixed_file_table *table = &ctx->file_data->table[i];
6449                 kfree(table->files);
6450         }
6451         return 1;
6452 }
6453
/*
 * Release the ring's reference on @file.
 *
 * With CONFIG_UNIX, the skbs on the ring socket's receive queue are scanned
 * for the one whose fp list holds @file.  The file is removed from that list,
 * marked not in flight, and fput().  Scanned skbs are moved to a private list
 * and appended back to the receive queue afterwards.  Without CONFIG_UNIX the
 * file is simply fput().
 */
static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *rxq = &sock->sk_receive_queue;
	struct sk_buff_head scanned;
	struct sk_buff *skb;
	int slot;

	__skb_queue_head_init(&scanned);

	/*
	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
	 * remove this entry and rearrange the file array.
	 */
	for (skb = skb_dequeue(rxq); skb; skb = skb_dequeue(rxq)) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		for (slot = 0; slot < fpl->count; slot++) {
			int trailing;

			if (fpl->fp[slot] != file)
				continue;

			unix_notinflight(fpl->user, fpl->fp[slot]);

			/* close the gap left by the removed entry */
			trailing = fpl->count - 1 - slot;
			if (trailing)
				memmove(&fpl->fp[slot], &fpl->fp[slot + 1],
					trailing * sizeof(struct file *));
			fpl->count--;

			/* an skb with no files left is of no further use */
			if (fpl->count)
				__skb_queue_tail(&scanned, skb);
			else
				kfree_skb(skb);

			fput(file);
			file = NULL;
			break;
		}

		/* file was found and released, stop scanning */
		if (!file)
			break;

		__skb_queue_tail(&scanned, skb);
	}

	/* put everything we pulled off back onto the receive queue */
	if (skb_peek(&scanned)) {
		spin_lock_irq(&rxq->lock);
		while ((skb = __skb_dequeue(&scanned)) != NULL)
			__skb_queue_tail(rxq, skb);
		spin_unlock_irq(&rxq->lock);
	}
#else
	fput(file);
#endif
}
6515
6516 struct io_file_put {
6517         struct list_head list;
6518         struct file *file;
6519 };
6520
6521 static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
6522 {
6523         struct fixed_file_data *file_data = ref_node->file_data;
6524         struct io_ring_ctx *ctx = file_data->ctx;
6525         struct io_file_put *pfile, *tmp;
6526
6527         list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
6528                 list_del(&pfile->list);
6529                 io_ring_file_put(ctx, pfile->file);
6530                 kfree(pfile);
6531         }
6532
6533         spin_lock(&file_data->lock);
6534         list_del(&ref_node->node);
6535         spin_unlock(&file_data->lock);
6536
6537         percpu_ref_exit(&ref_node->refs);
6538         kfree(ref_node);
6539         percpu_ref_put(&file_data->refs);
6540 }
6541
6542 static void io_file_put_work(struct work_struct *work)
6543 {
6544         struct io_ring_ctx *ctx;
6545         struct llist_node *node;
6546
6547         ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
6548         node = llist_del_all(&ctx->file_put_llist);
6549
6550         while (node) {
6551                 struct fixed_file_ref_node *ref_node;
6552                 struct llist_node *next = node->next;
6553
6554                 ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
6555                 __io_file_put_work(ref_node);
6556                 node = next;
6557         }
6558 }
6559
6560 static void io_file_data_ref_zero(struct percpu_ref *ref)
6561 {
6562         struct fixed_file_ref_node *ref_node;
6563         struct io_ring_ctx *ctx;
6564         bool first_add;
6565         int delay = HZ;
6566
6567         ref_node = container_of(ref, struct fixed_file_ref_node, refs);
6568         ctx = ref_node->file_data->ctx;
6569
6570         if (percpu_ref_is_dying(&ctx->file_data->refs))
6571                 delay = 0;
6572
6573         first_add = llist_add(&ref_node->llist, &ctx->file_put_llist);
6574         if (!delay)
6575                 mod_delayed_work(system_wq, &ctx->file_put_work, 0);
6576         else if (first_add)
6577                 queue_delayed_work(system_wq, &ctx->file_put_work, delay);
6578 }
6579
6580 static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
6581                         struct io_ring_ctx *ctx)
6582 {
6583         struct fixed_file_ref_node *ref_node;
6584
6585         ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
6586         if (!ref_node)
6587                 return ERR_PTR(-ENOMEM);
6588
6589         if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
6590                             0, GFP_KERNEL)) {
6591                 kfree(ref_node);
6592                 return ERR_PTR(-ENOMEM);
6593         }
6594         INIT_LIST_HEAD(&ref_node->node);
6595         INIT_LIST_HEAD(&ref_node->file_list);
6596         ref_node->file_data = ctx->file_data;
6597         return ref_node;
6598 }
6599
6600 static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
6601 {
6602         percpu_ref_exit(&ref_node->refs);
6603         kfree(ref_node);
6604 }
6605
6606 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
6607                                  unsigned nr_args)
6608 {
6609         __s32 __user *fds = (__s32 __user *) arg;
6610         unsigned nr_tables;
6611         struct file *file;
6612         int fd, ret = 0;
6613         unsigned i;
6614         struct fixed_file_ref_node *ref_node;
6615
6616         if (ctx->file_data)
6617                 return -EBUSY;
6618         if (!nr_args)
6619                 return -EINVAL;
6620         if (nr_args > IORING_MAX_FIXED_FILES)
6621                 return -EMFILE;
6622
6623         ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
6624         if (!ctx->file_data)
6625                 return -ENOMEM;
6626         ctx->file_data->ctx = ctx;
6627         init_completion(&ctx->file_data->done);
6628         INIT_LIST_HEAD(&ctx->file_data->ref_list);
6629         spin_lock_init(&ctx->file_data->lock);
6630
6631         nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
6632         ctx->file_data->table = kcalloc(nr_tables,
6633                                         sizeof(struct fixed_file_table),
6634                                         GFP_KERNEL);
6635         if (!ctx->file_data->table) {
6636                 kfree(ctx->file_data);
6637                 ctx->file_data = NULL;
6638                 return -ENOMEM;
6639         }
6640
6641         if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
6642                                 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
6643                 kfree(ctx->file_data->table);
6644                 kfree(ctx->file_data);
6645                 ctx->file_data = NULL;
6646                 return -ENOMEM;
6647         }
6648
6649         if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
6650                 percpu_ref_exit(&ctx->file_data->refs);
6651                 kfree(ctx->file_data->table);
6652                 kfree(ctx->file_data);
6653                 ctx->file_data = NULL;
6654                 return -ENOMEM;
6655         }
6656
6657         for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
6658                 struct fixed_file_table *table;
6659                 unsigned index;
6660
6661                 ret = -EFAULT;
6662                 if (copy_from_user(&fd, &fds[i], sizeof(fd)))
6663                         break;
6664                 /* allow sparse sets */
6665                 if (fd == -1) {
6666                         ret = 0;
6667                         continue;
6668                 }
6669
6670                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6671                 index = i & IORING_FILE_TABLE_MASK;
6672                 file = fget(fd);
6673
6674                 ret = -EBADF;
6675                 if (!file)
6676                         break;
6677
6678                 /*
6679                  * Don't allow io_uring instances to be registered. If UNIX
6680                  * isn't enabled, then this causes a reference cycle and this
6681                  * instance can never get freed. If UNIX is enabled we'll
6682                  * handle it just fine, but there's still no point in allowing
6683                  * a ring fd as it doesn't support regular read/write anyway.
6684                  */
6685                 if (file->f_op == &io_uring_fops) {
6686                         fput(file);
6687                         break;
6688                 }
6689                 ret = 0;
6690                 table->files[index] = file;
6691         }
6692
6693         if (ret) {
6694                 for (i = 0; i < ctx->nr_user_files; i++) {
6695                         file = io_file_from_index(ctx, i);
6696                         if (file)
6697                                 fput(file);
6698                 }
6699                 for (i = 0; i < nr_tables; i++)
6700                         kfree(ctx->file_data->table[i].files);
6701
6702                 percpu_ref_exit(&ctx->file_data->refs);
6703                 kfree(ctx->file_data->table);
6704                 kfree(ctx->file_data);
6705                 ctx->file_data = NULL;
6706                 ctx->nr_user_files = 0;
6707                 return ret;
6708         }
6709
6710         ret = io_sqe_files_scm(ctx);
6711         if (ret) {
6712                 io_sqe_files_unregister(ctx);
6713                 return ret;
6714         }
6715
6716         ref_node = alloc_fixed_file_ref_node(ctx);
6717         if (IS_ERR(ref_node)) {
6718                 io_sqe_files_unregister(ctx);
6719                 return PTR_ERR(ref_node);
6720         }
6721
6722         ctx->file_data->cur_refs = &ref_node->refs;
6723         spin_lock(&ctx->file_data->lock);
6724         list_add(&ref_node->node, &ctx->file_data->ref_list);
6725         spin_unlock(&ctx->file_data->lock);
6726         percpu_ref_get(&ctx->file_data->refs);
6727         return ret;
6728 }
6729
/*
 * Account a single newly registered @file (table slot @index).
 *
 * With CONFIG_UNIX, the file is appended to the fp list of the skb at the
 * head of the ring socket's receive queue when that list has room; otherwise
 * a new skb is built through __io_sqe_files_scm().  Without CONFIG_UNIX this
 * is a no-op.
 *
 * Returns 0 on success or the error from __io_sqe_files_scm().
 */
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
				int index)
{
#if defined(CONFIG_UNIX)
	struct sock *sock = ctx->ring_sock->sk;
	struct sk_buff_head *rxq = &sock->sk_receive_queue;
	struct sk_buff *skb;
	bool merged = false;

	/*
	 * See if we can merge this file into an existing skb SCM_RIGHTS
	 * file set. If there's no room, fall back to allocating a new skb
	 * and filling it in.
	 */
	spin_lock_irq(&rxq->lock);
	skb = skb_peek(rxq);
	if (skb) {
		struct scm_fp_list *fpl = UNIXCB(skb).fp;

		if (fpl->count < SCM_MAX_FD) {
			/* take the skb off the queue while it is modified */
			__skb_unlink(skb, rxq);
			spin_unlock_irq(&rxq->lock);

			fpl->fp[fpl->count] = get_file(file);
			unix_inflight(fpl->user, fpl->fp[fpl->count]);
			fpl->count++;

			spin_lock_irq(&rxq->lock);
			__skb_queue_head(rxq, skb);
			merged = true;
		}
	}
	spin_unlock_irq(&rxq->lock);

	if (!merged)
		return __io_sqe_files_scm(ctx, 1, index);

	fput(file);
	return 0;
#else
	return 0;
#endif
}
6772
6773 static int io_queue_file_removal(struct fixed_file_data *data,
6774                                  struct file *file)
6775 {
6776         struct io_file_put *pfile;
6777         struct percpu_ref *refs = data->cur_refs;
6778         struct fixed_file_ref_node *ref_node;
6779
6780         pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
6781         if (!pfile)
6782                 return -ENOMEM;
6783
6784         ref_node = container_of(refs, struct fixed_file_ref_node, refs);
6785         pfile->file = file;
6786         list_add(&pfile->list, &ref_node->file_list);
6787
6788         return 0;
6789 }
6790
6791 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
6792                                  struct io_uring_files_update *up,
6793                                  unsigned nr_args)
6794 {
6795         struct fixed_file_data *data = ctx->file_data;
6796         struct fixed_file_ref_node *ref_node;
6797         struct file *file;
6798         __s32 __user *fds;
6799         int fd, i, err;
6800         __u32 done;
6801         bool needs_switch = false;
6802
6803         if (check_add_overflow(up->offset, nr_args, &done))
6804                 return -EOVERFLOW;
6805         if (done > ctx->nr_user_files)
6806                 return -EINVAL;
6807
6808         ref_node = alloc_fixed_file_ref_node(ctx);
6809         if (IS_ERR(ref_node))
6810                 return PTR_ERR(ref_node);
6811
6812         done = 0;
6813         fds = u64_to_user_ptr(up->fds);
6814         while (nr_args) {
6815                 struct fixed_file_table *table;
6816                 unsigned index;
6817
6818                 err = 0;
6819                 if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
6820                         err = -EFAULT;
6821                         break;
6822                 }
6823                 i = array_index_nospec(up->offset, ctx->nr_user_files);
6824                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
6825                 index = i & IORING_FILE_TABLE_MASK;
6826                 if (table->files[index]) {
6827                         file = io_file_from_index(ctx, index);
6828                         err = io_queue_file_removal(data, file);
6829                         if (err)
6830                                 break;
6831                         table->files[index] = NULL;
6832                         needs_switch = true;
6833                 }
6834                 if (fd != -1) {
6835                         file = fget(fd);
6836                         if (!file) {
6837                                 err = -EBADF;
6838                                 break;
6839                         }
6840                         /*
6841                          * Don't allow io_uring instances to be registered. If
6842                          * UNIX isn't enabled, then this causes a reference
6843                          * cycle and this instance can never get freed. If UNIX
6844                          * is enabled we'll handle it just fine, but there's
6845                          * still no point in allowing a ring fd as it doesn't
6846                          * support regular read/write anyway.
6847                          */
6848                         if (file->f_op == &io_uring_fops) {
6849                                 fput(file);
6850                                 err = -EBADF;
6851                                 break;
6852                         }
6853                         table->files[index] = file;
6854                         err = io_sqe_file_register(ctx, file, i);
6855                         if (err) {
6856                                 fput(file);
6857                                 break;
6858                         }
6859                 }
6860                 nr_args--;
6861                 done++;
6862                 up->offset++;
6863         }
6864
6865         if (needs_switch) {
6866                 percpu_ref_kill(data->cur_refs);
6867                 spin_lock(&data->lock);
6868                 list_add(&ref_node->node, &data->ref_list);
6869                 data->cur_refs = &ref_node->refs;
6870                 spin_unlock(&data->lock);
6871                 percpu_ref_get(&ctx->file_data->refs);
6872         } else
6873                 destroy_fixed_file_ref_node(ref_node);
6874
6875         return done ? done : err;
6876 }
6877
6878 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
6879                                unsigned nr_args)
6880 {
6881         struct io_uring_files_update up;
6882
6883         if (!ctx->file_data)
6884                 return -ENXIO;
6885         if (!nr_args)
6886                 return -EINVAL;
6887         if (copy_from_user(&up, arg, sizeof(up)))
6888                 return -EFAULT;
6889         if (up.resv)
6890                 return -EINVAL;
6891
6892         return __io_sqe_files_update(ctx, &up, nr_args);
6893 }
6894
6895 static void io_free_work(struct io_wq_work *work)
6896 {
6897         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6898
6899         /* Consider that io_steal_work() relies on this ref */
6900         io_put_req(req);
6901 }
6902
6903 static int io_init_wq_offload(struct io_ring_ctx *ctx,
6904                               struct io_uring_params *p)
6905 {
6906         struct io_wq_data data;
6907         struct fd f;
6908         struct io_ring_ctx *ctx_attach;
6909         unsigned int concurrency;
6910         int ret = 0;
6911
6912         data.user = ctx->user;
6913         data.free_work = io_free_work;
6914         data.do_work = io_wq_submit_work;
6915
6916         if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
6917                 /* Do QD, or 4 * CPUS, whatever is smallest */
6918                 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
6919
6920                 ctx->io_wq = io_wq_create(concurrency, &data);
6921                 if (IS_ERR(ctx->io_wq)) {
6922                         ret = PTR_ERR(ctx->io_wq);
6923                         ctx->io_wq = NULL;
6924                 }
6925                 return ret;
6926         }
6927
6928         f = fdget(p->wq_fd);
6929         if (!f.file)
6930                 return -EBADF;
6931
6932         if (f.file->f_op != &io_uring_fops) {
6933                 ret = -EINVAL;
6934                 goto out_fput;
6935         }
6936
6937         ctx_attach = f.file->private_data;
6938         /* @io_wq is protected by holding the fd */
6939         if (!io_wq_get(ctx_attach->io_wq, &data)) {
6940                 ret = -EINVAL;
6941                 goto out_fput;
6942         }
6943
6944         ctx->io_wq = ctx_attach->io_wq;
6945 out_fput:
6946         fdput(f);
6947         return ret;
6948 }
6949
6950 static int io_sq_offload_start(struct io_ring_ctx *ctx,
6951                                struct io_uring_params *p)
6952 {
6953         int ret;
6954
6955         mmgrab(current->mm);
6956         ctx->sqo_mm = current->mm;
6957
6958         if (ctx->flags & IORING_SETUP_SQPOLL) {
6959                 ret = -EPERM;
6960                 if (!capable(CAP_SYS_ADMIN))
6961                         goto err;
6962
6963                 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
6964                 if (!ctx->sq_thread_idle)
6965                         ctx->sq_thread_idle = HZ;
6966
6967                 if (p->flags & IORING_SETUP_SQ_AFF) {
6968                         int cpu = p->sq_thread_cpu;
6969
6970                         ret = -EINVAL;
6971                         if (cpu >= nr_cpu_ids)
6972                                 goto err;
6973                         if (!cpu_online(cpu))
6974                                 goto err;
6975
6976                         ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
6977                                                         ctx, cpu,
6978                                                         "io_uring-sq");
6979                 } else {
6980                         ctx->sqo_thread = kthread_create(io_sq_thread, ctx,
6981                                                         "io_uring-sq");
6982                 }
6983                 if (IS_ERR(ctx->sqo_thread)) {
6984                         ret = PTR_ERR(ctx->sqo_thread);
6985                         ctx->sqo_thread = NULL;
6986                         goto err;
6987                 }
6988                 wake_up_process(ctx->sqo_thread);
6989         } else if (p->flags & IORING_SETUP_SQ_AFF) {
6990                 /* Can't have SQ_AFF without SQPOLL */
6991                 ret = -EINVAL;
6992                 goto err;
6993         }
6994
6995         ret = io_init_wq_offload(ctx, p);
6996         if (ret)
6997                 goto err;
6998
6999         return 0;
7000 err:
7001         io_finish_async(ctx);
7002         mmdrop(ctx->sqo_mm);
7003         ctx->sqo_mm = NULL;
7004         return ret;
7005 }
7006
7007 static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
7008 {
7009         atomic_long_sub(nr_pages, &user->locked_vm);
7010 }
7011
7012 static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
7013 {
7014         unsigned long page_limit, cur_pages, new_pages;
7015
7016         /* Don't allow more pages than we can safely lock */
7017         page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7018
7019         do {
7020                 cur_pages = atomic_long_read(&user->locked_vm);
7021                 new_pages = cur_pages + nr_pages;
7022                 if (new_pages > page_limit)
7023                         return -ENOMEM;
7024         } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7025                                         new_pages) != cur_pages);
7026
7027         return 0;
7028 }
7029
/*
 * Drop a reference on the head page backing @ptr and free the compound
 * page when that was the last one. A NULL @ptr is ignored.
 */
static void io_mem_free(void *ptr)
{
	if (ptr) {
		struct page *head = virt_to_head_page(ptr);

		if (put_page_testzero(head))
			free_compound_page(head);
	}
}
7041
7042 static void *io_mem_alloc(size_t size)
7043 {
7044         gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
7045                                 __GFP_NORETRY;
7046
7047         return (void *) __get_free_pages(gfp_flags, get_order(size));
7048 }
7049
7050 static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
7051                                 size_t *sq_offset)
7052 {
7053         struct io_rings *rings;
7054         size_t off, sq_array_size;
7055
7056         off = struct_size(rings, cqes, cq_entries);
7057         if (off == SIZE_MAX)
7058                 return SIZE_MAX;
7059
7060 #ifdef CONFIG_SMP
7061         off = ALIGN(off, SMP_CACHE_BYTES);
7062         if (off == 0)
7063                 return SIZE_MAX;
7064 #endif
7065
7066         sq_array_size = array_size(sizeof(u32), sq_entries);
7067         if (sq_array_size == SIZE_MAX)
7068                 return SIZE_MAX;
7069
7070         if (check_add_overflow(off, sq_array_size, &off))
7071                 return SIZE_MAX;
7072
7073         if (sq_offset)
7074                 *sq_offset = off;
7075
7076         return off;
7077 }
7078
7079 static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
7080 {
7081         size_t pages;
7082
7083         pages = (size_t)1 << get_order(
7084                 rings_size(sq_entries, cq_entries, NULL));
7085         pages += (size_t)1 << get_order(
7086                 array_size(sizeof(struct io_uring_sqe), sq_entries));
7087
7088         return pages;
7089 }
7090
7091 static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
7092 {
7093         int i, j;
7094
7095         if (!ctx->user_bufs)
7096                 return -ENXIO;
7097
7098         for (i = 0; i < ctx->nr_user_bufs; i++) {
7099                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7100
7101                 for (j = 0; j < imu->nr_bvecs; j++)
7102                         unpin_user_page(imu->bvec[j].bv_page);
7103
7104                 if (ctx->account_mem)
7105                         io_unaccount_mem(ctx->user, imu->nr_bvecs);
7106                 kvfree(imu->bvec);
7107                 imu->nr_bvecs = 0;
7108         }
7109
7110         kfree(ctx->user_bufs);
7111         ctx->user_bufs = NULL;
7112         ctx->nr_user_bufs = 0;
7113         return 0;
7114 }
7115
7116 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
7117                        void __user *arg, unsigned index)
7118 {
7119         struct iovec __user *src;
7120
7121 #ifdef CONFIG_COMPAT
7122         if (ctx->compat) {
7123                 struct compat_iovec __user *ciovs;
7124                 struct compat_iovec ciov;
7125
7126                 ciovs = (struct compat_iovec __user *) arg;
7127                 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
7128                         return -EFAULT;
7129
7130                 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
7131                 dst->iov_len = ciov.iov_len;
7132                 return 0;
7133         }
7134 #endif
7135         src = (struct iovec __user *) arg;
7136         if (copy_from_user(dst, &src[index], sizeof(*dst)))
7137                 return -EFAULT;
7138         return 0;
7139 }
7140
7141 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
7142                                   unsigned nr_args)
7143 {
7144         struct vm_area_struct **vmas = NULL;
7145         struct page **pages = NULL;
7146         int i, j, got_pages = 0;
7147         int ret = -EINVAL;
7148
7149         if (ctx->user_bufs)
7150                 return -EBUSY;
7151         if (!nr_args || nr_args > UIO_MAXIOV)
7152                 return -EINVAL;
7153
7154         ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
7155                                         GFP_KERNEL);
7156         if (!ctx->user_bufs)
7157                 return -ENOMEM;
7158
7159         for (i = 0; i < nr_args; i++) {
7160                 struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7161                 unsigned long off, start, end, ubuf;
7162                 int pret, nr_pages;
7163                 struct iovec iov;
7164                 size_t size;
7165
7166                 ret = io_copy_iov(ctx, &iov, arg, i);
7167                 if (ret)
7168                         goto err;
7169
7170                 /*
7171                  * Don't impose further limits on the size and buffer
7172                  * constraints here, we'll -EINVAL later when IO is
7173                  * submitted if they are wrong.
7174                  */
7175                 ret = -EFAULT;
7176                 if (!iov.iov_base || !iov.iov_len)
7177                         goto err;
7178
7179                 /* arbitrary limit, but we need something */
7180                 if (iov.iov_len > SZ_1G)
7181                         goto err;
7182
7183                 ubuf = (unsigned long) iov.iov_base;
7184                 end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
7185                 start = ubuf >> PAGE_SHIFT;
7186                 nr_pages = end - start;
7187
7188                 if (ctx->account_mem) {
7189                         ret = io_account_mem(ctx->user, nr_pages);
7190                         if (ret)
7191                                 goto err;
7192                 }
7193
7194                 ret = 0;
7195                 if (!pages || nr_pages > got_pages) {
7196                         kvfree(vmas);
7197                         kvfree(pages);
7198                         pages = kvmalloc_array(nr_pages, sizeof(struct page *),
7199                                                 GFP_KERNEL);
7200                         vmas = kvmalloc_array(nr_pages,
7201                                         sizeof(struct vm_area_struct *),
7202                                         GFP_KERNEL);
7203                         if (!pages || !vmas) {
7204                                 ret = -ENOMEM;
7205                                 if (ctx->account_mem)
7206                                         io_unaccount_mem(ctx->user, nr_pages);
7207                                 goto err;
7208                         }
7209                         got_pages = nr_pages;
7210                 }
7211
7212                 imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
7213                                                 GFP_KERNEL);
7214                 ret = -ENOMEM;
7215                 if (!imu->bvec) {
7216                         if (ctx->account_mem)
7217                                 io_unaccount_mem(ctx->user, nr_pages);
7218                         goto err;
7219                 }
7220
7221                 ret = 0;
7222                 mmap_read_lock(current->mm);
7223                 pret = pin_user_pages(ubuf, nr_pages,
7224                                       FOLL_WRITE | FOLL_LONGTERM,
7225                                       pages, vmas);
7226                 if (pret == nr_pages) {
7227                         /* don't support file backed memory */
7228                         for (j = 0; j < nr_pages; j++) {
7229                                 struct vm_area_struct *vma = vmas[j];
7230
7231                                 if (vma->vm_file &&
7232                                     !is_file_hugepages(vma->vm_file)) {
7233                                         ret = -EOPNOTSUPP;
7234                                         break;
7235                                 }
7236                         }
7237                 } else {
7238                         ret = pret < 0 ? pret : -EFAULT;
7239                 }
7240                 mmap_read_unlock(current->mm);
7241                 if (ret) {
7242                         /*
7243                          * if we did partial map, or found file backed vmas,
7244                          * release any pages we did get
7245                          */
7246                         if (pret > 0)
7247                                 unpin_user_pages(pages, pret);
7248                         if (ctx->account_mem)
7249                                 io_unaccount_mem(ctx->user, nr_pages);
7250                         kvfree(imu->bvec);
7251                         goto err;
7252                 }
7253
7254                 off = ubuf & ~PAGE_MASK;
7255                 size = iov.iov_len;
7256                 for (j = 0; j < nr_pages; j++) {
7257                         size_t vec_len;
7258
7259                         vec_len = min_t(size_t, size, PAGE_SIZE - off);
7260                         imu->bvec[j].bv_page = pages[j];
7261                         imu->bvec[j].bv_len = vec_len;
7262                         imu->bvec[j].bv_offset = off;
7263                         off = 0;
7264                         size -= vec_len;
7265                 }
7266                 /* store original address for later verification */
7267                 imu->ubuf = ubuf;
7268                 imu->len = iov.iov_len;
7269                 imu->nr_bvecs = nr_pages;
7270
7271                 ctx->nr_user_bufs++;
7272         }
7273         kvfree(pages);
7274         kvfree(vmas);
7275         return 0;
7276 err:
7277         kvfree(pages);
7278         kvfree(vmas);
7279         io_sqe_buffer_unregister(ctx);
7280         return ret;
7281 }
7282
7283 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
7284 {
7285         __s32 __user *fds = arg;
7286         int fd;
7287
7288         if (ctx->cq_ev_fd)
7289                 return -EBUSY;
7290
7291         if (copy_from_user(&fd, fds, sizeof(*fds)))
7292                 return -EFAULT;
7293
7294         ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
7295         if (IS_ERR(ctx->cq_ev_fd)) {
7296                 int ret = PTR_ERR(ctx->cq_ev_fd);
7297                 ctx->cq_ev_fd = NULL;
7298                 return ret;
7299         }
7300
7301         return 0;
7302 }
7303
7304 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
7305 {
7306         if (ctx->cq_ev_fd) {
7307                 eventfd_ctx_put(ctx->cq_ev_fd);
7308                 ctx->cq_ev_fd = NULL;
7309                 return 0;
7310         }
7311
7312         return -ENXIO;
7313 }
7314
/*
 * idr_for_each() callback: remove the buffers reachable from idr entry @p
 * (id @id), passing -1U as the count. @data is the owning ring context.
 * Always returns 0.
 */
static int __io_destroy_buffers(int id, void *p, void *data)
{
	struct io_buffer *first = p;
	struct io_ring_ctx *owner = data;

	__io_remove_buffers(owner, first, id, -1U);
	return 0;
}
7323
7324 static void io_destroy_buffers(struct io_ring_ctx *ctx)
7325 {
7326         idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
7327         idr_destroy(&ctx->io_buffer_idr);
7328 }
7329
7330 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
7331 {
7332         io_finish_async(ctx);
7333         if (ctx->sqo_mm)
7334                 mmdrop(ctx->sqo_mm);
7335
7336         io_iopoll_reap_events(ctx);
7337         io_sqe_buffer_unregister(ctx);
7338         io_sqe_files_unregister(ctx);
7339         io_eventfd_unregister(ctx);
7340         io_destroy_buffers(ctx);
7341         idr_destroy(&ctx->personality_idr);
7342
7343 #if defined(CONFIG_UNIX)
7344         if (ctx->ring_sock) {
7345                 ctx->ring_sock->file = NULL; /* so that iput() is called */
7346                 sock_release(ctx->ring_sock);
7347         }
7348 #endif
7349
7350         io_mem_free(ctx->rings);
7351         io_mem_free(ctx->sq_sqes);
7352
7353         percpu_ref_exit(&ctx->refs);
7354         free_uid(ctx->user);
7355         put_cred(ctx->creds);
7356         kfree(ctx->cancel_hash);
7357         kmem_cache_free(req_cachep, ctx->fallback_req);
7358         kfree(ctx);
7359 }
7360
7361 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
7362 {
7363         struct io_ring_ctx *ctx = file->private_data;
7364         __poll_t mask = 0;
7365
7366         poll_wait(file, &ctx->cq_wait, wait);
7367         /*
7368          * synchronizes with barrier from wq_has_sleeper call in
7369          * io_commit_cqring
7370          */
7371         smp_rmb();
7372         if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head !=
7373             ctx->rings->sq_ring_entries)
7374                 mask |= EPOLLOUT | EPOLLWRNORM;
7375         if (io_cqring_events(ctx, false))
7376                 mask |= EPOLLIN | EPOLLRDNORM;
7377
7378         return mask;
7379 }
7380
7381 static int io_uring_fasync(int fd, struct file *file, int on)
7382 {
7383         struct io_ring_ctx *ctx = file->private_data;
7384
7385         return fasync_helper(fd, file, on, &ctx->cq_fasync);
7386 }
7387
7388 static int io_remove_personalities(int id, void *p, void *data)
7389 {
7390         struct io_ring_ctx *ctx = data;
7391         const struct cred *cred;
7392
7393         cred = idr_remove(&ctx->personality_idr, id);
7394         if (cred)
7395                 put_cred(cred);
7396         return 0;
7397 }
7398
7399 static void io_ring_exit_work(struct work_struct *work)
7400 {
7401         struct io_ring_ctx *ctx;
7402
7403         ctx = container_of(work, struct io_ring_ctx, exit_work);
7404         if (ctx->rings)
7405                 io_cqring_overflow_flush(ctx, true);
7406
7407         /*
7408          * If we're doing polled IO and end up having requests being
7409          * submitted async (out-of-line), then completions can come in while
7410          * we're waiting for refs to drop. We need to reap these manually,
7411          * as nobody else will be looking for them.
7412          */
7413         while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
7414                 io_iopoll_reap_events(ctx);
7415                 if (ctx->rings)
7416                         io_cqring_overflow_flush(ctx, true);
7417         }
7418         io_ring_ctx_free(ctx);
7419 }
7420
7421 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
7422 {
7423         mutex_lock(&ctx->uring_lock);
7424         percpu_ref_kill(&ctx->refs);
7425         mutex_unlock(&ctx->uring_lock);
7426
7427         io_kill_timeouts(ctx);
7428         io_poll_remove_all(ctx);
7429
7430         if (ctx->io_wq)
7431                 io_wq_cancel_all(ctx->io_wq);
7432
7433         io_iopoll_reap_events(ctx);
7434         /* if we failed setting up the ctx, we might not have any rings */
7435         if (ctx->rings)
7436                 io_cqring_overflow_flush(ctx, true);
7437         idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
7438
7439         /*
7440          * Do this upfront, so we won't have a grace period where the ring
7441          * is closed but resources aren't reaped yet. This can cause
7442          * spurious failure in setting up a new ring.
7443          */
7444         if (ctx->account_mem)
7445                 io_unaccount_mem(ctx->user,
7446                                 ring_pages(ctx->sq_entries, ctx->cq_entries));
7447
7448         INIT_WORK(&ctx->exit_work, io_ring_exit_work);
7449         queue_work(system_wq, &ctx->exit_work);
7450 }
7451
7452 static int io_uring_release(struct inode *inode, struct file *file)
7453 {
7454         struct io_ring_ctx *ctx = file->private_data;
7455
7456         file->private_data = NULL;
7457         io_ring_ctx_wait_and_kill(ctx);
7458         return 0;
7459 }
7460
7461 static bool io_wq_files_match(struct io_wq_work *work, void *data)
7462 {
7463         struct files_struct *files = data;
7464
7465         return work->files == files;
7466 }
7467
7468 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
7469                                   struct files_struct *files)
7470 {
7471         if (list_empty_careful(&ctx->inflight_list))
7472                 return;
7473
7474         /* cancel all at once, should be faster than doing it one by one*/
7475         io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
7476
7477         while (!list_empty_careful(&ctx->inflight_list)) {
7478                 struct io_kiocb *cancel_req = NULL, *req;
7479                 DEFINE_WAIT(wait);
7480
7481                 spin_lock_irq(&ctx->inflight_lock);
7482                 list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
7483                         if (req->work.files != files)
7484                                 continue;
7485                         /* req is being completed, ignore */
7486                         if (!refcount_inc_not_zero(&req->refs))
7487                                 continue;
7488                         cancel_req = req;
7489                         break;
7490                 }
7491                 if (cancel_req)
7492                         prepare_to_wait(&ctx->inflight_wait, &wait,
7493                                                 TASK_UNINTERRUPTIBLE);
7494                 spin_unlock_irq(&ctx->inflight_lock);
7495
7496                 /* We need to keep going until we don't find a matching req */
7497                 if (!cancel_req)
7498                         break;
7499
7500                 if (cancel_req->flags & REQ_F_OVERFLOW) {
7501                         spin_lock_irq(&ctx->completion_lock);
7502                         list_del(&cancel_req->list);
7503                         cancel_req->flags &= ~REQ_F_OVERFLOW;
7504                         if (list_empty(&ctx->cq_overflow_list)) {
7505                                 clear_bit(0, &ctx->sq_check_overflow);
7506                                 clear_bit(0, &ctx->cq_check_overflow);
7507                                 ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
7508                         }
7509                         spin_unlock_irq(&ctx->completion_lock);
7510
7511                         WRITE_ONCE(ctx->rings->cq_overflow,
7512                                 atomic_inc_return(&ctx->cached_cq_overflow));
7513
7514                         /*
7515                          * Put inflight ref and overflow ref. If that's
7516                          * all we had, then we're done with this request.
7517                          */
7518                         if (refcount_sub_and_test(2, &cancel_req->refs)) {
7519                                 io_free_req(cancel_req);
7520                                 finish_wait(&ctx->inflight_wait, &wait);
7521                                 continue;
7522                         }
7523                 } else {
7524                         io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
7525                         io_put_req(cancel_req);
7526                 }
7527
7528                 schedule();
7529                 finish_wait(&ctx->inflight_wait, &wait);
7530         }
7531 }
7532
7533 static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
7534 {
7535         struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7536         struct task_struct *task = data;
7537
7538         return req->task == task;
7539 }
7540
7541 static int io_uring_flush(struct file *file, void *data)
7542 {
7543         struct io_ring_ctx *ctx = file->private_data;
7544
7545         io_uring_cancel_files(ctx, data);
7546
7547         /*
7548          * If the task is going away, cancel work it may have pending
7549          */
7550         if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
7551                 io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true);
7552
7553         return 0;
7554 }
7555
7556 static void *io_uring_validate_mmap_request(struct file *file,
7557                                             loff_t pgoff, size_t sz)
7558 {
7559         struct io_ring_ctx *ctx = file->private_data;
7560         loff_t offset = pgoff << PAGE_SHIFT;
7561         struct page *page;
7562         void *ptr;
7563
7564         switch (offset) {
7565         case IORING_OFF_SQ_RING:
7566         case IORING_OFF_CQ_RING:
7567                 ptr = ctx->rings;
7568                 break;
7569         case IORING_OFF_SQES:
7570                 ptr = ctx->sq_sqes;
7571                 break;
7572         default:
7573                 return ERR_PTR(-EINVAL);
7574         }
7575
7576         page = virt_to_head_page(ptr);
7577         if (sz > page_size(page))
7578                 return ERR_PTR(-EINVAL);
7579
7580         return ptr;
7581 }
7582
7583 #ifdef CONFIG_MMU
7584
7585 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7586 {
7587         size_t sz = vma->vm_end - vma->vm_start;
7588         unsigned long pfn;
7589         void *ptr;
7590
7591         ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
7592         if (IS_ERR(ptr))
7593                 return PTR_ERR(ptr);
7594
7595         pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
7596         return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
7597 }
7598
7599 #else /* !CONFIG_MMU */
7600
7601 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
7602 {
7603         return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
7604 }
7605
7606 static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
7607 {
7608         return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
7609 }
7610
/*
 * !MMU get_unmapped_area handler: the "mapping" is the kernel address of
 * the requested ring region itself. Returns that address, or the negative
 * errno from the request validation.
 */
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
	unsigned long addr, unsigned long len,
	unsigned long pgoff, unsigned long flags)
{
	void *region = io_uring_validate_mmap_request(file, pgoff, len);

	return IS_ERR(region) ? PTR_ERR(region) : (unsigned long) region;
}
7623
7624 #endif /* !CONFIG_MMU */
7625
7626 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
7627                 u32, min_complete, u32, flags, const sigset_t __user *, sig,
7628                 size_t, sigsz)
7629 {
7630         struct io_ring_ctx *ctx;
7631         long ret = -EBADF;
7632         int submitted = 0;
7633         struct fd f;
7634
7635         if (current->task_works)
7636                 task_work_run();
7637
7638         if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
7639                 return -EINVAL;
7640
7641         f = fdget(fd);
7642         if (!f.file)
7643                 return -EBADF;
7644
7645         ret = -EOPNOTSUPP;
7646         if (f.file->f_op != &io_uring_fops)
7647                 goto out_fput;
7648
7649         ret = -ENXIO;
7650         ctx = f.file->private_data;
7651         if (!percpu_ref_tryget(&ctx->refs))
7652                 goto out_fput;
7653
7654         /*
7655          * For SQ polling, the thread will do all submissions and completions.
7656          * Just return the requested submit count, and wake the thread if
7657          * we were asked to.
7658          */
7659         ret = 0;
7660         if (ctx->flags & IORING_SETUP_SQPOLL) {
7661                 if (!list_empty_careful(&ctx->cq_overflow_list))
7662                         io_cqring_overflow_flush(ctx, false);
7663                 if (flags & IORING_ENTER_SQ_WAKEUP)
7664                         wake_up(&ctx->sqo_wait);
7665                 submitted = to_submit;
7666         } else if (to_submit) {
7667                 mutex_lock(&ctx->uring_lock);
7668                 submitted = io_submit_sqes(ctx, to_submit, f.file, fd);
7669                 mutex_unlock(&ctx->uring_lock);
7670
7671                 if (submitted != to_submit)
7672                         goto out;
7673         }
7674         if (flags & IORING_ENTER_GETEVENTS) {
7675                 unsigned nr_events = 0;
7676
7677                 min_complete = min(min_complete, ctx->cq_entries);
7678
7679                 /*
7680                  * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
7681                  * space applications don't need to do io completion events
7682                  * polling again, they can rely on io_sq_thread to do polling
7683                  * work, which can reduce cpu usage and uring_lock contention.
7684                  */
7685                 if (ctx->flags & IORING_SETUP_IOPOLL &&
7686                     !(ctx->flags & IORING_SETUP_SQPOLL)) {
7687                         ret = io_iopoll_check(ctx, &nr_events, min_complete);
7688                 } else {
7689                         ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
7690                 }
7691         }
7692
7693 out:
7694         percpu_ref_put(&ctx->refs);
7695 out_fput:
7696         fdput(f);
7697         return submitted ? submitted : ret;
7698 }
7699
7700 #ifdef CONFIG_PROC_FS
7701 static int io_uring_show_cred(int id, void *p, void *data)
7702 {
7703         const struct cred *cred = p;
7704         struct seq_file *m = data;
7705         struct user_namespace *uns = seq_user_ns(m);
7706         struct group_info *gi;
7707         kernel_cap_t cap;
7708         unsigned __capi;
7709         int g;
7710
7711         seq_printf(m, "%5d\n", id);
7712         seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
7713         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
7714         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
7715         seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
7716         seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
7717         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
7718         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
7719         seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
7720         seq_puts(m, "\n\tGroups:\t");
7721         gi = cred->group_info;
7722         for (g = 0; g < gi->ngroups; g++) {
7723                 seq_put_decimal_ull(m, g ? " " : "",
7724                                         from_kgid_munged(uns, gi->gid[g]));
7725         }
7726         seq_puts(m, "\n\tCapEff:\t");
7727         cap = cred->cap_effective;
7728         CAP_FOR_EACH_U32(__capi)
7729                 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
7730         seq_putc(m, '\n');
7731         return 0;
7732 }
7733
7734 static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
7735 {
7736         int i;
7737
7738         mutex_lock(&ctx->uring_lock);
7739         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
7740         for (i = 0; i < ctx->nr_user_files; i++) {
7741                 struct fixed_file_table *table;
7742                 struct file *f;
7743
7744                 table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7745                 f = table->files[i & IORING_FILE_TABLE_MASK];
7746                 if (f)
7747                         seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
7748                 else
7749                         seq_printf(m, "%5u: <none>\n", i);
7750         }
7751         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
7752         for (i = 0; i < ctx->nr_user_bufs; i++) {
7753                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
7754
7755                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
7756                                                 (unsigned int) buf->len);
7757         }
7758         if (!idr_is_empty(&ctx->personality_idr)) {
7759                 seq_printf(m, "Personalities:\n");
7760                 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
7761         }
7762         seq_printf(m, "PollList:\n");
7763         spin_lock_irq(&ctx->completion_lock);
7764         for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
7765                 struct hlist_head *list = &ctx->cancel_hash[i];
7766                 struct io_kiocb *req;
7767
7768                 hlist_for_each_entry(req, list, hash_node)
7769                         seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
7770                                         req->task->task_works != NULL);
7771         }
7772         spin_unlock_irq(&ctx->completion_lock);
7773         mutex_unlock(&ctx->uring_lock);
7774 }
7775
7776 static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
7777 {
7778         struct io_ring_ctx *ctx = f->private_data;
7779
7780         if (percpu_ref_tryget(&ctx->refs)) {
7781                 __io_uring_show_fdinfo(ctx, m);
7782                 percpu_ref_put(&ctx->refs);
7783         }
7784 }
7785 #endif
7786
7787 static const struct file_operations io_uring_fops = {
7788         .release        = io_uring_release,
7789         .flush          = io_uring_flush,
7790         .mmap           = io_uring_mmap,
7791 #ifndef CONFIG_MMU
7792         .get_unmapped_area = io_uring_nommu_get_unmapped_area,
7793         .mmap_capabilities = io_uring_nommu_mmap_capabilities,
7794 #endif
7795         .poll           = io_uring_poll,
7796         .fasync         = io_uring_fasync,
7797 #ifdef CONFIG_PROC_FS
7798         .show_fdinfo    = io_uring_show_fdinfo,
7799 #endif
7800 };
7801
7802 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
7803                                   struct io_uring_params *p)
7804 {
7805         struct io_rings *rings;
7806         size_t size, sq_array_offset;
7807
7808         size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
7809         if (size == SIZE_MAX)
7810                 return -EOVERFLOW;
7811
7812         rings = io_mem_alloc(size);
7813         if (!rings)
7814                 return -ENOMEM;
7815
7816         ctx->rings = rings;
7817         ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
7818         rings->sq_ring_mask = p->sq_entries - 1;
7819         rings->cq_ring_mask = p->cq_entries - 1;
7820         rings->sq_ring_entries = p->sq_entries;
7821         rings->cq_ring_entries = p->cq_entries;
7822         ctx->sq_mask = rings->sq_ring_mask;
7823         ctx->cq_mask = rings->cq_ring_mask;
7824         ctx->sq_entries = rings->sq_ring_entries;
7825         ctx->cq_entries = rings->cq_ring_entries;
7826
7827         size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
7828         if (size == SIZE_MAX) {
7829                 io_mem_free(ctx->rings);
7830                 ctx->rings = NULL;
7831                 return -EOVERFLOW;
7832         }
7833
7834         ctx->sq_sqes = io_mem_alloc(size);
7835         if (!ctx->sq_sqes) {
7836                 io_mem_free(ctx->rings);
7837                 ctx->rings = NULL;
7838                 return -ENOMEM;
7839         }
7840
7841         return 0;
7842 }
7843
7844 /*
7845  * Allocate an anonymous fd, this is what constitutes the application
7846  * visible backing of an io_uring instance. The application mmaps this
7847  * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
7848  * we have to tie this fd to a socket for file garbage collection purposes.
7849  */
7850 static int io_uring_get_fd(struct io_ring_ctx *ctx)
7851 {
7852         struct file *file;
7853         int ret;
7854
7855 #if defined(CONFIG_UNIX)
7856         ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
7857                                 &ctx->ring_sock);
7858         if (ret)
7859                 return ret;
7860 #endif
7861
7862         ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
7863         if (ret < 0)
7864                 goto err;
7865
7866         file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
7867                                         O_RDWR | O_CLOEXEC);
7868         if (IS_ERR(file)) {
7869                 put_unused_fd(ret);
7870                 ret = PTR_ERR(file);
7871                 goto err;
7872         }
7873
7874 #if defined(CONFIG_UNIX)
7875         ctx->ring_sock->file = file;
7876 #endif
7877         fd_install(ret, file);
7878         return ret;
7879 err:
7880 #if defined(CONFIG_UNIX)
7881         sock_release(ctx->ring_sock);
7882         ctx->ring_sock = NULL;
7883 #endif
7884         return ret;
7885 }
7886
7887 static int io_uring_create(unsigned entries, struct io_uring_params *p,
7888                            struct io_uring_params __user *params)
7889 {
7890         struct user_struct *user = NULL;
7891         struct io_ring_ctx *ctx;
7892         bool account_mem;
7893         int ret;
7894
7895         if (!entries)
7896                 return -EINVAL;
7897         if (entries > IORING_MAX_ENTRIES) {
7898                 if (!(p->flags & IORING_SETUP_CLAMP))
7899                         return -EINVAL;
7900                 entries = IORING_MAX_ENTRIES;
7901         }
7902
7903         /*
7904          * Use twice as many entries for the CQ ring. It's possible for the
7905          * application to drive a higher depth than the size of the SQ ring,
7906          * since the sqes are only used at submission time. This allows for
7907          * some flexibility in overcommitting a bit. If the application has
7908          * set IORING_SETUP_CQSIZE, it will have passed in the desired number
7909          * of CQ ring entries manually.
7910          */
7911         p->sq_entries = roundup_pow_of_two(entries);
7912         if (p->flags & IORING_SETUP_CQSIZE) {
7913                 /*
7914                  * If IORING_SETUP_CQSIZE is set, we do the same roundup
7915                  * to a power-of-two, if it isn't already. We do NOT impose
7916                  * any cq vs sq ring sizing.
7917                  */
7918                 if (p->cq_entries < p->sq_entries)
7919                         return -EINVAL;
7920                 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
7921                         if (!(p->flags & IORING_SETUP_CLAMP))
7922                                 return -EINVAL;
7923                         p->cq_entries = IORING_MAX_CQ_ENTRIES;
7924                 }
7925                 p->cq_entries = roundup_pow_of_two(p->cq_entries);
7926         } else {
7927                 p->cq_entries = 2 * p->sq_entries;
7928         }
7929
7930         user = get_uid(current_user());
7931         account_mem = !capable(CAP_IPC_LOCK);
7932
7933         if (account_mem) {
7934                 ret = io_account_mem(user,
7935                                 ring_pages(p->sq_entries, p->cq_entries));
7936                 if (ret) {
7937                         free_uid(user);
7938                         return ret;
7939                 }
7940         }
7941
7942         ctx = io_ring_ctx_alloc(p);
7943         if (!ctx) {
7944                 if (account_mem)
7945                         io_unaccount_mem(user, ring_pages(p->sq_entries,
7946                                                                 p->cq_entries));
7947                 free_uid(user);
7948                 return -ENOMEM;
7949         }
7950         ctx->compat = in_compat_syscall();
7951         ctx->account_mem = account_mem;
7952         ctx->user = user;
7953         ctx->creds = get_current_cred();
7954
7955         ret = io_allocate_scq_urings(ctx, p);
7956         if (ret)
7957                 goto err;
7958
7959         ret = io_sq_offload_start(ctx, p);
7960         if (ret)
7961                 goto err;
7962
7963         memset(&p->sq_off, 0, sizeof(p->sq_off));
7964         p->sq_off.head = offsetof(struct io_rings, sq.head);
7965         p->sq_off.tail = offsetof(struct io_rings, sq.tail);
7966         p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
7967         p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
7968         p->sq_off.flags = offsetof(struct io_rings, sq_flags);
7969         p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
7970         p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
7971
7972         memset(&p->cq_off, 0, sizeof(p->cq_off));
7973         p->cq_off.head = offsetof(struct io_rings, cq.head);
7974         p->cq_off.tail = offsetof(struct io_rings, cq.tail);
7975         p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
7976         p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
7977         p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
7978         p->cq_off.cqes = offsetof(struct io_rings, cqes);
7979         p->cq_off.flags = offsetof(struct io_rings, cq_flags);
7980
7981         p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
7982                         IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
7983                         IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
7984
7985         if (copy_to_user(params, p, sizeof(*p))) {
7986                 ret = -EFAULT;
7987                 goto err;
7988         }
7989         /*
7990          * Install ring fd as the very last thing, so we don't risk someone
7991          * having closed it before we finish setup
7992          */
7993         ret = io_uring_get_fd(ctx);
7994         if (ret < 0)
7995                 goto err;
7996
7997         trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
7998         return ret;
7999 err:
8000         io_ring_ctx_wait_and_kill(ctx);
8001         return ret;
8002 }
8003
8004 /*
8005  * Sets up an aio uring context, and returns the fd. Applications asks for a
8006  * ring size, we return the actual sq/cq ring sizes (among other things) in the
8007  * params structure passed in.
8008  */
8009 static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
8010 {
8011         struct io_uring_params p;
8012         int i;
8013
8014         if (copy_from_user(&p, params, sizeof(p)))
8015                 return -EFAULT;
8016         for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
8017                 if (p.resv[i])
8018                         return -EINVAL;
8019         }
8020
8021         if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
8022                         IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
8023                         IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
8024                 return -EINVAL;
8025
8026         return  io_uring_create(entries, &p, params);
8027 }
8028
8029 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
8030                 struct io_uring_params __user *, params)
8031 {
8032         return io_uring_setup(entries, params);
8033 }
8034
8035 static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
8036 {
8037         struct io_uring_probe *p;
8038         size_t size;
8039         int i, ret;
8040
8041         size = struct_size(p, ops, nr_args);
8042         if (size == SIZE_MAX)
8043                 return -EOVERFLOW;
8044         p = kzalloc(size, GFP_KERNEL);
8045         if (!p)
8046                 return -ENOMEM;
8047
8048         ret = -EFAULT;
8049         if (copy_from_user(p, arg, size))
8050                 goto out;
8051         ret = -EINVAL;
8052         if (memchr_inv(p, 0, size))
8053                 goto out;
8054
8055         p->last_op = IORING_OP_LAST - 1;
8056         if (nr_args > IORING_OP_LAST)
8057                 nr_args = IORING_OP_LAST;
8058
8059         for (i = 0; i < nr_args; i++) {
8060                 p->ops[i].op = i;
8061                 if (!io_op_defs[i].not_supported)
8062                         p->ops[i].flags = IO_URING_OP_SUPPORTED;
8063         }
8064         p->ops_len = i;
8065
8066         ret = 0;
8067         if (copy_to_user(arg, p, size))
8068                 ret = -EFAULT;
8069 out:
8070         kfree(p);
8071         return ret;
8072 }
8073
8074 static int io_register_personality(struct io_ring_ctx *ctx)
8075 {
8076         const struct cred *creds = get_current_cred();
8077         int id;
8078
8079         id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
8080                                 USHRT_MAX, GFP_KERNEL);
8081         if (id < 0)
8082                 put_cred(creds);
8083         return id;
8084 }
8085
8086 static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
8087 {
8088         const struct cred *old_creds;
8089
8090         old_creds = idr_remove(&ctx->personality_idr, id);
8091         if (old_creds) {
8092                 put_cred(old_creds);
8093                 return 0;
8094         }
8095
8096         return -EINVAL;
8097 }
8098
8099 static bool io_register_op_must_quiesce(int op)
8100 {
8101         switch (op) {
8102         case IORING_UNREGISTER_FILES:
8103         case IORING_REGISTER_FILES_UPDATE:
8104         case IORING_REGISTER_PROBE:
8105         case IORING_REGISTER_PERSONALITY:
8106         case IORING_UNREGISTER_PERSONALITY:
8107                 return false;
8108         default:
8109                 return true;
8110         }
8111 }
8112
8113 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
8114                                void __user *arg, unsigned nr_args)
8115         __releases(ctx->uring_lock)
8116         __acquires(ctx->uring_lock)
8117 {
8118         int ret;
8119
8120         /*
8121          * We're inside the ring mutex, if the ref is already dying, then
8122          * someone else killed the ctx or is already going through
8123          * io_uring_register().
8124          */
8125         if (percpu_ref_is_dying(&ctx->refs))
8126                 return -ENXIO;
8127
8128         if (io_register_op_must_quiesce(opcode)) {
8129                 percpu_ref_kill(&ctx->refs);
8130
8131                 /*
8132                  * Drop uring mutex before waiting for references to exit. If
8133                  * another thread is currently inside io_uring_enter() it might
8134                  * need to grab the uring_lock to make progress. If we hold it
8135                  * here across the drain wait, then we can deadlock. It's safe
8136                  * to drop the mutex here, since no new references will come in
8137                  * after we've killed the percpu ref.
8138                  */
8139                 mutex_unlock(&ctx->uring_lock);
8140                 ret = wait_for_completion_interruptible(&ctx->ref_comp);
8141                 mutex_lock(&ctx->uring_lock);
8142                 if (ret) {
8143                         percpu_ref_resurrect(&ctx->refs);
8144                         ret = -EINTR;
8145                         goto out;
8146                 }
8147         }
8148
8149         switch (opcode) {
8150         case IORING_REGISTER_BUFFERS:
8151                 ret = io_sqe_buffer_register(ctx, arg, nr_args);
8152                 break;
8153         case IORING_UNREGISTER_BUFFERS:
8154                 ret = -EINVAL;
8155                 if (arg || nr_args)
8156                         break;
8157                 ret = io_sqe_buffer_unregister(ctx);
8158                 break;
8159         case IORING_REGISTER_FILES:
8160                 ret = io_sqe_files_register(ctx, arg, nr_args);
8161                 break;
8162         case IORING_UNREGISTER_FILES:
8163                 ret = -EINVAL;
8164                 if (arg || nr_args)
8165                         break;
8166                 ret = io_sqe_files_unregister(ctx);
8167                 break;
8168         case IORING_REGISTER_FILES_UPDATE:
8169                 ret = io_sqe_files_update(ctx, arg, nr_args);
8170                 break;
8171         case IORING_REGISTER_EVENTFD:
8172         case IORING_REGISTER_EVENTFD_ASYNC:
8173                 ret = -EINVAL;
8174                 if (nr_args != 1)
8175                         break;
8176                 ret = io_eventfd_register(ctx, arg);
8177                 if (ret)
8178                         break;
8179                 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
8180                         ctx->eventfd_async = 1;
8181                 else
8182                         ctx->eventfd_async = 0;
8183                 break;
8184         case IORING_UNREGISTER_EVENTFD:
8185                 ret = -EINVAL;
8186                 if (arg || nr_args)
8187                         break;
8188                 ret = io_eventfd_unregister(ctx);
8189                 break;
8190         case IORING_REGISTER_PROBE:
8191                 ret = -EINVAL;
8192                 if (!arg || nr_args > 256)
8193                         break;
8194                 ret = io_probe(ctx, arg, nr_args);
8195                 break;
8196         case IORING_REGISTER_PERSONALITY:
8197                 ret = -EINVAL;
8198                 if (arg || nr_args)
8199                         break;
8200                 ret = io_register_personality(ctx);
8201                 break;
8202         case IORING_UNREGISTER_PERSONALITY:
8203                 ret = -EINVAL;
8204                 if (arg)
8205                         break;
8206                 ret = io_unregister_personality(ctx, nr_args);
8207                 break;
8208         default:
8209                 ret = -EINVAL;
8210                 break;
8211         }
8212
8213         if (io_register_op_must_quiesce(opcode)) {
8214                 /* bring the ctx back to life */
8215                 percpu_ref_reinit(&ctx->refs);
8216 out:
8217                 reinit_completion(&ctx->ref_comp);
8218         }
8219         return ret;
8220 }
8221
8222 SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
8223                 void __user *, arg, unsigned int, nr_args)
8224 {
8225         struct io_ring_ctx *ctx;
8226         long ret = -EBADF;
8227         struct fd f;
8228
8229         f = fdget(fd);
8230         if (!f.file)
8231                 return -EBADF;
8232
8233         ret = -EOPNOTSUPP;
8234         if (f.file->f_op != &io_uring_fops)
8235                 goto out_fput;
8236
8237         ctx = f.file->private_data;
8238
8239         mutex_lock(&ctx->uring_lock);
8240         ret = __io_uring_register(ctx, opcode, arg, nr_args);
8241         mutex_unlock(&ctx->uring_lock);
8242         trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
8243                                                         ctx->cq_ev_fd != NULL, ret);
8244 out_fput:
8245         fdput(f);
8246         return ret;
8247 }
8248
8249 static int __init io_uring_init(void)
8250 {
8251 #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
8252         BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
8253         BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
8254 } while (0)
8255
8256 #define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
8257         __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
8258         BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
8259         BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
8260         BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
8261         BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
8262         BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
8263         BUILD_BUG_SQE_ELEM(8,  __u64,  off);
8264         BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
8265         BUILD_BUG_SQE_ELEM(16, __u64,  addr);
8266         BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
8267         BUILD_BUG_SQE_ELEM(24, __u32,  len);
8268         BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
8269         BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
8270         BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
8271         BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
8272         BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
8273         BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
8274         BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
8275         BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
8276         BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
8277         BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
8278         BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
8279         BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
8280         BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
8281         BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
8282         BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
8283         BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
8284         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
8285         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
8286
8287         BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
8288         BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
8289         req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
8290         return 0;
8291 };
8292 __initcall(io_uring_init);