Merge tag 'io_uring-bio-cache.5-2021-08-30' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Aug 2021 02:30:30 +0000 (19:30 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Aug 2021 02:30:30 +0000 (19:30 -0700)
diff --combined block/bio.c

index 265bff6,ef88fa3..1319dd2
--- 1/block/bio.c
--- 2/block/bio.c
+++ b/block/bio.c
@@@ -25,6 -25,11 +25,11 @@@
   #include "blk.h"
   #include "blk-rq-qos.h"
   
+ struct bio_alloc_cache {
+       struct bio_list         free_list;
+       unsigned int            nr;
+ };
+ 
   static struct biovec_slab {
         int nr_vecs;
         char *name;
@@@ -246,12 -251,40 +251,40 @@@ static void bio_free(struct bio *bio
   void bio_init(struct bio *bio, struct bio_vec *table,
               unsigned short max_vecs)
   {
-       memset(bio, 0, sizeof(*bio));
+       bio->bi_next = NULL;
+       bio->bi_bdev = NULL;
+       bio->bi_opf = 0;
+       bio->bi_flags = 0;
+       bio->bi_ioprio = 0;
+       bio->bi_write_hint = 0;
+       bio->bi_status = 0;
+       bio->bi_iter.bi_sector = 0;
+       bio->bi_iter.bi_size = 0;
+       bio->bi_iter.bi_idx = 0;
+       bio->bi_iter.bi_bvec_done = 0;
+       bio->bi_end_io = NULL;
+       bio->bi_private = NULL;
+ #ifdef CONFIG_BLK_CGROUP
+       bio->bi_blkg = NULL;
+       bio->bi_issue.value = 0;
+ #ifdef CONFIG_BLK_CGROUP_IOCOST
+       bio->bi_iocost_cost = 0;
+ #endif
+ #endif
+ #ifdef CONFIG_BLK_INLINE_ENCRYPTION
+       bio->bi_crypt_context = NULL;
+ #endif
+ #ifdef CONFIG_BLK_DEV_INTEGRITY
+       bio->bi_integrity = NULL;
+ #endif
+       bio->bi_vcnt = 0;
+ 
         atomic_set(&bio->__bi_remaining, 1);
         atomic_set(&bio->__bi_cnt, 1);
   
-       bio->bi_io_vec = table;
         bio->bi_max_vecs = max_vecs;
+       bio->bi_io_vec = table;
+       bio->bi_pool = NULL;
   }
   EXPORT_SYMBOL(bio_init);
   
@@@ -495,11 -528,16 +528,11 @@@ EXPORT_SYMBOL(bio_kmalloc)
   
   void zero_fill_bio(struct bio *bio)
   {
- -      unsigned long flags;
         struct bio_vec bv;
         struct bvec_iter iter;
   
- -      bio_for_each_segment(bv, bio, iter) {
- -              char *data = bvec_kmap_irq(&bv, &flags);
- -              memset(data, 0, bv.bv_len);
- -              flush_dcache_page(bv.bv_page);
- -              bvec_kunmap_irq(data, &flags);
- -      }
+ +      bio_for_each_segment(bv, bio, iter)
+ +              memzero_bvec(&bv);
   }
   EXPORT_SYMBOL(zero_fill_bio);
   
@@@ -586,6 -624,53 +619,53 @@@ void guard_bio_eod(struct bio *bio
         bio_truncate(bio, maxsector << 9);
   }
   
+ #define ALLOC_CACHE_MAX               512
+ #define ALLOC_CACHE_SLACK      64
+ 
+ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+                                 unsigned int nr)
+ {
+       unsigned int i = 0;
+       struct bio *bio;
+ 
+       while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
+               cache->nr--;
+               bio_free(bio);
+               if (++i == nr)
+                       break;
+       }
+ }
+ 
+ static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
+ {
+       struct bio_set *bs;
+ 
+       bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
+       if (bs->cache) {
+               struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);
+ 
+               bio_alloc_cache_prune(cache, -1U);
+       }
+       return 0;
+ }
+ 
+ static void bio_alloc_cache_destroy(struct bio_set *bs)
+ {
+       int cpu;
+ 
+       if (!bs->cache)
+               return;
+ 
+       cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+       for_each_possible_cpu(cpu) {
+               struct bio_alloc_cache *cache;
+ 
+               cache = per_cpu_ptr(bs->cache, cpu);
+               bio_alloc_cache_prune(cache, -1U);
+       }
+       free_percpu(bs->cache);
+ }
+ 
   /**
    * bio_put - release a reference to a bio
    * @bio:   bio to release reference to
@@@ -596,16 -681,23 +676,23 @@@
    **/
   void bio_put(struct bio *bio)
   {
-       if (!bio_flagged(bio, BIO_REFFED))
-               bio_free(bio);
-       else {
+       if (unlikely(bio_flagged(bio, BIO_REFFED))) {
                 BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+               if (!atomic_dec_and_test(&bio->__bi_cnt))
+                       return;
+       }
   
-               /*
-                * last put frees it
-                */
-               if (atomic_dec_and_test(&bio->__bi_cnt))
-                       bio_free(bio);
+       if (bio_flagged(bio, BIO_PERCPU_CACHE)) {
+               struct bio_alloc_cache *cache;
+ 
+               bio_uninit(bio);
+               cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
+               bio_list_add_head(&cache->free_list, bio);
+               if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
+                       bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
+               put_cpu();
+       } else {
+               bio_free(bio);
         }
   }
   EXPORT_SYMBOL(bio_put);
@@@ -974,14 -1066,6 +1061,14 @@@ static int bio_iov_bvec_set_append(stru
         return 0;
   }
   
+ +static void bio_put_pages(struct page **pages, size_t size, size_t off)
+ +{
+ +      size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
+ +
+ +      for (i = 0; i < nr; i++)
+ +              put_page(pages[i]);
+ +}
+ +
   #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
   
   /**
@@@ -1026,10 -1110,8 +1113,10 @@@ static int __bio_iov_iter_get_pages(str
                         if (same_page)
                                 put_page(page);
                 } else {
- -                      if (WARN_ON_ONCE(bio_full(bio, len)))
- -                                return -EINVAL;
+ +                      if (WARN_ON_ONCE(bio_full(bio, len))) {
+ +                              bio_put_pages(pages + i, left, offset);
+ +                              return -EINVAL;
+ +                      }
                         __bio_add_page(bio, page, len, offset);
                 }
                 offset = 0;
@@@ -1074,7 -1156,6 +1161,7 @@@ static int __bio_iov_append_get_pages(s
                 len = min_t(size_t, PAGE_SIZE - offset, left);
                 if (bio_add_hw_page(q, bio, page, len, offset,
                                 max_append_sectors, &same_page) != len) {
+ +                      bio_put_pages(pages + i, left, offset);
                         ret = -EINVAL;
                         break;
                 }
@@@ -1197,15 -1278,27 +1284,15 @@@ EXPORT_SYMBOL(bio_advance)
   void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                         struct bio *src, struct bvec_iter *src_iter)
   {
- -      struct bio_vec src_bv, dst_bv;
- -      void *src_p, *dst_p;
- -      unsigned bytes;
- -
         while (src_iter->bi_size && dst_iter->bi_size) {
- -              src_bv = bio_iter_iovec(src, *src_iter);
- -              dst_bv = bio_iter_iovec(dst, *dst_iter);
- -
- -              bytes = min(src_bv.bv_len, dst_bv.bv_len);
- -
- -              src_p = kmap_atomic(src_bv.bv_page);
- -              dst_p = kmap_atomic(dst_bv.bv_page);
- -
- -              memcpy(dst_p + dst_bv.bv_offset,
- -                     src_p + src_bv.bv_offset,
- -                     bytes);
- -
- -              kunmap_atomic(dst_p);
- -              kunmap_atomic(src_p);
- -
- -              flush_dcache_page(dst_bv.bv_page);
+ +              struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
+ +              struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
+ +              unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ +              void *src_buf;
+ +
+ +              src_buf = bvec_kmap_local(&src_bv);
+ +              memcpy_to_bvec(&dst_bv, src_buf);
+ +              kunmap_local(src_buf);
   
                 bio_advance_iter_single(src, src_iter, bytes);
                 bio_advance_iter_single(dst, dst_iter, bytes);
@@@ -1496,6 -1589,7 +1583,7 @@@ int biovec_init_pool(mempool_t *pool, i
    */
   void bioset_exit(struct bio_set *bs)
   {
+       bio_alloc_cache_destroy(bs);
         if (bs->rescue_workqueue)
                 destroy_workqueue(bs->rescue_workqueue);
         bs->rescue_workqueue = NULL;
@@@ -1557,12 -1651,18 +1645,18 @@@ int bioset_init(struct bio_set *bs
             biovec_init_pool(&bs->bvec_pool, pool_size))
                 goto bad;
   
-       if (!(flags & BIOSET_NEED_RESCUER))
-               return 0;
- 
-       bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
-       if (!bs->rescue_workqueue)
-               goto bad;
+       if (flags & BIOSET_NEED_RESCUER) {
+               bs->rescue_workqueue = alloc_workqueue("bioset",
+                                                       WQ_MEM_RECLAIM, 0);
+               if (!bs->rescue_workqueue)
+                       goto bad;
+       }
+       if (flags & BIOSET_PERCPU_CACHE) {
+               bs->cache = alloc_percpu(struct bio_alloc_cache);
+               if (!bs->cache)
+                       goto bad;
+               cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+       }
   
         return 0;
   bad:
@@@ -1589,6 -1689,46 +1683,46 @@@ int bioset_init_from_src(struct bio_se
   }
   EXPORT_SYMBOL(bioset_init_from_src);
   
+ /**
+  * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
+  * @kiocb:    kiocb describing the IO
+  * @nr_vecs:          number of iovecs to pre-allocate
+  * @bs:               bio_set to allocate from
+  *
+  * Description:
+  *    Like bio_alloc_bioset(), but pass in the kiocb. The kiocb is only
+  *    used to check if we should dip into the per-cpu bio_set allocation
+  *    cache. The allocation uses GFP_KERNEL internally. On return, the
+  *    bio is marked BIO_PERCPU_CACHE, and the final put of the bio
+  *    MUST be done from process context, not hard/soft IRQ.
+  *
+  */
+ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
+                           struct bio_set *bs)
+ {
+       struct bio_alloc_cache *cache;
+       struct bio *bio;
+ 
+       if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS)
+               return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+ 
+       cache = per_cpu_ptr(bs->cache, get_cpu());
+       bio = bio_list_pop(&cache->free_list);
+       if (bio) {
+               cache->nr--;
+               put_cpu();
+               bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
+               bio->bi_pool = bs;
+               bio_set_flag(bio, BIO_PERCPU_CACHE);
+               return bio;
+       }
+       put_cpu();
+       bio = bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
+       bio_set_flag(bio, BIO_PERCPU_CACHE);
+       return bio;
+ }
+ EXPORT_SYMBOL_GPL(bio_alloc_kiocb);
+ 
   static int __init init_bio(void)
   {
         int i;
@@@ -1603,6 -1743,9 +1737,9 @@@
                                 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
         }
   
+       cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
+                                       bio_cpu_dead);
+ 
         if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
                 panic("bio: can't allocate bios\n");
   
diff --combined block/blk-core.c

index b509873,f35d401..5454db2
--- 1/block/blk-core.c
--- 2/block/blk-core.c
+++ b/block/blk-core.c
@@@ -14,6 -14,7 +14,6 @@@
    */
   #include <linux/kernel.h>
   #include <linux/module.h>
- -#include <linux/backing-dev.h>
   #include <linux/bio.h>
   #include <linux/blkdev.h>
   #include <linux/blk-mq.h>
@@@ -392,7 -393,10 +392,7 @@@ void blk_cleanup_queue(struct request_q
         /* for synchronous bio-based driver finish in-flight integrity i/o */
         blk_flush_integrity();
   
- -      /* @q won't process any more request, flush async actions */
- -      del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
         blk_sync_queue(q);
- -
         if (queue_is_mq(q))
                 blk_mq_exit_queue(q);
   
@@@ -529,14 -533,20 +529,14 @@@ struct request_queue *blk_alloc_queue(i
         if (ret)
                 goto fail_id;
   
- -      q->backing_dev_info = bdi_alloc(node_id);
- -      if (!q->backing_dev_info)
- -              goto fail_split;
- -
         q->stats = blk_alloc_queue_stats();
         if (!q->stats)
- -              goto fail_stats;
+ +              goto fail_split;
   
         q->node = node_id;
   
         atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
   
- -      timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
- -                  laptop_mode_timer_fn, 0);
         timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
         INIT_WORK(&q->timeout_work, blk_timeout_work);
         INIT_LIST_HEAD(&q->icq_list);
@@@ -561,7 -571,7 +561,7 @@@
         if (percpu_ref_init(&q->q_usage_counter,
                                 blk_queue_usage_counter_release,
                                 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
- -              goto fail_bdi;
+ +              goto fail_stats;
   
         if (blkcg_init_queue(q))
                 goto fail_ref;
@@@ -574,8 -584,10 +574,8 @@@
   
   fail_ref:
         percpu_ref_exit(&q->q_usage_counter);
- -fail_bdi:
- -      blk_free_queue_stats(q->stats);
   fail_stats:
- -      bdi_put(q->backing_dev_info);
+ +      blk_free_queue_stats(q->stats);
   fail_split:
         bioset_exit(&q->bio_split);
   fail_id:
@@@ -821,7 -833,7 +821,7 @@@ static noinline_for_stack bool submit_b
         }
   
         if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-               bio->bi_opf &= ~REQ_HIPRI;
+               bio_clear_hipri(bio);
   
         switch (bio_op(bio)) {
         case REQ_OP_DISCARD:
diff --combined block/blk-merge.c

index eeba842,bc25ad4..7a5c81c
--- 1/block/blk-merge.c
--- 2/block/blk-merge.c
+++ b/block/blk-merge.c
@@@ -285,7 -285,7 +285,7 @@@ split
          * iopoll in direct IO routine. Given performance gain of iopoll for
          * big IO can be trival, disable iopoll when split needed.
          */
-       bio->bi_opf &= ~REQ_HIPRI;
+       bio_clear_hipri(bio);
   
         return bio_split(bio, sectors, GFP_NOIO, bs);
   }
@@@ -348,8 -348,6 +348,8 @@@ void __blk_queue_split(struct bio **bio
                 trace_block_split(split, (*bio)->bi_iter.bi_sector);
                 submit_bio_noacct(*bio);
                 *bio = split;
+ +
+ +              blk_throtl_charge_bio_split(*bio);
         }
   }
   
@@@ -707,6 -705,22 +707,6 @@@ static void blk_account_io_merge_reques
         }
   }
   
- -/*
- - * Two cases of handling DISCARD merge:
- - * If max_discard_segments > 1, the driver takes every bio
- - * as a range and send them to controller together. The ranges
- - * needn't to be contiguous.
- - * Otherwise, the bios/requests will be handled as same as
- - * others which should be contiguous.
- - */
- -static inline bool blk_discard_mergable(struct request *req)
- -{
- -      if (req_op(req) == REQ_OP_DISCARD &&
- -          queue_max_discard_segments(req->q) > 1)
- -              return true;
- -      return false;
- -}
- -
   static enum elv_merge blk_try_req_merge(struct request *req,
                                         struct request *next)
   {
diff --combined block/blk.h

index 346d184,5a4652a..8c96b0c
--- 1/block/blk.h
--- 2/block/blk.h
+++ b/block/blk.h
@@@ -128,7 -128,7 +128,7 @@@ static inline bool integrity_req_gap_fr
                                 bip_next->bip_vec[0].bv_offset);
   }
   
- -void blk_integrity_add(struct gendisk *);
+ +int blk_integrity_add(struct gendisk *disk);
   void blk_integrity_del(struct gendisk *);
   #else /* CONFIG_BLK_DEV_INTEGRITY */
   static inline bool blk_integrity_merge_rq(struct request_queue *rq,
@@@ -162,9 -162,8 +162,9 @@@ static inline bool bio_integrity_endio(
   static inline void bio_integrity_free(struct bio *bio)
   {
   }
- -static inline void blk_integrity_add(struct gendisk *disk)
+ +static inline int blk_integrity_add(struct gendisk *disk)
   {
+ +      return 0;
   }
   static inline void blk_integrity_del(struct gendisk *disk)
   {
@@@ -290,13 -289,11 +290,13 @@@ int create_task_io_context(struct task_
   extern int blk_throtl_init(struct request_queue *q);
   extern void blk_throtl_exit(struct request_queue *q);
   extern void blk_throtl_register_queue(struct request_queue *q);
+ +extern void blk_throtl_charge_bio_split(struct bio *bio);
   bool blk_throtl_bio(struct bio *bio);
   #else /* CONFIG_BLK_DEV_THROTTLING */
   static inline int blk_throtl_init(struct request_queue *q) { return 0; }
   static inline void blk_throtl_exit(struct request_queue *q) { }
   static inline void blk_throtl_register_queue(struct request_queue *q) { }
+ +static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
   static inline bool blk_throtl_bio(struct bio *bio) { return false; }
   #endif /* CONFIG_BLK_DEV_THROTTLING */
   #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
@@@ -343,14 -340,15 +343,14 @@@ static inline void blk_queue_clear_zone
   
   int blk_alloc_ext_minor(void);
   void blk_free_ext_minor(unsigned int minor);
- -char *disk_name(struct gendisk *hd, int partno, char *buf);
   #define ADDPART_FLAG_NONE     0
   #define ADDPART_FLAG_RAID     1
   #define ADDPART_FLAG_WHOLEDISK        2
- -int bdev_add_partition(struct block_device *bdev, int partno,
- -              sector_t start, sector_t length);
- -int bdev_del_partition(struct block_device *bdev, int partno);
- -int bdev_resize_partition(struct block_device *bdev, int partno,
- -              sector_t start, sector_t length);
+ +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
+ +              sector_t length);
+ +int bdev_del_partition(struct gendisk *disk, int partno);
+ +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
+ +              sector_t length);
   
   int bio_add_hw_page(struct request_queue *q, struct bio *bio,
                 struct page *page, unsigned int len, unsigned int offset,
@@@ -358,7 -356,7 +358,7 @@@
   
   struct request_queue *blk_alloc_queue(int node_id);
   
- -void disk_alloc_events(struct gendisk *disk);
+ +int disk_alloc_events(struct gendisk *disk);
   void disk_add_events(struct gendisk *disk);
   void disk_del_events(struct gendisk *disk);
   void disk_release_events(struct gendisk *disk);
@@@ -366,4 -364,11 +366,11 @@@ extern struct device_attribute dev_attr
   extern struct device_attribute dev_attr_events_async;
   extern struct device_attribute dev_attr_events_poll_msecs;
   
+ static inline void bio_clear_hipri(struct bio *bio)
+ {
+       /* can't support alloc cache if we turn off polling */
+       bio_clear_flag(bio, BIO_PERCPU_CACHE);
+       bio->bi_opf &= ~REQ_HIPRI;
+ }
+ 
   #endif /* BLK_INTERNAL_H */
diff --combined fs/block_dev.c

index 1f21ac9,3c7fb71..45df6cb
--- 1/fs/block_dev.c
--- 2/fs/block_dev.c
+++ b/fs/block_dev.c
@@@ -35,7 -35,6 +35,7 @@@
   #include <linux/uaccess.h>
   #include <linux/suspend.h>
   #include "internal.h"
+ +#include "../block/blk.h"
   
   struct bdev_inode {
         struct block_device bdev;
@@@ -386,7 -385,7 +386,7 @@@ static ssize_t __blkdev_direct_IO(struc
             (bdev_logical_block_size(bdev) - 1))
                 return -EINVAL;
   
-       bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
+       bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
   
         dio = container_of(bio, struct blkdev_dio, bio);
         dio->is_sync = is_sync = is_sync_kiocb(iocb);
@@@ -514,7 -513,9 +514,9 @@@ blkdev_direct_IO(struct kiocb *iocb, st
   
   static __init int blkdev_init(void)
   {
-       return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS);
+       return bioset_init(&blkdev_dio_pool, 4,
+                               offsetof(struct blkdev_dio, bio),
+                               BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
   }
   module_init(blkdev_init);
   
@@@ -687,8 -688,7 +689,8 @@@ static loff_t block_llseek(struct file 
         return retval;
   }
         
- -int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+ +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
+ +              int datasync)
   {
         struct inode *bd_inode = bdev_file_inode(filp);
         struct block_device *bdev = I_BDEV(bd_inode);
@@@ -709,6 -709,7 +711,6 @@@
   
         return error;
   }
- -EXPORT_SYMBOL(blkdev_fsync);
   
   /**
    * bdev_read_page() - Start reading a page from a block device
@@@ -802,6 -803,7 +804,6 @@@ static struct inode *bdev_alloc_inode(s
         if (!ei)
                 return NULL;
         memset(&ei->bdev, 0, sizeof(ei->bdev));
- -      ei->bdev.bd_bdi = &noop_backing_dev_info;
         return &ei->vfs_inode;
   }
   
@@@ -812,15 -814,8 +814,15 @@@ static void bdev_free_inode(struct inod
         free_percpu(bdev->bd_stats);
         kfree(bdev->bd_meta_info);
   
- -      if (!bdev_is_partition(bdev))
+ +      if (!bdev_is_partition(bdev)) {
+ +              if (bdev->bd_disk && bdev->bd_disk->bdi)
+ +                      bdi_put(bdev->bd_disk->bdi);
                 kfree(bdev->bd_disk);
+ +      }
+ +
+ +      if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
+ +              blk_free_ext_minor(MINOR(bdev->bd_dev));
+ +
         kmem_cache_free(bdev_cachep, BDEV_I(inode));
   }
   
@@@ -833,9 -828,16 +835,9 @@@ static void init_once(void *data
   
   static void bdev_evict_inode(struct inode *inode)
   {
- -      struct block_device *bdev = &BDEV_I(inode)->bdev;
         truncate_inode_pages_final(&inode->i_data);
         invalidate_inode_buffers(inode); /* is it needed here? */
         clear_inode(inode);
- -      /* Detach inode from wb early as bdi_put() may free bdi->wb */
- -      inode_detach_wb(inode);
- -      if (bdev->bd_bdi != &noop_backing_dev_info) {
- -              bdi_put(bdev->bd_bdi);
- -              bdev->bd_bdi = &noop_backing_dev_info;
- -      }
   }
   
   static const struct super_operations bdev_sops = {
@@@ -902,6 -904,9 +904,6 @@@ struct block_device *bdev_alloc(struct 
         bdev->bd_disk = disk;
         bdev->bd_partno = partno;
         bdev->bd_inode = inode;
- -#ifdef CONFIG_SYSFS
- -      INIT_LIST_HEAD(&bdev->bd_holder_disks);
- -#endif
         bdev->bd_stats = alloc_percpu(struct disk_stats);
         if (!bdev->bd_stats) {
                 iput(inode);
@@@ -918,6 -923,31 +920,6 @@@ void bdev_add(struct block_device *bdev
         insert_inode_hash(bdev->bd_inode);
   }
   
- -static struct block_device *bdget(dev_t dev)
- -{
- -      struct inode *inode;
- -
- -      inode = ilookup(blockdev_superblock, dev);
- -      if (!inode)
- -              return NULL;
- -      return &BDEV_I(inode)->bdev;
- -}
- -
- -/**
- - * bdgrab -- Grab a reference to an already referenced block device
- - * @bdev:     Block device to grab a reference to.
- - *
- - * Returns the block_device with an additional reference when successful,
- - * or NULL if the inode is already beeing freed.
- - */
- -struct block_device *bdgrab(struct block_device *bdev)
- -{
- -      if (!igrab(bdev->bd_inode))
- -              return NULL;
- -      return bdev;
- -}
- -EXPORT_SYMBOL(bdgrab);
- -
   long nr_blockdev_pages(void)
   {
         struct inode *inode;
@@@ -931,6 -961,12 +933,6 @@@
         return ret;
   }
   
- -void bdput(struct block_device *bdev)
- -{
- -      iput(bdev->bd_inode);
- -}
- -EXPORT_SYMBOL(bdput);
- - 
   /**
    * bd_may_claim - test whether a block device can be claimed
    * @bdev: block device of interest
@@@ -1060,6 -1096,148 +1062,6 @@@ void bd_abort_claiming(struct block_dev
   }
   EXPORT_SYMBOL(bd_abort_claiming);
   
- -#ifdef CONFIG_SYSFS
- -struct bd_holder_disk {
- -      struct list_head        list;
- -      struct gendisk          *disk;
- -      int                     refcnt;
- -};
- -
- -static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
- -                                                struct gendisk *disk)
- -{
- -      struct bd_holder_disk *holder;
- -
- -      list_for_each_entry(holder, &bdev->bd_holder_disks, list)
- -              if (holder->disk == disk)
- -                      return holder;
- -      return NULL;
- -}
- -
- -static int add_symlink(struct kobject *from, struct kobject *to)
- -{
- -      return sysfs_create_link(from, to, kobject_name(to));
- -}
- -
- -static void del_symlink(struct kobject *from, struct kobject *to)
- -{
- -      sysfs_remove_link(from, kobject_name(to));
- -}
- -
- -/**
- - * bd_link_disk_holder - create symlinks between holding disk and slave bdev
- - * @bdev: the claimed slave bdev
- - * @disk: the holding disk
- - *
- - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
- - *
- - * This functions creates the following sysfs symlinks.
- - *
- - * - from "slaves" directory of the holder @disk to the claimed @bdev
- - * - from "holders" directory of the @bdev to the holder @disk
- - *
- - * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
- - * passed to bd_link_disk_holder(), then:
- - *
- - *   /sys/block/dm-0/slaves/sda --> /sys/block/sda
- - *   /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
- - *
- - * The caller must have claimed @bdev before calling this function and
- - * ensure that both @bdev and @disk are valid during the creation and
- - * lifetime of these symlinks.
- - *
- - * CONTEXT:
- - * Might sleep.
- - *
- - * RETURNS:
- - * 0 on success, -errno on failure.
- - */
- -int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
- -{
- -      struct bd_holder_disk *holder;
- -      int ret = 0;
- -
- -      mutex_lock(&bdev->bd_disk->open_mutex);
- -
- -      WARN_ON_ONCE(!bdev->bd_holder);
- -
- -      /* FIXME: remove the following once add_disk() handles errors */
- -      if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir))
- -              goto out_unlock;
- -
- -      holder = bd_find_holder_disk(bdev, disk);
- -      if (holder) {
- -              holder->refcnt++;
- -              goto out_unlock;
- -      }
- -
- -      holder = kzalloc(sizeof(*holder), GFP_KERNEL);
- -      if (!holder) {
- -              ret = -ENOMEM;
- -              goto out_unlock;
- -      }
- -
- -      INIT_LIST_HEAD(&holder->list);
- -      holder->disk = disk;
- -      holder->refcnt = 1;
- -
- -      ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
- -      if (ret)
- -              goto out_free;
- -
- -      ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
- -      if (ret)
- -              goto out_del;
- -      /*
- -       * bdev could be deleted beneath us which would implicitly destroy
- -       * the holder directory.  Hold on to it.
- -       */
- -      kobject_get(bdev->bd_holder_dir);
- -
- -      list_add(&holder->list, &bdev->bd_holder_disks);
- -      goto out_unlock;
- -
- -out_del:
- -      del_symlink(disk->slave_dir, bdev_kobj(bdev));
- -out_free:
- -      kfree(holder);
- -out_unlock:
- -      mutex_unlock(&bdev->bd_disk->open_mutex);
- -      return ret;
- -}
- -EXPORT_SYMBOL_GPL(bd_link_disk_holder);
- -
- -/**
- - * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
- - * @bdev: the calimed slave bdev
- - * @disk: the holding disk
- - *
- - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
- - *
- - * CONTEXT:
- - * Might sleep.
- - */
- -void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
- -{
- -      struct bd_holder_disk *holder;
- -
- -      mutex_lock(&bdev->bd_disk->open_mutex);
- -
- -      holder = bd_find_holder_disk(bdev, disk);
- -
- -      if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
- -              del_symlink(disk->slave_dir, bdev_kobj(bdev));
- -              del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
- -              kobject_put(bdev->bd_holder_dir);
- -              list_del_init(&holder->list);
- -              kfree(holder);
- -      }
- -
- -      mutex_unlock(&bdev->bd_disk->open_mutex);
- -}
- -EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
- -#endif
- -
   static void blkdev_flush_mapping(struct block_device *bdev)
   {
         WARN_ON_ONCE(bdev->bd_holders);
@@@ -1084,8 -1262,11 +1086,8 @@@ static int blkdev_get_whole(struct bloc
                 }
         }
   
- -      if (!bdev->bd_openers) {
+ +      if (!bdev->bd_openers)
                 set_init_blocksize(bdev);
- -              if (bdev->bd_bdi == &noop_backing_dev_info)
- -                      bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
- -      }
         if (test_bit(GD_NEED_PART_SCAN, &disk->state))
                 bdev_disk_changed(disk, false);
         bdev->bd_openers++;
@@@ -1103,14 -1284,16 +1105,14 @@@ static void blkdev_put_whole(struct blo
   static int blkdev_get_part(struct block_device *part, fmode_t mode)
   {
         struct gendisk *disk = part->bd_disk;
- -      struct block_device *whole;
         int ret;
   
         if (part->bd_openers)
                 goto done;
   
- -      whole = bdgrab(disk->part0);
- -      ret = blkdev_get_whole(whole, mode);
+ +      ret = blkdev_get_whole(bdev_whole(part), mode);
         if (ret)
- -              goto out_put_whole;
+ +              return ret;
   
         ret = -ENXIO;
         if (!bdev_nr_sectors(part))
@@@ -1118,12 -1301,16 +1120,12 @@@
   
         disk->open_partitions++;
         set_init_blocksize(part);
- -      if (part->bd_bdi == &noop_backing_dev_info)
- -              part->bd_bdi = bdi_get(disk->queue->backing_dev_info);
   done:
         part->bd_openers++;
         return 0;
   
   out_blkdev_put:
- -      blkdev_put_whole(whole, mode);
- -out_put_whole:
- -      bdput(whole);
+ +      blkdev_put_whole(bdev_whole(part), mode);
         return ret;
   }
   
@@@ -1136,42 -1323,42 +1138,42 @@@ static void blkdev_put_part(struct bloc
         blkdev_flush_mapping(part);
         whole->bd_disk->open_partitions--;
         blkdev_put_whole(whole, mode);
- -      bdput(whole);
   }
   
   struct block_device *blkdev_get_no_open(dev_t dev)
   {
         struct block_device *bdev;
- -      struct gendisk *disk;
+ +      struct inode *inode;
   
- -      bdev = bdget(dev);
- -      if (!bdev) {
+ +      inode = ilookup(blockdev_superblock, dev);
+ +      if (!inode) {
                 blk_request_module(dev);
- -              bdev = bdget(dev);
- -              if (!bdev)
+ +              inode = ilookup(blockdev_superblock, dev);
+ +              if (!inode)
                         return NULL;
         }
   
- -      disk = bdev->bd_disk;
- -      if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj))
- -              goto bdput;
- -      if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP)
- -              goto put_disk;
- -      if (!try_module_get(bdev->bd_disk->fops->owner))
- -              goto put_disk;
+ +      /* switch from the inode reference to a device mode one: */
+ +      bdev = &BDEV_I(inode)->bdev;
+ +      if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
+ +              bdev = NULL;
+ +      iput(inode);
+ +
+ +      if (!bdev)
+ +              return NULL;
+ +      if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
+ +          !try_module_get(bdev->bd_disk->fops->owner)) {
+ +              put_device(&bdev->bd_device);
+ +              return NULL;
+ +      }
+ +
         return bdev;
- -put_disk:
- -      put_disk(disk);
- -bdput:
- -      bdput(bdev);
- -      return NULL;
   }
   
   void blkdev_put_no_open(struct block_device *bdev)
   {
         module_put(bdev->bd_disk->fops->owner);
- -      put_disk(bdev->bd_disk);
- -      bdput(bdev);
+ +      put_device(&bdev->bd_device);
   }
   
   /**
@@@ -1224,7 -1411,7 +1226,7 @@@ struct block_device *blkdev_get_by_dev(
   
         mutex_lock(&disk->open_mutex);
         ret = -ENXIO;
- -      if (!(disk->flags & GENHD_FL_UP))
+ +      if (!disk_live(disk))
                 goto abort_claiming;
         if (bdev_is_partition(bdev))
                 ret = blkdev_get_part(bdev, mode);
diff --combined fs/io_uring.c

index 7cc458e,504aede..73928d9
--- 1/fs/io_uring.c
--- 2/fs/io_uring.c
+++ b/fs/io_uring.c
@@@ -92,12 -92,12 +92,12 @@@
   #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
   #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
   
- -/* 512 entries per page on 64-bit archs, 64 pages max */
+ +/* only define max */
   #define IORING_MAX_FIXED_FILES        (1U << 15)
   #define IORING_MAX_RESTRICTIONS       (IORING_RESTRICTION_LAST + \
                                  IORING_REGISTER_LAST + IORING_OP_LAST)
   
- -#define IO_RSRC_TAG_TABLE_SHIFT       9
+ +#define IO_RSRC_TAG_TABLE_SHIFT       (PAGE_SHIFT - 3)
   #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
   #define IO_RSRC_TAG_TABLE_MASK        (IO_RSRC_TAG_TABLE_MAX - 1)
   
@@@ -375,7 -375,6 +375,7 @@@ struct io_ring_ctx 
   
                 struct io_submit_state  submit_state;
                 struct list_head        timeout_list;
+ +              struct list_head        ltimeout_list;
                 struct list_head        cq_overflow_list;
                 struct xarray           io_buffers;
                 struct xarray           personalities;
@@@ -509,7 -508,6 +509,7 @@@ struct io_timeout_data 
         struct hrtimer                  timer;
         struct timespec64               ts;
         enum hrtimer_mode               mode;
+ +      u32                             flags;
   };
   
   struct io_accept {
@@@ -517,7 -515,6 +517,7 @@@
         struct sockaddr __user          *addr;
         int __user                      *addr_len;
         int                             flags;
+ +      u32                             file_slot;
         unsigned long                   nofile;
   };
   
@@@ -552,7 -549,6 +552,7 @@@ struct io_timeout_rem 
         /* timeout update */
         struct timespec64               ts;
         u32                             flags;
+ +      bool                            ltimeout;
   };
   
   struct io_rw {
@@@ -584,7 -580,6 +584,7 @@@ struct io_sr_msg 
   struct io_open {
         struct file                     *file;
         int                             dfd;
+ +      u32                             file_slot;
         struct filename                 *filename;
         struct open_how                 how;
         unsigned long                   nofile;
@@@ -710,12 -705,12 +710,12 @@@ enum 
         REQ_F_NEED_CLEANUP_BIT,
         REQ_F_POLLED_BIT,
         REQ_F_BUFFER_SELECTED_BIT,
- -      REQ_F_LTIMEOUT_ACTIVE_BIT,
         REQ_F_COMPLETE_INLINE_BIT,
         REQ_F_REISSUE_BIT,
         REQ_F_DONT_REISSUE_BIT,
         REQ_F_CREDS_BIT,
         REQ_F_REFCOUNT_BIT,
+ +      REQ_F_ARM_LTIMEOUT_BIT,
         /* keep async read/write and isreg together and in order */
         REQ_F_NOWAIT_READ_BIT,
         REQ_F_NOWAIT_WRITE_BIT,
@@@ -755,6 -750,8 +755,6 @@@ enum 
         REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
         /* buffer already selected */
         REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
- -      /* linked timeout is active, i.e. prepared by link's head */
- -      REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
         /* completion is deferred through io_comp_state */
         REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
         /* caller should reissue async */
@@@ -771,8 -768,6 +771,8 @@@
         REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
         /* skip refcounting if not set */
         REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
+ +      /* there is a linked timeout that has to be armed */
+ +      REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
   };
   
   struct async_poll {
@@@ -780,7 -775,7 +780,7 @@@
         struct io_poll_iocb     *double_poll;
   };
   
- -typedef void (*io_req_tw_func_t)(struct io_kiocb *req);
+ +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
   
   struct io_task_work {
         union {
@@@ -1039,9 -1034,6 +1039,9 @@@ static const struct io_op_def io_op_def
         [IORING_OP_UNLINKAT] = {},
   };
   
+ +/* requests with any of those set should undergo io_disarm_next() */
+ +#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
+ +
   static bool io_disarm_next(struct io_kiocb *req);
   static void io_uring_del_tctx_node(unsigned long index);
   static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
@@@ -1068,10 -1060,6 +1068,10 @@@ static void io_req_task_queue(struct io
   static void io_submit_flush_completions(struct io_ring_ctx *ctx);
   static int io_req_prep_async(struct io_kiocb *req);
   
+ +static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
+ +                               unsigned int issue_flags, u32 slot_index);
+ +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
+ +
   static struct kmem_cache *req_cachep;
   
   static const struct file_operations io_uring_fops;
@@@ -1089,14 -1077,6 +1089,14 @@@ struct sock *io_uring_get_socket(struc
   }
   EXPORT_SYMBOL(io_uring_get_socket);
   
+ +static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
+ +{
+ +      if (!*locked) {
+ +              mutex_lock(&ctx->uring_lock);
+ +              *locked = true;
+ +      }
+ +}
+ +
   #define io_for_each_link(pos, head) \
         for (pos = (head); pos; pos = pos->link)
   
@@@ -1135,19 -1115,14 +1135,19 @@@ static inline void req_ref_get(struct i
         atomic_inc(&req->refs);
   }
   
- -static inline void io_req_refcount(struct io_kiocb *req)
+ +static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
   {
         if (!(req->flags & REQ_F_REFCOUNT)) {
                 req->flags |= REQ_F_REFCOUNT;
- -              atomic_set(&req->refs, 1);
+ +              atomic_set(&req->refs, nr);
         }
   }
   
+ +static inline void io_req_set_refcount(struct io_kiocb *req)
+ +{
+ +      __io_req_set_refcount(req, 1);
+ +}
+ +
   static inline void io_req_set_rsrc_node(struct io_kiocb *req)
   {
         struct io_ring_ctx *ctx = req->ctx;
@@@ -1192,12 -1167,6 +1192,12 @@@ static inline void req_set_fail(struct 
         req->flags |= REQ_F_FAIL;
   }
   
+ +static inline void req_fail_link_node(struct io_kiocb *req, int res)
+ +{
+ +      req_set_fail(req);
+ +      req->result = res;
+ +}
+ +
   static void io_ring_ctx_ref_free(struct percpu_ref *ref)
   {
         struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@@ -1216,19 -1185,11 +1216,19 @@@ static void io_fallback_req_func(struc
                                                 fallback_work.work);
         struct llist_node *node = llist_del_all(&ctx->fallback_llist);
         struct io_kiocb *req, *tmp;
+ +      bool locked = false;
   
         percpu_ref_get(&ctx->refs);
         llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
- -              req->io_task_work.func(req);
+ +              req->io_task_work.func(req, &locked);
+ +
+ +      if (locked) {
+ +              if (ctx->submit_state.compl_nr)
+ +                      io_submit_flush_completions(ctx);
+ +              mutex_unlock(&ctx->uring_lock);
+ +      }
         percpu_ref_put(&ctx->refs);
+ +
   }
   
   static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
@@@ -1280,7 -1241,6 +1280,7 @@@
         INIT_LIST_HEAD(&ctx->iopoll_list);
         INIT_LIST_HEAD(&ctx->defer_list);
         INIT_LIST_HEAD(&ctx->timeout_list);
+ +      INIT_LIST_HEAD(&ctx->ltimeout_list);
         spin_lock_init(&ctx->rsrc_ref_lock);
         INIT_LIST_HEAD(&ctx->rsrc_ref_list);
         INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
@@@ -1338,28 -1298,27 +1338,28 @@@ static void io_req_track_inflight(struc
         }
   }
   
- -static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
+ +static inline void io_unprep_linked_timeout(struct io_kiocb *req)
   {
- -      struct io_kiocb *nxt = req->link;
+ +      req->flags &= ~REQ_F_LINK_TIMEOUT;
+ +}
   
- -      if (req->flags & REQ_F_LINK_TIMEOUT)
+ +static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
+ +{
+ +      if (WARN_ON_ONCE(!req->link))
                 return NULL;
   
- -      /* linked timeouts should have two refs once prep'ed */
- -      io_req_refcount(req);
- -      io_req_refcount(nxt);
- -      req_ref_get(nxt);
- -
- -      nxt->timeout.head = req;
- -      nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
+ +      req->flags &= ~REQ_F_ARM_LTIMEOUT;
         req->flags |= REQ_F_LINK_TIMEOUT;
- -      return nxt;
+ +
+ +      /* linked timeouts should have two refs once prep'ed */
+ +      io_req_set_refcount(req);
+ +      __io_req_set_refcount(req->link, 2);
+ +      return req->link;
   }
   
   static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
   {
- -      if (likely(!req->link || req->link->opcode != IORING_OP_LINK_TIMEOUT))
+ +      if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
                 return NULL;
         return __io_prep_linked_timeout(req);
   }
@@@ -1413,15 -1372,12 +1413,15 @@@ static void io_prep_async_link(struct i
         }
   }
   
- -static void io_queue_async_work(struct io_kiocb *req)
+ +static void io_queue_async_work(struct io_kiocb *req, bool *locked)
   {
         struct io_ring_ctx *ctx = req->ctx;
         struct io_kiocb *link = io_prep_linked_timeout(req);
         struct io_uring_task *tctx = req->task->io_uring;
   
+ +      /* must not take the lock, NULL it as a precaution */
+ +      locked = NULL;
+ +
         BUG_ON(!tctx);
         BUG_ON(!tctx->io_wq);
   
@@@ -1561,13 -1517,6 +1561,13 @@@ static inline bool io_should_trigger_ev
         return !ctx->eventfd_async || io_wq_current_is_worker();
   }
   
+ +/*
+ + * This should only get called when at least one event has been posted.
+ + * Some applications rely on the eventfd notification count only changing
+ + * IFF a new CQE has been added to the CQ ring. There's no dependency on
+ + * 1:1 relationship between how many times this function is called (and
+ + * hence the eventfd count) and number of CQEs posted to the CQ ring.
+ + */
   static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
   {
         /*
@@@ -1665,32 -1614,10 +1665,32 @@@ static inline void io_put_task(struct t
   {
         struct io_uring_task *tctx = task->io_uring;
   
- -      percpu_counter_sub(&tctx->inflight, nr);
- -      if (unlikely(atomic_read(&tctx->in_idle)))
- -              wake_up(&tctx->wait);
- -      put_task_struct_many(task, nr);
+ +      if (likely(task == current)) {
+ +              tctx->cached_refs += nr;
+ +      } else {
+ +              percpu_counter_sub(&tctx->inflight, nr);
+ +              if (unlikely(atomic_read(&tctx->in_idle)))
+ +                      wake_up(&tctx->wait);
+ +              put_task_struct_many(task, nr);
+ +      }
+ +}
+ +
+ +static void io_task_refs_refill(struct io_uring_task *tctx)
+ +{
+ +      unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
+ +
+ +      percpu_counter_add(&tctx->inflight, refill);
+ +      refcount_add(refill, &current->usage);
+ +      tctx->cached_refs += refill;
+ +}
+ +
+ +static inline void io_get_task_refs(int nr)
+ +{
+ +      struct io_uring_task *tctx = current->io_uring;
+ +
+ +      tctx->cached_refs -= nr;
+ +      if (unlikely(tctx->cached_refs < 0))
+ +              io_task_refs_refill(tctx);
   }
   
   static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
@@@ -1763,7 -1690,7 +1763,7 @@@ static void io_req_complete_post(struc
          */
         if (req_ref_put_and_test(req)) {
                 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
- -                      if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL))
+ +                      if (req->flags & IO_DISARM_MASK)
                                 io_disarm_next(req);
                         if (req->link) {
                                 io_req_task_queue(req->link);
@@@ -1964,13 -1891,16 +1964,13 @@@ static bool io_kill_linked_timeout(stru
   {
         struct io_kiocb *link = req->link;
   
- -      /*
- -       * Can happen if a linked timeout fired and link had been like
- -       * req -> link t-out -> link t-out [-> ...]
- -       */
- -      if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
+ +      if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
                 struct io_timeout_data *io = link->async_data;
   
                 io_remove_next_linked(req);
                 link->timeout.head = NULL;
                 if (hrtimer_try_to_cancel(&io->timer) != -1) {
+ +                      list_del(&link->timeout.list);
                         io_cqring_fill_event(link->ctx, link->user_data,
                                              -ECANCELED, 0);
                         io_put_req_deferred(link);
@@@ -1987,16 -1917,11 +1987,16 @@@ static void io_fail_links(struct io_kio
   
         req->link = NULL;
         while (link) {
+ +              long res = -ECANCELED;
+ +
+ +              if (link->flags & REQ_F_FAIL)
+ +                      res = link->result;
+ +
                 nxt = link->link;
                 link->link = NULL;
   
                 trace_io_uring_fail_link(req, link);
- -              io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0);
+ +              io_cqring_fill_event(link->ctx, link->user_data, res, 0);
                 io_put_req_deferred(link);
                 link = nxt;
         }
@@@ -2007,18 -1932,7 +2007,18 @@@ static bool io_disarm_next(struct io_ki
   {
         bool posted = false;
   
- -      if (likely(req->flags & REQ_F_LINK_TIMEOUT)) {
+ +      if (req->flags & REQ_F_ARM_LTIMEOUT) {
+ +              struct io_kiocb *link = req->link;
+ +
+ +              req->flags &= ~REQ_F_ARM_LTIMEOUT;
+ +              if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
+ +                      io_remove_next_linked(req);
+ +                      io_cqring_fill_event(link->ctx, link->user_data,
+ +                                           -ECANCELED, 0);
+ +                      io_put_req_deferred(link);
+ +                      posted = true;
+ +              }
+ +      } else if (req->flags & REQ_F_LINK_TIMEOUT) {
                 struct io_ring_ctx *ctx = req->ctx;
   
                 spin_lock_irq(&ctx->timeout_lock);
@@@ -2043,7 -1957,7 +2043,7 @@@ static struct io_kiocb *__io_req_find_n
          * dependencies to the next request. In case of failure, fail the rest
          * of the chain.
          */
- -      if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) {
+ +      if (req->flags & IO_DISARM_MASK) {
                 struct io_ring_ctx *ctx = req->ctx;
                 bool posted;
   
@@@ -2067,22 -1981,20 +2067,22 @@@ static inline struct io_kiocb *io_req_f
         return __io_req_find_next(req);
   }
   
- -static void ctx_flush_and_put(struct io_ring_ctx *ctx)
+ +static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
   {
         if (!ctx)
                 return;
- -      if (ctx->submit_state.compl_nr) {
- -              mutex_lock(&ctx->uring_lock);
- -              io_submit_flush_completions(ctx);
+ +      if (*locked) {
+ +              if (ctx->submit_state.compl_nr)
+ +                      io_submit_flush_completions(ctx);
                 mutex_unlock(&ctx->uring_lock);
+ +              *locked = false;
         }
         percpu_ref_put(&ctx->refs);
   }
   
   static void tctx_task_work(struct callback_head *cb)
   {
+ +      bool locked = false;
         struct io_ring_ctx *ctx = NULL;
         struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
                                                   task_work);
@@@ -2105,20 -2017,18 +2105,20 @@@
                                                             io_task_work.node);
   
                         if (req->ctx != ctx) {
- -                              ctx_flush_and_put(ctx);
+ +                              ctx_flush_and_put(ctx, &locked);
                                 ctx = req->ctx;
+ +                              /* if not contended, grab and improve batching */
+ +                              locked = mutex_trylock(&ctx->uring_lock);
                                 percpu_ref_get(&ctx->refs);
                         }
- -                      req->io_task_work.func(req);
+ +                      req->io_task_work.func(req, &locked);
                         node = next;
                 } while (node);
   
                 cond_resched();
         }
   
- -      ctx_flush_and_put(ctx);
+ +      ctx_flush_and_put(ctx, &locked);
   }
   
   static void io_req_task_work_add(struct io_kiocb *req)
@@@ -2170,25 -2080,27 +2170,25 @@@
         }
   }
   
- -static void io_req_task_cancel(struct io_kiocb *req)
+ +static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
   {
         struct io_ring_ctx *ctx = req->ctx;
   
- -      /* ctx is guaranteed to stay alive while we hold uring_lock */
- -      mutex_lock(&ctx->uring_lock);
+ +      /* not needed for normal modes, but SQPOLL depends on it */
+ +      io_tw_lock(ctx, locked);
         io_req_complete_failed(req, req->result);
- -      mutex_unlock(&ctx->uring_lock);
   }
   
- -static void io_req_task_submit(struct io_kiocb *req)
+ +static void io_req_task_submit(struct io_kiocb *req, bool *locked)
   {
         struct io_ring_ctx *ctx = req->ctx;
   
- -      /* ctx stays valid until unlock, even if we drop all ours ctx->refs */
- -      mutex_lock(&ctx->uring_lock);
+ +      io_tw_lock(ctx, locked);
+ +      /* req->task == current here, checking PF_EXITING is safe */
         if (likely(!(req->task->flags & PF_EXITING)))
                 __io_queue_sqe(req);
         else
                 io_req_complete_failed(req, -EFAULT);
- -      mutex_unlock(&ctx->uring_lock);
   }
   
   static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
@@@ -2224,11 -2136,6 +2224,11 @@@ static void io_free_req(struct io_kioc
         __io_free_req(req);
   }
   
+ +static void io_free_req_work(struct io_kiocb *req, bool *locked)
+ +{
+ +      io_free_req(req);
+ +}
+ +
   struct req_batch {
         struct task_struct      *task;
         int                     task_refs;
@@@ -2247,7 -2154,9 +2247,7 @@@ static void io_req_free_batch_finish(st
   {
         if (rb->ctx_refs)
                 percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
- -      if (rb->task == current)
- -              current->io_uring->cached_refs += rb->task_refs;
- -      else if (rb->task)
+ +      if (rb->task)
                 io_put_task(rb->task, rb->task_refs);
   }
   
@@@ -2273,7 -2182,7 +2273,7 @@@ static void io_req_free_batch(struct re
   }
   
   static void io_submit_flush_completions(struct io_ring_ctx *ctx)
- -      __must_hold(&req->ctx->uring_lock)
+ +      __must_hold(&ctx->uring_lock)
   {
         struct io_submit_state *state = &ctx->submit_state;
         int i, nr = state->compl_nr;
@@@ -2326,7 -2235,7 +2326,7 @@@ static inline void io_put_req(struct io
   static inline void io_put_req_deferred(struct io_kiocb *req)
   {
         if (req_ref_put_and_test(req)) {
- -              req->io_task_work.func = io_free_req;
+ +              req->io_task_work.func = io_free_req_work;
                 io_req_task_work_add(req);
         }
   }
@@@ -2361,8 -2270,6 +2361,8 @@@ static inline unsigned int io_put_rw_kb
   {
         struct io_buffer *kbuf;
   
+ +      if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ +              return 0;
         kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
         return io_put_kbuf(req, kbuf);
   }
@@@ -2382,7 -2289,7 +2382,7 @@@ static inline bool io_run_task_work(voi
    * Find and free completed poll iocbs
    */
   static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
- -                             struct list_head *done, bool resubmit)
+ +                             struct list_head *done)
   {
         struct req_batch rb;
         struct io_kiocb *req;
@@@ -2392,18 -2299,22 +2392,18 @@@
   
         io_init_req_batch(&rb);
         while (!list_empty(done)) {
- -              int cflags = 0;
- -
                 req = list_first_entry(done, struct io_kiocb, inflight_entry);
                 list_del(&req->inflight_entry);
   
- -              if (READ_ONCE(req->result) == -EAGAIN && resubmit &&
+ +              if (READ_ONCE(req->result) == -EAGAIN &&
                     !(req->flags & REQ_F_DONT_REISSUE)) {
                         req->iopoll_completed = 0;
                         io_req_task_queue_reissue(req);
                         continue;
                 }
   
- -              if (req->flags & REQ_F_BUFFER_SELECTED)
- -                      cflags = io_put_rw_kbuf(req);
- -
- -              __io_cqring_fill_event(ctx, req->user_data, req->result, cflags);
+ +              __io_cqring_fill_event(ctx, req->user_data, req->result,
+ +                                      io_put_rw_kbuf(req));
                 (*nr_events)++;
   
                 if (req_ref_put_and_test(req))
@@@ -2416,7 -2327,7 +2416,7 @@@
   }
   
   static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
- -                      long min, bool resubmit)
+ +                      long min)
   {
         struct io_kiocb *req, *tmp;
         LIST_HEAD(done);
@@@ -2456,7 -2367,7 +2456,7 @@@
         }
   
         if (!list_empty(&done))
- -              io_iopoll_complete(ctx, nr_events, &done, resubmit);
+ +              io_iopoll_complete(ctx, nr_events, &done);
   
         return 0;
   }
@@@ -2474,7 -2385,7 +2474,7 @@@ static void io_iopoll_try_reap_events(s
         while (!list_empty(&ctx->iopoll_list)) {
                 unsigned int nr_events = 0;
   
- -              io_do_iopoll(ctx, &nr_events, 0, false);
+ +              io_do_iopoll(ctx, &nr_events, 0);
   
                 /* let it sleep and repeat later if can't complete a request */
                 if (nr_events == 0)
@@@ -2536,7 -2447,7 +2536,7 @@@ static int io_iopoll_check(struct io_ri
                             list_empty(&ctx->iopoll_list))
                                 break;
                 }
- -              ret = io_do_iopoll(ctx, &nr_events, min, true);
+ +              ret = io_do_iopoll(ctx, &nr_events, min);
         } while (!ret && nr_events < min && !need_resched());
   out:
         mutex_unlock(&ctx->uring_lock);
@@@ -2621,22 -2532,13 +2621,22 @@@ static bool __io_complete_rw_common(str
         return false;
   }
   
- -static void io_req_task_complete(struct io_kiocb *req)
+ +static void io_req_task_complete(struct io_kiocb *req, bool *locked)
   {
- -      int cflags = 0;
+ +      unsigned int cflags = io_put_rw_kbuf(req);
+ +      long res = req->result;
   
- -      if (req->flags & REQ_F_BUFFER_SELECTED)
- -              cflags = io_put_rw_kbuf(req);
- -      __io_req_complete(req, 0, req->result, cflags);
+ +      if (*locked) {
+ +              struct io_ring_ctx *ctx = req->ctx;
+ +              struct io_submit_state *state = &ctx->submit_state;
+ +
+ +              io_req_complete_state(req, res, cflags);
+ +              state->compl_reqs[state->compl_nr++] = req;
+ +              if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
+ +                      io_submit_flush_completions(ctx);
+ +      } else {
+ +              io_req_complete_post(req, res, cflags);
+ +      }
   }
   
   static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
@@@ -2644,7 -2546,7 +2644,7 @@@
   {
         if (__io_complete_rw_common(req, res))
                 return;
- -      io_req_task_complete(req);
+ +      __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req));
   }
   
   static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@@ -2835,7 -2737,7 +2835,7 @@@ static int io_prep_rw(struct io_kiocb *
                     !kiocb->ki_filp->f_op->iopoll)
                         return -EOPNOTSUPP;
   
-               kiocb->ki_flags |= IOCB_HIPRI;
+               kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
                 kiocb->ki_complete = io_complete_rw_iopoll;
                 req->iopoll_completed = 0;
         } else {
@@@ -2904,9 -2806,12 +2904,9 @@@ static void kiocb_done(struct kiocb *ki
                 if (io_resubmit_prep(req)) {
                         io_req_task_queue_reissue(req);
                 } else {
- -                      int cflags = 0;
- -
                         req_set_fail(req);
- -                      if (req->flags & REQ_F_BUFFER_SELECTED)
- -                              cflags = io_put_rw_kbuf(req);
- -                      __io_req_complete(req, issue_flags, ret, cflags);
+ +                      __io_req_complete(req, issue_flags, ret,
+ +                                        io_put_rw_kbuf(req));
                 }
         }
   }
@@@ -3588,7 -3493,7 +3588,7 @@@ static int io_renameat_prep(struct io_k
   
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->buf_index)
+ +      if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                 return -EINVAL;
         if (unlikely(req->flags & REQ_F_FIXED_FILE))
                 return -EBADF;
@@@ -3639,8 -3544,7 +3639,8 @@@ static int io_unlinkat_prep(struct io_k
   
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+ +      if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
+ +          sqe->splice_fd_in)
                 return -EINVAL;
         if (unlikely(req->flags & REQ_F_FIXED_FILE))
                 return -EBADF;
@@@ -3686,8 -3590,8 +3686,8 @@@ static int io_shutdown_prep(struct io_k
   #if defined(CONFIG_NET)
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
- -          sqe->buf_index)
+ +      if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ +                   sqe->buf_index || sqe->splice_fd_in))
                 return -EINVAL;
   
         req->shutdown.how = READ_ONCE(sqe->len);
@@@ -3835,8 -3739,7 +3835,8 @@@ static int io_fsync_prep(struct io_kioc
   
         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
- -      if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+ +      if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
+ +                   sqe->splice_fd_in))
                 return -EINVAL;
   
         req->sync.flags = READ_ONCE(sqe->fsync_flags);
@@@ -3869,8 -3772,7 +3869,8 @@@ static int io_fsync(struct io_kiocb *re
   static int io_fallocate_prep(struct io_kiocb *req,
                              const struct io_uring_sqe *sqe)
   {
- -      if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
+ +      if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
+ +          sqe->splice_fd_in)
                 return -EINVAL;
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
@@@ -3920,11 -3822,6 +3920,11 @@@ static int __io_openat_prep(struct io_k
                 req->open.filename = NULL;
                 return ret;
         }
+ +
+ +      req->open.file_slot = READ_ONCE(sqe->file_index);
+ +      if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
+ +              return -EINVAL;
+ +
         req->open.nofile = rlimit(RLIMIT_NOFILE);
         req->flags |= REQ_F_NEED_CLEANUP;
         return 0;
@@@ -3962,8 -3859,8 +3962,8 @@@ static int io_openat2(struct io_kiocb *
   {
         struct open_flags op;
         struct file *file;
- -      bool nonblock_set;
- -      bool resolve_nonblock;
+ +      bool resolve_nonblock, nonblock_set;
+ +      bool fixed = !!req->open.file_slot;
         int ret;
   
         ret = build_open_flags(&req->open.how, &op);
@@@ -3982,11 -3879,9 +3982,11 @@@
                 op.open_flag |= O_NONBLOCK;
         }
   
- -      ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
- -      if (ret < 0)
- -              goto err;
+ +      if (!fixed) {
+ +              ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
+ +              if (ret < 0)
+ +                      goto err;
+ +      }
   
         file = do_filp_open(req->open.dfd, req->open.filename, &op);
         if (IS_ERR(file)) {
@@@ -3995,8 -3890,7 +3995,8 @@@
                  * marginal gain for something that is now known to be a slower
                  * path. So just put it, and we'll get a new one when we retry.
                  */
- -              put_unused_fd(ret);
+ +              if (!fixed)
+ +                      put_unused_fd(ret);
   
                 ret = PTR_ERR(file);
                 /* only retry if RESOLVE_CACHED wasn't already set by application */
@@@ -4009,12 -3903,7 +4009,12 @@@
         if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
                 file->f_flags &= ~O_NONBLOCK;
         fsnotify_open(file);
- -      fd_install(ret, file);
+ +
+ +      if (!fixed)
+ +              fd_install(ret, file);
+ +      else
+ +              ret = io_install_fixed_file(req, file, issue_flags,
+ +                                          req->open.file_slot - 1);
   err:
         putname(req->open.filename);
         req->flags &= ~REQ_F_NEED_CLEANUP;
@@@ -4035,8 -3924,7 +4035,8 @@@ static int io_remove_buffers_prep(struc
         struct io_provide_buf *p = &req->pbuf;
         u64 tmp;
   
- -      if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
+ +      if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
+ +          sqe->splice_fd_in)
                 return -EINVAL;
   
         tmp = READ_ONCE(sqe->fd);
@@@ -4107,7 -3995,7 +4107,7 @@@ static int io_provide_buffers_prep(stru
         struct io_provide_buf *p = &req->pbuf;
         u64 tmp;
   
- -      if (sqe->ioprio || sqe->rw_flags)
+ +      if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
                 return -EINVAL;
   
         tmp = READ_ONCE(sqe->fd);
@@@ -4194,7 -4082,7 +4194,7 @@@ static int io_epoll_ctl_prep(struct io_
                              const struct io_uring_sqe *sqe)
   {
   #if defined(CONFIG_EPOLL)
- -      if (sqe->ioprio || sqe->buf_index)
+ +      if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                 return -EINVAL;
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
@@@ -4240,7 -4128,7 +4240,7 @@@ static int io_epoll_ctl(struct io_kioc
   static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   {
   #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- -      if (sqe->ioprio || sqe->buf_index || sqe->off)
+ +      if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
                 return -EINVAL;
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
@@@ -4275,7 -4163,7 +4275,7 @@@ static int io_madvise(struct io_kiocb *
   
   static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
   {
- -      if (sqe->ioprio || sqe->buf_index || sqe->addr)
+ +      if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
                 return -EINVAL;
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
@@@ -4313,7 -4201,7 +4313,7 @@@ static int io_statx_prep(struct io_kioc
   {
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->buf_index)
+ +      if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                 return -EINVAL;
         if (req->flags & REQ_F_FIXED_FILE)
                 return -EBADF;
@@@ -4349,7 -4237,7 +4349,7 @@@ static int io_close_prep(struct io_kioc
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
         if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
- -          sqe->rw_flags || sqe->buf_index)
+ +          sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
                 return -EINVAL;
         if (req->flags & REQ_F_FIXED_FILE)
                 return -EBADF;
@@@ -4410,8 -4298,7 +4410,8 @@@ static int io_sfr_prep(struct io_kiocb 
   
         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
- -      if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
+ +      if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
+ +                   sqe->splice_fd_in))
                 return -EINVAL;
   
         req->sync.off = READ_ONCE(sqe->off);
@@@ -4845,15 -4732,6 +4845,15 @@@ static int io_accept_prep(struct io_kio
         accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
         accept->flags = READ_ONCE(sqe->accept_flags);
         accept->nofile = rlimit(RLIMIT_NOFILE);
+ +
+ +      accept->file_slot = READ_ONCE(sqe->file_index);
+ +      if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
+ +                                (accept->flags & SOCK_CLOEXEC)))
+ +              return -EINVAL;
+ +      if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ +              return -EINVAL;
+ +      if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
+ +              accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
         return 0;
   }
   
@@@ -4862,35 -4740,20 +4862,35 @@@ static int io_accept(struct io_kiocb *r
         struct io_accept *accept = &req->accept;
         bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
         unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
- -      int ret;
+ +      bool fixed = !!accept->file_slot;
+ +      struct file *file;
+ +      int ret, fd;
   
         if (req->file->f_flags & O_NONBLOCK)
                 req->flags |= REQ_F_NOWAIT;
   
- -      ret = __sys_accept4_file(req->file, file_flags, accept->addr,
- -                                      accept->addr_len, accept->flags,
- -                                      accept->nofile);
- -      if (ret == -EAGAIN && force_nonblock)
- -              return -EAGAIN;
- -      if (ret < 0) {
+ +      if (!fixed) {
+ +              fd = __get_unused_fd_flags(accept->flags, accept->nofile);
+ +              if (unlikely(fd < 0))
+ +                      return fd;
+ +      }
+ +      file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
+ +                       accept->flags);
+ +      if (IS_ERR(file)) {
+ +              if (!fixed)
+ +                      put_unused_fd(fd);
+ +              ret = PTR_ERR(file);
+ +              if (ret == -EAGAIN && force_nonblock)
+ +                      return -EAGAIN;
                 if (ret == -ERESTARTSYS)
                         ret = -EINTR;
                 req_set_fail(req);
+ +      } else if (!fixed) {
+ +              fd_install(fd, file);
+ +              ret = fd;
+ +      } else {
+ +              ret = io_install_fixed_file(req, file, issue_flags,
+ +                                          accept->file_slot - 1);
         }
         __io_req_complete(req, issue_flags, ret, 0);
         return 0;
@@@ -4910,8 -4773,7 +4910,8 @@@ static int io_connect_prep(struct io_ki
   
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
+ +      if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
+ +          sqe->splice_fd_in)
                 return -EINVAL;
   
         conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@@ -5024,7 -4886,6 +5024,7 @@@ static bool io_poll_rewait(struct io_ki
   {
         struct io_ring_ctx *ctx = req->ctx;
   
+ +      /* req->task == current here, checking PF_EXITING is safe */
         if (unlikely(req->task->flags & PF_EXITING))
                 WRITE_ONCE(poll->canceled, true);
   
@@@ -5103,7 -4964,7 +5103,7 @@@ static bool io_poll_complete(struct io_
         return !(flags & IORING_CQE_F_MORE);
   }
   
- -static void io_poll_task_func(struct io_kiocb *req)
+ +static void io_poll_task_func(struct io_kiocb *req, bool *locked)
   {
         struct io_ring_ctx *ctx = req->ctx;
         struct io_kiocb *nxt;
@@@ -5127,7 -4988,7 +5127,7 @@@
                 if (done) {
                         nxt = io_put_req_find_next(req);
                         if (nxt)
- -                              io_req_task_submit(nxt);
+ +                              io_req_task_submit(nxt, locked);
                 }
         }
   }
@@@ -5194,13 -5055,8 +5194,13 @@@ static void __io_queue_proc(struct io_p
         if (unlikely(pt->nr_entries)) {
                 struct io_poll_iocb *poll_one = poll;
   
+ +              /* double add on the same waitqueue head, ignore */
+ +              if (poll_one->head == head)
+ +                      return;
                 /* already have a 2nd entry, fail a third attempt */
                 if (*poll_ptr) {
+ +                      if ((*poll_ptr)->head == head)
+ +                              return;
                         pt->error = -EINVAL;
                         return;
                 }
@@@ -5210,6 -5066,9 +5210,6 @@@
                  */
                 if (!(poll_one->events & EPOLLONESHOT))
                         poll_one->events |= EPOLLONESHOT;
- -              /* double add on the same waitqueue head, ignore */
- -              if (poll_one->head == head)
- -                      return;
                 poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
                 if (!poll) {
                         pt->error = -ENOMEM;
@@@ -5239,7 -5098,7 +5239,7 @@@ static void io_async_queue_proc(struct 
         __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
   }
   
- -static void io_async_task_func(struct io_kiocb *req)
+ +static void io_async_task_func(struct io_kiocb *req, bool *locked)
   {
         struct async_poll *apoll = req->apoll;
         struct io_ring_ctx *ctx = req->ctx;
@@@ -5256,7 -5115,7 +5256,7 @@@
         spin_unlock(&ctx->completion_lock);
   
         if (!READ_ONCE(apoll->poll.canceled))
- -              io_req_task_submit(req);
+ +              io_req_task_submit(req, locked);
         else
                 io_req_complete_failed(req, -ECANCELED);
   }
@@@ -5374,14 -5233,17 +5374,14 @@@ static int io_arm_poll_handler(struct i
         req->apoll = apoll;
         req->flags |= REQ_F_POLLED;
         ipt.pt._qproc = io_async_queue_proc;
- -      io_req_refcount(req);
+ +      io_req_set_refcount(req);
   
         ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
                                         io_async_wake);
- -      if (ret || ipt.error) {
- -              spin_unlock(&ctx->completion_lock);
- -              if (ret)
- -                      return IO_APOLL_READY;
- -              return IO_APOLL_ABORTED;
- -      }
         spin_unlock(&ctx->completion_lock);
+ +      if (ret || ipt.error)
+ +              return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
+ +
         trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
                                 mask, apoll->poll.events);
         return IO_APOLL_OK;
@@@ -5507,7 -5369,7 +5507,7 @@@ static int io_poll_update_prep(struct i
   
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->buf_index)
+ +      if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
                 return -EINVAL;
         flags = READ_ONCE(sqe->len);
         if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
@@@ -5562,7 -5424,7 +5562,7 @@@ static int io_poll_add_prep(struct io_k
         if (flags & ~IORING_POLL_ADD_MULTI)
                 return -EINVAL;
   
- -      io_req_refcount(req);
+ +      io_req_set_refcount(req);
         poll->events = io_poll_parse_events(sqe, flags);
         return 0;
   }
@@@ -5655,10 -5517,18 +5655,10 @@@ err
         return 0;
   }
   
- -static void io_req_task_timeout(struct io_kiocb *req)
+ +static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
   {
- -      struct io_ring_ctx *ctx = req->ctx;
- -
- -      spin_lock(&ctx->completion_lock);
- -      io_cqring_fill_event(ctx, req->user_data, -ETIME, 0);
- -      io_commit_cqring(ctx);
- -      spin_unlock(&ctx->completion_lock);
- -
- -      io_cqring_ev_posted(ctx);
         req_set_fail(req);
- -      io_put_req(req);
+ +      io_req_complete_post(req, -ETIME, 0);
   }
   
   static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
@@@ -5704,7 -5574,6 +5704,7 @@@ static struct io_kiocb *io_timeout_extr
   }
   
   static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+ +      __must_hold(&ctx->completion_lock)
         __must_hold(&ctx->timeout_lock)
   {
         struct io_kiocb *req = io_timeout_extract(ctx, user_data);
@@@ -5718,47 -5587,6 +5718,47 @@@
         return 0;
   }
   
+ +static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
+ +{
+ +      switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
+ +      case IORING_TIMEOUT_BOOTTIME:
+ +              return CLOCK_BOOTTIME;
+ +      case IORING_TIMEOUT_REALTIME:
+ +              return CLOCK_REALTIME;
+ +      default:
+ +              /* can't happen, vetted at prep time */
+ +              WARN_ON_ONCE(1);
+ +              fallthrough;
+ +      case 0:
+ +              return CLOCK_MONOTONIC;
+ +      }
+ +}
+ +
+ +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
+ +                                  struct timespec64 *ts, enum hrtimer_mode mode)
+ +      __must_hold(&ctx->timeout_lock)
+ +{
+ +      struct io_timeout_data *io;
+ +      struct io_kiocb *req;
+ +      bool found = false;
+ +
+ +      list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
+ +              found = user_data == req->user_data;
+ +              if (found)
+ +                      break;
+ +      }
+ +      if (!found)
+ +              return -ENOENT;
+ +
+ +      io = req->async_data;
+ +      if (hrtimer_try_to_cancel(&io->timer) == -1)
+ +              return -EALREADY;
+ +      hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
+ +      io->timer.function = io_link_timeout_fn;
+ +      hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
+ +      return 0;
+ +}
+ +
   static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
                              struct timespec64 *ts, enum hrtimer_mode mode)
         __must_hold(&ctx->timeout_lock)
@@@ -5772,7 -5600,7 +5772,7 @@@
         req->timeout.off = 0; /* noseq */
         data = req->async_data;
         list_add_tail(&req->timeout.list, &ctx->timeout_list);
- -      hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode);
+ +      hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
         data->timer.function = io_timeout_fn;
         hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
         return 0;
@@@ -5787,18 -5615,13 +5787,18 @@@ static int io_timeout_remove_prep(struc
                 return -EINVAL;
         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->buf_index || sqe->len)
+ +      if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
                 return -EINVAL;
   
+ +      tr->ltimeout = false;
         tr->addr = READ_ONCE(sqe->addr);
         tr->flags = READ_ONCE(sqe->timeout_flags);
- -      if (tr->flags & IORING_TIMEOUT_UPDATE) {
- -              if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS))
+ +      if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
+ +              if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
+ +                      return -EINVAL;
+ +              if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
+ +                      tr->ltimeout = true;
+ +              if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
                         return -EINVAL;
                 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
                         return -EFAULT;
@@@ -5825,26 -5648,22 +5825,26 @@@ static int io_timeout_remove(struct io_
         struct io_ring_ctx *ctx = req->ctx;
         int ret;
   
- -      spin_lock_irq(&ctx->timeout_lock);
- -      if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
+ +      if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
+ +              spin_lock(&ctx->completion_lock);
+ +              spin_lock_irq(&ctx->timeout_lock);
                 ret = io_timeout_cancel(ctx, tr->addr);
- -      else
- -              ret = io_timeout_update(ctx, tr->addr, &tr->ts,
- -                                      io_translate_timeout_mode(tr->flags));
- -      spin_unlock_irq(&ctx->timeout_lock);
+ +              spin_unlock_irq(&ctx->timeout_lock);
+ +              spin_unlock(&ctx->completion_lock);
+ +      } else {
+ +              enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
+ +
+ +              spin_lock_irq(&ctx->timeout_lock);
+ +              if (tr->ltimeout)
+ +                      ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ +              else
+ +                      ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
+ +              spin_unlock_irq(&ctx->timeout_lock);
+ +      }
   
- -      spin_lock(&ctx->completion_lock);
- -      io_cqring_fill_event(ctx, req->user_data, ret, 0);
- -      io_commit_cqring(ctx);
- -      spin_unlock(&ctx->completion_lock);
- -      io_cqring_ev_posted(ctx);
         if (ret < 0)
                 req_set_fail(req);
- -      io_put_req(req);
+ +      io_req_complete_post(req, ret, 0);
         return 0;
   }
   
@@@ -5857,19 -5676,14 +5857,19 @@@ static int io_timeout_prep(struct io_ki
   
         if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
+ +      if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
+ +          sqe->splice_fd_in)
                 return -EINVAL;
         if (off && is_timeout_link)
                 return -EINVAL;
         flags = READ_ONCE(sqe->timeout_flags);
- -      if (flags & ~IORING_TIMEOUT_ABS)
+ +      if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
+ +              return -EINVAL;
+ +      /* more than one clock specified is invalid, obviously */
+ +      if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
                 return -EINVAL;
   
+ +      INIT_LIST_HEAD(&req->timeout.list);
         req->timeout.off = off;
         if (unlikely(off && !req->ctx->off_timeout_used))
                 req->ctx->off_timeout_used = true;
@@@ -5879,24 -5693,14 +5879,24 @@@
   
         data = req->async_data;
         data->req = req;
+ +      data->flags = flags;
   
         if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
                 return -EFAULT;
   
         data->mode = io_translate_timeout_mode(flags);
- -      hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
- -      if (is_timeout_link)
- -              io_req_track_inflight(req);
+ +      hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
+ +
+ +      if (is_timeout_link) {
+ +              struct io_submit_link *link = &req->ctx->submit_state.link;
+ +
+ +              if (!link->head)
+ +                      return -EINVAL;
+ +              if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
+ +                      return -EINVAL;
+ +              req->timeout.head = link->last;
+ +              link->last->flags |= REQ_F_ARM_LTIMEOUT;
+ +      }
         return 0;
   }
   
@@@ -5989,27 -5793,32 +5989,27 @@@ static int io_async_cancel_one(struct i
         return ret;
   }
   
- -static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
- -                                   struct io_kiocb *req, __u64 sqe_addr,
- -                                   int success_ret)
+ +static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
   {
+ +      struct io_ring_ctx *ctx = req->ctx;
         int ret;
   
+ +      WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
+ +
         ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- -      spin_lock(&ctx->completion_lock);
         if (ret != -ENOENT)
- -              goto done;
+ +              return ret;
+ +
+ +      spin_lock(&ctx->completion_lock);
         spin_lock_irq(&ctx->timeout_lock);
         ret = io_timeout_cancel(ctx, sqe_addr);
         spin_unlock_irq(&ctx->timeout_lock);
         if (ret != -ENOENT)
- -              goto done;
+ +              goto out;
         ret = io_poll_cancel(ctx, sqe_addr, false);
- -done:
- -      if (!ret)
- -              ret = success_ret;
- -      io_cqring_fill_event(ctx, req->user_data, ret, 0);
- -      io_commit_cqring(ctx);
+ +out:
         spin_unlock(&ctx->completion_lock);
- -      io_cqring_ev_posted(ctx);
- -
- -      if (ret < 0)
- -              req_set_fail(req);
+ +      return ret;
   }
   
   static int io_async_cancel_prep(struct io_kiocb *req,
@@@ -6019,8 -5828,7 +6019,8 @@@
                 return -EINVAL;
         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
+ +      if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
+ +          sqe->splice_fd_in)
                 return -EINVAL;
   
         req->cancel.addr = READ_ONCE(sqe->addr);
@@@ -6034,9 -5842,20 +6034,9 @@@ static int io_async_cancel(struct io_ki
         struct io_tctx_node *node;
         int ret;
   
- -      /* tasks should wait for their io-wq threads, so safe w/o sync */
- -      ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
- -      spin_lock(&ctx->completion_lock);
- -      if (ret != -ENOENT)
- -              goto done;
- -      spin_lock_irq(&ctx->timeout_lock);
- -      ret = io_timeout_cancel(ctx, sqe_addr);
- -      spin_unlock_irq(&ctx->timeout_lock);
+ +      ret = io_try_cancel_userdata(req, sqe_addr);
         if (ret != -ENOENT)
                 goto done;
- -      ret = io_poll_cancel(ctx, sqe_addr, false);
- -      if (ret != -ENOENT)
- -              goto done;
- -      spin_unlock(&ctx->completion_lock);
   
         /* slow path, try all io-wq's */
         io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
@@@ -6049,10 -5868,17 +6049,10 @@@
                         break;
         }
         io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
- -
- -      spin_lock(&ctx->completion_lock);
   done:
- -      io_cqring_fill_event(ctx, req->user_data, ret, 0);
- -      io_commit_cqring(ctx);
- -      spin_unlock(&ctx->completion_lock);
- -      io_cqring_ev_posted(ctx);
- -
         if (ret < 0)
                 req_set_fail(req);
- -      io_put_req(req);
+ +      io_req_complete_post(req, ret, 0);
         return 0;
   }
   
@@@ -6061,7 -5887,7 +6061,7 @@@ static int io_rsrc_update_prep(struct i
   {
         if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
                 return -EINVAL;
- -      if (sqe->ioprio || sqe->rw_flags)
+ +      if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
                 return -EINVAL;
   
         req->rsrc_update.offset = READ_ONCE(sqe->off);
@@@ -6267,7 -6093,7 +6267,7 @@@ fail
         if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
                 spin_unlock(&ctx->completion_lock);
                 kfree(de);
- -              io_queue_async_work(req);
+ +              io_queue_async_work(req, NULL);
                 return true;
         }
   
@@@ -6490,17 -6316,14 +6490,17 @@@ static void io_wq_submit_work(struct io
         struct io_kiocb *timeout;
         int ret = 0;
   
- -      io_req_refcount(req);
- -      /* will be dropped by ->io_free_work() after returning to io-wq */
- -      req_ref_get(req);
+ +      /* one will be dropped by ->io_free_work() after returning to io-wq */
+ +      if (!(req->flags & REQ_F_REFCOUNT))
+ +              __io_req_set_refcount(req, 2);
+ +      else
+ +              req_ref_get(req);
   
         timeout = io_prep_linked_timeout(req);
         if (timeout)
                 io_queue_linked_timeout(timeout);
   
+ +      /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
         if (work->flags & IO_WQ_WORK_CANCEL)
                 ret = -ECANCELED;
   
@@@ -6590,15 -6413,15 +6590,15 @@@ static inline struct file *io_file_get(
                 return io_file_get_normal(ctx, req, fd);
   }
   
- -static void io_req_task_link_timeout(struct io_kiocb *req)
+ +static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
   {
         struct io_kiocb *prev = req->timeout.prev;
- -      struct io_ring_ctx *ctx = req->ctx;
+ +      int ret;
   
         if (prev) {
- -              io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
+ +              ret = io_try_cancel_userdata(req, prev->user_data);
+ +              io_req_complete_post(req, ret ?: -ETIME, 0);
                 io_put_req(prev);
- -              io_put_req(req);
         } else {
                 io_req_complete_post(req, -ETIME, 0);
         }
@@@ -6625,7 -6448,6 +6625,7 @@@ static enum hrtimer_restart io_link_tim
                 if (!req_ref_inc_not_zero(prev))
                         prev = NULL;
         }
+ +      list_del(&req->timeout.list);
         req->timeout.prev = prev;
         spin_unlock_irqrestore(&ctx->timeout_lock, flags);
   
@@@ -6649,7 -6471,6 +6649,7 @@@ static void io_queue_linked_timeout(str
                 data->timer.function = io_link_timeout_fn;
                 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
                                 data->mode);
+ +              list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
         }
         spin_unlock_irq(&ctx->timeout_lock);
         /* drop submission reference */
@@@ -6659,7 -6480,7 +6659,7 @@@
   static void __io_queue_sqe(struct io_kiocb *req)
         __must_hold(&req->ctx->uring_lock)
   {
- -      struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ +      struct io_kiocb *linked_timeout;
         int ret;
   
   issue_sqe:
@@@ -6677,34 -6498,24 +6677,34 @@@
                         state->compl_reqs[state->compl_nr++] = req;
                         if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
                                 io_submit_flush_completions(ctx);
+ +                      return;
                 }
+ +
+ +              linked_timeout = io_prep_linked_timeout(req);
+ +              if (linked_timeout)
+ +                      io_queue_linked_timeout(linked_timeout);
         } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
+ +              linked_timeout = io_prep_linked_timeout(req);
+ +
                 switch (io_arm_poll_handler(req)) {
                 case IO_APOLL_READY:
+ +                      if (linked_timeout)
+ +                              io_unprep_linked_timeout(req);
                         goto issue_sqe;
                 case IO_APOLL_ABORTED:
                         /*
                          * Queued up for async execution, worker will release
                          * submit reference when the iocb is actually submitted.
                          */
- -                      io_queue_async_work(req);
+ +                      io_queue_async_work(req, NULL);
                         break;
                 }
+ +
+ +              if (linked_timeout)
+ +                      io_queue_linked_timeout(linked_timeout);
         } else {
                 io_req_complete_failed(req, ret);
         }
- -      if (linked_timeout)
- -              io_queue_linked_timeout(linked_timeout);
   }
   
   static inline void io_queue_sqe(struct io_kiocb *req)
@@@ -6713,17 -6524,15 +6713,17 @@@
         if (unlikely(req->ctx->drain_active) && io_drain_req(req))
                 return;
   
- -      if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) {
+ +      if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
                 __io_queue_sqe(req);
+ +      } else if (req->flags & REQ_F_FAIL) {
+ +              io_req_complete_failed(req, req->result);
         } else {
                 int ret = io_req_prep_async(req);
   
                 if (unlikely(ret))
                         io_req_complete_failed(req, ret);
                 else
- -                      io_queue_async_work(req);
+ +                      io_queue_async_work(req, NULL);
         }
   }
   
@@@ -6825,34 -6634,20 +6825,34 @@@ static int io_submit_sqe(struct io_ring
         ret = io_init_req(ctx, req, sqe);
         if (unlikely(ret)) {
   fail_req:
+ +              /* fail even hard links since we don't submit */
                 if (link->head) {
- -                      /* fail even hard links since we don't submit */
- -                      req_set_fail(link->head);
- -                      io_req_complete_failed(link->head, -ECANCELED);
- -                      link->head = NULL;
+ +                      /*
+ +                       * Whether a linked req failed or was cancelled can be
+ +                       * told from REQ_F_FAIL, but the head is an exception: it
+ +                       * may have REQ_F_FAIL set because of another req's
+ +                       * failure. So leverage req->result to distinguish whether
+ +                       * the head got REQ_F_FAIL from its own failure or from
+ +                       * another req's, so that the correct ret code is set for
+ +                       * it. Init result here to avoid affecting the normal path.
+ +                       */
+ +                      if (!(link->head->flags & REQ_F_FAIL))
+ +                              req_fail_link_node(link->head, -ECANCELED);
+ +              } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
+ +                      /*
+ +                       * the current req is a normal req, we should return
+ +                       * error and thus break the submission loop.
+ +                       */
+ +                      io_req_complete_failed(req, ret);
+ +                      return ret;
                 }
- -              io_req_complete_failed(req, ret);
- -              return ret;
+ +              req_fail_link_node(req, ret);
+ +      } else {
+ +              ret = io_req_prep(req, sqe);
+ +              if (unlikely(ret))
+ +                      goto fail_req;
         }
   
- -      ret = io_req_prep(req, sqe);
- -      if (unlikely(ret))
- -              goto fail_req;
- -
         /* don't need @sqe from now on */
         trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
                                   req->flags, true,
@@@ -6868,14 -6663,9 +6868,14 @@@
         if (link->head) {
                 struct io_kiocb *head = link->head;
   
- -              ret = io_req_prep_async(req);
- -              if (unlikely(ret))
- -                      goto fail_req;
+ +              if (!(req->flags & REQ_F_FAIL)) {
+ +                      ret = io_req_prep_async(req);
+ +                      if (unlikely(ret)) {
+ +                              req_fail_link_node(req, ret);
+ +                              if (!(head->flags & REQ_F_FAIL))
+ +                                      req_fail_link_node(head, -ECANCELED);
+ +                      }
+ +              }
                 trace_io_uring_link(ctx, req, head);
                 link->last->link = req;
                 link->last = req;
@@@ -6970,15 -6760,25 +6970,15 @@@ static const struct io_uring_sqe *io_ge
   static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
         __must_hold(&ctx->uring_lock)
   {
- -      struct io_uring_task *tctx;
         int submitted = 0;
   
         /* make sure SQ entry isn't read before tail */
         nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
         if (!percpu_ref_tryget_many(&ctx->refs, nr))
                 return -EAGAIN;
+ +      io_get_task_refs(nr);
   
- -      tctx = current->io_uring;
- -      tctx->cached_refs -= nr;
- -      if (unlikely(tctx->cached_refs < 0)) {
- -              unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
- -
- -              percpu_counter_add(&tctx->inflight, refill);
- -              refcount_add(refill, &current->usage);
- -              tctx->cached_refs += refill;
- -      }
         io_submit_state_start(&ctx->submit_state, nr);
- -
         while (submitted < nr) {
                 const struct io_uring_sqe *sqe;
                 struct io_kiocb *req;
@@@ -6991,7 -6791,7 +6991,7 @@@
                 }
                 sqe = io_get_sqe(ctx);
                 if (unlikely(!sqe)) {
- -                      kmem_cache_free(req_cachep, req);
+ +                      list_add(&req->inflight_entry, &ctx->submit_state.free_list);
                         break;
                 }
                 /* will complete beyond this point, count as submitted */
@@@ -7056,7 -6856,7 +7056,7 @@@ static int __io_sq_thread(struct io_rin
   
                 mutex_lock(&ctx->uring_lock);
                 if (!list_empty(&ctx->iopoll_list))
- -                      io_do_iopoll(ctx, &nr_events, 0, true);
+ +                      io_do_iopoll(ctx, &nr_events, 0);
   
                 /*
                  * Don't submit if refs are dying, good for io_uring_register(),
@@@ -7336,14 -7136,14 +7336,14 @@@ static void **io_alloc_page_table(size_
         size_t init_size = size;
         void **table;
   
- -      table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL);
+ +      table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
         if (!table)
                 return NULL;
   
         for (i = 0; i < nr_tables; i++) {
                 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
   
- -              table[i] = kzalloc(this_size, GFP_KERNEL);
+ +              table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
                 if (!table[i]) {
                         io_free_page_table(table, init_size);
                         return NULL;
@@@ -7534,8 -7334,7 +7534,8 @@@ fail
   
   static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
   {
- -      table->files = kvcalloc(nr_files, sizeof(table->files[0]), GFP_KERNEL);
+ +      table->files = kvcalloc(nr_files, sizeof(table->files[0]),
+ +                              GFP_KERNEL_ACCOUNT);
         return !!table->files;
   }
   
@@@ -7932,8 -7731,6 +7932,8 @@@ static int io_sqe_files_register(struc
                 return -EINVAL;
         if (nr_args > IORING_MAX_FIXED_FILES)
                 return -EMFILE;
+ +      if (nr_args > rlimit(RLIMIT_NOFILE))
+ +              return -EMFILE;
         ret = io_rsrc_node_switch_start(ctx);
         if (ret)
                 return ret;
@@@ -8043,46 -7840,6 +8043,46 @@@ static int io_sqe_file_register(struct 
   #endif
   }
   
+ +static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
+ +                               unsigned int issue_flags, u32 slot_index)
+ +{
+ +      struct io_ring_ctx *ctx = req->ctx;
+ +      bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ +      struct io_fixed_file *file_slot;
+ +      int ret = -EBADF;
+ +
+ +      io_ring_submit_lock(ctx, !force_nonblock);
+ +      if (file->f_op == &io_uring_fops)
+ +              goto err;
+ +      ret = -ENXIO;
+ +      if (!ctx->file_data)
+ +              goto err;
+ +      ret = -EINVAL;
+ +      if (slot_index >= ctx->nr_user_files)
+ +              goto err;
+ +
+ +      slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
+ +      file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
+ +      ret = -EBADF;
+ +      if (file_slot->file_ptr)
+ +              goto err;
+ +
+ +      *io_get_tag_slot(ctx->file_data, slot_index) = 0;
+ +      io_fixed_file_set(file_slot, file);
+ +      ret = io_sqe_file_register(ctx, file, slot_index);
+ +      if (ret) {
+ +              file_slot->file_ptr = 0;
+ +              goto err;
+ +      }
+ +
+ +      ret = 0;
+ +err:
+ +      io_ring_submit_unlock(ctx, !force_nonblock);
+ +      if (ret)
+ +              fput(file);
+ +      return ret;
+ +}
+ +
   static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
                                  struct io_rsrc_node *node, void *rsrc)
   {
@@@ -8942,7 -8699,6 +8942,7 @@@ static void io_ring_ctx_free(struct io_
                 sock_release(ctx->ring_sock);
         }
   #endif
+ +      WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
   
         io_mem_free(ctx->rings);
         io_mem_free(ctx->sq_sqes);
@@@ -9370,8 -9126,8 +9370,8 @@@ static void io_uring_clean_tctx(struct 
                  * Must be after io_uring_del_task_file() (removes nodes under
                  * uring_lock) to avoid race with io_uring_try_cancel_iowq().
                  */
- -              tctx->io_wq = NULL;
                 io_wq_put_and_exit(wq);
+ +              tctx->io_wq = NULL;
         }
   }
   
@@@ -9457,9 -9213,9 +9457,9 @@@ static void io_uring_cancel_generic(boo
         }
   }
   
- -void __io_uring_cancel(struct files_struct *files)
+ +void __io_uring_cancel(bool cancel_all)
   {
- -      io_uring_cancel_generic(!files, NULL);
+ +      io_uring_cancel_generic(cancel_all, NULL);
   }
   
   static void *io_uring_validate_mmap_request(struct file *file,
@@@ -10297,31 -10053,6 +10297,31 @@@ static int io_unregister_iowq_aff(struc
         return io_wq_cpu_affinity(tctx->io_wq, NULL);
   }
   
+ +static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+ +                                      void __user *arg)
+ +{
+ +      struct io_uring_task *tctx = current->io_uring;
+ +      __u32 new_count[2];
+ +      int i, ret;
+ +
+ +      if (!tctx || !tctx->io_wq)
+ +              return -EINVAL;
+ +      if (copy_from_user(new_count, arg, sizeof(new_count)))
+ +              return -EFAULT;
+ +      for (i = 0; i < ARRAY_SIZE(new_count); i++)
+ +              if (new_count[i] > INT_MAX)
+ +                      return -EINVAL;
+ +
+ +      ret = io_wq_max_workers(tctx->io_wq, new_count);
+ +      if (ret)
+ +              return ret;
+ +
+ +      if (copy_to_user(arg, new_count, sizeof(new_count)))
+ +              return -EFAULT;
+ +
+ +      return 0;
+ +}
+ +
   static bool io_register_op_must_quiesce(int op)
   {
         switch (op) {
@@@ -10339,7 -10070,6 +10339,7 @@@
         case IORING_REGISTER_BUFFERS_UPDATE:
         case IORING_REGISTER_IOWQ_AFF:
         case IORING_UNREGISTER_IOWQ_AFF:
+ +      case IORING_REGISTER_IOWQ_MAX_WORKERS:
                 return false;
         default:
                 return true;
@@@ -10496,12 -10226,6 +10496,12 @@@ static int __io_uring_register(struct i
                         break;
                 ret = io_unregister_iowq_aff(ctx);
                 break;
+ +      case IORING_REGISTER_IOWQ_MAX_WORKERS:
+ +              ret = -EINVAL;
+ +              if (!arg || nr_args != 2)
+ +                      break;
+ +              ret = io_register_iowq_max_workers(ctx, arg);
+ +              break;
         default:
                 ret = -EINVAL;
                 break;
@@@ -10583,16 -10307,11 +10583,16 @@@ static int __init io_uring_init(void
         BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
         BUILD_BUG_SQE_ELEM(42, __u16,  personality);
         BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
+ +      BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
   
         BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
                      sizeof(struct io_uring_rsrc_update));
         BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
                      sizeof(struct io_uring_rsrc_update2));
+ +
+ +      /* ->buf_index is u16 */
+ +      BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
+ +
         /* should fit into one byte */
         BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
   
diff --combined include/linux/bio.h

index 7b5f65a,89ad282..3d67d0f
--- 1/include/linux/bio.h
--- 2/include/linux/bio.h
+++ b/include/linux/bio.h
@@@ -5,6 -5,7 +5,6 @@@
   #ifndef __LINUX_BIO_H
   #define __LINUX_BIO_H
   
- -#include <linux/highmem.h>
   #include <linux/mempool.h>
   #include <linux/ioprio.h>
   /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
@@@ -400,6 -401,7 +400,7 @@@ static inline struct bio *bio_next_spli
   enum {
         BIOSET_NEED_BVECS = BIT(0),
         BIOSET_NEED_RESCUER = BIT(1),
+       BIOSET_PERCPU_CACHE = BIT(2),
   };
   extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
   extern void bioset_exit(struct bio_set *);
@@@ -408,6 -410,8 +409,8 @@@ extern int bioset_init_from_src(struct 
   
   struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs,
                 struct bio_set *bs);
+ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
+               struct bio_set *bs);
   struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs);
   extern void bio_put(struct bio *);
   
@@@ -518,6 -522,47 +521,6 @@@ static inline void bio_clone_blkg_assoc
                                               struct bio *src) { }
   #endif        /* CONFIG_BLK_CGROUP */
   
- -#ifdef CONFIG_HIGHMEM
- -/*
- - * remember never ever reenable interrupts between a bvec_kmap_irq and
- - * bvec_kunmap_irq!
- - */
- -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
- -{
- -      unsigned long addr;
- -
- -      /*
- -       * might not be a highmem page, but the preempt/irq count
- -       * balancing is a lot nicer this way
- -       */
- -      local_irq_save(*flags);
- -      addr = (unsigned long) kmap_atomic(bvec->bv_page);
- -
- -      BUG_ON(addr & ~PAGE_MASK);
- -
- -      return (char *) addr + bvec->bv_offset;
- -}
- -
- -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
- -{
- -      unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
- -
- -      kunmap_atomic((void *) ptr);
- -      local_irq_restore(*flags);
- -}
- -
- -#else
- -static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
- -{
- -      return page_address(bvec->bv_page) + bvec->bv_offset;
- -}
- -
- -static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
- -{
- -      *flags = 0;
- -}
- -#endif
- -
   /*
    * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
    *
@@@ -657,6 -702,11 +660,11 @@@ struct bio_set 
         struct kmem_cache *bio_slab;
         unsigned int front_pad;
   
+       /*
+        * per-cpu bio alloc cache
+        */
+       struct bio_alloc_cache __percpu *cache;
+ 
         mempool_t bio_pool;
         mempool_t bvec_pool;
   #if defined(CONFIG_BLK_DEV_INTEGRITY)
@@@ -673,6 -723,11 +681,11 @@@
         struct bio_list         rescue_list;
         struct work_struct      rescue_work;
         struct workqueue_struct *rescue_workqueue;
+ 
+       /*
+        * Hot un-plug notifier for the per-cpu cache, if used
+        */
+       struct hlist_node cpuhp_dead;
   };
   
   static inline bool bioset_initialized(struct bio_set *bs)
diff --combined include/linux/blk_types.h

index 1335efa,f68d4e8..9e392da
--- 1/include/linux/blk_types.h
--- 2/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@@ -34,10 -34,14 +34,10 @@@ struct block_device 
         void *                  bd_holder;
         int                     bd_holders;
         bool                    bd_write_holder;
- -#ifdef CONFIG_SYSFS
- -      struct list_head        bd_holder_disks;
- -#endif
         struct kobject          *bd_holder_dir;
         u8                      bd_partno;
         spinlock_t              bd_size_lock; /* for bd_inode->i_size updates */
         struct gendisk *        bd_disk;
- -      struct backing_dev_info *bd_bdi;
   
         /* The counter of freeze processes */
         int                     bd_fsfreeze_count;
@@@ -297,6 -301,7 +297,7 @@@ enum 
         BIO_TRACKED,            /* set if bio goes through the rq_qos path */
         BIO_REMAPPED,
         BIO_ZONE_WRITE_LOCKED,  /* Owns a zoned device zone write lock */
+       BIO_PERCPU_CACHE,       /* can participate in per-cpu alloc cache */
         BIO_FLAG_LAST
   };
   
diff --combined include/linux/cpuhotplug.h

index 6ac543d,fe72c8d..95f88ed
--- 1/include/linux/cpuhotplug.h
--- 2/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@@ -46,6 -46,7 +46,7 @@@ enum cpuhp_state 
         CPUHP_ARM_OMAP_WAKE_DEAD,
         CPUHP_IRQ_POLL_DEAD,
         CPUHP_BLOCK_SOFTIRQ_DEAD,
+       CPUHP_BIO_DEAD,
         CPUHP_ACPI_CPUDRV_DEAD,
         CPUHP_S390_PFAULT_DEAD,
         CPUHP_BLK_MQ_DEAD,
@@@ -399,7 -400,7 +400,7 @@@ static inline int cpuhp_state_remove_in
   
   /**
    * cpuhp_state_remove_instance_nocalls - Remove hotplug instance from state
- - *                                     without invoking the reatdown callback
+ + *                                     without invoking the teardown callback
    * @state:    The state from which the instance is removed
    * @node:     The node for this individual state.
    *
diff --combined include/linux/fs.h

index 7eae53f,0dcc5de..ae29027
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -319,6 -319,8 +319,8 @@@ enum rw_hint 
   /* iocb->ki_waitq is valid */
   #define IOCB_WAITQ            (1 << 19)
   #define IOCB_NOIO             (1 << 20)
+ /* can use bio alloc cache */
+ #define IOCB_ALLOC_CACHE      (1 << 21)
   
   struct kiocb {
         struct file             *ki_filp;
@@@ -436,10 -438,6 +438,10 @@@ int pagecache_write_end(struct file *, 
    * struct address_space - Contents of a cacheable, mappable object.
    * @host: Owner, either the inode or the block_device.
    * @i_pages: Cached pages.
+ + * @invalidate_lock: Guards coherency between page cache contents and
+ + *   file offset->disk block mappings in the filesystem during invalidates.
+ + *   It is also used to block modification of page cache contents through
+ + *   memory mappings.
    * @gfp_mask: Memory allocation flags to use for allocating pages.
    * @i_mmap_writable: Number of VM_SHARED mappings.
    * @nr_thps: Number of THPs in the pagecache (non-shmem only).
@@@ -457,7 -455,6 +459,7 @@@
   struct address_space {
         struct inode            *host;
         struct xarray           i_pages;
+ +      struct rw_semaphore     invalidate_lock;
         gfp_t                   gfp_mask;
         atomic_t                i_mmap_writable;
   #ifdef CONFIG_READ_ONLY_THP_FOR_FS
@@@ -819,42 -816,9 +821,42 @@@ static inline void inode_lock_shared_ne
         down_read_nested(&inode->i_rwsem, subclass);
   }
   
+ +static inline void filemap_invalidate_lock(struct address_space *mapping)
+ +{
+ +      down_write(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline void filemap_invalidate_unlock(struct address_space *mapping)
+ +{
+ +      up_write(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
+ +{
+ +      down_read(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline int filemap_invalidate_trylock_shared(
+ +                                      struct address_space *mapping)
+ +{
+ +      return down_read_trylock(&mapping->invalidate_lock);
+ +}
+ +
+ +static inline void filemap_invalidate_unlock_shared(
+ +                                      struct address_space *mapping)
+ +{
+ +      up_read(&mapping->invalidate_lock);
+ +}
+ +
   void lock_two_nondirectories(struct inode *, struct inode*);
   void unlock_two_nondirectories(struct inode *, struct inode*);
   
+ +void filemap_invalidate_lock_two(struct address_space *mapping1,
+ +                               struct address_space *mapping2);
+ +void filemap_invalidate_unlock_two(struct address_space *mapping1,
+ +                                 struct address_space *mapping2);
+ +
+ +
   /*
    * NOTE: in a 32bit arch with a preemptable kernel and
    * an UP compile the i_size_read/write must be atomic
@@@ -1545,11 -1509,8 +1547,11 @@@ struct super_block 
         /* Number of inodes with nlink == 0 but still referenced */
         atomic_long_t s_remove_count;
   
- -      /* Pending fsnotify inode refs */
- -      atomic_long_t s_fsnotify_inode_refs;
+ +      /*
+ +       * Number of inode/mount/sb objects that are being watched, note that
+ +       * inodes objects are currently double-accounted.
+ +       */
+ +      atomic_long_t s_fsnotify_connectors;
   
         /* Being remounted read-only */
         int s_readonly_remount;
@@@ -2528,7 -2489,6 +2530,7 @@@ struct file_system_type 
   
         struct lock_class_key i_lock_key;
         struct lock_class_key i_mutex_key;
+ +      struct lock_class_key invalidate_lock_key;
         struct lock_class_key i_mutex_dir_key;
   };
   
@@@ -2612,6 -2572,90 +2614,6 @@@ extern struct kobject *fs_kobj
   
   #define MAX_RW_COUNT (INT_MAX & PAGE_MASK)
   
- -#ifdef CONFIG_MANDATORY_FILE_LOCKING
- -extern int locks_mandatory_locked(struct file *);
- -extern int locks_mandatory_area(struct inode *, struct file *, loff_t, loff_t, unsigned char);
- -
- -/*
- - * Candidates for mandatory locking have the setgid bit set
- - * but no group execute bit -  an otherwise meaningless combination.
- - */
- -
- -static inline int __mandatory_lock(struct inode *ino)
- -{
- -      return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
- -}
- -
- -/*
- - * ... and these candidates should be on SB_MANDLOCK mounted fs,
- - * otherwise these will be advisory locks
- - */
- -
- -static inline int mandatory_lock(struct inode *ino)
- -{
- -      return IS_MANDLOCK(ino) && __mandatory_lock(ino);
- -}
- -
- -static inline int locks_verify_locked(struct file *file)
- -{
- -      if (mandatory_lock(locks_inode(file)))
- -              return locks_mandatory_locked(file);
- -      return 0;
- -}
- -
- -static inline int locks_verify_truncate(struct inode *inode,
- -                                  struct file *f,
- -                                  loff_t size)
- -{
- -      if (!inode->i_flctx || !mandatory_lock(inode))
- -              return 0;
- -
- -      if (size < inode->i_size) {
- -              return locks_mandatory_area(inode, f, size, inode->i_size - 1,
- -                              F_WRLCK);
- -      } else {
- -              return locks_mandatory_area(inode, f, inode->i_size, size - 1,
- -                              F_WRLCK);
- -      }
- -}
- -
- -#else /* !CONFIG_MANDATORY_FILE_LOCKING */
- -
- -static inline int locks_mandatory_locked(struct file *file)
- -{
- -      return 0;
- -}
- -
- -static inline int locks_mandatory_area(struct inode *inode, struct file *filp,
- -                                       loff_t start, loff_t end, unsigned char type)
- -{
- -      return 0;
- -}
- -
- -static inline int __mandatory_lock(struct inode *inode)
- -{
- -      return 0;
- -}
- -
- -static inline int mandatory_lock(struct inode *inode)
- -{
- -      return 0;
- -}
- -
- -static inline int locks_verify_locked(struct file *file)
- -{
- -      return 0;
- -}
- -
- -static inline int locks_verify_truncate(struct inode *inode, struct file *filp,
- -                                      size_t size)
- -{
- -      return 0;
- -}
- -
- -#endif /* CONFIG_MANDATORY_FILE_LOCKING */
- -
- -
   #ifdef CONFIG_FILE_LOCKING
   static inline int break_lease(struct inode *inode, unsigned int mode)
   {
@@@ -3204,6 -3248,10 +3206,6 @@@ ssize_t vfs_iocb_iter_read(struct file 
   ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                             struct iov_iter *iter);
   
- -/* fs/block_dev.c */
- -extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
- -                      int datasync);
- -
   /* fs/splice.c */
   extern ssize_t generic_file_splice_read(struct file *, loff_t *,
                 struct pipe_inode_info *, size_t, unsigned int);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 31 Aug 2021 02:30:30 +0000 (19:30 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 31 Aug 2021 02:30:30 +0000 (19:30 -0700)
		1	2
block/bio.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-core.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk-merge.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/blk.h	patch \|	diff1 \|	diff2 \|	blob \| history
fs/block_dev.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/io_uring.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/bio.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blk_types.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/cpuhotplug.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history