Linux 6.9-rc1

[linux-2.6-microblaze.git] / block / bio.c
diff --git a/block/bio.c b/block/bio.c

index 3d3a267..d24420e 100644 (file)
--- a/block/bio.c
+++ b/block/bio.c
@@ -16,7 +16,6 @@
  #include <linux/workqueue.h>
  #include <linux/cgroup.h>
  #include <linux/highmem.h>
-#include <linux/sched/sysctl.h>
  #include <linux/blk-crypto.h>
  #include <linux/xarray.h>
  
@@ -25,9 +24,14 @@
  #include "blk-rq-qos.h"
  #include "blk-cgroup.h"
  
+#define ALLOC_CACHE_THRESHOLD  16
+#define ALLOC_CACHE_MAX                256
+
  struct bio_alloc_cache {
         struct bio              *free_list;
+       struct bio              *free_list_irq;
         unsigned int            nr;
+       unsigned int            nr_irq;
  };
  
  static struct biovec_slab {
@@ -246,6 +250,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
         bio->bi_opf = opf;
         bio->bi_flags = 0;
         bio->bi_ioprio = 0;
+       bio->bi_write_hint = 0;
         bio->bi_status = 0;
         bio->bi_iter.bi_sector = 0;
         bio->bi_iter.bi_size = 0;
@@ -408,6 +413,22 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
         queue_work(bs->rescue_workqueue, &bs->rescue_work);
  }
  
+static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
+{
+       unsigned long flags;
+
+       /* cache->free_list must be empty */
+       if (WARN_ON_ONCE(cache->free_list))
+               return;
+
+       local_irq_save(flags);
+       cache->free_list = cache->free_list_irq;
+       cache->free_list_irq = NULL;
+       cache->nr += cache->nr_irq;
+       cache->nr_irq = 0;
+       local_irq_restore(flags);
+}
+
  static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
                 unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
                 struct bio_set *bs)
@@ -417,8 +438,12 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
  
         cache = per_cpu_ptr(bs->cache, get_cpu());
         if (!cache->free_list) {
-               put_cpu();
-               return NULL;
+               if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD)
+                       bio_alloc_irq_cache_splice(cache);
+               if (!cache->free_list) {
+                       put_cpu();
+                       return NULL;
+               }
         }
         bio = cache->free_list;
         cache->free_list = bio->bi_next;
@@ -462,9 +487,6 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
   * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
   * for per bio allocations.
   *
- * If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process
- * context, not hard/soft IRQ.
- *
   * Returns: Pointer to new bio on success, NULL on failure.
   */
  struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
@@ -526,6 +548,8 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
         }
         if (unlikely(!p))
                 return NULL;
+       if (!mempool_is_saturated(&bs->bio_pool))
+               opf &= ~REQ_ALLOC_CACHE;
  
         bio = p + bs->front_pad;
         if (nr_vecs > BIO_INLINE_VECS) {
@@ -567,7 +591,7 @@ EXPORT_SYMBOL(bio_alloc_bioset);
   * be reused by calling bio_uninit() before calling bio_init() again.
   *
   * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this
- * function are not backed by a mempool can can fail.  Do not use this function
+ * function are not backed by a mempool and can fail.  Do not use this function
   * for allocations in the file system I/O path.
   *
   * Returns: Pointer to new bio on success, NULL on failure.
@@ -582,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
  }
  EXPORT_SYMBOL(bio_kmalloc);
  
-void zero_fill_bio(struct bio *bio)
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
  {
         struct bio_vec bv;
         struct bvec_iter iter;
  
-       bio_for_each_segment(bv, bio, iter)
+       __bio_for_each_segment(bv, bio, iter, start)
                 memzero_bvec(&bv);
  }
-EXPORT_SYMBOL(zero_fill_bio);
+EXPORT_SYMBOL(zero_fill_bio_iter);
  
  /**
   * bio_truncate - truncate the bio to small size of @new_size
@@ -676,11 +700,8 @@ void guard_bio_eod(struct bio *bio)
         bio_truncate(bio, maxsector << 9);
  }
  
-#define ALLOC_CACHE_MAX                512
-#define ALLOC_CACHE_SLACK       64
-
-static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
-                                 unsigned int nr)
+static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+                                  unsigned int nr)
  {
         unsigned int i = 0;
         struct bio *bio;
@@ -692,6 +713,17 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
                 if (++i == nr)
                         break;
         }
+       return i;
+}
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+                                 unsigned int nr)
+{
+       nr -= __bio_alloc_cache_prune(cache, nr);
+       if (!READ_ONCE(cache->free_list)) {
+               bio_alloc_irq_cache_splice(cache);
+               __bio_alloc_cache_prune(cache, nr);
+       }
  }
  
  static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
@@ -725,6 +757,38 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
         bs->cache = NULL;
  }
  
+static inline void bio_put_percpu_cache(struct bio *bio)
+{
+       struct bio_alloc_cache *cache;
+
+       cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
+       if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX)
+               goto out_free;
+
+       if (in_task()) {
+               bio_uninit(bio);
+               bio->bi_next = cache->free_list;
+               /* Not necessary but helps not to iopoll already freed bios */
+               bio->bi_bdev = NULL;
+               cache->free_list = bio;
+               cache->nr++;
+       } else if (in_hardirq()) {
+               lockdep_assert_irqs_disabled();
+
+               bio_uninit(bio);
+               bio->bi_next = cache->free_list_irq;
+               cache->free_list_irq = bio;
+               cache->nr_irq++;
+       } else {
+               goto out_free;
+       }
+       put_cpu();
+       return;
+out_free:
+       put_cpu();
+       bio_free(bio);
+}
+
  /**
   * bio_put - release a reference to a bio
   * @bio:   bio to release reference to
@@ -740,29 +804,18 @@ void bio_put(struct bio *bio)
                 if (!atomic_dec_and_test(&bio->__bi_cnt))
                         return;
         }
-
-       if (bio->bi_opf & REQ_ALLOC_CACHE) {
-               struct bio_alloc_cache *cache;
-
-               bio_uninit(bio);
-               cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
-               bio->bi_next = cache->free_list;
-               cache->free_list = bio;
-               if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
-                       bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
-               put_cpu();
-       } else {
+       if (bio->bi_opf & REQ_ALLOC_CACHE)
+               bio_put_percpu_cache(bio);
+       else
                 bio_free(bio);
-       }
  }
  EXPORT_SYMBOL(bio_put);
  
  static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
  {
         bio_set_flag(bio, BIO_CLONED);
-       if (bio_flagged(bio_src, BIO_THROTTLED))
-               bio_set_flag(bio, BIO_THROTTLED);
         bio->bi_ioprio = bio_src->bi_ioprio;
+       bio->bi_write_hint = bio_src->bi_write_hint;
         bio->bi_iter = bio_src->bi_iter;
  
         if (bio->bi_bdev) {
@@ -853,9 +906,8 @@ static inline bool bio_full(struct bio *bio, unsigned len)
         return false;
  }
  
-static inline bool page_is_mergeable(const struct bio_vec *bv,
-               struct page *page, unsigned int len, unsigned int off,
-               bool *same_page)
+static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
+               unsigned int len, unsigned int off, bool *same_page)
  {
         size_t bv_end = bv->bv_offset + bv->bv_len;
         phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
@@ -865,49 +917,19 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
                 return false;
         if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
                 return false;
-
-       *same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
-       if (*same_page)
-               return true;
-       return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
-}
-
-/**
- * __bio_try_merge_page - try appending data to an existing bvec.
- * @bio: destination bio
- * @page: start page to add
- * @len: length of the data to add
- * @off: offset of the data relative to @page
- * @same_page: return if the segment has been merged inside the same page
- *
- * Try to add the data at @page + @off to the last bvec of @bio.  This is a
- * useful optimisation for file systems with a block size smaller than the
- * page size.
- *
- * Warn if (@len, @off) crosses pages in case that @same_page is true.
- *
- * Return %true on success or %false on failure.
- */
-static bool __bio_try_merge_page(struct bio *bio, struct page *page,
-               unsigned int len, unsigned int off, bool *same_page)
-{
-       if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+       if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
                 return false;
  
-       if (bio->bi_vcnt > 0) {
-               struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
-               if (page_is_mergeable(bv, page, len, off, same_page)) {
-                       if (bio->bi_iter.bi_size > UINT_MAX - len) {
-                               *same_page = false;
-                               return false;
-                       }
-                       bv->bv_len += len;
-                       bio->bi_iter.bi_size += len;
-                       return true;
-               }
+       *same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
+       if (!*same_page) {
+               if (IS_ENABLED(CONFIG_KMSAN))
+                       return false;
+               if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
+                       return false;
         }
-       return false;
+
+       bv->bv_len += len;
+       return true;
  }
  
  /*
@@ -915,20 +937,19 @@ static bool __bio_try_merge_page(struct bio *bio, struct page *page,
   * size limit.  This is not for normal read/write bios, but for passthrough
   * or Zone Append operations that we can't split.
   */
-static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
-                                struct page *page, unsigned len,
-                                unsigned offset, bool *same_page)
+bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
+               struct page *page, unsigned len, unsigned offset,
+               bool *same_page)
  {
-       struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
         unsigned long mask = queue_segment_boundary(q);
         phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
         phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
  
         if ((addr1 | mask) != (addr2 | mask))
                 return false;
-       if (bv->bv_len + len > queue_max_segment_size(q))
+       if (len > queue_max_segment_size(q) - bv->bv_len)
                 return false;
-       return __bio_try_merge_page(bio, page, len, offset, same_page);
+       return bvec_try_merge_page(bv, page, len, offset, same_page);
  }
  
  /**
@@ -948,37 +969,37 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
                 struct page *page, unsigned int len, unsigned int offset,
                 unsigned int max_sectors, bool *same_page)
  {
-       struct bio_vec *bvec;
+       unsigned int max_size = max_sectors << SECTOR_SHIFT;
  
         if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
                 return 0;
  
-       if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
+       len = min3(len, max_size, queue_max_segment_size(q));
+       if (len > max_size - bio->bi_iter.bi_size)
                 return 0;
  
         if (bio->bi_vcnt > 0) {
-               if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
+               struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+               if (bvec_try_merge_hw_page(q, bv, page, len, offset,
+                               same_page)) {
+                       bio->bi_iter.bi_size += len;
                         return len;
+               }
+
+               if (bio->bi_vcnt >=
+                   min(bio->bi_max_vecs, queue_max_segments(q)))
+                       return 0;
  
                 /*
                  * If the queue doesn't support SG gaps and adding this segment
                  * would create a gap, disallow it.
                  */
-               bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
-               if (bvec_gap_to_prev(&q->limits, bvec, offset))
+               if (bvec_gap_to_prev(&q->limits, bv, offset))
                         return 0;
         }
  
-       if (bio_full(bio, len))
-               return 0;
-
-       if (bio->bi_vcnt >= queue_max_segments(q))
-               return 0;
-
-       bvec = &bio->bi_io_vec[bio->bi_vcnt];
-       bvec->bv_page = page;
-       bvec->bv_len = len;
-       bvec->bv_offset = offset;
+       bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
         bio->bi_vcnt++;
         bio->bi_iter.bi_size += len;
         return len;
@@ -1054,20 +1075,12 @@ EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
  void __bio_add_page(struct bio *bio, struct page *page,
                 unsigned int len, unsigned int off)
  {
-       struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
-
         WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
         WARN_ON_ONCE(bio_full(bio, len));
  
-       bv->bv_page = page;
-       bv->bv_offset = off;
-       bv->bv_len = len;
-
+       bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
         bio->bi_iter.bi_size += len;
         bio->bi_vcnt++;
-
-       if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
-               bio_set_flag(bio, BIO_WORKINGSET);
  }
  EXPORT_SYMBOL_GPL(__bio_add_page);
  
@@ -1086,15 +1099,33 @@ int bio_add_page(struct bio *bio, struct page *page,
  {
         bool same_page = false;
  
-       if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-               if (bio_full(bio, len))
-                       return 0;
-               __bio_add_page(bio, page, len, offset);
+       if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+               return 0;
+       if (bio->bi_iter.bi_size > UINT_MAX - len)
+               return 0;
+
+       if (bio->bi_vcnt > 0 &&
+           bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
+                               page, len, offset, &same_page)) {
+               bio->bi_iter.bi_size += len;
+               return len;
         }
+
+       if (bio->bi_vcnt >= bio->bi_max_vecs)
+               return 0;
+       __bio_add_page(bio, page, len, offset);
         return len;
  }
  EXPORT_SYMBOL(bio_add_page);
  
+void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
+                         size_t off)
+{
+       WARN_ON_ONCE(len > UINT_MAX);
+       WARN_ON_ONCE(off > UINT_MAX);
+       __bio_add_page(bio, &folio->page, len, off);
+}
+
  /**
   * bio_add_folio - Attempt to add part of a folio to a bio.
   * @bio: BIO to add to.
@@ -1116,16 +1147,27 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
                 return false;
         return bio_add_page(bio, &folio->page, len, off) > 0;
  }
+EXPORT_SYMBOL(bio_add_folio);
  
  void __bio_release_pages(struct bio *bio, bool mark_dirty)
  {
-       struct bvec_iter_all iter_all;
-       struct bio_vec *bvec;
+       struct folio_iter fi;
  
-       bio_for_each_segment_all(bvec, bio, iter_all) {
-               if (mark_dirty && !PageCompound(bvec->bv_page))
-                       set_page_dirty_lock(bvec->bv_page);
-               put_page(bvec->bv_page);
+       bio_for_each_folio_all(fi, bio) {
+               struct page *page;
+               size_t nr_pages;
+
+               if (mark_dirty) {
+                       folio_lock(fi.folio);
+                       folio_mark_dirty(fi.folio);
+                       folio_unlock(fi.folio);
+               }
+               page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
+               nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
+                          fi.offset / PAGE_SIZE + 1;
+               do {
+                       bio_release_page(bio, page++);
+               } while (--nr_pages != 0);
         }
  }
  EXPORT_SYMBOL_GPL(__bio_release_pages);
@@ -1147,7 +1189,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
         bio->bi_io_vec = (struct bio_vec *)iter->bvec;
         bio->bi_iter.bi_bvec_done = iter->iov_offset;
         bio->bi_iter.bi_size = size;
-       bio_set_flag(bio, BIO_NO_PAGE_REF);
         bio_set_flag(bio, BIO_CLONED);
  }
  
@@ -1156,13 +1197,18 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
  {
         bool same_page = false;
  
-       if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-               __bio_add_page(bio, page, len, offset);
+       if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
+               return -EIO;
+
+       if (bio->bi_vcnt > 0 &&
+           bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
+                               page, len, offset, &same_page)) {
+               bio->bi_iter.bi_size += len;
+               if (same_page)
+                       bio_release_page(bio, page);
                 return 0;
         }
-
-       if (same_page)
-               put_page(page);
+       __bio_add_page(bio, page, len, offset);
         return 0;
  }
  
@@ -1176,7 +1222,7 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
                         queue_max_zone_append_sectors(q), &same_page) != len)
                 return -EINVAL;
         if (same_page)
-               put_page(page);
+               bio_release_page(bio, page);
         return 0;
  }
  
@@ -1187,20 +1233,21 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
   * @bio: bio to add pages to
   * @iter: iov iterator describing the region to be mapped
   *
- * Pins pages from *iter and appends them to @bio's bvec array. The
- * pages will have to be released using put_page() when done.
- * For multi-segment *iter, this function only adds pages from the
- * next non-empty segment of the iov iterator.
+ * Extracts pages from *iter and appends them to @bio's bvec array.  The pages
+ * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
+ * For a multi-segment *iter, this function only adds pages from the next
+ * non-empty segment of the iov iterator.
   */
  static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
  {
+       iov_iter_extraction_t extraction_flags = 0;
         unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
         unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
         struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
         struct page **pages = (struct page **)bv;
         ssize_t size, left;
         unsigned len, i = 0;
-       size_t offset, trim;
+       size_t offset;
         int ret = 0;
  
         /*
@@ -1211,6 +1258,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
         BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
         pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
  
+       if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
+               extraction_flags |= ITER_ALLOW_P2PDMA;
+
         /*
          * Each segment in the iov is required to be a block size multiple.
          * However, we may not be able to get the entire segment if it spans
@@ -1218,17 +1268,20 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
          * result to ensure the bio's total size is correct. The remainder of
          * the iov data will be picked up in the next bio iteration.
          */
-       size = iov_iter_get_pages2(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
-                                 nr_pages, &offset);
+       size = iov_iter_extract_pages(iter, &pages,
+                                     UINT_MAX - bio->bi_iter.bi_size,
+                                     nr_pages, extraction_flags, &offset);
         if (unlikely(size <= 0))
                 return size ? size : -EFAULT;
  
         nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
  
-       trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
-       iov_iter_revert(iter, trim);
+       if (bio->bi_bdev) {
+               size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
+               iov_iter_revert(iter, trim);
+               size -= trim;
+       }
  
-       size -= trim;
         if (unlikely(!size)) {
                 ret = -EFAULT;
                 goto out;
@@ -1252,7 +1305,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
         iov_iter_revert(iter, left);
  out:
         while (i < nr_pages)
-               put_page(pages[i++]);
+               bio_release_page(bio, pages[i++]);
  
         return ret;
  }
@@ -1276,26 +1329,26 @@ out:
   * fit into the bio, or are requested in @iter, whatever is smaller. If
   * MM encounters an error pinning the requested pages, it stops. Error
   * is returned only if 0 pages could be pinned.
- *
- * It's intended for direct IO, so doesn't do PSI tracking, the caller is
- * responsible for setting BIO_WORKINGSET if necessary.
   */
  int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
  {
         int ret = 0;
  
+       if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+               return -EIO;
+
         if (iov_iter_is_bvec(iter)) {
                 bio_iov_bvec_set(bio, iter);
                 iov_iter_advance(iter, bio->bi_iter.bi_size);
                 return 0;
         }
  
+       if (iov_iter_extract_will_pin(iter))
+               bio_set_flag(bio, BIO_PAGE_PINNED);
         do {
                 ret = __bio_iov_iter_get_pages(bio, iter);
         } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
  
-       /* don't account direct I/O as memory stall */
-       bio_clear_flag(bio, BIO_WORKINGSET);
         return bio->bi_vcnt ? 0 : ret;
  }
  EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
@@ -1320,21 +1373,12 @@ int submit_bio_wait(struct bio *bio)
  {
         DECLARE_COMPLETION_ONSTACK_MAP(done,
                         bio->bi_bdev->bd_disk->lockdep_map);
-       unsigned long hang_check;
  
         bio->bi_private = &done;
         bio->bi_end_io = submit_bio_wait_endio;
         bio->bi_opf |= REQ_SYNC;
         submit_bio(bio);
-
-       /* Prevent hang_check timer from firing at us during very long I/O */
-       hang_check = sysctl_hung_task_timeout_secs;
-       if (hang_check)
-               while (!wait_for_completion_io_timeout(&done,
-                                       hang_check * (HZ/2)))
-                       ;
-       else
-               wait_for_completion_io(&done);
+       blk_wait_io(&done);
  
         return blk_status_to_errno(bio->bi_status);
  }
@@ -1402,18 +1446,12 @@ EXPORT_SYMBOL(bio_free_pages);
   * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
   * for performing direct-IO in BIOs.
   *
- * The problem is that we cannot run set_page_dirty() from interrupt context
+ * The problem is that we cannot run folio_mark_dirty() from interrupt context
   * because the required locks are not interrupt-safe.  So what we can do is to
   * mark the pages dirty _before_ performing IO.  And in interrupt context,
   * check that the pages are still dirty.   If so, fine.  If not, redirty them
   * in process context.
   *
- * We special-case compound pages here: normally this means reads into hugetlb
- * pages.  The logic in here doesn't really work right for compound pages
- * because the VM does not uniformly chase down the head page in all cases.
- * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
- * handle them at all.  So we skip compound pages here at an early stage.
- *
   * Note that this code is very hard to test under normal circumstances because
   * direct-io pins the pages with get_user_pages().  This makes
   * is_page_cache_freeable return false, and the VM will not clean the pages.
@@ -1429,14 +1467,15 @@ EXPORT_SYMBOL(bio_free_pages);
   */
  void bio_set_pages_dirty(struct bio *bio)
  {
-       struct bio_vec *bvec;
-       struct bvec_iter_all iter_all;
+       struct folio_iter fi;
  
-       bio_for_each_segment_all(bvec, bio, iter_all) {
-               if (!PageCompound(bvec->bv_page))
-                       set_page_dirty_lock(bvec->bv_page);
+       bio_for_each_folio_all(fi, bio) {
+               folio_lock(fi.folio);
+               folio_mark_dirty(fi.folio);
+               folio_unlock(fi.folio);
         }
  }
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
  
  /*
   * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
@@ -1445,8 +1484,8 @@ void bio_set_pages_dirty(struct bio *bio)
   * the BIO and re-dirty the pages in process context.
   *
   * It is expected that bio_check_pages_dirty() will wholly own the BIO from
- * here on.  It will run one put_page() against each page and will run one
- * bio_put() against the BIO.
+ * here on.  It will unpin each page and will run one bio_put() against the
+ * BIO.
   */
  
  static void bio_dirty_fn(struct work_struct *work);
@@ -1477,12 +1516,11 @@ static void bio_dirty_fn(struct work_struct *work)
  
  void bio_check_pages_dirty(struct bio *bio)
  {
-       struct bio_vec *bvec;
+       struct folio_iter fi;
         unsigned long flags;
-       struct bvec_iter_all iter_all;
  
-       bio_for_each_segment_all(bvec, bio, iter_all) {
-               if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+       bio_for_each_folio_all(fi, bio) {
+               if (!folio_test_dirty(fi.folio))
                         goto defer;
         }
  
@@ -1496,6 +1534,7 @@ defer:
         spin_unlock_irqrestore(&bio_dirty_lock, flags);
         schedule_work(&bio_dirty_work);
  }
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
  
  static inline bool bio_remaining_done(struct bio *bio)
  {
@@ -1741,6 +1780,8 @@ static int __init init_bio(void)
  {
         int i;
  
+       BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags));
+
         bio_integrity_init();
  
         for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
@@ -1754,7 +1795,8 @@ static int __init init_bio(void)
         cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
                                         bio_cpu_dead);
  
-       if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
+       if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0,
+                       BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
                 panic("bio: can't allocate bios\n");
  
         if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))