Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel...
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 28 Jun 2023 17:28:11 +0000 (10:28 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 28 Jun 2023 17:28:11 +0000 (10:28 -0700)
Pull mm updates from Andrew Morton:

 - Yosry Ahmed brought back some cgroup v1 stats in OOM logs

 - Yosry has also eliminated cgroup's atomic rstat flushing

 - Nhat Pham adds the new cachestat() syscall. It provides userspace
   with the ability to query pagecache status - a similar concept to
   mincore() but more powerful and with improved usability (a brief
   userspace usage sketch follows this list)

 - Mel Gorman provides more optimizations for compaction, reducing the
   prevalence of page rescanning

 - Lorenzo Stoakes has done some maintenance work on the
   get_user_pages() interface

 - Liam Howlett continues with cleanups and maintenance work to the
   maple tree code. Peng Zhang also does some work on maple tree

 - Johannes Weiner has done some cleanup work on the compaction code

 - David Hildenbrand has contributed additional selftests for
   get_user_pages()

 - Thomas Gleixner has contributed some maintenance and optimization
   work for the vmalloc code

 - Baolin Wang has provided some compaction cleanups

 - SeongJae Park continues maintenance work on the DAMON code

 - Huang Ying has done some maintenance on the swap code's usage of
   device refcounting

 - Christoph Hellwig has some cleanups for the filemap/directio code

 - Ryan Roberts provides two patch series which yield some
   rationalization of the kernel's access to pte entries - use the
   provided APIs rather than open-coding accesses

 - Lorenzo Stoakes has some fixes to the interaction between pagecache
   and directio access to file mappings

 - John Hubbard has a series of fixes to the MM selftesting code

 - ZhangPeng continues the folio conversion campaign

 - Hugh Dickins has been working on the pagetable handling code, mainly
   with a view to reducing the load on the mmap_lock

 - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment
   from 128 to 8

 - Domenico Cerasuolo has improved the zswap reclaim mechanism by
   reorganizing the LRU management

 - Matthew Wilcox provides some fixups to make gfs2 work better with the
   buffer_head code

 - Vishal Moola also has done some folio conversion work

 - Matthew Wilcox has removed the remnants of the pagevec code - their
   functionality is migrated over to struct folio_batch
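
As a companion to the cachestat() item above, here is a minimal userspace
sketch of querying pagecache residency for a file. The struct layouts and the
"len == 0 means to end of file" semantics follow the interface merged in this
series; the locally defined structs and the fallback __NR_cachestat value of
451 (x86-64) are assumptions for systems whose libc/uapi headers do not yet
carry them:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_cachestat
    #define __NR_cachestat 451      /* x86-64 value; assumption for older headers */
    #endif

    /* Local mirror of the uapi structures added by this series. */
    struct cachestat_range {
            uint64_t off;
            uint64_t len;           /* 0 means "to end of file" */
    };

    struct cachestat {
            uint64_t nr_cache;              /* pages resident in the page cache */
            uint64_t nr_dirty;              /* dirty pages */
            uint64_t nr_writeback;          /* pages under writeback */
            uint64_t nr_evicted;            /* pages evicted from the cache */
            uint64_t nr_recently_evicted;   /* recently evicted pages */
    };

    int main(int argc, char **argv)
    {
            struct cachestat_range range = { .off = 0, .len = 0 };
            struct cachestat cs;
            int fd;

            fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            /* Returns 0 on success; fails with ENOSYS on kernels without cachestat(). */
            if (syscall(__NR_cachestat, fd, &range, &cs, 0)) {
                    perror("cachestat");
                    close(fd);
                    return 1;
            }

            printf("cached %llu dirty %llu writeback %llu evicted %llu recently_evicted %llu\n",
                   (unsigned long long)cs.nr_cache,
                   (unsigned long long)cs.nr_dirty,
                   (unsigned long long)cs.nr_writeback,
                   (unsigned long long)cs.nr_evicted,
                   (unsigned long long)cs.nr_recently_evicted);
            close(fd);
            return 0;
    }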

* tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits)
  mm/hugetlb: remove hugetlb_set_page_subpool()
  mm: nommu: correct the range of mmap_sem_read_lock in task_mem()
  hugetlb: revert use of page_cache_next_miss()
  Revert "page cache: fix page_cache_next/prev_miss off by one"
  mm/vmscan: fix root proactive reclaim unthrottling unbalanced node
  mm: memcg: rename and document global_reclaim()
  mm: kill [add|del]_page_to_lru_list()
  mm: compaction: convert to use a folio in isolate_migratepages_block()
  mm: zswap: fix double invalidate with exclusive loads
  mm: remove unnecessary pagevec includes
  mm: remove references to pagevec
  mm: rename invalidate_mapping_pagevec to mapping_try_invalidate
  mm: remove struct pagevec
  net: convert sunrpc from pagevec to folio_batch
  i915: convert i915_gpu_error to use a folio_batch
  pagevec: rename fbatch_count()
  mm: remove check_move_unevictable_pages()
  drm: convert drm_gem_put_pages() to use a folio_batch
  i915: convert shmem_sg_free_table() to use a folio_batch
  scatterlist: add sg_set_folio()
  ...

65 files changed:
Documentation/admin-guide/cgroup-v2.rst
Documentation/dev-tools/kselftest.rst
MAINTAINERS
arch/arm64/Kconfig
arch/arm64/kernel/traps.c
arch/arm64/mm/fault.c
arch/arm64/mm/init.c
arch/powerpc/xmon/xmon.c
arch/riscv/mm/hugetlbpage.c
arch/x86/mm/mem_encrypt_identity.c
block/fops.c
drivers/block/zram/zram_drv.c
drivers/iommu/Kconfig
drivers/md/dm-crypt.c
drivers/usb/core/buffer.c
drivers/vdpa/vdpa_user/vduse_dev.c
drivers/vhost/vdpa.c
fs/afs/write.c
fs/btrfs/file.c
fs/buffer.c
fs/ceph/file.c
fs/direct-io.c
fs/ext4/file.c
fs/ext4/inode.c
fs/f2fs/file.c
fs/fuse/file.c
fs/gfs2/file.c
fs/iomap/buffered-io.c
fs/iomap/direct-io.c
fs/nfs/file.c
fs/ntfs/file.c
fs/ntfs3/file.c
fs/xfs/xfs_file.c
fs/zonefs/file.c
include/linux/cgroup.h
include/linux/fs.h
include/linux/mm.h
include/linux/mmzone.h
include/linux/sched.h
include/linux/sunrpc/svc.h
include/linux/suspend.h
include/linux/syscalls.h
io_uring/rsrc.c
kernel/events/core.c
kernel/power/main.c
kernel/power/power.h
kernel/power/snapshot.c
kernel/trace/trace_events_user.c
lib/Kconfig.debug
lib/Makefile
mm/filemap.c
mm/gup.c
mm/internal.h
mm/memblock.c
mm/mm_init.c
mm/mmap.c
mm/mremap.c
mm/page_alloc.c
mm/page_table_check.c
mm/shmem.c
mm/slab.h
mm/swapfile.c
mm/vmstat.c
net/sunrpc/svc.c
virt/kvm/kvm_main.c

Simple merge
diff --cc MAINTAINERS
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc block/fops.c
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc fs/afs/write.c
Simple merge
diff --cc fs/btrfs/file.c
@@@ -1682,7 -1683,9 +1679,6 @@@ ssize_t btrfs_do_write_iter(struct kioc
                        num_written = num_sync;
        }
  
-       current->backing_dev_info = NULL;
 -      if (sync)
 -              atomic_dec(&inode->sync_writers);
 -
        return num_written;
  }
  
diff --cc fs/buffer.c
Simple merge
diff --cc fs/ceph/file.c
Simple merge
diff --cc fs/direct-io.c
Simple merge
diff --cc fs/ext4/file.c
Simple merge
diff --cc fs/ext4/inode.c
Simple merge
diff --cc fs/f2fs/file.c
Simple merge
diff --cc fs/fuse/file.c
Simple merge
diff --cc fs/gfs2/file.c
Simple merge
Simple merge
Simple merge
diff --cc fs/nfs/file.c
Simple merge
diff --cc fs/ntfs/file.c
Simple merge
diff --cc fs/ntfs3/file.c
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -1910,40 -1901,18 +1901,40 @@@ static inline bool page_needs_cow_for_d
        return page_maybe_dma_pinned(page);
  }
  
- /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
 +/**
 + * is_zero_page - Query if a page is a zero page
 + * @page: The page to query
 + *
 + * This returns true if @page is one of the permanent zero pages.
 + */
 +static inline bool is_zero_page(const struct page *page)
 +{
 +      return is_zero_pfn(page_to_pfn(page));
 +}
 +
 +/**
 + * is_zero_folio - Query if a folio is a zero page
 + * @folio: The folio to query
 + *
 + * This returns true if @folio is one of the permanent zero pages.
 + */
 +static inline bool is_zero_folio(const struct folio *folio)
 +{
 +      return is_zero_page(&folio->page);
 +}
 +
+ /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */
  #ifdef CONFIG_MIGRATION
- static inline bool is_longterm_pinnable_page(struct page *page)
+ static inline bool folio_is_longterm_pinnable(struct folio *folio)
  {
  #ifdef CONFIG_CMA
-       int mt = get_pageblock_migratetype(page);
+       int mt = folio_migratetype(folio);
  
        if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
                return false;
  #endif
 -      /* The zero page may always be pinned */
 -      if (is_zero_pfn(folio_pfn(folio)))
 +      /* The zero page can be "pinned" but gets special handling. */
-       if (is_zero_page(page))
++      if (is_zero_folio(folio))
                return true;
  
        /* Coherent device memory must always allow eviction. */
Simple merge
Simple merge
Simple merge
@@@ -512,7 -500,13 +509,12 @@@ extern void pm_report_max_hw_sleep(u64 
  
  /* drivers/base/power/wakeup.c */
  extern bool events_check_enabled;
 -extern suspend_state_t pm_suspend_target_state;
  
+ static inline bool pm_suspended_storage(void)
+ {
+       return !gfp_has_io_fs(gfp_allowed_mask);
+ }
  extern bool pm_wakeup_pending(void);
  extern void pm_system_wakeup(void);
  extern void pm_system_cancel_wakeup(void);
Simple merge
diff --cc io_uring/rsrc.c
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc lib/Makefile
Simple merge
diff --cc mm/filemap.c
Simple merge
diff --cc mm/gup.c
+++ b/mm/gup.c
@@@ -127,62 -132,50 +133,57 @@@ struct folio *try_grab_folio(struct pag
        if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
                return NULL;
  
 -      folio = try_get_folio(page, refs);
 -
        if (flags & FOLL_GET)
 -              return folio;
 +              return try_get_folio(page, refs);
-       else if (flags & FOLL_PIN) {
-               struct folio *folio;
  
-               /*
-                * Don't take a pin on the zero page - it's not going anywhere
-                * and it is used in a *lot* of places.
-                */
-               if (is_zero_page(page))
-                       return page_folio(page);
+       /* FOLL_PIN is set */
 +
-               /*
-                * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
-                * right zone, so fail and let the caller fall back to the slow
-                * path.
-                */
-               if (unlikely((flags & FOLL_LONGTERM) &&
-                            !is_longterm_pinnable_page(page)))
-                       return NULL;
++      /*
++       * Don't take a pin on the zero page - it's not going anywhere
++       * and it is used in a *lot* of places.
++       */
++      if (is_zero_page(page))
++              return page_folio(page);
 +
-               /*
-                * CAUTION: Don't use compound_head() on the page before this
-                * point, the result won't be stable.
-                */
-               folio = try_get_folio(page, refs);
-               if (!folio)
-                       return NULL;
++      folio = try_get_folio(page, refs);
+       if (!folio)
+               return NULL;
  
-               /*
-                * When pinning a large folio, use an exact count to track it.
-                *
-                * However, be sure to *also* increment the normal folio
-                * refcount field at least once, so that the folio really
-                * is pinned.  That's why the refcount from the earlier
-                * try_get_folio() is left intact.
-                */
-               if (folio_test_large(folio))
-                       atomic_add(refs, &folio->_pincount);
-               else
-                       folio_ref_add(folio,
-                                       refs * (GUP_PIN_COUNTING_BIAS - 1));
-               /*
-                * Adjust the pincount before re-checking the PTE for changes.
-                * This is essentially a smp_mb() and is paired with a memory
-                * barrier in page_try_share_anon_rmap().
-                */
-               smp_mb__after_atomic();
+       /*
+        * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+        * right zone, so fail and let the caller fall back to the slow
+        * path.
+        */
+       if (unlikely((flags & FOLL_LONGTERM) &&
+                    !folio_is_longterm_pinnable(folio))) {
+               if (!put_devmap_managed_page_refs(&folio->page, refs))
+                       folio_put_refs(folio, refs);
+               return NULL;
+       }
  
-               node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+       /*
+        * When pinning a large folio, use an exact count to track it.
+        *
+        * However, be sure to *also* increment the normal folio
+        * refcount field at least once, so that the folio really
+        * is pinned.  That's why the refcount from the earlier
+        * try_get_folio() is left intact.
+        */
+       if (folio_test_large(folio))
+               atomic_add(refs, &folio->_pincount);
+       else
+               folio_ref_add(folio,
+                               refs * (GUP_PIN_COUNTING_BIAS - 1));
+       /*
+        * Adjust the pincount before re-checking the PTE for changes.
+        * This is essentially a smp_mb() and is paired with a memory
+        * barrier in page_try_share_anon_rmap().
+        */
+       smp_mb__after_atomic();
  
-               return folio;
-       }
+       node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
  
-       WARN_ON_ONCE(1);
-       return NULL;
+       return folio;
  }
  
  static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
@@@ -3193,13 -3250,9 +3300,12 @@@ EXPORT_SYMBOL(pin_user_pages_remote)
   *
   * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
   * see Documentation/core-api/pin_user_pages.rst for details.
 + *
 + * Note that if a zero_page is amongst the returned pages, it will not have
 + * pins in it and unpin_user_page*() will not remove pins from it.
   */
  long pin_user_pages(unsigned long start, unsigned long nr_pages,
-                   unsigned int gup_flags, struct page **pages,
-                   struct vm_area_struct **vmas)
+                   unsigned int gup_flags, struct page **pages)
  {
        int locked = 1;
  
diff --cc mm/internal.h
Simple merge
diff --cc mm/memblock.c
Simple merge
diff --cc mm/mm_init.c
Simple merge
diff --cc mm/mmap.c
+++ b/mm/mmap.c
@@@ -2385,15 -2412,26 +2398,30 @@@ do_vmi_align_munmap(struct vma_iterato
                        if (error)
                                goto end_split_failed;
                }
 -              error = munmap_sidetree(next, &mas_detach);
 -              if (error)
 -                      goto munmap_sidetree_failed;
 +              vma_start_write(next);
 +              mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
 +              if (mas_store_gfp(&mas_detach, next, GFP_KERNEL))
 +                      goto munmap_gather_failed;
 +              vma_mark_detached(next, true);
 +              if (next->vm_flags & VM_LOCKED)
 +                      locked_vm += vma_pages(next);
  
                count++;
+               if (unlikely(uf)) {
+                       /*
+                        * If userfaultfd_unmap_prep returns an error the vmas
+                        * will remain split, but userland will get a
+                        * highly unexpected error anyway. This is no
+                        * different than the case where the first of the two
+                        * __split_vma fails, but we don't undo the first
+                        * split, despite we could. This is unlikely enough
+                        * failure that it's not worth optimizing it for.
+                        */
+                       error = userfaultfd_unmap_prep(next, start, end, uf);
+                       if (error)
+                               goto userfaultfd_error;
+               }
  #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
                BUG_ON(next->vm_start < start);
                BUG_ON(next->vm_start > end);
diff --cc mm/mremap.c
Simple merge
diff --cc mm/page_alloc.c
Simple merge
Simple merge
diff --cc mm/shmem.c
Simple merge
diff --cc mm/slab.h
Simple merge
diff --cc mm/swapfile.c
Simple merge
diff --cc mm/vmstat.c
Simple merge
Simple merge
Simple merge