Merge branch 'master' into mm-stable

author akpm <akpm@linux-foundation.org>

Mon, 27 Jun 2022 17:31:34 +0000 (10:31 -0700)

committer akpm <akpm@linux-foundation.org>

Mon, 27 Jun 2022 17:31:34 +0000 (10:31 -0700)
author akpm <akpm@linux-foundation.org>
Mon, 27 Jun 2022 17:31:34 +0000 (10:31 -0700)
committer akpm <akpm@linux-foundation.org>
Mon, 27 Jun 2022 17:31:34 +0000 (10:31 -0700)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst

index 176298f..ad9ba3e 100644 (file)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1433,6 +1433,24 @@ PAGE_SIZE multiple when read back.
           workingset_nodereclaim
                 Number of times a shadow node has been reclaimed
  
+         pgscan (npn)
+               Amount of scanned pages (in an inactive LRU list)
+
+         pgsteal (npn)
+               Amount of reclaimed pages
+
+         pgscan_kswapd (npn)
+               Amount of scanned pages by kswapd (in an inactive LRU list)
+
+         pgscan_direct (npn)
+               Amount of scanned pages directly (in an inactive LRU list)
+
+         pgsteal_kswapd (npn)
+               Amount of reclaimed pages by kswapd
+
+         pgsteal_direct (npn)
+               Amount of reclaimed pages directly
+
           pgfault (npn)
                 Total number of page faults incurred
  
@@ -1442,12 +1460,6 @@ PAGE_SIZE multiple when read back.
           pgrefill (npn)
                 Amount of scanned pages (in an active LRU list)
  
-         pgscan (npn)
-               Amount of scanned pages (in an inactive LRU list)
-
-         pgsteal (npn)
-               Amount of reclaimed pages
-
           pgactivate (npn)
                 Amount of pages moved to the active LRU list
  
diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst

index 1c935f4..5483fd3 100644 (file)
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -174,7 +174,6 @@ mapping:
  
  - ``kmemleak_alloc_phys``
  - ``kmemleak_free_part_phys``
-- ``kmemleak_not_leak_phys``
  - ``kmemleak_ignore_phys``
  
  Dealing with false positives/negatives
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c

index ec20c10..ef427a6 100644 (file)
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -155,6 +155,10 @@ retry:
         if (fault_signal_pending(fault, regs))
                 return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_OOM)
                         goto out_of_memory;
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c

index dad27e4..5ca59a4 100644 (file)
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -146,6 +146,10 @@ retry:
                 return;
         }
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         /*
          * Fault retry nuances, mmap_lock already relinquished by core mm
          */
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c

index a062e07..46cccd6 100644 (file)
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -322,6 +322,10 @@ retry:
                 return 0;
         }
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return 0;
+
         if (!(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_RETRY) {
                         flags |= FAULT_FLAG_TRIED;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c

index c5e1176..de166cd 100644 (file)
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -608,6 +608,10 @@ retry:
                 return 0;
         }
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return 0;
+
         if (fault & VM_FAULT_RETRY) {
                 mm_flags |= FAULT_FLAG_TRIED;
                 goto retry;
diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c

index 7215a46..e15f736 100644 (file)
--- a/arch/csky/mm/fault.c
+++ b/arch/csky/mm/fault.c
@@ -285,6 +285,10 @@ good_area:
                 return;
         }
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) {
                 flags |= FAULT_FLAG_TRIED;
  
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c

index 4fac4b9..f73c7cb 100644 (file)
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -96,6 +96,10 @@ good_area:
         if (fault_signal_pending(fault, regs))
                 return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         /* The most common case -- we are done. */
         if (likely(!(fault & VM_FAULT_ERROR))) {
                 if (fault & VM_FAULT_RETRY) {
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c

index 07379d1..ef78c2d 100644 (file)
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -139,6 +139,10 @@ retry:
         if (fault_signal_pending(fault, regs))
                 return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 /*
                  * We ran out of memory, or some other thing happened
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c

index 71aa9f6..4d2837e 100644 (file)
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -141,6 +141,10 @@ good_area:
         if (fault_signal_pending(fault, regs))
                 return 0;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return 0;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_OOM)
                         goto out_of_memory;
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c

index a9626e6..5c40c3e 100644 (file)
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -222,6 +222,10 @@ good_area:
         if (fault_signal_pending(fault, regs))
                 return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_OOM)
                         goto out_of_memory;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c

index b08bc55..a27045f 100644 (file)
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -162,6 +162,10 @@ good_area:
                 return;
         }
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_OOM)
                         goto out_of_memory;
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c

index a32f14c..edaca0a 100644 (file)
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -139,6 +139,10 @@ good_area:
         if (fault_signal_pending(fault, regs))
                 return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_OOM)
                         goto out_of_memory;
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c

index 53b760a..b4762d6 100644 (file)
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -165,6 +165,10 @@ good_area:
         if (fault_signal_pending(fault, regs))
                 return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_OOM)
                         goto out_of_memory;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c

index 84bc437..9ad80d4 100644 (file)
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -311,6 +311,10 @@ good_area:
         if (fault_signal_pending(fault, regs))
                 return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 /*
                  * We hit a shared mapping outside of the file, or some
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c

index c1cb21a..7c507fb 100644 (file)
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -65,6 +65,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
  
         ret = 0;
         *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL);
+
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (*flt & VM_FAULT_COMPLETED)
+               return 0;
+
         if (unlikely(*flt & VM_FAULT_ERROR)) {
                 if (*flt & VM_FAULT_OOM) {
                         ret = -ENOMEM;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c

index d53fed4..0140054 100644 (file)
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -511,6 +511,10 @@ retry:
         if (fault_signal_pending(fault, regs))
                 return user_mode(regs) ? 0 : SIGBUS;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               goto out;
+
         /*
          * Handle the retry right now, the mmap_lock has been released in that
          * case.
@@ -525,6 +529,7 @@ retry:
         if (unlikely(fault & VM_FAULT_ERROR))
                 return mm_fault_error(regs, address, fault);
  
+out:
         /*
          * Major/minor page fault accounting.
          */
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c

index 40694f0..f2fbd14 100644 (file)
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -326,6 +326,10 @@ good_area:
         if (fault_signal_pending(fault, regs))
                 return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_RETRY)) {
                 flags |= FAULT_FLAG_TRIED;
  
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c

index e173b61..973dcd0 100644 (file)
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -433,6 +433,17 @@ retry:
                         goto out_up;
                 goto out;
         }
+
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED) {
+               if (gmap) {
+                       mmap_read_lock(mm);
+                       goto out_gmap;
+               }
+               fault = 0;
+               goto out;
+       }
+
         if (unlikely(fault & VM_FAULT_ERROR))
                 goto out_up;
  
@@ -452,6 +463,7 @@ retry:
                 mmap_read_lock(mm);
                 goto retry;
         }
+out_gmap:
         if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
                 address =  __gmap_link(gmap, current->thread.gmap_addr,
                                        address);
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c

index e175667..acd2f5e 100644 (file)
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -485,6 +485,10 @@ good_area:
                 if (mm_fault_error(regs, error_code, address, fault))
                         return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (fault & VM_FAULT_RETRY) {
                 flags |= FAULT_FLAG_TRIED;
  
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c

index ad569d9..91259f2 100644 (file)
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -190,6 +190,10 @@ good_area:
         if (fault_signal_pending(fault, regs))
                 return;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_OOM)
                         goto out_of_memory;
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c

index 253e070..4acc12e 100644 (file)
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -427,6 +427,10 @@ good_area:
         if (fault_signal_pending(fault, regs))
                 goto exit_exception;
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               goto lock_released;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_OOM)
                         goto out_of_memory;
@@ -449,6 +453,7 @@ good_area:
         }
         mmap_read_unlock(mm);
  
+lock_released:
         mm_rss = get_mm_rss(mm);
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
         mm_rss -= (mm->context.thp_pte_count * (HPAGE_SIZE / PAGE_SIZE));
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c

index d1d5d0b..d3ce21c 100644 (file)
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -76,6 +76,10 @@ good_area:
                 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
                         goto out_nosemaphore;
  
+               /* The fault is fully completed (including releasing mmap lock) */
+               if (fault & VM_FAULT_COMPLETED)
+                       return 0;
+
                 if (unlikely(fault & VM_FAULT_ERROR)) {
                         if (fault & VM_FAULT_OOM) {
                                 goto out_of_memory;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c

index fad8faa..fe10c6d 100644 (file)
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1408,6 +1408,10 @@ good_area:
                 return;
         }
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         /*
          * If we need to retry the mmap_lock has already been released,
          * and if there is a fatal signal pending there is no guarantee
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c

index a0d023c..509408d 100644 (file)
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -19,44 +19,6 @@
  #include <asm/tlbflush.h>
  #include <asm/elf.h>
  
-#if 0  /* This is just for testing */
-struct page *
-follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
-{
-       unsigned long start = address;
-       int length = 1;
-       int nr;
-       struct page *page;
-       struct vm_area_struct *vma;
-
-       vma = find_vma(mm, addr);
-       if (!vma || !is_vm_hugetlb_page(vma))
-               return ERR_PTR(-EINVAL);
-
-       pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
-
-       /* hugetlb should be locked, and hence, prefaulted */
-       WARN_ON(!pte || pte_none(*pte));
-
-       page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
-       WARN_ON(!PageHead(page));
-
-       return page;
-}
-
-int pmd_huge(pmd_t pmd)
-{
-       return 0;
-}
-
-int pud_huge(pud_t pud)
-{
-       return 0;
-}
-
-#else
-
  /*
   * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
   * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
@@ -72,7 +34,6 @@ int pud_huge(pud_t pud)
  {
         return !!(pud_val(pud) & _PAGE_PSE);
  }
-#endif
  
  #ifdef CONFIG_HUGETLB_PAGE
  static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c

index 16f0a5f..8c781b0 100644 (file)
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -172,6 +172,10 @@ good_area:
                 return;
         }
  
+       /* The fault is fully completed (including releasing mmap lock) */
+       if (fault & VM_FAULT_COMPLETED)
+               return;
+
         if (unlikely(fault & VM_FAULT_ERROR)) {
                 if (fault & VM_FAULT_OOM)
                         goto out_of_memory;
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c

index a8f5b65..2c677e8 100644 (file)
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -529,7 +529,7 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
                         pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
                                 uname, &base, (unsigned long)(size / SZ_1M));
                         if (!nomap)
-                               kmemleak_alloc_phys(base, size, 0, 0);
+                               kmemleak_alloc_phys(base, size, 0);
                 }
                 else
                         pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
diff --git a/include/linux/damon.h b/include/linux/damon.h

index 7c62da3..2765c7d 100644 (file)
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -397,7 +397,6 @@ struct damon_callback {
   * detail.
   *
   * @kdamond:           Kernel thread who does the monitoring.
- * @kdamond_stop:      Notifies whether kdamond should stop.
   * @kdamond_lock:      Mutex for the synchronizations with @kdamond.
   *
   * For each monitoring context, one kernel thread for the monitoring is
@@ -406,14 +405,14 @@ struct damon_callback {
   * Once started, the monitoring thread runs until explicitly required to be
   * terminated or every monitoring target is invalid.  The validity of the
   * targets is checked via the &damon_operations.target_valid of @ops.  The
- * termination can also be explicitly requested by writing non-zero to
- * @kdamond_stop.  The thread sets @kdamond to NULL when it terminates.
- * Therefore, users can know whether the monitoring is ongoing or terminated by
- * reading @kdamond.  Reads and writes to @kdamond and @kdamond_stop from
- * outside of the monitoring thread must be protected by @kdamond_lock.
- *
- * Note that the monitoring thread protects only @kdamond and @kdamond_stop via
- * @kdamond_lock.  Accesses to other fields must be protected by themselves.
+ * termination can also be explicitly requested by calling damon_stop().
+ * The thread sets @kdamond to NULL when it terminates. Therefore, users can
+ * know whether the monitoring is ongoing or terminated by reading @kdamond.
+ * Reads and writes to @kdamond from outside of the monitoring thread must
+ * be protected by @kdamond_lock.
+ *
+ * Note that the monitoring thread protects only @kdamond via @kdamond_lock.
+ * Accesses to other fields must be protected by themselves.
   *
   * @ops:       Set of monitoring operations for given use cases.
   * @callback:  Set of callbacks for monitoring events notifications.
diff --git a/include/linux/highmem.h b/include/linux/highmem.h

index 3af34de..fee9835 100644 (file)
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -336,19 +336,6 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off,
         kunmap_local(dst);
  }
  
-static inline void memmove_page(struct page *dst_page, size_t dst_off,
-                              struct page *src_page, size_t src_off,
-                              size_t len)
-{
-       char *dst = kmap_local_page(dst_page);
-       char *src = kmap_local_page(src_page);
-
-       VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
-       memmove(dst + dst_off, src + src_off, len);
-       kunmap_local(src);
-       kunmap_local(dst);
-}
-
  static inline void memset_page(struct page *page, size_t offset, int val,
                                size_t len)
  {
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h

index 34684b2..6a3cd1b 100644 (file)
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -29,10 +29,9 @@ extern void kmemleak_not_leak(const void *ptr) __ref;
  extern void kmemleak_ignore(const void *ptr) __ref;
  extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
  extern void kmemleak_no_scan(const void *ptr) __ref;
-extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
+extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
                                 gfp_t gfp) __ref;
  extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref;
-extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref;
  extern void kmemleak_ignore_phys(phys_addr_t phys) __ref;
  
  static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
@@ -107,15 +106,12 @@ static inline void kmemleak_no_scan(const void *ptr)
  {
  }
  static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
-                                      int min_count, gfp_t gfp)
+                                      gfp_t gfp)
  {
  }
  static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
  {
  }
-static inline void kmemleak_not_leak_phys(phys_addr_t phys)
-{
-}
  static inline void kmemleak_ignore_phys(phys_addr_t phys)
  {
  }
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index 9ecead1..04f2f33 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1740,6 +1740,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
  }
  
  struct mem_cgroup *mem_cgroup_from_obj(void *p);
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
  
  static inline void count_objcg_event(struct obj_cgroup *objcg,
                                      enum vm_event_item idx)
@@ -1755,6 +1756,42 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
         rcu_read_unlock();
  }
  
+/**
+ * get_mem_cgroup_from_obj - get a memcg associated with passed kernel object.
+ * @p: pointer to object from which memcg should be extracted. It can be NULL.
+ *
+ * Retrieves the memory group into which the memory of the pointed kernel
+ * object is accounted. If memcg is found, its reference is taken.
+ * If a passed kernel object is uncharged, or if proper memcg cannot be found,
+ * as well as if mem_cgroup is disabled, NULL is returned.
+ *
+ * Return: valid memcg pointer with taken reference or NULL.
+ */
+static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p)
+{
+       struct mem_cgroup *memcg;
+
+       rcu_read_lock();
+       do {
+               memcg = mem_cgroup_from_obj(p);
+       } while (memcg && !css_tryget(&memcg->css));
+       rcu_read_unlock();
+       return memcg;
+}
+
+/**
+ * mem_cgroup_or_root - always returns a pointer to a valid memory cgroup.
+ * @memcg: pointer to a valid memory cgroup or NULL.
+ *
+ * If passed argument is not NULL, returns it without any additional checks
+ * and changes. Otherwise, root_mem_cgroup is returned.
+ *
+ * NOTE: root_mem_cgroup can be NULL during early boot.
+ */
+static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
+{
+       return memcg ? memcg : root_mem_cgroup;
+}
  #else
  static inline bool mem_cgroup_kmem_disabled(void)
  {
@@ -1798,7 +1835,12 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
  
  static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
  {
-       return NULL;
+       return NULL;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+       return NULL;
  }
  
  static inline void count_objcg_event(struct obj_cgroup *objcg,
@@ -1806,6 +1848,15 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
  {
  }
  
+static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p)
+{
+       return NULL;
+}
+
+static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg)
+{
+       return NULL;
+}
  #endif /* CONFIG_MEMCG_KMEM */
  
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index c29ab4c..6b961a2 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -729,6 +729,7 @@ typedef __bitwise unsigned int vm_fault_t;
   * @VM_FAULT_NEEDDSYNC:                ->fault did not modify page tables and needs
   *                             fsync() to complete (for synchronous page faults
   *                             in DAX)
+ * @VM_FAULT_COMPLETED:                ->fault completed, meanwhile mmap lock released
   * @VM_FAULT_HINDEX_MASK:      mask HINDEX value
   *
   */
@@ -746,6 +747,7 @@ enum vm_fault_reason {
         VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
         VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
         VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
+       VM_FAULT_COMPLETED      = (__force vm_fault_t)0x004000,
         VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
  };
  
diff --git a/lib/test_hmm.c b/lib/test_hmm.c

index cfe6320..f2c3015 100644 (file)
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -732,7 +732,7 @@ static int dmirror_exclusive(struct dmirror *dmirror,
  
         mmap_read_lock(mm);
         for (addr = start; addr < end; addr = next) {
-               unsigned long mapped;
+               unsigned long mapped = 0;
                 int i;
  
                 if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
@@ -741,7 +741,13 @@ static int dmirror_exclusive(struct dmirror *dmirror,
                         next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);
  
                 ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
-               mapped = dmirror_atomic_map(addr, next, pages, dmirror);
+               /*
+                * Do dmirror_atomic_map() iff all pages are marked for
+                * exclusive access to avoid accessing uninitialized
+                * fields of pages.
+                */
+               if (ret == (next - addr) >> PAGE_SHIFT)
+                       mapped = dmirror_atomic_map(addr, next, pages, dmirror);
                 for (i = 0; i < ret; i++) {
                         if (pages[i]) {
                                 unlock_page(pages[i]);
diff --git a/mm/gup.c b/mm/gup.c

index 5512644..407a81d 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -951,6 +951,25 @@ static int faultin_page(struct vm_area_struct *vma,
         }
  
         ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+       if (ret & VM_FAULT_COMPLETED) {
+               /*
+                * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
+                * mmap lock in the page fault handler. Sanity check this.
+                */
+               WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
+               if (locked)
+                       *locked = 0;
+               /*
+                * We should do the same as VM_FAULT_RETRY, but let's not
+                * return -EBUSY since that's not reflecting the reality of
+                * what has happened - we've just fully completed a page
+                * fault, with the mmap lock released.  Use -EAGAIN to show
+                * that we want to take the mmap lock _again_.
+                */
+               return -EAGAIN;
+       }
+
         if (ret & VM_FAULT_ERROR) {
                 int err = vm_fault_to_errno(ret, *flags);
  
@@ -1177,6 +1196,7 @@ retry:
                         case 0:
                                 goto retry;
                         case -EBUSY:
+                       case -EAGAIN:
                                 ret = 0;
                                 fallthrough;
                         case -EFAULT:
@@ -1303,6 +1323,18 @@ retry:
                 return -EINTR;
  
         ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+       if (ret & VM_FAULT_COMPLETED) {
+               /*
+                * NOTE: it's a pity that we need to retake the lock here
+                * to pair with the unlock() in the callers. Ideally we
+                * could tell the callers so they do not need to unlock.
+                */
+               mmap_read_lock(mm);
+               *unlocked = true;
+               return 0;
+       }
+
         if (ret & VM_FAULT_ERROR) {
                 int err = vm_fault_to_errno(ret, 0);
  
@@ -1368,7 +1400,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
                         /* VM_FAULT_RETRY couldn't trigger, bypass */
                         return ret;
  
-               /* VM_FAULT_RETRY cannot return errors */
+               /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
                 if (!*locked) {
                         BUG_ON(ret < 0);
                         BUG_ON(ret >= nr_pages);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c

index a182f5d..1eddc01 100644 (file)
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -14,14 +14,16 @@
   * The following locks and mutexes are used by kmemleak:
   *
   * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
- *   accesses to the object_tree_root. The object_list is the main list
- *   holding the metadata (struct kmemleak_object) for the allocated memory
- *   blocks. The object_tree_root is a red black tree used to look-up
- *   metadata based on a pointer to the corresponding memory block.  The
- *   kmemleak_object structures are added to the object_list and
- *   object_tree_root in the create_object() function called from the
- *   kmemleak_alloc() callback and removed in delete_object() called from the
- *   kmemleak_free() callback
+ *   accesses to the object_tree_root (or object_phys_tree_root). The
+ *   object_list is the main list holding the metadata (struct kmemleak_object)
+ *   for the allocated memory blocks. The object_tree_root and object_phys_tree_root
+ *   are red black trees used to look-up metadata based on a pointer to the
+ *   corresponding memory block. The object_phys_tree_root is for objects
+ *   allocated with physical address. The kmemleak_object structures are
+ *   added to the object_list and object_tree_root (or object_phys_tree_root)
+ *   in the create_object() function called from the kmemleak_alloc() (or
+ *   kmemleak_alloc_phys()) callback and removed in delete_object() called from
+ *   the kmemleak_free() callback
   * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
   *   Accesses to the metadata (e.g. count) are protected by this lock. Note
   *   that some members of this structure may be protected by other means
@@ -172,6 +174,8 @@ struct kmemleak_object {
  #define OBJECT_NO_SCAN         (1 << 2)
  /* flag set to fully scan the object when scan_area allocation failed */
  #define OBJECT_FULL_SCAN       (1 << 3)
+/* flag set for object allocated with physical address */
+#define OBJECT_PHYS            (1 << 4)
  
  #define HEX_PREFIX             "    "
  /* number of bytes to print per line; must be 16 or 32 */
@@ -193,7 +197,9 @@ static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
  static LIST_HEAD(mem_pool_free_list);
  /* search tree for object boundaries */
  static struct rb_root object_tree_root = RB_ROOT;
-/* protecting the access to object_list and object_tree_root */
+/* search tree for object (with OBJECT_PHYS flag) boundaries */
+static struct rb_root object_phys_tree_root = RB_ROOT;
+/* protecting the access to object_list, object_tree_root (or object_phys_tree_root) */
  static DEFINE_RAW_SPINLOCK(kmemleak_lock);
  
  /* allocation caches for kmemleak internal data */
@@ -285,6 +291,9 @@ static void hex_dump_object(struct seq_file *seq,
         const u8 *ptr = (const u8 *)object->pointer;
         size_t len;
  
+       if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+               return;
+
         /* limit the number of lines to HEX_MAX_LINES */
         len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
  
@@ -378,9 +387,11 @@ static void dump_object_info(struct kmemleak_object *object)
   * beginning of the memory block are allowed. The kmemleak_lock must be held
   * when calling this function.
   */
-static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
+                                              bool is_phys)
  {
-       struct rb_node *rb = object_tree_root.rb_node;
+       struct rb_node *rb = is_phys ? object_phys_tree_root.rb_node :
+                            object_tree_root.rb_node;
         unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
  
         while (rb) {
@@ -406,6 +417,12 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
         return NULL;
  }
  
+/* Look up a kmemleak object which was allocated with a virtual address. */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+       return __lookup_object(ptr, alias, false);
+}
+
  /*
   * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
   * that once an object's use_count reached 0, the RCU freeing was already
@@ -515,14 +532,15 @@ static void put_object(struct kmemleak_object *object)
  /*
   * Look up an object in the object search tree and increase its use_count.
   */
-static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alias,
+                                                    bool is_phys)
  {
         unsigned long flags;
         struct kmemleak_object *object;
  
         rcu_read_lock();
         raw_spin_lock_irqsave(&kmemleak_lock, flags);
-       object = lookup_object(ptr, alias);
+       object = __lookup_object(ptr, alias, is_phys);
         raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
  
         /* check whether the object is still available */
@@ -533,28 +551,39 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
         return object;
  }
  
+/* Look up and get an object which was allocated with a virtual address. */
+static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+{
+       return __find_and_get_object(ptr, alias, false);
+}
+
  /*
- * Remove an object from the object_tree_root and object_list. Must be called
- * with the kmemleak_lock held _if_ kmemleak is still enabled.
+ * Remove an object from the object_tree_root (or object_phys_tree_root)
+ * and object_list. Must be called with the kmemleak_lock held _if_ kmemleak
+ * is still enabled.
   */
  static void __remove_object(struct kmemleak_object *object)
  {
-       rb_erase(&object->rb_node, &object_tree_root);
+       rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ?
+                                  &object_phys_tree_root :
+                                  &object_tree_root);
         list_del_rcu(&object->object_list);
  }
  
  /*
   * Look up an object in the object search tree and remove it from both
- * object_tree_root and object_list. The returned object's use_count should be
- * at least 1, as initially set by create_object().
+ * object_tree_root (or object_phys_tree_root) and object_list. The
+ * returned object's use_count should be at least 1, as initially set
+ * by create_object().
   */
-static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
+static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias,
+                                                     bool is_phys)
  {
         unsigned long flags;
         struct kmemleak_object *object;
  
         raw_spin_lock_irqsave(&kmemleak_lock, flags);
-       object = lookup_object(ptr, alias);
+       object = __lookup_object(ptr, alias, is_phys);
         if (object)
                 __remove_object(object);
         raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
@@ -572,10 +601,12 @@ static int __save_stack_trace(unsigned long *trace)
  
  /*
   * Create the metadata (struct kmemleak_object) corresponding to an allocated
- * memory block and add it to the object_list and object_tree_root.
+ * memory block and add it to the object_list and object_tree_root (or
+ * object_phys_tree_root).
   */
-static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
-                                            int min_count, gfp_t gfp)
+static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
+                                            int min_count, gfp_t gfp,
+                                            bool is_phys)
  {
         unsigned long flags;
         struct kmemleak_object *object, *parent;
@@ -595,7 +626,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
         INIT_HLIST_HEAD(&object->area_list);
         raw_spin_lock_init(&object->lock);
         atomic_set(&object->use_count, 1);
-       object->flags = OBJECT_ALLOCATED;
+       object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0);
         object->pointer = ptr;
         object->size = kfence_ksize((void *)ptr) ?: size;
         object->excess_ref = 0;
@@ -628,9 +659,16 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
         raw_spin_lock_irqsave(&kmemleak_lock, flags);
  
         untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
-       min_addr = min(min_addr, untagged_ptr);
-       max_addr = max(max_addr, untagged_ptr + size);
-       link = &object_tree_root.rb_node;
+       /*
+        * Only update min_addr and max_addr for objects
+        * storing a virtual address.
+        */
+       if (!is_phys) {
+               min_addr = min(min_addr, untagged_ptr);
+               max_addr = max(max_addr, untagged_ptr + size);
+       }
+       link = is_phys ? &object_phys_tree_root.rb_node :
+               &object_tree_root.rb_node;
         rb_parent = NULL;
         while (*link) {
                 rb_parent = *link;
@@ -654,7 +692,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
                 }
         }
         rb_link_node(&object->rb_node, rb_parent, link);
-       rb_insert_color(&object->rb_node, &object_tree_root);
+       rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root :
+                                         &object_tree_root);
  
         list_add_tail_rcu(&object->object_list, &object_list);
  out:
@@ -662,6 +701,20 @@ out:
         return object;
  }
  
+/* Create a kmemleak object which was allocated with a virtual address. */
+static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
+                                            int min_count, gfp_t gfp)
+{
+       return __create_object(ptr, size, min_count, gfp, false);
+}
+
+/* Create a kmemleak object which was allocated with a physical address. */
+static struct kmemleak_object *create_object_phys(unsigned long ptr, size_t size,
+                                            int min_count, gfp_t gfp)
+{
+       return __create_object(ptr, size, min_count, gfp, true);
+}
+
  /*
   * Mark the object as not allocated and schedule RCU freeing via put_object().
   */
@@ -690,7 +743,7 @@ static void delete_object_full(unsigned long ptr)
  {
         struct kmemleak_object *object;
  
-       object = find_and_remove_object(ptr, 0);
+       object = find_and_remove_object(ptr, 0, false);
         if (!object) {
  #ifdef DEBUG
                 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
@@ -706,12 +759,12 @@ static void delete_object_full(unsigned long ptr)
   * delete it. If the memory block is partially freed, the function may create
   * additional metadata for the remaining parts of the block.
   */
-static void delete_object_part(unsigned long ptr, size_t size)
+static void delete_object_part(unsigned long ptr, size_t size, bool is_phys)
  {
         struct kmemleak_object *object;
         unsigned long start, end;
  
-       object = find_and_remove_object(ptr, 1);
+       object = find_and_remove_object(ptr, 1, is_phys);
         if (!object) {
  #ifdef DEBUG
                 kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
@@ -728,11 +781,11 @@ static void delete_object_part(unsigned long ptr, size_t size)
         start = object->pointer;
         end = object->pointer + object->size;
         if (ptr > start)
-               create_object(start, ptr - start, object->min_count,
-                             GFP_KERNEL);
+               __create_object(start, ptr - start, object->min_count,
+                             GFP_KERNEL, is_phys);
         if (ptr + size < end)
-               create_object(ptr + size, end - ptr - size, object->min_count,
-                             GFP_KERNEL);
+               __create_object(ptr + size, end - ptr - size, object->min_count,
+                             GFP_KERNEL, is_phys);
  
         __delete_object(object);
  }
@@ -753,11 +806,11 @@ static void paint_it(struct kmemleak_object *object, int color)
         raw_spin_unlock_irqrestore(&object->lock, flags);
  }
  
-static void paint_ptr(unsigned long ptr, int color)
+static void paint_ptr(unsigned long ptr, int color, bool is_phys)
  {
         struct kmemleak_object *object;
  
-       object = find_and_get_object(ptr, 0);
+       object = __find_and_get_object(ptr, 0, is_phys);
         if (!object) {
                 kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
                               ptr,
@@ -775,16 +828,16 @@ static void paint_ptr(unsigned long ptr, int color)
   */
  static void make_gray_object(unsigned long ptr)
  {
-       paint_ptr(ptr, KMEMLEAK_GREY);
+       paint_ptr(ptr, KMEMLEAK_GREY, false);
  }
  
  /*
   * Mark the object as black-colored so that it is ignored from scans and
   * reporting.
   */
-static void make_black_object(unsigned long ptr)
+static void make_black_object(unsigned long ptr, bool is_phys)
  {
-       paint_ptr(ptr, KMEMLEAK_BLACK);
+       paint_ptr(ptr, KMEMLEAK_BLACK, is_phys);
  }
  
  /*
@@ -990,7 +1043,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
         pr_debug("%s(0x%p)\n", __func__, ptr);
  
         if (kmemleak_enabled && ptr && !IS_ERR(ptr))
-               delete_object_part((unsigned long)ptr, size);
+               delete_object_part((unsigned long)ptr, size, false);
  }
  EXPORT_SYMBOL_GPL(kmemleak_free_part);
  
@@ -1078,7 +1131,7 @@ void __ref kmemleak_ignore(const void *ptr)
         pr_debug("%s(0x%p)\n", __func__, ptr);
  
         if (kmemleak_enabled && ptr && !IS_ERR(ptr))
-               make_black_object((unsigned long)ptr);
+               make_black_object((unsigned long)ptr, false);
  }
  EXPORT_SYMBOL(kmemleak_ignore);
  
@@ -1125,15 +1178,18 @@ EXPORT_SYMBOL(kmemleak_no_scan);
   *                      address argument
   * @phys:      physical address of the object
   * @size:      size of the object
- * @min_count: minimum number of references to this object.
- *              See kmemleak_alloc()
   * @gfp:       kmalloc() flags used for kmemleak internal memory allocations
   */
-void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
-                              gfp_t gfp)
+void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp)
  {
-       if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
-               kmemleak_alloc(__va(phys), size, min_count, gfp);
+       pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size);
+
+       if (kmemleak_enabled)
+               /*
+                * Create object with OBJECT_PHYS flag and
+                * assume min_count 0.
+                */
+               create_object_phys((unsigned long)phys, size, 0, gfp);
  }
  EXPORT_SYMBOL(kmemleak_alloc_phys);
  
@@ -1146,22 +1202,12 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
   */
  void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
  {
-       if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
-               kmemleak_free_part(__va(phys), size);
-}
-EXPORT_SYMBOL(kmemleak_free_part_phys);
+       pr_debug("%s(0x%pa)\n", __func__, &phys);
  
-/**
- * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical
- *                         address argument
- * @phys:      physical address of the object
- */
-void __ref kmemleak_not_leak_phys(phys_addr_t phys)
-{
-       if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
-               kmemleak_not_leak(__va(phys));
+       if (kmemleak_enabled)
+               delete_object_part((unsigned long)phys, size, true);
  }
-EXPORT_SYMBOL(kmemleak_not_leak_phys);
+EXPORT_SYMBOL(kmemleak_free_part_phys);
  
  /**
   * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
@@ -1170,8 +1216,10 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys);
   */
  void __ref kmemleak_ignore_phys(phys_addr_t phys)
  {
-       if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
-               kmemleak_ignore(__va(phys));
+       pr_debug("%s(0x%pa)\n", __func__, &phys);
+
+       if (kmemleak_enabled)
+               make_black_object((unsigned long)phys, true);
  }
  EXPORT_SYMBOL(kmemleak_ignore_phys);
  
@@ -1182,6 +1230,9 @@ static bool update_checksum(struct kmemleak_object *object)
  {
         u32 old_csum = object->checksum;
  
+       if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+               return false;
+
         kasan_disable_current();
         kcsan_disable_current();
         object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size);
@@ -1335,6 +1386,7 @@ static void scan_object(struct kmemleak_object *object)
  {
         struct kmemleak_scan_area *area;
         unsigned long flags;
+       void *obj_ptr;
  
         /*
          * Once the object->lock is acquired, the corresponding memory block
@@ -1346,10 +1398,15 @@ static void scan_object(struct kmemleak_object *object)
         if (!(object->flags & OBJECT_ALLOCATED))
                 /* already freed object */
                 goto out;
+
+       obj_ptr = object->flags & OBJECT_PHYS ?
+                 __va((phys_addr_t)object->pointer) :
+                 (void *)object->pointer;
+
         if (hlist_empty(&object->area_list) ||
             object->flags & OBJECT_FULL_SCAN) {
-               void *start = (void *)object->pointer;
-               void *end = (void *)(object->pointer + object->size);
+               void *start = obj_ptr;
+               void *end = obj_ptr + object->size;
                 void *next;
  
                 do {
@@ -1413,18 +1470,21 @@ static void scan_gray_list(void)
   */
  static void kmemleak_scan(void)
  {
-       unsigned long flags;
         struct kmemleak_object *object;
         struct zone *zone;
         int __maybe_unused i;
         int new_leaks = 0;
+       int loop1_cnt = 0;
  
         jiffies_last_scan = jiffies;
  
         /* prepare the kmemleak_object's */
         rcu_read_lock();
         list_for_each_entry_rcu(object, &object_list, object_list) {
-               raw_spin_lock_irqsave(&object->lock, flags);
+               bool obj_pinned = false;
+
+               loop1_cnt++;
+               raw_spin_lock_irq(&object->lock);
  #ifdef DEBUG
                 /*
                  * With a few exceptions there should be a maximum of
@@ -1436,12 +1496,45 @@ static void kmemleak_scan(void)
                         dump_object_info(object);
                 }
  #endif
+
+               /* ignore objects outside lowmem (paint them black) */
+               if ((object->flags & OBJECT_PHYS) &&
+                  !(object->flags & OBJECT_NO_SCAN)) {
+                       unsigned long phys = object->pointer;
+
+                       if (PHYS_PFN(phys) < min_low_pfn ||
+                           PHYS_PFN(phys + object->size) >= max_low_pfn)
+                               __paint_it(object, KMEMLEAK_BLACK);
+               }
+
                 /* reset the reference count (whiten the object) */
                 object->count = 0;
-               if (color_gray(object) && get_object(object))
+               if (color_gray(object) && get_object(object)) {
                         list_add_tail(&object->gray_list, &gray_list);
+                       obj_pinned = true;
+               }
  
-               raw_spin_unlock_irqrestore(&object->lock, flags);
+               raw_spin_unlock_irq(&object->lock);
+
+               /*
+                * Do a cond_resched() every 64k objects to avoid soft lockups.
+                * Make sure a reference has been taken so that the object
+                * won't go away without RCU read lock.
+                */
+               if (!(loop1_cnt & 0xffff)) {
+                       if (!obj_pinned && !get_object(object)) {
+                               /* Try the next object instead */
+                               loop1_cnt--;
+                               continue;
+                       }
+
+                       rcu_read_unlock();
+                       cond_resched();
+                       rcu_read_lock();
+
+                       if (!obj_pinned)
+                               put_object(object);
+               }
         }
         rcu_read_unlock();
  
@@ -1509,14 +1602,21 @@ static void kmemleak_scan(void)
          */
         rcu_read_lock();
         list_for_each_entry_rcu(object, &object_list, object_list) {
-               raw_spin_lock_irqsave(&object->lock, flags);
+               /*
+                * This is racy but we can save the overhead of lock/unlock
+                * calls. The missed objects, if any, should be caught in
+                * the next scan.
+                */
+               if (!color_white(object))
+                       continue;
+               raw_spin_lock_irq(&object->lock);
                 if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
                     && update_checksum(object) && get_object(object)) {
                         /* color it gray temporarily */
                         object->count = object->min_count;
                         list_add_tail(&object->gray_list, &gray_list);
                 }
-               raw_spin_unlock_irqrestore(&object->lock, flags);
+               raw_spin_unlock_irq(&object->lock);
         }
         rcu_read_unlock();
  
@@ -1536,7 +1636,14 @@ static void kmemleak_scan(void)
          */
         rcu_read_lock();
         list_for_each_entry_rcu(object, &object_list, object_list) {
-               raw_spin_lock_irqsave(&object->lock, flags);
+               /*
+                * This is racy but we can save the overhead of lock/unlock
+                * calls. The missed objects, if any, should be caught in
+                * the next scan.
+                */
+               if (!color_white(object))
+                       continue;
+               raw_spin_lock_irq(&object->lock);
                 if (unreferenced_object(object) &&
                     !(object->flags & OBJECT_REPORTED)) {
                         object->flags |= OBJECT_REPORTED;
@@ -1546,7 +1653,7 @@ static void kmemleak_scan(void)
  
                         new_leaks++;
                 }
-               raw_spin_unlock_irqrestore(&object->lock, flags);
+               raw_spin_unlock_irq(&object->lock);
         }
         rcu_read_unlock();
  
@@ -1748,15 +1855,14 @@ static int dump_str_object_info(const char *str)
  static void kmemleak_clear(void)
  {
         struct kmemleak_object *object;
-       unsigned long flags;
  
         rcu_read_lock();
         list_for_each_entry_rcu(object, &object_list, object_list) {
-               raw_spin_lock_irqsave(&object->lock, flags);
+               raw_spin_lock_irq(&object->lock);
                 if ((object->flags & OBJECT_REPORTED) &&
                     unreferenced_object(object))
                         __paint_it(object, KMEMLEAK_GREY);
-               raw_spin_unlock_irqrestore(&object->lock, flags);
+               raw_spin_unlock_irq(&object->lock);
         }
         rcu_read_unlock();
  
diff --git a/mm/list_lru.c b/mm/list_lru.c

index ba76428..a05e5be 100644 (file)
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -71,7 +71,7 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
         if (!list_lru_memcg_aware(lru))
                 goto out;
  
-       memcg = mem_cgroup_from_obj(ptr);
+       memcg = mem_cgroup_from_slab_obj(ptr);
         if (!memcg)
                 goto out;
  
diff --git a/mm/memblock.c b/mm/memblock.c

index e4f03a6..749abd2 100644 (file)
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1345,8 +1345,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
   * from the regions with mirroring enabled and then retried from any
   * memory region.
   *
- * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
- * allocated boot memory block, so that it is never reported as leaks.
+ * In addition, the function calls kmemleak_alloc_phys for the allocated boot
+ * memory block, so that it is never reported as a leak.
   *
   * Return:
   * Physical address of allocated memory block on success, %0 on failure.
@@ -1398,12 +1398,12 @@ done:
          */
         if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
                 /*
-                * The min_count is set to 0 so that memblock allocated
-                * blocks are never reported as leaks. This is because many
-                * of these blocks are only referred via the physical
-                * address which is not looked up by kmemleak.
+                * Memblock allocated blocks are never reported as
+                * leaks. This is because many of these blocks are
+                * only referred via the physical address which is
+                * not looked up by kmemleak.
                  */
-               kmemleak_alloc_phys(found, size, 0, 0);
+               kmemleak_alloc_phys(found, size, 0);
  
         return found;
  }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 618c366..655c093 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -783,7 +783,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
         struct lruvec *lruvec;
  
         rcu_read_lock();
-       memcg = mem_cgroup_from_obj(p);
+       memcg = mem_cgroup_from_slab_obj(p);
  
         /*
          * Untracked pages have no memcg, no lruvec. Update only the
@@ -1460,6 +1460,29 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
         return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
  }
  
+/* Subset of vm_event_item to report for memcg event stats */
+static const unsigned int memcg_vm_event_stat[] = {
+       PGSCAN_KSWAPD,
+       PGSCAN_DIRECT,
+       PGSTEAL_KSWAPD,
+       PGSTEAL_DIRECT,
+       PGFAULT,
+       PGMAJFAULT,
+       PGREFILL,
+       PGACTIVATE,
+       PGDEACTIVATE,
+       PGLAZYFREE,
+       PGLAZYFREED,
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+       ZSWPIN,
+       ZSWPOUT,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       THP_FAULT_ALLOC,
+       THP_COLLAPSE_ALLOC,
+#endif
+};
+
  static char *memory_stat_format(struct mem_cgroup *memcg)
  {
         struct seq_buf s;
@@ -1495,41 +1518,17 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
         }
  
         /* Accumulated memory events */
-
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
-                      memcg_events(memcg, PGFAULT));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
-                      memcg_events(memcg, PGMAJFAULT));
-       seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
-                      memcg_events(memcg, PGREFILL));
         seq_buf_printf(&s, "pgscan %lu\n",
                        memcg_events(memcg, PGSCAN_KSWAPD) +
                        memcg_events(memcg, PGSCAN_DIRECT));
         seq_buf_printf(&s, "pgsteal %lu\n",
                        memcg_events(memcg, PGSTEAL_KSWAPD) +
                        memcg_events(memcg, PGSTEAL_DIRECT));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
-                      memcg_events(memcg, PGACTIVATE));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
-                      memcg_events(memcg, PGDEACTIVATE));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
-                      memcg_events(memcg, PGLAZYFREE));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
-                      memcg_events(memcg, PGLAZYFREED));
  
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN),
-                      memcg_events(memcg, ZSWPIN));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT),
-                      memcg_events(memcg, ZSWPOUT));
-#endif
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
-                      memcg_events(memcg, THP_FAULT_ALLOC));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
-                      memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+       for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++)
+               seq_buf_printf(&s, "%s %lu\n",
+                              vm_event_name(memcg_vm_event_stat[i]),
+                              memcg_events(memcg, memcg_vm_event_stat[i]));
  
         /* The above should easily fit into one page */
         WARN_ON_ONCE(seq_buf_has_overflowed(&s));
@@ -2842,27 +2841,9 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
         return 0;
  }
  
-/*
- * Returns a pointer to the memory cgroup to which the kernel object is charged.
- *
- * A passed kernel object can be a slab object or a generic kernel page, so
- * different mechanisms for getting the memory cgroup pointer should be used.
- * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
- * can not know for sure how the kernel object is implemented.
- * mem_cgroup_from_obj() can be safely used in such cases.
- *
- * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
- * cgroup_mutex, etc.
- */
-struct mem_cgroup *mem_cgroup_from_obj(void *p)
+static __always_inline
+struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
  {
-       struct folio *folio;
-
-       if (mem_cgroup_disabled())
-               return NULL;
-
-       folio = virt_to_folio(p);
-
         /*
          * Slab objects are accounted individually, not per-page.
          * Memcg membership data for each individual object is saved in
@@ -2895,6 +2876,53 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
         return page_memcg_check(folio_page(folio, 0));
  }
  
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * A passed kernel object can be a slab object, vmalloc object or a generic
+ * kernel page, so different mechanisms for getting the memory cgroup pointer
+ * should be used.
+ *
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+       struct folio *folio;
+
+       if (mem_cgroup_disabled())
+               return NULL;
+
+       if (unlikely(is_vmalloc_addr(p)))
+               folio = page_folio(vmalloc_to_page(p));
+       else
+               folio = virt_to_folio(p);
+
+       return mem_cgroup_from_obj_folio(folio, p);
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects
+ * allocated using vmalloc().
+ *
+ * A passed kernel object must be a slab object or a generic kernel page.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+       if (mem_cgroup_disabled())
+               return NULL;
+
+       return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
+}
+
  static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
  {
         struct obj_cgroup *objcg = NULL;
diff --git a/mm/memory.c b/mm/memory.c

index 7a08914..580c62f 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3020,7 +3020,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
                 balance_dirty_pages_ratelimited(mapping);
                 if (fpin) {
                         fput(fpin);
-                       return VM_FAULT_RETRY;
+                       return VM_FAULT_COMPLETED;
                 }
         }
  
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c

index 1213d0c..1f1a730 100644 (file)
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -237,8 +237,7 @@ static void release_memory_resource(struct resource *res)
         kfree(res);
  }
  
-static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
-               const char *reason)
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages)
  {
         /*
          * Disallow all operations smaller than a sub-section and only
@@ -255,12 +254,8 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
                 min_align = PAGES_PER_SUBSECTION;
         else
                 min_align = PAGES_PER_SECTION;
-       if (!IS_ALIGNED(pfn, min_align)
-                       || !IS_ALIGNED(nr_pages, min_align)) {
-               WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
-                               reason, pfn, pfn + nr_pages - 1);
+       if (!IS_ALIGNED(pfn | nr_pages, min_align))
                 return -EINVAL;
-       }
         return 0;
  }
  
@@ -337,9 +332,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
                 altmap->alloc = 0;
         }
  
-       err = check_pfn_span(pfn, nr_pages, "add");
-       if (err)
-               return err;
+       if (check_pfn_span(pfn, nr_pages)) {
+               WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
+               return -EINVAL;
+       }
  
         for (; pfn < end_pfn; pfn += cur_nr_pages) {
                 /* Select all remaining pages up to the next section boundary */
@@ -536,8 +532,10 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
  
         map_offset = vmem_altmap_offset(altmap);
  
-       if (check_pfn_span(pfn, nr_pages, "remove"))
+       if (check_pfn_span(pfn, nr_pages)) {
+               WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
                 return;
+       }
  
         for (; pfn < end_pfn; pfn += cur_nr_pages) {
                 cond_resched();
diff --git a/mm/mempool.c b/mm/mempool.c

index b933d0f..96488b1 100644 (file)
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -379,7 +379,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
         gfp_t gfp_temp;
  
         VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
-       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+       might_alloc(gfp_mask);
  
         gfp_mask |= __GFP_NOMEMALLOC;   /* don't allocate emergency reserves */
         gfp_mask |= __GFP_NORETRY;      /* don't loop in __alloc_pages */
diff --git a/mm/memremap.c b/mm/memremap.c

index b870a65..8b5c8fd 100644 (file)
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -141,10 +141,10 @@ void memunmap_pages(struct dev_pagemap *pgmap)
         for (i = 0; i < pgmap->nr_range; i++)
                 percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
         wait_for_completion(&pgmap->done);
-       percpu_ref_exit(&pgmap->ref);
  
         for (i = 0; i < pgmap->nr_range; i++)
                 pageunmap_range(pgmap, i);
+       percpu_ref_exit(&pgmap->ref);
  
         WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
         devmap_managed_enable_put(pgmap);
@@ -279,8 +279,8 @@ err_pfn_remap:
  
  
  /*
- * Not device managed version of dev_memremap_pages, undone by
- * memunmap_pages().  Please use dev_memremap_pages if you have a struct
+ * Not device managed version of devm_memremap_pages, undone by
+ * memunmap_pages().  Please use devm_memremap_pages if you have a struct
   * device available.
   */
  void *memremap_pages(struct dev_pagemap *pgmap, int nid)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index e008a3d..81fadb2 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5197,10 +5197,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
                         *alloc_flags |= ALLOC_CPUSET;
         }
  
-       fs_reclaim_acquire(gfp_mask);
-       fs_reclaim_release(gfp_mask);
-
-       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+       might_alloc(gfp_mask);
  
         if (should_fail_alloc_page(gfp_mask, order))
                 return false;
diff --git a/mm/shmem.c b/mm/shmem.c

index a6f5653..12d45a0 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1706,10 +1706,10 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
  }
  
  /*
- * Swap in the page pointed to by *pagep.
- * Caller has to make sure that *pagep contains a valid swapped page.
- * Returns 0 and the page in pagep if success. On failure, returns the
- * error code and NULL in *pagep.
+ * Swap in the folio pointed to by *foliop.
+ * Caller has to make sure that *foliop contains a valid swapped folio.
+ * Returns 0 and the folio in foliop if success. On failure, returns the
+ * error code and NULL in *foliop.
   */
  static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                              struct folio **foliop, enum sgp_type sgp,
@@ -1749,7 +1749,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
         }
         folio = page_folio(page);
  
-       /* We have to do this with page locked to prevent races */
+       /* We have to do this with folio locked to prevent races */
         folio_lock(folio);
         if (!folio_test_swapcache(folio) ||
             folio_swap_entry(folio).val != swap.val ||
diff --git a/mm/slab.c b/mm/slab.c

index f8cd00f..47151fb 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2958,12 +2958,6 @@ direct_grow:
         return ac->entry[--ac->avail];
  }
  
-static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
-                                               gfp_t flags)
-{
-       might_sleep_if(gfpflags_allow_blocking(flags));
-}
-
  #if DEBUG
  static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
                                 gfp_t flags, void *objp, unsigned long caller)
@@ -3205,7 +3199,6 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
         if (unlikely(ptr))
                 goto out_hooks;
  
-       cache_alloc_debugcheck_before(cachep, flags);
         local_irq_save(save_flags);
  
         if (nodeid == NUMA_NO_NODE)
@@ -3290,7 +3283,6 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
         if (unlikely(objp))
                 goto out;
  
-       cache_alloc_debugcheck_before(cachep, flags);
         local_irq_save(save_flags);
         objp = __do_cache_alloc(cachep, flags);
         local_irq_restore(save_flags);
@@ -3527,8 +3519,6 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
         if (!s)
                 return 0;
  
-       cache_alloc_debugcheck_before(s, flags);
-
         local_irq_disable();
         for (i = 0; i < size; i++) {
                 void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c

index f4fa61d..652f11a 100644 (file)
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -200,8 +200,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
         unsigned long next;
         pgd_t *pgd;
  
-       VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
-       VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+       VM_BUG_ON(!PAGE_ALIGNED(start));
+       VM_BUG_ON(!PAGE_ALIGNED(end));
  
         pgd = pgd_offset_k(addr);
         do {
@@ -737,7 +737,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
  
         size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
         for (addr = start; addr < end; addr += size) {
-               unsigned long next = addr, last = addr + size;
+               unsigned long next, last = addr + size;
  
                 /* Populate the head page vmemmap page */
                 pte = vmemmap_populate_address(addr, node, NULL, NULL);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c

index effd1ff..5977b17 100644 (file)
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -790,6 +790,7 @@ unsigned long vmalloc_nr_pages(void)
         return atomic_long_read(&nr_vmalloc_pages);
  }
  
+/* Look up the first VA which satisfies addr < va_end, NULL if none. */
  static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
  {
         struct vmap_area *va = NULL;
@@ -874,11 +875,9 @@ find_va_links(struct vmap_area *va,
                  * Trigger the BUG() if there are sides(left/right)
                  * or full overlaps.
                  */
-               if (va->va_start < tmp_va->va_end &&
-                               va->va_end <= tmp_va->va_start)
+               if (va->va_end <= tmp_va->va_start)
                         link = &(*link)->rb_left;
-               else if (va->va_end > tmp_va->va_start &&
-                               va->va_start >= tmp_va->va_end)
+               else if (va->va_start >= tmp_va->va_end)
                         link = &(*link)->rb_right;
                 else {
                         WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
@@ -931,7 +930,7 @@ link_va(struct vmap_area *va, struct rb_root *root,
                  * Some explanation here. Just perform simple insertion
                  * to the tree. We do not set va->subtree_max_size to
                  * its current size before calling rb_insert_augmented().
-                * It is because of we populate the tree from the bottom
+                * It is because we populate the tree from the bottom
                  * to parent levels when the node _is_ in the tree.
                  *
                  * Therefore we set subtree_max_size to zero after insertion,
@@ -1335,10 +1334,10 @@ classify_va_fit_type(struct vmap_area *va,
  
  static __always_inline int
  adjust_va_to_fit_type(struct vmap_area *va,
-       unsigned long nva_start_addr, unsigned long size,
-       enum fit_type type)
+       unsigned long nva_start_addr, unsigned long size)
  {
         struct vmap_area *lva = NULL;
+       enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
  
         if (type == FL_FIT_TYPE) {
                 /*
@@ -1444,7 +1443,6 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
         bool adjust_search_size = true;
         unsigned long nva_start_addr;
         struct vmap_area *va;
-       enum fit_type type;
         int ret;
  
         /*
@@ -1472,14 +1470,9 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
         if (nva_start_addr + size > vend)
                 return vend;
  
-       /* Classify what we have found. */
-       type = classify_va_fit_type(va, nva_start_addr, size);
-       if (WARN_ON_ONCE(type == NOTHING_FIT))
-               return vend;
-
         /* Update the free vmap_area. */
-       ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
-       if (ret)
+       ret = adjust_va_to_fit_type(va, nva_start_addr, size);
+       if (WARN_ON_ONCE(ret))
                 return vend;
  
  #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
@@ -1663,7 +1656,7 @@ static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
  
  /*
   * Serialize vmap purging.  There is no actual critical section protected
- * by this look, but we want to avoid concurrent calls for performance
+ * by this lock, but we want to avoid concurrent calls for performance
   * reasons and to make the pcpu_get_vm_areas more deterministic.
   */
  static DEFINE_MUTEX(vmap_purge_lock);
@@ -1677,32 +1670,32 @@ static void purge_fragmented_blocks_allcpus(void);
  static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
  {
         unsigned long resched_threshold;
-       struct list_head local_pure_list;
+       struct list_head local_purge_list;
         struct vmap_area *va, *n_va;
  
         lockdep_assert_held(&vmap_purge_lock);
  
         spin_lock(&purge_vmap_area_lock);
         purge_vmap_area_root = RB_ROOT;
-       list_replace_init(&purge_vmap_area_list, &local_pure_list);
+       list_replace_init(&purge_vmap_area_list, &local_purge_list);
         spin_unlock(&purge_vmap_area_lock);
  
-       if (unlikely(list_empty(&local_pure_list)))
+       if (unlikely(list_empty(&local_purge_list)))
                 return false;
  
         start = min(start,
-               list_first_entry(&local_pure_list,
+               list_first_entry(&local_purge_list,
                         struct vmap_area, list)->va_start);
  
         end = max(end,
-               list_last_entry(&local_pure_list,
+               list_last_entry(&local_purge_list,
                         struct vmap_area, list)->va_end);
  
         flush_tlb_kernel_range(start, end);
         resched_threshold = lazy_max_pages() << 1;
  
         spin_lock(&free_vmap_area_lock);
-       list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
+       list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
                 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
                 unsigned long orig_start = va->va_start;
                 unsigned long orig_end = va->va_end;
@@ -3735,7 +3728,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
         int area, area2, last_area, term_area;
         unsigned long base, start, size, end, last_end, orig_start, orig_end;
         bool purged = false;
-       enum fit_type type;
  
         /* verify parameters and allocate data structures */
         BUG_ON(offset_in_page(align) || !is_power_of_2(align));
@@ -3846,15 +3838,11 @@ retry:
                         /* It is a BUG(), but trigger recovery instead. */
                         goto recovery;
  
-               type = classify_va_fit_type(va, start, size);
-               if (WARN_ON_ONCE(type == NOTHING_FIT))
+               ret = adjust_va_to_fit_type(va, start, size);
+               if (WARN_ON_ONCE(unlikely(ret)))
                         /* It is a BUG(), but trigger recovery instead. */
                         goto recovery;
  
-               ret = adjust_va_to_fit_type(va, start, size, type);
-               if (unlikely(ret))
-                       goto recovery;
-
                 /* Allocated area. */
                 va = vas[area];
                 va->va_start = start;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c

index 0ec2f59..6b9f191 100644 (file)
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -18,6 +18,7 @@
  #include <linux/user_namespace.h>
  #include <linux/net_namespace.h>
  #include <linux/sched/task.h>
+#include <linux/sched/mm.h>
  #include <linux/uidgid.h>
  #include <linux/cookie.h>
  
@@ -1143,7 +1144,13 @@ static int __register_pernet_operations(struct list_head *list,
                  * setup_net() and cleanup_net() are not possible.
                  */
                 for_each_net(net) {
+                       struct mem_cgroup *old, *memcg;
+
+                       memcg = mem_cgroup_or_root(get_mem_cgroup_from_obj(net));
+                       old = set_active_memcg(memcg);
                         error = ops_init(ops, net);
+                       set_active_memcg(old);
+                       mem_cgroup_put(memcg);
                         if (error)
                                 goto out_undo;
                         list_add_tail(&net->exit_list, &net_exit_list);
diff --git a/tools/testing/memblock/linux/kmemleak.h b/tools/testing/memblock/linux/kmemleak.h

index 462f8c5..5fed13b 100644 (file)
--- a/tools/testing/memblock/linux/kmemleak.h
+++ b/tools/testing/memblock/linux/kmemleak.h
@@ -7,7 +7,7 @@ static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size)
  }
  
  static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
-                                      int min_count, gfp_t gfp)
+                                      gfp_t gfp)
  {
  }
  
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c

index 0bdfc19..4bc2458 100644 (file)
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -860,7 +860,7 @@ static int stress(struct uffd_stats *uffd_stats)
         /*
          * Be strict and immediately zap area_src, the whole area has
          * been transferred already by the background treads. The
-        * area_src could then be faulted in in a racy way by still
+        * area_src could then be faulted in a racy way by still
          * running uffdio_threads reading zeropages after we zapped
          * area_src (but they're guaranteed to get -EEXIST from
          * UFFDIO_COPY without writing zero pages into area_dst
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c

index 9b68658..3ae985d 100644 (file)
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -125,7 +125,7 @@ static void usage(void)
                 "-n|--numa              Show NUMA information\n"
                 "-N|--lines=K           Show the first K slabs\n"
                 "-o|--ops               Show kmem_cache_ops\n"
-               "-P|--partial           Sort by number of partial slabs\n"
+               "-P|--partial           Sort by number of partial slabs\n"
                 "-r|--report            Detailed report on single slabs\n"
                 "-s|--shrink            Shrink slabs\n"
                 "-S|--Size              Sort by size\n"
@@ -1045,15 +1045,27 @@ static void sort_slabs(void)
                 for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) {
                         int result;
  
-                       if (sort_size)
-                               result = slab_size(s1) < slab_size(s2);
-                       else if (sort_active)
-                               result = slab_activity(s1) < slab_activity(s2);
-                       else if (sort_loss)
-                               result = slab_waste(s1) < slab_waste(s2);
-                       else if (sort_partial)
-                               result = s1->partial < s2->partial;
-                       else
+                       if (sort_size) {
+                               if (slab_size(s1) == slab_size(s2))
+                                       result = strcasecmp(s1->name, s2->name);
+                               else
+                                       result = slab_size(s1) < slab_size(s2);
+                       } else if (sort_active) {
+                               if (slab_activity(s1) == slab_activity(s2))
+                                       result = strcasecmp(s1->name, s2->name);
+                               else
+                                       result = slab_activity(s1) < slab_activity(s2);
+                       } else if (sort_loss) {
+                               if (slab_waste(s1) == slab_waste(s2))
+                                       result = strcasecmp(s1->name, s2->name);
+                               else
+                                       result = slab_waste(s1) < slab_waste(s2);
+                       } else if (sort_partial) {
+                               if (s1->partial == s2->partial)
+                                       result = strcasecmp(s1->name, s2->name);
+                               else
+                                       result = s1->partial < s2->partial;
+                       } else
                                 result = strcasecmp(s1->name, s2->name);
  
                         if (show_inverted)
author	akpm <akpm@linux-foundation.org>
	Mon, 27 Jun 2022 17:31:34 +0000 (10:31 -0700)
committer	akpm <akpm@linux-foundation.org>
	Mon, 27 Jun 2022 17:31:34 +0000 (10:31 -0700)
Documentation/admin-guide/cgroup-v2.rst		patch \| blob \| history
Documentation/dev-tools/kmemleak.rst		patch \| blob \| history
arch/alpha/mm/fault.c		patch \| blob \| history
arch/arc/mm/fault.c		patch \| blob \| history
arch/arm/mm/fault.c		patch \| blob \| history
arch/arm64/mm/fault.c		patch \| blob \| history
arch/csky/mm/fault.c		patch \| blob \| history
arch/hexagon/mm/vm_fault.c		patch \| blob \| history
arch/ia64/mm/fault.c		patch \| blob \| history
arch/m68k/mm/fault.c		patch \| blob \| history
arch/microblaze/mm/fault.c		patch \| blob \| history
arch/mips/mm/fault.c		patch \| blob \| history
arch/nios2/mm/fault.c		patch \| blob \| history
arch/openrisc/mm/fault.c		patch \| blob \| history
arch/parisc/mm/fault.c		patch \| blob \| history
arch/powerpc/mm/copro_fault.c		patch \| blob \| history
arch/powerpc/mm/fault.c		patch \| blob \| history
arch/riscv/mm/fault.c		patch \| blob \| history
arch/s390/mm/fault.c		patch \| blob \| history
arch/sh/mm/fault.c		patch \| blob \| history
arch/sparc/mm/fault_32.c		patch \| blob \| history
arch/sparc/mm/fault_64.c		patch \| blob \| history
arch/um/kernel/trap.c		patch \| blob \| history
arch/x86/mm/fault.c		patch \| blob \| history
arch/x86/mm/hugetlbpage.c		patch \| blob \| history
arch/xtensa/mm/fault.c		patch \| blob \| history
drivers/of/fdt.c		patch \| blob \| history
include/linux/damon.h		patch \| blob \| history
include/linux/highmem.h		patch \| blob \| history
include/linux/kmemleak.h		patch \| blob \| history
include/linux/memcontrol.h		patch \| blob \| history
include/linux/mm_types.h		patch \| blob \| history
lib/test_hmm.c		patch \| blob \| history
mm/gup.c		patch \| blob \| history
mm/kmemleak.c		patch \| blob \| history
mm/list_lru.c		patch \| blob \| history
mm/memblock.c		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history
mm/memory_hotplug.c		patch \| blob \| history
mm/mempool.c		patch \| blob \| history
mm/memremap.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/shmem.c		patch \| blob \| history
mm/slab.c		patch \| blob \| history
mm/sparse-vmemmap.c		patch \| blob \| history
mm/vmalloc.c		patch \| blob \| history
net/core/net_namespace.c		patch \| blob \| history
tools/testing/memblock/linux/kmemleak.h		patch \| blob \| history
tools/testing/selftests/vm/userfaultfd.c		patch \| blob \| history
tools/vm/slabinfo.c		patch \| blob \| history