Merge branch 'akpm' (patches from Andrew)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Mar 2022 17:21:20 +0000 (10:21 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Mar 2022 17:21:20 +0000 (10:21 -0700)
Merge yet more updates from Andrew Morton:
 "This is the material which was staged after willystuff in linux-next.

  Subsystems affected by this patch series: mm (debug, selftests,
  pagecache, thp, rmap, migration, kasan, hugetlb, pagemap, madvise),
  and selftests"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (113 commits)
  selftests: kselftest framework: provide "finished" helper
  mm: madvise: MADV_DONTNEED_LOCKED
  mm: fix race between MADV_FREE reclaim and blkdev direct IO read
  mm: generalize ARCH_HAS_FILTER_PGPROT
  mm: unmap_mapping_range_tree() with i_mmap_rwsem shared
  mm: warn on deleting redirtied only if accounted
  mm/huge_memory: remove stale locking logic from __split_huge_pmd()
  mm/huge_memory: remove stale page_trans_huge_mapcount()
  mm/swapfile: remove stale reuse_swap_page()
  mm/khugepaged: remove reuse_swap_page() usage
  mm/huge_memory: streamline COW logic in do_huge_pmd_wp_page()
  mm: streamline COW logic in do_swap_page()
  mm: slightly clarify KSM logic in do_swap_page()
  mm: optimize do_wp_page() for fresh pages in local LRU pagevecs
  mm: optimize do_wp_page() for exclusive pages in the swapcache
  mm/huge_memory: make is_transparent_hugepage() static
  userfaultfd/selftests: enable hugetlb remap and remove event testing
  selftests/vm: add hugetlb madvise MADV_DONTNEED MADV_REMOVE test
  mm: enable MADV_DONTNEED for hugetlb mappings
  kasan: disable LOCKDEP when printing reports
  ...

73 files changed:
Documentation/dev-tools/kasan.rst
Documentation/vm/page_owner.rst
arch/alpha/include/uapi/asm/mman.h
arch/arm64/Kconfig
arch/arm64/include/asm/vmalloc.h
arch/arm64/include/asm/vmap_stack.h
arch/arm64/kernel/module.c
arch/arm64/mm/pageattr.c
arch/arm64/net/bpf_jit_comp.c
arch/mips/include/uapi/asm/mman.h
arch/parisc/include/uapi/asm/mman.h
arch/powerpc/mm/book3s64/trace.c
arch/s390/kernel/module.c
arch/x86/Kconfig
arch/x86/kernel/module.c
arch/x86/mm/init.c
arch/xtensa/include/uapi/asm/mman.h
include/linux/gfp.h
include/linux/huge_mm.h
include/linux/kasan.h
include/linux/mm.h
include/linux/page-flags.h
include/linux/pagemap.h
include/linux/swap.h
include/linux/vmalloc.h
include/trace/events/huge_memory.h
include/trace/events/migrate.h
include/trace/events/mmflags.h
include/trace/events/thp.h
include/uapi/asm-generic/mman-common.h
kernel/fork.c
kernel/scs.c
lib/Kconfig.kasan
lib/test_kasan.c
lib/vsprintf.c
mm/Kconfig
mm/debug.c
mm/filemap.c
mm/huge_memory.c
mm/kasan/Makefile
mm/kasan/common.c
mm/kasan/hw_tags.c
mm/kasan/kasan.h
mm/kasan/report.c
mm/kasan/report_generic.c
mm/kasan/report_hw_tags.c
mm/kasan/report_sw_tags.c
mm/kasan/report_tags.c
mm/kasan/shadow.c
mm/khugepaged.c
mm/madvise.c
mm/memory.c
mm/memremap.c
mm/migrate.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_owner.c
mm/rmap.c
mm/swap.c
mm/swapfile.c
mm/vmalloc.c
tools/testing/selftests/kselftest.h
tools/testing/selftests/vm/.gitignore
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/gup_test.c
tools/testing/selftests/vm/hugetlb-madvise.c [new file with mode: 0644]
tools/testing/selftests/vm/ksm_tests.c
tools/testing/selftests/vm/memfd_secret.c
tools/testing/selftests/vm/run_vmtests.sh
tools/testing/selftests/vm/transhuge-stress.c
tools/testing/selftests/vm/userfaultfd.c
tools/testing/selftests/vm/util.h [new file with mode: 0644]
tools/vm/page_owner_sort.c

index 8089c55..7614a1f 100644 (file)
@@ -30,7 +30,7 @@ Software tag-based KASAN mode is only supported in Clang.
 
 The hardware KASAN mode (#3) relies on hardware to perform the checks but
 still requires a compiler version that supports memory tagging instructions.
-This mode is supported in GCC 10+ and Clang 11+.
+This mode is supported in GCC 10+ and Clang 12+.
 
 Both software KASAN modes work with SLUB and SLAB memory allocators,
 while the hardware tag-based KASAN currently only supports SLUB.
@@ -206,6 +206,9 @@ additional boot parameters that allow disabling KASAN or controlling features:
   Asymmetric mode: a bad access is detected synchronously on reads and
   asynchronously on writes.
 
+- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
+  allocations (default: ``on``).
+
 - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
   traces collection (default: ``on``).
 
@@ -279,8 +282,8 @@ Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
 pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
 reserved to tag freed memory regions.
 
-Software tag-based KASAN currently only supports tagging of slab and page_alloc
-memory.
+Software tag-based KASAN currently only supports tagging of slab, page_alloc,
+and vmalloc memory.
 
 Hardware tag-based KASAN
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -303,8 +306,8 @@ Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
 pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
 reserved to tag freed memory regions.
 
-Hardware tag-based KASAN currently only supports tagging of slab and page_alloc
-memory.
+Hardware tag-based KASAN currently only supports tagging of slab, page_alloc,
+and VM_ALLOC-based vmalloc memory.
 
 If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN
 will not be enabled. In this case, all KASAN boot parameters are ignored.
@@ -319,6 +322,8 @@ checking gets disabled.
 Shadow memory
 -------------
 
+The contents of this section are only applicable to software KASAN modes.
+
 The kernel maps memory in several different parts of the address space.
 The range of kernel virtual addresses is large: there is not enough real
 memory to support a real shadow region for every address that could be
@@ -349,7 +354,7 @@ CONFIG_KASAN_VMALLOC
 
 With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the
 cost of greater memory usage. Currently, this is supported on x86,
-riscv, s390, and powerpc.
+arm64, riscv, s390, and powerpc.
 
 This works by hooking into vmalloc and vmap and dynamically
 allocating real shadow memory to back the mappings.
index 905555e..c4de6f8 100644 (file)
@@ -78,7 +78,7 @@ Usage
 
 2) Enable page owner: add "page_owner=on" to boot cmdline.
 
-3) Do the job what you want to debug
+3) Do the job that you want to debug.
 
 4) Analyze information from page owner::
 
@@ -89,22 +89,75 @@ Usage
 
        Page allocated via order XXX, ...
        PFN XXX ...
-        // Detailed stack
+       // Detailed stack
 
        Page allocated via order XXX, ...
        PFN XXX ...
-        // Detailed stack
+       // Detailed stack
 
    The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
    in buf, uses regexp to extract the page order value, counts the times
-   and pages of buf, and finally sorts them according to the times.
+   and pages of buf, and finally sorts them according to the parameter(s).
 
    See the result about who allocated each page
    in the ``sorted_page_owner.txt``. General output::
 
        XXX times, XXX pages:
        Page allocated via order XXX, ...
-        // Detailed stack
+       // Detailed stack
 
    By default, ``page_owner_sort`` is sorted according to the times of buf.
-   If you want to sort by the pages nums of buf, use the ``-m`` parameter.
+   If you want to sort by the page nums of buf, use the ``-m`` parameter.
+   The detailed parameters are:
+
+   fundamental function:
+
+       Sort:
+               -a              Sort by memory allocation time.
+               -m              Sort by total memory.
+               -p              Sort by pid.
+               -P              Sort by tgid.
+               -n              Sort by task command name.
+               -r              Sort by memory release time.
+               -s              Sort by stack trace.
+               -t              Sort by times (default).
+
+   additional function:
+
+       Cull:
+               -c              Cull by comparing stacktrace instead of total block.
+               --cull <rules>
+                               Specify culling rules. Culling syntax is key[,key[,...]]. Choose a
+                               multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.
+
+
+               <rules> is a single argument in the form of a comma-separated list,
+               which offers a way to specify individual culling rules.  The recognized
+               keywords are described in the **STANDARD FORMAT SPECIFIERS** section below.
+               <rules> can be specified by the sequence of keys k1,k2, ..., as described in
+               the STANDARD SORT KEYS section below. Mixed use of abbreviated and
+               complete-form of keys is allowed.
+
+
+               Examples:
+                               ./page_owner_sort <input> <output> --cull=stacktrace
+                               ./page_owner_sort <input> <output> --cull=st,pid,name
+                               ./page_owner_sort <input> <output> --cull=n,f
+
+       Filter:
+               -f              Filter out the information of blocks whose memory has been released.
+
+       Select:
+               --pid <PID>             Select by pid.
+               --tgid <TGID>           Select by tgid.
+               --name <command>        Select by task command name.
+
+STANDARD FORMAT SPECIFIERS
+==========================
+
+       KEY             LONG            DESCRIPTION
+       p               pid             process ID
+       tg              tgid            thread group ID
+       n               name            task command name
+       f               free            whether the page has been released or not
+       st              stacktrace      stack trace of the page allocation
index 56b4ee5..4aa9964 100644 (file)
@@ -74,6 +74,8 @@
 #define MADV_POPULATE_READ     22      /* populate (prefault) page tables readable */
 #define MADV_POPULATE_WRITE    23      /* populate (prefault) page tables writable */
 
+#define MADV_DONTNEED_LOCKED   24      /* like DONTNEED, but drop locked pages too */
+
 /* compatibility flags */
 #define MAP_FILE       0
 
index f868089..23048be 100644 (file)
@@ -208,7 +208,7 @@ config ARM64
        select IOMMU_DMA if IOMMU_SUPPORT
        select IRQ_DOMAIN
        select IRQ_FORCED_THREADING
-       select KASAN_VMALLOC if KASAN_GENERIC
+       select KASAN_VMALLOC if KASAN
        select MODULES_USE_ELF_RELA
        select NEED_DMA_MAP_STATE
        select NEED_SG_DMA_LENGTH
index b918550..38fafff 100644 (file)
@@ -25,4 +25,10 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot)
 
 #endif
 
+#define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged
+static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
+{
+       return pgprot_tagged(prot);
+}
+
 #endif /* _ASM_ARM64_VMALLOC_H */
index 894e031..2087309 100644 (file)
  */
 static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node)
 {
+       void *p;
+
        BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK));
 
-       return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node,
+       p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node,
                        __builtin_return_address(0));
+       return kasan_reset_tag(p);
 }
 
 #endif /* __ASM_VMAP_STACK_H */
index 309a275..f2d4bb1 100644 (file)
@@ -58,12 +58,13 @@ void *module_alloc(unsigned long size)
                                PAGE_KERNEL, 0, NUMA_NO_NODE,
                                __builtin_return_address(0));
 
-       if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
+       if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
                vfree(p);
                return NULL;
        }
 
-       return p;
+       /* Memory is intended to be executable, reset the pointer tag. */
+       return kasan_reset_tag(p);
 }
 
 enum aarch64_reloc_op {
index a3bacd7..64e985e 100644 (file)
@@ -85,7 +85,7 @@ static int change_memory_common(unsigned long addr, int numpages,
         */
        area = find_vm_area((void *)addr);
        if (!area ||
-           end > (unsigned long)area->addr + area->size ||
+           end > (unsigned long)kasan_reset_tag(area->addr) + area->size ||
            !(area->flags & VM_ALLOC))
                return -EINVAL;
 
index e850c69..fcc675a 100644 (file)
@@ -1304,7 +1304,8 @@ u64 bpf_jit_alloc_exec_limit(void)
 
 void *bpf_jit_alloc_exec(unsigned long size)
 {
-       return vmalloc(size);
+       /* Memory is intended to be executable, reset the pointer tag. */
+       return kasan_reset_tag(vmalloc(size));
 }
 
 void bpf_jit_free_exec(void *addr)
index 40b210c..1be4286 100644 (file)
 #define MADV_POPULATE_READ     22      /* populate (prefault) page tables readable */
 #define MADV_POPULATE_WRITE    23      /* populate (prefault) page tables writable */
 
+#define MADV_DONTNEED_LOCKED   24      /* like DONTNEED, but drop locked pages too */
+
 /* compatibility flags */
 #define MAP_FILE       0
 
index 9e3c010..a7ea320 100644 (file)
@@ -55,6 +55,8 @@
 #define MADV_POPULATE_READ     22      /* populate (prefault) page tables readable */
 #define MADV_POPULATE_WRITE    23      /* populate (prefault) page tables writable */
 
+#define MADV_DONTNEED_LOCKED   24      /* like DONTNEED, but drop locked pages too */
+
 #define MADV_MERGEABLE   65            /* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 66            /* KSM may not merge identical pages */
 
index b86e7b9..ccd64b5 100644 (file)
@@ -3,6 +3,5 @@
  * This file is for defining trace points and trace related helpers.
  */
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define CREATE_TRACE_POINTS
 #include <trace/events/thp.h>
 #endif
index c0dd72d..26125a9 100644 (file)
@@ -45,7 +45,7 @@ void *module_alloc(unsigned long size)
        p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
                                 gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
                                 __builtin_return_address(0));
-       if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
+       if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
                vfree(p);
                return NULL;
        }
index b037fe7..10e4c33 100644 (file)
@@ -337,9 +337,6 @@ config GENERIC_CALIBRATE_DELAY
 config ARCH_HAS_CPU_RELAX
        def_bool y
 
-config ARCH_HAS_FILTER_PGPROT
-       def_bool y
-
 config ARCH_HIBERNATION_POSSIBLE
        def_bool y
 
index 96d7c27..504ea65 100644 (file)
@@ -78,7 +78,7 @@ void *module_alloc(unsigned long size)
                                    MODULES_END, gfp_mask,
                                    PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
                                    __builtin_return_address(0));
-       if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
+       if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
                vfree(p);
                return NULL;
        }
index 4ba024d..d8cfce2 100644 (file)
@@ -31,7 +31,6 @@
  * We need to define the tracepoints somewhere, and tlb.c
  * is only compiled when SMP=y.
  */
-#define CREATE_TRACE_POINTS
 #include <trace/events/tlb.h>
 
 #include "mm_internal.h"
index b3a2209..7966a58 100644 (file)
 #define MADV_POPULATE_READ     22      /* populate (prefault) page tables readable */
 #define MADV_POPULATE_WRITE    23      /* populate (prefault) page tables writable */
 
+#define MADV_DONTNEED_LOCKED   24      /* like DONTNEED, but drop locked pages too */
+
 /* compatibility flags */
 #define MAP_FILE       0
 
index 20f6fbe..0fa17fb 100644 (file)
@@ -54,9 +54,17 @@ struct vm_area_struct;
 #define ___GFP_THISNODE                0x200000u
 #define ___GFP_ACCOUNT         0x400000u
 #define ___GFP_ZEROTAGS                0x800000u
-#define ___GFP_SKIP_KASAN_POISON       0x1000000u
+#ifdef CONFIG_KASAN_HW_TAGS
+#define ___GFP_SKIP_ZERO               0x1000000u
+#define ___GFP_SKIP_KASAN_UNPOISON     0x2000000u
+#define ___GFP_SKIP_KASAN_POISON       0x4000000u
+#else
+#define ___GFP_SKIP_ZERO               0
+#define ___GFP_SKIP_KASAN_UNPOISON     0
+#define ___GFP_SKIP_KASAN_POISON       0
+#endif
 #ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP       0x2000000u
+#define ___GFP_NOLOCKDEP       0x8000000u
 #else
 #define ___GFP_NOLOCKDEP       0
 #endif
@@ -232,24 +240,33 @@ struct vm_area_struct;
  *
  * %__GFP_ZERO returns a zeroed page on success.
  *
- * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if
- * __GFP_ZERO is set.
+ * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself
+ * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
+ * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
+ * memory tags at the same time as zeroing memory has minimal additional
+ * performance impact.
+ *
+ * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation.
+ * Only effective in HW_TAGS mode.
  *
- * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned
- * on deallocation. Typically used for userspace pages. Currently only has an
- * effect in HW tags mode.
+ * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation.
+ * Typically, used for userspace pages. Only effective in HW_TAGS mode.
  */
 #define __GFP_NOWARN   ((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP     ((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO     ((__force gfp_t)___GFP_ZERO)
 #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS)
-#define __GFP_SKIP_KASAN_POISON        ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
+#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
+#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
+#define __GFP_SKIP_KASAN_POISON   ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
 
 /* Disable lockdep for GFP context tracking */
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (24 +                                         \
+                         3 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) +        \
+                         IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /**
index 0734aff..2999190 100644 (file)
@@ -183,7 +183,6 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 
 void prep_transhuge_page(struct page *page);
 void free_transhuge_page(struct page *page);
-bool is_transparent_hugepage(struct page *page);
 
 bool can_split_folio(struct folio *folio, int *pextra_pins);
 int split_huge_page_to_list(struct page *page, struct list_head *list);
@@ -341,11 +340,6 @@ static inline bool transhuge_vma_enabled(struct vm_area_struct *vma,
 
 static inline void prep_transhuge_page(struct page *page) {}
 
-static inline bool is_transparent_hugepage(struct page *page)
-{
-       return false;
-}
-
 #define transparent_hugepage_flags 0UL
 
 #define thp_get_unmapped_area  NULL
index b6a9326..ceebcb9 100644 (file)
@@ -19,13 +19,15 @@ struct task_struct;
 #include <linux/linkage.h>
 #include <asm/kasan.h>
 
-/* kasan_data struct is used in KUnit tests for KASAN expected failures */
-struct kunit_kasan_expectation {
-       bool report_found;
-};
-
 #endif
 
+typedef unsigned int __bitwise kasan_vmalloc_flags_t;
+
+#define KASAN_VMALLOC_NONE             0x00u
+#define KASAN_VMALLOC_INIT             0x01u
+#define KASAN_VMALLOC_VM_ALLOC         0x02u
+#define KASAN_VMALLOC_PROT_NORMAL      0x04u
+
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 
 #include <linux/pgtable.h>
@@ -84,25 +86,8 @@ static inline void kasan_disable_current(void) {}
 
 #ifdef CONFIG_KASAN_HW_TAGS
 
-void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags);
-void kasan_free_pages(struct page *page, unsigned int order);
-
 #else /* CONFIG_KASAN_HW_TAGS */
 
-static __always_inline void kasan_alloc_pages(struct page *page,
-                                             unsigned int order, gfp_t flags)
-{
-       /* Only available for integrated init. */
-       BUILD_BUG();
-}
-
-static __always_inline void kasan_free_pages(struct page *page,
-                                            unsigned int order)
-{
-       /* Only available for integrated init. */
-       BUILD_BUG();
-}
-
 #endif /* CONFIG_KASAN_HW_TAGS */
 
 static inline bool kasan_has_integrated_init(void)
@@ -282,10 +267,6 @@ static __always_inline bool kasan_check_byte(const void *addr)
        return true;
 }
 
-
-bool kasan_save_enable_multi_shot(void);
-void kasan_restore_multi_shot(bool enabled);
-
 #else /* CONFIG_KASAN */
 
 static inline slab_flags_t kasan_never_merge(void)
@@ -414,34 +395,71 @@ static inline void kasan_init_hw_tags(void) { }
 
 #ifdef CONFIG_KASAN_VMALLOC
 
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+
+void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
 int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
-void kasan_poison_vmalloc(const void *start, unsigned long size);
-void kasan_unpoison_vmalloc(const void *start, unsigned long size);
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
                           unsigned long free_region_start,
                           unsigned long free_region_end);
 
-void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
+#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+
+static inline void kasan_populate_early_vm_area_shadow(void *start,
+                                                      unsigned long size)
+{ }
+static inline int kasan_populate_vmalloc(unsigned long start,
+                                       unsigned long size)
+{
+       return 0;
+}
+static inline void kasan_release_vmalloc(unsigned long start,
+                                        unsigned long end,
+                                        unsigned long free_region_start,
+                                        unsigned long free_region_end) { }
+
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+                              kasan_vmalloc_flags_t flags);
+static __always_inline void *kasan_unpoison_vmalloc(const void *start,
+                                               unsigned long size,
+                                               kasan_vmalloc_flags_t flags)
+{
+       if (kasan_enabled())
+               return __kasan_unpoison_vmalloc(start, size, flags);
+       return (void *)start;
+}
+
+void __kasan_poison_vmalloc(const void *start, unsigned long size);
+static __always_inline void kasan_poison_vmalloc(const void *start,
+                                                unsigned long size)
+{
+       if (kasan_enabled())
+               __kasan_poison_vmalloc(start, size);
+}
 
 #else /* CONFIG_KASAN_VMALLOC */
 
+static inline void kasan_populate_early_vm_area_shadow(void *start,
+                                                      unsigned long size) { }
 static inline int kasan_populate_vmalloc(unsigned long start,
                                        unsigned long size)
 {
        return 0;
 }
-
-static inline void kasan_poison_vmalloc(const void *start, unsigned long size)
-{ }
-static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size)
-{ }
 static inline void kasan_release_vmalloc(unsigned long start,
                                         unsigned long end,
                                         unsigned long free_region_start,
-                                        unsigned long free_region_end) {}
+                                        unsigned long free_region_end) { }
 
-static inline void kasan_populate_early_vm_area_shadow(void *start,
-                                                      unsigned long size)
+static inline void *kasan_unpoison_vmalloc(const void *start,
+                                          unsigned long size,
+                                          kasan_vmalloc_flags_t flags)
+{
+       return (void *)start;
+}
+static inline void kasan_poison_vmalloc(const void *start, unsigned long size)
 { }
 
 #endif /* CONFIG_KASAN_VMALLOC */
@@ -450,17 +468,17 @@ static inline void kasan_populate_early_vm_area_shadow(void *start,
                !defined(CONFIG_KASAN_VMALLOC)
 
 /*
- * These functions provide a special case to support backing module
- * allocations with real shadow memory. With KASAN vmalloc, the special
- * case is unnecessary, as the work is handled in the generic case.
+ * These functions allocate and free shadow memory for kernel modules.
+ * They are only required when KASAN_VMALLOC is not supported, as otherwise
+ * shadow memory is allocated by the generic vmalloc handlers.
  */
-int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask);
-void kasan_free_shadow(const struct vm_struct *vm);
+int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask);
+void kasan_free_module_shadow(const struct vm_struct *vm);
 
 #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
 
-static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; }
-static inline void kasan_free_shadow(const struct vm_struct *vm) {}
+static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; }
+static inline void kasan_free_module_shadow(const struct vm_struct *vm) {}
 
 #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */
 
index 7a3dd7e..e34edb7 100644 (file)
@@ -834,16 +834,11 @@ static inline int total_mapcount(struct page *page)
        return folio_mapcount(page_folio(page));
 }
 
-int page_trans_huge_mapcount(struct page *page);
 #else
 static inline int total_mapcount(struct page *page)
 {
        return page_mapcount(page);
 }
-static inline int page_trans_huge_mapcount(struct page *page)
-{
-       return page_mapcount(page);
-}
 #endif
 
 static inline struct page *virt_to_head_page(const void *x)
index 88fe1d7..9d8eeaa 100644 (file)
@@ -481,7 +481,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
        TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname)
 
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
-PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
+PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
 PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
 PAGEFLAG(Referenced, referenced, PF_HEAD)
        TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
index 58f395f..a8d0b32 100644 (file)
@@ -1009,8 +1009,7 @@ static inline void __set_page_dirty(struct page *page,
 {
        __folio_mark_dirty(page_folio(page), mapping, warn);
 }
-void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
-                         struct bdi_writeback *wb);
+void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb);
 void __folio_cancel_dirty(struct folio *folio);
 static inline void folio_cancel_dirty(struct folio *folio)
 {
index f37837c..27093b4 100644 (file)
@@ -515,7 +515,6 @@ extern int __swp_swapcount(swp_entry_t entry);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
 extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
-extern bool reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
@@ -681,9 +680,6 @@ static inline int swp_swapcount(swp_entry_t entry)
        return 0;
 }
 
-#define reuse_swap_page(page) \
-       (page_trans_huge_mapcount(page) == 1)
-
 static inline int try_to_free_swap(struct page *page)
 {
        return 0;
index 5a0c3b5..3b1df7d 100644 (file)
@@ -35,17 +35,6 @@ struct notifier_block;               /* in notifier.h */
 #define VM_DEFER_KMEMLEAK      0
 #endif
 
-/*
- * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC.
- *
- * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after
- * shadow memory has been mapped. It's used to handle allocation errors so that
- * we don't try to poison shadow on free if it was never allocated.
- *
- * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to
- * determine which allocations need the module shadow freed.
- */
-
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -126,6 +115,13 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size)
 }
 #endif
 
+#ifndef arch_vmap_pgprot_tagged
+static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
+{
+       return prot;
+}
+#endif
+
 /*
  *     Highlevel APIs for driver use
  */
index 4fdb14a..d651f34 100644 (file)
@@ -29,7 +29,6 @@
        EM( SCAN_VMA_NULL,              "vma_null")                     \
        EM( SCAN_VMA_CHECK,             "vma_check_failed")             \
        EM( SCAN_ADDRESS_RANGE,         "not_suitable_address_range")   \
-       EM( SCAN_SWAP_CACHE_PAGE,       "page_swap_cache")              \
        EM( SCAN_DEL_PAGE_LRU,          "could_not_delete_page_from_lru")\
        EM( SCAN_ALLOC_HUGE_PAGE_FAIL,  "alloc_huge_page_failed")       \
        EM( SCAN_CGROUP_CHARGE_FAIL,    "ccgroup_charge_failed")        \
index 779f3fa..061b512 100644 (file)
@@ -105,6 +105,37 @@ TRACE_EVENT(mm_migrate_pages_start,
                  __print_symbolic(__entry->reason, MIGRATE_REASON))
 );
 
+DECLARE_EVENT_CLASS(migration_pte,
+
+               TP_PROTO(unsigned long addr, unsigned long pte, int order),
+
+               TP_ARGS(addr, pte, order),
+
+               TP_STRUCT__entry(
+                       __field(unsigned long, addr)
+                       __field(unsigned long, pte)
+                       __field(int, order)
+               ),
+
+               TP_fast_assign(
+                       __entry->addr = addr;
+                       __entry->pte = pte;
+                       __entry->order = order;
+               ),
+
+               TP_printk("addr=%lx, pte=%lx order=%d", __entry->addr, __entry->pte, __entry->order)
+);
+
+DEFINE_EVENT(migration_pte, set_migration_pte,
+       TP_PROTO(unsigned long addr, unsigned long pte, int order),
+       TP_ARGS(addr, pte, order)
+);
+
+DEFINE_EVENT(migration_pte, remove_migration_pte,
+       TP_PROTO(unsigned long addr, unsigned long pte, int order),
+       TP_ARGS(addr, pte, order)
+);
+
 #endif /* _TRACE_MIGRATE_H */
 
 /* This part must be outside protection */
index 116ed4d..6532119 100644 (file)
        {(unsigned long)__GFP_RECLAIM,          "__GFP_RECLAIM"},       \
        {(unsigned long)__GFP_DIRECT_RECLAIM,   "__GFP_DIRECT_RECLAIM"},\
        {(unsigned long)__GFP_KSWAPD_RECLAIM,   "__GFP_KSWAPD_RECLAIM"},\
-       {(unsigned long)__GFP_ZEROTAGS,         "__GFP_ZEROTAGS"},      \
-       {(unsigned long)__GFP_SKIP_KASAN_POISON,"__GFP_SKIP_KASAN_POISON"}\
+       {(unsigned long)__GFP_ZEROTAGS,         "__GFP_ZEROTAGS"}       \
+
+#ifdef CONFIG_KASAN_HW_TAGS
+#define __def_gfpflag_names_kasan ,                                           \
+       {(unsigned long)__GFP_SKIP_ZERO,           "__GFP_SKIP_ZERO"},         \
+       {(unsigned long)__GFP_SKIP_KASAN_POISON,   "__GFP_SKIP_KASAN_POISON"}, \
+       {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"}
+#else
+#define __def_gfpflag_names_kasan
+#endif
 
 #define show_gfp_flags(flags)                                          \
        (flags) ? __print_flags(flags, "|",                             \
-       __def_gfpflag_names                                             \
+       __def_gfpflag_names __def_gfpflag_names_kasan                   \
        ) : "none"
 
 #ifdef CONFIG_MMU
index ca3f276..202b3e3 100644 (file)
@@ -48,6 +48,33 @@ TRACE_EVENT(hugepage_update,
            TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set)
 );
 
+DECLARE_EVENT_CLASS(migration_pmd,
+
+               TP_PROTO(unsigned long addr, unsigned long pmd),
+
+               TP_ARGS(addr, pmd),
+
+               TP_STRUCT__entry(
+                       __field(unsigned long, addr)
+                       __field(unsigned long, pmd)
+               ),
+
+               TP_fast_assign(
+                       __entry->addr = addr;
+                       __entry->pmd = pmd;
+               ),
+               TP_printk("addr=%lx, pmd=%lx", __entry->addr, __entry->pmd)
+);
+
+DEFINE_EVENT(migration_pmd, set_migration_pmd,
+       TP_PROTO(unsigned long addr, unsigned long pmd),
+       TP_ARGS(addr, pmd)
+);
+
+DEFINE_EVENT(migration_pmd, remove_migration_pmd,
+       TP_PROTO(unsigned long addr, unsigned long pmd),
+       TP_ARGS(addr, pmd)
+);
 #endif /* _TRACE_THP_H */
 
 /* This part must be outside protection */
index 1567a32..6c1aa92 100644 (file)
@@ -75,6 +75,8 @@
 #define MADV_POPULATE_READ     22      /* populate (prefault) page tables readable */
 #define MADV_POPULATE_WRITE    23      /* populate (prefault) page tables writable */
 
+#define MADV_DONTNEED_LOCKED   24      /* like DONTNEED, but drop locked pages too */
+
 /* compatibility flags */
 #define MAP_FILE       0
 
index 834af51..9796897 100644 (file)
@@ -286,11 +286,13 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
                if (!s)
                        continue;
 
-               /* Mark stack accessible for KASAN. */
+               /* Reset stack metadata. */
                kasan_unpoison_range(s->addr, THREAD_SIZE);
 
+               stack = kasan_reset_tag(s->addr);
+
                /* Clear stale pointers from reused stack. */
-               memset(s->addr, 0, THREAD_SIZE);
+               memset(stack, 0, THREAD_SIZE);
 
                if (memcg_charge_kernel_stack(s)) {
                        vfree(s->addr);
@@ -298,7 +300,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
                }
 
                tsk->stack_vm_area = s;
-               tsk->stack = s->addr;
+               tsk->stack = stack;
                return 0;
        }
 
@@ -326,6 +328,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
         * so cache the vm_struct.
         */
        tsk->stack_vm_area = vm;
+       stack = kasan_reset_tag(stack);
        tsk->stack = stack;
        return 0;
 }
index 579841b..b7e1b09 100644 (file)
@@ -32,15 +32,19 @@ static void *__scs_alloc(int node)
        for (i = 0; i < NR_CACHED_SCS; i++) {
                s = this_cpu_xchg(scs_cache[i], NULL);
                if (s) {
-                       kasan_unpoison_vmalloc(s, SCS_SIZE);
+                       s = kasan_unpoison_vmalloc(s, SCS_SIZE,
+                                                  KASAN_VMALLOC_PROT_NORMAL);
                        memset(s, 0, SCS_SIZE);
-                       return s;
+                       goto out;
                }
        }
 
-       return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
+       s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
                                    GFP_SCS, PAGE_KERNEL, 0, node,
                                    __builtin_return_address(0));
+
+out:
+       return kasan_reset_tag(s);
 }
 
 void *scs_alloc(int node)
@@ -78,7 +82,7 @@ void scs_free(void *s)
                if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
                        return;
 
-       kasan_unpoison_vmalloc(s, SCS_SIZE);
+       kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL);
        vfree_atomic(s);
 }
 
index 879757b..1f3e620 100644 (file)
@@ -178,17 +178,17 @@ config KASAN_TAGS_IDENTIFY
          memory consumption.
 
 config KASAN_VMALLOC
-       bool "Back mappings in vmalloc space with real shadow memory"
-       depends on KASAN_GENERIC && HAVE_ARCH_KASAN_VMALLOC
+       bool "Check accesses to vmalloc allocations"
+       depends on HAVE_ARCH_KASAN_VMALLOC
        help
-         By default, the shadow region for vmalloc space is the read-only
-         zero page. This means that KASAN cannot detect errors involving
-         vmalloc space.
-
-         Enabling this option will hook in to vmap/vmalloc and back those
-         mappings with real shadow memory allocated on demand. This allows
-         for KASAN to detect more sorts of errors (and to support vmapped
-         stacks), but at the cost of higher memory usage.
+         This mode makes KASAN check accesses to vmalloc allocations for
+         validity.
+
+         With software KASAN modes, checking is done for all types of vmalloc
+         allocations. Enabling this option leads to higher memory usage.
+
+         With hardware tag-based KASAN, only VM_ALLOC mappings are checked.
+         There is no additional memory usage.
 
 config KASAN_KUNIT_TEST
        tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS
index 3b413f8..ad88023 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/vmalloc.h>
+#include <linux/set_memory.h>
 
 #include <asm/page.h>
 
@@ -36,7 +37,7 @@ void *kasan_ptr_result;
 int kasan_int_result;
 
 static struct kunit_resource resource;
-static struct kunit_kasan_expectation fail_data;
+static struct kunit_kasan_status test_status;
 static bool multishot;
 
 /*
@@ -53,58 +54,63 @@ static int kasan_test_init(struct kunit *test)
        }
 
        multishot = kasan_save_enable_multi_shot();
-       fail_data.report_found = false;
+       test_status.report_found = false;
+       test_status.sync_fault = false;
        kunit_add_named_resource(test, NULL, NULL, &resource,
-                                       "kasan_data", &fail_data);
+                                       "kasan_status", &test_status);
        return 0;
 }
 
 static void kasan_test_exit(struct kunit *test)
 {
        kasan_restore_multi_shot(multishot);
-       KUNIT_EXPECT_FALSE(test, fail_data.report_found);
+       KUNIT_EXPECT_FALSE(test, test_status.report_found);
 }
 
 /**
  * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a
  * KASAN report; causes a test failure otherwise. This relies on a KUnit
- * resource named "kasan_data". Do not use this name for KUnit resources
+ * resource named "kasan_status". Do not use this name for KUnit resources
  * outside of KASAN tests.
  *
- * For hardware tag-based KASAN in sync mode, when a tag fault happens, tag
+ * For hardware tag-based KASAN, when a synchronous tag fault happens, tag
  * checking is auto-disabled. When this happens, this test handler reenables
  * tag checking. As tag checking can be only disabled or enabled per CPU,
  * this handler disables migration (preemption).
  *
- * Since the compiler doesn't see that the expression can change the fail_data
+ * Since the compiler doesn't see that the expression can change the test_status
  * fields, it can reorder or optimize away the accesses to those fields.
  * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the
  * expression to prevent that.
  *
- * In between KUNIT_EXPECT_KASAN_FAIL checks, fail_data.report_found is kept as
- * false. This allows detecting KASAN reports that happen outside of the checks
- * by asserting !fail_data.report_found at the start of KUNIT_EXPECT_KASAN_FAIL
- * and in kasan_test_exit.
+ * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept
+ * as false. This allows detecting KASAN reports that happen outside of the
+ * checks by asserting !test_status.report_found at the start of
+ * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit.
  */
 #define KUNIT_EXPECT_KASAN_FAIL(test, expression) do {                 \
        if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) &&                         \
            kasan_sync_fault_possible())                                \
                migrate_disable();                                      \
-       KUNIT_EXPECT_FALSE(test, READ_ONCE(fail_data.report_found));    \
+       KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found));  \
        barrier();                                                      \
        expression;                                                     \
        barrier();                                                      \
-       if (!READ_ONCE(fail_data.report_found)) {                       \
+       if (kasan_async_fault_possible())                               \
+               kasan_force_async_fault();                              \
+       if (!READ_ONCE(test_status.report_found)) {                     \
                KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure "  \
                                "expected in \"" #expression            \
                                 "\", but none occurred");              \
        }                                                               \
-       if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) {                         \
-               if (READ_ONCE(fail_data.report_found))                  \
-                       kasan_enable_tagging_sync();                    \
+       if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) &&                         \
+           kasan_sync_fault_possible()) {                              \
+               if (READ_ONCE(test_status.report_found) &&              \
+                   READ_ONCE(test_status.sync_fault))                  \
+                       kasan_enable_tagging();                         \
                migrate_enable();                                       \
        }                                                               \
-       WRITE_ONCE(fail_data.report_found, false);                      \
+       WRITE_ONCE(test_status.report_found, false);                    \
 } while (0)
 
 #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do {                  \
@@ -780,7 +786,7 @@ static void ksize_uaf(struct kunit *test)
 static void kasan_stack_oob(struct kunit *test)
 {
        char stack_array[10];
-       /* See comment in kasan_global_oob. */
+       /* See comment in kasan_global_oob_right. */
        char *volatile array = stack_array;
        char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF];
 
@@ -793,7 +799,7 @@ static void kasan_alloca_oob_left(struct kunit *test)
 {
        volatile int i = 10;
        char alloca_array[i];
-       /* See comment in kasan_global_oob. */
+       /* See comment in kasan_global_oob_right. */
        char *volatile array = alloca_array;
        char *p = array - 1;
 
@@ -808,7 +814,7 @@ static void kasan_alloca_oob_right(struct kunit *test)
 {
        volatile int i = 10;
        char alloca_array[i];
-       /* See comment in kasan_global_oob. */
+       /* See comment in kasan_global_oob_right. */
        char *volatile array = alloca_array;
        char *p = array + i;
 
@@ -1057,21 +1063,186 @@ static void kmalloc_double_kzfree(struct kunit *test)
        KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr));
 }
 
+static void vmalloc_helpers_tags(struct kunit *test)
+{
+       void *ptr;
+
+       /* This test is intended for tag-based modes. */
+       KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+       KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+       ptr = vmalloc(PAGE_SIZE);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+       /* Check that the returned pointer is tagged. */
+       KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+       KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+
+       /* Make sure exported vmalloc helpers handle tagged pointers. */
+       KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr));
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr));
+
+#if !IS_MODULE(CONFIG_KASAN_KUNIT_TEST)
+       {
+               int rv;
+
+               /* Make sure vmalloc'ed memory permissions can be changed. */
+               rv = set_memory_ro((unsigned long)ptr, 1);
+               KUNIT_ASSERT_GE(test, rv, 0);
+               rv = set_memory_rw((unsigned long)ptr, 1);
+               KUNIT_ASSERT_GE(test, rv, 0);
+       }
+#endif
+
+       vfree(ptr);
+}
+
 static void vmalloc_oob(struct kunit *test)
 {
-       void *area;
+       char *v_ptr, *p_ptr;
+       struct page *page;
+       size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5;
 
        KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
 
+       v_ptr = vmalloc(size);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+       OPTIMIZER_HIDE_VAR(v_ptr);
+
        /*
-        * We have to be careful not to hit the guard page.
+        * We have to be careful not to hit the guard page in vmalloc tests.
         * The MMU will catch that and crash us.
         */
-       area = vmalloc(3000);
-       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, area);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]);
-       vfree(area);
+       /* Make sure in-bounds accesses are valid. */
+       v_ptr[0] = 0;
+       v_ptr[size - 1] = 0;
+
+       /*
+        * An unaligned access past the requested vmalloc size.
+        * Only generic KASAN can precisely detect these.
+        */
+       if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+               KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]);
+
+       /* An aligned access into the first out-of-bounds granule. */
+       KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]);
+
+       /* Check that in-bounds accesses to the physical page are valid. */
+       page = vmalloc_to_page(v_ptr);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+       p_ptr = page_address(page);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+       p_ptr[0] = 0;
+
+       vfree(v_ptr);
+
+       /*
+        * We can't check for use-after-unmap bugs in this or in the following
+        * vmalloc tests, as the page might be fully unmapped and accessing it
+        * will crash the kernel.
+        */
+}
+
+static void vmap_tags(struct kunit *test)
+{
+       char *p_ptr, *v_ptr;
+       struct page *p_page, *v_page;
+
+       /*
+        * This test is specifically crafted for the software tag-based mode,
+        * the only tag-based mode that poisons vmap mappings.
+        */
+       KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+       KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+       p_page = alloc_pages(GFP_KERNEL, 1);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page);
+       p_ptr = page_address(p_page);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+       v_ptr = vmap(&p_page, 1, VM_MAP, PAGE_KERNEL);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+       /*
+        * We can't check for out-of-bounds bugs in this or in the following
+        * vmalloc tests, as allocations have page granularity and accessing
+        * the guard page will crash the kernel.
+        */
+
+       KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+       KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+       /* Make sure that in-bounds accesses through both pointers work. */
+       *p_ptr = 0;
+       *v_ptr = 0;
+
+       /* Make sure vmalloc_to_page() correctly recovers the page pointer. */
+       v_page = vmalloc_to_page(v_ptr);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page);
+       KUNIT_EXPECT_PTR_EQ(test, p_page, v_page);
+
+       vunmap(v_ptr);
+       free_pages((unsigned long)p_ptr, 1);
+}
+
+static void vm_map_ram_tags(struct kunit *test)
+{
+       char *p_ptr, *v_ptr;
+       struct page *page;
+
+       /*
+        * This test is specifically crafted for the software tag-based mode,
+        * the only tag-based mode that poisons vm_map_ram mappings.
+        */
+       KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+       page = alloc_pages(GFP_KERNEL, 1);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+       p_ptr = page_address(page);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+       v_ptr = vm_map_ram(&page, 1, -1);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+       KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+       KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+       /* Make sure that in-bounds accesses through both pointers work. */
+       *p_ptr = 0;
+       *v_ptr = 0;
+
+       vm_unmap_ram(v_ptr, 1);
+       free_pages((unsigned long)p_ptr, 1);
+}
+
+static void vmalloc_percpu(struct kunit *test)
+{
+       char __percpu *ptr;
+       int cpu;
+
+       /*
+        * This test is specifically crafted for the software tag-based mode,
+        * the only tag-based mode that poisons percpu mappings.
+        */
+       KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+       ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+       KUNIT_ASSERT_TRUE(test, ptr != NULL); /* abort instead of touching a failed allocation */
+       for_each_possible_cpu(cpu) {
+               char *c_ptr = per_cpu_ptr(ptr, cpu);
+
+               KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN);
+               KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL);
+
+               /* Make sure that in-bounds accesses don't crash the kernel. */
+               *c_ptr = 0;
+       }
+
+       free_percpu(ptr);
 }
 
 /*
@@ -1105,6 +1276,18 @@ static void match_all_not_assigned(struct kunit *test)
                KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
                free_pages((unsigned long)ptr, order);
        }
+
+       if (!IS_ENABLED(CONFIG_KASAN_VMALLOC))
+               return;
+
+       for (i = 0; i < 256; i++) {
+               size = (get_random_int() % 1024) + 1;
+               ptr = vmalloc(size);
+               KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+               KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+               KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+               vfree(ptr);
+       }
 }
 
 /* Check that 0xff works as a match-all pointer tag for tag-based modes. */
@@ -1210,7 +1393,11 @@ static struct kunit_case kasan_kunit_test_cases[] = {
        KUNIT_CASE(kasan_bitops_generic),
        KUNIT_CASE(kasan_bitops_tags),
        KUNIT_CASE(kmalloc_double_kzfree),
+       KUNIT_CASE(vmalloc_helpers_tags),
        KUNIT_CASE(vmalloc_oob),
+       KUNIT_CASE(vmap_tags),
+       KUNIT_CASE(vm_map_ram_tags),
+       KUNIT_CASE(vmalloc_percpu),
        KUNIT_CASE(match_all_not_assigned),
        KUNIT_CASE(match_all_ptr_tag),
        KUNIT_CASE(match_all_mem_tag),
index 53fe73a..40d26a0 100644 (file)
@@ -2906,13 +2906,15 @@ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
 {
        int i;
 
+       if (unlikely(!size))
+               return 0;
+
        i = vsnprintf(buf, size, fmt, args);
 
        if (likely(i < size))
                return i;
-       if (size != 0)
-               return size - 1;
-       return 0;
+
+       return size - 1;
 }
 EXPORT_SYMBOL(vscnprintf);
 
index 761f502..034d879 100644 (file)
@@ -762,6 +762,9 @@ config ARCH_HAS_CURRENT_STACK_POINTER
          register alias named "current_stack_pointer", this config can be
          selected.
 
+config ARCH_HAS_FILTER_PGPROT
+       bool
+
 config ARCH_HAS_PTE_DEVMAP
        bool
 
index eeb7ea3..bef329b 100644 (file)
@@ -261,5 +261,4 @@ void page_init_poison(struct page *page, size_t size)
        if (page_init_poisoning)
                memset(page, PAGE_POISON_PATTERN, size);
 }
-EXPORT_SYMBOL_GPL(page_init_poison);
 #endif         /* CONFIG_DEBUG_VM */
index d2e6a79..8426434 100644 (file)
@@ -152,25 +152,25 @@ static void filemap_unaccount_folio(struct address_space *mapping,
 
        VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
        if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
-               int mapcount;
-
                pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
                         current->comm, folio_pfn(folio));
                dump_page(&folio->page, "still mapped when deleted");
                dump_stack();
                add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 
-               mapcount = page_mapcount(&folio->page);
-               if (mapping_exiting(mapping) &&
-                   folio_ref_count(folio) >= mapcount + 2) {
-                       /*
-                        * All vmas have already been torn down, so it's
-                        * a good bet that actually the folio is unmapped,
-                        * and we'd prefer not to leak it: if we're wrong,
-                        * some other bad page check should catch it later.
-                        */
-                       page_mapcount_reset(&folio->page);
-                       folio_ref_sub(folio, mapcount);
+               if (mapping_exiting(mapping) && !folio_test_large(folio)) {
+                       int mapcount = page_mapcount(&folio->page);
+
+                       if (folio_ref_count(folio) >= mapcount + 2) {
+                               /*
+                                * All vmas have already been torn down, so it's
+                                * a good bet that actually the page is unmapped
+                                * and we'd rather not leak it: if we're wrong,
+                                * another bad page check should catch it later.
+                                */
+                               page_mapcount_reset(&folio->page);
+                               folio_ref_sub(folio, mapcount);
+                       }
                }
        }
 
@@ -193,16 +193,20 @@ static void filemap_unaccount_folio(struct address_space *mapping,
        /*
         * At this point folio must be either written or cleaned by
         * truncate.  Dirty folio here signals a bug and loss of
-        * unwritten data.
+        * unwritten data - on ordinary filesystems.
+        *
+        * But it's harmless on in-memory filesystems like tmpfs; and can
+        * occur when a driver which did get_user_pages() sets page dirty
+        * before putting it, while the inode is being finally evicted.
         *
-        * This fixes dirty accounting after removing the folio entirely
+        * Below fixes dirty accounting after removing the folio entirely
         * but leaves the dirty flag set: it has no effect for truncated
         * folio and anyway will be cleared before returning folio to
         * buddy allocator.
         */
-       if (WARN_ON_ONCE(folio_test_dirty(folio)))
-               folio_account_cleaned(folio, mapping,
-                                       inode_to_wb(mapping->host));
+       if (WARN_ON_ONCE(folio_test_dirty(folio) &&
+                        mapping_can_writeback(mapping)))
+               folio_account_cleaned(folio, inode_to_wb(mapping->host));
 }
 
 /*
@@ -1185,24 +1189,17 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
        }
 
        /*
-        * It is possible for other pages to have collided on the waitqueue
-        * hash, so in that case check for a page match. That prevents a long-
-        * term waiter
+        * It's possible to miss clearing waiters here, when we woke our page
+        * waiters, but the hashed waitqueue has waiters for other pages on it.
+        * That's okay, it's a rare case. The next waker will clear it.
         *
-        * It is still possible to miss a case here, when we woke page waiters
-        * and removed them from the waitqueue, but there are still other
-        * page waiters.
+        * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
+        * other), the flag may be cleared in the course of freeing the page;
+        * but that is not required for correctness.
         */
-       if (!waitqueue_active(q) || !key.page_match) {
+       if (!waitqueue_active(q) || !key.page_match)
                folio_clear_waiters(folio);
-               /*
-                * It's possible to miss clearing Waiters here, when we woke
-                * our page waiters, but the hashed waitqueue has waiters for
-                * other pages on it.
-                *
-                * That's okay, it's a rare case. The next waker will clear it.
-                */
-       }
+
        spin_unlock_irqrestore(&q->lock, flags);
 }
 
index 005fab2..2fe3821 100644 (file)
@@ -40,6 +40,9 @@
 #include <asm/pgalloc.h>
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
 /*
  * By default, transparent hugepage support is disabled in order to avoid
  * risking an increased memory footprint for applications that are not
@@ -530,7 +533,7 @@ void prep_transhuge_page(struct page *page)
        set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
 }
 
-bool is_transparent_hugepage(struct page *page)
+static inline bool is_transparent_hugepage(struct page *page)
 {
        if (!PageCompound(page))
                return false;
@@ -539,7 +542,6 @@ bool is_transparent_hugepage(struct page *page)
        return is_huge_zero_page(page) ||
               page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
 }
-EXPORT_SYMBOL_GPL(is_transparent_hugepage);
 
 static unsigned long __thp_get_unmapped_area(struct file *filp,
                unsigned long addr, unsigned long len,
@@ -1301,7 +1303,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
        page = pmd_page(orig_pmd);
        VM_BUG_ON_PAGE(!PageHead(page), page);
 
-       /* Lock page for reuse_swap_page() */
        if (!trylock_page(page)) {
                get_page(page);
                spin_unlock(vmf->ptl);
@@ -1317,10 +1318,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
        }
 
        /*
-        * We can only reuse the page if nobody else maps the huge page or it's
-        * part.
+        * See do_wp_page(): we can only map the page writable if there are
+        * no additional references. Note that we always drain the LRU
+        * pagevecs immediately after adding a THP.
         */
-       if (reuse_swap_page(page)) {
+       if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
+               goto unlock_fallback;
+       if (PageSwapCache(page))
+               try_to_free_swap(page);
+       if (page_count(page) == 1) {
                pmd_t entry;
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1331,6 +1337,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
                return VM_FAULT_WRITE;
        }
 
+unlock_fallback:
        unlock_page(page);
        spin_unlock(vmf->ptl);
 fallback:
@@ -2126,8 +2133,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
        spinlock_t *ptl;
        struct mmu_notifier_range range;
-       bool do_unlock_folio = false;
-       pmd_t _pmd;
 
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
                                address & HPAGE_PMD_MASK,
@@ -2146,42 +2151,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        goto out;
        }
 
-repeat:
-       if (pmd_trans_huge(*pmd)) {
-               if (!folio) {
-                       folio = page_folio(pmd_page(*pmd));
-                       /*
-                        * An anonymous page must be locked, to ensure that a
-                        * concurrent reuse_swap_page() sees stable mapcount;
-                        * but reuse_swap_page() is not used on shmem or file,
-                        * and page lock must not be taken when zap_pmd_range()
-                        * calls __split_huge_pmd() while i_mmap_lock is held.
-                        */
-                       if (folio_test_anon(folio)) {
-                               if (unlikely(!folio_trylock(folio))) {
-                                       folio_get(folio);
-                                       _pmd = *pmd;
-                                       spin_unlock(ptl);
-                                       folio_lock(folio);
-                                       spin_lock(ptl);
-                                       if (unlikely(!pmd_same(*pmd, _pmd))) {
-                                               folio_unlock(folio);
-                                               folio_put(folio);
-                                               folio = NULL;
-                                               goto repeat;
-                                       }
-                                       folio_put(folio);
-                               }
-                               do_unlock_folio = true;
-                       }
-               }
-       } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
-               goto out;
-       __split_huge_pmd_locked(vma, pmd, range.start, freeze);
+       if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
+           is_pmd_migration_entry(*pmd))
+               __split_huge_pmd_locked(vma, pmd, range.start, freeze);
+
 out:
        spin_unlock(ptl);
-       if (do_unlock_folio)
-               folio_unlock(folio);
        /*
         * No need to double call mmu_notifier->invalidate_range() callback.
         * They are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2476,54 +2451,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
        }
 }
 
-/*
- * This calculates accurately how many mappings a transparent hugepage
- * has (unlike page_mapcount() which isn't fully accurate). This full
- * accuracy is primarily needed to know if copy-on-write faults can
- * reuse the page and change the mapping to read-write instead of
- * copying them. At the same time this returns the total_mapcount too.
- *
- * The function returns the highest mapcount any one of the subpages
- * has. If the return value is one, even if different processes are
- * mapping different subpages of the transparent hugepage, they can
- * all reuse it, because each process is reusing a different subpage.
- *
- * The total_mapcount is instead counting all virtual mappings of the
- * subpages. If the total_mapcount is equal to "one", it tells the
- * caller all mappings belong to the same "mm" and in turn the
- * anon_vma of the transparent hugepage can become the vma->anon_vma
- * local one as no other process may be mapping any of the subpages.
- *
- * It would be more accurate to replace page_mapcount() with
- * page_trans_huge_mapcount(), however we only use
- * page_trans_huge_mapcount() in the copy-on-write faults where we
- * need full accuracy to avoid breaking page pinning, because
- * page_trans_huge_mapcount() is slower than page_mapcount().
- */
-int page_trans_huge_mapcount(struct page *page)
-{
-       int i, ret;
-
-       /* hugetlbfs shouldn't call it */
-       VM_BUG_ON_PAGE(PageHuge(page), page);
-
-       if (likely(!PageTransCompound(page)))
-               return atomic_read(&page->_mapcount) + 1;
-
-       page = compound_head(page);
-
-       ret = 0;
-       for (i = 0; i < thp_nr_pages(page); i++) {
-               int mapcount = atomic_read(&page[i]._mapcount) + 1;
-               ret = max(ret, mapcount);
-       }
-
-       if (PageDoubleMap(page))
-               ret -= 1;
-
-       return ret + compound_mapcount(page);
-}
-
 /* Racy check whether the huge page can be split */
 bool can_split_folio(struct folio *folio, int *pextra_pins)
 {
@@ -3131,6 +3058,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
        set_pmd_at(mm, address, pvmw->pmd, pmdswp);
        page_remove_rmap(page, vma, true);
        put_page(page);
+       trace_set_migration_pmd(address, pmd_val(pmdswp));
 }
 
 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -3163,5 +3091,6 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
 
        /* No need to invalidate - it was non-present before */
        update_mmu_cache_pmd(vma, address, pvmw->pmd);
+       trace_remove_migration_pmd(address, pmd_val(pmde));
 }
 #endif
index adcd9ac..1f84df9 100644 (file)
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
 CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
 CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
 
-obj-$(CONFIG_KASAN) := common.o report.o
+obj-y := common.o report.o
 obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
 obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o
 obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o
index 9219656..d9079ec 100644 (file)
@@ -387,7 +387,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
        }
 
        /*
-        * The object will be poisoned by kasan_free_pages() or
+        * The object will be poisoned by kasan_poison_pages() or
         * kasan_slab_free_mempool().
         */
 
@@ -538,7 +538,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
                return NULL;
 
        /*
-        * The object has already been unpoisoned by kasan_alloc_pages() for
+        * The object has already been unpoisoned by kasan_unpoison_pages() for
         * alloc_pages() or by kasan_krealloc() for krealloc().
         */
 
index 7355cb5..07a76c4 100644 (file)
@@ -32,6 +32,12 @@ enum kasan_arg_mode {
        KASAN_ARG_MODE_ASYMM,
 };
 
+enum kasan_arg_vmalloc {
+       KASAN_ARG_VMALLOC_DEFAULT,      /* no valid kasan.vmalloc= value parsed */
+       KASAN_ARG_VMALLOC_OFF,          /* kasan.vmalloc=off */
+       KASAN_ARG_VMALLOC_ON,           /* kasan.vmalloc=on */
+};
+
 enum kasan_arg_stacktrace {
        KASAN_ARG_STACKTRACE_DEFAULT,
        KASAN_ARG_STACKTRACE_OFF,
@@ -40,18 +46,28 @@ enum kasan_arg_stacktrace {
 
 static enum kasan_arg kasan_arg __ro_after_init;
 static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
-static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
+static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata;
+static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata;
 
-/* Whether KASAN is enabled at all. */
+/*
+ * Whether KASAN is enabled at all.
+ * The value remains false until KASAN is initialized by kasan_init_hw_tags().
+ */
 DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
 EXPORT_SYMBOL(kasan_flag_enabled);
 
-/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/
+/*
+ * Whether the selected mode is synchronous, asynchronous, or asymmetric.
+ * Defaults to KASAN_MODE_SYNC.
+ */
 enum kasan_mode kasan_mode __ro_after_init;
 EXPORT_SYMBOL_GPL(kasan_mode);
 
+/* Whether to enable vmalloc tagging. */
+DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+
 /* Whether to collect alloc/free stack traces. */
-DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
 
 /* kasan=off/on */
 static int __init early_kasan_flag(char *arg)
@@ -89,6 +105,23 @@ static int __init early_kasan_mode(char *arg)
 }
 early_param("kasan.mode", early_kasan_mode);
 
+/* kasan.vmalloc=off/on: record the choice in kasan_arg_vmalloc, else -EINVAL. */
+static int __init early_kasan_flag_vmalloc(char *arg)
+{
+       if (!arg)       /* no value string to parse */
+               return -EINVAL;
+
+       if (!strcmp(arg, "off"))
+               kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF;
+       else if (!strcmp(arg, "on"))
+               kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON;
+       else            /* anything other than "off" or "on" is rejected */
+               return -EINVAL;
+
+       return 0;
+}
+early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
+
 /* kasan.stacktrace=off/on */
 static int __init early_kasan_flag_stacktrace(char *arg)
 {
@@ -116,7 +149,10 @@ static inline const char *kasan_mode_info(void)
                return "sync";
 }
 
-/* kasan_init_hw_tags_cpu() is called for each CPU. */
+/*
+ * kasan_init_hw_tags_cpu() is called for each CPU.
+ * Not marked as __init as a CPU can be hot-plugged after boot.
+ */
 void kasan_init_hw_tags_cpu(void)
 {
        /*
@@ -124,7 +160,11 @@ void kasan_init_hw_tags_cpu(void)
         * as this function is only called for MTE-capable hardware.
         */
 
-       /* If KASAN is disabled via command line, don't initialize it. */
+       /*
+        * If KASAN is disabled via command line, don't initialize it.
+        * When this function is called, kasan_flag_enabled is not yet
+        * set by kasan_init_hw_tags(). Thus, check kasan_arg instead.
+        */
        if (kasan_arg == KASAN_ARG_OFF)
                return;
 
@@ -132,12 +172,7 @@ void kasan_init_hw_tags_cpu(void)
         * Enable async or asymm modes only when explicitly requested
         * through the command line.
         */
-       if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
-               hw_enable_tagging_async();
-       else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
-               hw_enable_tagging_asymm();
-       else
-               hw_enable_tagging_sync();
+       kasan_enable_tagging();
 }
 
 /* kasan_init_hw_tags() is called once on boot CPU. */
@@ -151,86 +186,168 @@ void __init kasan_init_hw_tags(void)
        if (kasan_arg == KASAN_ARG_OFF)
                return;
 
-       /* Enable KASAN. */
-       static_branch_enable(&kasan_flag_enabled);
-
        switch (kasan_arg_mode) {
        case KASAN_ARG_MODE_DEFAULT:
-               /*
-                * Default to sync mode.
-                */
-               fallthrough;
+               /* Default is specified by kasan_mode definition. */
+               break;
        case KASAN_ARG_MODE_SYNC:
-               /* Sync mode enabled. */
                kasan_mode = KASAN_MODE_SYNC;
                break;
        case KASAN_ARG_MODE_ASYNC:
-               /* Async mode enabled. */
                kasan_mode = KASAN_MODE_ASYNC;
                break;
        case KASAN_ARG_MODE_ASYMM:
-               /* Asymm mode enabled. */
                kasan_mode = KASAN_MODE_ASYMM;
                break;
        }
 
+       switch (kasan_arg_vmalloc) {
+       case KASAN_ARG_VMALLOC_DEFAULT:
+               /* Default is specified by kasan_flag_vmalloc definition. */
+               break;
+       case KASAN_ARG_VMALLOC_OFF:
+               static_branch_disable(&kasan_flag_vmalloc);
+               break;
+       case KASAN_ARG_VMALLOC_ON:
+               static_branch_enable(&kasan_flag_vmalloc);
+               break;
+       }
+
        switch (kasan_arg_stacktrace) {
        case KASAN_ARG_STACKTRACE_DEFAULT:
-               /* Default to enabling stack trace collection. */
-               static_branch_enable(&kasan_flag_stacktrace);
+               /* Default is specified by kasan_flag_stacktrace definition. */
                break;
        case KASAN_ARG_STACKTRACE_OFF:
-               /* Do nothing, kasan_flag_stacktrace keeps its default value. */
+               static_branch_disable(&kasan_flag_stacktrace);
                break;
        case KASAN_ARG_STACKTRACE_ON:
                static_branch_enable(&kasan_flag_stacktrace);
                break;
        }
 
-       pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n",
+       /* KASAN is now initialized, enable it. */
+       static_branch_enable(&kasan_flag_enabled);
+
+       pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
                kasan_mode_info(),
+               kasan_vmalloc_enabled() ? "on" : "off",
                kasan_stack_collection_enabled() ? "on" : "off");
 }
 
-void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
+#ifdef CONFIG_KASAN_VMALLOC
+
+static void unpoison_vmalloc_pages(const void *addr, u8 tag)
 {
+       struct vm_struct *area;
+       int i;
+
        /*
-        * This condition should match the one in post_alloc_hook() in
-        * page_alloc.c.
+        * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations
+        * (see the comment in __kasan_unpoison_vmalloc), all of the pages
+        * should belong to a single area.
         */
-       bool init = !want_init_on_free() && want_init_on_alloc(flags);
-
-       if (flags & __GFP_SKIP_KASAN_POISON)
-               SetPageSkipKASanPoison(page);
+       area = find_vm_area((void *)addr);
+       if (WARN_ON(!area))
+               return;
 
-       if (flags & __GFP_ZEROTAGS) {
-               int i;
+       for (i = 0; i < area->nr_pages; i++) {
+               struct page *page = area->pages[i];
 
-               for (i = 0; i != 1 << order; ++i)
-                       tag_clear_highpage(page + i);
-       } else {
-               kasan_unpoison_pages(page, order, init);
+               page_kasan_tag_set(page, tag);
        }
 }
 
-void kasan_free_pages(struct page *page, unsigned int order)
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+                               kasan_vmalloc_flags_t flags)
 {
+       u8 tag;
+       unsigned long redzone_start, redzone_size;
+
+       if (!kasan_vmalloc_enabled())
+               return (void *)start;
+
+       if (!is_vmalloc_or_module_addr(start))
+               return (void *)start;
+
+       /*
+        * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC
+        * mappings as:
+        *
+        * 1. Unlike the software KASAN modes, hardware tag-based KASAN only
+        *    supports tagging physical memory. Therefore, it can only tag a
+        *    single mapping of normal physical pages.
+        * 2. Hardware tag-based KASAN can only tag memory mapped with special
+        *    mapping protection bits, see arch_vmalloc_pgprot_modify().
+        *    As non-VM_ALLOC mappings can be mapped outside of vmalloc code,
+        *    providing these bits would require tracking all non-VM_ALLOC
+        *    mappers.
+        *
+        * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags
+        * the first virtual mapping, which is created by vmalloc().
+        * Tagging the page_alloc memory backing that vmalloc() allocation is
+        * skipped, see ___GFP_SKIP_KASAN_UNPOISON.
+        *
+        * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual.
+        */
+       if (!(flags & KASAN_VMALLOC_VM_ALLOC))
+               return (void *)start;
+
+       /*
+        * Don't tag executable memory.
+        * The kernel doesn't tolerate having the PC register tagged.
+        */
+       if (!(flags & KASAN_VMALLOC_PROT_NORMAL))
+               return (void *)start;
+
+       tag = kasan_random_tag();
+       start = set_tag(start, tag);
+
+       /* Unpoison and initialize memory up to size. */
+       kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT);
+
+       /*
+        * Explicitly poison and initialize the in-page vmalloc() redzone.
+        * Unlike software KASAN modes, hardware tag-based KASAN doesn't
+        * unpoison memory when populating shadow for vmalloc() space.
+        */
+       redzone_start = round_up((unsigned long)start + size,
+                                KASAN_GRANULE_SIZE);
+       redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start;
+       kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID,
+                    flags & KASAN_VMALLOC_INIT);
+
        /*
-        * This condition should match the one in free_pages_prepare() in
-        * page_alloc.c.
+        * Set per-page tag flags to allow accessing physical memory for the
+        * vmalloc() mapping through page_address(vmalloc_to_page()).
         */
-       bool init = want_init_on_free();
+       unpoison_vmalloc_pages(start, tag);
 
-       kasan_poison_pages(page, order, init);
+       return (void *)start;
+}
+
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+       /*
+        * No tagging here.
+        * The physical pages backing the vmalloc() allocation are poisoned
+        * through the usual page_alloc paths.
+        */
 }
 
+#endif
+
 #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
 
-void kasan_enable_tagging_sync(void)
+void kasan_enable_tagging(void) /* NOTE(review): only built under CONFIG_KASAN_KUNIT_TEST, yet kasan_init_hw_tags_cpu() calls it; without the test the empty kasan.h stub appears to be used, so tagging would never be enabled - confirm */
 {
-       hw_enable_tagging_sync();
+       if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
+               hw_enable_tagging_async();
+       else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
+               hw_enable_tagging_asymm();
+       else    /* every other kasan_arg_mode value selects sync tagging */
+               hw_enable_tagging_sync();
 }
-EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync);
+EXPORT_SYMBOL_GPL(kasan_enable_tagging);
 
 void kasan_force_async_fault(void)
 {
index c17fa8d..d79b83d 100644 (file)
@@ -12,7 +12,8 @@
 #include <linux/static_key.h>
 #include "../slab.h"
 
-DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
 
 enum kasan_mode {
        KASAN_MODE_SYNC,
@@ -22,6 +23,11 @@ enum kasan_mode {
 
 extern enum kasan_mode kasan_mode __ro_after_init;
 
+static inline bool kasan_vmalloc_enabled(void)
+{
+       return static_branch_likely(&kasan_flag_vmalloc);
+}
+
 static inline bool kasan_stack_collection_enabled(void)
 {
        return static_branch_unlikely(&kasan_flag_stacktrace);
@@ -71,17 +77,19 @@ static inline bool kasan_sync_fault_possible(void)
 #define KASAN_PAGE_REDZONE      0xFE  /* redzone for kmalloc_large allocations */
 #define KASAN_KMALLOC_REDZONE   0xFC  /* redzone inside slub object */
 #define KASAN_KMALLOC_FREE      0xFB  /* object was freed (kmem_cache_free/kfree) */
-#define KASAN_KMALLOC_FREETRACK 0xFA  /* object was freed and has free track set */
+#define KASAN_VMALLOC_INVALID   0xF8  /* unallocated space in vmapped page */
 #else
 #define KASAN_FREE_PAGE         KASAN_TAG_INVALID
 #define KASAN_PAGE_REDZONE      KASAN_TAG_INVALID
 #define KASAN_KMALLOC_REDZONE   KASAN_TAG_INVALID
 #define KASAN_KMALLOC_FREE      KASAN_TAG_INVALID
-#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
+#define KASAN_VMALLOC_INVALID   KASAN_TAG_INVALID /* only for SW_TAGS */
 #endif
 
+#ifdef CONFIG_KASAN_GENERIC
+
+#define KASAN_KMALLOC_FREETRACK 0xFA  /* object was freed and has free track set */
 #define KASAN_GLOBAL_REDZONE    0xF9  /* redzone for global variable */
-#define KASAN_VMALLOC_INVALID   0xF8  /* unallocated space in vmapped page */
 
 /*
  * Stack redzone shadow values
@@ -110,6 +118,8 @@ static inline bool kasan_sync_fault_possible(void)
 #define KASAN_ABI_VERSION 1
 #endif
 
+#endif /* CONFIG_KASAN_GENERIC */
+
 /* Metadata layout customization. */
 #define META_BYTES_PER_BLOCK 1
 #define META_BLOCKS_PER_ROW 16
@@ -117,9 +127,15 @@ static inline bool kasan_sync_fault_possible(void)
 #define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE)
 #define META_ROWS_AROUND_ADDR 2
 
-struct kasan_access_info {
-       const void *access_addr;
-       const void *first_bad_addr;
+enum kasan_report_type {
+       KASAN_REPORT_ACCESS,            /* bad memory access; set by kasan_report() */
+       KASAN_REPORT_INVALID_FREE,      /* set by kasan_report_invalid_free() */
+};
+
+struct kasan_report_info {
+       enum kasan_report_type type;
+       void *access_addr;
+       void *first_bad_addr;
        size_t access_size;
        bool is_write;
        unsigned long ip;
@@ -204,6 +220,14 @@ struct kasan_free_meta {
 #endif
 };
 
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+/* Used in KUnit-compatible KASAN tests. */
+struct kunit_kasan_status {
+       bool report_found;      /* set to true by update_kunit_status() when a report starts */
+       bool sync_fault;        /* the 'sync' value passed to start_report() for that report */
+};
+#endif
+
 struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
                                                const void *object);
 #ifdef CONFIG_KASAN_GENERIC
@@ -221,7 +245,8 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 
 static inline bool addr_has_metadata(const void *addr)
 {
-       return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+       return (kasan_reset_tag(addr) >=
+               kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
 }
 
 /**
@@ -251,10 +276,10 @@ static inline void kasan_print_tags(u8 addr_tag, const void *addr) { }
 #endif
 
 void *kasan_find_first_bad_addr(void *addr, size_t size);
-const char *kasan_get_bug_type(struct kasan_access_info *info);
+const char *kasan_get_bug_type(struct kasan_report_info *info);
 void kasan_metadata_fetch_row(char *buffer, void *row);
 
-#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK)
+#if defined(CONFIG_KASAN_STACK)
 void kasan_print_address_stack_frame(const void *addr);
 #else
 static inline void kasan_print_address_stack_frame(const void *addr) { }
@@ -340,12 +365,12 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
 
 #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
 
-void kasan_enable_tagging_sync(void);
+void kasan_enable_tagging(void);
 void kasan_force_async_fault(void);
 
 #else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
 
-static inline void kasan_enable_tagging_sync(void) { }
+static inline void kasan_enable_tagging(void) { }
 static inline void kasan_force_async_fault(void) { }
 
 #endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
@@ -467,6 +492,13 @@ static inline bool kasan_arch_is_ready(void)       { return true; }
 #error kasan_arch_is_ready only works in KASAN generic outline mode!
 #endif
 
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+
+bool kasan_save_enable_multi_shot(void);
+void kasan_restore_multi_shot(bool enabled);
+
+#endif
+
 /*
  * Exported functions for interfaces called from assembly or from generated
  * code. Declarations here to avoid warning about missing declarations.
index f141465..199d77c 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/ftrace.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/lockdep.h>
 #include <linux/mm.h>
 #include <linux/printk.h>
 #include <linux/sched.h>
@@ -64,6 +65,40 @@ static int __init early_kasan_fault(char *arg)
 }
 early_param("kasan.fault", early_kasan_fault);
 
+static int __init kasan_set_multi_shot(char *str)
+{
+       set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+       return 1;
+}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
+
+/*
+ * Used to suppress reports within kasan_disable/enable_current() critical
+ * sections, which are used for marking accesses to slab metadata.
+ */
+static bool report_suppressed(void)
+{
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+       if (current->kasan_depth)
+               return true;
+#endif
+       return false;
+}
+
+/*
+ * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot
+ * is enabled. Note that KASAN tests effectively enable kasan_multi_shot
+ * for their duration.
+ */
+static bool report_enabled(void)
+{
+       if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+               return true;
+       return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+}
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+
 bool kasan_save_enable_multi_shot(void)
 {
        return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
@@ -77,53 +112,87 @@ void kasan_restore_multi_shot(bool enabled)
 }
 EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
 
-static int __init kasan_set_multi_shot(char *str)
-{
-       set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
-       return 1;
-}
-__setup("kasan_multi_shot", kasan_set_multi_shot);
+#endif
 
-static void print_error_description(struct kasan_access_info *info)
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+static void update_kunit_status(bool sync)
 {
-       pr_err("BUG: KASAN: %s in %pS\n",
-               kasan_get_bug_type(info), (void *)info->ip);
-       if (info->access_size)
-               pr_err("%s of size %zu at addr %px by task %s/%d\n",
-                       info->is_write ? "Write" : "Read", info->access_size,
-                       info->access_addr, current->comm, task_pid_nr(current));
-       else
-               pr_err("%s at addr %px by task %s/%d\n",
-                       info->is_write ? "Write" : "Read",
-                       info->access_addr, current->comm, task_pid_nr(current));
+       struct kunit *test;
+       struct kunit_resource *resource;
+       struct kunit_kasan_status *status;
+
+       test = current->kunit_test;
+       if (!test)
+               return;
+
+       resource = kunit_find_named_resource(test, "kasan_status");
+       if (!resource) {
+               kunit_set_failure(test);
+               return;
+       }
+
+       status = (struct kunit_kasan_status *)resource->data;
+       WRITE_ONCE(status->report_found, true);
+       WRITE_ONCE(status->sync_fault, sync);
+
+       kunit_put_resource(resource);
 }
+#else
+static void update_kunit_status(bool sync) { }
+#endif
 
 static DEFINE_SPINLOCK(report_lock);
 
-static void start_report(unsigned long *flags)
+static void start_report(unsigned long *flags, bool sync)
 {
-       /*
-        * Make sure we don't end up in loop.
-        */
+       /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */
+       disable_trace_on_warning();
+       /* Update status of the currently running KASAN test. */
+       update_kunit_status(sync);
+       /* Do not allow LOCKDEP to mangle KASAN reports. */
+       lockdep_off();
+       /* Make sure we don't end up in a loop. */
        kasan_disable_current();
        spin_lock_irqsave(&report_lock, *flags);
        pr_err("==================================================================\n");
 }
 
-static void end_report(unsigned long *flags, unsigned long addr)
+static void end_report(unsigned long *flags, void *addr)
 {
-       if (!kasan_async_fault_possible())
-               trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
+       if (addr)
+               trace_error_report_end(ERROR_DETECTOR_KASAN,
+                                      (unsigned long)addr);
        pr_err("==================================================================\n");
-       add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
        spin_unlock_irqrestore(&report_lock, *flags);
        if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
                panic("panic_on_warn set ...\n");
        if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
                panic("kasan.fault=panic set ...\n");
+       add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+       lockdep_on();
        kasan_enable_current();
 }
 
+static void print_error_description(struct kasan_report_info *info)    /* prints the "BUG: KASAN:" headline lines */
+{
+       if (info->type == KASAN_REPORT_INVALID_FREE) {  /* bad free: headline only, no access line */
+               pr_err("BUG: KASAN: double-free or invalid-free in %pS\n",
+                      (void *)info->ip);
+               return;
+       }
+       /* Access report: bug type and caller, then direction/size/address/task. */
+       pr_err("BUG: KASAN: %s in %pS\n",
+               kasan_get_bug_type(info), (void *)info->ip);
+       if (info->access_size)
+               pr_err("%s of size %zu at addr %px by task %s/%d\n",
+                       info->is_write ? "Write" : "Read", info->access_size,
+                       info->access_addr, current->comm, task_pid_nr(current));
+       else    /* access_size is zero: leave the size out of the message */
+               pr_err("%s at addr %px by task %s/%d\n",
+                       info->is_write ? "Write" : "Read",
+                       info->access_addr, current->comm, task_pid_nr(current));
+}
+
 static void print_track(struct kasan_track *track, const char *prefix)
 {
        pr_err("%s by task %u:\n", prefix, track->pid);
@@ -162,9 +231,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
               " which belongs to the cache %s of size %d\n",
                object, cache->name, cache->object_size);
 
-       if (!addr)
-               return;
-
        if (access_addr < object_addr) {
                rel_type = "to the left";
                rel_bytes = object_addr - access_addr;
@@ -253,19 +319,43 @@ static void print_address_description(void *addr, u8 tag)
                void *object = nearest_obj(cache, slab, addr);
 
                describe_object(cache, object, addr, tag);
+               pr_err("\n");
        }
 
        if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
                pr_err("The buggy address belongs to the variable:\n");
                pr_err(" %pS\n", addr);
+               pr_err("\n");
+       }
+
+       if (object_is_on_stack(addr)) {
+               /*
+                * Currently, KASAN supports printing frame information only
+                * for accesses to the task's own stack.
+                */
+               kasan_print_address_stack_frame(addr);
+               pr_err("\n");
+       }
+
+       if (is_vmalloc_addr(addr)) {    /* describe the vmalloc area covering addr, if any */
+               struct vm_struct *va = find_vm_area(addr);
+
+               if (va) {
+                       pr_err("The buggy address belongs to the virtual mapping at\n"
+                              " [%px, %px) created by:\n"
+                              " %pS\n",
+                              va->addr, va->addr + va->size, va->caller);
+                       pr_err("\n");
+                       /* vmalloc_to_page() takes the virtual address, not a page pointer. */
+                       page = vmalloc_to_page(addr);
+               }
+       }
 
        if (page) {
-               pr_err("The buggy address belongs to the page:\n");
+               pr_err("The buggy address belongs to the physical page:\n");
                dump_page(page, "kasan: bad access detected");
+               pr_err("\n");
        }
-
-       kasan_print_address_stack_frame(addr);
 }
 
 static bool meta_row_is_guilty(const void *row, const void *addr)
@@ -324,138 +414,110 @@ static void print_memory_metadata(const void *addr)
        }
 }
 
-static bool report_enabled(void)
-{
-#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
-       if (current->kasan_depth)
-               return false;
-#endif
-       if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
-               return true;
-       return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
-}
-
-#if IS_ENABLED(CONFIG_KUNIT)
-static void kasan_update_kunit_status(struct kunit *cur_test)
+static void print_report(struct kasan_report_info *info)
 {
-       struct kunit_resource *resource;
-       struct kunit_kasan_expectation *kasan_data;
+       void *tagged_addr = info->access_addr;
+       void *untagged_addr = kasan_reset_tag(tagged_addr);
+       u8 tag = get_tag(tagged_addr);
 
-       resource = kunit_find_named_resource(cur_test, "kasan_data");
+       print_error_description(info);
+       if (addr_has_metadata(untagged_addr))
+               kasan_print_tags(tag, info->first_bad_addr);
+       pr_err("\n");
 
-       if (!resource) {
-               kunit_set_failure(cur_test);
-               return;
+       if (addr_has_metadata(untagged_addr)) {
+               print_address_description(untagged_addr, tag);
+               print_memory_metadata(info->first_bad_addr);
+       } else {
+               dump_stack_lvl(KERN_ERR);
        }
-
-       kasan_data = (struct kunit_kasan_expectation *)resource->data;
-       WRITE_ONCE(kasan_data->report_found, true);
-       kunit_put_resource(resource);
 }
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
 
-void kasan_report_invalid_free(void *object, unsigned long ip)
+void kasan_report_invalid_free(void *ptr, unsigned long ip)
 {
        unsigned long flags;
-       u8 tag = get_tag(object);
+       struct kasan_report_info info;
 
-       object = kasan_reset_tag(object);
-
-#if IS_ENABLED(CONFIG_KUNIT)
-       if (current->kunit_test)
-               kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
+       /*
+        * Do not check report_suppressed(), as an invalid-free cannot be
+        * caused by accessing slab metadata and thus should not be
+        * suppressed by kasan_disable/enable_current() critical sections.
+        */
+       if (unlikely(!report_enabled()))
+               return;
 
-       start_report(&flags);
-       pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
-       kasan_print_tags(tag, object);
-       pr_err("\n");
-       print_address_description(object, tag);
-       pr_err("\n");
-       print_memory_metadata(object);
-       end_report(&flags, (unsigned long)object);
-}
+       start_report(&flags, true);
 
-#ifdef CONFIG_KASAN_HW_TAGS
-void kasan_report_async(void)
-{
-       unsigned long flags;
+       info.type = KASAN_REPORT_INVALID_FREE;
+       info.access_addr = ptr;
+       info.first_bad_addr = kasan_reset_tag(ptr);
+       info.access_size = 0;
+       info.is_write = false;
+       info.ip = ip;
 
-#if IS_ENABLED(CONFIG_KUNIT)
-       if (current->kunit_test)
-               kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
+       print_report(&info);
 
-       start_report(&flags);
-       pr_err("BUG: KASAN: invalid-access\n");
-       pr_err("Asynchronous mode enabled: no access details available\n");
-       pr_err("\n");
-       dump_stack_lvl(KERN_ERR);
-       end_report(&flags, 0);
+       end_report(&flags, ptr);
 }
-#endif /* CONFIG_KASAN_HW_TAGS */
 
-static void __kasan_report(unsigned long addr, size_t size, bool is_write,
-                               unsigned long ip)
+/*
+ * kasan_report() is the only reporting function that uses
+ * user_access_save/restore(): kasan_report_invalid_free() cannot be called
+ * from a UACCESS region, and kasan_report_async() is not used on x86.
+ */
+bool kasan_report(unsigned long addr, size_t size, bool is_write,
+                       unsigned long ip)
 {
-       struct kasan_access_info info;
-       void *tagged_addr;
-       void *untagged_addr;
-       unsigned long flags;
-
-#if IS_ENABLED(CONFIG_KUNIT)
-       if (current->kunit_test)
-               kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
-
-       disable_trace_on_warning();
+       bool ret = true;
+       void *ptr = (void *)addr;
+       unsigned long ua_flags = user_access_save();
+       unsigned long irq_flags;
+       struct kasan_report_info info;
+
+       if (unlikely(report_suppressed()) || unlikely(!report_enabled())) {
+               ret = false;
+               goto out;
+       }
 
-       tagged_addr = (void *)addr;
-       untagged_addr = kasan_reset_tag(tagged_addr);
+       start_report(&irq_flags, true);
 
-       info.access_addr = tagged_addr;
-       if (addr_has_metadata(untagged_addr))
-               info.first_bad_addr =
-                       kasan_find_first_bad_addr(tagged_addr, size);
-       else
-               info.first_bad_addr = untagged_addr;
+       info.type = KASAN_REPORT_ACCESS;
+       info.access_addr = ptr;
+       info.first_bad_addr = kasan_find_first_bad_addr(ptr, size);
        info.access_size = size;
        info.is_write = is_write;
        info.ip = ip;
 
-       start_report(&flags);
+       print_report(&info);
 
-       print_error_description(&info);
-       if (addr_has_metadata(untagged_addr))
-               kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr);
-       pr_err("\n");
+       end_report(&irq_flags, ptr);
 
-       if (addr_has_metadata(untagged_addr)) {
-               print_address_description(untagged_addr, get_tag(tagged_addr));
-               pr_err("\n");
-               print_memory_metadata(info.first_bad_addr);
-       } else {
-               dump_stack_lvl(KERN_ERR);
-       }
+out:
+       user_access_restore(ua_flags);
 
-       end_report(&flags, addr);
+       return ret;
 }
 
-bool kasan_report(unsigned long addr, size_t size, bool is_write,
-                       unsigned long ip)
+#ifdef CONFIG_KASAN_HW_TAGS
+void kasan_report_async(void)
 {
-       unsigned long flags = user_access_save();
-       bool ret = false;
-
-       if (likely(report_enabled())) {
-               __kasan_report(addr, size, is_write, ip);
-               ret = true;
-       }
+       unsigned long flags;
 
-       user_access_restore(flags);
+       /*
+        * Do not check report_suppressed(), as kasan_disable/enable_current()
+        * critical sections do not affect Hardware Tag-Based KASAN.
+        */
+       if (unlikely(!report_enabled()))
+               return;
 
-       return ret;
+       start_report(&flags, false);
+       pr_err("BUG: KASAN: invalid-access\n");
+       pr_err("Asynchronous fault: no details available\n");
+       pr_err("\n");
+       dump_stack_lvl(KERN_ERR);
+       end_report(&flags, NULL);
 }
+#endif /* CONFIG_KASAN_HW_TAGS */
 
 #ifdef CONFIG_KASAN_INLINE
 /*
index 139615e..efc5e79 100644 (file)
@@ -34,12 +34,16 @@ void *kasan_find_first_bad_addr(void *addr, size_t size)
 {
        void *p = addr;
 
+       if (!addr_has_metadata(p))
+               return p;
+
        while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
                p += KASAN_GRANULE_SIZE;
+
        return p;
 }
 
-static const char *get_shadow_bug_type(struct kasan_access_info *info)
+static const char *get_shadow_bug_type(struct kasan_report_info *info)
 {
        const char *bug_type = "unknown-crash";
        u8 *shadow_addr;
@@ -91,7 +95,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
        return bug_type;
 }
 
-static const char *get_wild_bug_type(struct kasan_access_info *info)
+static const char *get_wild_bug_type(struct kasan_report_info *info)
 {
        const char *bug_type = "unknown-crash";
 
@@ -105,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_access_info *info)
        return bug_type;
 }
 
-const char *kasan_get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_report_info *info)
 {
        /*
         * If access_size is a negative number, then it has reason to be
@@ -180,7 +184,7 @@ static void print_decoded_frame_descr(const char *frame_descr)
                return;
 
        pr_err("\n");
-       pr_err("this frame has %lu %s:\n", num_objects,
+       pr_err("This frame has %lu %s:\n", num_objects,
               num_objects == 1 ? "object" : "objects");
 
        while (num_objects--) {
@@ -211,6 +215,7 @@ static void print_decoded_frame_descr(const char *frame_descr)
        }
 }
 
+/* Returns true only if the address is on the current task's stack. */
 static bool __must_check get_address_stack_frame_info(const void *addr,
                                                      unsigned long *offset,
                                                      const char **frame_descr,
@@ -224,13 +229,6 @@ static bool __must_check get_address_stack_frame_info(const void *addr,
 
        BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
 
-       /*
-        * NOTE: We currently only support printing frame information for
-        * accesses to the task's own stack.
-        */
-       if (!object_is_on_stack(addr))
-               return false;
-
        aligned_addr = round_down((unsigned long)addr, sizeof(long));
        mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE);
        shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
@@ -269,17 +267,17 @@ void kasan_print_address_stack_frame(const void *addr)
        const char *frame_descr;
        const void *frame_pc;
 
+       if (WARN_ON(!object_is_on_stack(addr)))
+               return;
+
+       pr_err("The buggy address belongs to stack of task %s/%d\n",
+              current->comm, task_pid_nr(current));
+
        if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
                                          &frame_pc))
                return;
 
-       /*
-        * get_address_stack_frame_info only returns true if the given addr is
-        * on the current task's stack.
-        */
-       pr_err("\n");
-       pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
-              addr, current->comm, task_pid_nr(current), offset);
+       pr_err(" and is located at offset %lu in frame:\n", offset);
        pr_err(" %pS\n", frame_pc);
 
        if (!frame_descr)
index 5dbbbb9..f3d3be6 100644 (file)
@@ -17,6 +17,7 @@
 
 void *kasan_find_first_bad_addr(void *addr, size_t size)
 {
+       /* Return the same value regardless of whether addr_has_metadata(). */
        return kasan_reset_tag(addr);
 }
 
index d2298c3..7a26397 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/mm.h>
 #include <linux/printk.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/slab.h>
 #include <linux/stackdepot.h>
 #include <linux/stacktrace.h>
@@ -35,8 +36,12 @@ void *kasan_find_first_bad_addr(void *addr, size_t size)
        void *p = kasan_reset_tag(addr);
        void *end = p + size;
 
+       if (!addr_has_metadata(p))
+               return p;
+
        while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
                p += KASAN_GRANULE_SIZE;
+
        return p;
 }
 
@@ -51,3 +56,14 @@ void kasan_print_tags(u8 addr_tag, const void *addr)
 
        pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
 }
+
+#ifdef CONFIG_KASAN_STACK
+void kasan_print_address_stack_frame(const void *addr)
+{
+       if (WARN_ON(!object_is_on_stack(addr)))
+               return;
+
+       pr_err("The buggy address belongs to stack of task %s/%d\n",
+              current->comm, task_pid_nr(current));
+}
+#endif
index 1b41de8..e25d216 100644 (file)
@@ -7,7 +7,7 @@
 #include "kasan.h"
 #include "../slab.h"
 
-const char *kasan_get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_report_info *info)
 {
 #ifdef CONFIG_KASAN_TAGS_IDENTIFY
        struct kasan_alloc_meta *alloc_meta;
index 94136f8..a4f07de 100644 (file)
@@ -345,27 +345,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
        return 0;
 }
 
-/*
- * Poison the shadow for a vmalloc region. Called as part of the
- * freeing process at the time the region is freed.
- */
-void kasan_poison_vmalloc(const void *start, unsigned long size)
-{
-       if (!is_vmalloc_or_module_addr(start))
-               return;
-
-       size = round_up(size, KASAN_GRANULE_SIZE);
-       kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
-}
-
-void kasan_unpoison_vmalloc(const void *start, unsigned long size)
-{
-       if (!is_vmalloc_or_module_addr(start))
-               return;
-
-       kasan_unpoison(start, size, false);
-}
-
 static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
                                        void *unused)
 {
@@ -496,9 +475,48 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
        }
 }
 
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+                              kasan_vmalloc_flags_t flags)
+{
+       /*
+        * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC
+        * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored.
+        * Software KASAN modes can't optimize zeroing memory by combining it
+        * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
+        */
+
+       if (!is_vmalloc_or_module_addr(start))
+               return (void *)start;
+
+       /*
+        * Don't tag executable memory with the tag-based mode.
+        * The kernel doesn't tolerate having the PC register tagged.
+        */
+       if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) &&
+           !(flags & KASAN_VMALLOC_PROT_NORMAL))
+               return (void *)start;
+
+       start = set_tag(start, kasan_random_tag());
+       kasan_unpoison(start, size, false);
+       return (void *)start;
+}
+
+/*
+ * Poison the shadow for a vmalloc region. Called as part of the
+ * freeing process at the time the region is freed.
+ */
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+       if (!is_vmalloc_or_module_addr(start))
+               return;
+
+       size = round_up(size, KASAN_GRANULE_SIZE);
+       kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
+}
+
 #else /* CONFIG_KASAN_VMALLOC */
 
-int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask)
+int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask)
 {
        void *ret;
        size_t scaled_size;
@@ -534,7 +552,7 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask)
        return -ENOMEM;
 }
 
-void kasan_free_shadow(const struct vm_struct *vm)
+void kasan_free_module_shadow(const struct vm_struct *vm)
 {
        if (vm->flags & VM_KASAN)
                vfree(kasan_mem_to_shadow(vm->addr));
index 1cdf7c3..a4e5eaf 100644 (file)
@@ -46,7 +46,6 @@ enum scan_result {
        SCAN_VMA_NULL,
        SCAN_VMA_CHECK,
        SCAN_ADDRESS_RANGE,
-       SCAN_SWAP_CACHE_PAGE,
        SCAN_DEL_PAGE_LRU,
        SCAN_ALLOC_HUGE_PAGE_FAIL,
        SCAN_CGROUP_CHARGE_FAIL,
@@ -683,16 +682,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                        result = SCAN_PAGE_COUNT;
                        goto out;
                }
-               if (!pte_write(pteval) && PageSwapCache(page) &&
-                               !reuse_swap_page(page)) {
-                       /*
-                        * Page is in the swap cache and cannot be re-used.
-                        * It cannot be collapsed into a THP.
-                        */
-                       unlock_page(page);
-                       result = SCAN_SWAP_CACHE_PAGE;
-                       goto out;
-               }
 
                /*
                 * Isolate the page to avoid collapsing an hugepage
index 39b712f..b41858e 100644 (file)
@@ -52,6 +52,7 @@ static int madvise_need_mmap_write(int behavior)
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
+       case MADV_DONTNEED_LOCKED:
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_FREE:
@@ -504,7 +505,7 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
 
 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 {
-       return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
+       return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
 }
 
 static long madvise_cold(struct vm_area_struct *vma,
@@ -777,6 +778,29 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
        return 0;
 }
 
+static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
+                                           unsigned long start,
+                                           unsigned long *end,
+                                           int behavior)
+{
+       if (!is_vm_hugetlb_page(vma)) {
+               unsigned int forbidden = VM_PFNMAP;
+
+               if (behavior != MADV_DONTNEED_LOCKED)
+                       forbidden |= VM_LOCKED;
+
+               return !(vma->vm_flags & forbidden);
+       }
+
+       if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
+               return false;
+       if (start & ~huge_page_mask(hstate_vma(vma)))
+               return false;
+
+       *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+       return true;
+}
+
 static long madvise_dontneed_free(struct vm_area_struct *vma,
                                  struct vm_area_struct **prev,
                                  unsigned long start, unsigned long end,
@@ -785,7 +809,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
        struct mm_struct *mm = vma->vm_mm;
 
        *prev = vma;
-       if (!can_madv_lru_vma(vma))
+       if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
                return -EINVAL;
 
        if (!userfaultfd_remove(vma, start, end)) {
@@ -807,7 +831,12 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
                         */
                        return -ENOMEM;
                }
-               if (!can_madv_lru_vma(vma))
+               /*
+                * Potential end adjustment for hugetlb vma is OK as
+                * the check below keeps end within vma.
+                */
+               if (!madvise_dontneed_free_valid_vma(vma, start, &end,
+                                                    behavior))
                        return -EINVAL;
                if (end > vma->vm_end) {
                        /*
@@ -827,7 +856,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
                VM_WARN_ON(start >= end);
        }
 
-       if (behavior == MADV_DONTNEED)
+       if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
                return madvise_dontneed_single_vma(vma, start, end);
        else if (behavior == MADV_FREE)
                return madvise_free_single_vma(vma, start, end);
@@ -966,6 +995,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
                return madvise_pageout(vma, prev, start, end);
        case MADV_FREE:
        case MADV_DONTNEED:
+       case MADV_DONTNEED_LOCKED:
                return madvise_dontneed_free(vma, prev, start, end, behavior);
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
@@ -1096,6 +1126,7 @@ madvise_behavior_valid(int behavior)
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
+       case MADV_DONTNEED_LOCKED:
        case MADV_FREE:
        case MADV_COLD:
        case MADV_PAGEOUT:
index 6666bc2..be44d0b 100644 (file)
@@ -3287,19 +3287,35 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
        if (PageAnon(vmf->page)) {
                struct page *page = vmf->page;
 
-               /* PageKsm() doesn't necessarily raise the page refcount */
-               if (PageKsm(page) || page_count(page) != 1)
+               /*
+                * We have to verify under page lock: these early checks are
+                * just an optimization to avoid locking the page and freeing
+                * the swapcache if there is little hope that we can reuse.
+                *
+                * PageKsm() doesn't necessarily raise the page refcount.
+                */
+               if (PageKsm(page) || page_count(page) > 3)
+                       goto copy;
+               if (!PageLRU(page))
+                       /*
+                        * Note: We cannot easily detect+handle references from
+                        * remote LRU pagevecs or references to PageLRU() pages.
+                        */
+                       lru_add_drain();
+               if (page_count(page) > 1 + PageSwapCache(page))
                        goto copy;
                if (!trylock_page(page))
                        goto copy;
-               if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
+               if (PageSwapCache(page))
+                       try_to_free_swap(page);
+               if (PageKsm(page) || page_count(page) != 1) {
                        unlock_page(page);
                        goto copy;
                }
                /*
-                * Ok, we've got the only map reference, and the only
-                * page count reference, and the page is locked,
-                * it's dark out, and we're wearing sunglasses. Hit it.
+                * Ok, we've got the only page reference from our mapping
+                * and the page is locked, it's dark out, and we're wearing
+                * sunglasses. Hit it.
                 */
                unlock_page(page);
                wp_page_reuse(vmf);
@@ -3372,11 +3388,11 @@ void unmap_mapping_folio(struct folio *folio)
        details.even_cows = false;
        details.single_folio = folio;
 
-       i_mmap_lock_write(mapping);
+       i_mmap_lock_read(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first_index,
                                         last_index, &details);
-       i_mmap_unlock_write(mapping);
+       i_mmap_unlock_read(mapping);
 }
 
 /**
@@ -3402,11 +3418,11 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
        if (last_index < first_index)
                last_index = ULONG_MAX;
 
-       i_mmap_lock_write(mapping);
+       i_mmap_lock_read(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first_index,
                                         last_index, &details);
-       i_mmap_unlock_write(mapping);
+       i_mmap_unlock_read(mapping);
 }
 EXPORT_SYMBOL_GPL(unmap_mapping_pages);
 
@@ -3473,6 +3489,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
        return 0;
 }
 
+static inline bool should_try_to_free_swap(struct page *page,
+                                          struct vm_area_struct *vma,
+                                          unsigned int fault_flags)
+{
+       if (!PageSwapCache(page))
+               return false;
+       if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
+           PageMlocked(page))
+               return true;
+       /*
+        * If we want to map a page that's in the swapcache writable, we
+        * have to detect via the refcount if we're really the exclusive
+        * user. Try freeing the swapcache to get rid of the swapcache
+        * reference only in case it's likely that we'll be the exclusive user.
+        */
+       return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+               page_count(page) == 2;
+}
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3591,21 +3626,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                goto out_release;
        }
 
-       /*
-        * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
-        * release the swapcache from under us.  The page pin, and pte_same
-        * test below, are not enough to exclude that.  Even if it is still
-        * swapcache, we need to check that the page's swap has not changed.
-        */
-       if (unlikely((!PageSwapCache(page) ||
-                       page_private(page) != entry.val)) && swapcache)
-               goto out_page;
-
-       page = ksm_might_need_to_copy(page, vma, vmf->address);
-       if (unlikely(!page)) {
-               ret = VM_FAULT_OOM;
-               page = swapcache;
-               goto out_page;
+       if (swapcache) {
+               /*
+                * Make sure try_to_free_swap or swapoff did not release the
+                * swapcache from under us.  The page pin, and pte_same test
+                * below, are not enough to exclude that.  Even if it is still
+                * swapcache, we need to check that the page's swap has not
+                * changed.
+                */
+               if (unlikely(!PageSwapCache(page) ||
+                            page_private(page) != entry.val))
+                       goto out_page;
+
+               /*
+                * KSM sometimes has to copy on read faults, for example, if
+                * page->index of !PageKSM() pages would be nonlinear inside the
+                * anon VMA -- PageKSM() is lost on actual swapout.
+                */
+               page = ksm_might_need_to_copy(page, vma, vmf->address);
+               if (unlikely(!page)) {
+                       ret = VM_FAULT_OOM;
+                       page = swapcache;
+                       goto out_page;
+               }
+
+               /*
+                * If we want to map a page that's in the swapcache writable, we
+                * have to detect via the refcount if we're really the exclusive
+                * owner. Try removing the extra reference from the local LRU
+                * pagevecs if required.
+                */
+               if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
+                   !PageKsm(page) && !PageLRU(page))
+                       lru_add_drain();
        }
 
        cgroup_throttle_swaprate(page, GFP_KERNEL);
@@ -3624,19 +3677,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        }
 
        /*
-        * The page isn't present yet, go ahead with the fault.
-        *
-        * Be careful about the sequence of operations here.
-        * To get its accounting right, reuse_swap_page() must be called
-        * while the page is counted on swap but not yet in mapcount i.e.
-        * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
-        * must be called after the swap_free(), or it will never succeed.
+        * Remove the swap entry and conditionally try to free up the swapcache.
+        * We're already holding a reference on the page but haven't mapped it
+        * yet.
         */
+       swap_free(entry);
+       if (should_try_to_free_swap(page, vma, vmf->flags))
+               try_to_free_swap(page);
 
        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
        pte = mk_pte(page, vma->vm_page_prot);
-       if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+
+       /*
+        * Same logic as in do_wp_page(); however, optimize for fresh pages
+        * that are certainly not shared because we just allocated them without
+        * exposing them to the swapcache.
+        */
+       if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+           (page != swapcache || page_count(page) == 1)) {
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                vmf->flags &= ~FAULT_FLAG_WRITE;
                ret |= VM_FAULT_WRITE;
@@ -3662,10 +3721,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
        arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
 
-       swap_free(entry);
-       if (mem_cgroup_swap_full(page) ||
-           (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
-               try_to_free_swap(page);
        unlock_page(page);
        if (page != swapcache && swapcache) {
                /*
index c17eca4..af02236 100644 (file)
@@ -456,8 +456,6 @@ void free_zone_device_page(struct page *page)
        if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
                return;
 
-       __ClearPageWaiters(page);
-
        mem_cgroup_uncharge(page_folio(page));
 
        /*
index 4f30ed3..3d60823 100644 (file)
@@ -53,7 +53,6 @@
 
 #include <asm/tlbflush.h>
 
-#define CREATE_TRACE_POINTS
 #include <trace/events/migrate.h>
 
 #include "internal.h"
@@ -249,6 +248,9 @@ static bool remove_migration_pte(struct folio *folio,
                if (vma->vm_flags & VM_LOCKED)
                        mlock_page_drain(smp_processor_id());
 
+               trace_remove_migration_pte(pvmw.address, pte_val(pte),
+                                          compound_order(new));
+
                /* No need to invalidate - it was non-present before */
                update_mmu_cache(vma, pvmw.address, pvmw.pte);
        }
index 435c026..7e2da28 100644 (file)
@@ -2465,16 +2465,14 @@ static void folio_account_dirtied(struct folio *folio,
  *
  * Caller must hold lock_page_memcg().
  */
-void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
-                         struct bdi_writeback *wb)
+void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
 {
-       if (mapping_can_writeback(mapping)) {
-               long nr = folio_nr_pages(folio);
-               lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
-               zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
-               wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
-               task_io_account_cancelled_write(nr * PAGE_SIZE);
-       }
+       long nr = folio_nr_pages(folio);
+
+       lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+       zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+       wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+       task_io_account_cancelled_write(nr * PAGE_SIZE);
 }
 
 /*
@@ -2683,7 +2681,7 @@ void __folio_cancel_dirty(struct folio *folio)
                wb = unlocked_inode_to_wb_begin(inode, &cookie);
 
                if (folio_test_clear_dirty(folio))
-                       folio_account_cleaned(folio, mapping, wb);
+                       folio_account_cleaned(folio, wb);
 
                unlocked_inode_to_wb_end(inode, &cookie);
                folio_memcg_unlock(folio);
index 6e0b459..bdc8f60 100644 (file)
@@ -378,25 +378,9 @@ int page_group_by_mobility_disabled __read_mostly;
  */
 static DEFINE_STATIC_KEY_TRUE(deferred_pages);
 
-/*
- * Calling kasan_poison_pages() only after deferred memory initialization
- * has completed. Poisoning pages during deferred memory init will greatly
- * lengthen the process and cause problem in large memory systems as the
- * deferred pages initialization is done with interrupt disabled.
- *
- * Assuming that there will be no reference to those newly initialized
- * pages before they are ever allocated, this should have no effect on
- * KASAN memory tracking as the poison will be properly inserted at page
- * allocation time. The only corner case is when pages are allocated by
- * on-demand allocation and then freed again before the deferred pages
- * initialization is done, but this is not likely to happen.
- */
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool deferred_pages_enabled(void)
 {
-       return static_branch_unlikely(&deferred_pages) ||
-              (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
-               (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
-              PageSkipKASanPoison(page);
+       return static_branch_unlikely(&deferred_pages);
 }
 
 /* Returns true if the struct page for the pfn is uninitialised */
@@ -447,11 +431,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
        return false;
 }
 #else
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool deferred_pages_enabled(void)
 {
-       return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
-               (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
-              PageSkipKASanPoison(page);
+       return false;
 }
 
 static inline bool early_page_uninitialised(unsigned long pfn)
@@ -1267,15 +1249,38 @@ out:
        return ret;
 }
 
-static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
+/*
+ * Skip KASAN memory poisoning when either:
+ *
+ * 1. Deferred memory initialization has not yet completed,
+ *    see the explanation below.
+ * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
+ *    see the comment next to it.
+ * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
+ *    see the comment next to it.
+ *
+ * Poisoning pages during deferred memory init will greatly lengthen the
+ * process and cause problem in large memory systems as the deferred pages
+ * initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
 {
-       int i;
+       return deferred_pages_enabled() ||
+              (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+               (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+              PageSkipKASanPoison(page);
+}
 
-       if (zero_tags) {
-               for (i = 0; i < numpages; i++)
-                       tag_clear_highpage(page + i);
-               return;
-       }
+static void kernel_init_free_pages(struct page *page, int numpages)
+{
+       int i;
 
        /* s390's use of memset() could override KASAN redzones. */
        kasan_disable_current();
@@ -1292,7 +1297,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
                        unsigned int order, bool check_free, fpi_t fpi_flags)
 {
        int bad = 0;
-       bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
+       bool init = want_init_on_free();
 
        VM_BUG_ON_PAGE(PageTail(page), page);
 
@@ -1359,23 +1364,21 @@ static __always_inline bool free_pages_prepare(struct page *page,
 
        /*
         * As memory initialization might be integrated into KASAN,
-        * kasan_free_pages and kernel_init_free_pages must be
+        * KASAN poisoning and memory initialization code must be
         * kept together to avoid discrepancies in behavior.
         *
         * With hardware tag-based KASAN, memory tags must be set before the
         * page becomes unavailable via debug_pagealloc or arch_free_page.
         */
-       if (kasan_has_integrated_init()) {
-               if (!skip_kasan_poison)
-                       kasan_free_pages(page, order);
-       } else {
-               bool init = want_init_on_free();
+       if (!should_skip_kasan_poison(page, fpi_flags)) {
+               kasan_poison_pages(page, order, init);
 
-               if (init)
-                       kernel_init_free_pages(page, 1 << order, false);
-               if (!skip_kasan_poison)
-                       kasan_poison_pages(page, order, init);
+               /* Memory is already initialized if KASAN did it internally. */
+               if (kasan_has_integrated_init())
+                       init = false;
        }
+       if (init)
+               kernel_init_free_pages(page, 1 << order);
 
        /*
         * arch_free_page() can make the page's contents inaccessible.  s390
@@ -2340,9 +2343,43 @@ static inline bool check_new_pcp(struct page *page, unsigned int order)
 }
 #endif /* CONFIG_DEBUG_VM */
 
+static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
+{
+       /* Don't skip if a software KASAN mode is enabled. */
+       if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+           IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+               return false;
+
+       /* Skip, if hardware tag-based KASAN is not enabled. */
+       if (!kasan_hw_tags_enabled())
+               return true;
+
+       /*
+        * With hardware tag-based KASAN enabled, skip if either:
+        *
+        * 1. Memory tags have already been cleared via tag_clear_highpage().
+        * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON.
+        */
+       return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON);
+}
+
+static inline bool should_skip_init(gfp_t flags)
+{
+       /* Don't skip, if hardware tag-based KASAN is not enabled. */
+       if (!kasan_hw_tags_enabled())
+               return false;
+
+       /* For hardware tag-based KASAN, skip if requested. */
+       return (flags & __GFP_SKIP_ZERO);
+}
+
 inline void post_alloc_hook(struct page *page, unsigned int order,
                                gfp_t gfp_flags)
 {
+       bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
+                       !should_skip_init(gfp_flags);
+       bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+
        set_page_private(page, 0);
        set_page_refcounted(page);
 
@@ -2358,19 +2395,38 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 
        /*
         * As memory initialization might be integrated into KASAN,
-        * kasan_alloc_pages and kernel_init_free_pages must be
+        * KASAN unpoisoning and memory initialization code must be
         * kept together to avoid discrepancies in behavior.
         */
-       if (kasan_has_integrated_init()) {
-               kasan_alloc_pages(page, order, gfp_flags);
-       } else {
-               bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
 
+       /*
+        * If memory tags should be zeroed (which happens only when memory
+        * should be initialized as well).
+        */
+       if (init_tags) {
+               int i;
+
+               /* Initialize both memory and tags. */
+               for (i = 0; i != 1 << order; ++i)
+                       tag_clear_highpage(page + i);
+
+               /* Note that memory is already initialized by the loop above. */
+               init = false;
+       }
+       if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) {
+               /* Unpoison shadow memory or set memory tags. */
                kasan_unpoison_pages(page, order, init);
-               if (init)
-                       kernel_init_free_pages(page, 1 << order,
-                                              gfp_flags & __GFP_ZEROTAGS);
+
+               /* Note that memory is already initialized by KASAN. */
+               if (kasan_has_integrated_init())
+                       init = false;
        }
+       /* If memory is still not initialized, do it now. */
+       if (init)
+               kernel_init_free_pages(page, 1 << order);
+       /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
+       if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
+               SetPageSkipKASanPoison(page);
 
        set_page_owner(page, order, gfp_flags);
        page_table_check_alloc(page, order);
index 99e360d..fb3a05f 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/migrate.h>
 #include <linux/stackdepot.h>
 #include <linux/seq_file.h>
+#include <linux/memcontrol.h>
 #include <linux/sched/clock.h>
 
 #include "internal.h"
@@ -28,7 +29,9 @@ struct page_owner {
        depot_stack_handle_t free_handle;
        u64 ts_nsec;
        u64 free_ts_nsec;
+       char comm[TASK_COMM_LEN];
        pid_t pid;
+       pid_t tgid;
 };
 
 static bool page_owner_enabled = false;
@@ -163,7 +166,10 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
                page_owner->gfp_mask = gfp_mask;
                page_owner->last_migrate_reason = -1;
                page_owner->pid = current->pid;
+               page_owner->tgid = current->tgid;
                page_owner->ts_nsec = local_clock();
+               strlcpy(page_owner->comm, current->comm,
+                       sizeof(page_owner->comm));
                __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
                __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
 
@@ -229,8 +235,10 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
                old_page_owner->last_migrate_reason;
        new_page_owner->handle = old_page_owner->handle;
        new_page_owner->pid = old_page_owner->pid;
+       new_page_owner->tgid = old_page_owner->tgid;
        new_page_owner->ts_nsec = old_page_owner->ts_nsec;
        new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
+       strcpy(new_page_owner->comm, old_page_owner->comm);
 
        /*
         * We don't clear the bit on the old folio as it's going to be freed
@@ -325,6 +333,45 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
        seq_putc(m, '\n');
 }
 
+/*
+ * Look up memcg information and print it out
+ */
+static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
+                                        struct page *page)
+{
+#ifdef CONFIG_MEMCG
+       unsigned long memcg_data;
+       struct mem_cgroup *memcg;
+       bool online;
+       char name[80];
+
+       rcu_read_lock();
+       memcg_data = READ_ONCE(page->memcg_data);
+       if (!memcg_data)
+               goto out_unlock;
+
+       if (memcg_data & MEMCG_DATA_OBJCGS)
+               ret += scnprintf(kbuf + ret, count - ret,
+                               "Slab cache page\n");
+
+       memcg = page_memcg_check(page);
+       if (!memcg)
+               goto out_unlock;
+
+       online = (memcg->css.flags & CSS_ONLINE);
+       cgroup_name(memcg->css.cgroup, name, sizeof(name));
+       ret += scnprintf(kbuf + ret, count - ret,
+                       "Charged %sto %smemcg %s\n",
+                       PageMemcgKmem(page) ? "(via objcg) " : "",
+                       online ? "" : "offline ",
+                       name);
+out_unlock:
+       rcu_read_unlock();
+#endif /* CONFIG_MEMCG */
+
+       return ret;
+}
+
 static ssize_t
 print_page_owner(char __user *buf, size_t count, unsigned long pfn,
                struct page *page, struct page_owner *page_owner,
@@ -338,19 +385,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
        if (!kbuf)
                return -ENOMEM;
 
-       ret = snprintf(kbuf, count,
-                       "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n",
+       ret = scnprintf(kbuf, count,
+                       "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n",
                        page_owner->order, page_owner->gfp_mask,
                        &page_owner->gfp_mask, page_owner->pid,
+                       page_owner->tgid, page_owner->comm,
                        page_owner->ts_nsec, page_owner->free_ts_nsec);
 
-       if (ret >= count)
-               goto err;
-
        /* Print information relevant to grouping pages by mobility */
        pageblock_mt = get_pageblock_migratetype(page);
        page_mt  = gfp_migratetype(page_owner->gfp_mask);
-       ret += snprintf(kbuf + ret, count - ret,
+       ret += scnprintf(kbuf + ret, count - ret,
                        "PFN %lu type %s Block %lu type %s Flags %pGp\n",
                        pfn,
                        migratetype_names[page_mt],
@@ -358,21 +403,18 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
                        migratetype_names[pageblock_mt],
                        &page->flags);
 
-       if (ret >= count)
-               goto err;
-
        ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
        if (ret >= count)
                goto err;
 
        if (page_owner->last_migrate_reason != -1) {
-               ret += snprintf(kbuf + ret, count - ret,
+               ret += scnprintf(kbuf + ret, count - ret,
                        "Page has been migrated, last migrate reason: %s\n",
                        migrate_reason_names[page_owner->last_migrate_reason]);
-               if (ret >= count)
-                       goto err;
        }
 
+       ret = print_page_owner_memcg(kbuf, count, ret, page);
+
        ret += snprintf(kbuf + ret, count - ret, "\n");
        if (ret >= count)
                goto err;
@@ -415,9 +457,10 @@ void __dump_page_owner(const struct page *page)
        else
                pr_alert("page_owner tracks the page as freed\n");
 
-       pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n",
+       pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
                 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
-                page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
+                page_owner->pid, page_owner->tgid, page_owner->comm,
+                page_owner->ts_nsec, page_owner->free_ts_nsec);
 
        handle = READ_ONCE(page_owner->handle);
        if (!handle)
index 615b5d3..5cb970d 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -76,7 +76,9 @@
 
 #include <asm/tlbflush.h>
 
+#define CREATE_TRACE_POINTS
 #include <trace/events/tlb.h>
+#include <trace/events/migrate.h>
 
 #include "internal.h"
 
@@ -1236,14 +1238,14 @@ void page_add_new_anon_rmap(struct page *page,
 void page_add_file_rmap(struct page *page,
        struct vm_area_struct *vma, bool compound)
 {
-       int i, nr = 1;
+       int i, nr = 0;
 
        VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
        lock_page_memcg(page);
        if (compound && PageTransHuge(page)) {
                int nr_pages = thp_nr_pages(page);
 
-               for (i = 0, nr = 0; i < nr_pages; i++) {
+               for (i = 0; i < nr_pages; i++) {
                        if (atomic_inc_and_test(&page[i]._mapcount))
                                nr++;
                }
@@ -1271,11 +1273,12 @@ void page_add_file_rmap(struct page *page,
                        VM_WARN_ON_ONCE(!PageLocked(page));
                        SetPageDoubleMap(compound_head(page));
                }
-               if (!atomic_inc_and_test(&page->_mapcount))
-                       goto out;
+               if (atomic_inc_and_test(&page->_mapcount))
+                       nr++;
        }
-       __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
 out:
+       if (nr)
+               __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
        unlock_page_memcg(page);
 
        mlock_vma_page(page, vma, compound);
@@ -1283,7 +1286,7 @@ out:
 
 static void page_remove_file_rmap(struct page *page, bool compound)
 {
-       int i, nr = 1;
+       int i, nr = 0;
 
        VM_BUG_ON_PAGE(compound && !PageHead(page), page);
 
@@ -1298,12 +1301,12 @@ static void page_remove_file_rmap(struct page *page, bool compound)
        if (compound && PageTransHuge(page)) {
                int nr_pages = thp_nr_pages(page);
 
-               for (i = 0, nr = 0; i < nr_pages; i++) {
+               for (i = 0; i < nr_pages; i++) {
                        if (atomic_add_negative(-1, &page[i]._mapcount))
                                nr++;
                }
                if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
-                       return;
+                       goto out;
                if (PageSwapBacked(page))
                        __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
                                                -nr_pages);
@@ -1311,16 +1314,12 @@ static void page_remove_file_rmap(struct page *page, bool compound)
                        __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
                                                -nr_pages);
        } else {
-               if (!atomic_add_negative(-1, &page->_mapcount))
-                       return;
+               if (atomic_add_negative(-1, &page->_mapcount))
+                       nr++;
        }
-
-       /*
-        * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
-        * these counters are not modified in interrupt context, and
-        * pte lock(a spinlock) is held, which implies preemption disabled.
-        */
-       __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
+out:
+       if (nr)
+               __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
 }
 
 static void page_remove_anon_compound_rmap(struct page *page)
@@ -1589,7 +1588,30 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 
                        /* MADV_FREE page check */
                        if (!folio_test_swapbacked(folio)) {
-                               if (!folio_test_dirty(folio)) {
+                               int ref_count, map_count;
+
+                               /*
+                                * Synchronize with gup_pte_range():
+                                * - clear PTE; barrier; read refcount
+                                * - inc refcount; barrier; read PTE
+                                */
+                               smp_mb();
+
+                               ref_count = folio_ref_count(folio);
+                               map_count = folio_mapcount(folio);
+
+                               /*
+                                * Order reads for page refcount and dirty flag
+                                * (see comments in __remove_mapping()).
+                                */
+                               smp_rmb();
+
+                               /*
+                                * The only page refs must be one from isolation
+                                * plus the rmap(s) (dropped by discard:).
+                                */
+                               if (ref_count == 1 + map_count &&
+                                   !folio_test_dirty(folio)) {
                                        /* Invalidate as we cleared the pte */
                                        mmu_notifier_invalidate_range(mm,
                                                address, address + PAGE_SIZE);
@@ -1852,6 +1874,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        if (pte_swp_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+                       trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
+                                               compound_order(&folio->page));
                        /*
                         * No need to invalidate here it will synchronize on
                         * against the special swap migration pte.
@@ -1920,6 +1944,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        if (pte_uffd_wp(pteval))
                                swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
+                       trace_set_migration_pte(address, pte_val(swp_pte),
+                                               compound_order(&folio->page));
                        /*
                         * No need to invalidate here it will synchronize on
                         * against the special swap migration pte.
index 5b30045..bceff0c 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -97,7 +97,6 @@ static void __page_cache_release(struct page *page)
                mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
                count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
        }
-       __ClearPageWaiters(page);
 }
 
 static void __put_single_page(struct page *page)
@@ -152,7 +151,6 @@ void put_pages_list(struct list_head *pages)
                        continue;
                }
                /* Cannot be PageLRU because it's passed to us using the lru */
-               __ClearPageWaiters(page);
        }
 
        free_unref_page_list(pages);
@@ -971,8 +969,6 @@ void release_pages(struct page **pages, int nr)
                        count_vm_event(UNEVICTABLE_PGCLEARED);
                }
 
-               __ClearPageWaiters(page);
-
                list_add(&page->lru, &pages_to_free);
        }
        if (lruvec)
index 33c7abb..63c61f8 100644 (file)
@@ -1167,16 +1167,6 @@ out:
        return NULL;
 }
 
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
-{
-       struct swap_info_struct *p;
-
-       p = _swap_info_get(entry);
-       if (p)
-               spin_lock(&p->lock);
-       return p;
-}
-
 static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
                                        struct swap_info_struct *q)
 {
@@ -1601,100 +1591,6 @@ static bool page_swapped(struct page *page)
        return false;
 }
 
-static int page_trans_huge_map_swapcount(struct page *page,
-                                        int *total_swapcount)
-{
-       int i, map_swapcount, _total_swapcount;
-       unsigned long offset = 0;
-       struct swap_info_struct *si;
-       struct swap_cluster_info *ci = NULL;
-       unsigned char *map = NULL;
-       int swapcount = 0;
-
-       /* hugetlbfs shouldn't call it */
-       VM_BUG_ON_PAGE(PageHuge(page), page);
-
-       if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
-               if (PageSwapCache(page))
-                       swapcount = page_swapcount(page);
-               if (total_swapcount)
-                       *total_swapcount = swapcount;
-               return swapcount + page_trans_huge_mapcount(page);
-       }
-
-       page = compound_head(page);
-
-       _total_swapcount = map_swapcount = 0;
-       if (PageSwapCache(page)) {
-               swp_entry_t entry;
-
-               entry.val = page_private(page);
-               si = _swap_info_get(entry);
-               if (si) {
-                       map = si->swap_map;
-                       offset = swp_offset(entry);
-               }
-       }
-       if (map)
-               ci = lock_cluster(si, offset);
-       for (i = 0; i < HPAGE_PMD_NR; i++) {
-               int mapcount = atomic_read(&page[i]._mapcount) + 1;
-               if (map) {
-                       swapcount = swap_count(map[offset + i]);
-                       _total_swapcount += swapcount;
-               }
-               map_swapcount = max(map_swapcount, mapcount + swapcount);
-       }
-       unlock_cluster(ci);
-
-       if (PageDoubleMap(page))
-               map_swapcount -= 1;
-
-       if (total_swapcount)
-               *total_swapcount = _total_swapcount;
-
-       return map_swapcount + compound_mapcount(page);
-}
-
-/*
- * We can write to an anon page without COW if there are no other references
- * to it.  And as a side-effect, free up its swap: because the old content
- * on disk will never be read, and seeking back there to write new content
- * later would only waste time away from clustering.
- */
-bool reuse_swap_page(struct page *page)
-{
-       int count, total_swapcount;
-
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-       if (unlikely(PageKsm(page)))
-               return false;
-       count = page_trans_huge_map_swapcount(page, &total_swapcount);
-       if (count == 1 && PageSwapCache(page) &&
-           (likely(!PageTransCompound(page)) ||
-            /* The remaining swap count will be freed soon */
-            total_swapcount == page_swapcount(page))) {
-               if (!PageWriteback(page)) {
-                       page = compound_head(page);
-                       delete_from_swap_cache(page);
-                       SetPageDirty(page);
-               } else {
-                       swp_entry_t entry;
-                       struct swap_info_struct *p;
-
-                       entry.val = page_private(page);
-                       p = swap_info_get(entry);
-                       if (p->flags & SWP_STABLE_WRITES) {
-                               spin_unlock(&p->lock);
-                               return false;
-                       }
-                       spin_unlock(&p->lock);
-               }
-       }
-
-       return count <= 1;
-}
-
 /*
  * If swap is getting full, or if there are no more mappings of this page,
  * then try_to_free_swap is called to free its swap space.
index 99e0f3e..e163372 100644 (file)
@@ -74,7 +74,7 @@ static const bool vmap_allow_huge = false;
 
 bool is_vmalloc_addr(const void *x)
 {
-       unsigned long addr = (unsigned long)x;
+       unsigned long addr = (unsigned long)kasan_reset_tag(x);
 
        return addr >= VMALLOC_START && addr < VMALLOC_END;
 }
@@ -631,7 +631,7 @@ int is_vmalloc_or_module_addr(const void *x)
         * just put it in the vmalloc space.
         */
 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
-       unsigned long addr = (unsigned long)x;
+       unsigned long addr = (unsigned long)kasan_reset_tag(x);
        if (addr >= MODULES_VADDR && addr < MODULES_END)
                return 1;
 #endif
@@ -795,6 +795,8 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
        struct vmap_area *va = NULL;
        struct rb_node *n = vmap_area_root.rb_node;
 
+       addr = (unsigned long)kasan_reset_tag((void *)addr);
+
        while (n) {
                struct vmap_area *tmp;
 
@@ -816,6 +818,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
 {
        struct rb_node *n = vmap_area_root.rb_node;
 
+       addr = (unsigned long)kasan_reset_tag((void *)addr);
+
        while (n) {
                struct vmap_area *va;
 
@@ -2166,7 +2170,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
 void vm_unmap_ram(const void *mem, unsigned int count)
 {
        unsigned long size = (unsigned long)count << PAGE_SHIFT;
-       unsigned long addr = (unsigned long)mem;
+       unsigned long addr = (unsigned long)kasan_reset_tag(mem);
        struct vmap_area *va;
 
        might_sleep();
@@ -2227,14 +2231,19 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
                mem = (void *)addr;
        }
 
-       kasan_unpoison_vmalloc(mem, size);
-
        if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
                                pages, PAGE_SHIFT) < 0) {
                vm_unmap_ram(mem, count);
                return NULL;
        }
 
+       /*
+        * Mark the pages as accessible, now that they are mapped.
+        * With hardware tag-based KASAN, marking is skipped for
+        * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+        */
+       mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
+
        return mem;
 }
 EXPORT_SYMBOL(vm_map_ram);
@@ -2460,10 +2469,20 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
                return NULL;
        }
 
-       kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
-
        setup_vmalloc_vm(area, va, flags, caller);
 
+       /*
+        * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
+        * best-effort approach, as they can be mapped outside of vmalloc code.
+        * For VM_ALLOC mappings, the pages are marked as accessible after
+        * getting mapped in __vmalloc_node_range().
+        * With hardware tag-based KASAN, marking is skipped for
+        * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+        */
+       if (!(flags & VM_ALLOC))
+               area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
+                                                   KASAN_VMALLOC_PROT_NORMAL);
+
        return area;
 }
 
@@ -2547,7 +2566,7 @@ struct vm_struct *remove_vm_area(const void *addr)
                va->vm = NULL;
                spin_unlock(&vmap_area_lock);
 
-               kasan_free_shadow(vm);
+               kasan_free_module_shadow(vm);
                free_unmap_vmap_area(va);
 
                return vm;
@@ -3071,7 +3090,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
                        const void *caller)
 {
        struct vm_struct *area;
-       void *addr;
+       void *ret;
+       kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
        unsigned long real_size = size;
        unsigned long real_align = align;
        unsigned int shift = PAGE_SHIFT;
@@ -3124,10 +3144,50 @@ again:
                goto fail;
        }
 
-       addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
-       if (!addr)
+       /*
+        * Prepare arguments for __vmalloc_area_node() and
+        * kasan_unpoison_vmalloc().
+        */
+       if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
+               if (kasan_hw_tags_enabled()) {
+                       /*
+                        * Modify protection bits to allow tagging.
+                        * This must be done before mapping.
+                        */
+                       prot = arch_vmap_pgprot_tagged(prot);
+
+                       /*
+                        * Skip page_alloc poisoning and zeroing for physical
+                        * pages backing VM_ALLOC mapping. Memory is instead
+                        * poisoned and zeroed by kasan_unpoison_vmalloc().
+                        */
+                       gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
+               }
+
+               /* Take note that the mapping is PAGE_KERNEL. */
+               kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
+       }
+
+       /* Allocate physical pages and map them into vmalloc space. */
+       ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
+       if (!ret)
                goto fail;
 
+       /*
+        * Mark the pages as accessible, now that they are mapped.
+        * The init condition should match the one in post_alloc_hook()
+        * (except for the should_skip_init() check) to make sure that memory
+        * is initialized under the same conditions regardless of the enabled
+        * KASAN mode.
+        * Tag-based KASAN modes only assign tags to normal non-executable
+        * allocations, see __kasan_unpoison_vmalloc().
+        */
+       kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
+       if (!want_init_on_free() && want_init_on_alloc(gfp_mask))
+               kasan_flags |= KASAN_VMALLOC_INIT;
+       /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
+       area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
+
        /*
         * In this function, newly allocated vm_struct has VM_UNINITIALIZED
         * flag. It means that vm_struct is not fully initialized.
@@ -3139,7 +3199,7 @@ again:
        if (!(vm_flags & VM_DEFER_KMEMLEAK))
                kmemleak_vmalloc(area, size, gfp_mask);
 
-       return addr;
+       return area->addr;
 
 fail:
        if (shift > PAGE_SHIFT) {
@@ -3424,6 +3484,8 @@ long vread(char *buf, char *addr, unsigned long count)
        unsigned long buflen = count;
        unsigned long n;
 
+       addr = kasan_reset_tag(addr);
+
        /* Don't allow overflow */
        if ((unsigned long) addr + count < count)
                count = -(unsigned long) addr;
@@ -3809,9 +3871,6 @@ retry:
        for (area = 0; area < nr_vms; area++) {
                if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
                        goto err_free_shadow;
-
-               kasan_unpoison_vmalloc((void *)vas[area]->va_start,
-                                      sizes[area]);
        }
 
        /* insert all vm's */
@@ -3824,6 +3883,16 @@ retry:
        }
        spin_unlock(&vmap_area_lock);
 
+       /*
+        * Mark allocated areas as accessible. Do it now as a best-effort
+        * approach, as they can be mapped outside of vmalloc code.
+        * With hardware tag-based KASAN, marking is skipped for
+        * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+        */
+       for (area = 0; area < nr_vms; area++)
+               vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
+                               vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+
        kfree(vas);
        return vms;
 
index f118098..b8f2480 100644 (file)
@@ -28,6 +28,7 @@
  *
  * When all tests are finished, clean up and exit the program with one of:
  *
+ *    ksft_finished();
  *    ksft_exit(condition);
  *    ksft_exit_pass();
  *    ksft_exit_fail();
@@ -235,6 +236,15 @@ static inline int ksft_exit_fail(void)
                ksft_exit_fail();       \
        } while (0)
 
+/**
+ * ksft_finished() - Exit selftest with success if all tests passed
+ */
+#define ksft_finished()                        \
+       ksft_exit(ksft_plan ==          \
+                 ksft_cnt.ksft_pass +  \
+                 ksft_cnt.ksft_xfail + \
+                 ksft_cnt.ksft_xskip)
+
 static inline int ksft_exit_fail_msg(const char *msg, ...)
 {
        int saved_errno = errno;
index 3b5faec..d7507f3 100644 (file)
@@ -3,6 +3,7 @@ hugepage-mmap
 hugepage-mremap
 hugepage-shm
 hugepage-vmemmap
+hugetlb-madvise
 khugepaged
 map_hugetlb
 map_populate
index fbf390b..04a49e8 100644 (file)
@@ -30,6 +30,7 @@ LDLIBS = -lrt -lpthread
 TEST_GEN_FILES = compaction_test
 TEST_GEN_FILES += gup_test
 TEST_GEN_FILES += hmm-tests
+TEST_GEN_FILES += hugetlb-madvise
 TEST_GEN_FILES += hugepage-mmap
 TEST_GEN_FILES += hugepage-mremap
 TEST_GEN_FILES += hugepage-shm
index fe043f6..cda837a 100644 (file)
@@ -10,8 +10,9 @@
 #include <assert.h>
 #include "../../../../mm/gup_test.h"
 
+#include "util.h"
+
 #define MB (1UL << 20)
-#define PAGE_SIZE sysconf(_SC_PAGESIZE)
 
 /* Just the flags we need, copied from mm.h: */
 #define FOLL_WRITE     0x01    /* check pte is writable */
diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c
new file mode 100644 (file)
index 0000000..6c6af40
--- /dev/null
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugetlb-madvise:
+ *
+ * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE
+ * on hugetlb mappings.
+ *
+ * Before running this test, make sure the administrator has pre-allocated
+ * at least MIN_FREE_PAGES hugetlb pages and they are free.  In addition,
+ * the test takes an argument that is the path to a file in a hugetlbfs
+ * filesystem.  Therefore, a hugetlbfs filesystem must be mounted on some
+ * directory.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#define __USE_GNU
+#include <fcntl.h>
+
+#define USAGE  "USAGE: %s <hugepagefile_name>\n"
+#define MIN_FREE_PAGES 20
+#define NR_HUGE_PAGES  10      /* common number of pages to map/allocate */
+
+#define validate_free_pages(exp_free)                                  \
+       do {                                                            \
+               int fhp = get_free_hugepages();                         \
+               if (fhp != (exp_free)) {                                \
+                       printf("Unexpected number of free huge "        \
+                               "pages line %d\n", __LINE__);           \
+                       exit(1);                                        \
+               }                                                       \
+       } while (0)
+
+unsigned long huge_page_size;
+unsigned long base_page_size;
+
+/*
+ * default_huge_page_size copied from mlock2-tests.c
+ *
+ * Scan /proc/meminfo for the "Hugepagesize:" entry and return its value
+ * converted from kB to bytes.  Returns 0 when the file cannot be opened
+ * or no such entry is found.
+ */
+unsigned long default_huge_page_size(void)
+{
+       FILE *meminfo = fopen("/proc/meminfo", "r");
+       unsigned long size = 0;
+       size_t cap = 0;
+       char *buf = NULL;
+
+       if (f_is_null(meminfo))
+               return 0;
+
+       while (getline(&buf, &cap, meminfo) > 0) {
+               if (sscanf(buf, "Hugepagesize:       %lu kB", &size) != 1)
+                       continue;
+               /* value is reported in kB */
+               size <<= 10;
+               break;
+       }
+
+       free(buf);
+       fclose(meminfo);
+       return size;
+}
+
+/*
+ * Return the "HugePages_Free:" value read from /proc/meminfo, or 0 when
+ * the file cannot be opened or the entry is not found.
+ */
+unsigned long get_free_hugepages(void)
+{
+       FILE *meminfo = fopen("/proc/meminfo", "r");
+       unsigned long nr_free = 0;
+       size_t cap = 0;
+       char *buf = NULL;
+       int found = 0;
+
+       if (meminfo == NULL)
+               return nr_free;
+
+       /* stop at the first line that yields a value */
+       while (!found && getline(&buf, &cap, meminfo) > 0)
+               found = (sscanf(buf, "HugePages_Free:      %lu", &nr_free) == 1);
+
+       free(buf);
+       fclose(meminfo);
+       return nr_free;
+}
+
+/*
+ * Store the page index into the first unsigned long of each of the
+ * nr_pages huge_page_size-sized pages starting at addr.
+ */
+void write_fault_pages(void *addr, unsigned long nr_pages)
+{
+       unsigned long page = 0;
+
+       while (page < nr_pages) {
+               unsigned long *slot = addr + (page * huge_page_size);
+
+               *slot = page;
+               page++;
+       }
+}
+
+/*
+ * Read the first unsigned long of each of the nr_pages huge_page_size-sized
+ * pages starting at addr, so that every page is touched by a read access.
+ */
+void read_fault_pages(void *addr, unsigned long nr_pages)
+{
+       unsigned long i;
+
+       /*
+        * Only the access matters, not the value.  Reading through a
+        * volatile pointer keeps the compiler from discarding the load and
+        * avoids accumulating into an uninitialized local.
+        */
+       for (i = 0; i < nr_pages; i++)
+               (void)*((volatile unsigned long *)(addr + (i * huge_page_size)));
+}
+
+/*
+ * Run the MADV_DONTNEED / MADV_REMOVE checks on hugetlb mappings.
+ *
+ * argv[1] is the path of a file to create in a hugetlbfs filesystem.
+ * After each step the number of free huge pages is compared against the
+ * expected value with validate_free_pages().  Any failed setup step or
+ * unexpected result prints a message and exits with status 1; 0 is
+ * returned after the file has been closed and unlinked.
+ */
+int main(int argc, char **argv)
+{
+       unsigned long free_hugepages;
+       void *addr, *addr2;
+       int fd;
+       int ret;
+
+       if (argc != 2) {
+               printf(USAGE, argv[0]);
+               exit(1);
+       }
+
+       huge_page_size = default_huge_page_size();
+       if (!huge_page_size) {
+               printf("Unable to determine huge page size, exiting!\n");
+               exit(1);
+       }
+       base_page_size = sysconf(_SC_PAGE_SIZE);
+       if (!base_page_size) {
+               printf("Unable to determine base page size, exiting!\n");
+               exit(1);
+       }
+
+       free_hugepages = get_free_hugepages();
+       if (free_hugepages < MIN_FREE_PAGES) {
+               printf("Not enough free huge pages to test, exiting!\n");
+               exit(1);
+       }
+
+       fd = open(argv[1], O_CREAT | O_RDWR, 0755);
+       if (fd < 0) {
+               perror("Open failed");
+               exit(1);
+       }
+
+       /*
+        * Test validity of MADV_DONTNEED addr and length arguments.  mmap
+        * size is NR_HUGE_PAGES + 2.  One page at the beginning and end of
+        * the mapping will be unmapped so we KNOW there is nothing mapped
+        * there.
+        */
+       addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+                       -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+       if (munmap(addr, huge_page_size) ||
+                       munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size,
+                               huge_page_size)) {
+               perror("munmap");
+               exit(1);
+       }
+       addr = addr + huge_page_size;
+
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* addr before mapping should fail */
+       ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size,
+               MADV_DONTNEED);
+       if (!ret) {
+               printf("Unexpected success of madvise call with invalid addr line %d\n",
+                               __LINE__);
+               exit(1);
+       }
+
+       /* addr + length after mapping should fail */
+       ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size,
+               MADV_DONTNEED);
+       if (!ret) {
+               printf("Unexpected success of madvise call with invalid length line %d\n",
+                               __LINE__);
+               exit(1);
+       }
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+       /*
+        * Test alignment of MADV_DONTNEED addr and length arguments
+        */
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+                       -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* addr is not huge page size aligned and should fail */
+       ret = madvise(addr + base_page_size,
+                       NR_HUGE_PAGES * huge_page_size - base_page_size,
+                       MADV_DONTNEED);
+       if (!ret) {
+               printf("Unexpected success of madvise call with unaligned start address line %d\n",
+                               __LINE__);
+               exit(1);
+       }
+
+       /* addr + length should be aligned up to huge page size */
+       if (madvise(addr,
+                       ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size,
+                       MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+
+       /* should free all pages in mapping */
+       validate_free_pages(free_hugepages);
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+       /*
+        * Test MADV_DONTNEED on anonymous private mapping
+        */
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+                       -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+
+       /* should free all pages in mapping */
+       validate_free_pages(free_hugepages);
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+       /*
+        * Test MADV_DONTNEED on private mapping of hugetlb file
+        */
+       if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+               perror("fallocate");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE, fd, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       /* read should not consume any pages */
+       read_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* madvise should not free any pages */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* writes should allocate private pages */
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /* madvise should free private pages */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* writes should allocate private pages */
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /*
+        * The fallocate below certainly should free the pages associated
+        * with the file.  However, pages in the private mapping are also
+        * freed.  This is not the 'correct' behavior, but is expected
+        * because this is how it has worked since the initial hugetlb
+        * implementation.
+        */
+       if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                                       0, NR_HUGE_PAGES * huge_page_size)) {
+               perror("fallocate");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages);
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+       /*
+        * Test MADV_DONTNEED on shared mapping of hugetlb file
+        */
+       if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+               perror("fallocate");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_SHARED, fd, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       /* write should not consume any pages */
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* madvise should not free any pages */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /*
+        * Test MADV_REMOVE on shared mapping of hugetlb file
+        *
+        * madvise is same as hole punch and should free all pages.
+        */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages);
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+       /*
+        * Test MADV_REMOVE on shared and private mapping of hugetlb file
+        */
+       if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+               perror("fallocate");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_SHARED, fd, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       /* shared write should not consume any additional pages */
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE, fd, 0);
+       if (addr2 == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       /* private read should not consume any pages */
+       read_fault_pages(addr2, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* private write should consume additional pages */
+       write_fault_pages(addr2, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /* madvise of shared mapping should not free any pages */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /* madvise of private mapping should free private pages */
+       if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* private write should consume additional pages again */
+       write_fault_pages(addr2, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /*
+        * madvise should free both file and private pages although this is
+        * not correct.  private pages should not be freed, but this is
+        * expected.  See comment associated with FALLOC_FL_PUNCH_HOLE call.
+        */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages);
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+       (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size);
+
+       close(fd);
+       unlink(argv[1]);
+       return 0;
+}
index 1436e1a..fd85f15 100644 (file)
@@ -12,6 +12,7 @@
 
 #include "../kselftest.h"
 #include "../../../../include/vdso/time64.h"
+#include "util.h"
 
 #define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
 #define KSM_FP(s) (KSM_SYSFS_PATH s)
 #define KSM_MERGE_ACROSS_NODES_DEFAULT true
 #define MB (1ul << 20)
 
-#define PAGE_SHIFT 12
-#define HPAGE_SHIFT 21
-
-#define PAGE_SIZE (1 << PAGE_SHIFT)
-#define HPAGE_SIZE (1 << HPAGE_SHIFT)
-
-#define PAGEMAP_PRESENT(ent)   (((ent) & (1ull << 63)) != 0)
-#define PAGEMAP_PFN(ent)       ((ent) & ((1ull << 55) - 1))
-
 struct ksm_sysfs {
        unsigned long max_page_sharing;
        unsigned long merge_across_nodes;
@@ -456,34 +448,6 @@ err_out:
        return KSFT_FAIL;
 }
 
-int64_t allocate_transhuge(void *ptr, int pagemap_fd)
-{
-       uint64_t ent[2];
-
-       /* drop pmd */
-       if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
-                               MAP_FIXED | MAP_ANONYMOUS |
-                               MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
-               errx(2, "mmap transhuge");
-
-       if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
-               err(2, "MADV_HUGEPAGE");
-
-       /* allocate transparent huge page */
-       *(volatile void **)ptr = ptr;
-
-       if (pread(pagemap_fd, ent, sizeof(ent),
-                       (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
-               err(2, "read pagemap");
-
-       if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
-           PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
-           !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
-               return PAGEMAP_PFN(ent[0]);
-
-       return -1;
-}
-
 static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size)
 {
        void *map_ptr, *map_ptr_orig;
index 93e7e7f..957b9e1 100644 (file)
@@ -282,7 +282,7 @@ int main(int argc, char *argv[])
 
        close(fd);
 
-       ksft_exit(!ksft_get_fail_cnt());
+       ksft_finished();
 }
 
 #else /* __NR_memfd_secret */
index e10d50e..3b265f1 100755 (executable)
@@ -131,6 +131,18 @@ else
        echo "[PASS]"
 fi
 
+echo "-----------------------"
+echo "running hugetlb-madvise"
+echo "-----------------------"
+./hugetlb-madvise $mnt/madvise-test
+if [ $? -ne 0 ]; then
+       echo "[FAIL]"
+       exitcode=1
+else
+       echo "[PASS]"
+fi
+rm -f $mnt/madvise-test
+
 echo "NOTE: The above hugetlb tests provide minimal coverage.  Use"
 echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
 echo "      hugetlb regression testing."
@@ -196,14 +208,13 @@ echo "running userfaultfd_hugetlb"
 echo "---------------------------"
 # Test requires source and destination huge pages.  Size of source
 # (half_ufd_size_MB) is passed as argument to test.
-./userfaultfd hugetlb $half_ufd_size_MB 32 $mnt/ufd_test_file
+./userfaultfd hugetlb $half_ufd_size_MB 32
 if [ $? -ne 0 ]; then
        echo "[FAIL]"
        exitcode=1
 else
        echo "[PASS]"
 fi
-rm -f $mnt/ufd_test_file
 
 echo "-------------------------"
 echo "running userfaultfd_shmem"
index a03cb3f..e3f00ad 100644 (file)
 #include <fcntl.h>
 #include <string.h>
 #include <sys/mman.h>
+#include "util.h"
 
-#define PAGE_SHIFT 12
-#define HPAGE_SHIFT 21
-
-#define PAGE_SIZE (1 << PAGE_SHIFT)
-#define HPAGE_SIZE (1 << HPAGE_SHIFT)
-
-#define PAGEMAP_PRESENT(ent)   (((ent) & (1ull << 63)) != 0)
-#define PAGEMAP_PFN(ent)       ((ent) & ((1ull << 55) - 1))
-
-int pagemap_fd;
 int backing_fd = -1;
 int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
 #define PROT_RW (PROT_READ | PROT_WRITE)
 
-int64_t allocate_transhuge(void *ptr)
-{
-       uint64_t ent[2];
-
-       /* drop pmd */
-       if (mmap(ptr, HPAGE_SIZE, PROT_RW, MAP_FIXED | mmap_flags,
-                backing_fd, 0) != ptr)
-               errx(2, "mmap transhuge");
-
-       if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
-               err(2, "MADV_HUGEPAGE");
-
-       /* allocate transparent huge page */
-       *(volatile void **)ptr = ptr;
-
-       if (pread(pagemap_fd, ent, sizeof(ent),
-                       (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
-               err(2, "read pagemap");
-
-       if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
-           PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
-           !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
-               return PAGEMAP_PFN(ent[0]);
-
-       return -1;
-}
-
 int main(int argc, char **argv)
 {
        size_t ram, len;
@@ -67,6 +31,7 @@ int main(int argc, char **argv)
        double s;
        uint8_t *map;
        size_t map_len;
+       int pagemap_fd;
 
        ram = sysconf(_SC_PHYS_PAGES);
        if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
@@ -122,7 +87,7 @@ int main(int argc, char **argv)
                for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
                        int64_t pfn;
 
-                       pfn = allocate_transhuge(p);
+                       pfn = allocate_transhuge(p, pagemap_fd);
 
                        if (pfn < 0) {
                                nr_failed++;
index 39403b8..92a4516 100644 (file)
@@ -89,7 +89,6 @@ static bool test_uffdio_minor = false;
 static bool map_shared;
 static int shm_fd;
 static int huge_fd;
-static char *huge_fd_off0;
 static unsigned long long *count_verify;
 static int uffd = -1;
 static int uffd_flags, finished, *pipefd;
@@ -128,9 +127,9 @@ const char *examples =
     "./userfaultfd anon 100 99999\n\n"
     "# Run share memory test on 1GiB region with 99 bounces:\n"
     "./userfaultfd shmem 1000 99\n\n"
-    "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
-    "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
-    "# Run the same hugetlb test but using shmem:\n"
+    "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
+    "./userfaultfd hugetlb 256 50\n\n"
+    "# Run the same hugetlb test but using shared file:\n"
     "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
     "# 10MiB-~6GiB 999 bounces anonymous test, "
     "continue forever unless an error triggers\n"
@@ -227,10 +226,13 @@ static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 
 static void hugetlb_release_pages(char *rel_area)
 {
-       if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
-                     rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
-                     nr_pages * page_size))
-               err("fallocate() failed");
+       if (!map_shared) {
+               if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+                       err("madvise(MADV_DONTNEED) failed");
+       } else {
+               if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+                       err("madvise(MADV_REMOVE) failed");
+       }
 }
 
 static void hugetlb_allocate_area(void **alloc_area)
@@ -238,26 +240,37 @@ static void hugetlb_allocate_area(void **alloc_area)
        void *area_alias = NULL;
        char **alloc_area_alias;
 
-       *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
-                          (map_shared ? MAP_SHARED : MAP_PRIVATE) |
-                          MAP_HUGETLB |
-                          (*alloc_area == area_src ? 0 : MAP_NORESERVE),
-                          huge_fd, *alloc_area == area_src ? 0 :
-                          nr_pages * page_size);
+       if (!map_shared)
+               *alloc_area = mmap(NULL,
+                       nr_pages * page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
+                               (*alloc_area == area_src ? 0 : MAP_NORESERVE),
+                       -1,
+                       0);
+       else
+               *alloc_area = mmap(NULL,
+                       nr_pages * page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_SHARED |
+                               (*alloc_area == area_src ? 0 : MAP_NORESERVE),
+                       huge_fd,
+                       *alloc_area == area_src ? 0 : nr_pages * page_size);
        if (*alloc_area == MAP_FAILED)
                err("mmap of hugetlbfs file failed");
 
        if (map_shared) {
-               area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
-                                 MAP_SHARED | MAP_HUGETLB,
-                                 huge_fd, *alloc_area == area_src ? 0 :
-                                 nr_pages * page_size);
+               area_alias = mmap(NULL,
+                       nr_pages * page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_SHARED,
+                       huge_fd,
+                       *alloc_area == area_src ? 0 : nr_pages * page_size);
                if (area_alias == MAP_FAILED)
                        err("mmap of hugetlb file alias failed");
        }
 
        if (*alloc_area == area_src) {
-               huge_fd_off0 = *alloc_area;
                alloc_area_alias = &area_src_alias;
        } else {
                alloc_area_alias = &area_dst_alias;
@@ -270,12 +283,7 @@ static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset
 {
        if (!map_shared)
                return;
-       /*
-        * We can't zap just the pagetable with hugetlbfs because
-        * MADV_DONTEED won't work. So exercise -EEXIST on a alias
-        * mapping where the pagetables are not established initially,
-        * this way we'll exercise the -EEXEC at the fs level.
-        */
+
        *start = (unsigned long) area_dst_alias + offset;
 }
 
@@ -428,7 +436,6 @@ static void uffd_test_ctx_clear(void)
                uffd = -1;
        }
 
-       huge_fd_off0 = NULL;
        munmap_area((void **)&area_src);
        munmap_area((void **)&area_src_alias);
        munmap_area((void **)&area_dst);
@@ -926,10 +933,7 @@ static int faulting_process(int signal_test)
        struct sigaction act;
        unsigned long signalled = 0;
 
-       if (test_type != TEST_HUGETLB)
-               split_nr_pages = (nr_pages + 1) / 2;
-       else
-               split_nr_pages = nr_pages;
+       split_nr_pages = (nr_pages + 1) / 2;
 
        if (signal_test) {
                sigbuf = &jbuf;
@@ -986,9 +990,6 @@ static int faulting_process(int signal_test)
        if (signal_test)
                return signalled != split_nr_pages;
 
-       if (test_type == TEST_HUGETLB)
-               return 0;
-
        area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
                          MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
        if (area_dst == MAP_FAILED)
@@ -1676,7 +1677,7 @@ int main(int argc, char **argv)
        }
        nr_pages = nr_pages_per_cpu * nr_cpus;
 
-       if (test_type == TEST_HUGETLB) {
+       if (test_type == TEST_HUGETLB && map_shared) {
                if (argc < 5)
                        usage();
                huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/vm/util.h
new file mode 100644 (file)
index 0000000..b27d261
--- /dev/null
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KSELFTEST_VM_UTIL_H
+#define __KSELFTEST_VM_UTIL_H
+
+#include <stdint.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <string.h> /* ffsl() */
+#include <unistd.h> /* _SC_PAGESIZE */
+
+static unsigned int __page_size;
+static unsigned int __page_shift;
+
+/*
+ * Return the value of sysconf(_SC_PAGESIZE), looked up while __page_size
+ * is still zero and served from __page_size afterwards.
+ */
+static inline unsigned int page_size(void)
+{
+       if (__page_size != 0)
+               return __page_size;
+
+       __page_size = sysconf(_SC_PAGESIZE);
+       return __page_size;
+}
+
+/*
+ * Return the zero-based index of the lowest set bit of page_size(),
+ * computed while __page_shift is still zero and served from __page_shift
+ * afterwards.
+ */
+static inline unsigned int page_shift(void)
+{
+       if (__page_shift != 0)
+               return __page_shift;
+
+       __page_shift = (ffsl(page_size()) - 1);
+       return __page_shift;
+}
+
+#define PAGE_SHIFT     (page_shift())
+#define PAGE_SIZE      (page_size())
+/*
+ * On ppc64 this will only work with radix 2M hugepage size
+ */
+#define HPAGE_SHIFT 21
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent)   (((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent)       ((ent) & ((1ull << 55) - 1))
+
+
+/*
+ * Map a fresh anonymous private region of HPAGE_SIZE bytes at ptr, apply
+ * MADV_HUGEPAGE to it, write to it, then read two consecutive 64-bit
+ * entries from pagemap_fd starting at the entry for ptr.
+ *
+ * Returns the PFN from the first entry when both entries are present,
+ * the second PFN directly follows the first, and the first PFN is aligned
+ * to the number of base pages in HPAGE_SIZE.  Returns -1 otherwise.  A
+ * failing mmap(), madvise() or pread() ends the program with status 2.
+ */
+static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd)
+{
+       uint64_t entries[2];
+       uint64_t head_pfn, next_pfn;
+       void *mapped;
+
+       /* drop pmd */
+       mapped = mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+                     MAP_FIXED | MAP_ANONYMOUS |
+                     MAP_NORESERVE | MAP_PRIVATE, -1, 0);
+       if (mapped != ptr)
+               errx(2, "mmap transhuge");
+
+       if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE) != 0)
+               err(2, "MADV_HUGEPAGE");
+
+       /* allocate transparent huge page */
+       *(volatile void **)ptr = ptr;
+
+       if (pread(pagemap_fd, entries, sizeof(entries),
+                 (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(entries))
+               err(2, "read pagemap");
+
+       if (!PAGEMAP_PRESENT(entries[0]) || !PAGEMAP_PRESENT(entries[1]))
+               return -1;
+
+       head_pfn = PAGEMAP_PFN(entries[0]);
+       next_pfn = PAGEMAP_PFN(entries[1]);
+
+       /* the two PFNs must be consecutive */
+       if (head_pfn + 1 != next_pfn)
+               return -1;
+
+       /* the first PFN must be huge page aligned */
+       if (head_pfn & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))
+               return -1;
+
+       return head_pfn;
+}
+
+#endif
index 9ebb84a..7679335 100644 (file)
 #include <string.h>
 #include <regex.h>
 #include <errno.h>
+#include <linux/types.h>
+#include <getopt.h>
+
+#define bool int
+#define true 1
+#define false 0
+#define TASK_COMM_LEN 16
 
 struct block_list {
        char *txt;
+       char *comm; // task command name
+       char *stacktrace;
+       __u64 ts_nsec;
+       __u64 free_ts_nsec;
        int len;
        int num;
        int page_num;
+       pid_t pid;
+       pid_t tgid;
 };
-
-static int sort_by_memory;
+enum FILTER_BIT {
+       FILTER_UNRELEASE = 1<<1,
+       FILTER_PID = 1<<2,
+       FILTER_TGID = 1<<3,
+       FILTER_COMM = 1<<4
+};
+enum CULL_BIT {
+       CULL_UNRELEASE = 1<<1,
+       CULL_PID = 1<<2,
+       CULL_TGID = 1<<3,
+       CULL_COMM = 1<<4,
+       CULL_STACKTRACE = 1<<5
+};
+struct filter_condition {
+       pid_t tgid;
+       pid_t pid;
+       char comm[TASK_COMM_LEN];
+};
+static struct filter_condition fc;
 static regex_t order_pattern;
+static regex_t pid_pattern;
+static regex_t tgid_pattern;
+static regex_t comm_pattern;
+static regex_t ts_nsec_pattern;
+static regex_t free_ts_nsec_pattern;
 static struct block_list *list;
 static int list_size;
 static int max_size;
-
-struct block_list *block_head;
+static int cull;
+static int filter;
 
 int read_block(char *buf, int buf_size, FILE *fin)
 {
@@ -58,6 +93,13 @@ static int compare_txt(const void *p1, const void *p2)
        return strcmp(l1->txt, l2->txt);
 }
 
+/* Order two block_list entries by strcmp() of their stacktrace strings. */
+static int compare_stacktrace(const void *p1, const void *p2)
+{
+       const struct block_list *lhs = p1;
+       const struct block_list *rhs = p2;
+       int order = strcmp(lhs->stacktrace, rhs->stacktrace);
+
+       return order;
+}
+
 static int compare_num(const void *p1, const void *p2)
 {
        const struct block_list *l1 = p1, *l2 = p2;
@@ -72,41 +114,260 @@ static int compare_page_num(const void *p1, const void *p2)
        return l2->page_num - l1->page_num;
 }
 
-static int get_page_num(char *buf)
+/*
+ * qsort()-style comparator: orders two block_list entries by ascending pid.
+ */
+static int compare_pid(const void *p1, const void *p2)
 {
-       int err, val_len, order_val;
-       char order_str[4] = {0};
-       char *endptr;
+       const struct block_list *lhs = p1;
+       const struct block_list *rhs = p2;
+
+       return lhs->pid - rhs->pid;
+}
+
+/*
+ * qsort()-style comparator: orders two block_list entries by ascending tgid.
+ */
+static int compare_tgid(const void *p1, const void *p2)
+{
+       const struct block_list *lhs = p1;
+       const struct block_list *rhs = p2;
+
+       return lhs->tgid - rhs->tgid;
+}
+
+/*
+ * qsort()-style comparator: orders two block_list entries by strcmp() of
+ * their task command names.
+ */
+static int compare_comm(const void *p1, const void *p2)
+{
+       const struct block_list *lhs = p1;
+       const struct block_list *rhs = p2;
+
+       return strcmp(lhs->comm, rhs->comm);
+}
+
+/*
+ * qsort()-style comparator: orders two block_list entries by ascending
+ * ts_nsec.
+ *
+ * Returns -1, 0 or 1.  Equal timestamps must compare as 0: a comparator that
+ * reports "greater" for both (a, b) and (b, a) is inconsistent, and qsort()
+ * gives no guarantees for inconsistent comparators.
+ */
+static int compare_ts(const void *p1, const void *p2)
+{
+       const struct block_list *l1 = p1, *l2 = p2;
+
+       if (l1->ts_nsec == l2->ts_nsec)
+               return 0;
+       return l1->ts_nsec < l2->ts_nsec ? -1 : 1;
+}
+
+/*
+ * qsort()-style comparator: orders two block_list entries by ascending
+ * free_ts_nsec.
+ *
+ * Returns -1, 0 or 1.  Equal timestamps must compare as 0: a comparator that
+ * reports "greater" for both (a, b) and (b, a) is inconsistent, and qsort()
+ * gives no guarantees for inconsistent comparators.
+ */
+static int compare_free_ts(const void *p1, const void *p2)
+{
+       const struct block_list *l1 = p1, *l2 = p2;
+
+       if (l1->free_ts_nsec == l2->free_ts_nsec)
+               return 0;
+       return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1;
+}
+
+
+/*
+ * qsort()-style comparator on the "has a free timestamp" property only.
+ * Entries whose free_ts_nsec is zero sort before entries whose free_ts_nsec
+ * is non-zero; two entries in the same state compare equal.
+ */
+static int compare_release(const void *p1, const void *p2)
+{
+       const struct block_list *lhs = p1;
+       const struct block_list *rhs = p2;
+       const int lhs_has_free_ts = lhs->free_ts_nsec != 0;
+       const int rhs_has_free_ts = rhs->free_ts_nsec != 0;
+
+       /* 0 when both flags agree, 1 when only lhs is set, -1 when only rhs is */
+       return lhs_has_free_ts - rhs_has_free_ts;
+}
+
+
+/*
+ * Comparator used both to sort the list before culling and to decide whether
+ * two neighbouring entries get merged.
+ *
+ * With no bits set in "cull" the whole record text is compared.  Otherwise
+ * each enabled key is tried in the order stacktrace, pid, tgid, comm,
+ * release state, and the first non-zero result is returned; 0 means the
+ * entries match on every enabled key.
+ */
+static int compare_cull_condition(const void *p1, const void *p2)
+{
+       int diff;
+
+       if (!cull)
+               return compare_txt(p1, p2);
+
+       if (cull & CULL_STACKTRACE) {
+               diff = compare_stacktrace(p1, p2);
+               if (diff)
+                       return diff;
+       }
+       if (cull & CULL_PID) {
+               diff = compare_pid(p1, p2);
+               if (diff)
+                       return diff;
+       }
+       if (cull & CULL_TGID) {
+               diff = compare_tgid(p1, p2);
+               if (diff)
+                       return diff;
+       }
+       if (cull & CULL_COMM) {
+               diff = compare_comm(p1, p2);
+               if (diff)
+                       return diff;
+       }
+       if (cull & CULL_UNRELEASE) {
+               diff = compare_release(p1, p2);
+               if (diff)
+                       return diff;
+       }
+       return 0;
+}
+
+/*
+ * Match @pattern against @buf and copy the text of capture group 1 into
+ * @pattern_str.
+ *
+ * Returns 0 on success.  If regexec() fails or group 1 did not take part in
+ * the match, prints a message and returns -1 without writing to @pattern_str.
+ */
+static int search_pattern(regex_t *pattern, char *pattern_str, char *buf)
+{
+       int err, val_len;
        regmatch_t pmatch[2];
 
-       err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL);
+       err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL);
        if (err != 0 || pmatch[1].rm_so == -1) {
-               printf("no order pattern in %s\n", buf);
-               return 0;
+               printf("no matching pattern in %s\n", buf);
+               return -1;
        }
        val_len = pmatch[1].rm_eo - pmatch[1].rm_so;
-       if (val_len > 2) /* max_order should not exceed 2 digits */
-               goto wrong_order;
 
-       memcpy(order_str, buf + pmatch[1].rm_so, val_len);
+       /*
+        * val_len is not checked against the size of pattern_str and no
+        * terminator is written here; the callers in this file pass
+        * zero-filled buffers.
+        */
+       memcpy(pattern_str, buf + pmatch[1].rm_so, val_len);
+
+       return 0;
+}
+
+/*
+ * Compile @regex into @pattern as an extended, newline-sensitive expression.
+ * The expression must contain exactly one capture group; on a compile error
+ * or a different group count a message is printed and the program exits
+ * with status 1.
+ */
+static void check_regcomp(regex_t *pattern, const char *regex)
+{
+       const int rc = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE);
+
+       if (rc == 0 && pattern->re_nsub == 1)
+               return;
+
+       printf("Invalid pattern %s code %d\n", regex, rc);
+       exit(1);
+}
 
+/*
+ * Split @str at every occurrence of @sep.
+ *
+ * Returns a heap-allocated array of heap-allocated, NUL-terminated tokens and
+ * stores the number of tokens in @size.  There is always one more token than
+ * there are separators, so empty tokens are kept and an empty @str yields a
+ * single empty token.  Release the result with free_explode().
+ *
+ * Exits with status 1 if memory cannot be allocated.
+ */
+static char **explode(char sep, const char *str, int *size)
+{
+       int len = strlen(str);
+       int count = 1, start = 0, j = 0;
+       char **ret;
+
+       for (int i = 0; i < len; i++)
+               if (str[i] == sep)
+                       count++;
+
+       ret = calloc(count, sizeof(char *));
+       if (!ret) {
+               printf("Out of memory\n");
+               exit(1);
+       }
+
+       /* i == len acts as a virtual separator that closes the last token */
+       for (int i = 0; i <= len; i++) {
+               if (i < len && str[i] != sep)
+                       continue;
+               /* one extra zeroed byte provides the terminator */
+               ret[j] = calloc(i - start + 1, sizeof(char));
+               if (!ret[j]) {
+                       printf("Out of memory\n");
+                       exit(1);
+               }
+               memcpy(ret[j++], str + start, i - start);
+               start = i + 1;
+       }
+       *size = j;
+       return ret;
+}
+
+/*
+ * Free the first @size strings stored in @arr, then @arr itself.
+ */
+static void free_explode(char **arr, int size)
+{
+       for (char **entry = arr; size > 0; size--, entry++)
+               free(*entry);
+       free(arr);
+}
+
+# define FIELD_BUFF 25
+
+/*
+ * Read the allocation order of one record via order_pattern and convert it
+ * to a page count.
+ *
+ * Returns 1 << order.  Returns 0 after printing a diagnostic when the
+ * captured text is not a plain decimal number or the order cannot be used as
+ * a shift count for an int.
+ */
+static int get_page_num(char *buf)
+{
+       int order_val;
+       char order_str[FIELD_BUFF] = {0};
+       char *endptr;
+
+       search_pattern(&order_pattern, order_str, buf);
        errno = 0;
        order_val = strtol(order_str, &endptr, 10);
-       if (errno != 0 || endptr == order_str || *endptr != '\0')
-               goto wrong_order;
+       /*
+        * The result is an int: a negative shift count, or one that reaches
+        * the sign bit or beyond, is undefined behaviour, so reject it here.
+        */
+       if (order_val < 0 || order_val >= (int)(8 * sizeof(int)) - 1 ||
+           errno != 0 || endptr == order_str || *endptr != '\0') {
+               printf("wrong order in follow buf:\n%s\n", buf);
+               return 0;
+       }
 
        return 1 << order_val;
+}
+
+/*
+ * Read the numeric pid field of one record via pid_pattern.
+ *
+ * Returns the parsed value, or -1 after printing a diagnostic when the
+ * captured text is empty, is followed by other characters, or strtol()
+ * set errno.
+ */
+static pid_t get_pid(char *buf)
+{
+       char digits[FIELD_BUFF] = {0};
+       char *end;
+       pid_t value;
+
+       search_pattern(&pid_pattern, digits, buf);
+       errno = 0;
+       value = strtol(digits, &end, 10);
+       if (errno == 0 && end != digits && *end == '\0')
+               return value;
+
+       printf("wrong/invalid pid in follow buf:\n%s\n", buf);
+       return -1;
 
-wrong_order:
-       printf("wrong order in follow buf:\n%s\n", buf);
-       return 0;
+}
+
+/*
+ * Read the numeric tgid field of one record via tgid_pattern.
+ *
+ * Returns the parsed value, or -1 after printing a diagnostic when the
+ * captured text is empty, is followed by other characters, or strtol()
+ * set errno.
+ */
+static pid_t get_tgid(char *buf)
+{
+       char digits[FIELD_BUFF] = {0};
+       char *end;
+       pid_t value;
+
+       search_pattern(&tgid_pattern, digits, buf);
+       errno = 0;
+       value = strtol(digits, &end, 10);
+       if (errno == 0 && end != digits && *end == '\0')
+               return value;
+
+       printf("wrong/invalid tgid in follow buf:\n%s\n", buf);
+       return -1;
+}
+
+/*
+ * Read the "ts ... ns" field of one record via ts_nsec_pattern.
+ *
+ * Returns the parsed value.  When the captured text is empty, is followed by
+ * other characters, or strtoull() set errno, prints a diagnostic and returns
+ * -1 converted to __u64.
+ */
+static __u64 get_ts_nsec(char *buf)
+{
+       char digits[FIELD_BUFF] = {0};
+       char *end;
+       __u64 value;
+
+       search_pattern(&ts_nsec_pattern, digits, buf);
+       errno = 0;
+       value = strtoull(digits, &end, 10);
+       if (errno == 0 && end != digits && *end == '\0')
+               return value;
+
+       printf("wrong ts_nsec in follow buf:\n%s\n", buf);
+       return -1;
+}
+
+/*
+ * Read the "free_ts ... ns" field of one record via free_ts_nsec_pattern.
+ *
+ * Returns the parsed value.  When the captured text is empty, is followed by
+ * other characters, or strtoull() set errno, prints a diagnostic and returns
+ * -1 converted to __u64.
+ */
+static __u64 get_free_ts_nsec(char *buf)
+{
+       char digits[FIELD_BUFF] = {0};
+       char *end;
+       __u64 value;
+
+       search_pattern(&free_ts_nsec_pattern, digits, buf);
+       errno = 0;
+       value = strtoull(digits, &end, 10);
+       if (errno == 0 && end != digits && *end == '\0')
+               return value;
+
+       printf("wrong free_ts_nsec in follow buf:\n%s\n", buf);
+       return -1;
+}
+
+/*
+ * Extract the task command name of one record via comm_pattern.
+ *
+ * Returns a heap buffer of TASK_COMM_LEN bytes that the caller must free().
+ * The buffer starts zero-filled, so it holds an empty string when the
+ * pattern does not match @buf.  Exits with status 1 if the buffer cannot be
+ * allocated.
+ */
+static char *get_comm(char *buf)
+{
+       char *comm_str = calloc(TASK_COMM_LEN, sizeof(char));
+
+       if (!comm_str) {
+               printf("Out of memory\n");
+               exit(1);
+       }
+
+       /*
+        * A failed match is reported by search_pattern() itself and leaves
+        * the buffer empty, so there is nothing further to check here.
+        */
+       search_pattern(&comm_pattern, comm_str, buf);
+
+       return comm_str;
+}
+
+/*
+ * Decide whether the record in @buf passes every filter enabled in the
+ * file-scope "filter" mask.
+ *
+ * Returns false as soon as one enabled filter rejects the record: a non-zero
+ * free timestamp for FILTER_UNRELEASE, or a pid / tgid / command name that
+ * differs from the value stored in fc.  Returns true otherwise.
+ */
+static bool is_need(char *buf)
+{
+       if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0)
+               return false;
+       if ((filter & FILTER_PID) && get_pid(buf) != fc.pid)
+               return false;
+       if ((filter & FILTER_TGID) && get_tgid(buf) != fc.tgid)
+               return false;
+
+       /*
+        * get_comm() hands back a heap buffer: only ask for it when the name
+        * filter is active, and release it whatever the outcome.
+        */
+       if (filter & FILTER_COMM) {
+               char *comm = get_comm(buf);
+               bool match = strncmp(comm, fc.comm, TASK_COMM_LEN) == 0;
+
+               free(comm);
+               if (!match)
+                       return false;
+       }
+       return true;
 }
 
 static void add_list(char *buf, int len)
 {
        if (list_size != 0 &&
-           len == list[list_size-1].len &&
-           memcmp(buf, list[list_size-1].txt, len) == 0) {
+               len == list[list_size-1].len &&
+               memcmp(buf, list[list_size-1].txt, len) == 0) {
                list[list_size-1].num++;
                list[list_size-1].page_num += get_page_num(buf);
                return;
@@ -115,12 +376,27 @@ static void add_list(char *buf, int len)
                printf("max_size too small??\n");
                exit(1);
        }
+       if (!is_need(buf))
+               return;
+       list[list_size].pid = get_pid(buf);
+       list[list_size].tgid = get_tgid(buf);
+       list[list_size].comm = get_comm(buf);
        list[list_size].txt = malloc(len+1);
+       if (!list[list_size].txt) {
+               printf("Out of memory\n");
+               exit(1);
+       }
+       memcpy(list[list_size].txt, buf, len);
+       list[list_size].txt[len] = 0;
        list[list_size].len = len;
        list[list_size].num = 1;
        list[list_size].page_num = get_page_num(buf);
-       memcpy(list[list_size].txt, buf, len);
-       list[list_size].txt[len] = 0;
+
+       list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: "";
+       if (*list[list_size].stacktrace == '\n')
+               list[list_size].stacktrace++;
+       list[list_size].ts_nsec = get_ts_nsec(buf);
+       list[list_size].free_ts_nsec = get_free_ts_nsec(buf);
        list_size++;
        if (list_size % 1000 == 0) {
                printf("loaded %d\r", list_size);
@@ -128,29 +404,129 @@ static void add_list(char *buf, int len)
        }
 }
 
+/*
+ * Parse the comma-separated rule list given to --cull and OR the matching
+ * CULL_* bits into the file-scope "cull" mask.
+ *
+ * Each rule may be spelled with its long or its short name.  Returns true if
+ * every rule was recognised.  Parsing stops at the first unknown rule and
+ * false is returned; bits set by earlier rules are left in place.
+ */
+static bool parse_cull_args(const char *arg_str)
+{
+       static const struct {
+               const char *name;
+               const char *abbrev;
+               int bit;
+       } rules[] = {
+               { "pid", "p", CULL_PID },
+               { "tgid", "tg", CULL_TGID },
+               { "name", "n", CULL_COMM },
+               { "stacktrace", "st", CULL_STACKTRACE },
+               { "free", "f", CULL_UNRELEASE },
+       };
+       const int nr_rules = sizeof(rules) / sizeof(rules[0]);
+       int size = 0;
+       char **args = explode(',', arg_str, &size);
+       bool ok = true;
+
+       for (int i = 0; ok && i < size; i++) {
+               int r;
+
+               for (r = 0; r < nr_rules; r++) {
+                       if (!strcmp(args[i], rules[r].name) ||
+                           !strcmp(args[i], rules[r].abbrev))
+                               break;
+               }
+               if (r == nr_rules)
+                       ok = false;
+               else
+                       cull |= rules[r].bit;
+       }
+       free_explode(args, size);
+       return ok;
+}
+
 #define BUF_SIZE       (128 * 1024)
 
+/* Print the option summary for page_owner_sort to stdout. */
 static void usage(void)
 {
-       printf("Usage: ./page_owner_sort [-m] <input> <output>\n"
-               "-m     Sort by total memory. If this option is unset, sort by times\n"
+       printf("Usage: ./page_owner_sort [OPTIONS] <input> <output>\n"
+               "-m\t\tSort by total memory.\n"
+               "-s\t\tSort by the stack trace.\n"
+               "-t\t\tSort by times (default).\n"
+               "-p\t\tSort by pid.\n"
+               "-P\t\tSort by tgid.\n"
+               "-n\t\tSort by task command name.\n"
+               "-a\t\tSort by memory allocate time.\n"
+               "-r\t\tSort by memory release time.\n"
+               "-c\t\tCull by comparing stacktrace instead of total block.\n"
+               "-f\t\tFilter out the information of blocks whose memory has been released.\n"
+               "--pid <PID>\tSelect by pid. This selects the information of blocks whose process ID number equals to <PID>.\n"
+               "--tgid <TGID>\tSelect by tgid. This selects the information of blocks whose Thread Group ID number equals to <TGID>.\n"
+               "--name <command>\n\t\tSelect by command name. This selects the information of blocks whose command name identical to <command>.\n"
+               "--cull <rules>\tCull by user-defined rules. <rules> is a single argument in the form of a comma-separated list with some common fields predefined\n"
        );
 }
 
 int main(int argc, char **argv)
 {
+       int (*cmp)(const void *, const void *) = compare_num;
        FILE *fin, *fout;
-       char *buf;
+       char *buf, *endptr;
        int ret, i, count;
-       struct block_list *list2;
        struct stat st;
-       int err;
        int opt;
-
-       while ((opt = getopt(argc, argv, "m")) != -1)
+       struct option longopts[] = {
+               { "pid", required_argument, NULL, 1 },
+               { "tgid", required_argument, NULL, 2 },
+               { "name", required_argument, NULL, 3 },
+               { "cull",  required_argument, NULL, 4 },
+               { 0, 0, 0, 0},
+       };
+
+       while ((opt = getopt_long(argc, argv, "acfmnprstP", longopts, NULL)) != -1)
                switch (opt) {
+               case 'a':
+                       cmp = compare_ts;
+                       break;
+               case 'c':
+                       cull = cull | CULL_STACKTRACE;
+                       break;
+               case 'f':
+                       filter = filter | FILTER_UNRELEASE;
+                       break;
                case 'm':
-                       sort_by_memory = 1;
+                       cmp = compare_page_num;
+                       break;
+               case 'p':
+                       cmp = compare_pid;
+                       break;
+               case 'r':
+                       cmp = compare_free_ts;
+                       break;
+               case 's':
+                       cmp = compare_stacktrace;
+                       break;
+               case 't':
+                       cmp = compare_num;
+                       break;
+               case 'P':
+                       cmp = compare_tgid;
+                       break;
+               case 'n':
+                       cmp = compare_comm;
+                       break;
+               case 1:
+                       filter = filter | FILTER_PID;
+                       errno = 0;
+                       fc.pid = strtol(optarg, &endptr, 10);
+                       if (errno != 0 || endptr == optarg || *endptr != '\0') {
+                               printf("wrong/invalid pid in from the command line:%s\n", optarg);
+                               exit(1);
+                       }
+                       break;
+               case 2:
+                       filter = filter | FILTER_TGID;
+                       errno = 0;
+                       fc.tgid = strtol(optarg, &endptr, 10);
+                       if (errno != 0 || endptr == optarg || *endptr != '\0') {
+                               printf("wrong/invalid tgid in from the command line:%s\n", optarg);
+                               exit(1);
+                       }
+                       break;
+               case 3:
+                       filter = filter | FILTER_COMM;
+                       strncpy(fc.comm, optarg, TASK_COMM_LEN);
+                       fc.comm[TASK_COMM_LEN-1] = '\0';
+                       break;
+               case 4:
+                       if (!parse_cull_args(optarg)) {
+                               printf("wrong argument after --cull in from the command line:%s\n",
+                                               optarg);
+                               exit(1);
+                       }
                        break;
                default:
                        usage();
@@ -170,13 +546,12 @@ int main(int argc, char **argv)
                exit(1);
        }
 
-       err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE);
-       if (err != 0 || order_pattern.re_nsub != 1) {
-               printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n",
-                       argv[0], err);
-               exit(1);
-       }
-
+       check_regcomp(&order_pattern, "order\\s*([0-9]*),");
+       check_regcomp(&pid_pattern, "pid\\s*([0-9]*),");
+       check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) ");
+       check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts");
+       check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,");
+       check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns");
        fstat(fileno(fin), &st);
        max_size = st.st_size / 100; /* hack ... */
 
@@ -199,35 +574,48 @@ int main(int argc, char **argv)
 
        printf("sorting ....\n");
 
-       qsort(list, list_size, sizeof(list[0]), compare_txt);
-
-       list2 = malloc(sizeof(*list) * list_size);
-       if (!list2) {
-               printf("Out of memory\n");
-               exit(1);
-       }
+       qsort(list, list_size, sizeof(list[0]), compare_cull_condition);
 
        printf("culling\n");
 
        for (i = count = 0; i < list_size; i++) {
                if (count == 0 ||
-                   strcmp(list2[count-1].txt, list[i].txt) != 0) {
-                       list2[count++] = list[i];
+                   compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) {
+                       list[count++] = list[i];
                } else {
-                       list2[count-1].num += list[i].num;
-                       list2[count-1].page_num += list[i].page_num;
+                       list[count-1].num += list[i].num;
+                       list[count-1].page_num += list[i].page_num;
                }
        }
 
-       if (sort_by_memory)
-               qsort(list2, count, sizeof(list[0]), compare_page_num);
-       else
-               qsort(list2, count, sizeof(list[0]), compare_num);
-
-       for (i = 0; i < count; i++)
-               fprintf(fout, "%d times, %d pages:\n%s\n",
-                               list2[i].num, list2[i].page_num, list2[i].txt);
-
+       qsort(list, count, sizeof(list[0]), cmp);
+
+       for (i = 0; i < count; i++) {
+               if (cull == 0)
+                       fprintf(fout, "%d times, %d pages:\n%s\n",
+                                       list[i].num, list[i].page_num, list[i].txt);
+               else {
+                       fprintf(fout, "%d times, %d pages",
+                                       list[i].num, list[i].page_num);
+                       if (cull & CULL_PID || filter & FILTER_PID)
+                               fprintf(fout, ", PID %d", list[i].pid);
+                       if (cull & CULL_TGID || filter & FILTER_TGID)
+                               fprintf(fout, ", TGID %d", list[i].pid);
+                       if (cull & CULL_COMM || filter & FILTER_COMM)
+                               fprintf(fout, ", task_comm_name: %s", list[i].comm);
+                       if (cull & CULL_UNRELEASE)
+                               fprintf(fout, " (%s)",
+                                               list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED");
+                       if (cull & CULL_STACKTRACE)
+                               fprintf(fout, ":\n%s", list[i].stacktrace);
+                       fprintf(fout, "\n");
+               }
+       }
        regfree(&order_pattern);
+       regfree(&pid_pattern);
+       regfree(&tgid_pattern);
+       regfree(&comm_pattern);
+       regfree(&ts_nsec_pattern);
+       regfree(&free_ts_nsec_pattern);
        return 0;
 }