Merge branch 'akpm' (patches from Andrew)
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 3 Sep 2021 17:08:28 +0000 (10:08 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 3 Sep 2021 17:08:28 +0000 (10:08 -0700)
Merge misc updates from Andrew Morton:
 "173 patches.

  Subsystems affected by this series: ia64, ocfs2, block, and mm (debug,
  pagecache, gup, swap, shmem, memcg, selftests, pagemap, mremap,
  bootmem, sparsemem, vmalloc, kasan, pagealloc, memory-failure,
  hugetlb, userfaultfd, vmscan, compaction, mempolicy, memblock,
  oom-kill, migration, ksm, percpu, vmstat, and madvise)"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (173 commits)
  mm/madvise: add MADV_WILLNEED to process_madvise()
  mm/vmstat: remove unneeded return value
  mm/vmstat: simplify the array size calculation
  mm/vmstat: correct some wrong comments
  mm/percpu,c: remove obsolete comments of pcpu_chunk_populated()
  selftests: vm: add COW time test for KSM pages
  selftests: vm: add KSM merging time test
  mm: KSM: fix data type
  selftests: vm: add KSM merging across nodes test
  selftests: vm: add KSM zero page merging test
  selftests: vm: add KSM unmerge test
  selftests: vm: add KSM merge test
  mm/migrate: correct kernel-doc notation
  mm: wire up syscall process_mrelease
  mm: introduce process_mrelease system call
  memblock: make memblock_find_in_range method private
  mm/mempolicy.c: use in_task() in mempolicy_slab_node()
  mm/mempolicy: unify the create() func for bind/interleave/prefer-many policies
  mm/mempolicy: advertise new MPOL_PREFERRED_MANY
  mm/hugetlb: add support for mempolicy MPOL_PREFERRED_MANY
  ...

171 files changed:
Documentation/ABI/testing/sysfs-kernel-mm-numa [new file with mode: 0644]
Documentation/admin-guide/mm/numa_memory_policy.rst
Documentation/admin-guide/sysctl/vm.rst
Documentation/core-api/cachetlb.rst
Documentation/dev-tools/kasan.rst
Documentation/translations/zh_CN/core-api/cachetlb.rst
Documentation/vm/hwpoison.rst
arch/alpha/kernel/syscalls/syscall.tbl
arch/arm/include/asm/cacheflush.h
arch/arm/kernel/setup.c
arch/arm/mm/flush.c
arch/arm/mm/nommu.c
arch/arm/tools/syscall.tbl
arch/arm64/include/asm/unistd.h
arch/arm64/include/asm/unistd32.h
arch/arm64/kvm/hyp/reserved_mem.c
arch/arm64/mm/init.c
arch/csky/abiv1/cacheflush.c
arch/csky/abiv1/inc/abi/cacheflush.h
arch/csky/kernel/probes/kprobes.c
arch/ia64/include/asm/meminit.h
arch/ia64/kernel/acpi.c
arch/ia64/kernel/setup.c
arch/ia64/kernel/syscalls/syscall.tbl
arch/m68k/kernel/syscalls/syscall.tbl
arch/microblaze/include/asm/page.h
arch/microblaze/include/asm/pgtable.h
arch/microblaze/kernel/syscalls/syscall.tbl
arch/microblaze/mm/init.c
arch/microblaze/mm/pgtable.c
arch/mips/include/asm/cacheflush.h
arch/mips/kernel/setup.c
arch/mips/kernel/syscalls/syscall_n32.tbl
arch/mips/kernel/syscalls/syscall_n64.tbl
arch/mips/kernel/syscalls/syscall_o32.tbl
arch/nds32/include/asm/cacheflush.h
arch/nds32/mm/cacheflush.c
arch/parisc/include/asm/cacheflush.h
arch/parisc/kernel/cache.c
arch/parisc/kernel/syscalls/syscall.tbl
arch/powerpc/kernel/syscalls/syscall.tbl
arch/powerpc/platforms/pseries/hotplug-memory.c
arch/riscv/mm/init.c
arch/s390/kernel/setup.c
arch/s390/kernel/syscalls/syscall.tbl
arch/s390/mm/fault.c
arch/sh/include/asm/cacheflush.h
arch/sh/kernel/syscalls/syscall.tbl
arch/sparc/kernel/syscalls/syscall.tbl
arch/x86/entry/syscalls/syscall_32.tbl
arch/x86/entry/syscalls/syscall_64.tbl
arch/x86/kernel/aperture_64.c
arch/x86/kernel/ldt.c
arch/x86/mm/init.c
arch/x86/mm/numa.c
arch/x86/mm/numa_emulation.c
arch/x86/realmode/init.c
arch/xtensa/kernel/syscalls/syscall.tbl
block/blk-map.c
drivers/acpi/tables.c
drivers/base/arch_numa.c
drivers/base/memory.c
drivers/mmc/host/jz4740_mmc.c
drivers/mmc/host/mmc_spi.c
drivers/of/of_reserved_mem.c
fs/drop_caches.c
fs/exec.c
fs/fcntl.c
fs/fs-writeback.c
fs/fs_context.c
fs/inode.c
fs/locks.c
fs/namei.c
fs/namespace.c
fs/ocfs2/dlmglue.c
fs/ocfs2/quota_global.c
fs/ocfs2/quota_local.c
fs/pipe.c
fs/select.c
fs/userfaultfd.c
include/linux/backing-dev-defs.h
include/linux/backing-dev.h
include/linux/buffer_head.h
include/linux/compaction.h
include/linux/highmem.h
include/linux/hugetlb_cgroup.h
include/linux/memblock.h
include/linux/memcontrol.h
include/linux/memory.h
include/linux/mempolicy.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mmzone.h
include/linux/pagemap.h
include/linux/sched/mm.h
include/linux/shmem_fs.h
include/linux/swap.h
include/linux/syscalls.h
include/linux/userfaultfd_k.h
include/linux/vm_event_item.h
include/linux/vmpressure.h
include/linux/writeback.h
include/trace/events/migrate.h
include/uapi/asm-generic/unistd.h
include/uapi/linux/mempolicy.h
ipc/msg.c
ipc/namespace.c
ipc/sem.c
ipc/shm.c
kernel/cgroup/namespace.c
kernel/nsproxy.c
kernel/pid_namespace.c
kernel/signal.c
kernel/sys_ni.c
kernel/sysctl.c
kernel/time/namespace.c
kernel/time/posix-timers.c
kernel/user_namespace.c
lib/scatterlist.c
lib/test_kasan.c
lib/test_kasan_module.c
lib/test_vmalloc.c
mm/backing-dev.c
mm/bootmem_info.c
mm/compaction.c
mm/debug_vm_pgtable.c
mm/filemap.c
mm/gup.c
mm/huge_memory.c
mm/hugetlb.c
mm/hwpoison-inject.c
mm/internal.h
mm/kasan/hw_tags.c
mm/kasan/kasan.h
mm/kasan/report.c
mm/khugepaged.c
mm/ksm.c
mm/madvise.c
mm/memblock.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mmap.c
mm/mremap.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_isolation.c
mm/percpu.c
mm/shmem.c
mm/sparse.c
mm/swap.c
mm/swapfile.c
mm/truncate.c
mm/userfaultfd.c
mm/vmalloc.c
mm/vmpressure.c
mm/vmscan.c
mm/vmstat.c
security/tomoyo/domain.c
tools/testing/scatterlist/linux/mm.h
tools/testing/selftests/vm/.gitignore
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/charge_reserved_hugetlb.sh
tools/testing/selftests/vm/hugetlb_reparenting_test.sh
tools/testing/selftests/vm/ksm_tests.c [new file with mode: 0644]
tools/testing/selftests/vm/mlock-random-test.c
tools/testing/selftests/vm/run_vmtests.sh
tools/testing/selftests/vm/userfaultfd.c

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-numa b/Documentation/ABI/testing/sysfs-kernel-mm-numa
new file mode 100644 (file)
index 0000000..77e559d
--- /dev/null
@@ -0,0 +1,24 @@
+What:          /sys/kernel/mm/numa/
+Date:          June 2021
+Contact:       Linux memory management mailing list <linux-mm@kvack.org>
+Description:   Interface for NUMA
+
+What:          /sys/kernel/mm/numa/demotion_enabled
+Date:          June 2021
+Contact:       Linux memory management mailing list <linux-mm@kvack.org>
+Description:   Enable/disable demoting pages during reclaim
+
+               Page migration during reclaim is intended for systems
+               with tiered memory configurations.  These systems have
+               multiple types of memory with varied performance
+               characteristics instead of plain NUMA systems where
+               the same kind of memory is found at varied distances.
+               Allowing page migration during reclaim enables these
+               systems to migrate pages from fast tiers to slow tiers
+               when the fast tier is under pressure.  This migration
+               is performed before swap.  It may move data to a NUMA
+               node that does not fall into the cpuset of the
+               allocating process which might be construed to violate
+               the guarantees of cpusets.  This should not be enabled
+               on systems which need strict cpuset location
+               guarantees.
index 067a90a..64fd0ba 100644 (file)
@@ -245,6 +245,13 @@ MPOL_INTERLEAVED
        address range or file.  During system boot up, the temporary
        interleaved system default policy works in this mode.
 
+MPOL_PREFERRED_MANY
+       This mode specifies that the allocation should preferably be
+       satisfied from the nodemask specified in the policy. If there is
+       memory pressure on all nodes in the nodemask, the allocation
+       can fall back to all existing numa nodes. This is effectively
+       MPOL_PREFERRED allowed for a mask rather than a single node.
+
 NUMA memory policy supports the following optional mode flags:
 
 MPOL_F_STATIC_NODES
@@ -253,10 +260,10 @@ MPOL_F_STATIC_NODES
        nodes changes after the memory policy has been defined.
 
        Without this flag, any time a mempolicy is rebound because of a
-       change in the set of allowed nodes, the node (Preferred) or
-       nodemask (Bind, Interleave) is remapped to the new set of
-       allowed nodes.  This may result in nodes being used that were
-       previously undesired.
+        change in the set of allowed nodes, the preferred nodemask (Preferred
+        Many), preferred node (Preferred) or nodemask (Bind, Interleave) is
+        remapped to the new set of allowed nodes.  This may result in nodes
+        being used that were previously undesired.
 
        With this flag, if the user-specified nodes overlap with the
        nodes allowed by the task's cpuset, then the memory policy is
index 003d5cc..5e79520 100644 (file)
@@ -118,7 +118,8 @@ compaction_proactiveness
 
 This tunable takes a value in the range [0, 100] with a default value of
 20. This tunable determines how aggressively compaction is done in the
-background. Setting it to 0 disables proactive compaction.
+background. Writing a non-zero value to this tunable immediately
+triggers proactive compaction. Setting it to 0 disables proactive compaction.
 
 Note that compaction has a non-trivial system-wide impact as pages
 belonging to different processes are moved around, which could also lead
index fe4290e..8aed910 100644 (file)
@@ -271,10 +271,15 @@ maps this page at its virtual address.
 
   ``void flush_dcache_page(struct page *page)``
 
-       Any time the kernel writes to a page cache page, _OR_
-       the kernel is about to read from a page cache page and
-       user space shared/writable mappings of this page potentially
-       exist, this routine is called.
+        This routine must be called when:
+
+         a) the kernel wrote to a page that is in the page cache
+            and / or in high memory
+         b) the kernel is about to read from a page cache page and user space
+            shared/writable mappings of this page potentially exist.  Note
+            that {get,pin}_user_pages{_fast} already call flush_dcache_page
+            on any page found in the user address space and thus driver
+            code rarely needs to take this into account.
 
        .. note::
 
@@ -284,38 +289,34 @@ maps this page at its virtual address.
              handling vfs symlinks in the page cache need not call
              this interface at all.
 
-       The phrase "kernel writes to a page cache page" means,
-       specifically, that the kernel executes store instructions
-       that dirty data in that page at the page->virtual mapping
-       of that page.  It is important to flush here to handle
-       D-cache aliasing, to make sure these kernel stores are
-       visible to user space mappings of that page.
-
-       The corollary case is just as important, if there are users
-       which have shared+writable mappings of this file, we must make
-       sure that kernel reads of these pages will see the most recent
-       stores done by the user.
-
-       If D-cache aliasing is not an issue, this routine may
-       simply be defined as a nop on that architecture.
-
-        There is a bit set aside in page->flags (PG_arch_1) as
-       "architecture private".  The kernel guarantees that,
-       for pagecache pages, it will clear this bit when such
-       a page first enters the pagecache.
-
-       This allows these interfaces to be implemented much more
-       efficiently.  It allows one to "defer" (perhaps indefinitely)
-       the actual flush if there are currently no user processes
-       mapping this page.  See sparc64's flush_dcache_page and
-       update_mmu_cache implementations for an example of how to go
-       about doing this.
-
-       The idea is, first at flush_dcache_page() time, if
-       page->mapping->i_mmap is an empty tree, just mark the architecture
-       private page flag bit.  Later, in update_mmu_cache(), a check is
-       made of this flag bit, and if set the flush is done and the flag
-       bit is cleared.
+       The phrase "kernel writes to a page cache page" means, specifically,
+       that the kernel executes store instructions that dirty data in that
+       page at the page->virtual mapping of that page.  It is important to
+       flush here to handle D-cache aliasing, to make sure these kernel stores
+       are visible to user space mappings of that page.
+
+       The corollary case is just as important, if there are users which have
+       shared+writable mappings of this file, we must make sure that kernel
+       reads of these pages will see the most recent stores done by the user.
+
+       If D-cache aliasing is not an issue, this routine may simply be defined
+       as a nop on that architecture.
+
+        There is a bit set aside in page->flags (PG_arch_1) as "architecture
+       private".  The kernel guarantees that, for pagecache pages, it will
+       clear this bit when such a page first enters the pagecache.
+
+       This allows these interfaces to be implemented much more efficiently.
+       It allows one to "defer" (perhaps indefinitely) the actual flush if
+       there are currently no user processes mapping this page.  See sparc64's
+       flush_dcache_page and update_mmu_cache implementations for an example
+       of how to go about doing this.
+
+       The idea is, first at flush_dcache_page() time, if page_file_mapping()
+       returns a mapping, and mapping_mapped on that mapping returns %false,
+       just mark the architecture private page flag bit.  Later, in
+       update_mmu_cache(), a check is made of this flag bit, and if set the
+       flush is done and the flag bit is cleared.
 
        .. important::
 
@@ -351,19 +352,6 @@ maps this page at its virtual address.
        architectures).  For incoherent architectures, it should flush
        the cache of the page at vmaddr.
 
-  ``void flush_kernel_dcache_page(struct page *page)``
-
-       When the kernel needs to modify a user page is has obtained
-       with kmap, it calls this function after all modifications are
-       complete (but before kunmapping it) to bring the underlying
-       page up to date.  It is assumed here that the user has no
-       incoherent cached copies (i.e. the original page was obtained
-       from a mechanism like get_user_pages()).  The default
-       implementation is a nop and should remain so on all coherent
-       architectures.  On incoherent architectures, this should flush
-       the kernel cache for page (using page_address(page)).
-
-
   ``void flush_icache_range(unsigned long start, unsigned long end)``
 
        When the kernel stores into addresses that it will execute
index 83ec4a5..21dc03b 100644 (file)
@@ -181,9 +181,16 @@ By default, KASAN prints a bug report only for the first invalid memory access.
 With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This
 effectively disables ``panic_on_warn`` for KASAN reports.
 
+Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot
+parameter can be used to control panic and reporting behaviour:
+
+- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
+  report or also panic the kernel (default: ``report``). The panic happens even
+  if ``kasan_multi_shot`` is enabled.
+
 Hardware tag-based KASAN mode (see the section about various modes below) is
 intended for use in production as a security mitigation. Therefore, it supports
-boot parameters that allow disabling KASAN or controlling its features.
+additional boot parameters that allow disabling KASAN or controlling features:
 
 - ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
 
@@ -199,10 +206,6 @@ boot parameters that allow disabling KASAN or controlling its features.
 - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
   traces collection (default: ``on``).
 
-- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
-  report or also panic the kernel (default: ``report``). The panic happens even
-  if ``kasan_multi_shot`` is enabled.
-
 Implementation details
 ----------------------
 
index 8376485..55827b8 100644 (file)
@@ -298,15 +298,6 @@ HyperSparc cpu就是这样一个具有这种属性的cpu。
        用。默认的实现是nop(对于所有相干的架构应该保持这样)。对于不一致性
        的架构,它应该刷新vmaddr处的页面缓存。
 
-  ``void flush_kernel_dcache_page(struct page *page)``
-
-       当内核需要修改一个用kmap获得的用户页时,它会在所有修改完成后(但在
-       kunmapping之前)调用这个函数,以使底层页面达到最新状态。这里假定用
-       户没有不一致性的缓存副本(即原始页面是从类似get_user_pages()的机制
-       中获得的)。默认的实现是一个nop,在所有相干的架构上都应该如此。在不
-       一致性的架构上,这应该刷新内核缓存中的页面(使用page_address(page))。
-
-
   ``void flush_icache_range(unsigned long start, unsigned long end)``
 
        当内核存储到它将执行的地址中时(例如在加载模块时),这个函数被调用。
index a5c8842..89b5f7a 100644 (file)
@@ -180,7 +180,6 @@ Limitations
 ===========
 - Not all page types are supported and never will. Most kernel internal
   objects cannot be recovered, only LRU pages for now.
-- Right now hugepage support is missing.
 
 ---
 Andi Kleen, Oct 2009
index 7ac22e0..e4a041c 100644 (file)
 554    common  landlock_create_ruleset         sys_landlock_create_ruleset
 555    common  landlock_add_rule               sys_landlock_add_rule
 556    common  landlock_restrict_self          sys_landlock_restrict_self
+# 557 reserved for memfd_secret
+558    common  process_mrelease                sys_process_mrelease
index 2e24e76..5e56288 100644 (file)
@@ -291,6 +291,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 extern void flush_dcache_page(struct page *);
 
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
 static inline void flush_kernel_vmap_range(void *addr, int size)
 {
        if ((cache_is_vivt() || cache_is_vipt_aliasing()))
@@ -312,9 +313,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
                __flush_anon_page(vma, page, vmaddr);
 }
 
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-extern void flush_kernel_dcache_page(struct page *);
-
 #define flush_dcache_mmap_lock(mapping)                xa_lock_irq(&mapping->i_pages)
 #define flush_dcache_mmap_unlock(mapping)      xa_unlock_irq(&mapping->i_pages)
 
index f97eb23..284a80c 100644 (file)
@@ -1012,31 +1012,25 @@ static void __init reserve_crashkernel(void)
                unsigned long long lowmem_max = __pa(high_memory - 1) + 1;
                if (crash_max > lowmem_max)
                        crash_max = lowmem_max;
-               crash_base = memblock_find_in_range(CRASH_ALIGN, crash_max,
-                                                   crash_size, CRASH_ALIGN);
+
+               crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+                                                      CRASH_ALIGN, crash_max);
                if (!crash_base) {
                        pr_err("crashkernel reservation failed - No suitable area found.\n");
                        return;
                }
        } else {
+               unsigned long long crash_max = crash_base + crash_size;
                unsigned long long start;
 
-               start = memblock_find_in_range(crash_base,
-                                              crash_base + crash_size,
-                                              crash_size, SECTION_SIZE);
-               if (start != crash_base) {
+               start = memblock_phys_alloc_range(crash_size, SECTION_SIZE,
+                                                 crash_base, crash_max);
+               if (!start) {
                        pr_err("crashkernel reservation failed - memory is in use.\n");
                        return;
                }
        }
 
-       ret = memblock_reserve(crash_base, crash_size);
-       if (ret < 0) {
-               pr_warn("crashkernel reservation failed - memory is in use (0x%lx)\n",
-                       (unsigned long)crash_base);
-               return;
-       }
-
        pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
                (unsigned long)(crash_size >> 20),
                (unsigned long)(crash_base >> 20),
index 6d89db7..7ff9fee 100644 (file)
@@ -345,39 +345,6 @@ void flush_dcache_page(struct page *page)
 }
 EXPORT_SYMBOL(flush_dcache_page);
 
-/*
- * Ensure cache coherency for the kernel mapping of this page. We can
- * assume that the page is pinned via kmap.
- *
- * If the page only exists in the page cache and there are no user
- * space mappings, this is a no-op since the page was already marked
- * dirty at creation.  Otherwise, we need to flush the dirty kernel
- * cache lines directly.
- */
-void flush_kernel_dcache_page(struct page *page)
-{
-       if (cache_is_vivt() || cache_is_vipt_aliasing()) {
-               struct address_space *mapping;
-
-               mapping = page_mapping_file(page);
-
-               if (!mapping || mapping_mapped(mapping)) {
-                       void *addr;
-
-                       addr = page_address(page);
-                       /*
-                        * kmap_atomic() doesn't set the page virtual
-                        * address for highmem pages, and
-                        * kunmap_atomic() takes care of cache
-                        * flushing already.
-                        */
-                       if (!IS_ENABLED(CONFIG_HIGHMEM) || addr)
-                               __cpuc_flush_dcache_area(addr, PAGE_SIZE);
-               }
-       }
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
 /*
  * Flush an anonymous page so that users of get_user_pages()
  * can safely access the data.  The expected sequence is:
index 8b3d719..2658f52 100644 (file)
@@ -166,12 +166,6 @@ void flush_dcache_page(struct page *page)
 }
 EXPORT_SYMBOL(flush_dcache_page);
 
-void flush_kernel_dcache_page(struct page *page)
-{
-       __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
 void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
                       unsigned long uaddr, void *dst, const void *src,
                       unsigned long len)
index f8a2d5a..7e0a9b6 100644 (file)
 444    common  landlock_create_ruleset         sys_landlock_create_ruleset
 445    common  landlock_add_rule               sys_landlock_add_rule
 446    common  landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    common  process_mrelease                sys_process_mrelease
index 727bfc3..3cb206a 100644 (file)
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls                (__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END            (__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls           447
+#define __NR_compat_syscalls           449
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
index 03d4ca4..4e99e4b 100644 (file)
@@ -901,6 +901,8 @@ __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset)
 __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
 #define __NR_landlock_restrict_self 446
 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
+#define __NR_process_mrelease 448
+__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
 
 /*
  * Please add new compat syscalls above this comment and update
index d654921..578670e 100644 (file)
@@ -92,12 +92,10 @@ void __init kvm_hyp_reserve(void)
         * this is unmapped from the host stage-2, and fallback to PAGE_SIZE.
         */
        hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
-       hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
-                                             ALIGN(hyp_mem_size, PMD_SIZE),
-                                             PMD_SIZE);
+       hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE),
+                                          PMD_SIZE);
        if (!hyp_mem_base)
-               hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(),
-                                                     hyp_mem_size, PAGE_SIZE);
+               hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE);
        else
                hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
 
@@ -105,7 +103,6 @@ void __init kvm_hyp_reserve(void)
                kvm_err("Failed to reserve hyp memory\n");
                return;
        }
-       memblock_reserve(hyp_mem_base, hyp_mem_size);
 
        kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
                 hyp_mem_base);
index edc8e95..b16be52 100644 (file)
@@ -74,6 +74,7 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init;
 static void __init reserve_crashkernel(void)
 {
        unsigned long long crash_base, crash_size;
+       unsigned long long crash_max = arm64_dma_phys_limit;
        int ret;
 
        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
@@ -84,33 +85,18 @@ static void __init reserve_crashkernel(void)
 
        crash_size = PAGE_ALIGN(crash_size);
 
-       if (crash_base == 0) {
-               /* Current arm64 boot protocol requires 2MB alignment */
-               crash_base = memblock_find_in_range(0, arm64_dma_phys_limit,
-                               crash_size, SZ_2M);
-               if (crash_base == 0) {
-                       pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
-                               crash_size);
-                       return;
-               }
-       } else {
-               /* User specifies base address explicitly. */
-               if (!memblock_is_region_memory(crash_base, crash_size)) {
-                       pr_warn("cannot reserve crashkernel: region is not memory\n");
-                       return;
-               }
+       /* User specifies base address explicitly. */
+       if (crash_base)
+               crash_max = crash_base + crash_size;
 
-               if (memblock_is_region_reserved(crash_base, crash_size)) {
-                       pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n");
-                       return;
-               }
-
-               if (!IS_ALIGNED(crash_base, SZ_2M)) {
-                       pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n");
-                       return;
-               }
+       /* Current arm64 boot protocol requires 2MB alignment */
+       crash_base = memblock_phys_alloc_range(crash_size, SZ_2M,
+                                              crash_base, crash_max);
+       if (!crash_base) {
+               pr_warn("cannot allocate crashkernel (size:0x%llx)\n",
+                       crash_size);
+               return;
        }
-       memblock_reserve(crash_base, crash_size);
 
        pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n",
                crash_base, crash_base + crash_size, crash_size >> 20);
index 07ff17e..fb91b06 100644 (file)
@@ -56,17 +56,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr,
        }
 }
 
-void flush_kernel_dcache_page(struct page *page)
-{
-       struct address_space *mapping;
-
-       mapping = page_mapping_file(page);
-
-       if (!mapping || mapping_mapped(mapping))
-               dcache_wbinv_all();
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
 void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
        unsigned long end)
 {
index 6cab7af..ed62e20 100644 (file)
@@ -14,12 +14,10 @@ extern void flush_dcache_page(struct page *);
 #define flush_cache_page(vma, page, pfn)       cache_wbinv_all()
 #define flush_cache_dup_mm(mm)                 cache_wbinv_all()
 
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-extern void flush_kernel_dcache_page(struct page *);
-
 #define flush_dcache_mmap_lock(mapping)                xa_lock_irq(&mapping->i_pages)
 #define flush_dcache_mmap_unlock(mapping)      xa_unlock_irq(&mapping->i_pages)
 
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
 static inline void flush_kernel_vmap_range(void *addr, int size)
 {
        dcache_wbinv_all();
index 68b22b4..8fffa34 100644 (file)
@@ -283,8 +283,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr)
                 * normal page fault.
                 */
                regs->pc = (unsigned long) cur->addr;
-               if (!instruction_pointer(regs))
-                       BUG();
+               BUG_ON(!instruction_pointer(regs));
 
                if (kcb->kprobe_status == KPROBE_REENTER)
                        restore_previous_kprobe(kcb);
index 6c47a23..f1d5bf2 100644 (file)
@@ -29,7 +29,6 @@ struct rsvd_region {
 };
 
 extern struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1];
-extern int num_rsvd_regions;
 
 extern void find_memory (void);
 extern void reserve_memory (void);
@@ -40,7 +39,6 @@ extern unsigned long efi_memmap_init(u64 *s, u64 *e);
 extern int find_max_min_low_pfn (u64, u64, void *);
 
 extern unsigned long vmcore_find_descriptor_size(unsigned long address);
-extern int reserve_elfcorehdr(u64 *start, u64 *end);
 
 /*
  * For rounding an address to the next IA64_GRANULE_SIZE or order
index e2af6b1..96d13cb 100644 (file)
@@ -906,6 +906,6 @@ EXPORT_SYMBOL(acpi_unregister_ioapic);
 /*
  * acpi_suspend_lowlevel() - save kernel state and suspend.
  *
- * TBD when when IA64 starts to support suspend...
+ * TBD when IA64 starts to support suspend...
  */
 int acpi_suspend_lowlevel(void) { return 0; }
index dd595fb..31fb84d 100644 (file)
@@ -131,7 +131,7 @@ unsigned long ia64_cache_stride_shift = ~0;
  * We use a special marker for the end of memory and it uses the extra (+1) slot
  */
 struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1] __initdata;
-int num_rsvd_regions __initdata;
+static int num_rsvd_regions __initdata;
 
 
 /*
@@ -325,6 +325,31 @@ static inline void __init setup_crashkernel(unsigned long total, int *n)
 {}
 #endif
 
+#ifdef CONFIG_CRASH_DUMP
+static int __init reserve_elfcorehdr(u64 *start, u64 *end)
+{
+       u64 length;
+
+       /* We get the address using the kernel command line,
+        * but the size is extracted from the EFI tables.
+        * Both address and size are required for reservation
+        * to work properly.
+        */
+
+       if (!is_vmcore_usable())
+               return -EINVAL;
+
+       if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) {
+               vmcore_unusable();
+               return -EINVAL;
+       }
+
+       *start = (unsigned long)__va(elfcorehdr_addr);
+       *end = *start + length;
+       return 0;
+}
+#endif /* CONFIG_CRASH_DUMP */
+
 /**
  * reserve_memory - setup reserved memory areas
  *
@@ -522,32 +547,6 @@ static __init int setup_nomca(char *s)
 }
 early_param("nomca", setup_nomca);
 
-#ifdef CONFIG_CRASH_DUMP
-int __init reserve_elfcorehdr(u64 *start, u64 *end)
-{
-       u64 length;
-
-       /* We get the address using the kernel command line,
-        * but the size is extracted from the EFI tables.
-        * Both address and size are required for reservation
-        * to work properly.
-        */
-
-       if (!is_vmcore_usable())
-               return -EINVAL;
-
-       if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) {
-               vmcore_unusable();
-               return -EINVAL;
-       }
-
-       *start = (unsigned long)__va(elfcorehdr_addr);
-       *end = *start + length;
-       return 0;
-}
-
-#endif /* CONFIG_PROC_VMCORE */
-
 void __init
 setup_arch (char **cmdline_p)
 {
index 4b20224..6fea184 100644 (file)
 444    common  landlock_create_ruleset         sys_landlock_create_ruleset
 445    common  landlock_add_rule               sys_landlock_add_rule
 446    common  landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    common  process_mrelease                sys_process_mrelease
index 3ec1291..7976dff 100644 (file)
 444    common  landlock_create_ruleset         sys_landlock_create_ruleset
 445    common  landlock_add_rule               sys_landlock_add_rule
 446    common  landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    common  process_mrelease                sys_process_mrelease
index ce55097..4b8b2fa 100644 (file)
@@ -112,8 +112,7 @@ extern int page_is_ram(unsigned long pfn);
 #  define page_to_phys(page)     (page_to_pfn(page) << PAGE_SHIFT)
 
 #  define ARCH_PFN_OFFSET      (memory_start >> PAGE_SHIFT)
-#  define pfn_valid(pfn)       ((pfn) < (max_mapnr + ARCH_PFN_OFFSET))
-
+#  define pfn_valid(pfn)       ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET))
 # endif /* __ASSEMBLY__ */
 
 #define        virt_addr_valid(vaddr)  (pfn_valid(virt_to_pfn(vaddr)))
index 71cd547..c136a01 100644 (file)
@@ -443,8 +443,6 @@ extern int mem_init_done;
 
 asmlinkage void __init mmu_init(void);
 
-void __init *early_get_page(void);
-
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 
index 9be3ace..6b0e113 100644 (file)
 444    common  landlock_create_ruleset         sys_landlock_create_ruleset
 445    common  landlock_add_rule               sys_landlock_add_rule
 446    common  landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    common  process_mrelease                sys_process_mrelease
index ab55c70..952f35b 100644 (file)
@@ -265,18 +265,6 @@ asmlinkage void __init mmu_init(void)
        dma_contiguous_reserve(memory_start + lowmem_size - 1);
 }
 
-/* This is only called until mem_init is done. */
-void __init *early_get_page(void)
-{
-       /*
-        * Mem start + kernel_tlb -> here is limit
-        * because of mem mapping from head.S
-        */
-       return memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE,
-                               MEMBLOCK_LOW_LIMIT, memory_start + kernel_tlb,
-                               NUMA_NO_NODE);
-}
-
 void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask)
 {
        void *p;
index 38ccb90..c1833b1 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/mm_types.h>
 #include <linux/pgtable.h>
+#include <linux/memblock.h>
 
 #include <asm/pgalloc.h>
 #include <linux/io.h>
@@ -242,15 +243,13 @@ unsigned long iopa(unsigned long addr)
 
 __ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
 {
-       pte_t *pte;
-       if (mem_init_done) {
-               pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-       } else {
-               pte = (pte_t *)early_get_page();
-               if (pte)
-                       clear_page(pte);
-       }
-       return pte;
+       if (mem_init_done)
+               return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+       else
+               return memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE,
+                                             MEMBLOCK_LOW_LIMIT,
+                                             memory_start + kernel_tlb,
+                                             NUMA_NO_NODE);
 }
 
 void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
index d687b40..b3dc9c5 100644 (file)
@@ -125,13 +125,7 @@ static inline void kunmap_noncoherent(void)
        kunmap_coherent();
 }
 
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-static inline void flush_kernel_dcache_page(struct page *page)
-{
-       BUG_ON(cpu_has_dc_aliases && PageHighMem(page));
-       flush_dcache_page(page);
-}
-
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
 /*
  * For now flush_kernel_vmap_range and invalidate_kernel_vmap_range both do a
  * cache writeback and invalidate operation.
index 23a1403..f979adf 100644 (file)
@@ -452,8 +452,9 @@ static void __init mips_parse_crashkernel(void)
                return;
 
        if (crash_base <= 0) {
-               crash_base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_MAX,
-                                                       crash_size, CRASH_ALIGN);
+               crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN,
+                                                      CRASH_ALIGN,
+                                                      CRASH_ADDR_MAX);
                if (!crash_base) {
                        pr_warn("crashkernel reservation failed - No suitable area found.\n");
                        return;
@@ -461,8 +462,9 @@ static void __init mips_parse_crashkernel(void)
        } else {
                unsigned long long start;
 
-               start = memblock_find_in_range(crash_base, crash_base + crash_size,
-                                               crash_size, 1);
+               start = memblock_phys_alloc_range(crash_size, 1,
+                                                 crash_base,
+                                                 crash_base + crash_size);
                if (start != crash_base) {
                        pr_warn("Invalid memory region reserved for crash kernel\n");
                        return;
@@ -656,10 +658,6 @@ static void __init arch_mem_init(char **cmdline_p)
        mips_reserve_vmcore();
 
        mips_parse_crashkernel();
-#ifdef CONFIG_KEXEC
-       if (crashk_res.start != crashk_res.end)
-               memblock_reserve(crashk_res.start, resource_size(&crashk_res));
-#endif
        device_tree_init();
 
        /*
index c2d2e19..56c8d3c 100644 (file)
 444    n32     landlock_create_ruleset         sys_landlock_create_ruleset
 445    n32     landlock_add_rule               sys_landlock_add_rule
 446    n32     landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    n32     process_mrelease                sys_process_mrelease
index ac653d0..1ca7bc3 100644 (file)
 444    n64     landlock_create_ruleset         sys_landlock_create_ruleset
 445    n64     landlock_add_rule               sys_landlock_add_rule
 446    n64     landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    n64     process_mrelease                sys_process_mrelease
index fae3588..201237f 100644 (file)
 444    o32     landlock_create_ruleset         sys_landlock_create_ruleset
 445    o32     landlock_add_rule               sys_landlock_add_rule
 446    o32     landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    o32     process_mrelease                sys_process_mrelease
index 7d6824f..c2a222e 100644 (file)
@@ -36,8 +36,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
 void flush_anon_page(struct vm_area_struct *vma,
                     struct page *page, unsigned long vaddr);
 
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-void flush_kernel_dcache_page(struct page *page);
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
 void flush_kernel_vmap_range(void *addr, int size);
 void invalidate_kernel_vmap_range(void *addr, int size);
 #define flush_dcache_mmap_lock(mapping)   xa_lock_irq(&(mapping)->i_pages)
index ad5344e..07aac65 100644 (file)
@@ -318,15 +318,6 @@ void flush_anon_page(struct vm_area_struct *vma,
        local_irq_restore(flags);
 }
 
-void flush_kernel_dcache_page(struct page *page)
-{
-       unsigned long flags;
-       local_irq_save(flags);
-       cpu_dcache_wbinval_page((unsigned long)page_address(page));
-       local_irq_restore(flags);
-}
-EXPORT_SYMBOL(flush_kernel_dcache_page);
-
 void flush_kernel_vmap_range(void *addr, int size)
 {
        unsigned long flags;
index 99663fc..eef0096 100644 (file)
@@ -36,16 +36,12 @@ void flush_cache_all_local(void);
 void flush_cache_all(void);
 void flush_cache_mm(struct mm_struct *mm);
 
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
 void flush_kernel_dcache_page_addr(void *addr);
-static inline void flush_kernel_dcache_page(struct page *page)
-{
-       flush_kernel_dcache_page_addr(page_address(page));
-}
 
 #define flush_kernel_dcache_range(start,size) \
        flush_kernel_dcache_range_asm((start), (start)+(size));
 
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
 void flush_kernel_vmap_range(void *vaddr, int size);
 void invalidate_kernel_vmap_range(void *vaddr, int size);
 
@@ -59,7 +55,7 @@ extern void flush_dcache_page(struct page *page);
 #define flush_dcache_mmap_unlock(mapping)      xa_unlock_irq(&mapping->i_pages)
 
 #define flush_icache_page(vma,page)    do {            \
-       flush_kernel_dcache_page(page);                 \
+       flush_kernel_dcache_page_addr(page_address(page)); \
        flush_kernel_icache_page(page_address(page));   \
 } while (0)
 
index 86a1a63..39e0222 100644 (file)
@@ -334,7 +334,7 @@ void flush_dcache_page(struct page *page)
                return;
        }
 
-       flush_kernel_dcache_page(page);
+       flush_kernel_dcache_page_addr(page_address(page));
 
        if (!mapping)
                return;
@@ -375,7 +375,6 @@ EXPORT_SYMBOL(flush_dcache_page);
 
 /* Defined in arch/parisc/kernel/pacache.S */
 EXPORT_SYMBOL(flush_kernel_dcache_range_asm);
-EXPORT_SYMBOL(flush_kernel_dcache_page_asm);
 EXPORT_SYMBOL(flush_data_cache_local);
 EXPORT_SYMBOL(flush_kernel_icache_range_asm);
 
index eaf0603..0bf854b 100644 (file)
 444    common  landlock_create_ruleset         sys_landlock_create_ruleset
 445    common  landlock_add_rule               sys_landlock_add_rule
 446    common  landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    common  process_mrelease                sys_process_mrelease
index 6f3953f..29b55e2 100644 (file)
 444    common  landlock_create_ruleset         sys_landlock_create_ruleset
 445    common  landlock_add_rule               sys_landlock_add_rule
 446    common  landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    common  process_mrelease                sys_process_mrelease
index 377d852..d4f28ee 100644 (file)
@@ -211,13 +211,11 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
 static struct memory_block *lmb_to_memblock(struct drmem_lmb *lmb)
 {
        unsigned long section_nr;
-       struct mem_section *mem_sect;
        struct memory_block *mem_block;
 
        section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr));
-       mem_sect = __nr_to_section(section_nr);
 
-       mem_block = find_memory_block(mem_sect);
+       mem_block = find_memory_block(section_nr);
        return mem_block;
 }
 
index 93720b0..fc818c8 100644 (file)
@@ -819,38 +819,22 @@ static void __init reserve_crashkernel(void)
 
        crash_size = PAGE_ALIGN(crash_size);
 
-       if (crash_base == 0) {
-               /*
-                * Current riscv boot protocol requires 2MB alignment for
-                * RV64 and 4MB alignment for RV32 (hugepage size)
-                */
-               crash_base = memblock_find_in_range(search_start, search_end,
-                                                   crash_size, PMD_SIZE);
-
-               if (crash_base == 0) {
-                       pr_warn("crashkernel: couldn't allocate %lldKB\n",
-                               crash_size >> 10);
-                       return;
-               }
-       } else {
-               /* User specifies base address explicitly. */
-               if (!memblock_is_region_memory(crash_base, crash_size)) {
-                       pr_warn("crashkernel: requested region is not memory\n");
-                       return;
-               }
-
-               if (memblock_is_region_reserved(crash_base, crash_size)) {
-                       pr_warn("crashkernel: requested region is reserved\n");
-                       return;
-               }
-
+       if (crash_base) {
+               search_start = crash_base;
+               search_end = crash_base + crash_size;
+       }
 
-               if (!IS_ALIGNED(crash_base, PMD_SIZE)) {
-                       pr_warn("crashkernel: requested region is misaligned\n");
-                       return;
-               }
+       /*
+        * Current riscv boot protocol requires 2MB alignment for
+        * RV64 and 4MB alignment for RV32 (hugepage size)
+        */
+       crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE,
+                                              search_start, search_end);
+       if (crash_base == 0) {
+               pr_warn("crashkernel: couldn't allocate %lldKB\n",
+                       crash_size >> 10);
+               return;
        }
-       memblock_reserve(crash_base, crash_size);
 
        pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n",
                crash_base, crash_base + crash_size, crash_size >> 20);
index fe14beb..5a01872 100644 (file)
@@ -677,8 +677,9 @@ static void __init reserve_crashkernel(void)
                        return;
                }
                low = crash_base ?: low;
-               crash_base = memblock_find_in_range(low, high, crash_size,
-                                                   KEXEC_CRASH_MEM_ALIGN);
+               crash_base = memblock_phys_alloc_range(crash_size,
+                                                      KEXEC_CRASH_MEM_ALIGN,
+                                                      low, high);
        }
 
        if (!crash_base) {
@@ -687,8 +688,10 @@ static void __init reserve_crashkernel(void)
                return;
        }
 
-       if (register_memory_notifier(&kdump_mem_nb))
+       if (register_memory_notifier(&kdump_mem_nb)) {
+               memblock_free(crash_base, crash_size);
                return;
+       }
 
        if (!oldmem_data.start && MACHINE_IS_VM)
                diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size));
index aa705e1..aa9d68b 100644 (file)
 444  common    landlock_create_ruleset sys_landlock_create_ruleset     sys_landlock_create_ruleset
 445  common    landlock_add_rule       sys_landlock_add_rule           sys_landlock_add_rule
 446  common    landlock_restrict_self  sys_landlock_restrict_self      sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448  common    process_mrelease        sys_process_mrelease            sys_process_mrelease
index 212632d..a834e46 100644 (file)
@@ -822,7 +822,7 @@ void do_secure_storage_access(struct pt_regs *regs)
                break;
        case KERNEL_FAULT:
                page = phys_to_page(addr);
-               if (unlikely(!try_get_page(page)))
+               if (unlikely(!try_get_compound_head(page, 1)))
                        break;
                rc = arch_make_page_accessible(page);
                put_page(page);
index 4486a86..372afa8 100644 (file)
@@ -63,6 +63,8 @@ static inline void flush_anon_page(struct vm_area_struct *vma,
        if (boot_cpu_data.dcache.n_aliases && PageAnon(page))
                __flush_anon_page(page, vmaddr);
 }
+
+#define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1
 static inline void flush_kernel_vmap_range(void *addr, int size)
 {
        __flush_wback_region(addr, size);
@@ -72,12 +74,6 @@ static inline void invalidate_kernel_vmap_range(void *addr, int size)
        __flush_invalidate_region(addr, size);
 }
 
-#define ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-static inline void flush_kernel_dcache_page(struct page *page)
-{
-       flush_dcache_page(page);
-}
-
 extern void copy_to_user_page(struct vm_area_struct *vma,
        struct page *page, unsigned long vaddr, void *dst, const void *src,
        unsigned long len);
index 7bbd670..208f131 100644 (file)
 444    common  landlock_create_ruleset         sys_landlock_create_ruleset
 445    common  landlock_add_rule               sys_landlock_add_rule
 446    common  landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    common  process_mrelease                sys_process_mrelease
index f520e9c..7893104 100644 (file)
 444    common  landlock_create_ruleset         sys_landlock_create_ruleset
 445    common  landlock_add_rule               sys_landlock_add_rule
 446    common  landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    common  process_mrelease                sys_process_mrelease
index a5beae6..61f18b7 100644 (file)
 445    i386    landlock_add_rule       sys_landlock_add_rule
 446    i386    landlock_restrict_self  sys_landlock_restrict_self
 447    i386    memfd_secret            sys_memfd_secret
+448    i386    process_mrelease        sys_process_mrelease
index f6b5779..807b6a1 100644 (file)
 445    common  landlock_add_rule       sys_landlock_add_rule
 446    common  landlock_restrict_self  sys_landlock_restrict_self
 447    common  memfd_secret            sys_memfd_secret
+448    common  process_mrelease        sys_process_mrelease
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
index 294ed43..1056288 100644 (file)
@@ -109,14 +109,13 @@ static u32 __init allocate_aperture(void)
         * memory. Unfortunately we cannot move it up because that would
         * make the IOMMU useless.
         */
-       addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
-                                     aper_size, aper_size);
+       addr = memblock_phys_alloc_range(aper_size, aper_size,
+                                        GART_MIN_ADDR, GART_MAX_ADDR);
        if (!addr) {
                pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
                       addr, addr + aper_size - 1, aper_size >> 10);
                return 0;
        }
-       memblock_reserve(addr, aper_size);
        pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
                addr, addr + aper_size - 1, aper_size >> 10);
        register_nosave_region(addr >> PAGE_SHIFT,
index aa15132..525876e 100644 (file)
@@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
        if (num_entries > LDT_ENTRIES)
                return NULL;
 
-       new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+       new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT);
        if (!new_ldt)
                return NULL;
 
@@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
         * than PAGE_SIZE.
         */
        if (alloc_size > PAGE_SIZE)
-               new_ldt->entries = vzalloc(alloc_size);
+               new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        else
-               new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
+               new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 
        if (!new_ldt->entries) {
                kfree(new_ldt);
index 75ef19a..23a14d8 100644 (file)
@@ -127,14 +127,12 @@ __ref void *alloc_low_pages(unsigned int num)
                unsigned long ret = 0;
 
                if (min_pfn_mapped < max_pfn_mapped) {
-                       ret = memblock_find_in_range(
+                       ret = memblock_phys_alloc_range(
+                                       PAGE_SIZE * num, PAGE_SIZE,
                                        min_pfn_mapped << PAGE_SHIFT,
-                                       max_pfn_mapped << PAGE_SHIFT,
-                                       PAGE_SIZE * num , PAGE_SIZE);
+                                       max_pfn_mapped << PAGE_SHIFT);
                }
-               if (ret)
-                       memblock_reserve(ret, PAGE_SIZE * num);
-               else if (can_use_brk_pgt)
+               if (!ret && can_use_brk_pgt)
                        ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
 
                if (!ret)
@@ -610,8 +608,17 @@ static void __init memory_map_top_down(unsigned long map_start,
        unsigned long addr;
        unsigned long mapped_ram_size = 0;
 
-       /* xen has big range in reserved near end of ram, skip it at first.*/
-       addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+       /*
+        * Systems that have many reserved areas near top of the memory,
+        * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will
+        * require lots of 4K mappings which may exhaust pgt_buf.
+        * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure
+        * there is enough mapped memory that can be allocated from
+        * memblock.
+        */
+       addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
+                                        map_end);
+       memblock_free(addr, PMD_SIZE);
        real_end = addr + PMD_SIZE;
 
        /* step_size need to be small so pgt_buf from BRK could cover it */
index e94da74..a1b5c71 100644 (file)
@@ -376,15 +376,14 @@ static int __init numa_alloc_distance(void)
        cnt++;
        size = cnt * cnt * sizeof(numa_distance[0]);
 
-       phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
-                                     size, PAGE_SIZE);
+       phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
+                                        PFN_PHYS(max_pfn_mapped));
        if (!phys) {
                pr_warn("Warning: can't allocate distance table!\n");
                /* don't retry until explicitly reset */
                numa_distance = (void *)1LU;
                return -ENOMEM;
        }
-       memblock_reserve(phys, size);
 
        numa_distance = __va(phys);
        numa_distance_cnt = cnt;
index 87d77cc..737491b 100644 (file)
@@ -447,13 +447,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
        if (numa_dist_cnt) {
                u64 phys;
 
-               phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
-                                             phys_size, PAGE_SIZE);
+               phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
+                                                PFN_PHYS(max_pfn_mapped));
                if (!phys) {
                        pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
                        goto no_emu;
                }
-               memblock_reserve(phys, phys_size);
                phys_dist = __va(phys);
 
                for (i = 0; i < numa_dist_cnt; i++)
index 6534c92..31b5856 100644 (file)
@@ -28,7 +28,7 @@ void __init reserve_real_mode(void)
        WARN_ON(slab_is_available());
 
        /* Has to be under 1M so we can execute real-mode AP code. */
-       mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+       mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20);
        if (!mem)
                pr_info("No sub-1M memory is available for the trampoline\n");
        else
index b3d1bc8..104b327 100644 (file)
 444    common  landlock_create_ruleset         sys_landlock_create_ruleset
 445    common  landlock_add_rule               sys_landlock_add_rule
 446    common  landlock_restrict_self          sys_landlock_restrict_self
+# 447 reserved for memfd_secret
+448    common  process_mrelease                sys_process_mrelease
index d1448aa..4526add 100644 (file)
@@ -309,7 +309,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 
 static void bio_invalidate_vmalloc_pages(struct bio *bio)
 {
-#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+#ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
        if (bio->bi_private && !op_is_write(bio_op(bio))) {
                unsigned long i, len = 0;
 
index a37a153..f938373 100644 (file)
@@ -583,8 +583,8 @@ void __init acpi_table_upgrade(void)
        }
 
        acpi_tables_addr =
-               memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS,
-                                      all_tables_size, PAGE_SIZE);
+               memblock_phys_alloc_range(all_tables_size, PAGE_SIZE,
+                                         0, ACPI_TABLE_UPGRADE_MAX_PHYS);
        if (!acpi_tables_addr) {
                WARN_ON(1);
                return;
@@ -599,7 +599,6 @@ void __init acpi_table_upgrade(void)
         * Both memblock_reserve and e820__range_add (via arch_reserve_mem_area)
         * works fine.
         */
-       memblock_reserve(acpi_tables_addr, all_tables_size);
        arch_reserve_mem_area(acpi_tables_addr, all_tables_size);
 
        /*
index 4cc4e11..46c5034 100644 (file)
@@ -279,13 +279,10 @@ static int __init numa_alloc_distance(void)
        int i, j;
 
        size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
-       phys = memblock_find_in_range(0, PFN_PHYS(max_pfn),
-                                     size, PAGE_SIZE);
+       phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn));
        if (WARN_ON(!phys))
                return -ENOMEM;
 
-       memblock_reserve(phys, size);
-
        numa_distance = __va(phys);
        numa_distance_cnt = nr_node_ids;
 
index aa31a21..e3fd2db 100644 (file)
@@ -578,9 +578,9 @@ static struct memory_block *find_memory_block_by_id(unsigned long block_id)
 /*
  * Called under device_hotplug_lock.
  */
-struct memory_block *find_memory_block(struct mem_section *section)
+struct memory_block *find_memory_block(unsigned long section_nr)
 {
-       unsigned long block_id = memory_block_id(__section_nr(section));
+       unsigned long block_id = memory_block_id(section_nr);
 
        return find_memory_block_by_id(block_id);
 }
index cb1a64a..80a2c27 100644 (file)
@@ -578,10 +578,6 @@ static bool jz4740_mmc_read_data(struct jz4740_mmc_host *host,
                        }
                }
                data->bytes_xfered += miter->length;
-
-               /* This can go away once MIPS implements
-                * flush_kernel_dcache_page */
-               flush_dcache_page(miter->page);
        }
        sg_miter_stop(miter);
 
index a1bcde3..f4c8e1a 100644 (file)
@@ -941,7 +941,7 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd,
 
                /* discard mappings */
                if (direction == DMA_FROM_DEVICE)
-                       flush_kernel_dcache_page(sg_page(sg));
+                       flush_dcache_page(sg_page(sg));
                kunmap(sg_page(sg));
                if (dma_dev)
                        dma_unmap_page(dma_dev, dma_addr, PAGE_SIZE, dir);
index fd3964d..59c1390 100644 (file)
@@ -33,18 +33,22 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
        phys_addr_t *res_base)
 {
        phys_addr_t base;
+       int err = 0;
 
        end = !end ? MEMBLOCK_ALLOC_ANYWHERE : end;
        align = !align ? SMP_CACHE_BYTES : align;
-       base = memblock_find_in_range(start, end, size, align);
+       base = memblock_phys_alloc_range(size, align, start, end);
        if (!base)
                return -ENOMEM;
 
        *res_base = base;
-       if (nomap)
-               return memblock_mark_nomap(base, size);
+       if (nomap) {
+               err = memblock_mark_nomap(base, size);
+               if (err)
+                       memblock_free(base, size);
+       }
 
-       return memblock_reserve(base, size);
+       return err;
 }
 
 /*
index f00fcc4..e619c31 100644 (file)
@@ -3,6 +3,7 @@
  * Implement the manual drop-all-pagecache function
  */
 
+#include <linux/pagemap.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
@@ -27,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
                 * we need to reschedule to avoid softlockups.
                 */
                if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
-                   (inode->i_mapping->nrpages == 0 && !need_resched())) {
+                   (mapping_empty(inode->i_mapping) && !need_resched())) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
index 3b78b22..2dc489c 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -217,8 +217,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
         * We are doing an exec().  'current' is the process
         * doing the exec and bprm->mm is the new process's mm.
         */
+       mmap_read_lock(bprm->mm);
        ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags,
                        &page, NULL, NULL);
+       mmap_read_unlock(bprm->mm);
        if (ret <= 0)
                return NULL;
 
@@ -574,7 +576,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
                                }
 
                                if (kmapped_page) {
-                                       flush_kernel_dcache_page(kmapped_page);
+                                       flush_dcache_page(kmapped_page);
                                        kunmap(kmapped_page);
                                        put_arg_page(kmapped_page);
                                }
@@ -592,7 +594,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
        ret = 0;
 out:
        if (kmapped_page) {
-               flush_kernel_dcache_page(kmapped_page);
+               flush_dcache_page(kmapped_page);
                kunmap(kmapped_page);
                put_arg_page(kmapped_page);
        }
@@ -634,7 +636,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
                kaddr = kmap_atomic(page);
                flush_arg_page(bprm, pos & PAGE_MASK, page);
                memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy);
-               flush_kernel_dcache_page(page);
+               flush_dcache_page(page);
                kunmap_atomic(kaddr);
                put_arg_page(page);
        }
index 68added..9c6c6a3 100644 (file)
@@ -1051,7 +1051,8 @@ static int __init fcntl_init(void)
                        __FMODE_EXEC | __FMODE_NONOTIFY));
 
        fasync_cache = kmem_cache_create("fasync_cache",
-               sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+                                        sizeof(struct fasync_struct), 0,
+                                        SLAB_PANIC | SLAB_ACCOUNT, NULL);
        return 0;
 }
 
index eb57dad..81ec192 100644 (file)
@@ -406,6 +406,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
                inc_wb_stat(new_wb, WB_WRITEBACK);
        }
 
+       if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+               atomic_dec(&old_wb->writeback_inodes);
+               atomic_inc(&new_wb->writeback_inodes);
+       }
+
        wb_get(new_wb);
 
        /*
@@ -1034,20 +1039,20 @@ restart:
  * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
  * @bdi_id: target bdi id
  * @memcg_id: target memcg css id
- * @nr: number of pages to write, 0 for best-effort dirty flushing
  * @reason: reason why some writeback work initiated
  * @done: target wb_completion
  *
  * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id
  * with the specified parameters.
  */
-int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
                           enum wb_reason reason, struct wb_completion *done)
 {
        struct backing_dev_info *bdi;
        struct cgroup_subsys_state *memcg_css;
        struct bdi_writeback *wb;
        struct wb_writeback_work *work;
+       unsigned long dirty;
        int ret;
 
        /* lookup bdi and memcg */
@@ -1076,24 +1081,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr,
        }
 
        /*
-        * If @nr is zero, the caller is attempting to write out most of
+        * The caller is attempting to write out most of
         * the currently dirty pages.  Let's take the current dirty page
         * count and inflate it by 25% which should be large enough to
         * flush out most dirty pages while avoiding getting livelocked by
         * concurrent dirtiers.
+        *
+        * BTW the memcg stats are flushed periodically and this is best-effort
+        * estimation, so some potential error is ok.
         */
-       if (!nr) {
-               unsigned long filepages, headroom, dirty, writeback;
-
-               mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty,
-                                     &writeback);
-               nr = dirty * 10 / 8;
-       }
+       dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY);
+       dirty = dirty * 10 / 8;
 
        /* issue the writeback work */
        work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
        if (work) {
-               work->nr_pages = nr;
+               work->nr_pages = dirty;
                work->sync_mode = WB_SYNC_NONE;
                work->range_cyclic = 1;
                work->reason = reason;
@@ -1999,7 +2002,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 static long wb_writeback(struct bdi_writeback *wb,
                         struct wb_writeback_work *work)
 {
-       unsigned long wb_start = jiffies;
        long nr_pages = work->nr_pages;
        unsigned long dirtied_before = jiffies;
        struct inode *inode;
@@ -2053,8 +2055,6 @@ static long wb_writeback(struct bdi_writeback *wb,
                        progress = __writeback_inodes_wb(wb, work);
                trace_writeback_written(wb, work);
 
-               wb_update_bandwidth(wb, wb_start);
-
                /*
                 * Did we write something? Try for more
                 *
index de1985e..b7e43a7 100644 (file)
@@ -254,7 +254,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
        struct fs_context *fc;
        int ret = -ENOMEM;
 
-       fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL);
+       fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT);
        if (!fc)
                return ERR_PTR(-ENOMEM);
 
@@ -649,7 +649,7 @@ const struct fs_context_operations legacy_fs_context_ops = {
  */
 static int legacy_init_fs_context(struct fs_context *fc)
 {
-       fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL);
+       fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
        if (!fc->fs_private)
                return -ENOMEM;
        fc->ops = &legacy_fs_context_ops;
index 84c528c..37710ca 100644 (file)
@@ -770,7 +770,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
                return LRU_ROTATE;
        }
 
-       if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+       if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(lru_lock);
index 3d6fb4a..51a5b72 100644 (file)
@@ -2941,10 +2941,12 @@ static int __init filelock_init(void)
        int i;
 
        flctx_cache = kmem_cache_create("file_lock_ctx",
-                       sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
+                       sizeof(struct file_lock_context), 0,
+                       SLAB_PANIC | SLAB_ACCOUNT, NULL);
 
        filelock_cache = kmem_cache_create("file_lock_cache",
-                       sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+                       sizeof(struct file_lock), 0,
+                       SLAB_PANIC | SLAB_ACCOUNT, NULL);
 
        for_each_possible_cpu(i) {
                struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
index d049d39..95a881e 100644 (file)
@@ -4089,7 +4089,9 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
                return -EPERM;
 
        inode_lock(target);
-       if (is_local_mountpoint(dentry))
+       if (IS_SWAPFILE(target))
+               error = -EPERM;
+       else if (is_local_mountpoint(dentry))
                error = -EBUSY;
        else {
                error = security_inode_unlink(dir, dentry);
@@ -4597,6 +4599,10 @@ int vfs_rename(struct renamedata *rd)
        else if (target)
                inode_lock(target);
 
+       error = -EPERM;
+       if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
+               goto out;
+
        error = -EBUSY;
        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
                goto out;
index 1285236..659a8f3 100644 (file)
@@ -203,7 +203,8 @@ static struct mount *alloc_vfsmnt(const char *name)
                        goto out_free_cache;
 
                if (name) {
-                       mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
+                       mnt->mnt_devname = kstrdup_const(name,
+                                                        GFP_KERNEL_ACCOUNT);
                        if (!mnt->mnt_devname)
                                goto out_free_id;
                }
@@ -3370,7 +3371,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
        if (!ucounts)
                return ERR_PTR(-ENOSPC);
 
-       new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
+       new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
        if (!new_ns) {
                dec_mnt_namespaces(ucounts);
                return ERR_PTR(-ENOMEM);
@@ -4306,7 +4307,7 @@ void __init mnt_init(void)
        int err;
 
        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
-                       0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+                       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
        mount_hashtable = alloc_large_system_hash("Mount-cache",
                                sizeof(struct hlist_head),
index 48fd369..359524b 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
+#include <linux/delay.h>
 #include <linux/quotaops.h>
 #include <linux/sched/signal.h>
 
@@ -2721,7 +2722,7 @@ int ocfs2_inode_lock_tracker(struct inode *inode,
                        return status;
                }
        }
-       return tmp_oh ? 1 : 0;
+       return 1;
 }
 
 void ocfs2_inode_unlock_tracker(struct inode *inode,
@@ -3912,6 +3913,17 @@ downconvert:
        spin_unlock_irqrestore(&lockres->l_lock, flags);
        ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
                                     gen);
+       /* The dlm lock convert is being cancelled in the background
+        * (ocfs2_cancel_convert() is asynchronous in fs/dlm), so requeue
+        * the downconvert and try again later.
+        */
+       if (ret == -EBUSY) {
+               ctl->requeue = 1;
+               mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n",
+                    lockres->l_name);
+               ret = 0;
+               msleep(20);
+       }
 
 leave:
        if (ret)
index eda8348..f033de7 100644 (file)
@@ -357,7 +357,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
        }
        oinfo->dqi_gi.dqi_sb = sb;
        oinfo->dqi_gi.dqi_type = type;
-       ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
        oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
        oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
        oinfo->dqi_gqi_bh = NULL;
index b1a8b04..0e4b16d 100644 (file)
@@ -702,6 +702,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
        info->dqi_priv = oinfo;
        oinfo->dqi_type = type;
        INIT_LIST_HEAD(&oinfo->dqi_chunk);
+       oinfo->dqi_gqinode = NULL;
+       ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
        oinfo->dqi_rec = NULL;
        oinfo->dqi_lqi_bh = NULL;
        oinfo->dqi_libh = NULL;
index 6d4342b..1fa1f52 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal);
  */
 bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
-       return try_get_page(buf->page);
+       return try_get_compound_head(buf->page, 1);
 }
 EXPORT_SYMBOL(generic_pipe_buf_get);
 
index 945896d..e83e563 100644 (file)
@@ -655,7 +655,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                        goto out_nofds;
 
                alloc_size = 6 * size;
-               bits = kvmalloc(alloc_size, GFP_KERNEL);
+               bits = kvmalloc(alloc_size, GFP_KERNEL_ACCOUNT);
                if (!bits)
                        goto out_nofds;
        }
@@ -1000,7 +1000,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 
                len = min(todo, POLLFD_PER_PAGE);
                walk = walk->next = kmalloc(struct_size(walk, entries, len),
-                                           GFP_KERNEL);
+                                           GFP_KERNEL_ACCOUNT);
                if (!walk) {
                        err = -ENOMEM;
                        goto out_fds;
index 5c2d806..003f0d3 100644 (file)
@@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly;
 
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
-enum userfaultfd_state {
-       UFFD_STATE_WAIT_API,
-       UFFD_STATE_RUNNING,
-};
-
 /*
  * Start with fault_pending_wqh and fault_wqh so they're more likely
  * to be in the same cacheline.
@@ -69,12 +64,10 @@ struct userfaultfd_ctx {
        unsigned int flags;
        /* features requested from the userspace */
        unsigned int features;
-       /* state machine */
-       enum userfaultfd_state state;
        /* released */
        bool released;
        /* memory mappings are changing because of non-cooperative event */
-       bool mmap_changing;
+       atomic_t mmap_changing;
        /* mm with one ore more vmas attached to this userfaultfd_ctx */
        struct mm_struct *mm;
 };
@@ -104,6 +97,14 @@ struct userfaultfd_wake_range {
        unsigned long len;
 };
 
+/* internal indication that UFFD_API ioctl was successfully executed */
+#define UFFD_FEATURE_INITIALIZED               (1u << 31)
+
+static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
+{
+       return ctx->features & UFFD_FEATURE_INITIALIZED;
+}
+
 static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
                                     int wake_flags, void *key)
 {
@@ -623,7 +624,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
         * already released.
         */
 out:
-       WRITE_ONCE(ctx->mmap_changing, false);
+       atomic_dec(&ctx->mmap_changing);
+       VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
        userfaultfd_ctx_put(ctx);
 }
 
@@ -666,15 +668,14 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 
                refcount_set(&ctx->refcount, 1);
                ctx->flags = octx->flags;
-               ctx->state = UFFD_STATE_RUNNING;
                ctx->features = octx->features;
                ctx->released = false;
-               ctx->mmap_changing = false;
+               atomic_set(&ctx->mmap_changing, 0);
                ctx->mm = vma->vm_mm;
                mmgrab(ctx->mm);
 
                userfaultfd_ctx_get(octx);
-               WRITE_ONCE(octx->mmap_changing, true);
+               atomic_inc(&octx->mmap_changing);
                fctx->orig = octx;
                fctx->new = ctx;
                list_add_tail(&fctx->list, fcs);
@@ -721,7 +722,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
        if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
                vm_ctx->ctx = ctx;
                userfaultfd_ctx_get(ctx);
-               WRITE_ONCE(ctx->mmap_changing, true);
+               atomic_inc(&ctx->mmap_changing);
        } else {
                /* Drop uffd context if remap feature not enabled */
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -766,7 +767,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
                return true;
 
        userfaultfd_ctx_get(ctx);
-       WRITE_ONCE(ctx->mmap_changing, true);
+       atomic_inc(&ctx->mmap_changing);
        mmap_read_unlock(mm);
 
        msg_init(&ewq.msg);
@@ -810,7 +811,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
                        return -ENOMEM;
 
                userfaultfd_ctx_get(ctx);
-               WRITE_ONCE(ctx->mmap_changing, true);
+               atomic_inc(&ctx->mmap_changing);
                unmap_ctx->ctx = ctx;
                unmap_ctx->start = start;
                unmap_ctx->end = end;
@@ -943,38 +944,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
 
        poll_wait(file, &ctx->fd_wqh, wait);
 
-       switch (ctx->state) {
-       case UFFD_STATE_WAIT_API:
+       if (!userfaultfd_is_initialized(ctx))
                return EPOLLERR;
-       case UFFD_STATE_RUNNING:
-               /*
-                * poll() never guarantees that read won't block.
-                * userfaults can be waken before they're read().
-                */
-               if (unlikely(!(file->f_flags & O_NONBLOCK)))
-                       return EPOLLERR;
-               /*
-                * lockless access to see if there are pending faults
-                * __pollwait last action is the add_wait_queue but
-                * the spin_unlock would allow the waitqueue_active to
-                * pass above the actual list_add inside
-                * add_wait_queue critical section. So use a full
-                * memory barrier to serialize the list_add write of
-                * add_wait_queue() with the waitqueue_active read
-                * below.
-                */
-               ret = 0;
-               smp_mb();
-               if (waitqueue_active(&ctx->fault_pending_wqh))
-                       ret = EPOLLIN;
-               else if (waitqueue_active(&ctx->event_wqh))
-                       ret = EPOLLIN;
 
-               return ret;
-       default:
-               WARN_ON_ONCE(1);
+       /*
+        * poll() never guarantees that read won't block.
+        * userfaults can be waken before they're read().
+        */
+       if (unlikely(!(file->f_flags & O_NONBLOCK)))
                return EPOLLERR;
-       }
+       /*
+        * lockless access to see if there are pending faults
+        * __pollwait last action is the add_wait_queue but
+        * the spin_unlock would allow the waitqueue_active to
+        * pass above the actual list_add inside
+        * add_wait_queue critical section. So use a full
+        * memory barrier to serialize the list_add write of
+        * add_wait_queue() with the waitqueue_active read
+        * below.
+        */
+       ret = 0;
+       smp_mb();
+       if (waitqueue_active(&ctx->fault_pending_wqh))
+               ret = EPOLLIN;
+       else if (waitqueue_active(&ctx->event_wqh))
+               ret = EPOLLIN;
+
+       return ret;
 }
 
 static const struct file_operations userfaultfd_fops;
@@ -1169,7 +1165,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
        int no_wait = file->f_flags & O_NONBLOCK;
        struct inode *inode = file_inode(file);
 
-       if (ctx->state == UFFD_STATE_WAIT_API)
+       if (!userfaultfd_is_initialized(ctx))
                return -EINVAL;
 
        for (;;) {
@@ -1700,7 +1696,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
        user_uffdio_copy = (struct uffdio_copy __user *) arg;
 
        ret = -EAGAIN;
-       if (READ_ONCE(ctx->mmap_changing))
+       if (atomic_read(&ctx->mmap_changing))
                goto out;
 
        ret = -EFAULT;
@@ -1757,7 +1753,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
        user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
 
        ret = -EAGAIN;
-       if (READ_ONCE(ctx->mmap_changing))
+       if (atomic_read(&ctx->mmap_changing))
                goto out;
 
        ret = -EFAULT;
@@ -1807,7 +1803,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
        struct userfaultfd_wake_range range;
        bool mode_wp, mode_dontwake;
 
-       if (READ_ONCE(ctx->mmap_changing))
+       if (atomic_read(&ctx->mmap_changing))
                return -EAGAIN;
 
        user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
@@ -1855,7 +1851,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
        user_uffdio_continue = (struct uffdio_continue __user *)arg;
 
        ret = -EAGAIN;
-       if (READ_ONCE(ctx->mmap_changing))
+       if (atomic_read(&ctx->mmap_changing))
                goto out;
 
        ret = -EFAULT;
@@ -1908,9 +1904,10 @@ out:
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
        /*
-        * For the current set of features the bits just coincide
+        * For the current set of features the bits just coincide. Set
+        * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
         */
-       return (unsigned int)user_features;
+       return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
 }
 
 /*
@@ -1923,12 +1920,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 {
        struct uffdio_api uffdio_api;
        void __user *buf = (void __user *)arg;
+       unsigned int ctx_features;
        int ret;
        __u64 features;
 
-       ret = -EINVAL;
-       if (ctx->state != UFFD_STATE_WAIT_API)
-               goto out;
        ret = -EFAULT;
        if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
                goto out;
@@ -1952,9 +1947,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                goto out;
-       ctx->state = UFFD_STATE_RUNNING;
+
        /* only enable the requested features for this uffd context */
-       ctx->features = uffd_ctx_features(features);
+       ctx_features = uffd_ctx_features(features);
+       ret = -EINVAL;
+       if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
+               goto err_out;
+
        ret = 0;
 out:
        return ret;
@@ -1971,7 +1970,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
        int ret = -EINVAL;
        struct userfaultfd_ctx *ctx = file->private_data;
 
-       if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+       if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
                return -EINVAL;
 
        switch(cmd) {
@@ -2085,9 +2084,8 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
        refcount_set(&ctx->refcount, 1);
        ctx->flags = flags;
        ctx->features = 0;
-       ctx->state = UFFD_STATE_WAIT_API;
        ctx->released = false;
-       ctx->mmap_changing = false;
+       atomic_set(&ctx->mmap_changing, 0);
        ctx->mm = current->mm;
        /* prevent the mm struct to be freed */
        mmgrab(ctx->mm);
index 1d7edad..3320700 100644 (file)
@@ -116,6 +116,7 @@ struct bdi_writeback {
        struct list_head b_dirty_time;  /* time stamps are dirty */
        spinlock_t list_lock;           /* protects the b_* lists */
 
+       atomic_t writeback_inodes;      /* number of inodes under writeback */
        struct percpu_counter stat[NR_WB_STAT_ITEMS];
 
        unsigned long congested;        /* WB_[a]sync_congested flags */
@@ -142,6 +143,7 @@ struct bdi_writeback {
        spinlock_t work_lock;           /* protects work_list & dwork scheduling */
        struct list_head work_list;
        struct delayed_work dwork;      /* work item used for writeback */
+       struct delayed_work bw_dwork;   /* work item used for bandwidth estimate */
 
        unsigned long dirty_sleep;      /* last wait */
 
index 2953085..ac7f231 100644 (file)
@@ -288,6 +288,17 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
        return inode->i_wb;
 }
 
+static inline struct bdi_writeback *inode_to_wb_wbc(
+                               struct inode *inode,
+                               struct writeback_control *wbc)
+{
+       /*
+        * If wbc has no inode attached (wbc->wb is NULL), cgroup writeback
+        * was disabled when wbc started; fall back to the bdi's default wb.
+        */
+       return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb;
+}
+
 /**
  * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
  * @inode: target inode
@@ -366,6 +377,14 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
        return &inode_to_bdi(inode)->wb;
 }
 
+static inline struct bdi_writeback *inode_to_wb_wbc(
+                               struct inode *inode,
+                               struct writeback_control *wbc)
+{
+       return inode_to_wb(inode);
+}
+
+
 static inline struct bdi_writeback *
 unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 {
index e7e99da..6486d3c 100644 (file)
@@ -409,7 +409,7 @@ static inline void invalidate_inode_buffers(struct inode *inode) {}
 static inline int remove_inode_buffers(struct inode *inode) { return 1; }
 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
 static inline void invalidate_bh_lrus_cpu(int cpu) {}
-static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; }
+static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
 #define buffer_heads_over_limit 0
 
 #endif /* CONFIG_BLOCK */
index c24098c..34bce35 100644 (file)
@@ -84,6 +84,8 @@ static inline unsigned long compact_gap(unsigned int order)
 extern unsigned int sysctl_compaction_proactiveness;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
                        void *buffer, size_t *length, loff_t *ppos);
+extern int compaction_proactiveness_sysctl_handler(struct ctl_table *table,
+               int write, void *buffer, size_t *length, loff_t *ppos);
 extern int sysctl_extfrag_threshold;
 extern int sysctl_compact_unevictable_allowed;
 
index d9a606a..b4c49f9 100644 (file)
@@ -130,10 +130,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page
 }
 #endif
 
-#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
-static inline void flush_kernel_dcache_page(struct page *page)
-{
-}
+#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
 static inline void flush_kernel_vmap_range(void *vaddr, int size)
 {
 }
index 0b8d1fd..c137396 100644 (file)
@@ -121,6 +121,13 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
        css_put(&h_cg->css);
 }
 
+static inline void resv_map_dup_hugetlb_cgroup_uncharge_info(
+                                               struct resv_map *resv_map)
+{
+       if (resv_map->css)
+               css_get(resv_map->css);
+}
+
 extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                        struct hugetlb_cgroup **ptr);
 extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
@@ -199,6 +206,11 @@ static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
 {
 }
 
+static inline void resv_map_dup_hugetlb_cgroup_uncharge_info(
+                                               struct resv_map *resv_map)
+{
+}
+
 static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                               struct hugetlb_cgroup **ptr)
 {
index 4a53c3c..b066024 100644 (file)
@@ -99,8 +99,6 @@ void memblock_discard(void);
 static inline void memblock_discard(void) {}
 #endif
 
-phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
-                                  phys_addr_t size, phys_addr_t align);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
index 20151c4..3096c9a 100644 (file)
@@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter {
        unsigned int generation;
 };
 
-struct lruvec_stat {
-       long count[NR_VM_NODE_STAT_ITEMS];
-};
-
-struct batched_lruvec_stat {
-       s32 count[NR_VM_NODE_STAT_ITEMS];
-};
-
 /*
  * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
  * shrinkers, which have elements charged to this memcg.
@@ -123,24 +115,30 @@ struct shrinker_info {
        unsigned long *map;
 };
 
+struct lruvec_stats_percpu {
+       /* Local (CPU and cgroup) state */
+       long state[NR_VM_NODE_STAT_ITEMS];
+
+       /* Delta calculation for lockless upward propagation */
+       long state_prev[NR_VM_NODE_STAT_ITEMS];
+};
+
+struct lruvec_stats {
+       /* Aggregated (CPU and subtree) state */
+       long state[NR_VM_NODE_STAT_ITEMS];
+
+       /* Pending child counts during tree propagation */
+       long state_pending[NR_VM_NODE_STAT_ITEMS];
+};
+
 /*
  * per-node information in memory controller.
  */
 struct mem_cgroup_per_node {
        struct lruvec           lruvec;
 
-       /*
-        * Legacy local VM stats. This should be struct lruvec_stat and
-        * cannot be optimized to struct batched_lruvec_stat. Because
-        * the threshold of the lruvec_stat_cpu can be as big as
-        * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this
-        * filed has no upper limit.
-        */
-       struct lruvec_stat __percpu *lruvec_stat_local;
-
-       /* Subtree VM stats (batched updates) */
-       struct batched_lruvec_stat __percpu *lruvec_stat_cpu;
-       atomic_long_t           lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+       struct lruvec_stats_percpu __percpu     *lruvec_stats_percpu;
+       struct lruvec_stats                     lruvec_stats;
 
        unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
@@ -595,13 +593,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 }
 #endif
 
-static __always_inline bool memcg_stat_item_in_bytes(int idx)
-{
-       if (idx == MEMCG_PERCPU_B)
-               return true;
-       return vmstat_item_in_bytes(idx);
-}
-
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
        return (memcg == root_mem_cgroup);
@@ -693,13 +684,35 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
                page_counter_read(&memcg->memory);
 }
 
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+                       gfp_t gfp_mask);
+static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+                                   gfp_t gfp_mask)
+{
+       if (mem_cgroup_disabled())
+               return 0;
+       return __mem_cgroup_charge(page, mm, gfp_mask);
+}
+
 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
                                  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
 
-void mem_cgroup_uncharge(struct page *page);
-void mem_cgroup_uncharge_list(struct list_head *page_list);
+void __mem_cgroup_uncharge(struct page *page);
+static inline void mem_cgroup_uncharge(struct page *page)
+{
+       if (mem_cgroup_disabled())
+               return;
+       __mem_cgroup_uncharge(page);
+}
+
+void __mem_cgroup_uncharge_list(struct list_head *page_list);
+static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
+{
+       if (mem_cgroup_disabled())
+               return;
+       __mem_cgroup_uncharge_list(page_list);
+}
 
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
 
@@ -884,11 +897,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
        return !!(memcg->css.flags & CSS_ONLINE);
 }
 
-/*
- * For memory reclaim.
- */
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                int zid, int nr_pages);
 
@@ -955,22 +963,21 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
        local_irq_restore(flags);
 }
 
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+       return READ_ONCE(memcg->vmstats.state[idx]);
+}
+
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
 {
        struct mem_cgroup_per_node *pn;
-       long x;
 
        if (mem_cgroup_disabled())
                return node_page_state(lruvec_pgdat(lruvec), idx);
 
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-       x = atomic_long_read(&pn->lruvec_stat[idx]);
-#ifdef CONFIG_SMP
-       if (x < 0)
-               x = 0;
-#endif
-       return x;
+       return READ_ONCE(pn->lruvec_stats.state[idx]);
 }
 
 static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
@@ -985,7 +992,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        for_each_possible_cpu(cpu)
-               x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
+               x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
 #ifdef CONFIG_SMP
        if (x < 0)
                x = 0;
@@ -993,6 +1000,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
        return x;
 }
 
+void mem_cgroup_flush_stats(void);
+
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                              int val);
 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
@@ -1391,6 +1400,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
 {
 }
 
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+       return 0;
+}
+
 static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
 {
@@ -1403,6 +1417,10 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
        return node_page_state(lruvec_pgdat(lruvec), idx);
 }
 
+static inline void mem_cgroup_flush_stats(void)
+{
+}
+
 static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
                                            enum node_stat_item idx, int val)
 {
index 97e92e8..d9a0b61 100644 (file)
@@ -90,7 +90,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size,
 void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern void memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
-extern struct memory_block *find_memory_block(struct mem_section *);
+extern struct memory_block *find_memory_block(unsigned long section_nr);
 typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
 extern int walk_memory_blocks(unsigned long start, unsigned long size,
                              void *arg, walk_memory_blocks_func_t func);
index 0aaf91b..4091692 100644 (file)
@@ -184,6 +184,14 @@ extern bool vma_migratable(struct vm_area_struct *vma);
 extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
 extern void mpol_put_task_policy(struct task_struct *);
 
+extern bool numa_demotion_enabled;
+
+static inline bool mpol_is_preferred_many(struct mempolicy *pol)
+{
+       return  (pol->mode == MPOL_PREFERRED_MANY);
+}
+
+
 #else
 
 struct mempolicy {};
@@ -292,5 +300,13 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
 {
        return NULL;
 }
+
+#define numa_demotion_enabled  false
+
+static inline bool mpol_is_preferred_many(struct mempolicy *pol)
+{
+       return  false;
+}
+
 #endif /* CONFIG_NUMA */
 #endif
index 23dadf7..3262509 100644 (file)
@@ -28,6 +28,7 @@ enum migrate_reason {
        MR_NUMA_MISPLACED,
        MR_CONTIG_RANGE,
        MR_LONGTERM_PIN,
+       MR_DEMOTION,
        MR_TYPES
 };
 
@@ -41,7 +42,8 @@ extern int migrate_page(struct address_space *mapping,
                        struct page *newpage, struct page *page,
                        enum migrate_mode mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
-               unsigned long private, enum migrate_mode mode, int reason);
+               unsigned long private, enum migrate_mode mode, int reason,
+               unsigned int *ret_succeeded);
 extern struct page *alloc_migration_target(struct page *page, unsigned long private);
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
 
@@ -56,7 +58,7 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t new,
                free_page_t free, unsigned long private, enum migrate_mode mode,
-               int reason)
+               int reason, unsigned int *ret_succeeded)
        { return -ENOSYS; }
 static inline struct page *alloc_migration_target(struct page *page,
                unsigned long private)
@@ -166,6 +168,14 @@ struct migrate_vma {
 int migrate_vma_setup(struct migrate_vma *args);
 void migrate_vma_pages(struct migrate_vma *migrate);
 void migrate_vma_finalize(struct migrate_vma *migrate);
+int next_demotion_node(int node);
+
+#else /* CONFIG_MIGRATION disabled: */
+
+static inline int next_demotion_node(int node)
+{
+       return NUMA_NO_NODE;
+}
 
 #endif /* CONFIG_MIGRATION */
 
index e59646a..ed2552c 100644 (file)
@@ -1216,18 +1216,10 @@ static inline void get_page(struct page *page)
 }
 
 bool __must_check try_grab_page(struct page *page, unsigned int flags);
-__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs,
-                                                  unsigned int flags);
+struct page *try_grab_compound_head(struct page *page, int refs,
+                                   unsigned int flags);
 
-
-static inline __must_check bool try_get_page(struct page *page)
-{
-       page = compound_head(page);
-       if (WARN_ON_ONCE(page_ref_count(page) <= 0))
-               return false;
-       page_ref_inc(page);
-       return true;
-}
+struct page *try_get_compound_head(struct page *page, int refs);
 
 static inline void put_page(struct page *page)
 {
@@ -1849,7 +1841,6 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
 struct kvec;
 int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
                        struct page **pages);
-int get_kernel_page(unsigned long start, int write, struct page **pages);
 struct page *get_dump_page(unsigned long addr);
 
 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
@@ -3121,7 +3112,7 @@ extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
-extern void shake_page(struct page *p, int access);
+extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
 
index fcb5355..1bd5f59 100644 (file)
@@ -846,6 +846,7 @@ typedef struct pglist_data {
        enum zone_type kcompactd_highest_zoneidx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
+       bool proactive_compact_trigger;
 #endif
        /*
         * This is a per-node reserve of pages that are not available
@@ -1342,7 +1343,6 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
                return NULL;
        return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
-extern unsigned long __section_nr(struct mem_section *ms);
 extern size_t mem_section_usage_size(void);
 
 /*
@@ -1365,7 +1365,7 @@ extern size_t mem_section_usage_size(void);
 #define SECTION_TAINT_ZONE_DEVICE      (1UL<<4)
 #define SECTION_MAP_LAST_BIT           (1UL<<5)
 #define SECTION_MAP_MASK               (~(SECTION_MAP_LAST_BIT-1))
-#define SECTION_NID_SHIFT              3
+#define SECTION_NID_SHIFT              6
 
 static inline struct page *__section_mem_map_addr(struct mem_section *section)
 {
index ed02aa5..5dcf446 100644 (file)
@@ -736,7 +736,7 @@ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
 /*
  * Fault everything in given userspace address range in.
  */
-static inline int fault_in_pages_writeable(char __user *uaddr, int size)
+static inline int fault_in_pages_writeable(char __user *uaddr, size_t size)
 {
        char __user *end = uaddr + size - 1;
 
@@ -763,7 +763,7 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size)
        return 0;
 }
 
-static inline int fault_in_pages_readable(const char __user *uaddr, int size)
+static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
 {
        volatile char c;
        const char __user *end = uaddr + size - 1;
index e24b1fe..5561486 100644 (file)
@@ -174,13 +174,13 @@ static inline gfp_t current_gfp_context(gfp_t flags)
 }
 
 #ifdef CONFIG_LOCKDEP
-extern void __fs_reclaim_acquire(void);
-extern void __fs_reclaim_release(void);
+extern void __fs_reclaim_acquire(unsigned long ip);
+extern void __fs_reclaim_release(unsigned long ip);
 extern void fs_reclaim_acquire(gfp_t gfp_mask);
 extern void fs_reclaim_release(gfp_t gfp_mask);
 #else
-static inline void __fs_reclaim_acquire(void) { }
-static inline void __fs_reclaim_release(void) { }
+static inline void __fs_reclaim_acquire(unsigned long ip) { }
+static inline void __fs_reclaim_release(unsigned long ip) { }
 static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
 static inline void fs_reclaim_release(gfp_t gfp_mask) { }
 #endif
@@ -306,7 +306,7 @@ set_active_memcg(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *old;
 
-       if (in_interrupt()) {
+       if (!in_task()) {
                old = this_cpu_read(int_active_memcg);
                this_cpu_write(int_active_memcg, memcg);
        } else {
index 8e775ce..166158b 100644 (file)
@@ -18,6 +18,7 @@ struct shmem_inode_info {
        unsigned long           flags;
        unsigned long           alloced;        /* data pages alloced to file */
        unsigned long           swapped;        /* subtotal assigned to swap */
+       pgoff_t                 fallocend;      /* highest fallocate endindex */
        struct list_head        shrinklist;     /* shrinkable hpage inodes */
        struct list_head        swaplist;       /* chain of maybes on swap */
        struct shared_policy    policy;         /* NUMA memory alloc policy */
@@ -31,7 +32,7 @@ struct shmem_sb_info {
        struct percpu_counter used_blocks;  /* How many are allocated */
        unsigned long max_inodes;   /* How many inodes are allowed */
        unsigned long free_inodes;  /* How many are left for allocation */
-       spinlock_t stat_lock;       /* Serialize shmem_sb_info changes */
+       raw_spinlock_t stat_lock;   /* Serialize shmem_sb_info changes */
        umode_t mode;               /* Mount mode for root directory */
        unsigned char huge;         /* Whether to try for hugepages */
        kuid_t uid;                 /* Mount uid for root directory */
@@ -85,7 +86,12 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 extern int shmem_unuse(unsigned int type, bool frontswap,
                       unsigned long *fs_pages_to_unuse);
 
-extern bool shmem_huge_enabled(struct vm_area_struct *vma);
+extern bool shmem_is_huge(struct vm_area_struct *vma,
+                         struct inode *inode, pgoff_t index);
+static inline bool shmem_huge_enabled(struct vm_area_struct *vma)
+{
+       return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff);
+}
 extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
 extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                pgoff_t start, pgoff_t end);
@@ -93,9 +99,8 @@ extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
 /* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
        SGP_READ,       /* don't exceed i_size, don't allocate page */
+       SGP_NOALLOC,    /* similar, but fail on hole or use fallocated page */
        SGP_CACHE,      /* don't exceed i_size, may allocate page */
-       SGP_NOHUGE,     /* like SGP_CACHE, but no huge pages */
-       SGP_HUGE,       /* like SGP_CACHE, huge pages preferred */
        SGP_WRITE,      /* may exceed i_size, may allocate !Uptodate page */
        SGP_FALLOC,     /* like SGP_WRITE, but make existing page Uptodate */
 };
@@ -119,6 +124,18 @@ static inline bool shmem_file(struct file *file)
        return shmem_mapping(file->f_mapping);
 }
 
+/*
+ * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages
+ * beyond i_size's notion of EOF, which fallocate has committed to reserving:
+ * which split_huge_page() must therefore not delete.  This use of a single
+ * "fallocend" per inode errs on the side of not deleting a reservation when
+ * in doubt: there are plenty of cases when it preserves unreserved pages.
+ */
+static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
+{
+       return max(eof, SHMEM_I(inode)->fallocend);
+}
+
 extern bool shmem_charge(struct inode *inode, long pages);
 extern void shmem_uncharge(struct inode *inode, long pages);
 
index 6f5a432..ba52f3a 100644 (file)
@@ -408,7 +408,7 @@ static inline bool node_reclaim_enabled(void)
 
 extern void check_move_unevictable_pages(struct pagevec *pvec);
 
-extern int kswapd_run(int nid);
+extern void kswapd_run(int nid);
 extern void kswapd_stop(int nid);
 
 #ifdef CONFIG_SWAP
@@ -721,7 +721,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 #endif
 
 #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
+extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
+static inline  void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+{
+       if (mem_cgroup_disabled())
+               return;
+       __cgroup_throttle_swaprate(page, gfp_mask);
+}
 #else
 static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
 {
@@ -730,8 +736,22 @@ static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
 
 #ifdef CONFIG_MEMCG_SWAP
 extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
-extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
-extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
+extern int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
+static inline int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+{
+       if (mem_cgroup_disabled())
+               return 0;
+       return __mem_cgroup_try_charge_swap(page, entry);
+}
+
+extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
+static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+{
+       if (mem_cgroup_disabled())
+               return;
+       __mem_cgroup_uncharge_swap(entry, nr_pages);
+}
+
 extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
 extern bool mem_cgroup_swap_full(struct page *page);
 #else
index 2b47584..60a3ab0 100644 (file)
@@ -915,6 +915,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len,
 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
 asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
                        size_t vlen, int behavior, unsigned int flags);
+asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
                        unsigned long prot, unsigned long pgoff,
                        unsigned long flags);
index 331d2cc..33cea48 100644 (file)
@@ -60,16 +60,16 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                            unsigned long src_start, unsigned long len,
-                           bool *mmap_changing, __u64 mode);
+                           atomic_t *mmap_changing, __u64 mode);
 extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
                              unsigned long dst_start,
                              unsigned long len,
-                             bool *mmap_changing);
+                             atomic_t *mmap_changing);
 extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
-                             unsigned long len, bool *mmap_changing);
+                             unsigned long len, atomic_t *mmap_changing);
 extern int mwriteprotect_range(struct mm_struct *dst_mm,
                               unsigned long start, unsigned long len,
-                              bool enable_wp, bool *mmap_changing);
+                              bool enable_wp, atomic_t *mmap_changing);
 
 /* mm helpers */
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
index ae0dd19..a185cc7 100644 (file)
@@ -33,6 +33,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                PGREUSE,
                PGSTEAL_KSWAPD,
                PGSTEAL_DIRECT,
+               PGDEMOTE_KSWAPD,
+               PGDEMOTE_DIRECT,
                PGSCAN_KSWAPD,
                PGSCAN_DIRECT,
                PGSCAN_DIRECT_THROTTLE,
index 6d28bc4..6a2f51e 100644 (file)
@@ -37,7 +37,7 @@ extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
 extern void vmpressure_init(struct vmpressure *vmpr);
 extern void vmpressure_cleanup(struct vmpressure *vmpr);
 extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
-extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
+extern struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr);
 extern int vmpressure_register_event(struct mem_cgroup *memcg,
                                     struct eventfd_ctx *eventfd,
                                     const char *args);
index 270677d..d1f65ad 100644 (file)
@@ -218,7 +218,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
 void wbc_detach_inode(struct writeback_control *wbc);
 void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
                              size_t bytes);
-int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages,
+int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
                           enum wb_reason reason, struct wb_completion *done);
 void cgroup_writeback_umount(void);
 bool cleanup_offline_cgwb(struct bdi_writeback *wb);
@@ -374,7 +374,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
-void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
+void wb_update_bandwidth(struct bdi_writeback *wb);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
 bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
index 9fb2a3b..779f3fa 100644 (file)
@@ -21,7 +21,8 @@
        EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind")              \
        EM( MR_NUMA_MISPLACED,  "numa_misplaced")               \
        EM( MR_CONTIG_RANGE,    "contig_range")                 \
-       EMe(MR_LONGTERM_PIN,    "longterm_pin")
+       EM( MR_LONGTERM_PIN,    "longterm_pin")                 \
+       EMe(MR_DEMOTION,        "demotion")
 
 /*
  * First define the enums in the above macros to be exported to userspace
index a9d6fcd..14c8fe8 100644 (file)
@@ -877,9 +877,11 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
 #define __NR_memfd_secret 447
 __SYSCALL(__NR_memfd_secret, sys_memfd_secret)
 #endif
+#define __NR_process_mrelease 448
+__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
 
 #undef __NR_syscalls
-#define __NR_syscalls 448
+#define __NR_syscalls 449
 
 /*
  * 32 bit systems traditionally used different
index 19a00bc..046d0cc 100644 (file)
@@ -22,6 +22,7 @@ enum {
        MPOL_BIND,
        MPOL_INTERLEAVE,
        MPOL_LOCAL,
+       MPOL_PREFERRED_MANY,
        MPOL_MAX,       /* always last member of enum */
 };
 
index 6810276..a0d0577 100644 (file)
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -147,7 +147,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
        key_t key = params->key;
        int msgflg = params->flg;
 
-       msq = kmalloc(sizeof(*msq), GFP_KERNEL);
+       msq = kmalloc(sizeof(*msq), GFP_KERNEL_ACCOUNT);
        if (unlikely(!msq))
                return -ENOMEM;
 
index 7bd0766..ae83f0f 100644 (file)
@@ -42,7 +42,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
                goto fail;
 
        err = -ENOMEM;
-       ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
+       ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT);
        if (ns == NULL)
                goto fail_dec;
 
index 971e75d..1a8b9f0 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -514,7 +514,7 @@ static struct sem_array *sem_alloc(size_t nsems)
        if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0]))
                return NULL;
 
-       sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL);
+       sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT);
        if (unlikely(!sma))
                return NULL;
 
@@ -1855,7 +1855,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp)
 
        undo_list = current->sysvsem.undo_list;
        if (!undo_list) {
-               undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
+               undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT);
                if (undo_list == NULL)
                        return -ENOMEM;
                spin_lock_init(&undo_list->lock);
@@ -1941,7 +1941,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 
        /* step 2: allocate new undo structure */
        new = kvzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
-                      GFP_KERNEL);
+                      GFP_KERNEL_ACCOUNT);
        if (!new) {
                ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
                return ERR_PTR(-ENOMEM);
@@ -2005,7 +2005,8 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops,
        if (nsops > ns->sc_semopm)
                return -E2BIG;
        if (nsops > SEMOPM_FAST) {
-               sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL);
+               sops = kvmalloc_array(nsops, sizeof(*sops),
+                                     GFP_KERNEL_ACCOUNT);
                if (sops == NULL)
                        return -ENOMEM;
        }
index 748933e..ab749be 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -619,7 +619,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
                        ns->shm_tot + numpages > ns->shm_ctlall)
                return -ENOSPC;
 
-       shp = kmalloc(sizeof(*shp), GFP_KERNEL);
+       shp = kmalloc(sizeof(*shp), GFP_KERNEL_ACCOUNT);
        if (unlikely(!shp))
                return -ENOMEM;
 
index f5e8828..0d5c298 100644 (file)
@@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
        struct cgroup_namespace *new_ns;
        int ret;
 
-       new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+       new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
        if (!new_ns)
                return ERR_PTR(-ENOMEM);
        ret = ns_alloc_inum(&new_ns->ns);
index abc01fc..eec72ca 100644 (file)
@@ -568,6 +568,6 @@ out:
 
 int __init nsproxy_cache_init(void)
 {
-       nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
+       nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC|SLAB_ACCOUNT);
        return 0;
 }
index ca43239..a46a372 100644 (file)
@@ -51,7 +51,8 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
        mutex_lock(&pid_caches_mutex);
        /* Name collision forces to do allocation under mutex. */
        if (!*pkc)
-               *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0);
+               *pkc = kmem_cache_create(name, len, 0,
+                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0);
        mutex_unlock(&pid_caches_mutex);
        /* current can fail, but someone else can succeed. */
        return READ_ONCE(*pkc);
@@ -449,7 +450,7 @@ const struct proc_ns_operations pidns_for_children_operations = {
 
 static __init int pid_namespaces_init(void)
 {
-       pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+       pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
 
 #ifdef CONFIG_CHECKPOINT_RESTORE
        register_sysctl_paths(kern_path, pid_ns_ctl_table);
index cf7e250..952741f 100644 (file)
@@ -4726,7 +4726,7 @@ void __init signals_init(void)
 {
        siginfo_buildtime_checks();
 
-       sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
+       sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT);
 }
 
 #ifdef CONFIG_KGDB_KDB
index cb6f98f..64578ad 100644 (file)
@@ -289,6 +289,7 @@ COND_SYSCALL(munlockall);
 COND_SYSCALL(mincore);
 COND_SYSCALL(madvise);
 COND_SYSCALL(process_madvise);
+COND_SYSCALL(process_mrelease);
 COND_SYSCALL(remap_file_pages);
 COND_SYSCALL(mbind);
 COND_SYSCALL_COMPAT(mbind);
index 25e49b4..083be6a 100644 (file)
@@ -2912,7 +2912,7 @@ static struct ctl_table vm_table[] = {
                .data           = &sysctl_compaction_proactiveness,
                .maxlen         = sizeof(sysctl_compaction_proactiveness),
                .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
+               .proc_handler   = compaction_proactiveness_sysctl_handler,
                .extra1         = SYSCTL_ZERO,
                .extra2         = &one_hundred,
        },
index 12eab0d..aec8328 100644 (file)
@@ -88,13 +88,13 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
                goto fail;
 
        err = -ENOMEM;
-       ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+       ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
        if (!ns)
                goto fail_dec;
 
        refcount_set(&ns->ns.count, 1);
 
-       ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!ns->vvar_page)
                goto fail_free;
 
index 3913222..1cd10b1 100644 (file)
@@ -273,8 +273,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
 static __init int init_posix_timers(void)
 {
        posix_timers_cache = kmem_cache_create("posix_timers_cache",
-                                       sizeof (struct k_itimer), 0, SLAB_PANIC,
-                                       NULL);
+                                       sizeof(struct k_itimer), 0,
+                                       SLAB_PANIC | SLAB_ACCOUNT, NULL);
        return 0;
 }
 __initcall(init_posix_timers);
index ef82d40..6b2e3ca 100644 (file)
@@ -1385,7 +1385,7 @@ const struct proc_ns_operations userns_operations = {
 
 static __init int user_namespaces_init(void)
 {
-       user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+       user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
        return 0;
 }
 subsys_initcall(user_namespaces_init);
index f4b1ff7..abb3432 100644 (file)
@@ -918,9 +918,8 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
                miter->__offset += miter->consumed;
                miter->__remaining -= miter->consumed;
 
-               if ((miter->__flags & SG_MITER_TO_SG) &&
-                   !PageSlab(miter->page))
-                       flush_kernel_dcache_page(miter->page);
+               if (miter->__flags & SG_MITER_TO_SG)
+                       flush_dcache_page(miter->page);
 
                if (miter->__flags & SG_MITER_ATOMIC) {
                        WARN_ON_ONCE(preemptible());
index 8be9d4b..8835e07 100644 (file)
@@ -120,12 +120,28 @@ static void kasan_test_exit(struct kunit *test)
 static void kmalloc_oob_right(struct kunit *test)
 {
        char *ptr;
-       size_t size = 123;
+       size_t size = 128 - KASAN_GRANULE_SIZE - 5;
 
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 'x');
+       /*
+        * An unaligned access past the requested kmalloc size.
+        * Only generic KASAN can precisely detect these.
+        */
+       if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+               KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x');
+
+       /*
+        * An aligned access into the first out-of-bounds granule that falls
+        * within the aligned kmalloc object.
+        */
+       KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y');
+
+       /* Out-of-bounds access past the aligned kmalloc object. */
+       KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] =
+                                       ptr[size + KASAN_GRANULE_SIZE + 5]);
+
        kfree(ptr);
 }
 
@@ -149,7 +165,7 @@ static void kmalloc_node_oob_right(struct kunit *test)
        ptr = kmalloc_node(size, GFP_KERNEL, 0);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
+       KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
        kfree(ptr);
 }
 
@@ -185,7 +201,7 @@ static void kmalloc_pagealloc_uaf(struct kunit *test)
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
        kfree(ptr);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0);
+       KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
 }
 
 static void kmalloc_pagealloc_invalid_free(struct kunit *test)
@@ -219,7 +235,7 @@ static void pagealloc_oob_right(struct kunit *test)
        ptr = page_address(pages);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
+       KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
        free_pages((unsigned long)ptr, order);
 }
 
@@ -234,7 +250,7 @@ static void pagealloc_uaf(struct kunit *test)
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
        free_pages((unsigned long)ptr, order);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0);
+       KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
 }
 
 static void kmalloc_large_oob_right(struct kunit *test)
@@ -410,64 +426,70 @@ static void kmalloc_uaf_16(struct kunit *test)
        kfree(ptr1);
 }
 
+/*
+ * Note: in the memset tests below, the written range touches both valid and
+ * invalid memory. This makes sure that the instrumentation does not only check
+ * the starting address but the whole range.
+ */
+
 static void kmalloc_oob_memset_2(struct kunit *test)
 {
        char *ptr;
-       size_t size = 8;
+       size_t size = 128 - KASAN_GRANULE_SIZE;
 
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 7 + OOB_TAG_OFF, 0, 2));
+       KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2));
        kfree(ptr);
 }
 
 static void kmalloc_oob_memset_4(struct kunit *test)
 {
        char *ptr;
-       size_t size = 8;
+       size_t size = 128 - KASAN_GRANULE_SIZE;
 
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 5 + OOB_TAG_OFF, 0, 4));
+       KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4));
        kfree(ptr);
 }
 
-
 static void kmalloc_oob_memset_8(struct kunit *test)
 {
        char *ptr;
-       size_t size = 8;
+       size_t size = 128 - KASAN_GRANULE_SIZE;
 
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 8));
+       KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8));
        kfree(ptr);
 }
 
 static void kmalloc_oob_memset_16(struct kunit *test)
 {
        char *ptr;
-       size_t size = 16;
+       size_t size = 128 - KASAN_GRANULE_SIZE;
 
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + 1 + OOB_TAG_OFF, 0, 16));
+       KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16));
        kfree(ptr);
 }
 
 static void kmalloc_oob_in_memset(struct kunit *test)
 {
        char *ptr;
-       size_t size = 666;
+       size_t size = 128 - KASAN_GRANULE_SIZE;
 
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
-       KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size + 5 + OOB_TAG_OFF));
+       KUNIT_EXPECT_KASAN_FAIL(test,
+                               memset(ptr, 0, size + KASAN_GRANULE_SIZE));
        kfree(ptr);
 }
 
@@ -477,11 +499,17 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
        size_t size = 64;
        volatile size_t invalid_size = -2;
 
+       /*
+        * Hardware tag-based mode doesn't check memmove for negative size.
+        * As a result, this test introduces a side-effect memory corruption,
+        * which can result in a crash.
+        */
+       KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS);
+
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
        memset((char *)ptr, 0, 64);
-
        KUNIT_EXPECT_KASAN_FAIL(test,
                memmove((char *)ptr, (char *)ptr + 4, invalid_size));
        kfree(ptr);
@@ -496,7 +524,7 @@ static void kmalloc_uaf(struct kunit *test)
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
        kfree(ptr);
-       KUNIT_EXPECT_KASAN_FAIL(test, *(ptr + 8) = 'x');
+       KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]);
 }
 
 static void kmalloc_uaf_memset(struct kunit *test)
@@ -504,6 +532,12 @@ static void kmalloc_uaf_memset(struct kunit *test)
        char *ptr;
        size_t size = 33;
 
+       /*
+        * Only generic KASAN uses quarantine, which is required to avoid a
+        * kernel memory corruption this test causes.
+        */
+       KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
        ptr = kmalloc(size, GFP_KERNEL);
        KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
 
@@ -535,7 +569,7 @@ again:
                goto again;
        }
 
-       KUNIT_EXPECT_KASAN_FAIL(test, ptr1[40] = 'x');
+       KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]);
        KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2);
 
        kfree(ptr2);
@@ -682,7 +716,7 @@ static void ksize_unpoisons_memory(struct kunit *test)
        ptr[size] = 'x';
 
        /* This one must. */
-       KUNIT_EXPECT_KASAN_FAIL(test, ptr[real_size] = 'y');
+       KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size]);
 
        kfree(ptr);
 }
@@ -701,8 +735,8 @@ static void ksize_uaf(struct kunit *test)
        kfree(ptr);
 
        KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
-       KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *ptr);
-       KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *(ptr + size));
+       KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+       KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
 }
 
 static void kasan_stack_oob(struct kunit *test)
index f1017f3..7ebf433 100644 (file)
 
 #include "../mm/kasan/kasan.h"
 
-#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
-
 static noinline void __init copy_user_test(void)
 {
        char *kmem;
        char __user *usermem;
-       size_t size = 10;
+       size_t size = 128 - KASAN_GRANULE_SIZE;
        int __maybe_unused unused;
 
        kmem = kmalloc(size, GFP_KERNEL);
@@ -38,25 +36,25 @@ static noinline void __init copy_user_test(void)
        }
 
        pr_info("out-of-bounds in copy_from_user()\n");
-       unused = copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
+       unused = copy_from_user(kmem, usermem, size + 1);
 
        pr_info("out-of-bounds in copy_to_user()\n");
-       unused = copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
+       unused = copy_to_user(usermem, kmem, size + 1);
 
        pr_info("out-of-bounds in __copy_from_user()\n");
-       unused = __copy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
+       unused = __copy_from_user(kmem, usermem, size + 1);
 
        pr_info("out-of-bounds in __copy_to_user()\n");
-       unused = __copy_to_user(usermem, kmem, size + 1 + OOB_TAG_OFF);
+       unused = __copy_to_user(usermem, kmem, size + 1);
 
        pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
-       unused = __copy_from_user_inatomic(kmem, usermem, size + 1 + OOB_TAG_OFF);
+       unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
 
        pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
-       unused = __copy_to_user_inatomic(usermem, kmem, size + 1 + OOB_TAG_OFF);
+       unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
 
        pr_info("out-of-bounds in strncpy_from_user()\n");
-       unused = strncpy_from_user(kmem, usermem, size + 1 + OOB_TAG_OFF);
+       unused = strncpy_from_user(kmem, usermem, size + 1);
 
        vm_munmap((unsigned long)usermem, PAGE_SIZE);
        kfree(kmem);
@@ -73,7 +71,7 @@ static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp)
                                                struct kasan_rcu_info, rcu);
 
        kfree(fp);
-       fp->i = 1;
+       ((volatile struct kasan_rcu_info *)fp)->i;
 }
 
 static noinline void __init kasan_rcu_uaf(void)
index 01e9543..e14993b 100644 (file)
@@ -35,6 +35,9 @@ __param(int, test_repeat_count, 1,
 __param(int, test_loop_count, 1000000,
        "Set test loop counter");
 
+__param(int, nr_pages, 0,
+       "Set number of pages for fix_size_alloc_test(default: 1)");
+
 __param(int, run_test_mask, INT_MAX,
        "Set tests specified in the mask.\n\n"
                "\t\tid: 1,    name: fix_size_alloc_test\n"
@@ -262,7 +265,7 @@ static int fix_size_alloc_test(void)
        int i;
 
        for (i = 0; i < test_loop_count; i++) {
-               ptr = vmalloc(3 * PAGE_SIZE);
+               ptr = vmalloc((nr_pages > 0 ? nr_pages:1) * PAGE_SIZE);
 
                if (!ptr)
                        return -1;
index cd06dca..4a9d4e2 100644 (file)
@@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
        spin_unlock_bh(&wb->work_lock);
 }
 
+static void wb_update_bandwidth_workfn(struct work_struct *work)
+{
+       struct bdi_writeback *wb = container_of(to_delayed_work(work),
+                                               struct bdi_writeback, bw_dwork);
+
+       wb_update_bandwidth(wb);
+}
+
 /*
  * Initial write bandwidth: 100 MB/s
  */
@@ -293,6 +301,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
        INIT_LIST_HEAD(&wb->b_dirty_time);
        spin_lock_init(&wb->list_lock);
 
+       atomic_set(&wb->writeback_inodes, 0);
        wb->bw_time_stamp = jiffies;
        wb->balanced_dirty_ratelimit = INIT_BW;
        wb->dirty_ratelimit = INIT_BW;
@@ -302,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
        spin_lock_init(&wb->work_lock);
        INIT_LIST_HEAD(&wb->work_list);
        INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+       INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
        wb->dirty_sleep = jiffies;
 
        err = fprop_local_init_percpu(&wb->completions, gfp);
@@ -350,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb)
        mod_delayed_work(bdi_wq, &wb->dwork, 0);
        flush_delayed_work(&wb->dwork);
        WARN_ON(!list_empty(&wb->work_list));
+       flush_delayed_work(&wb->bw_dwork);
 }
 
 static void wb_exit(struct bdi_writeback *wb)
index 5b152db..f03f42f 100644 (file)
@@ -39,7 +39,7 @@ void put_page_bootmem(struct page *page)
 }
 
 #ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void register_page_bootmem_info_section(unsigned long start_pfn)
+static void __init register_page_bootmem_info_section(unsigned long start_pfn)
 {
        unsigned long mapsize, section_nr, i;
        struct mem_section *ms;
@@ -74,7 +74,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 
 }
 #else /* CONFIG_SPARSEMEM_VMEMMAP */
-static void register_page_bootmem_info_section(unsigned long start_pfn)
+static void __init register_page_bootmem_info_section(unsigned long start_pfn)
 {
        unsigned long mapsize, section_nr, i;
        struct mem_section *ms;
index 621508e..fa9b2b5 100644 (file)
@@ -2398,7 +2398,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
 
                err = migrate_pages(&cc->migratepages, compaction_alloc,
                                compaction_free, (unsigned long)cc, cc->mode,
-                               MR_COMPACTION);
+                               MR_COMPACTION, NULL);
 
                trace_mm_compaction_migratepages(cc->nr_migratepages, err,
                                                        &cc->migratepages);
@@ -2706,6 +2706,30 @@ static void compact_nodes(void)
  */
 unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
 
+int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
+               void *buffer, size_t *length, loff_t *ppos)
+{
+       int rc, nid;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
+       if (write && sysctl_compaction_proactiveness) {
+               for_each_online_node(nid) {
+                       pg_data_t *pgdat = NODE_DATA(nid);
+
+                       if (pgdat->proactive_compact_trigger)
+                               continue;
+
+                       pgdat->proactive_compact_trigger = true;
+                       wake_up_interruptible(&pgdat->kcompactd_wait);
+               }
+       }
+
+       return 0;
+}
+
 /*
  * This is the entry point for compacting all nodes via
  * /proc/sys/vm/compact_memory
@@ -2750,7 +2774,8 @@ void compaction_unregister_node(struct node *node)
 
 static inline bool kcompactd_work_requested(pg_data_t *pgdat)
 {
-       return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+       return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
+               pgdat->proactive_compact_trigger;
 }
 
 static bool kcompactd_node_suitable(pg_data_t *pgdat)
@@ -2885,7 +2910,8 @@ static int kcompactd(void *p)
 {
        pg_data_t *pgdat = (pg_data_t *)p;
        struct task_struct *tsk = current;
-       unsigned int proactive_defer = 0;
+       long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
+       long timeout = default_timeout;
 
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
@@ -2900,25 +2926,39 @@ static int kcompactd(void *p)
        while (!kthread_should_stop()) {
                unsigned long pflags;
 
+               /*
+                * Avoid the unnecessary wakeup for proactive compaction
+                * when it is disabled.
+                */
+               if (!sysctl_compaction_proactiveness)
+                       timeout = MAX_SCHEDULE_TIMEOUT;
                trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
                if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
-                       kcompactd_work_requested(pgdat),
-                       msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+                       kcompactd_work_requested(pgdat), timeout) &&
+                       !pgdat->proactive_compact_trigger) {
 
                        psi_memstall_enter(&pflags);
                        kcompactd_do_work(pgdat);
                        psi_memstall_leave(&pflags);
+                       /*
+                        * Reset the timeout value. The defer timeout from
+                        * proactive compaction is lost here, but that is fine:
+                        * when the condition of the zone changes substantially,
+                        * carrying on with the previous defer interval is
+                        * not useful.
+                        */
+                       timeout = default_timeout;
                        continue;
                }
 
-               /* kcompactd wait timeout */
+               /*
+                * Start the proactive work with default timeout. Based
+                * on the fragmentation score, this timeout is updated.
+                */
+               timeout = default_timeout;
                if (should_proactive_compact_node(pgdat)) {
                        unsigned int prev_score, score;
 
-                       if (proactive_defer) {
-                               proactive_defer--;
-                               continue;
-                       }
                        prev_score = fragmentation_score_node(pgdat);
                        proactive_compact_node(pgdat);
                        score = fragmentation_score_node(pgdat);
@@ -2926,9 +2966,12 @@ static int kcompactd(void *p)
                         * Defer proactive compaction if the fragmentation
                         * score did not go down i.e. no progress made.
                         */
-                       proactive_defer = score < prev_score ?
-                                       0 : 1 << COMPACT_MAX_DEFER_SHIFT;
+                       if (unlikely(score >= prev_score))
+                               timeout =
+                                  default_timeout << COMPACT_MAX_DEFER_SHIFT;
                }
+               if (unlikely(pgdat->proactive_compact_trigger))
+                       pgdat->proactive_compact_trigger = false;
        }
 
        return 0;
index 1c92269..1403639 100644 (file)
@@ -29,6 +29,8 @@
 #include <linux/start_kernel.h>
 #include <linux/sched/mm.h>
 #include <linux/io.h>
+
+#include <asm/cacheflush.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
 #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
 #define RANDOM_NZVALUE GENMASK(7, 0)
 
-static void __init pte_basic_tests(unsigned long pfn, int idx)
+struct pgtable_debug_args {
+       struct mm_struct        *mm;
+       struct vm_area_struct   *vma;
+
+       pgd_t                   *pgdp;
+       p4d_t                   *p4dp;
+       pud_t                   *pudp;
+       pmd_t                   *pmdp;
+       pte_t                   *ptep;
+
+       p4d_t                   *start_p4dp;
+       pud_t                   *start_pudp;
+       pmd_t                   *start_pmdp;
+       pgtable_t               start_ptep;
+
+       unsigned long           vaddr;
+       pgprot_t                page_prot;
+       pgprot_t                page_prot_none;
+
+       bool                    is_contiguous_page;
+       unsigned long           pud_pfn;
+       unsigned long           pmd_pfn;
+       unsigned long           pte_pfn;
+
+       unsigned long           fixed_pgd_pfn;
+       unsigned long           fixed_p4d_pfn;
+       unsigned long           fixed_pud_pfn;
+       unsigned long           fixed_pmd_pfn;
+       unsigned long           fixed_pte_pfn;
+};
+
+static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
 {
        pgprot_t prot = protection_map[idx];
-       pte_t pte = pfn_pte(pfn, prot);
+       pte_t pte = pfn_pte(args->fixed_pte_pfn, prot);
        unsigned long val = idx, *ptr = &val;
 
        pr_debug("Validating PTE basic (%pGv)\n", ptr);
@@ -86,53 +119,63 @@ static void __init pte_basic_tests(unsigned long pfn, int idx)
        WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
 }
 
-static void __init pte_advanced_tests(struct mm_struct *mm,
-                                     struct vm_area_struct *vma, pte_t *ptep,
-                                     unsigned long pfn, unsigned long vaddr,
-                                     pgprot_t prot)
+static void __init pte_advanced_tests(struct pgtable_debug_args *args)
 {
+       struct page *page;
        pte_t pte;
 
        /*
         * Architectures optimize set_pte_at by avoiding TLB flush.
         * This requires set_pte_at to be not used to update an
         * existing pte entry. Clear pte before we do set_pte_at
+        *
+        * flush_dcache_page() is called after set_pte_at() to clear
+        * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+        * when it's released and page allocation check will fail when
+        * the page is allocated again. For architectures other than ARM64,
+        * the unexpected overhead of cache flushing is acceptable.
         */
+       page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL;
+       if (!page)
+               return;
 
        pr_debug("Validating PTE advanced\n");
-       pte = pfn_pte(pfn, prot);
-       set_pte_at(mm, vaddr, ptep, pte);
-       ptep_set_wrprotect(mm, vaddr, ptep);
-       pte = ptep_get(ptep);
+       pte = pfn_pte(args->pte_pfn, args->page_prot);
+       set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+       flush_dcache_page(page);
+       ptep_set_wrprotect(args->mm, args->vaddr, args->ptep);
+       pte = ptep_get(args->ptep);
        WARN_ON(pte_write(pte));
-       ptep_get_and_clear(mm, vaddr, ptep);
-       pte = ptep_get(ptep);
+       ptep_get_and_clear(args->mm, args->vaddr, args->ptep);
+       pte = ptep_get(args->ptep);
        WARN_ON(!pte_none(pte));
 
-       pte = pfn_pte(pfn, prot);
+       pte = pfn_pte(args->pte_pfn, args->page_prot);
        pte = pte_wrprotect(pte);
        pte = pte_mkclean(pte);
-       set_pte_at(mm, vaddr, ptep, pte);
+       set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+       flush_dcache_page(page);
        pte = pte_mkwrite(pte);
        pte = pte_mkdirty(pte);
-       ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
-       pte = ptep_get(ptep);
+       ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1);
+       pte = ptep_get(args->ptep);
        WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
-       ptep_get_and_clear_full(mm, vaddr, ptep, 1);
-       pte = ptep_get(ptep);
+       ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1);
+       pte = ptep_get(args->ptep);
        WARN_ON(!pte_none(pte));
 
-       pte = pfn_pte(pfn, prot);
+       pte = pfn_pte(args->pte_pfn, args->page_prot);
        pte = pte_mkyoung(pte);
-       set_pte_at(mm, vaddr, ptep, pte);
-       ptep_test_and_clear_young(vma, vaddr, ptep);
-       pte = ptep_get(ptep);
+       set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+       flush_dcache_page(page);
+       ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep);
+       pte = ptep_get(args->ptep);
        WARN_ON(pte_young(pte));
 }
 
-static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_savedwrite_tests(struct pgtable_debug_args *args)
 {
-       pte_t pte = pfn_pte(pfn, prot);
+       pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none);
 
        if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
                return;
@@ -143,7 +186,7 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_basic_tests(unsigned long pfn, int idx)
+static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
 {
        pgprot_t prot = protection_map[idx];
        unsigned long val = idx, *ptr = &val;
@@ -153,7 +196,7 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx)
                return;
 
        pr_debug("Validating PMD basic (%pGv)\n", ptr);
-       pmd = pfn_pmd(pfn, prot);
+       pmd = pfn_pmd(args->fixed_pmd_pfn, prot);
 
        /*
         * This test needs to be executed after the given page table entry
@@ -181,57 +224,70 @@ static void __init pmd_basic_tests(unsigned long pfn, int idx)
        WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
 }
 
-static void __init pmd_advanced_tests(struct mm_struct *mm,
-                                     struct vm_area_struct *vma, pmd_t *pmdp,
-                                     unsigned long pfn, unsigned long vaddr,
-                                     pgprot_t prot, pgtable_t pgtable)
+static void __init pmd_advanced_tests(struct pgtable_debug_args *args)
 {
+       struct page *page;
        pmd_t pmd;
+       unsigned long vaddr = args->vaddr;
 
        if (!has_transparent_hugepage())
                return;
 
+       page = (args->pmd_pfn != ULONG_MAX) ? pfn_to_page(args->pmd_pfn) : NULL;
+       if (!page)
+               return;
+
+       /*
+        * flush_dcache_page() is called after set_pmd_at() to clear
+        * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+        * when it's released and page allocation check will fail when
+        * the page is allocated again. For architectures other than ARM64,
+        * the unexpected overhead of cache flushing is acceptable.
+        */
        pr_debug("Validating PMD advanced\n");
        /* Align the address wrt HPAGE_PMD_SIZE */
        vaddr &= HPAGE_PMD_MASK;
 
-       pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+       pgtable_trans_huge_deposit(args->mm, args->pmdp, args->start_ptep);
 
-       pmd = pfn_pmd(pfn, prot);
-       set_pmd_at(mm, vaddr, pmdp, pmd);
-       pmdp_set_wrprotect(mm, vaddr, pmdp);
-       pmd = READ_ONCE(*pmdp);
+       pmd = pfn_pmd(args->pmd_pfn, args->page_prot);
+       set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+       flush_dcache_page(page);
+       pmdp_set_wrprotect(args->mm, vaddr, args->pmdp);
+       pmd = READ_ONCE(*args->pmdp);
        WARN_ON(pmd_write(pmd));
-       pmdp_huge_get_and_clear(mm, vaddr, pmdp);
-       pmd = READ_ONCE(*pmdp);
+       pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp);
+       pmd = READ_ONCE(*args->pmdp);
        WARN_ON(!pmd_none(pmd));
 
-       pmd = pfn_pmd(pfn, prot);
+       pmd = pfn_pmd(args->pmd_pfn, args->page_prot);
        pmd = pmd_wrprotect(pmd);
        pmd = pmd_mkclean(pmd);
-       set_pmd_at(mm, vaddr, pmdp, pmd);
+       set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+       flush_dcache_page(page);
        pmd = pmd_mkwrite(pmd);
        pmd = pmd_mkdirty(pmd);
-       pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
-       pmd = READ_ONCE(*pmdp);
+       pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1);
+       pmd = READ_ONCE(*args->pmdp);
        WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
-       pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
-       pmd = READ_ONCE(*pmdp);
+       pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1);
+       pmd = READ_ONCE(*args->pmdp);
        WARN_ON(!pmd_none(pmd));
 
-       pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+       pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot));
        pmd = pmd_mkyoung(pmd);
-       set_pmd_at(mm, vaddr, pmdp, pmd);
-       pmdp_test_and_clear_young(vma, vaddr, pmdp);
-       pmd = READ_ONCE(*pmdp);
+       set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
+       flush_dcache_page(page);
+       pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp);
+       pmd = READ_ONCE(*args->pmdp);
        WARN_ON(pmd_young(pmd));
 
        /*  Clear the pte entries  */
-       pmdp_huge_get_and_clear(mm, vaddr, pmdp);
-       pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
+       pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp);
+       pgtable_trans_huge_withdraw(args->mm, args->pmdp);
 }
 
-static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_leaf_tests(struct pgtable_debug_args *args)
 {
        pmd_t pmd;
 
@@ -239,7 +295,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PMD leaf\n");
-       pmd = pfn_pmd(pfn, prot);
+       pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
 
        /*
         * PMD based THP is a leaf entry.
@@ -248,7 +304,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
        WARN_ON(!pmd_leaf(pmd));
 }
 
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args)
 {
        pmd_t pmd;
 
@@ -259,13 +315,13 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PMD saved write\n");
-       pmd = pfn_pmd(pfn, prot);
+       pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none);
        WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
        WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx)
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx)
 {
        pgprot_t prot = protection_map[idx];
        unsigned long val = idx, *ptr = &val;
@@ -275,7 +331,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int
                return;
 
        pr_debug("Validating PUD basic (%pGv)\n", ptr);
-       pud = pfn_pud(pfn, prot);
+       pud = pfn_pud(args->fixed_pud_pfn, prot);
 
        /*
         * This test needs to be executed after the given page table entry
@@ -296,7 +352,7 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int
        WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud))));
        WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud))));
 
-       if (mm_pmd_folded(mm))
+       if (mm_pmd_folded(args->mm))
                return;
 
        /*
@@ -306,58 +362,71 @@ static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int
        WARN_ON(!pud_bad(pud_mkhuge(pud)));
 }
 
-static void __init pud_advanced_tests(struct mm_struct *mm,
-                                     struct vm_area_struct *vma, pud_t *pudp,
-                                     unsigned long pfn, unsigned long vaddr,
-                                     pgprot_t prot)
+static void __init pud_advanced_tests(struct pgtable_debug_args *args)
 {
+       struct page *page;
+       unsigned long vaddr = args->vaddr;
        pud_t pud;
 
        if (!has_transparent_hugepage())
                return;
 
+       page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL;
+       if (!page)
+               return;
+
+       /*
+        * flush_dcache_page() is called after set_pud_at() to clear
+        * PG_arch_1 for the page on ARM64. The page flag isn't cleared
+        * when it's released and page allocation check will fail when
+        * the page is allocated again. For architectures other than ARM64,
+        * the unexpected overhead of cache flushing is acceptable.
+        */
        pr_debug("Validating PUD advanced\n");
        /* Align the address wrt HPAGE_PUD_SIZE */
        vaddr &= HPAGE_PUD_MASK;
 
-       pud = pfn_pud(pfn, prot);
-       set_pud_at(mm, vaddr, pudp, pud);
-       pudp_set_wrprotect(mm, vaddr, pudp);
-       pud = READ_ONCE(*pudp);
+       pud = pfn_pud(args->pud_pfn, args->page_prot);
+       set_pud_at(args->mm, vaddr, args->pudp, pud);
+       flush_dcache_page(page);
+       pudp_set_wrprotect(args->mm, vaddr, args->pudp);
+       pud = READ_ONCE(*args->pudp);
        WARN_ON(pud_write(pud));
 
 #ifndef __PAGETABLE_PMD_FOLDED
-       pudp_huge_get_and_clear(mm, vaddr, pudp);
-       pud = READ_ONCE(*pudp);
+       pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
+       pud = READ_ONCE(*args->pudp);
        WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
-       pud = pfn_pud(pfn, prot);
+       pud = pfn_pud(args->pud_pfn, args->page_prot);
        pud = pud_wrprotect(pud);
        pud = pud_mkclean(pud);
-       set_pud_at(mm, vaddr, pudp, pud);
+       set_pud_at(args->mm, vaddr, args->pudp, pud);
+       flush_dcache_page(page);
        pud = pud_mkwrite(pud);
        pud = pud_mkdirty(pud);
-       pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
-       pud = READ_ONCE(*pudp);
+       pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1);
+       pud = READ_ONCE(*args->pudp);
        WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
 
 #ifndef __PAGETABLE_PMD_FOLDED
-       pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
-       pud = READ_ONCE(*pudp);
+       pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1);
+       pud = READ_ONCE(*args->pudp);
        WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-       pud = pfn_pud(pfn, prot);
+       pud = pfn_pud(args->pud_pfn, args->page_prot);
        pud = pud_mkyoung(pud);
-       set_pud_at(mm, vaddr, pudp, pud);
-       pudp_test_and_clear_young(vma, vaddr, pudp);
-       pud = READ_ONCE(*pudp);
+       set_pud_at(args->mm, vaddr, args->pudp, pud);
+       flush_dcache_page(page);
+       pudp_test_and_clear_young(args->vma, vaddr, args->pudp);
+       pud = READ_ONCE(*args->pudp);
        WARN_ON(pud_young(pud));
 
-       pudp_huge_get_and_clear(mm, vaddr, pudp);
+       pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
 }
 
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_leaf_tests(struct pgtable_debug_args *args)
 {
        pud_t pud;
 
@@ -365,7 +434,7 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PUD leaf\n");
-       pud = pfn_pud(pfn, prot);
+       pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
        /*
         * PUD based THP is a leaf entry.
         */
@@ -373,41 +442,26 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
        WARN_ON(!pud_leaf(pud));
 }
 #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
-static void __init pud_advanced_tests(struct mm_struct *mm,
-                                     struct vm_area_struct *vma, pud_t *pudp,
-                                     unsigned long pfn, unsigned long vaddr,
-                                     pgprot_t prot)
-{
-}
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pud_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pud_leaf_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
-static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
-static void __init pmd_advanced_tests(struct mm_struct *mm,
-                                     struct vm_area_struct *vma, pmd_t *pmdp,
-                                     unsigned long pfn, unsigned long vaddr,
-                                     pgprot_t prot, pgtable_t pgtable)
-{
-}
-static void __init pud_advanced_tests(struct mm_struct *mm,
-                                     struct vm_area_struct *vma, pud_t *pudp,
-                                     unsigned long pfn, unsigned long vaddr,
-                                     pgprot_t prot)
-{
-}
-static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { }
+static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pud_advanced_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { }
+static void __init pud_leaf_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+static void __init pmd_huge_tests(struct pgtable_debug_args *args)
 {
        pmd_t pmd;
 
-       if (!arch_vmap_pmd_supported(prot))
+       if (!arch_vmap_pmd_supported(args->page_prot))
                return;
 
        pr_debug("Validating PMD huge\n");
@@ -415,18 +469,18 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
         * X86 defined pmd_set_huge() verifies that the given
         * PMD is not a populated non-leaf entry.
         */
-       WRITE_ONCE(*pmdp, __pmd(0));
-       WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
-       WARN_ON(!pmd_clear_huge(pmdp));
-       pmd = READ_ONCE(*pmdp);
+       WRITE_ONCE(*args->pmdp, __pmd(0));
+       WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot));
+       WARN_ON(!pmd_clear_huge(args->pmdp));
+       pmd = READ_ONCE(*args->pmdp);
        WARN_ON(!pmd_none(pmd));
 }
 
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+static void __init pud_huge_tests(struct pgtable_debug_args *args)
 {
        pud_t pud;
 
-       if (!arch_vmap_pud_supported(prot))
+       if (!arch_vmap_pud_supported(args->page_prot))
                return;
 
        pr_debug("Validating PUD huge\n");
@@ -434,18 +488,18 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
         * X86 defined pud_set_huge() verifies that the given
         * PUD is not a populated non-leaf entry.
         */
-       WRITE_ONCE(*pudp, __pud(0));
-       WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
-       WARN_ON(!pud_clear_huge(pudp));
-       pud = READ_ONCE(*pudp);
+       WRITE_ONCE(*args->pudp, __pud(0));
+       WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot));
+       WARN_ON(!pud_clear_huge(args->pudp));
+       pud = READ_ONCE(*args->pudp);
        WARN_ON(!pud_none(pud));
 }
 #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_huge_tests(struct pgtable_debug_args *args) { }
+static void __init pud_huge_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
-static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init p4d_basic_tests(struct pgtable_debug_args *args)
 {
        p4d_t p4d;
 
@@ -454,7 +508,7 @@ static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
        WARN_ON(!p4d_same(p4d, p4d));
 }
 
-static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pgd_basic_tests(struct pgtable_debug_args *args)
 {
        pgd_t pgd;
 
@@ -464,27 +518,26 @@ static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
 }
 
 #ifndef __PAGETABLE_PUD_FOLDED
-static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
+static void __init pud_clear_tests(struct pgtable_debug_args *args)
 {
-       pud_t pud = READ_ONCE(*pudp);
+       pud_t pud = READ_ONCE(*args->pudp);
 
-       if (mm_pmd_folded(mm))
+       if (mm_pmd_folded(args->mm))
                return;
 
        pr_debug("Validating PUD clear\n");
        pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
-       WRITE_ONCE(*pudp, pud);
-       pud_clear(pudp);
-       pud = READ_ONCE(*pudp);
+       WRITE_ONCE(*args->pudp, pud);
+       pud_clear(args->pudp);
+       pud = READ_ONCE(*args->pudp);
        WARN_ON(!pud_none(pud));
 }
 
-static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
-                                     pmd_t *pmdp)
+static void __init pud_populate_tests(struct pgtable_debug_args *args)
 {
        pud_t pud;
 
-       if (mm_pmd_folded(mm))
+       if (mm_pmd_folded(args->mm))
                return;
 
        pr_debug("Validating PUD populate\n");
@@ -492,40 +545,36 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
         * This entry points to next level page table page.
         * Hence this must not qualify as pud_bad().
         */
-       pud_populate(mm, pudp, pmdp);
-       pud = READ_ONCE(*pudp);
+       pud_populate(args->mm, args->pudp, args->start_pmdp);
+       pud = READ_ONCE(*args->pudp);
        WARN_ON(pud_bad(pud));
 }
 #else  /* !__PAGETABLE_PUD_FOLDED */
-static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { }
-static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
-                                     pmd_t *pmdp)
-{
-}
+static void __init pud_clear_tests(struct pgtable_debug_args *args) { }
+static void __init pud_populate_tests(struct pgtable_debug_args *args) { }
 #endif /* PAGETABLE_PUD_FOLDED */
 
 #ifndef __PAGETABLE_P4D_FOLDED
-static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
+static void __init p4d_clear_tests(struct pgtable_debug_args *args)
 {
-       p4d_t p4d = READ_ONCE(*p4dp);
+       p4d_t p4d = READ_ONCE(*args->p4dp);
 
-       if (mm_pud_folded(mm))
+       if (mm_pud_folded(args->mm))
                return;
 
        pr_debug("Validating P4D clear\n");
        p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
-       WRITE_ONCE(*p4dp, p4d);
-       p4d_clear(p4dp);
-       p4d = READ_ONCE(*p4dp);
+       WRITE_ONCE(*args->p4dp, p4d);
+       p4d_clear(args->p4dp);
+       p4d = READ_ONCE(*args->p4dp);
        WARN_ON(!p4d_none(p4d));
 }
 
-static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
-                                     pud_t *pudp)
+static void __init p4d_populate_tests(struct pgtable_debug_args *args)
 {
        p4d_t p4d;
 
-       if (mm_pud_folded(mm))
+       if (mm_pud_folded(args->mm))
                return;
 
        pr_debug("Validating P4D populate\n");
@@ -533,34 +582,33 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
         * This entry points to next level page table page.
         * Hence this must not qualify as p4d_bad().
         */
-       pud_clear(pudp);
-       p4d_clear(p4dp);
-       p4d_populate(mm, p4dp, pudp);
-       p4d = READ_ONCE(*p4dp);
+       pud_clear(args->pudp);
+       p4d_clear(args->p4dp);
+       p4d_populate(args->mm, args->p4dp, args->start_pudp);
+       p4d = READ_ONCE(*args->p4dp);
        WARN_ON(p4d_bad(p4d));
 }
 
-static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
+static void __init pgd_clear_tests(struct pgtable_debug_args *args)
 {
-       pgd_t pgd = READ_ONCE(*pgdp);
+       pgd_t pgd = READ_ONCE(*(args->pgdp));
 
-       if (mm_p4d_folded(mm))
+       if (mm_p4d_folded(args->mm))
                return;
 
        pr_debug("Validating PGD clear\n");
        pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
-       WRITE_ONCE(*pgdp, pgd);
-       pgd_clear(pgdp);
-       pgd = READ_ONCE(*pgdp);
+       WRITE_ONCE(*args->pgdp, pgd);
+       pgd_clear(args->pgdp);
+       pgd = READ_ONCE(*args->pgdp);
        WARN_ON(!pgd_none(pgd));
 }
 
-static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
-                                     p4d_t *p4dp)
+static void __init pgd_populate_tests(struct pgtable_debug_args *args)
 {
        pgd_t pgd;
 
-       if (mm_p4d_folded(mm))
+       if (mm_p4d_folded(args->mm))
                return;
 
        pr_debug("Validating PGD populate\n");
@@ -568,56 +616,60 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
         * This entry points to next level page table page.
         * Hence this must not qualify as pgd_bad().
         */
-       p4d_clear(p4dp);
-       pgd_clear(pgdp);
-       pgd_populate(mm, pgdp, p4dp);
-       pgd = READ_ONCE(*pgdp);
+       p4d_clear(args->p4dp);
+       pgd_clear(args->pgdp);
+       pgd_populate(args->mm, args->pgdp, args->start_p4dp);
+       pgd = READ_ONCE(*args->pgdp);
        WARN_ON(pgd_bad(pgd));
 }
 #else  /* !__PAGETABLE_P4D_FOLDED */
-static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { }
-static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { }
-static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
-                                     pud_t *pudp)
-{
-}
-static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
-                                     p4d_t *p4dp)
-{
-}
+static void __init p4d_clear_tests(struct pgtable_debug_args *args) { }
+static void __init pgd_clear_tests(struct pgtable_debug_args *args) { }
+static void __init p4d_populate_tests(struct pgtable_debug_args *args) { }
+static void __init pgd_populate_tests(struct pgtable_debug_args *args) { }
 #endif /* PAGETABLE_P4D_FOLDED */
 
-static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
-                                  unsigned long pfn, unsigned long vaddr,
-                                  pgprot_t prot)
+static void __init pte_clear_tests(struct pgtable_debug_args *args)
 {
-       pte_t pte = pfn_pte(pfn, prot);
+       struct page *page;
+       pte_t pte = pfn_pte(args->pte_pfn, args->page_prot);
+
+       page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL;
+       if (!page)
+               return;
 
+       /*
+        * flush_dcache_page() is called after set_pte_at() to clear
+        * PG_arch_1 for the page on ARM64. Page release does not clear
+        * that flag, so leaving it set would trip the page allocation
+        * check when the page is allocated again. For architectures other
+        * than ARM64, the extra cache flushing overhead is acceptable.
+        */
        pr_debug("Validating PTE clear\n");
 #ifndef CONFIG_RISCV
        pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
 #endif
-       set_pte_at(mm, vaddr, ptep, pte);
+       set_pte_at(args->mm, args->vaddr, args->ptep, pte);
+       flush_dcache_page(page);
        barrier();
-       pte_clear(mm, vaddr, ptep);
-       pte = ptep_get(ptep);
+       pte_clear(args->mm, args->vaddr, args->ptep);
+       pte = ptep_get(args->ptep);
        WARN_ON(!pte_none(pte));
 }
 
-static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
+static void __init pmd_clear_tests(struct pgtable_debug_args *args)
 {
-       pmd_t pmd = READ_ONCE(*pmdp);
+       pmd_t pmd = READ_ONCE(*args->pmdp);
 
        pr_debug("Validating PMD clear\n");
        pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
-       WRITE_ONCE(*pmdp, pmd);
-       pmd_clear(pmdp);
-       pmd = READ_ONCE(*pmdp);
+       WRITE_ONCE(*args->pmdp, pmd);
+       pmd_clear(args->pmdp);
+       pmd = READ_ONCE(*args->pmdp);
        WARN_ON(!pmd_none(pmd));
 }
 
-static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
-                                     pgtable_t pgtable)
+static void __init pmd_populate_tests(struct pgtable_debug_args *args)
 {
        pmd_t pmd;
 
@@ -626,14 +678,14 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
         * This entry points to next level page table page.
         * Hence this must not qualify as pmd_bad().
         */
-       pmd_populate(mm, pmdp, pgtable);
-       pmd = READ_ONCE(*pmdp);
+       pmd_populate(args->mm, args->pmdp, args->start_ptep);
+       pmd = READ_ONCE(*args->pmdp);
        WARN_ON(pmd_bad(pmd));
 }
 
-static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_special_tests(struct pgtable_debug_args *args)
 {
-       pte_t pte = pfn_pte(pfn, prot);
+       pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
 
        if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
                return;
@@ -642,9 +694,9 @@ static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
        WARN_ON(!pte_special(pte_mkspecial(pte)));
 }
 
-static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_protnone_tests(struct pgtable_debug_args *args)
 {
-       pte_t pte = pfn_pte(pfn, prot);
+       pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none);
 
        if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
                return;
@@ -655,7 +707,7 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_protnone_tests(struct pgtable_debug_args *args)
 {
        pmd_t pmd;
 
@@ -666,25 +718,25 @@ static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PMD protnone\n");
-       pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+       pmd = pmd_mkhuge(pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none));
        WARN_ON(!pmd_protnone(pmd));
        WARN_ON(!pmd_present(pmd));
 }
 #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_devmap_tests(struct pgtable_debug_args *args)
 {
-       pte_t pte = pfn_pte(pfn, prot);
+       pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
 
        pr_debug("Validating PTE devmap\n");
        WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args)
 {
        pmd_t pmd;
 
@@ -692,12 +744,12 @@ static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PMD devmap\n");
-       pmd = pfn_pmd(pfn, prot);
+       pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
        WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_devmap_tests(struct pgtable_debug_args *args)
 {
        pud_t pud;
 
@@ -705,25 +757,25 @@ static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PUD devmap\n");
-       pud = pfn_pud(pfn, prot);
+       pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
        WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
 }
 #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 #else  /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #else
-static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pte_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
+static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
 
-static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
 {
-       pte_t pte = pfn_pte(pfn, prot);
+       pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
 
        if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
                return;
@@ -733,9 +785,9 @@ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
        WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
 }
 
-static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
 {
-       pte_t pte = pfn_pte(pfn, prot);
+       pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
 
        if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
                return;
@@ -746,7 +798,7 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args)
 {
        pmd_t pmd;
 
@@ -757,12 +809,12 @@ static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PMD soft dirty\n");
-       pmd = pfn_pmd(pfn, prot);
+       pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
        WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
        WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
 }
 
-static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
 {
        pmd_t pmd;
 
@@ -774,31 +826,29 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PMD swap soft dirty\n");
-       pmd = pfn_pmd(pfn, prot);
+       pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
        WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
        WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
 }
 #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
-{
-}
+static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_swap_tests(struct pgtable_debug_args *args)
 {
        swp_entry_t swp;
        pte_t pte;
 
        pr_debug("Validating PTE swap\n");
-       pte = pfn_pte(pfn, prot);
+       pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
        swp = __pte_to_swp_entry(pte);
        pte = __swp_entry_to_pte(swp);
-       WARN_ON(pfn != pte_pfn(pte));
+       WARN_ON(args->fixed_pte_pfn != pte_pfn(pte));
 }
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_swap_tests(struct pgtable_debug_args *args)
 {
        swp_entry_t swp;
        pmd_t pmd;
@@ -807,16 +857,16 @@ static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PMD swap\n");
-       pmd = pfn_pmd(pfn, prot);
+       pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
        swp = __pmd_to_swp_entry(pmd);
        pmd = __swp_entry_to_pmd(swp);
-       WARN_ON(pfn != pmd_pfn(pmd));
+       WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd));
 }
 #else  /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_swap_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 
-static void __init swap_migration_tests(void)
+static void __init swap_migration_tests(struct pgtable_debug_args *args)
 {
        struct page *page;
        swp_entry_t swp;
@@ -824,19 +874,18 @@ static void __init swap_migration_tests(void)
        if (!IS_ENABLED(CONFIG_MIGRATION))
                return;
 
-       pr_debug("Validating swap migration\n");
        /*
         * swap_migration_tests() requires a dedicated page as it needs to
         * be locked before creating a migration entry from it. Locking the
         * page that actually maps kernel text ('start_kernel') can be real
-        * problematic. Lets allocate a dedicated page explicitly for this
-        * purpose that will be freed subsequently.
+        * problematic. Let's use the allocated page explicitly for this
+        * purpose.
         */
-       page = alloc_page(GFP_KERNEL);
-       if (!page) {
-               pr_err("page allocation failed\n");
+       page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL;
+       if (!page)
                return;
-       }
+
+       pr_debug("Validating swap migration\n");
 
        /*
         * make_migration_entry() expects given page to be
@@ -855,11 +904,10 @@ static void __init swap_migration_tests(void)
        WARN_ON(!is_migration_entry(swp));
        WARN_ON(is_writable_migration_entry(swp));
        __ClearPageLocked(page);
-       __free_page(page);
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
-static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
 {
        struct page *page;
        pte_t pte;
@@ -869,25 +917,25 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
         * Accessing the page associated with the pfn is safe here,
         * as it was previously derived from a real kernel symbol.
         */
-       page = pfn_to_page(pfn);
-       pte = mk_huge_pte(page, prot);
+       page = pfn_to_page(args->fixed_pmd_pfn);
+       pte = mk_huge_pte(page, args->page_prot);
 
        WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
        WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
        WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-       pte = pfn_pte(pfn, prot);
+       pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
 
        WARN_ON(!pte_huge(pte_mkhuge(pte)));
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 }
 #else  /* !CONFIG_HUGETLB_PAGE */
-static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_HUGETLB_PAGE */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_thp_tests(struct pgtable_debug_args *args)
 {
        pmd_t pmd;
 
@@ -906,7 +954,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
         * needs to return true. pmd_present() should be true whenever
         * pmd_trans_huge() returns true.
         */
-       pmd = pfn_pmd(pfn, prot);
+       pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
        WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd)));
 
 #ifndef __HAVE_ARCH_PMDP_INVALIDATE
@@ -916,7 +964,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_thp_tests(struct pgtable_debug_args *args)
 {
        pud_t pud;
 
@@ -924,7 +972,7 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
                return;
 
        pr_debug("Validating PUD based THP\n");
-       pud = pfn_pud(pfn, prot);
+       pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
        WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
 
        /*
@@ -936,11 +984,11 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
         */
 }
 #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_thp_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_thp_tests(struct pgtable_debug_args *args) { }
+static void __init pud_thp_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static unsigned long __init get_random_vaddr(void)
@@ -955,43 +1003,179 @@ static unsigned long __init get_random_vaddr(void)
        return random_vaddr;
 }
 
-static int __init debug_vm_pgtable(void)
+static void __init destroy_args(struct pgtable_debug_args *args)
 {
-       struct vm_area_struct *vma;
-       struct mm_struct *mm;
-       pgd_t *pgdp;
-       p4d_t *p4dp, *saved_p4dp;
-       pud_t *pudp, *saved_pudp;
-       pmd_t *pmdp, *saved_pmdp, pmd;
-       pte_t *ptep;
-       pgtable_t saved_ptep;
-       pgprot_t prot, protnone;
-       phys_addr_t paddr;
-       unsigned long vaddr, pte_aligned, pmd_aligned;
-       unsigned long pud_aligned, p4d_aligned, pgd_aligned;
-       spinlock_t *ptl = NULL;
-       int idx;
+       struct page *page = NULL;
+
+       /* Free (huge) page */
+       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+           IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+           has_transparent_hugepage() &&
+           args->pud_pfn != ULONG_MAX) {
+               if (args->is_contiguous_page) {
+                       free_contig_range(args->pud_pfn,
+                                         (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT)));
+               } else {
+                       page = pfn_to_page(args->pud_pfn);
+                       __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT);
+               }
+
+               args->pud_pfn = ULONG_MAX;
+               args->pmd_pfn = ULONG_MAX;
+               args->pte_pfn = ULONG_MAX;
+       }
 
-       pr_info("Validating architecture page table helpers\n");
-       prot = vm_get_page_prot(VMFLAGS);
-       vaddr = get_random_vaddr();
-       mm = mm_alloc();
-       if (!mm) {
-               pr_err("mm_struct allocation failed\n");
-               return 1;
+       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+           has_transparent_hugepage() &&
+           args->pmd_pfn != ULONG_MAX) {
+               if (args->is_contiguous_page) {
+                       free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER));
+               } else {
+                       page = pfn_to_page(args->pmd_pfn);
+                       __free_pages(page, HPAGE_PMD_ORDER);
+               }
+
+               args->pmd_pfn = ULONG_MAX;
+               args->pte_pfn = ULONG_MAX;
        }
 
+       if (args->pte_pfn != ULONG_MAX) {
+               page = pfn_to_page(args->pte_pfn);
+               __free_pages(page, 0);
+
+               args->pte_pfn = ULONG_MAX;
+       }
+
+       /* Free page table entries */
+       if (args->start_ptep) {
+               pte_free(args->mm, args->start_ptep);
+               mm_dec_nr_ptes(args->mm);
+       }
+
+       if (args->start_pmdp) {
+               pmd_free(args->mm, args->start_pmdp);
+               mm_dec_nr_pmds(args->mm);
+       }
+
+       if (args->start_pudp) {
+               pud_free(args->mm, args->start_pudp);
+               mm_dec_nr_puds(args->mm);
+       }
+
+       if (args->start_p4dp)
+               p4d_free(args->mm, args->start_p4dp);
+
+       /* Free vma and mm struct */
+       if (args->vma)
+               vm_area_free(args->vma);
+
+       if (args->mm)
+               mmdrop(args->mm);
+}
+
+static struct page * __init
+debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
+{
+       struct page *page = NULL;
+
+#ifdef CONFIG_CONTIG_ALLOC
+       if (order >= MAX_ORDER) {
+               page = alloc_contig_pages((1 << order), GFP_KERNEL,
+                                         first_online_node, NULL);
+               if (page) {
+                       args->is_contiguous_page = true;
+                       return page;
+               }
+       }
+#endif
+
+       if (order < MAX_ORDER)
+               page = alloc_pages(GFP_KERNEL, order);
+
+       return page;
+}
+
+static int __init init_args(struct pgtable_debug_args *args)
+{
+       struct page *page = NULL;
+       phys_addr_t phys;
+       int ret = 0;
+
        /*
+        * Initialize the debugging data.
+        *
         * __P000 (or even __S000) will help create page table entries with
         * PROT_NONE permission as required for pxx_protnone_tests().
         */
-       protnone = __P000;
+       memset(args, 0, sizeof(*args));
+       args->vaddr              = get_random_vaddr();
+       args->page_prot          = vm_get_page_prot(VMFLAGS);
+       args->page_prot_none     = __P000;
+       args->is_contiguous_page = false;
+       args->pud_pfn            = ULONG_MAX;
+       args->pmd_pfn            = ULONG_MAX;
+       args->pte_pfn            = ULONG_MAX;
+       args->fixed_pgd_pfn      = ULONG_MAX;
+       args->fixed_p4d_pfn      = ULONG_MAX;
+       args->fixed_pud_pfn      = ULONG_MAX;
+       args->fixed_pmd_pfn      = ULONG_MAX;
+       args->fixed_pte_pfn      = ULONG_MAX;
+
+       /* Allocate mm and vma */
+       args->mm = mm_alloc();
+       if (!args->mm) {
+               pr_err("Failed to allocate mm struct\n");
+               ret = -ENOMEM;
+               goto error;
+       }
+
+       args->vma = vm_area_alloc(args->mm);
+       if (!args->vma) {
+               pr_err("Failed to allocate vma\n");
+               ret = -ENOMEM;
+               goto error;
+       }
+
+       /*
+        * Allocate page table entries. They will be modified in the tests.
+        * Let's save the page table entries so that they can be released
+        * when the tests are completed.
+        */
+       args->pgdp = pgd_offset(args->mm, args->vaddr);
+       args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr);
+       if (!args->p4dp) {
+               pr_err("Failed to allocate p4d entries\n");
+               ret = -ENOMEM;
+               goto error;
+       }
+       args->start_p4dp = p4d_offset(args->pgdp, 0UL);
+       WARN_ON(!args->start_p4dp);
+
+       args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr);
+       if (!args->pudp) {
+               pr_err("Failed to allocate pud entries\n");
+               ret = -ENOMEM;
+               goto error;
+       }
+       args->start_pudp = pud_offset(args->p4dp, 0UL);
+       WARN_ON(!args->start_pudp);
+
+       args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr);
+       if (!args->pmdp) {
+               pr_err("Failed to allocate pmd entries\n");
+               ret = -ENOMEM;
+               goto error;
+       }
+       args->start_pmdp = pmd_offset(args->pudp, 0UL);
+       WARN_ON(!args->start_pmdp);
 
-       vma = vm_area_alloc(mm);
-       if (!vma) {
-               pr_err("vma allocation failed\n");
-               return 1;
+       if (pte_alloc(args->mm, args->pmdp)) {
+               pr_err("Failed to allocate pte entries\n");
+               ret = -ENOMEM;
+               goto error;
        }
+       args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp));
+       WARN_ON(!args->start_ptep);
 
        /*
         * PFN for mapping at PTE level is determined from a standard kernel
@@ -1000,40 +1184,65 @@ static int __init debug_vm_pgtable(void)
         * exist on the platform but that does not really matter as pfn_pxx()
         * helpers will still create appropriate entries for the test. This
         * helps avoid large memory block allocations to be used for mapping
-        * at higher page table levels.
+        * at higher page table levels in some of the tests.
         */
-       paddr = __pa_symbol(&start_kernel);
-
-       pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT;
-       pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT;
-       pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT;
-       p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT;
-       pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT;
-       WARN_ON(!pfn_valid(pte_aligned));
-
-       pgdp = pgd_offset(mm, vaddr);
-       p4dp = p4d_alloc(mm, pgdp, vaddr);
-       pudp = pud_alloc(mm, p4dp, vaddr);
-       pmdp = pmd_alloc(mm, pudp, vaddr);
+       phys = __pa_symbol(&start_kernel);
+       args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+       args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+       args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+       args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+       args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
+       WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+
        /*
-        * Allocate pgtable_t
+        * Allocate (huge) pages because some of the tests need to access
+        * the data in the pages. The corresponding tests will be skipped
+        * if we fail to allocate (huge) pages.
         */
-       if (pte_alloc(mm, pmdp)) {
-               pr_err("pgtable allocation failed\n");
-               return 1;
+       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+           IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+           has_transparent_hugepage()) {
+               page = debug_vm_pgtable_alloc_huge_page(args,
+                               HPAGE_PUD_SHIFT - PAGE_SHIFT);
+               if (page) {
+                       args->pud_pfn = page_to_pfn(page);
+                       args->pmd_pfn = args->pud_pfn;
+                       args->pte_pfn = args->pud_pfn;
+                       return 0;
+               }
        }
 
-       /*
-        * Save all the page table page addresses as the page table
-        * entries will be used for testing with random or garbage
-        * values. These saved addresses will be used for freeing
-        * page table pages.
-        */
-       pmd = READ_ONCE(*pmdp);
-       saved_p4dp = p4d_offset(pgdp, 0UL);
-       saved_pudp = pud_offset(p4dp, 0UL);
-       saved_pmdp = pmd_offset(pudp, 0UL);
-       saved_ptep = pmd_pgtable(pmd);
+       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+           has_transparent_hugepage()) {
+               page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PMD_ORDER);
+               if (page) {
+                       args->pmd_pfn = page_to_pfn(page);
+                       args->pte_pfn = args->pmd_pfn;
+                       return 0;
+               }
+       }
+
+       page = alloc_pages(GFP_KERNEL, 0);
+       if (page)
+               args->pte_pfn = page_to_pfn(page);
+
+       return 0;
+
+error:
+       destroy_args(args);
+       return ret;
+}
+
+static int __init debug_vm_pgtable(void)
+{
+       struct pgtable_debug_args args;
+       spinlock_t *ptl = NULL;
+       int idx, ret;
+
+       pr_info("Validating architecture page table helpers\n");
+       ret = init_args(&args);
+       if (ret)
+               return ret;
 
        /*
         * Iterate over the protection_map[] to make sure that all
@@ -1042,9 +1251,9 @@ static int __init debug_vm_pgtable(void)
         * given page table entry.
         */
        for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) {
-               pte_basic_tests(pte_aligned, idx);
-               pmd_basic_tests(pmd_aligned, idx);
-               pud_basic_tests(mm, pud_aligned, idx);
+               pte_basic_tests(&args, idx);
+               pmd_basic_tests(&args, idx);
+               pud_basic_tests(&args, idx);
        }
 
        /*
@@ -1054,79 +1263,70 @@ static int __init debug_vm_pgtable(void)
         * the above iteration for now to save some test execution
         * time.
         */
-       p4d_basic_tests(p4d_aligned, prot);
-       pgd_basic_tests(pgd_aligned, prot);
+       p4d_basic_tests(&args);
+       pgd_basic_tests(&args);
 
-       pmd_leaf_tests(pmd_aligned, prot);
-       pud_leaf_tests(pud_aligned, prot);
+       pmd_leaf_tests(&args);
+       pud_leaf_tests(&args);
 
-       pte_savedwrite_tests(pte_aligned, protnone);
-       pmd_savedwrite_tests(pmd_aligned, protnone);
+       pte_savedwrite_tests(&args);
+       pmd_savedwrite_tests(&args);
 
-       pte_special_tests(pte_aligned, prot);
-       pte_protnone_tests(pte_aligned, protnone);
-       pmd_protnone_tests(pmd_aligned, protnone);
+       pte_special_tests(&args);
+       pte_protnone_tests(&args);
+       pmd_protnone_tests(&args);
 
-       pte_devmap_tests(pte_aligned, prot);
-       pmd_devmap_tests(pmd_aligned, prot);
-       pud_devmap_tests(pud_aligned, prot);
+       pte_devmap_tests(&args);
+       pmd_devmap_tests(&args);
+       pud_devmap_tests(&args);
 
-       pte_soft_dirty_tests(pte_aligned, prot);
-       pmd_soft_dirty_tests(pmd_aligned, prot);
-       pte_swap_soft_dirty_tests(pte_aligned, prot);
-       pmd_swap_soft_dirty_tests(pmd_aligned, prot);
+       pte_soft_dirty_tests(&args);
+       pmd_soft_dirty_tests(&args);
+       pte_swap_soft_dirty_tests(&args);
+       pmd_swap_soft_dirty_tests(&args);
 
-       pte_swap_tests(pte_aligned, prot);
-       pmd_swap_tests(pmd_aligned, prot);
+       pte_swap_tests(&args);
+       pmd_swap_tests(&args);
 
-       swap_migration_tests();
+       swap_migration_tests(&args);
 
-       pmd_thp_tests(pmd_aligned, prot);
-       pud_thp_tests(pud_aligned, prot);
+       pmd_thp_tests(&args);
+       pud_thp_tests(&args);
 
-       hugetlb_basic_tests(pte_aligned, prot);
+       hugetlb_basic_tests(&args);
 
        /*
         * Page table modifying tests. They need to hold
         * proper page table lock.
         */
 
-       ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl);
-       pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
-       pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-       pte_unmap_unlock(ptep, ptl);
+       args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl);
+       pte_clear_tests(&args);
+       pte_advanced_tests(&args);
+       pte_unmap_unlock(args.ptep, ptl);
 
-       ptl = pmd_lock(mm, pmdp);
-       pmd_clear_tests(mm, pmdp);
-       pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
-       pmd_huge_tests(pmdp, pmd_aligned, prot);
-       pmd_populate_tests(mm, pmdp, saved_ptep);
+       ptl = pmd_lock(args.mm, args.pmdp);
+       pmd_clear_tests(&args);
+       pmd_advanced_tests(&args);
+       pmd_huge_tests(&args);
+       pmd_populate_tests(&args);
        spin_unlock(ptl);
 
-       ptl = pud_lock(mm, pudp);
-       pud_clear_tests(mm, pudp);
-       pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
-       pud_huge_tests(pudp, pud_aligned, prot);
-       pud_populate_tests(mm, pudp, saved_pmdp);
+       ptl = pud_lock(args.mm, args.pudp);
+       pud_clear_tests(&args);
+       pud_advanced_tests(&args);
+       pud_huge_tests(&args);
+       pud_populate_tests(&args);
        spin_unlock(ptl);
 
-       spin_lock(&mm->page_table_lock);
-       p4d_clear_tests(mm, p4dp);
-       pgd_clear_tests(mm, pgdp);
-       p4d_populate_tests(mm, p4dp, saved_pudp);
-       pgd_populate_tests(mm, pgdp, saved_p4dp);
-       spin_unlock(&mm->page_table_lock);
-
-       p4d_free(mm, saved_p4dp);
-       pud_free(mm, saved_pudp);
-       pmd_free(mm, saved_pmdp);
-       pte_free(mm, saved_ptep);
-
-       vm_area_free(vma);
-       mm_dec_nr_puds(mm);
-       mm_dec_nr_pmds(mm);
-       mm_dec_nr_ptes(mm);
-       mmdrop(mm);
+       spin_lock(&(args.mm->page_table_lock));
+       p4d_clear_tests(&args);
+       pgd_clear_tests(&args);
+       p4d_populate_tests(&args);
+       pgd_populate_tests(&args);
+       spin_unlock(&(args.mm->page_table_lock));
+
+       destroy_args(&args);
        return 0;
 }
 late_initcall(debug_vm_pgtable);
index 920e8dc..dae4812 100644 (file)
@@ -260,12 +260,11 @@ static void page_cache_free_page(struct address_space *mapping,
 void delete_from_page_cache(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
-       unsigned long flags;
 
        BUG_ON(!PageLocked(page));
-       xa_lock_irqsave(&mapping->i_pages, flags);
+       xa_lock_irq(&mapping->i_pages);
        __delete_from_page_cache(page, NULL);
-       xa_unlock_irqrestore(&mapping->i_pages, flags);
+       xa_unlock_irq(&mapping->i_pages);
 
        page_cache_free_page(mapping, page);
 }
@@ -337,19 +336,18 @@ void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct pagevec *pvec)
 {
        int i;
-       unsigned long flags;
 
        if (!pagevec_count(pvec))
                return;
 
-       xa_lock_irqsave(&mapping->i_pages, flags);
+       xa_lock_irq(&mapping->i_pages);
        for (i = 0; i < pagevec_count(pvec); i++) {
                trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
 
                unaccount_page_cache_page(mapping, pvec->pages[i]);
        }
        page_cache_delete_batch(mapping, pvec);
-       xa_unlock_irqrestore(&mapping->i_pages, flags);
+       xa_unlock_irq(&mapping->i_pages);
 
        for (i = 0; i < pagevec_count(pvec); i++)
                page_cache_free_page(mapping, pvec->pages[i]);
@@ -841,7 +839,6 @@ void replace_page_cache_page(struct page *old, struct page *new)
        void (*freepage)(struct page *) = mapping->a_ops->freepage;
        pgoff_t offset = old->index;
        XA_STATE(xas, &mapping->i_pages, offset);
-       unsigned long flags;
 
        VM_BUG_ON_PAGE(!PageLocked(old), old);
        VM_BUG_ON_PAGE(!PageLocked(new), new);
@@ -853,7 +850,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
 
        mem_cgroup_migrate(old, new);
 
-       xas_lock_irqsave(&xas, flags);
+       xas_lock_irq(&xas);
        xas_store(&xas, new);
 
        old->mapping = NULL;
@@ -866,7 +863,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
                __dec_lruvec_page_state(old, NR_SHMEM);
        if (PageSwapBacked(new))
                __inc_lruvec_page_state(new, NR_SHMEM);
-       xas_unlock_irqrestore(&xas, flags);
+       xas_unlock_irq(&xas);
        if (freepage)
                freepage(old);
        put_page(old);
index b947179..9935a44 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -62,11 +62,24 @@ static void put_page_refs(struct page *page, int refs)
        put_page(page);
 }
 
-/*
- * Return the compound head page with ref appropriately incremented,
- * or NULL if that failed.
+/**
+ * try_get_compound_head() - return the compound head page with refcount
+ * appropriately incremented, or NULL if that failed.
+ *
+ * This handles potential refcount overflow correctly. It also works correctly
+ * for various lockless get_user_pages()-related callers, due to the use of
+ * page_cache_add_speculative().
+ *
+ * Even though the name includes "compound_head", this function is still
+ * appropriate for callers that have a non-compound @page to get.
+ *
+ * @page:  pointer to page to be gotten
+ * @refs:  the value to add to the page's refcount
+ *
+ * Return: head page (with refcount appropriately incremented) for success, or
+ * NULL upon failure.
  */
-static inline struct page *try_get_compound_head(struct page *page, int refs)
+struct page *try_get_compound_head(struct page *page, int refs)
 {
        struct page *head = compound_head(page);
 
@@ -92,10 +105,17 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
        return head;
 }
 
-/*
+/**
  * try_grab_compound_head() - attempt to elevate a page's refcount, by a
  * flags-dependent amount.
  *
+ * Even though the name includes "compound_head", this function is still
+ * appropriate for callers that have a non-compound @page to get.
+ *
+ * @page:  pointer to page to be grabbed
+ * @refs:  the value to (effectively) add to the page's refcount
+ * @flags: gup flags: these are the FOLL_* flag values.
+ *
  * "grab" names in this file mean, "look at flags to decide whether to use
  * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
  *
@@ -103,22 +123,26 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
  * same time. (That's true throughout the get_user_pages*() and
  * pin_user_pages*() APIs.) Cases:
  *
- *    FOLL_GET: page's refcount will be incremented by 1.
- *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ *    FOLL_GET: page's refcount will be incremented by @refs.
+ *
+ *    FOLL_PIN on compound pages that are > two pages long: page's refcount will
+ *    be incremented by @refs, and page[2].hpage_pinned_refcount will be
+ *    incremented by @refs * GUP_PIN_COUNTING_BIAS.
+ *
+ *    FOLL_PIN on normal pages, or compound pages that are two pages long:
+ *    page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS.
  *
  * Return: head page (with refcount appropriately incremented) for success, or
  * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
  * considered failure, and furthermore, a likely bug in the caller, so a warning
  * is also emitted.
  */
-__maybe_unused struct page *try_grab_compound_head(struct page *page,
-                                                  int refs, unsigned int flags)
+struct page *try_grab_compound_head(struct page *page,
+                                   int refs, unsigned int flags)
 {
        if (flags & FOLL_GET)
                return try_get_compound_head(page, refs);
        else if (flags & FOLL_PIN) {
-               int orig_refs = refs;
-
                /*
                 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
                 * right zone, so fail and let the caller fall back to the slow
@@ -143,6 +167,8 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
                 *
                 * However, be sure to *also* increment the normal page refcount
                 * field at least once, so that the page really is pinned.
+                * That's why the refcount from the earlier
+                * try_get_compound_head() is left intact.
                 */
                if (hpage_pincount_available(page))
                        hpage_pincount_add(page, refs);
@@ -150,7 +176,7 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page,
                        page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
 
                mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
-                                   orig_refs);
+                                   refs);
 
                return page;
        }
@@ -186,10 +212,8 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
  * @flags:   gup flags: these are the FOLL_* flag values.
  *
  * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
- * time. Cases:
- *
- *    FOLL_GET: page's refcount will be incremented by 1.
- *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
+ * time. Cases: please see the try_grab_compound_head() documentation, with
+ * "refs=1".
  *
  * Return: true for success, or if no action was required (if neither FOLL_PIN
  * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
@@ -197,35 +221,10 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
  */
 bool __must_check try_grab_page(struct page *page, unsigned int flags)
 {
-       WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
+       if (!(flags & (FOLL_GET | FOLL_PIN)))
+               return true;
 
-       if (flags & FOLL_GET)
-               return try_get_page(page);
-       else if (flags & FOLL_PIN) {
-               int refs = 1;
-
-               page = compound_head(page);
-
-               if (WARN_ON_ONCE(page_ref_count(page) <= 0))
-                       return false;
-
-               if (hpage_pincount_available(page))
-                       hpage_pincount_add(page, 1);
-               else
-                       refs = GUP_PIN_COUNTING_BIAS;
-
-               /*
-                * Similar to try_grab_compound_head(): even if using the
-                * hpage_pincount_add/_sub() routines, be sure to
-                * *also* increment the normal page refcount field at least
-                * once, so that the page really is pinned.
-                */
-               page_ref_add(page, refs);
-
-               mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
-       }
-
-       return true;
+       return try_grab_compound_head(page, 1, flags);
 }
 
 /**
@@ -1151,7 +1150,6 @@ static long __get_user_pages(struct mm_struct *mm,
                                         * We must stop here.
                                         */
                                        BUG_ON(gup_flags & FOLL_NOWAIT);
-                                       BUG_ON(ret != 0);
                                        goto out;
                                }
                                continue;
@@ -1276,7 +1274,7 @@ int fixup_user_fault(struct mm_struct *mm,
                     bool *unlocked)
 {
        struct vm_area_struct *vma;
-       vm_fault_t ret, major = 0;
+       vm_fault_t ret;
 
        address = untagged_addr(address);
 
@@ -1296,7 +1294,6 @@ retry:
                return -EINTR;
 
        ret = handle_mm_fault(vma, address, fault_flags, NULL);
-       major |= ret & VM_FAULT_MAJOR;
        if (ret & VM_FAULT_ERROR) {
                int err = vm_fault_to_errno(ret, 0);
 
@@ -1475,8 +1472,8 @@ long populate_vma_page_range(struct vm_area_struct *vma,
        unsigned long nr_pages = (end - start) / PAGE_SIZE;
        int gup_flags;
 
-       VM_BUG_ON(start & ~PAGE_MASK);
-       VM_BUG_ON(end   & ~PAGE_MASK);
+       VM_BUG_ON(!PAGE_ALIGNED(start));
+       VM_BUG_ON(!PAGE_ALIGNED(end));
        VM_BUG_ON_VMA(start < vma->vm_start, vma);
        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
        mmap_assert_locked(mm);
@@ -1775,7 +1772,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
        if (!list_empty(&movable_page_list)) {
                ret = migrate_pages(&movable_page_list, alloc_migration_target,
                                    NULL, (unsigned long)&mtc, MIGRATE_SYNC,
-                                   MR_LONGTERM_PIN);
+                                   MR_LONGTERM_PIN, NULL);
                if (ret && !list_empty(&movable_page_list))
                        putback_movable_pages(&movable_page_list);
        }
@@ -2244,6 +2241,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 {
        int nr_start = *nr;
        struct dev_pagemap *pgmap = NULL;
+       int ret = 1;
 
        do {
                struct page *page = pfn_to_page(pfn);
@@ -2251,21 +2249,22 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
                pgmap = get_dev_pagemap(pfn, pgmap);
                if (unlikely(!pgmap)) {
                        undo_dev_pagemap(nr, nr_start, flags, pages);
-                       return 0;
+                       ret = 0;
+                       break;
                }
                SetPageReferenced(page);
                pages[*nr] = page;
                if (unlikely(!try_grab_page(page, flags))) {
                        undo_dev_pagemap(nr, nr_start, flags, pages);
-                       return 0;
+                       ret = 0;
+                       break;
                }
                (*nr)++;
                pfn++;
        } while (addr += PAGE_SIZE, addr != end);
 
-       if (pgmap)
-               put_dev_pagemap(pgmap);
-       return 1;
+       put_dev_pagemap(pgmap);
+       return ret;
 }
 
 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
index afff3ac..5e9ef0f 100644 (file)
@@ -1440,32 +1440,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
                goto out;
        }
 
-       /*
-        * Since we took the NUMA fault, we must have observed the !accessible
-        * bit. Make sure all other CPUs agree with that, to avoid them
-        * modifying the page we're about to migrate.
-        *
-        * Must be done under PTL such that we'll observe the relevant
-        * inc_tlb_flush_pending().
-        *
-        * We are not sure a pending tlb flush here is for a huge page
-        * mapping or not. Hence use the tlb range variant
-        */
-       if (mm_tlb_flush_pending(vma->vm_mm)) {
-               flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
-               /*
-                * change_huge_pmd() released the pmd lock before
-                * invalidating the secondary MMUs sharing the primary
-                * MMU pagetables (with ->invalidate_range()). The
-                * mmu_notifier_invalidate_range_end() (which
-                * internally calls ->invalidate_range()) in
-                * change_pmd_range() will run after us, so we can't
-                * rely on it here and we need an explicit invalidate.
-                */
-               mmu_notifier_invalidate_range(vma->vm_mm, haddr,
-                                             haddr + HPAGE_PMD_SIZE);
-       }
-
        pmd = pmd_modify(oldpmd, vma->vm_page_prot);
        page = vm_normal_page_pmd(vma, haddr, pmd);
        if (!page)
@@ -2454,11 +2428,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
        for (i = nr - 1; i >= 1; i--) {
                __split_huge_page_tail(head, i, lruvec, list);
-               /* Some pages can be beyond i_size: drop them from page cache */
+               /* Some pages can be beyond EOF: drop them from page cache */
                if (head[i].index >= end) {
                        ClearPageDirty(head + i);
                        __delete_from_page_cache(head + i, NULL);
-                       if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
+                       if (shmem_mapping(head->mapping))
                                shmem_uncharge(head->mapping->host, 1);
                        put_page(head + i);
                } else if (!PageAnon(page)) {
@@ -2686,6 +2660,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                 * head page lock is good enough to serialize the trimming.
                 */
                end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+               if (shmem_mapping(mapping))
+                       end = shmem_fallocend(mapping->host, end);
        }
 
        /*
index 8ea35ba..95dc7b8 100644 (file)
@@ -1072,6 +1072,8 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
        int nid = page_to_nid(page);
 
        lockdep_assert_held(&hugetlb_lock);
+       VM_BUG_ON_PAGE(page_count(page), page);
+
        list_move(&page->lru, &h->hugepage_freelists[nid]);
        h->free_huge_pages++;
        h->free_huge_pages_node[nid]++;
@@ -1143,7 +1145,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
                                unsigned long address, int avoid_reserve,
                                long chg)
 {
-       struct page *page;
+       struct page *page = NULL;
        struct mempolicy *mpol;
        gfp_t gfp_mask;
        nodemask_t *nodemask;
@@ -1164,7 +1166,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 
        gfp_mask = htlb_alloc_mask(h);
        nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-       page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
+       if (mpol_is_preferred_many(mpol)) {
+               page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
+               /* Fallback to all nodes if page==NULL */
+               nodemask = NULL;
+       }
+
+       if (!page)
+               page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+
        if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
                SetHPageRestoreReserve(page);
                h->resv_huge_pages--;
@@ -1368,8 +1380,28 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
                h->surplus_huge_pages_node[nid]--;
        }
 
+       /*
+        * Very subtle
+        *
+        * For non-gigantic pages set the destructor to the normal compound
+        * page dtor.  This is needed in case someone takes an additional
+        * temporary ref to the page, and freeing is delayed until they drop
+        * their reference.
+        *
+        * For gigantic pages set the destructor to the null dtor.  This
+        * destructor will never be called.  Before freeing the gigantic
+        * page destroy_compound_gigantic_page will turn the compound page
+        * into a simple group of pages.  After this the destructor does not
+        * apply.
+        *
+        * This handles the case where more than one ref is held when and
+        * after update_and_free_page is called.
+        */
        set_page_refcounted(page);
-       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+       if (hstate_is_gigantic(h))
+               set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+       else
+               set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
 
        h->nr_huge_pages--;
        h->nr_huge_pages_node[nid]--;
@@ -1399,11 +1431,20 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
        SetHPageVmemmapOptimized(page);
 
        /*
-        * This page is now managed by the hugetlb allocator and has
-        * no users -- drop the last reference.
+        * This page is about to be managed by the hugetlb allocator and
+        * should have no users.  Drop our reference, and check for others
+        * just in case.
         */
        zeroed = put_page_testzero(page);
-       VM_BUG_ON_PAGE(!zeroed, page);
+       if (!zeroed)
+               /*
+                * It is VERY unlikely someone else has taken a ref on
+                * the page.  In this case, we simply return as the
+                * hugetlb destructor (free_huge_page) will be called
+                * when this other ref is dropped.
+                */
+               return;
+
        arch_clear_hugepage_flags(page);
        enqueue_huge_page(h, page);
 }
@@ -1657,16 +1698,14 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
                 * cache adding could take a ref on a 'to be' tail page.
                 * We need to respect any increased ref count, and only set
                 * the ref count to zero if count is currently 1.  If count
-                * is not 1, we call synchronize_rcu in the hope that a rcu
-                * grace period will cause ref count to drop and then retry.
-                * If count is still inflated on retry we return an error and
-                * must discard the pages.
+                * is not 1, we return an error.  An error return indicates
+                * the set of pages can not be converted to a gigantic page.
+                * The caller who allocated the pages should then discard the
+                * pages using the appropriate free interface.
                 */
                if (!page_ref_freeze(p, 1)) {
-                       pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
-                       synchronize_rcu();
-                       if (!page_ref_freeze(p, 1))
-                               goto out_error;
+                       pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+                       goto out_error;
                }
                set_page_count(p, 0);
                set_compound_head(p, page);
@@ -1830,7 +1869,6 @@ retry:
                                retry = true;
                                goto retry;
                        }
-                       pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
                        return NULL;
                }
        }
@@ -2020,9 +2058,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
  * Allocates a fresh surplus page from the page allocator.
  */
 static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
-               int nid, nodemask_t *nmask)
+               int nid, nodemask_t *nmask, bool zero_ref)
 {
        struct page *page = NULL;
+       bool retry = false;
 
        if (hstate_is_gigantic(h))
                return NULL;
@@ -2032,6 +2071,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                goto out_unlock;
        spin_unlock_irq(&hugetlb_lock);
 
+retry:
        page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
        if (!page)
                return NULL;
@@ -2049,11 +2089,35 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                spin_unlock_irq(&hugetlb_lock);
                put_page(page);
                return NULL;
-       } else {
-               h->surplus_huge_pages++;
-               h->surplus_huge_pages_node[page_to_nid(page)]++;
        }
 
+       if (zero_ref) {
+               /*
+                * Caller requires a page with zero ref count.
+                * We will drop ref count here.  If someone else is holding
+                * a ref, the page will be freed when they drop it.  Abuse
+                * temporary page flag to accomplish this.
+                */
+               SetHPageTemporary(page);
+               if (!put_page_testzero(page)) {
+                       /*
+                        * Unexpected inflated ref count on freshly allocated
+                        * huge page.  Retry once.
+                        */
+                       pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
+                       spin_unlock_irq(&hugetlb_lock);
+                       if (retry)
+                               return NULL;
+
+                       retry = true;
+                       goto retry;
+               }
+               ClearHPageTemporary(page);
+       }
+
+       h->surplus_huge_pages++;
+       h->surplus_huge_pages_node[page_to_nid(page)]++;
+
 out_unlock:
        spin_unlock_irq(&hugetlb_lock);
 
@@ -2088,16 +2152,26 @@ static
 struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
 {
-       struct page *page;
+       struct page *page = NULL;
        struct mempolicy *mpol;
        gfp_t gfp_mask = htlb_alloc_mask(h);
        int nid;
        nodemask_t *nodemask;
 
        nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
-       page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
-       mpol_cond_put(mpol);
+       if (mpol_is_preferred_many(mpol)) {
+               gfp_t gfp = gfp_mask | __GFP_NOWARN;
 
+               gfp &=  ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+               page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+
+               /* Fallback to all nodes if page==NULL */
+               nodemask = NULL;
+       }
+
+       if (!page)
+               page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
+       mpol_cond_put(mpol);
        return page;
 }
 
@@ -2167,7 +2241,7 @@ retry:
        spin_unlock_irq(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
                page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
-                               NUMA_NO_NODE, NULL);
+                               NUMA_NO_NODE, NULL, true);
                if (!page) {
                        alloc_ok = false;
                        break;
@@ -2208,24 +2282,20 @@ retry:
 
        /* Free the needed pages to the hugetlb pool */
        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
-               int zeroed;
-
                if ((--needed) < 0)
                        break;
-               /*
-                * This page is now managed by the hugetlb allocator and has
-                * no users -- drop the buddy allocator's reference.
-                */
-               zeroed = put_page_testzero(page);
-               VM_BUG_ON_PAGE(!zeroed, page);
+               /* Add the page to the hugetlb allocator */
                enqueue_huge_page(h, page);
        }
 free:
        spin_unlock_irq(&hugetlb_lock);
 
-       /* Free unnecessary surplus pages to the buddy allocator */
+       /*
+        * Free unnecessary surplus pages to the buddy allocator.
+        * Pages have no ref count, call free_huge_page directly.
+        */
        list_for_each_entry_safe(page, tmp, &surplus_list, lru)
-               put_page(page);
+               free_huge_page(page);
        spin_lock_irq(&hugetlb_lock);
 
        return ret;
@@ -2534,6 +2604,7 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
 {
        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
        int nid = page_to_nid(old_page);
+       bool alloc_retry = false;
        struct page *new_page;
        int ret = 0;
 
@@ -2544,9 +2615,30 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
         * the pool.  This simplifies and let us do most of the processing
         * under the lock.
         */
+alloc_retry:
        new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
        if (!new_page)
                return -ENOMEM;
+       /*
+        * If all goes well, this page will be directly added to the free
+        * list in the pool.  For this the ref count needs to be zero.
+        * Attempt to drop now, and retry once if needed.  It is VERY
+        * unlikely there is another ref on the page.
+        *
+        * If someone else has a reference to the page, it will be freed
+        * when they drop their ref.  Abuse temporary page flag to accomplish
+        * this.  Retry once if there is an inflated ref count.
+        */
+       SetHPageTemporary(new_page);
+       if (!put_page_testzero(new_page)) {
+               if (alloc_retry)
+                       return -EBUSY;
+
+               alloc_retry = true;
+               goto alloc_retry;
+       }
+       ClearHPageTemporary(new_page);
+
        __prep_new_huge_page(h, new_page);
 
 retry:
@@ -2586,11 +2678,10 @@ retry:
                remove_hugetlb_page(h, old_page, false);
 
                /*
-                * Reference count trick is needed because allocator gives us
-                * referenced page but the pool requires pages with 0 refcount.
+                * Ref count on new page is already zero as it was dropped
+                * earlier.  It can be directly added to the pool free list.
                 */
                __prep_account_new_huge_page(h, nid);
-               page_ref_dec(new_page);
                enqueue_huge_page(h, new_page);
 
                /*
@@ -2604,6 +2695,8 @@ retry:
 
 free_new:
        spin_unlock_irq(&hugetlb_lock);
+       /* Page has a zero ref count, but needs a ref to be freed */
+       set_page_refcounted(new_page);
        update_and_free_page(h, new_page, false);
 
        return ret;
@@ -2828,8 +2921,8 @@ static void __init gather_bootmem_prealloc(void)
                        prep_new_huge_page(h, page, page_to_nid(page));
                        put_page(page); /* add to the hugepage allocator */
                } else {
+                       /* VERY unlikely inflated ref count on a tail page */
                        free_gigantic_page(page, huge_page_order(h));
-                       pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
                }
 
                /*
@@ -4033,8 +4126,10 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
         * after this open call completes.  It is therefore safe to take a
         * new reference here without additional locking.
         */
-       if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+       if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+               resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
                kref_get(&resv->refs);
+       }
 }
 
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
index 1ae1ebc..aff4d27 100644 (file)
@@ -30,7 +30,7 @@ static int hwpoison_inject(void *data, u64 val)
        if (!hwpoison_filter_enable)
                goto inject;
 
-       shake_page(hpage, 0);
+       shake_page(hpage);
        /*
         * This implies unable to support non-LRU pages.
         */
index 31ff935..cf3cb93 100644 (file)
@@ -211,6 +211,10 @@ extern void zone_pcp_reset(struct zone *zone);
 extern void zone_pcp_disable(struct zone *zone);
 extern void zone_pcp_enable(struct zone *zone);
 
+extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
+                         phys_addr_t min_addr,
+                         int nid, bool exact_nid);
+
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
 /*
@@ -539,12 +543,17 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 
 #ifdef CONFIG_NUMA
 extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+extern int find_next_best_node(int node, nodemask_t *used_node_mask);
 #else
 static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
                                unsigned int order)
 {
        return NODE_RECLAIM_NOSCAN;
 }
+static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
+{
+       return NUMA_NO_NODE;
+}
 #endif
 
 extern int hwpoison_filter(struct page *p);
index e4c16f6..05d1e94 100644 (file)
@@ -37,16 +37,9 @@ enum kasan_arg_stacktrace {
        KASAN_ARG_STACKTRACE_ON,
 };
 
-enum kasan_arg_fault {
-       KASAN_ARG_FAULT_DEFAULT,
-       KASAN_ARG_FAULT_REPORT,
-       KASAN_ARG_FAULT_PANIC,
-};
-
 static enum kasan_arg kasan_arg __ro_after_init;
 static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
 static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
-static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
 
 /* Whether KASAN is enabled at all. */
 DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
@@ -59,9 +52,6 @@ EXPORT_SYMBOL_GPL(kasan_flag_async);
 /* Whether to collect alloc/free stack traces. */
 DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
 
-/* Whether to panic or print a report and disable tag checking on fault. */
-bool kasan_flag_panic __ro_after_init;
-
 /* kasan=off/on */
 static int __init early_kasan_flag(char *arg)
 {
@@ -113,23 +103,6 @@ static int __init early_kasan_flag_stacktrace(char *arg)
 }
 early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
 
-/* kasan.fault=report/panic */
-static int __init early_kasan_fault(char *arg)
-{
-       if (!arg)
-               return -EINVAL;
-
-       if (!strcmp(arg, "report"))
-               kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
-       else if (!strcmp(arg, "panic"))
-               kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
-       else
-               return -EINVAL;
-
-       return 0;
-}
-early_param("kasan.fault", early_kasan_fault);
-
 /* kasan_init_hw_tags_cpu() is called for each CPU. */
 void kasan_init_hw_tags_cpu(void)
 {
@@ -195,22 +168,6 @@ void __init kasan_init_hw_tags(void)
                break;
        }
 
-       switch (kasan_arg_fault) {
-       case KASAN_ARG_FAULT_DEFAULT:
-               /*
-                * Default to no panic on report.
-                * Do nothing, kasan_flag_panic keeps its default value.
-                */
-               break;
-       case KASAN_ARG_FAULT_REPORT:
-               /* Do nothing, kasan_flag_panic keeps its default value. */
-               break;
-       case KASAN_ARG_FAULT_PANIC:
-               /* Enable panic on report. */
-               kasan_flag_panic = true;
-               break;
-       }
-
        pr_info("KernelAddressSanitizer initialized\n");
 }
 
index fff93b0..8bf568a 100644 (file)
@@ -38,7 +38,6 @@ static inline bool kasan_async_mode_enabled(void)
 
 #endif
 
-extern bool kasan_flag_panic __ro_after_init;
 extern bool kasan_flag_async __ro_after_init;
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
index 8fff182..884a950 100644 (file)
@@ -39,6 +39,31 @@ static unsigned long kasan_flags;
 #define KASAN_BIT_REPORTED     0
 #define KASAN_BIT_MULTI_SHOT   1
 
+enum kasan_arg_fault {
+       KASAN_ARG_FAULT_DEFAULT,
+       KASAN_ARG_FAULT_REPORT,
+       KASAN_ARG_FAULT_PANIC,
+};
+
+static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT;
+
+/* kasan.fault=report/panic */
+static int __init early_kasan_fault(char *arg)
+{
+       if (!arg)
+               return -EINVAL;
+
+       if (!strcmp(arg, "report"))
+               kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
+       else if (!strcmp(arg, "panic"))
+               kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
+       else
+               return -EINVAL;
+
+       return 0;
+}
+early_param("kasan.fault", early_kasan_fault);
+
 bool kasan_save_enable_multi_shot(void)
 {
        return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
@@ -102,10 +127,8 @@ static void end_report(unsigned long *flags, unsigned long addr)
                panic_on_warn = 0;
                panic("panic_on_warn set ...\n");
        }
-#ifdef CONFIG_KASAN_HW_TAGS
-       if (kasan_flag_panic)
+       if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
                panic("kasan.fault=panic set ...\n");
-#endif
        kasan_enable_current();
 }
 
index b0412be..045cc57 100644 (file)
@@ -1721,7 +1721,7 @@ static void collapse_file(struct mm_struct *mm,
                                xas_unlock_irq(&xas);
                                /* swap in or instantiate fallocated page */
                                if (shmem_getpage(mapping->host, index, &page,
-                                                 SGP_NOHUGE)) {
+                                                 SGP_NOALLOC)) {
                                        result = SCAN_FAIL;
                                        goto xa_unlocked;
                                }
index 3fa9bc8..0253381 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -259,7 +259,7 @@ static unsigned long ksm_stable_node_chains;
 static unsigned long ksm_stable_node_dups;
 
 /* Delay in pruning stale stable_node_dups in the stable_node_chains */
-static int ksm_stable_node_chains_prune_millisecs = 2000;
+static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;
 
 /* Maximum number of page slots sharing a stable node */
 static int ksm_max_page_sharing = 256;
@@ -3105,11 +3105,11 @@ stable_node_chains_prune_millisecs_store(struct kobject *kobj,
                                         struct kobj_attribute *attr,
                                         const char *buf, size_t count)
 {
-       unsigned long msecs;
+       unsigned int msecs;
        int err;
 
-       err = kstrtoul(buf, 10, &msecs);
-       if (err || msecs > UINT_MAX)
+       err = kstrtouint(buf, 10, &msecs);
+       if (err)
                return -EINVAL;
 
        ksm_stable_node_chains_prune_millisecs = msecs;
index 56324a3..0734db8 100644 (file)
@@ -1048,6 +1048,7 @@ process_madvise_behavior_valid(int behavior)
        switch (behavior) {
        case MADV_COLD:
        case MADV_PAGEOUT:
+       case MADV_WILLNEED:
                return true;
        default:
                return false;
index e2ca8dd..0ab5a74 100644 (file)
@@ -315,7 +315,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
  * Return:
  * Found address on success, 0 on failure.
  */
-phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
+static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
                                        phys_addr_t end, phys_addr_t size,
                                        phys_addr_t align)
 {
@@ -1496,18 +1496,12 @@ void * __init memblock_alloc_exact_nid_raw(
                        phys_addr_t min_addr, phys_addr_t max_addr,
                        int nid)
 {
-       void *ptr;
-
        memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
                     __func__, (u64)size, (u64)align, nid, &min_addr,
                     &max_addr, (void *)_RET_IP_);
 
-       ptr = memblock_alloc_internal(size, align,
-                                          min_addr, max_addr, nid, true);
-       if (ptr && size > 0)
-               page_init_poison(ptr, size);
-
-       return ptr;
+       return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
+                                      true);
 }
 
 /**
@@ -1534,18 +1528,12 @@ void * __init memblock_alloc_try_nid_raw(
                        phys_addr_t min_addr, phys_addr_t max_addr,
                        int nid)
 {
-       void *ptr;
-
        memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
                     __func__, (u64)size, (u64)align, nid, &min_addr,
                     &max_addr, (void *)_RET_IP_);
 
-       ptr = memblock_alloc_internal(size, align,
-                                          min_addr, max_addr, nid, false);
-       if (ptr && size > 0)
-               page_init_poison(ptr, size);
-
-       return ptr;
+       return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
+                                      false);
 }
 
 /**
index 389b576..b762215 100644 (file)
@@ -103,6 +103,14 @@ static bool do_memsw_account(void)
        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 }
 
+/* memcg and lruvec stats flushing */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static void flush_memcg_stats_work(struct work_struct *w);
+static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work);
+static DEFINE_PER_CPU(unsigned int, stats_flush_threshold);
+static DEFINE_SPINLOCK(stats_flush_lock);
+
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
@@ -248,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
        return &memcg->vmpressure;
 }
 
-struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
 {
-       return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+       return container_of(vmpr, struct mem_cgroup, vmpressure);
 }
 
 #ifdef CONFIG_MEMCG_KMEM
@@ -645,17 +653,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
        cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 }
 
-/* idx can be of type enum memcg_stat_item or node_stat_item. */
-static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
-       long x = READ_ONCE(memcg->vmstats.state[idx]);
-#ifdef CONFIG_SMP
-       if (x < 0)
-               x = 0;
-#endif
-       return x;
-}
-
 /* idx can be of type enum memcg_stat_item or node_stat_item. */
 static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 {
@@ -671,23 +668,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
        return x;
 }
 
-static struct mem_cgroup_per_node *
-parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
-{
-       struct mem_cgroup *parent;
-
-       parent = parent_mem_cgroup(pn->memcg);
-       if (!parent)
-               return NULL;
-       return parent->nodeinfo[nid];
-}
-
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                              int val)
 {
        struct mem_cgroup_per_node *pn;
        struct mem_cgroup *memcg;
-       long x, threshold = MEMCG_CHARGE_BATCH;
 
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        memcg = pn->memcg;
@@ -696,21 +681,9 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        __mod_memcg_state(memcg, idx, val);
 
        /* Update lruvec */
-       __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
-       if (vmstat_item_in_bytes(idx))
-               threshold <<= PAGE_SHIFT;
-
-       x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
-       if (unlikely(abs(x) > threshold)) {
-               pg_data_t *pgdat = lruvec_pgdat(lruvec);
-               struct mem_cgroup_per_node *pi;
-
-               for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
-                       atomic_long_add(x, &pi->lruvec_stat[idx]);
-               x = 0;
-       }
-       __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+       __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+       if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH))
+               queue_work(system_unbound_wq, &stats_flush_work);
 }
 
 /**
@@ -905,7 +878,7 @@ EXPORT_SYMBOL(mem_cgroup_from_task);
 
 static __always_inline struct mem_cgroup *active_memcg(void)
 {
-       if (in_interrupt())
+       if (!in_task())
                return this_cpu_read(int_active_memcg);
        else
                return current->active_memcg;
@@ -2205,8 +2178,9 @@ static void drain_local_stock(struct work_struct *dummy)
        unsigned long flags;
 
        /*
-        * The only protection from memory hotplug vs. drain_stock races is
-        * that we always operate on local CPU stock here with IRQ disabled
+        * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
+        * drain_stock races is that we always operate on local CPU stock
+        * here with IRQ disabled
         */
        local_irq_save(flags);
 
@@ -2273,7 +2247,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
                if (memcg && stock->nr_pages &&
                    mem_cgroup_is_descendant(memcg, root_memcg))
                        flush = true;
-               if (obj_stock_flush_required(stock, root_memcg))
+               else if (obj_stock_flush_required(stock, root_memcg))
                        flush = true;
                rcu_read_unlock();
 
@@ -2289,40 +2263,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
        mutex_unlock(&percpu_charge_mutex);
 }
 
-static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
-{
-       int nid;
-
-       for_each_node(nid) {
-               struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
-               unsigned long stat[NR_VM_NODE_STAT_ITEMS];
-               struct batched_lruvec_stat *lstatc;
-               int i;
-
-               lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
-               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
-                       stat[i] = lstatc->count[i];
-                       lstatc->count[i] = 0;
-               }
-
-               do {
-                       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-                               atomic_long_add(stat[i], &pn->lruvec_stat[i]);
-               } while ((pn = parent_nodeinfo(pn, nid)));
-       }
-}
-
 static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
        struct memcg_stock_pcp *stock;
-       struct mem_cgroup *memcg;
 
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
 
-       for_each_mem_cgroup(memcg)
-               memcg_flush_lruvec_page_state(memcg, cpu);
-
        return 0;
 }
 
@@ -4116,7 +4063,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-       if (val > 100)
+       if (val > 200)
                return -EINVAL;
 
        if (!mem_cgroup_is_root(memcg))
@@ -4668,7 +4615,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
                    atomic_read(&frn->done.cnt) == 1) {
                        frn->at = 0;
                        trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
-                       cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+                       cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
                                               WB_REASON_FOREIGN_FLUSH,
                                               &frn->done);
                }
@@ -4892,9 +4839,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 
        vfs_poll(efile.file, &event->pt);
 
-       spin_lock(&memcg->event_list_lock);
+       spin_lock_irq(&memcg->event_list_lock);
        list_add(&event->list, &memcg->event_list);
-       spin_unlock(&memcg->event_list_lock);
+       spin_unlock_irq(&memcg->event_list_lock);
 
        fdput(cfile);
        fdput(efile);
@@ -5129,17 +5076,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
        if (!pn)
                return 1;
 
-       pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
-                                                GFP_KERNEL_ACCOUNT);
-       if (!pn->lruvec_stat_local) {
-               kfree(pn);
-               return 1;
-       }
-
-       pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
-                                              GFP_KERNEL_ACCOUNT);
-       if (!pn->lruvec_stat_cpu) {
-               free_percpu(pn->lruvec_stat_local);
+       pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
+                                                  GFP_KERNEL_ACCOUNT);
+       if (!pn->lruvec_stats_percpu) {
                kfree(pn);
                return 1;
        }
@@ -5160,8 +5099,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
        if (!pn)
                return;
 
-       free_percpu(pn->lruvec_stat_cpu);
-       free_percpu(pn->lruvec_stat_local);
+       free_percpu(pn->lruvec_stats_percpu);
        kfree(pn);
 }
 
@@ -5177,15 +5115,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
-       int cpu;
-
        memcg_wb_domain_exit(memcg);
-       /*
-        * Flush percpu lruvec stats to guarantee the value
-        * correctness on parent's and all ancestor levels.
-        */
-       for_each_online_cpu(cpu)
-               memcg_flush_lruvec_page_state(memcg, cpu);
        __mem_cgroup_free(memcg);
 }
 
@@ -5321,6 +5251,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
        /* Online state pins memcg ID, memcg ID pins CSS */
        refcount_set(&memcg->id.ref, 1);
        css_get(css);
+
+       if (unlikely(mem_cgroup_is_root(memcg)))
+               queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+                                  2UL*HZ);
        return 0;
 }
 
@@ -5334,12 +5268,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
         * Notify userspace about cgroup removing only after rmdir of cgroup
         * directory to avoid race between userspace and kernelspace.
         */
-       spin_lock(&memcg->event_list_lock);
+       spin_lock_irq(&memcg->event_list_lock);
        list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
                list_del_init(&event->list);
                schedule_work(&event->remove);
        }
-       spin_unlock(&memcg->event_list_lock);
+       spin_unlock_irq(&memcg->event_list_lock);
 
        page_counter_set_min(&memcg->memory, 0);
        page_counter_set_low(&memcg->memory, 0);
@@ -5412,13 +5346,33 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        memcg_wb_domain_size_changed(memcg);
 }
 
+void mem_cgroup_flush_stats(void)
+{
+       if (!spin_trylock(&stats_flush_lock))
+               return;
+
+       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+       spin_unlock(&stats_flush_lock);
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+       mem_cgroup_flush_stats();
+       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+}
+
+static void flush_memcg_stats_work(struct work_struct *w)
+{
+       mem_cgroup_flush_stats();
+}
+
 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *parent = parent_mem_cgroup(memcg);
        struct memcg_vmstats_percpu *statc;
        long delta, v;
-       int i;
+       int i, nid;
 
        statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
 
@@ -5466,6 +5420,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
                if (parent)
                        parent->vmstats.events_pending[i] += delta;
        }
+
+       for_each_node_state(nid, N_MEMORY) {
+               struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+               struct mem_cgroup_per_node *ppn = NULL;
+               struct lruvec_stats_percpu *lstatc;
+
+               if (parent)
+                       ppn = parent->nodeinfo[nid];
+
+               lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
+
+               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+                       delta = pn->lruvec_stats.state_pending[i];
+                       if (delta)
+                               pn->lruvec_stats.state_pending[i] = 0;
+
+                       v = READ_ONCE(lstatc->state[i]);
+                       if (v != lstatc->state_prev[i]) {
+                               delta += v - lstatc->state_prev[i];
+                               lstatc->state_prev[i] = v;
+                       }
+
+                       if (!delta)
+                               continue;
+
+                       pn->lruvec_stats.state[i] += delta;
+                       if (ppn)
+                               ppn->lruvec_stats.state_pending[i] += delta;
+               }
+       }
 }
 
 #ifdef CONFIG_MMU
@@ -6399,6 +6383,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
        int i;
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
+       cgroup_rstat_flush(memcg->css.cgroup);
+
        for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
                int nid;
 
@@ -6704,8 +6690,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                        atomic_long_read(&parent->memory.children_low_usage)));
 }
 
-static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
-                              gfp_t gfp)
+static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
 {
        unsigned int nr_pages = thp_nr_pages(page);
        int ret;
@@ -6726,7 +6711,7 @@ out:
 }
 
 /**
- * mem_cgroup_charge - charge a newly allocated page to a cgroup
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
  * @page: page to charge
  * @mm: mm context of the victim
  * @gfp_mask: reclaim mode
@@ -6739,16 +6724,14 @@ out:
  *
  * Returns 0 on success. Otherwise, an error code is returned.
  */
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+                       gfp_t gfp_mask)
 {
        struct mem_cgroup *memcg;
        int ret;
 
-       if (mem_cgroup_disabled())
-               return 0;
-
        memcg = get_mem_cgroup_from_mm(mm);
-       ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+       ret = charge_memcg(page, memcg, gfp_mask);
        css_put(&memcg->css);
 
        return ret;
@@ -6783,7 +6766,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
                memcg = get_mem_cgroup_from_mm(mm);
        rcu_read_unlock();
 
-       ret = __mem_cgroup_charge(page, memcg, gfp);
+       ret = charge_memcg(page, memcg, gfp);
 
        css_put(&memcg->css);
        return ret;
@@ -6919,18 +6902,15 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 }
 
 /**
- * mem_cgroup_uncharge - uncharge a page
+ * __mem_cgroup_uncharge - uncharge a page
  * @page: page to uncharge
  *
- * Uncharge a page previously charged with mem_cgroup_charge().
+ * Uncharge a page previously charged with __mem_cgroup_charge().
  */
-void mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct page *page)
 {
        struct uncharge_gather ug;
 
-       if (mem_cgroup_disabled())
-               return;
-
        /* Don't touch page->lru of any random page, pre-check: */
        if (!page_memcg(page))
                return;
@@ -6941,20 +6921,17 @@ void mem_cgroup_uncharge(struct page *page)
 }
 
 /**
- * mem_cgroup_uncharge_list - uncharge a list of page
+ * __mem_cgroup_uncharge_list - uncharge a list of page
  * @page_list: list of pages to uncharge
  *
  * Uncharge a list of pages previously charged with
- * mem_cgroup_charge().
+ * __mem_cgroup_charge().
  */
-void mem_cgroup_uncharge_list(struct list_head *page_list)
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
 {
        struct uncharge_gather ug;
        struct page *page;
 
-       if (mem_cgroup_disabled())
-               return;
-
        uncharge_gather_clear(&ug);
        list_for_each_entry(page, page_list, lru)
                uncharge_page(page, &ug);
@@ -7244,7 +7221,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 }
 
 /**
- * mem_cgroup_try_charge_swap - try charging swap space for a page
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
  * @page: page being added to swap
  * @entry: swap entry to charge
  *
@@ -7252,16 +7229,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  *
  * Returns 0 on success, -ENOMEM on failure.
  */
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 {
        unsigned int nr_pages = thp_nr_pages(page);
        struct page_counter *counter;
        struct mem_cgroup *memcg;
        unsigned short oldid;
 
-       if (mem_cgroup_disabled())
-               return 0;
-
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return 0;
 
@@ -7297,11 +7271,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 }
 
 /**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
  * @entry: swap entry to uncharge
  * @nr_pages: the amount of swap space to uncharge
  */
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 {
        struct mem_cgroup *memcg;
        unsigned short id;
index e1f87cf..54879c3 100644 (file)
@@ -68,7 +68,7 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
 
 static bool __page_handle_poison(struct page *page)
 {
-       bool ret;
+       int ret;
 
        zone_pcp_disable(page_zone(page));
        ret = dissolve_free_huge_page(page);
@@ -76,7 +76,7 @@ static bool __page_handle_poison(struct page *page)
                ret = take_page_off_buddy(page);
        zone_pcp_enable(page_zone(page));
 
-       return ret;
+       return ret > 0;
 }
 
 static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
@@ -282,9 +282,9 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
 
 /*
  * Unknown page type encountered. Try to check whether it can turn PageLRU by
- * lru_add_drain_all, or a free page by reclaiming slabs when possible.
+ * lru_add_drain_all.
  */
-void shake_page(struct page *p, int access)
+void shake_page(struct page *p)
 {
        if (PageHuge(p))
                return;
@@ -296,11 +296,9 @@ void shake_page(struct page *p, int access)
        }
 
        /*
-        * Only call shrink_node_slabs here (which would also shrink
-        * other caches) if access is not potentially fatal.
+        * TODO: Could shrink slab caches here if a lightweight range-based
+        * shrinker will be available.
         */
-       if (access)
-               drop_slab_node(page_to_nid(p));
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
@@ -391,8 +389,8 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 /*
  * Kill the processes that have been collected earlier.
  *
- * Only do anything when DOIT is set, otherwise just free the list
- * (this is used for clean pages which do not need killing)
+ * Only do anything when FORCEKILL is set, otherwise just free the
+ * list (this is used for clean pages which do not need killing)
  * Also when FAIL is set do a force kill because something went
  * wrong earlier.
  */
@@ -632,7 +630,7 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
 {
        struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
        int ret = 0;
-       pte_t *ptep;
+       pte_t *ptep, *mapped_pte;
        spinlock_t *ptl;
 
        ptl = pmd_trans_huge_lock(pmdp, walk->vma);
@@ -645,14 +643,15 @@ static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
        if (pmd_trans_unstable(pmdp))
                goto out;
 
-       ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl);
+       mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
+                                               addr, &ptl);
        for (; addr != end; ptep++, addr += PAGE_SIZE) {
                ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
                                             hwp->pfn, &hwp->tk);
                if (ret == 1)
                        break;
        }
-       pte_unmap_unlock(ptep - 1, ptl);
+       pte_unmap_unlock(mapped_pte, ptl);
 out:
        cond_resched();
        return ret;
@@ -1204,7 +1203,7 @@ try_again:
                         * page, retry.
                         */
                        if (pass++ < 3) {
-                               shake_page(p, 1);
+                               shake_page(p);
                                goto try_again;
                        }
                        ret = -EIO;
@@ -1221,7 +1220,7 @@ try_again:
                 */
                if (pass++ < 3) {
                        put_page(p);
-                       shake_page(p, 1);
+                       shake_page(p);
                        count_increased = false;
                        goto try_again;
                }
@@ -1229,6 +1228,9 @@ try_again:
                ret = -EIO;
        }
 out:
+       if (ret == -EIO)
+               dump_page(p, "hwpoison: unhandlable page");
+
        return ret;
 }
 
@@ -1270,14 +1272,13 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
  * the pages and send SIGBUS to the processes if the data was dirty.
  */
 static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
-                                 int flags, struct page **hpagep)
+                                 int flags, struct page *hpage)
 {
        enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
        struct address_space *mapping;
        LIST_HEAD(tokill);
        bool unmap_success;
        int kill = 1, forcekill;
-       struct page *hpage = *hpagep;
        bool mlocked = PageMlocked(hpage);
 
        /*
@@ -1369,7 +1370,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * shake_page() again to ensure that it's flushed.
         */
        if (mlocked)
-               shake_page(hpage, 0);
+               shake_page(hpage);
 
        /*
         * Now that the dirty bit has been propagated to the
@@ -1502,7 +1503,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
                goto out;
        }
 
-       if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
+       if (!hwpoison_user_mappings(p, pfn, flags, head)) {
                action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
                res = -EBUSY;
                goto out;
@@ -1518,7 +1519,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                struct dev_pagemap *pgmap)
 {
        struct page *page = pfn_to_page(pfn);
-       const bool unmap_success = true;
        unsigned long size = 0;
        struct to_kill *tk;
        LIST_HEAD(tokill);
@@ -1590,7 +1590,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                start = (page->index << PAGE_SHIFT) & ~(size - 1);
                unmap_mapping_range(page->mapping, start, size, 0);
        }
-       kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+       kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags);
        rc = 0;
 unlock:
        dax_unlock_page(page, cookie);
@@ -1724,7 +1724,7 @@ try_again:
         * The check (unnecessarily) ignores LRU pages being isolated and
         * walked by the page reclaim code, however that's not a big loss.
         */
-       shake_page(p, 0);
+       shake_page(p);
 
        lock_page(p);
 
@@ -1783,7 +1783,7 @@ try_again:
         * Now take care of user space mappings.
         * Abort on fail: __delete_from_page_cache() assumes unmapped page.
         */
-       if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
+       if (!hwpoison_user_mappings(p, pfn, flags, p)) {
                action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
                res = -EBUSY;
                goto unlock_page;
@@ -2099,7 +2099,7 @@ static int __soft_offline_page(struct page *page)
 
        if (isolate_page(hpage, &pagelist)) {
                ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
-                       (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
+                       (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
                if (!ret) {
                        bool release = !huge;
 
@@ -2208,9 +2208,6 @@ retry:
                        try_again = false;
                        goto retry;
                }
-       } else if (ret == -EIO) {
-               pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n",
-                        __func__, pfn, page->flags, &page->flags);
        }
 
        return ret;
index 86c3af7..4c527a8 100644 (file)
@@ -1469,7 +1469,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                if (nodes_empty(nmask))
                        node_set(mtc.nid, nmask);
                ret = migrate_pages(&source, alloc_migration_target, NULL,
-                       (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+                       (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
                if (ret) {
                        list_for_each_entry(page, &source, lru) {
                                if (__ratelimit(&migrate_rs)) {
index e32360e..5e90b3f 100644 (file)
@@ -31,6 +31,9 @@
  *                but useful to set in a VMA when you have a non default
  *                process policy.
  *
+ * preferred many Try a set of nodes first before normal fallback. This is
+ *                similar to preferred without the special case.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *               in a NUMA aware kernel and still does by, ahem, default.
@@ -189,7 +192,7 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
        nodes_onto(*ret, tmp, *rel);
 }
 
-static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 {
        if (nodes_empty(*nodes))
                return -EINVAL;
@@ -207,14 +210,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
        return 0;
 }
 
-static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
-{
-       if (nodes_empty(*nodes))
-               return -EINVAL;
-       pol->nodes = *nodes;
-       return 0;
-}
-
 /*
  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
  * any, for the new policy.  mpol_new() has already validated the nodes
@@ -394,7 +389,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
-               .create = mpol_new_interleave,
+               .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_PREFERRED] = {
@@ -402,12 +397,16 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
-               .create = mpol_new_bind,
+               .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_LOCAL] = {
                .rebind = mpol_rebind_default,
        },
+       [MPOL_PREFERRED_MANY] = {
+               .create = mpol_new_nodemask,
+               .rebind = mpol_rebind_preferred,
+       },
 };
 
 static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -900,6 +899,7 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_PREFERRED:
+       case MPOL_PREFERRED_MANY:
                *nodes = p->nodes;
                break;
        case MPOL_LOCAL:
@@ -1084,7 +1084,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, alloc_migration_target, NULL,
-                               (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+                               (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
                if (err)
                        putback_movable_pages(&pagelist);
        }
@@ -1338,7 +1338,7 @@ static long do_mbind(unsigned long start, unsigned long len,
                if (!list_empty(&pagelist)) {
                        WARN_ON_ONCE(flags & MPOL_MF_LAZY);
                        nr_failed = migrate_pages(&pagelist, new_page, NULL,
-                               start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+                               start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
                        if (nr_failed)
                                putback_movable_pages(&pagelist);
                }
@@ -1446,7 +1446,8 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
 {
        *flags = *mode & MPOL_MODE_FLAGS;
        *mode &= ~MPOL_MODE_FLAGS;
-       if ((unsigned int)(*mode) >= MPOL_MAX)
+
+       if ((unsigned int)(*mode) >=  MPOL_MAX)
                return -EINVAL;
        if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
                return -EINVAL;
@@ -1875,16 +1876,27 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
  */
 nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 {
+       int mode = policy->mode;
+
        /* Lower zones don't get a nodemask applied for MPOL_BIND */
-       if (unlikely(policy->mode == MPOL_BIND) &&
-                       apply_policy_zone(policy, gfp_zone(gfp)) &&
-                       cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+       if (unlikely(mode == MPOL_BIND) &&
+               apply_policy_zone(policy, gfp_zone(gfp)) &&
+               cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+               return &policy->nodes;
+
+       if (mode == MPOL_PREFERRED_MANY)
                return &policy->nodes;
 
        return NULL;
 }
 
-/* Return the node id preferred by the given mempolicy, or the given id */
+/*
+ * Return the preferred node id for 'prefer' mempolicy, and return
+ * the given id for all other policies.
+ *
+ * policy_node() is always coupled with policy_nodemask(), which
+ * secures the nodemask limit for 'bind' and 'prefer-many' policy.
+ */
 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 {
        if (policy->mode == MPOL_PREFERRED) {
@@ -1922,7 +1934,7 @@ unsigned int mempolicy_slab_node(void)
        struct mempolicy *policy;
        int node = numa_mem_id();
 
-       if (in_interrupt())
+       if (!in_task())
                return node;
 
        policy = current->mempolicy;
@@ -1936,7 +1948,9 @@ unsigned int mempolicy_slab_node(void)
        case MPOL_INTERLEAVE:
                return interleave_nodes(policy);
 
-       case MPOL_BIND: {
+       case MPOL_BIND:
+       case MPOL_PREFERRED_MANY:
+       {
                struct zoneref *z;
 
                /*
@@ -2008,12 +2022,12 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @addr: address in @vma for shared policy lookup and interleave policy
  * @gfp_flags: for requested zone
  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
+ * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
  *
  * Returns a nid suitable for a huge page allocation and a pointer
  * to the struct mempolicy for conditional unref after allocation.
- * If the effective policy is 'BIND, returns a pointer to the mempolicy's
- * @nodemask for filtering the zonelist.
+ * If the effective policy is 'bind' or 'prefer-many', returns a pointer
+ * to the mempolicy's @nodemask for filtering the zonelist.
  *
  * Must be protected by read_mems_allowed_begin()
  */
@@ -2021,16 +2035,18 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask)
 {
        int nid;
+       int mode;
 
        *mpol = get_vma_policy(vma, addr);
-       *nodemask = NULL;       /* assume !MPOL_BIND */
+       *nodemask = NULL;
+       mode = (*mpol)->mode;
 
-       if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
+       if (unlikely(mode == MPOL_INTERLEAVE)) {
                nid = interleave_nid(*mpol, vma, addr,
                                        huge_page_shift(hstate_vma(vma)));
        } else {
                nid = policy_node(gfp_flags, *mpol, numa_node_id());
-               if ((*mpol)->mode == MPOL_BIND)
+               if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
                        *nodemask = &(*mpol)->nodes;
        }
        return nid;
@@ -2063,6 +2079,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
        mempolicy = current->mempolicy;
        switch (mempolicy->mode) {
        case MPOL_PREFERRED:
+       case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                *mask = mempolicy->nodes;
@@ -2128,6 +2145,27 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
        return page;
 }
 
+static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
+                                               int nid, struct mempolicy *pol)
+{
+       struct page *page;
+       gfp_t preferred_gfp;
+
+       /*
+        * This is a two-pass approach. The first pass tries only the
+        * preferred nodes, skipping direct reclaim and allowing the
+        * allocation to fail, while the second pass tries all the
+        * nodes in the system.
+        */
+       preferred_gfp = gfp | __GFP_NOWARN;
+       preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+       page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
+       if (!page)
+               page = __alloc_pages(gfp, order, numa_node_id(), NULL);
+
+       return page;
+}
+
 /**
  * alloc_pages_vma - Allocate a page for a VMA.
  * @gfp: GFP flags.
@@ -2163,6 +2201,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                goto out;
        }
 
+       if (pol->mode == MPOL_PREFERRED_MANY) {
+               page = alloc_pages_preferred_many(gfp, order, node, pol);
+               mpol_cond_put(pol);
+               goto out;
+       }
+
        if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
                int hpage_node = node;
 
@@ -2173,7 +2217,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                 * node and don't fall back to other nodes, as the cost of
                 * remote accesses would likely offset THP benefits.
                 *
-                * If the policy is interleave, or does not allow the current
+                * If the policy is interleave or does not allow the current
                 * node in its nodemask, we allocate the standard way.
                 */
                if (pol->mode == MPOL_PREFERRED)
@@ -2240,6 +2284,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
         */
        if (pol->mode == MPOL_INTERLEAVE)
                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+       else if (pol->mode == MPOL_PREFERRED_MANY)
+               page = alloc_pages_preferred_many(gfp, order,
+                               numa_node_id(), pol);
        else
                page = __alloc_pages(gfp, order,
                                policy_node(gfp, pol, numa_node_id()),
@@ -2311,6 +2358,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_PREFERRED:
+       case MPOL_PREFERRED_MANY:
                return !!nodes_equal(a->nodes, b->nodes);
        case MPOL_LOCAL:
                return true;
@@ -2425,8 +2473,8 @@ static void sp_free(struct sp_node *n)
  * node id.  Policy determination "mimics" alloc_page_vma().
  * Called from fault path where we know the vma and faulting address.
  *
- * Return: -1 if the page is in a node that is valid for this policy, or a
- * suitable node ID to allocate a replacement page from.
+ * Return: NUMA_NO_NODE if the page is in a node that is valid for this
+ * policy, or a suitable node ID to allocate a replacement page from.
  */
 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
 {
@@ -2437,7 +2485,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
        int thiscpu = raw_smp_processor_id();
        int thisnid = cpu_to_node(thiscpu);
        int polnid = NUMA_NO_NODE;
-       int ret = -1;
+       int ret = NUMA_NO_NODE;
 
        pol = get_vma_policy(vma, addr);
        if (!(pol->flags & MPOL_F_MOF))
@@ -2451,6 +2499,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
                break;
 
        case MPOL_PREFERRED:
+               if (node_isset(curnid, pol->nodes))
+                       goto out;
                polnid = first_node(pol->nodes);
                break;
 
@@ -2465,9 +2515,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
                                break;
                        goto out;
                }
+               fallthrough;
 
+       case MPOL_PREFERRED_MANY:
                /*
-                * allows binding to multiple nodes.
                 * use current page if in policy nodemask,
                 * else select nearest allowed node, if any.
                 * If no allowed nodes, use current [!misplaced].
@@ -2829,6 +2880,7 @@ static const char * const policy_modes[] =
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_LOCAL]      = "local",
+       [MPOL_PREFERRED_MANY]  = "prefer (many)",
 };
 
 
@@ -2907,6 +2959,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
                if (!nodelist)
                        err = 0;
                goto out;
+       case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
@@ -2993,6 +3046,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
        case MPOL_LOCAL:
                break;
        case MPOL_PREFERRED:
+       case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                nodes = pol->nodes;
@@ -3021,3 +3075,64 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
                               nodemask_pr_args(&nodes));
 }
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+                                         struct kobj_attribute *attr, char *buf)
+{
+       return sysfs_emit(buf, "%s\n",
+                         numa_demotion_enabled? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+                                          struct kobj_attribute *attr,
+                                          const char *buf, size_t count)
+{
+       if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+               numa_demotion_enabled = true;
+       else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+               numa_demotion_enabled = false;
+       else
+               return -EINVAL;
+
+       return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+       __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+              numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+       &numa_demotion_enabled_attr.attr,
+       NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+       .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+       int err;
+       struct kobject *numa_kobj;
+
+       numa_kobj = kobject_create_and_add("numa", mm_kobj);
+       if (!numa_kobj) {
+               pr_err("failed to create numa kobject\n");
+               return -ENOMEM;
+       }
+       err = sysfs_create_group(numa_kobj, &numa_attr_group);
+       if (err) {
+               pr_err("failed to register numa group\n");
+               goto delete_obj;
+       }
+       return 0;
+
+delete_obj:
+       kobject_put(numa_kobj);
+       return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif
index 7e24043..a0aeb3f 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/sched/mm.h>
 #include <linux/ptrace.h>
 #include <linux/oom.h>
+#include <linux/memory.h>
 
 #include <asm/tlbflush.h>
 
@@ -1099,6 +1100,80 @@ out:
        return rc;
 }
 
+
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets.  Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node.  The
+ * CPUs are placed in the node with the "fast" memory.  The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ *     Socket A: 0, 1, 2
+ *     Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1.  When Node 1 fills up, it should be migrated to
+ * Node 2.  The migration path starts on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progresses through medium and ends with the
+ * slow memory:
+ *
+ *     0 -> 1 -> 2 -> stop
+ *     3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ *     {  1, // Node 0 migrates to 1
+ *        2, // Node 1 migrates to 2
+ *       -1, // Node 2 does not migrate
+ *        4, // Node 3 migrates to 4
+ *        5, // Node 4 migrates to 5
+ *       -1} // Node 5 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking.  Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+static int node_demotion[MAX_NUMNODES] __read_mostly =
+       {[0 ...  MAX_NUMNODES - 1] = NUMA_NO_NODE};
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node from which to look up the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+       int target;
+
+       /*
+        * node_demotion[] is updated without excluding this
+        * function from running.  RCU doesn't provide any
+        * compiler barriers, so the READ_ONCE() is required
+        * to avoid compiler reordering or read merging.
+        *
+        * Make sure to use RCU over entire code blocks if
+        * node_demotion[] reads need to be consistent.
+        */
+       rcu_read_lock();
+       target = READ_ONCE(node_demotion[node]);
+       rcu_read_unlock();
+
+       return target;
+}
+
 /*
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
@@ -1354,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2,
  * @mode:              The migration mode that specifies the constraints for
  *                     page migration, if any.
  * @reason:            The reason for page migration.
+ * @ret_succeeded:     Set to the number of pages migrated successfully if
+ *                     the caller passes a non-NULL pointer.
  *
  * The function returns after 10 attempts or if no pages are movable any more
  * because the list has become empty or no retryable pages exist any more.
@@ -1364,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
                free_page_t put_new_page, unsigned long private,
-               enum migrate_mode mode, int reason)
+               enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
 {
        int retry = 1;
        int thp_retry = 1;
@@ -1519,6 +1596,9 @@ out:
        if (!swapwrite)
                current->flags &= ~PF_SWAPWRITE;
 
+       if (ret_succeeded)
+               *ret_succeeded = nr_succeeded;
+
        return rc;
 }
 
@@ -1588,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm,
        };
 
        err = migrate_pages(pagelist, alloc_migration_target, NULL,
-                       (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+               (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
        if (err)
                putback_movable_pages(pagelist);
        return err;
@@ -2103,7 +2183,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 
        list_add(&page->lru, &migratepages);
        nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
-                                    MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+                                    MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
        if (nr_remaining) {
                if (!list_empty(&migratepages)) {
                        list_del(&page->lru);
@@ -2982,3 +3062,232 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
 }
 EXPORT_SYMBOL(migrate_vma_finalize);
 #endif /* CONFIG_DEVICE_PRIVATE */
+
+#if defined(CONFIG_MEMORY_HOTPLUG)
+/* Disable reclaim-based migration by clearing each online node's target. */
+static void __disable_all_migrate_targets(void)
+{
+       int node;
+
+       for_each_online_node(node)
+               node_demotion[node] = NUMA_NO_NODE;
+}
+
+static void disable_all_migrate_targets(void)
+{
+       __disable_all_migrate_targets();
+
+       /*
+        * Ensure that the "disable" is visible across the system.
+        * Readers will see either a combination of before+disable
+        * state or disable+after.  They will never see before and
+        * after state together.
+        *
+        * The before+after state together might have cycles and
+        * could cause readers to do things like loop until this
+        * function finishes.  This ensures they can only see a
+        * single "bad" read and would, for instance, only loop
+        * once.
+        */
+       synchronize_rcu();
+}
+
+/*
+ * Find an automatic demotion target for 'node'.
+ * Failing here is OK.  It might just indicate
+ * being at the end of a chain.
+ */
+static int establish_migrate_target(int node, nodemask_t *used)
+{
+       int migration_target;
+
+       /*
+        * Cannot set a migration target on a
+        * node with it already set.
+        *
+        * No need for READ_ONCE() here since this
+        * is in the write path for node_demotion[].
+        * This should be the only thread writing.
+        */
+       if (node_demotion[node] != NUMA_NO_NODE)
+               return NUMA_NO_NODE;
+
+       migration_target = find_next_best_node(node, used);
+       if (migration_target == NUMA_NO_NODE)
+               return NUMA_NO_NODE;
+
+       node_demotion[node] = migration_target;
+
+       return migration_target;
+}
+
+/*
+ * When memory fills up on a node, memory contents can be
+ * automatically migrated to another node instead of
+ * discarded at reclaim.
+ *
+ * Establish a "migration path" which will start at nodes
+ * with CPUs and will follow the priorities used to build the
+ * page allocator zonelists.
+ *
+ * The difference here is that cycles must be avoided.  If
+ * node0 migrates to node1, then neither node1, nor anything
+ * node1 migrates to can migrate to node0.
+ *
+ * This function can run simultaneously with readers of
+ * node_demotion[].  However, it cannot run simultaneously
+ * with itself.  Exclusion is provided by memory hotplug events
+ * being single-threaded.
+ */
+static void __set_migration_target_nodes(void)
+{
+       nodemask_t next_pass    = NODE_MASK_NONE;
+       nodemask_t this_pass    = NODE_MASK_NONE;
+       nodemask_t used_targets = NODE_MASK_NONE;
+       int node;
+
+       /*
+        * Avoid any oddities like cycles that could occur
+        * from changes in the topology.  This will leave
+        * a momentary gap when migration is disabled.
+        */
+       disable_all_migrate_targets();
+
+       /*
+        * Allocations go close to CPUs, first.  Assume that
+        * the migration path starts at the nodes with CPUs.
+        */
+       next_pass = node_states[N_CPU];
+again:
+       this_pass = next_pass;
+       next_pass = NODE_MASK_NONE;
+       /*
+        * To avoid cycles in the migration "graph", ensure
+        * that migration sources are not future targets by
+        * setting them in 'used_targets'.  Do this only
+        * once per pass so that multiple source nodes can
+        * share a target node.
+        *
+        * 'used_targets' will become unavailable in future
+        * passes.  This limits some opportunities for
+        * multiple source nodes to share a destination.
+        */
+       nodes_or(used_targets, used_targets, this_pass);
+       for_each_node_mask(node, this_pass) {
+               int target_node = establish_migrate_target(node, &used_targets);
+
+               if (target_node == NUMA_NO_NODE)
+                       continue;
+
+               /*
+                * Visit targets from this pass in the next pass.
+                * Eventually, every node will have been part of
+                * a pass, and will become set in 'used_targets'.
+                */
+               node_set(target_node, next_pass);
+       }
+       /*
+        * 'next_pass' contains nodes which became migration
+        * targets in this pass.  Make additional passes until
+        * no more migration targets are available.
+        */
+       if (!nodes_empty(next_pass))
+               goto again;
+}
+
+/*
+ * For callers that do not hold get_online_mems() already.
+ */
+static void set_migration_target_nodes(void)
+{
+       get_online_mems();
+       __set_migration_target_nodes();
+       put_online_mems();
+}
+
+/*
+ * React to hotplug events that might affect the migration targets
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering is also currently dependent on which nodes have
+ * CPUs.  That means we need CPU on/offline notification too.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+       set_migration_target_nodes();
+       return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+       set_migration_target_nodes();
+       return 0;
+}
+
+/*
+ * This leaves migrate-on-reclaim transiently disabled between
+ * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
+ * whether reclaim-based migration is enabled or not, which
+ * ensures that the user can toggle reclaim-based migration at
+ * any time without needing to recalculate migration targets.
+ *
+ * These callbacks already hold get_online_mems().  That is why
+ * __set_migration_target_nodes() can be used as opposed to
+ * set_migration_target_nodes().
+ */
+static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
+                                                unsigned long action, void *arg)
+{
+       switch (action) {
+       case MEM_GOING_OFFLINE:
+               /*
+                * Make sure there are not transient states where
+                * an offline node is a migration target.  This
+                * will leave migration disabled until the offline
+                * completes and the MEM_OFFLINE case below runs.
+                */
+               disable_all_migrate_targets();
+               break;
+       case MEM_OFFLINE:
+       case MEM_ONLINE:
+               /*
+                * Recalculate the target nodes once the node
+                * reaches its final state (online or offline).
+                */
+               __set_migration_target_nodes();
+               break;
+       case MEM_CANCEL_OFFLINE:
+               /*
+                * MEM_GOING_OFFLINE disabled all the migration
+                * targets.  Reenable them.
+                */
+               __set_migration_target_nodes();
+               break;
+       case MEM_GOING_ONLINE:
+       case MEM_CANCEL_ONLINE:
+               break;
+       }
+
+       return notifier_from_errno(0);
+}
+
+static int __init migrate_on_reclaim_init(void)
+{
+       int ret;
+
+       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
+                               migration_online_cpu,
+                               migration_offline_cpu);
+       /*
+        * In the unlikely case that this fails, the automatic
+        * migration targets may become suboptimal for nodes
+        * where N_CPU changes.  With such a small impact in a
+        * rare case, do not bother trying to do anything special.
+        */
+       WARN_ON(ret < 0);
+
+       hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+       return 0;
+}
+late_initcall(migrate_on_reclaim_init);
+#endif /* CONFIG_MEMORY_HOTPLUG */
index 181a113..dce4610 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -534,6 +534,7 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
 {
        struct rb_node **__rb_link, *__rb_parent, *rb_prev;
 
+       mmap_assert_locked(mm);
        __rb_link = &mm->mm_rb.rb_node;
        rb_prev = __rb_parent = NULL;
 
@@ -2297,6 +2298,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
        struct rb_node *rb_node;
        struct vm_area_struct *vma;
 
+       mmap_assert_locked(mm);
        /* Check the cache first. */
        vma = vmacache_find(mm, addr);
        if (likely(vma))
@@ -2986,14 +2988,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
        if (mmap_write_lock_killable(mm))
                return -EINTR;
 
-       vma = find_vma(mm, start);
+       vma = vma_lookup(mm, start);
 
        if (!vma || !(vma->vm_flags & VM_SHARED))
                goto out;
 
-       if (start < vma->vm_start)
-               goto out;
-
        if (start + size > vma->vm_end) {
                struct vm_area_struct *next;
 
index 5989d39..badfe17 100644 (file)
@@ -686,7 +686,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
                /* OOM: unable to split vma, just get accounts right */
                if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
-                       vm_acct_memory(new_len >> PAGE_SHIFT);
+                       vm_acct_memory(old_len >> PAGE_SHIFT);
                excess = 0;
        }
 
index c729a4c..831340e 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/debug.h>
 #include <linux/swap.h>
+#include <linux/syscalls.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
 #include <linux/cpuset.h>
@@ -1141,3 +1142,72 @@ void pagefault_out_of_memory(void)
        out_of_memory(&oc);
        mutex_unlock(&oom_lock);
 }
+
+SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_MMU
+       struct mm_struct *mm = NULL;
+       struct task_struct *task;
+       struct task_struct *p;
+       unsigned int f_flags;
+       bool reap = true;
+       struct pid *pid;
+       long ret = 0;
+
+       if (flags)
+               return -EINVAL;
+
+       pid = pidfd_get_pid(pidfd, &f_flags);
+       if (IS_ERR(pid))
+               return PTR_ERR(pid);
+
+       task = get_pid_task(pid, PIDTYPE_TGID);
+       if (!task) {
+               ret = -ESRCH;
+               goto put_pid;
+       }
+
+       /*
+        * Make sure to choose a thread which still has a reference to mm
+        * during the group exit
+        */
+       p = find_lock_task_mm(task);
+       if (!p) {
+               ret = -ESRCH;
+               goto put_task;
+       }
+
+       mm = p->mm;
+       mmgrab(mm);
+
+       /* If the work has been done already, just exit with success */
+       if (test_bit(MMF_OOM_SKIP, &mm->flags))
+               reap = false;
+       else if (!task_will_free_mem(p)) {
+               reap = false;
+               ret = -EINVAL;
+       }
+       task_unlock(p);
+
+       if (!reap)
+               goto drop_mm;
+
+       if (mmap_read_lock_killable(mm)) {
+               ret = -EINTR;
+               goto drop_mm;
+       }
+       if (!__oom_reap_task_mm(mm))
+               ret = -EAGAIN;
+       mmap_read_unlock(mm);
+
+drop_mm:
+       mmdrop(mm);
+put_task:
+       put_task_struct(task);
+put_pid:
+       put_pid(pid);
+       return ret;
+#else
+       return -ENOSYS;
+#endif /* CONFIG_MMU */
+}
index c12f67c..4812a17 100644 (file)
@@ -183,7 +183,7 @@ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
 static void wb_min_max_ratio(struct bdi_writeback *wb,
                             unsigned long *minp, unsigned long *maxp)
 {
-       unsigned long this_bw = wb->avg_write_bandwidth;
+       unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
        unsigned long long min = wb->bdi->min_ratio;
        unsigned long long max = wb->bdi->max_ratio;
@@ -892,7 +892,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
 static void wb_position_ratio(struct dirty_throttle_control *dtc)
 {
        struct bdi_writeback *wb = dtc->wb;
-       unsigned long write_bw = wb->avg_write_bandwidth;
+       unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
        unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long wb_thresh = dtc->wb_thresh;
@@ -1115,7 +1115,7 @@ out:
                                        &wb->bdi->tot_write_bandwidth) <= 0);
        }
        wb->write_bandwidth = bw;
-       wb->avg_write_bandwidth = avg;
+       WRITE_ONCE(wb->avg_write_bandwidth, avg);
 }
 
 static void update_dirty_limit(struct dirty_throttle_control *dtc)
@@ -1147,8 +1147,8 @@ update:
        dom->dirty_limit = limit;
 }
 
-static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
-                                   unsigned long now)
+static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
+                                     unsigned long now)
 {
        struct wb_domain *dom = dtc_dom(dtc);
 
@@ -1324,7 +1324,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
        else
                dirty_ratelimit -= step;
 
-       wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+       WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
        wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 
        trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
@@ -1332,35 +1332,28 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
 
 static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
                                  struct dirty_throttle_control *mdtc,
-                                 unsigned long start_time,
                                  bool update_ratelimit)
 {
        struct bdi_writeback *wb = gdtc->wb;
        unsigned long now = jiffies;
-       unsigned long elapsed = now - wb->bw_time_stamp;
+       unsigned long elapsed;
        unsigned long dirtied;
        unsigned long written;
 
-       lockdep_assert_held(&wb->list_lock);
+       spin_lock(&wb->list_lock);
 
        /*
-        * rate-limit, only update once every 200ms.
+        * Lockless checks for elapsed time are racy and delayed update after
+        * IO completion doesn't do it at all (to make sure written pages are
+        * accounted reasonably quickly). Make sure elapsed >= 1 to avoid
+        * division errors.
         */
-       if (elapsed < BANDWIDTH_INTERVAL)
-               return;
-
+       elapsed = max(now - wb->bw_time_stamp, 1UL);
        dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
        written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
 
-       /*
-        * Skip quiet periods when disk bandwidth is under-utilized.
-        * (at least 1s idle time between two flusher runs)
-        */
-       if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
-               goto snapshot;
-
        if (update_ratelimit) {
-               domain_update_bandwidth(gdtc, now);
+               domain_update_dirty_limit(gdtc, now);
                wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
 
                /*
@@ -1368,23 +1361,41 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
                 * compiler has no way to figure that out.  Help it.
                 */
                if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
-                       domain_update_bandwidth(mdtc, now);
+                       domain_update_dirty_limit(mdtc, now);
                        wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
                }
        }
        wb_update_write_bandwidth(wb, elapsed, written);
 
-snapshot:
        wb->dirtied_stamp = dirtied;
        wb->written_stamp = written;
-       wb->bw_time_stamp = now;
+       WRITE_ONCE(wb->bw_time_stamp, now);
+       spin_unlock(&wb->list_lock);
 }
 
-void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb)
 {
        struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
 
-       __wb_update_bandwidth(&gdtc, NULL, start_time, false);
+       __wb_update_bandwidth(&gdtc, NULL, false);
+}
+
+/* Interval after which we consider wb idle and don't estimate bandwidth */
+#define WB_BANDWIDTH_IDLE_JIF (HZ)
+
+static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
+{
+       unsigned long now = jiffies;
+       unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp);
+
+       if (elapsed > WB_BANDWIDTH_IDLE_JIF &&
+           !atomic_read(&wb->writeback_inodes)) {
+               spin_lock(&wb->list_lock);
+               wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
+               wb->written_stamp = wb_stat(wb, WB_WRITTEN);
+               WRITE_ONCE(wb->bw_time_stamp, now);
+               spin_unlock(&wb->list_lock);
+       }
 }
 
 /*
@@ -1407,7 +1418,7 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
 static unsigned long wb_max_pause(struct bdi_writeback *wb,
                                  unsigned long wb_dirty)
 {
-       unsigned long bw = wb->avg_write_bandwidth;
+       unsigned long bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long t;
 
        /*
@@ -1429,8 +1440,8 @@ static long wb_min_pause(struct bdi_writeback *wb,
                         unsigned long dirty_ratelimit,
                         int *nr_dirtied_pause)
 {
-       long hi = ilog2(wb->avg_write_bandwidth);
-       long lo = ilog2(wb->dirty_ratelimit);
+       long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth));
+       long lo = ilog2(READ_ONCE(wb->dirty_ratelimit));
        long t;         /* target pause */
        long pause;     /* estimated next pause */
        int pages;      /* target nr_dirtied_pause */
@@ -1710,15 +1721,12 @@ free_running:
                if (dirty_exceeded && !wb->dirty_exceeded)
                        wb->dirty_exceeded = 1;
 
-               if (time_is_before_jiffies(wb->bw_time_stamp +
-                                          BANDWIDTH_INTERVAL)) {
-                       spin_lock(&wb->list_lock);
-                       __wb_update_bandwidth(gdtc, mdtc, start_time, true);
-                       spin_unlock(&wb->list_lock);
-               }
+               if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+                                          BANDWIDTH_INTERVAL))
+                       __wb_update_bandwidth(gdtc, mdtc, true);
 
                /* throttle according to the chosen dtc */
-               dirty_ratelimit = wb->dirty_ratelimit;
+               dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
                task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                                                        RATELIMIT_CALC_SHIFT;
                max_pause = wb_max_pause(wb, sdtc->wb_dirty);
@@ -2345,9 +2353,12 @@ EXPORT_SYMBOL(generic_writepages);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
        int ret;
+       struct bdi_writeback *wb;
 
        if (wbc->nr_to_write <= 0)
                return 0;
+       wb = inode_to_wb_wbc(mapping->host, wbc);
+       wb_bandwidth_estimate_start(wb);
        while (1) {
                if (mapping->a_ops->writepages)
                        ret = mapping->a_ops->writepages(mapping, wbc);
@@ -2358,6 +2369,14 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
                cond_resched();
                congestion_wait(BLK_RW_ASYNC, HZ/50);
        }
+       /*
+        * Usually few pages are written by now from those we've just submitted
+        * but if there's constant writeback being submitted, this makes sure
+        * writeback bandwidth is updated once in a while.
+        */
+       if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
+                                  BANDWIDTH_INTERVAL))
+               wb_update_bandwidth(wb);
        return ret;
 }
 
@@ -2729,6 +2748,24 @@ int clear_page_dirty_for_io(struct page *page)
 }
 EXPORT_SYMBOL(clear_page_dirty_for_io);
 
+static void wb_inode_writeback_start(struct bdi_writeback *wb)
+{
+       atomic_inc(&wb->writeback_inodes);
+}
+
+static void wb_inode_writeback_end(struct bdi_writeback *wb)
+{
+       atomic_dec(&wb->writeback_inodes);
+       /*
+        * Make sure estimate of writeback throughput gets updated after
+        * writeback completed. We delay the update by BANDWIDTH_INTERVAL
+        * (which is the interval other bandwidth updates use for batching) so
+        * that if multiple inodes end writeback at a similar time, they get
+        * batched into one bandwidth update.
+        */
+       queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
+}
+
 int test_clear_page_writeback(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
@@ -2750,6 +2787,9 @@ int test_clear_page_writeback(struct page *page)
 
                                dec_wb_stat(wb, WB_WRITEBACK);
                                __wb_writeout_inc(wb);
+                               if (!mapping_tagged(mapping,
+                                                   PAGECACHE_TAG_WRITEBACK))
+                                       wb_inode_writeback_end(wb);
                        }
                }
 
@@ -2792,8 +2832,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
                                                   PAGECACHE_TAG_WRITEBACK);
 
                        xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
-                       if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
-                               inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
+                       if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
+                               struct bdi_writeback *wb = inode_to_wb(inode);
+
+                               inc_wb_stat(wb, WB_WRITEBACK);
+                               if (!on_wblist)
+                                       wb_inode_writeback_start(wb);
+                       }
 
                        /*
                         * We can come through here when swapping anonymous
index eeb3a9c..f95e1d2 100644 (file)
@@ -4211,7 +4211,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
                if (tsk_is_oom_victim(current) ||
                    (current->flags & (PF_MEMALLOC | PF_EXITING)))
                        filter &= ~SHOW_MEM_FILTER_NODES;
-       if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
+       if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                filter &= ~SHOW_MEM_FILTER_NODES;
 
        show_mem(filter, nodemask);
@@ -4549,14 +4549,14 @@ static bool __need_reclaim(gfp_t gfp_mask)
        return true;
 }
 
-void __fs_reclaim_acquire(void)
+void __fs_reclaim_acquire(unsigned long ip)
 {
-       lock_map_acquire(&__fs_reclaim_map);
+       lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip);
 }
 
-void __fs_reclaim_release(void)
+void __fs_reclaim_release(unsigned long ip)
 {
-       lock_map_release(&__fs_reclaim_map);
+       lock_release(&__fs_reclaim_map, ip);
 }
 
 void fs_reclaim_acquire(gfp_t gfp_mask)
@@ -4565,7 +4565,7 @@ void fs_reclaim_acquire(gfp_t gfp_mask)
 
        if (__need_reclaim(gfp_mask)) {
                if (gfp_mask & __GFP_FS)
-                       __fs_reclaim_acquire();
+                       __fs_reclaim_acquire(_RET_IP_);
 
 #ifdef CONFIG_MMU_NOTIFIER
                lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
@@ -4582,7 +4582,7 @@ void fs_reclaim_release(gfp_t gfp_mask)
 
        if (__need_reclaim(gfp_mask)) {
                if (gfp_mask & __GFP_FS)
-                       __fs_reclaim_release();
+                       __fs_reclaim_release(_RET_IP_);
        }
 }
 EXPORT_SYMBOL_GPL(fs_reclaim_release);
@@ -4697,7 +4697,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
                 * comment for __cpuset_node_allowed().
                 */
                alloc_flags &= ~ALLOC_CPUSET;
-       } else if (unlikely(rt_task(current)) && !in_interrupt())
+       } else if (unlikely(rt_task(current)) && in_task())
                alloc_flags |= ALLOC_HARDER;
 
        alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
@@ -5157,7 +5157,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
                 * When we are in the interrupt context, it is irrelevant
                 * to the current task context. It means that any node ok.
                 */
-               if (!in_interrupt() && !ac->nodemask)
+               if (in_task() && !ac->nodemask)
                        ac->nodemask = &cpuset_current_mems_allowed;
                else
                        *alloc_flags |= ALLOC_CPUSET;
@@ -5903,6 +5903,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                " unevictable:%lu dirty:%lu writeback:%lu\n"
                " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
                " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+               " kernel_misc_reclaimable:%lu\n"
                " free:%lu free_pcp:%lu free_cma:%lu\n",
                global_node_page_state(NR_ACTIVE_ANON),
                global_node_page_state(NR_INACTIVE_ANON),
@@ -5919,6 +5920,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                global_node_page_state(NR_SHMEM),
                global_node_page_state(NR_PAGETABLE),
                global_zone_page_state(NR_BOUNCE),
+               global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
                global_zone_page_state(NR_FREE_PAGES),
                free_pcp,
                global_zone_page_state(NR_FREE_CMA_PAGES));
@@ -6155,7 +6157,7 @@ static int node_load[MAX_NUMNODES];
  *
  * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
  */
-static int find_next_best_node(int node, nodemask_t *used_node_mask)
+int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
        int n, val;
        int min_val = INT_MAX;
@@ -6640,7 +6642,6 @@ static void __meminit zone_init_free_lists(struct zone *zone)
        }
 }
 
-#if !defined(CONFIG_FLATMEM)
 /*
  * Only struct pages that correspond to ranges defined by memblock.memory
  * are zeroed and initialized by going through __init_single_page() during
@@ -6685,13 +6686,6 @@ static void __init init_unavailable_range(unsigned long spfn,
                pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
                        node, zone_names[zone], pgcnt);
 }
-#else
-static inline void init_unavailable_range(unsigned long spfn,
-                                         unsigned long epfn,
-                                         int zone, int node)
-{
-}
-#endif
 
 static void __init memmap_init_zone_range(struct zone *zone,
                                          unsigned long start_pfn,
@@ -6721,7 +6715,7 @@ static void __init memmap_init(void)
 {
        unsigned long start_pfn, end_pfn;
        unsigned long hole_pfn = 0;
-       int i, j, zone_id, nid;
+       int i, j, zone_id = 0, nid;
 
        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
                struct pglist_data *node = NODE_DATA(nid);
@@ -6754,6 +6748,26 @@ static void __init memmap_init(void)
                init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
 }
 
+void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
+                         phys_addr_t min_addr, int nid, bool exact_nid)
+{
+       void *ptr;
+
+       if (exact_nid)
+               ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
+                                                  MEMBLOCK_ALLOC_ACCESSIBLE,
+                                                  nid);
+       else
+               ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
+                                                MEMBLOCK_ALLOC_ACCESSIBLE,
+                                                nid);
+
+       if (ptr && size > 0)
+               page_init_poison(ptr, size);
+
+       return ptr;
+}
+
 static int zone_batchsize(struct zone *zone)
 {
 #ifdef CONFIG_MMU
@@ -7501,7 +7515,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 }
 
 #ifdef CONFIG_FLATMEM
-static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
        unsigned long __maybe_unused start = 0;
        unsigned long __maybe_unused offset = 0;
@@ -7525,8 +7539,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
                end = pgdat_end_pfn(pgdat);
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size =  (end - start) * sizeof(struct page);
-               map = memblock_alloc_node(size, SMP_CACHE_BYTES,
-                                         pgdat->node_id);
+               map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
+                                  pgdat->node_id, false);
                if (!map)
                        panic("Failed to allocate %ld bytes for node %d memory map\n",
                              size, pgdat->node_id);
@@ -7547,7 +7561,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
 #endif
 }
 #else
-static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
+static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
 #endif /* CONFIG_FLATMEM */
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -8976,7 +8990,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
                cc->nr_migratepages -= nr_reclaimed;
 
                ret = migrate_pages(&cc->migratepages, alloc_migration_target,
-                               NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
+                       NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
 
                /*
                 * On -ENOMEM, migrate_pages() bails out right away. It is pointless
index bddf788..fff55bb 100644 (file)
@@ -287,6 +287,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
        unsigned long pfn, flags;
        struct page *page;
        struct zone *zone;
+       int ret;
 
        /*
         * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
@@ -299,15 +300,21 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
                        break;
        }
        page = __first_valid_page(start_pfn, end_pfn - start_pfn);
-       if ((pfn < end_pfn) || !page)
-               return -EBUSY;
+       if ((pfn < end_pfn) || !page) {
+               ret = -EBUSY;
+               goto out;
+       }
+
        /* Check all pages are free or marked as ISOLATED */
        zone = page_zone(page);
        spin_lock_irqsave(&zone->lock, flags);
        pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
        spin_unlock_irqrestore(&zone->lock, flags);
 
+       ret = pfn < end_pfn ? -EBUSY : 0;
+
+out:
        trace_test_pages_isolated(start_pfn, end_pfn, pfn);
 
-       return pfn < end_pfn ? -EBUSY : 0;
+       return ret;
 }
index 7f2e015..e1c2083 100644 (file)
@@ -1520,9 +1520,6 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
  * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
  * the bookkeeping information accordingly.  Must be called after each
  * successful population.
- *
- * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
- * is to serve an allocation in that area.
  */
 static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
                                 int page_end)
index 3107ace..8874295 100644 (file)
@@ -38,8 +38,7 @@
 #include <linux/hugetlb.h>
 #include <linux/frontswap.h>
 #include <linux/fs_parser.h>
-
-#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+#include <linux/swapfile.h>
 
 static struct vfsmount *shm_mnt;
 
@@ -137,9 +136,6 @@ static unsigned long shmem_default_max_inodes(void)
 }
 #endif
 
-static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
-static int shmem_replace_page(struct page **pagep, gfp_t gfp,
-                               struct shmem_inode_info *info, pgoff_t index);
 static int shmem_swapin_page(struct inode *inode, pgoff_t index,
                             struct page **pagep, enum sgp_type sgp,
                             gfp_t gfp, struct vm_area_struct *vma,
@@ -278,10 +274,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
        ino_t ino;
 
        if (!(sb->s_flags & SB_KERNMOUNT)) {
-               spin_lock(&sbinfo->stat_lock);
+               raw_spin_lock(&sbinfo->stat_lock);
                if (sbinfo->max_inodes) {
                        if (!sbinfo->free_inodes) {
-                               spin_unlock(&sbinfo->stat_lock);
+                               raw_spin_unlock(&sbinfo->stat_lock);
                                return -ENOSPC;
                        }
                        sbinfo->free_inodes--;
@@ -304,7 +300,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
                        }
                        *inop = ino;
                }
-               spin_unlock(&sbinfo->stat_lock);
+               raw_spin_unlock(&sbinfo->stat_lock);
        } else if (inop) {
                /*
                 * __shmem_file_setup, one of our callers, is lock-free: it
@@ -319,13 +315,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
                 * to worry about things like glibc compatibility.
                 */
                ino_t *next_ino;
+
                next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
                ino = *next_ino;
                if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
-                       spin_lock(&sbinfo->stat_lock);
+                       raw_spin_lock(&sbinfo->stat_lock);
                        ino = sbinfo->next_ino;
                        sbinfo->next_ino += SHMEM_INO_BATCH;
-                       spin_unlock(&sbinfo->stat_lock);
+                       raw_spin_unlock(&sbinfo->stat_lock);
                        if (unlikely(is_zero_ino(ino)))
                                ino++;
                }
@@ -341,9 +338,9 @@ static void shmem_free_inode(struct super_block *sb)
 {
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        if (sbinfo->max_inodes) {
-               spin_lock(&sbinfo->stat_lock);
+               raw_spin_lock(&sbinfo->stat_lock);
                sbinfo->free_inodes++;
-               spin_unlock(&sbinfo->stat_lock);
+               raw_spin_unlock(&sbinfo->stat_lock);
        }
 }
 
@@ -474,7 +471,38 @@ static bool shmem_confirm_swap(struct address_space *mapping,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* ifdef here to avoid bloating shmem.o when not necessary */
 
-static int shmem_huge __read_mostly;
+static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
+
+bool shmem_is_huge(struct vm_area_struct *vma,
+                  struct inode *inode, pgoff_t index)
+{
+       loff_t i_size;
+
+       if (shmem_huge == SHMEM_HUGE_DENY)
+               return false;
+       if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
+           test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
+               return false;
+       if (shmem_huge == SHMEM_HUGE_FORCE)
+               return true;
+
+       switch (SHMEM_SB(inode->i_sb)->huge) {
+       case SHMEM_HUGE_ALWAYS:
+               return true;
+       case SHMEM_HUGE_WITHIN_SIZE:
+               index = round_up(index, HPAGE_PMD_NR);
+               i_size = round_up(i_size_read(inode), PAGE_SIZE);
+               if (i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= index)
+                       return true;
+               fallthrough;
+       case SHMEM_HUGE_ADVISE:
+               if (vma && (vma->vm_flags & VM_HUGEPAGE))
+                       return true;
+               fallthrough;
+       default:
+               return false;
+       }
+}
 
 #if defined(CONFIG_SYSFS)
 static int shmem_parse_huge(const char *str)
@@ -645,6 +673,12 @@ static long shmem_unused_huge_count(struct super_block *sb,
 
 #define shmem_huge SHMEM_HUGE_DENY
 
+bool shmem_is_huge(struct vm_area_struct *vma,
+                  struct inode *inode, pgoff_t index)
+{
+       return false;
+}
+
 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
                struct shrink_control *sc, unsigned long nr_to_split)
 {
@@ -652,15 +686,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
-{
-       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
-           (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
-           shmem_huge != SHMEM_HUGE_DENY)
-               return true;
-       return false;
-}
-
 /*
  * Like add_to_page_cache_locked, but error if expected item has gone.
  */
@@ -905,6 +930,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
        if (lend == -1)
                end = -1;       /* unsigned, so actually very big */
 
+       if (info->fallocend > start && info->fallocend <= end && !unfalloc)
+               info->fallocend = start;
+
        pagevec_init(&pvec);
        index = start;
        while (index < end && find_lock_entries(mapping, index, end - 1,
@@ -1038,7 +1066,6 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
 {
        struct inode *inode = path->dentry->d_inode;
        struct shmem_inode_info *info = SHMEM_I(inode);
-       struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
 
        if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
                spin_lock_irq(&info->lock);
@@ -1047,7 +1074,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
        }
        generic_fillattr(&init_user_ns, inode, stat);
 
-       if (is_huge_enabled(sb_info))
+       if (shmem_is_huge(NULL, inode, 0))
                stat->blksize = HPAGE_PMD_SIZE;
 
        return 0;
@@ -1058,7 +1085,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
 {
        struct inode *inode = d_inode(dentry);
        struct shmem_inode_info *info = SHMEM_I(inode);
-       struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        int error;
 
        error = setattr_prepare(&init_user_ns, dentry, attr);
@@ -1094,24 +1120,6 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
                        if (oldsize > holebegin)
                                unmap_mapping_range(inode->i_mapping,
                                                        holebegin, 0, 1);
-
-                       /*
-                        * Part of the huge page can be beyond i_size: subject
-                        * to shrink under memory pressure.
-                        */
-                       if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
-                               spin_lock(&sbinfo->shrinklist_lock);
-                               /*
-                                * _careful to defend against unlocked access to
-                                * ->shrink_list in shmem_unused_huge_shrink()
-                                */
-                               if (list_empty_careful(&info->shrinklist)) {
-                                       list_add_tail(&info->shrinklist,
-                                                       &sbinfo->shrinklist);
-                                       sbinfo->shrinklist_len++;
-                               }
-                               spin_unlock(&sbinfo->shrinklist_lock);
-                       }
                }
        }
 
@@ -1156,8 +1164,6 @@ static void shmem_evict_inode(struct inode *inode)
        clear_inode(inode);
 }
 
-extern struct swap_info_struct *swap_info[];
-
 static int shmem_find_swap_entries(struct address_space *mapping,
                                   pgoff_t start, unsigned int nr_entries,
                                   struct page **entries, pgoff_t *indices,
@@ -1338,7 +1344,19 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
        swp_entry_t swap;
        pgoff_t index;
 
-       VM_BUG_ON_PAGE(PageCompound(page), page);
+       /*
+        * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
+        * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
+        * and its shmem_writeback() needs them to be split when swapping.
+        */
+       if (PageTransCompound(page)) {
+               /* Ensure the subpages are still dirty */
+               SetPageDirty(page);
+               if (split_huge_page(page) < 0)
+                       goto redirty;
+               ClearPageDirty(page);
+       }
+
        BUG_ON(!PageLocked(page));
        mapping = page->mapping;
        index = page->index;
@@ -1453,10 +1471,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 {
        struct mempolicy *mpol = NULL;
        if (sbinfo->mpol) {
-               spin_lock(&sbinfo->stat_lock);  /* prevent replace/use races */
+               raw_spin_lock(&sbinfo->stat_lock);      /* prevent replace/use races */
                mpol = sbinfo->mpol;
                mpol_get(mpol);
-               spin_unlock(&sbinfo->stat_lock);
+               raw_spin_unlock(&sbinfo->stat_lock);
        }
        return mpol;
 }
@@ -1798,7 +1816,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
        struct shmem_sb_info *sbinfo;
        struct mm_struct *charge_mm;
        struct page *page;
-       enum sgp_type sgp_huge = sgp;
        pgoff_t hindex = index;
        gfp_t huge_gfp;
        int error;
@@ -1807,8 +1824,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 
        if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                return -EFBIG;
-       if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
-               sgp = SGP_CACHE;
 repeat:
        if (sgp <= SGP_CACHE &&
            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
@@ -1840,26 +1855,31 @@ repeat:
                return error;
        }
 
-       if (page)
+       if (page) {
                hindex = page->index;
-       if (page && sgp == SGP_WRITE)
-               mark_page_accessed(page);
-
-       /* fallocated page? */
-       if (page && !PageUptodate(page)) {
+               if (sgp == SGP_WRITE)
+                       mark_page_accessed(page);
+               if (PageUptodate(page))
+                       goto out;
+               /* fallocated page */
                if (sgp != SGP_READ)
                        goto clear;
                unlock_page(page);
                put_page(page);
-               page = NULL;
-               hindex = index;
        }
-       if (page || sgp == SGP_READ)
-               goto out;
 
        /*
-        * Fast cache lookup did not find it:
-        * bring it back from swap or allocate.
+        * SGP_READ: succeed on hole, with NULL page, letting caller zero.
+        * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail.
+        */
+       *pagep = NULL;
+       if (sgp == SGP_READ)
+               return 0;
+       if (sgp == SGP_NOALLOC)
+               return -ENOENT;
+
+       /*
+        * Fast cache lookup and swap lookup did not find it: allocate.
         */
 
        if (vma && userfaultfd_missing(vma)) {
@@ -1867,36 +1887,12 @@ repeat:
                return 0;
        }
 
-       /* shmem_symlink() */
-       if (!shmem_mapping(mapping))
-               goto alloc_nohuge;
-       if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
-               goto alloc_nohuge;
-       if (shmem_huge == SHMEM_HUGE_FORCE)
-               goto alloc_huge;
-       switch (sbinfo->huge) {
-       case SHMEM_HUGE_NEVER:
+       /* Never use a huge page for shmem_symlink() */
+       if (S_ISLNK(inode->i_mode))
                goto alloc_nohuge;
-       case SHMEM_HUGE_WITHIN_SIZE: {
-               loff_t i_size;
-               pgoff_t off;
-
-               off = round_up(index, HPAGE_PMD_NR);
-               i_size = round_up(i_size_read(inode), PAGE_SIZE);
-               if (i_size >= HPAGE_PMD_SIZE &&
-                   i_size >> PAGE_SHIFT >= off)
-                       goto alloc_huge;
-
-               fallthrough;
-       }
-       case SHMEM_HUGE_ADVISE:
-               if (sgp_huge == SGP_HUGE)
-                       goto alloc_huge;
-               /* TODO: implement fadvise() hints */
+       if (!shmem_is_huge(vma, inode, index))
                goto alloc_nohuge;
-       }
 
-alloc_huge:
        huge_gfp = vma_thp_gfp_mask(vma);
        huge_gfp = limit_gfp_mask(huge_gfp, gfp);
        page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
@@ -2052,7 +2048,6 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
        struct vm_area_struct *vma = vmf->vma;
        struct inode *inode = file_inode(vma->vm_file);
        gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
-       enum sgp_type sgp;
        int err;
        vm_fault_t ret = VM_FAULT_LOCKED;
 
@@ -2115,15 +2110,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
                spin_unlock(&inode->i_lock);
        }
 
-       sgp = SGP_CACHE;
-
-       if ((vma->vm_flags & VM_NOHUGEPAGE) ||
-           test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
-               sgp = SGP_NOHUGE;
-       else if (vma->vm_flags & VM_HUGEPAGE)
-               sgp = SGP_HUGE;
-
-       err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
+       err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
                                  gfp, vma, vmf, &ret);
        if (err)
                return vmf_error(err);
@@ -2655,7 +2642,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_falloc shmem_falloc;
-       pgoff_t start, index, end;
+       pgoff_t start, index, end, undo_fallocend;
        int error;
 
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2724,7 +2711,16 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
        inode->i_private = &shmem_falloc;
        spin_unlock(&inode->i_lock);
 
-       for (index = start; index < end; index++) {
+       /*
+        * info->fallocend is only relevant when huge pages might be
+        * involved: to prevent split_huge_page() freeing fallocated
+        * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
+        */
+       undo_fallocend = info->fallocend;
+       if (info->fallocend < end)
+               info->fallocend = end;
+
+       for (index = start; index < end; ) {
                struct page *page;
 
                /*
@@ -2738,6 +2734,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                else
                        error = shmem_getpage(inode, index, &page, SGP_FALLOC);
                if (error) {
+                       info->fallocend = undo_fallocend;
                        /* Remove the !PageUptodate pages we added */
                        if (index > start) {
                                shmem_undo_range(inode,
@@ -2747,13 +2744,26 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                        goto undone;
                }
 
+               index++;
+               /*
+                * Here is a more important optimization than it appears:
+                * a second SGP_FALLOC on the same huge page will clear it,
+                * making it PageUptodate and un-undoable if we fail later.
+                */
+               if (PageTransCompound(page)) {
+                       index = round_up(index, HPAGE_PMD_NR);
+                       /* Beware 32-bit wraparound */
+                       if (!index)
+                               index--;
+               }
+
                /*
                 * Inform shmem_writepage() how far we have reached.
                 * No need for lock or barrier: we have the page lock.
                 */
-               shmem_falloc.next++;
                if (!PageUptodate(page))
-                       shmem_falloc.nr_falloced++;
+                       shmem_falloc.nr_falloced += index - shmem_falloc.next;
+               shmem_falloc.next = index;
 
                /*
                 * If !PageUptodate, leave it that way so that freeable pages
@@ -3488,9 +3498,10 @@ static int shmem_reconfigure(struct fs_context *fc)
        struct shmem_options *ctx = fc->fs_private;
        struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
        unsigned long inodes;
+       struct mempolicy *mpol = NULL;
        const char *err;
 
-       spin_lock(&sbinfo->stat_lock);
+       raw_spin_lock(&sbinfo->stat_lock);
        inodes = sbinfo->max_inodes - sbinfo->free_inodes;
        if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
                if (!sbinfo->max_blocks) {
@@ -3535,14 +3546,15 @@ static int shmem_reconfigure(struct fs_context *fc)
         * Preserve previous mempolicy unless mpol remount option was specified.
         */
        if (ctx->mpol) {
-               mpol_put(sbinfo->mpol);
+               mpol = sbinfo->mpol;
                sbinfo->mpol = ctx->mpol;       /* transfers initial ref */
                ctx->mpol = NULL;
        }
-       spin_unlock(&sbinfo->stat_lock);
+       raw_spin_unlock(&sbinfo->stat_lock);
+       mpol_put(mpol);
        return 0;
 out:
-       spin_unlock(&sbinfo->stat_lock);
+       raw_spin_unlock(&sbinfo->stat_lock);
        return invalfc(fc, "%s", err);
 }
 
@@ -3613,7 +3625,6 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
        struct shmem_options *ctx = fc->fs_private;
        struct inode *inode;
        struct shmem_sb_info *sbinfo;
-       int err = -ENOMEM;
 
        /* Round up to L1_CACHE_BYTES to resist false sharing */
        sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
@@ -3659,7 +3670,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
        sbinfo->mpol = ctx->mpol;
        ctx->mpol = NULL;
 
-       spin_lock_init(&sbinfo->stat_lock);
+       raw_spin_lock_init(&sbinfo->stat_lock);
        if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
                goto failed;
        spin_lock_init(&sbinfo->shrinklist_lock);
@@ -3691,7 +3702,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 
 failed:
        shmem_put_super(sb);
-       return err;
+       return -ENOMEM;
 }
 
 static int shmem_get_tree(struct fs_context *fc)
@@ -3907,7 +3918,7 @@ int __init shmem_init(void)
        if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
        else
-               shmem_huge = 0; /* just in case it was patched */
+               shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
 #endif
        return 0;
 
@@ -3976,42 +3987,6 @@ struct kobj_attribute shmem_enabled_attr =
        __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-bool shmem_huge_enabled(struct vm_area_struct *vma)
-{
-       struct inode *inode = file_inode(vma->vm_file);
-       struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
-       loff_t i_size;
-       pgoff_t off;
-
-       if (!transhuge_vma_enabled(vma, vma->vm_flags))
-               return false;
-       if (shmem_huge == SHMEM_HUGE_FORCE)
-               return true;
-       if (shmem_huge == SHMEM_HUGE_DENY)
-               return false;
-       switch (sbinfo->huge) {
-               case SHMEM_HUGE_NEVER:
-                       return false;
-               case SHMEM_HUGE_ALWAYS:
-                       return true;
-               case SHMEM_HUGE_WITHIN_SIZE:
-                       off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
-                       i_size = round_up(i_size_read(inode), PAGE_SIZE);
-                       if (i_size >= HPAGE_PMD_SIZE &&
-                                       i_size >> PAGE_SHIFT >= off)
-                               return true;
-                       fallthrough;
-               case SHMEM_HUGE_ADVISE:
-                       /* TODO: implement fadvise() hints */
-                       return (vma->vm_flags & VM_HUGEPAGE);
-               default:
-                       VM_BUG_ON(1);
-                       return false;
-       }
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
 #else /* !CONFIG_SHMEM */
 
 /*
index 6326cdf..120bc8e 100644 (file)
@@ -109,32 +109,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
 }
 #endif
 
-#ifdef CONFIG_SPARSEMEM_EXTREME
-unsigned long __section_nr(struct mem_section *ms)
-{
-       unsigned long root_nr;
-       struct mem_section *root = NULL;
-
-       for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
-               root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
-               if (!root)
-                       continue;
-
-               if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
-                    break;
-       }
-
-       VM_BUG_ON(!root);
-
-       return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
-}
-#else
-unsigned long __section_nr(struct mem_section *ms)
-{
-       return (unsigned long)(ms - mem_section[0]);
-}
-#endif
-
 /*
  * During early boot, before section_mem_map is used for an actual
  * mem_map, we use section_mem_map to store the section's NUMA
@@ -143,7 +117,7 @@ unsigned long __section_nr(struct mem_section *ms)
  */
 static inline unsigned long sparse_encode_early_nid(int nid)
 {
-       return (nid << SECTION_NID_SHIFT);
+       return ((unsigned long)nid << SECTION_NID_SHIFT);
 }
 
 static inline int sparse_early_nid(struct mem_section *section)
@@ -187,10 +161,9 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
  * those loops early.
  */
 unsigned long __highest_present_section_nr;
-static void section_mark_present(struct mem_section *ms)
+static void __section_mark_present(struct mem_section *ms,
+               unsigned long section_nr)
 {
-       unsigned long section_nr = __section_nr(ms);
-
        if (section_nr > __highest_present_section_nr)
                __highest_present_section_nr = section_nr;
 
@@ -280,7 +253,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
                if (!ms->section_mem_map) {
                        ms->section_mem_map = sparse_encode_early_nid(nid) |
                                                        SECTION_IS_ONLINE;
-                       section_mark_present(ms);
+                       __section_mark_present(ms, section);
                }
        }
 }
@@ -348,7 +321,8 @@ size_t mem_section_usage_size(void)
 static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
 {
 #ifndef CONFIG_NUMA
-       return __pa_symbol(pgdat);
+       VM_BUG_ON(pgdat != &contig_page_data);
+       return __pa_symbol(&contig_page_data);
 #else
        return __pa(pgdat);
 #endif
@@ -462,8 +436,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
        if (map)
                return map;
 
-       map = memblock_alloc_try_nid_raw(size, size, addr,
-                                         MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+       map = memmap_alloc(size, size, addr, nid, false);
        if (!map)
                panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
                      __func__, size, PAGE_SIZE, nid, &addr);
@@ -490,8 +463,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid)
         * and we want it to be properly aligned to the section size - this is
         * especially the case for VMEMMAP which maps memmap to PMDs
         */
-       sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
-                                       addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+       sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
        sparsemap_buf_end = sparsemap_buf + size;
 }
 
@@ -934,7 +906,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
 
        ms = __nr_to_section(section_nr);
        set_section_nid(section_nr, nid);
-       section_mark_present(ms);
+       __section_mark_present(ms, section_nr);
 
        /* Align memmap to section boundary in the subsection case */
        if (section_nr_to_pfn(section_nr) != start_pfn)
index 1960043..897200d 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -179,28 +179,6 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
 }
 EXPORT_SYMBOL_GPL(get_kernel_pages);
 
-/*
- * get_kernel_page() - pin a kernel page in memory
- * @start:     starting kernel address
- * @write:     pinning for read/write, currently ignored
- * @pages:     array that receives pointer to the page pinned.
- *             Must be at least nr_segs long.
- *
- * Returns 1 if page is pinned. If the page was not pinned, returns
- * -errno. The page returned must be released with a put_page() call
- * when it is finished with.
- */
-int get_kernel_page(unsigned long start, int write, struct page **pages)
-{
-       const struct kvec kiov = {
-               .iov_base = (void *)start,
-               .iov_len = PAGE_SIZE
-       };
-
-       return get_kernel_pages(&kiov, 1, write, pages);
-}
-EXPORT_SYMBOL_GPL(get_kernel_page);
-
 static void pagevec_lru_move_fn(struct pagevec *pvec,
        void (*move_fn)(struct page *page, struct lruvec *lruvec))
 {
index 1e07d1c..22d10f7 100644 (file)
@@ -3130,6 +3130,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        struct filename *name;
        struct file *swap_file = NULL;
        struct address_space *mapping;
+       struct dentry *dentry;
        int prio;
        int error;
        union swap_header *swap_header;
@@ -3173,6 +3174,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
        p->swap_file = swap_file;
        mapping = swap_file->f_mapping;
+       dentry = swap_file->f_path.dentry;
        inode = mapping->host;
 
        error = claim_swapfile(p, inode);
@@ -3180,6 +3182,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                goto bad_swap;
 
        inode_lock(inode);
+       if (d_unlinked(dentry) || cant_mount(dentry)) {
+               error = -ENOENT;
+               goto bad_swap_unlock_inode;
+       }
        if (IS_SWAPFILE(inode)) {
                error = -EBUSY;
                goto bad_swap_unlock_inode;
@@ -3773,7 +3779,7 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
 }
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
 {
        struct swap_info_struct *si, *next;
        int nid = page_to_nid(page);
index 44ad5e5..714eaf1 100644 (file)
@@ -484,8 +484,9 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
                        index = indices[i];
 
                        if (xa_is_value(page)) {
-                               invalidate_exceptional_entry(mapping, index,
-                                                            page);
+                               count += invalidate_exceptional_entry(mapping,
+                                                                     index,
+                                                                     page);
                                continue;
                        }
                        index += thp_nr_pages(page) - 1;
@@ -513,19 +514,18 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
 }
 
 /**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
+ * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
+ * @mapping: the address_space which holds the cache to invalidate
  * @start: the offset 'from' which to invalidate
  * @end: the offset 'to' which to invalidate (inclusive)
  *
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
+ * This function removes pages that are clean, unmapped and unlocked,
+ * as well as shadow entries. It will not block on IO activity.
  *
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
+ * If you want to remove all the pages of one inode, regardless of
+ * their use and writeback state, use truncate_inode_pages().
  *
- * Return: the number of the pages that were invalidated
+ * Return: the number of the cache entries that were invalidated
  */
 unsigned long invalidate_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t end)
@@ -561,21 +561,19 @@ void invalidate_mapping_pagevec(struct address_space *mapping,
 static int
 invalidate_complete_page2(struct address_space *mapping, struct page *page)
 {
-       unsigned long flags;
-
        if (page->mapping != mapping)
                return 0;
 
        if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                return 0;
 
-       xa_lock_irqsave(&mapping->i_pages, flags);
+       xa_lock_irq(&mapping->i_pages);
        if (PageDirty(page))
                goto failed;
 
        BUG_ON(page_has_private(page));
        __delete_from_page_cache(page, NULL);
-       xa_unlock_irqrestore(&mapping->i_pages, flags);
+       xa_unlock_irq(&mapping->i_pages);
 
        if (mapping->a_ops->freepage)
                mapping->a_ops->freepage(page);
@@ -583,7 +581,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
        put_page(page); /* pagecache ref */
        return 1;
 failed:
-       xa_unlock_irqrestore(&mapping->i_pages, flags);
+       xa_unlock_irq(&mapping->i_pages);
        return 0;
 }
 
index 0e21328..7a90084 100644 (file)
@@ -483,7 +483,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                              unsigned long src_start,
                                              unsigned long len,
                                              enum mcopy_atomic_mode mcopy_mode,
-                                             bool *mmap_changing,
+                                             atomic_t *mmap_changing,
                                              __u64 mode)
 {
        struct vm_area_struct *dst_vma;
@@ -517,7 +517,7 @@ retry:
         * request the user to retry later
         */
        err = -EAGAIN;
-       if (mmap_changing && READ_ONCE(*mmap_changing))
+       if (mmap_changing && atomic_read(mmap_changing))
                goto out_unlock;
 
        /*
@@ -650,28 +650,29 @@ out:
 
 ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                     unsigned long src_start, unsigned long len,
-                    bool *mmap_changing, __u64 mode)
+                    atomic_t *mmap_changing, __u64 mode)
 {
        return __mcopy_atomic(dst_mm, dst_start, src_start, len,
                              MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
 }
 
 ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
-                      unsigned long len, bool *mmap_changing)
+                      unsigned long len, atomic_t *mmap_changing)
 {
        return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
                              mmap_changing, 0);
 }
 
 ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
-                      unsigned long len, bool *mmap_changing)
+                      unsigned long len, atomic_t *mmap_changing)
 {
        return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
                              mmap_changing, 0);
 }
 
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
-                       unsigned long len, bool enable_wp, bool *mmap_changing)
+                       unsigned long len, bool enable_wp,
+                       atomic_t *mmap_changing)
 {
        struct vm_area_struct *dst_vma;
        pgprot_t newprot;
@@ -694,7 +695,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
         * request the user to retry later
         */
        err = -EAGAIN;
-       if (mmap_changing && READ_ONCE(*mmap_changing))
+       if (mmap_changing && atomic_read(mmap_changing))
                goto out_unlock;
 
        err = -ENOENT;
index d5cd528..3824dc1 100644 (file)
@@ -787,6 +787,28 @@ unsigned long vmalloc_nr_pages(void)
        return atomic_long_read(&nr_vmalloc_pages);
 }
 
+/*
+ * Search the tree rooted at vmap_area_root for an area whose end lies
+ * above @addr.
+ *
+ * Returns the area containing @addr (va_start <= addr < va_end) if the
+ * descent reaches one; otherwise the last candidate seen with
+ * va_end > addr, or NULL when no visited node ends above @addr.
+ *
+ * NOTE(review): presumably the tree is ordered by address, so the
+ * fallback candidate is the lowest area above @addr -- confirm against
+ * the tree insertion code.
+ */
+static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
+{
+       struct vmap_area *va = NULL;
+       struct rb_node *n = vmap_area_root.rb_node;
+
+       while (n) {
+               struct vmap_area *tmp;
+
+               tmp = rb_entry(n, struct vmap_area, rb_node);
+               if (tmp->va_end > addr) {
+                       /* Candidate: remember it, then look for a better one */
+                       va = tmp;
+                       /* @addr falls inside this area: no need to go further */
+                       if (tmp->va_start <= addr)
+                               break;
+
+                       n = n->rb_left;
+               } else
+                       /* This area ends at or below @addr: try the right subtree */
+                       n = n->rb_right;
+       }
+
+       return va;
+}
+
 static struct vmap_area *__find_vmap_area(unsigned long addr)
 {
        struct rb_node *n = vmap_area_root.rb_node;
@@ -1479,6 +1501,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
                                int node, gfp_t gfp_mask)
 {
        struct vmap_area *va;
+       unsigned long freed;
        unsigned long addr;
        int purged = 0;
        int ret;
@@ -1542,13 +1565,12 @@ overflow:
                goto retry;
        }
 
-       if (gfpflags_allow_blocking(gfp_mask)) {
-               unsigned long freed = 0;
-               blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
-               if (freed > 0) {
-                       purged = 0;
-                       goto retry;
-               }
+       freed = 0;
+       blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
+
+       if (freed > 0) {
+               purged = 0;
+               goto retry;
        }
 
        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
@@ -2779,7 +2801,7 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
 
 static inline unsigned int
 vm_area_alloc_pages(gfp_t gfp, int nid,
-               unsigned int order, unsigned long nr_pages, struct page **pages)
+               unsigned int order, unsigned int nr_pages, struct page **pages)
 {
        unsigned int nr_allocated = 0;
 
@@ -2789,10 +2811,32 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
         * to fails, fallback to a single page allocator that is
         * more permissive.
         */
-       if (!order)
-               nr_allocated = alloc_pages_bulk_array_node(
-                       gfp, nid, nr_pages, pages);
-       else
+       if (!order) {
+               while (nr_allocated < nr_pages) {
+                       unsigned int nr, nr_pages_request;
+
+                       /*
+                        * The size of a single request is hard-coded to at
+                        * most 100 pages. This prevents the bulk allocator
+                        * from running with preemption off for too long, so
+                        * each request is in the range [1:100].
+                        */
+                       nr_pages_request = min(100U, nr_pages - nr_allocated);
+
+                       nr = alloc_pages_bulk_array_node(gfp, nid,
+                               nr_pages_request, pages + nr_allocated);
+
+                       nr_allocated += nr;
+                       cond_resched();
+
+                       /*
+                        * If no pages, or only part of the request, were
+                        * obtained, fall back to the single page allocator.
+                        */
+                       if (nr != nr_pages_request)
+                               break;
+               }
+       } else
                /*
                 * Compound pages required for remap_vmalloc_page if
                 * high-order pages.
@@ -2816,9 +2860,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
                for (i = 0; i < (1U << order); i++)
                        pages[nr_allocated + i] = page + i;
 
-               if (gfpflags_allow_blocking(gfp))
-                       cond_resched();
-
+               cond_resched();
                nr_allocated += 1U << order;
        }
 
@@ -3267,9 +3309,14 @@ long vread(char *buf, char *addr, unsigned long count)
                count = -(unsigned long) addr;
 
        spin_lock(&vmap_area_lock);
-       va = __find_vmap_area((unsigned long)addr);
+       va = find_vmap_area_exceed_addr((unsigned long)addr);
        if (!va)
                goto finished;
+
+       /* The requested range does not intersect any live vmap_area */
+       if ((unsigned long)addr + count <= va->va_start)
+               goto finished;
+
        list_for_each_entry_from(va, &vmap_area_list, list) {
                if (!count)
                        break;
index d69019f..76518e4 100644 (file)
@@ -74,8 +74,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work)
 
 static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
 {
-       struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
 
        memcg = parent_mem_cgroup(memcg);
        if (!memcg)
@@ -240,7 +239,12 @@ static void vmpressure_work_fn(struct work_struct *work)
 void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
                unsigned long scanned, unsigned long reclaimed)
 {
-       struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+       struct vmpressure *vmpr;
+
+       if (mem_cgroup_disabled())
+               return;
+
+       vmpr = memcg_to_vmpressure(memcg);
 
        /*
         * Here we only want to account pressure that userland is able to
index eeae2f6..740d03e 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/migrate.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
 #include <linux/oom.h>
@@ -121,6 +122,9 @@ struct scan_control {
        /* The file pages on the current node are dangerously low */
        unsigned int file_is_tiny:1;
 
+       /* Always discard instead of demoting to lower tier memory */
+       unsigned int no_demotion:1;
+
        /* Allocation order */
        s8 order;
 
@@ -518,6 +522,48 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
        return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
 }
 
+/*
+ * Decide whether pages on node @nid may be demoted rather than discarded.
+ *
+ * @nid: node the pages currently reside on.
+ * @sc:  reclaim context; may be NULL, in which case the per-scan checks
+ *       are skipped.
+ *
+ * Returns false when numa_demotion_enabled is off, when @sc forbids
+ * demotion (sc->no_demotion) or describes cgroup reclaim, or when
+ * next_demotion_node() reports NUMA_NO_NODE for @nid. Returns true
+ * otherwise.
+ */
+static bool can_demote(int nid, struct scan_control *sc)
+{
+       if (!numa_demotion_enabled)
+               return false;
+       if (sc) {
+               if (sc->no_demotion)
+                       return false;
+               /* It is pointless to do demotion in memcg reclaim */
+               if (cgroup_reclaim(sc))
+                       return false;
+       }
+       /* No demotion target is available for this node */
+       if (next_demotion_node(nid) == NUMA_NO_NODE)
+               return false;
+
+       return true;
+}
+
+/*
+ * Report whether anonymous pages on node @nid can be reclaimed at all.
+ *
+ * @memcg: memory cgroup whose swap availability is checked, or NULL to
+ *         check the global swap count instead.
+ * @nid:   node the pages reside on; only used for the demotion fallback.
+ * @sc:    reclaim context passed through to can_demote(); may be NULL.
+ *
+ * Returns true if swap space is available (get_nr_swap_pages() for a NULL
+ * @memcg, mem_cgroup_get_nr_swap_pages() otherwise); failing that, returns
+ * the result of can_demote(@nid, @sc).
+ */
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+                                         int nid,
+                                         struct scan_control *sc)
+{
+       if (memcg == NULL) {
+               /*
+                * For non-memcg reclaim, is there
+                * space in any swap device?
+                */
+               if (get_nr_swap_pages() > 0)
+                       return true;
+       } else {
+               /* Is the memcg below its swap limit? */
+               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+                       return true;
+       }
+
+       /*
+        * The page can not be swapped.
+        *
+        * Can it be reclaimed from this node via demotion?
+        */
+       return can_demote(nid, sc);
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -529,7 +575,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 
        nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
                zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
                nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
                        zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
 
@@ -893,6 +939,7 @@ out:
 void drop_slab_node(int nid)
 {
        unsigned long freed;
+       int shift = 0;
 
        do {
                struct mem_cgroup *memcg = NULL;
@@ -905,7 +952,7 @@ void drop_slab_node(int nid)
                do {
                        freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
                } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
-       } while (freed > 10);
+       } while ((freed >> shift++) > 1);
 }
 
 void drop_slab(void)
@@ -1052,14 +1099,13 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 static int __remove_mapping(struct address_space *mapping, struct page *page,
                            bool reclaimed, struct mem_cgroup *target_memcg)
 {
-       unsigned long flags;
        int refcount;
        void *shadow = NULL;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
 
-       xa_lock_irqsave(&mapping->i_pages, flags);
+       xa_lock_irq(&mapping->i_pages);
        /*
         * The non racy check for a busy page.
         *
@@ -1100,7 +1146,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                if (reclaimed && !mapping_exiting(mapping))
                        shadow = workingset_eviction(page, target_memcg);
                __delete_from_swap_cache(page, swap, shadow);
-               xa_unlock_irqrestore(&mapping->i_pages, flags);
+               xa_unlock_irq(&mapping->i_pages);
                put_swap_page(page, swap);
        } else {
                void (*freepage)(struct page *);
@@ -1126,7 +1172,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(page, target_memcg);
                __delete_from_page_cache(page, shadow);
-               xa_unlock_irqrestore(&mapping->i_pages, flags);
+               xa_unlock_irq(&mapping->i_pages);
 
                if (freepage != NULL)
                        freepage(page);
@@ -1135,7 +1181,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
        return 1;
 
 cannot_free:
-       xa_unlock_irqrestore(&mapping->i_pages, flags);
+       xa_unlock_irq(&mapping->i_pages);
        return 0;
 }
 
@@ -1264,6 +1310,54 @@ static void page_check_dirty_writeback(struct page *page,
                mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }
 
+/*
+ * Allocate the destination page for demoting @page to @node.
+ *
+ * Builds a migration_target_control aimed at @node and hands it to
+ * alloc_migration_target(), whose result is returned unchanged.
+ */
+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+       struct migration_target_control mtc = {
+               /*
+                * Allocate from 'node', or fail quickly and quietly.
+                * When this happens, 'page' will likely just be discarded
+                * instead of migrated.
+                */
+               .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+                           __GFP_THISNODE  | __GFP_NOWARN |
+                           __GFP_NOMEMALLOC | GFP_NOWAIT,
+               .nid = node
+       };
+
+       return alloc_migration_target(page, (unsigned long)&mtc);
+}
+
+/*
+ * Take pages on @demote_pages and attempt to demote them to
+ * another node.  Pages which are not demoted are left on
+ * @demote_pages.
+ *
+ * The target is next_demotion_node() of @pgdat's node.  Returns the
+ * success count reported by migrate_pages(), or 0 without attempting
+ * anything when the list is empty or there is no target node.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+                                    struct pglist_data *pgdat)
+{
+       int target_nid = next_demotion_node(pgdat->node_id);
+       unsigned int nr_succeeded;
+       int err;
+
+       if (list_empty(demote_pages))
+               return 0;
+
+       if (target_nid == NUMA_NO_NODE)
+               return 0;
+
+       /* Demotion ignores all cpuset and mempolicy settings */
+       /*
+        * NOTE(review): 'err' is stored but not read again in this function;
+        * only the count written to nr_succeeded is used.  This relies on
+        * migrate_pages() always storing that count -- confirm.
+        */
+       err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+                           target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+                           &nr_succeeded);
+
+       /* Account the demotions to kswapd or to direct reclaim. */
+       if (current_is_kswapd())
+               __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+       else
+               __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
+       return nr_succeeded;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -1275,12 +1369,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
+       LIST_HEAD(demote_pages);
        unsigned int nr_reclaimed = 0;
        unsigned int pgactivate = 0;
+       bool do_demote_pass;
 
        memset(stat, 0, sizeof(*stat));
        cond_resched();
+       do_demote_pass = can_demote(pgdat->node_id, sc);
 
+retry:
        while (!list_empty(page_list)) {
                struct address_space *mapping;
                struct page *page;
@@ -1429,6 +1527,17 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                        ; /* try to reclaim the page below */
                }
 
+               /*
+                * Before reclaiming the page, try to relocate
+                * its contents to another node.
+                */
+               if (do_demote_pass &&
+                   (thp_migration_supported() || !PageTransHuge(page))) {
+                       list_add(&page->lru, &demote_pages);
+                       unlock_page(page);
+                       continue;
+               }
+
                /*
                 * Anonymous process memory has backing store?
                 * Try to allocate it some swap space here.
@@ -1624,11 +1733,14 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                        /* follow __remove_mapping for reference */
                        if (!page_ref_freeze(page, 1))
                                goto keep_locked;
-                       if (PageDirty(page)) {
-                               page_ref_unfreeze(page, 1);
-                               goto keep_locked;
-                       }
-
+                       /*
+                        * The page has only one reference left, which is
+                        * from the isolation. After the caller puts the
+                        * page back on lru and drops the reference, the
+                        * page will be freed anyway. It doesn't matter
+                        * which lru it goes. So we don't bother checking
+                        * PageDirty here.
+                        */
                        count_vm_event(PGLAZYFREED);
                        count_memcg_page_event(page, PGLAZYFREED);
                } else if (!mapping || !__remove_mapping(mapping, page, true,
@@ -1680,6 +1792,17 @@ keep:
                list_add(&page->lru, &ret_pages);
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
+       /* 'page_list' is always empty here */
+
+       /* Migrate pages selected for demotion */
+       nr_reclaimed += demote_page_list(&demote_pages, pgdat);
+       /* Pages that could not be demoted are still in @demote_pages */
+       if (!list_empty(&demote_pages)) {
+               /* Pages which failed to be demoted go back on @page_list for retry: */
+               list_splice_init(&demote_pages, page_list);
+               do_demote_pass = false;
+               goto retry;
+       }
 
        pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
 
@@ -1698,7 +1821,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 {
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_unmap = 1,
        };
        struct reclaim_stat stat;
@@ -2323,10 +2445,10 @@ unsigned long reclaim_pages(struct list_head *page_list)
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_writepage = 1,
                .may_unmap = 1,
                .may_swap = 1,
+               .no_demotion = 1,
        };
 
        noreclaim_flag = memalloc_noreclaim_save();
@@ -2452,6 +2574,7 @@ enum scan_balance {
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                           unsigned long *nr)
 {
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        unsigned long anon_cost, file_cost, total_cost;
        int swappiness = mem_cgroup_swappiness(memcg);
@@ -2462,7 +2585,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        enum lru_list lru;
 
        /* If we have no swap space, do not bother scanning anon pages. */
-       if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+       if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -2645,6 +2768,21 @@ out:
        }
 }
 
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ *
+ * Returns true when total_swap_pages is non-zero, or otherwise when
+ * can_demote() permits demotion from @pgdat's node under @sc.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+                              struct scan_control *sc)
+{
+       /* Aging the anon LRU is valuable if swap is present: */
+       if (total_swap_pages > 0)
+               return true;
+
+       /* Also valuable if anon pages can be demoted: */
+       return can_demote(pgdat->node_id, sc);
+}
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
        unsigned long nr[NR_LRU_LISTS];
@@ -2754,7 +2892,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+       if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+           inactive_is_low(lruvec, LRU_INACTIVE_ANON))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 }
@@ -2824,7 +2963,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
         */
        pages_for_compaction = compact_gap(sc->order);
        inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
                inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
 
        return inactive_lru_pages > pages_for_compaction;
@@ -2898,6 +3037,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
        target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
 again:
+       /*
+        * Flush the memory cgroup stats, so that we read accurate per-memcg
+        * lruvec stats for heuristics.
+        */
+       mem_cgroup_flush_stats();
+
        memset(&sc->nr, 0, sizeof(sc->nr));
 
        nr_reclaimed = sc->nr_reclaimed;
@@ -3434,18 +3579,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         * blocked waiting on the same lock. Instead, throttle for up to a
         * second before continuing.
         */
-       if (!(gfp_mask & __GFP_FS)) {
+       if (!(gfp_mask & __GFP_FS))
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
                        allow_direct_reclaim(pgdat), HZ);
+       else
+               /* Throttle until kswapd wakes the process */
+               wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+                       allow_direct_reclaim(pgdat));
 
-               goto check_pending;
-       }
-
-       /* Throttle until kswapd wakes the process */
-       wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               allow_direct_reclaim(pgdat));
-
-check_pending:
        if (fatal_signal_pending(current))
                return true;
 
@@ -3583,7 +3724,7 @@ static void age_active_anon(struct pglist_data *pgdat,
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
 
-       if (!total_swap_pages)
+       if (!can_age_anon_pages(pgdat, sc))
                return;
 
        lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -3812,7 +3953,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 
        set_task_reclaim_state(current, &sc.reclaim_state);
        psi_memstall_enter(&pflags);
-       __fs_reclaim_acquire();
+       __fs_reclaim_acquire(_THIS_IP_);
 
        count_vm_event(PAGEOUTRUN);
 
@@ -3938,9 +4079,9 @@ restart:
                        wake_up_all(&pgdat->pfmemalloc_wait);
 
                /* Check if kswapd should be suspending */
-               __fs_reclaim_release();
+               __fs_reclaim_release(_THIS_IP_);
                ret = try_to_freeze();
-               __fs_reclaim_acquire();
+               __fs_reclaim_acquire(_THIS_IP_);
                if (ret || kthread_should_stop())
                        break;
 
@@ -3992,7 +4133,7 @@ out:
        }
 
        snapshot_refaults(NULL, pgdat);
-       __fs_reclaim_release();
+       __fs_reclaim_release(_THIS_IP_);
        psi_memstall_leave(&pflags);
        set_task_reclaim_state(current, NULL);
 
@@ -4290,23 +4431,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
  */
-int kswapd_run(int nid)
+void kswapd_run(int nid)
 {
        pg_data_t *pgdat = NODE_DATA(nid);
-       int ret = 0;
 
        if (pgdat->kswapd)
-               return 0;
+               return;
 
        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state < SYSTEM_RUNNING);
                pr_err("Failed to start kswapd on node %d\n", nid);
-               ret = PTR_ERR(pgdat->kswapd);
                pgdat->kswapd = NULL;
        }
-       return ret;
 }
 
 /*
index a7ed56a..0885a34 100644 (file)
@@ -204,7 +204,7 @@ int calculate_normal_threshold(struct zone *zone)
         *
         * Some sample thresholds:
         *
-        * Threshold    Processors      (fls)   Zonesize        fls(mem+1)
+        * Threshold    Processors      (fls)   Zonesize        fls(mem)+1
         * ------------------------------------------------------------------
         * 8            1               1       0.9-1 GB        4
         * 16           2               2       0.9-1 GB        4
@@ -1217,6 +1217,8 @@ const char * const vmstat_text[] = {
        "pgreuse",
        "pgsteal_kswapd",
        "pgsteal_direct",
+       "pgdemote_kswapd",
+       "pgdemote_direct",
        "pgscan_kswapd",
        "pgscan_direct",
        "pgscan_direct_throttle",
@@ -1452,7 +1454,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m,
 }
 
 /* Print out the free pages at each order for each migatetype */
-static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
+static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
 {
        int order;
        pg_data_t *pgdat = (pg_data_t *)arg;
@@ -1464,8 +1466,6 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
        seq_putc(m, '\n');
 
        walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
-
-       return 0;
 }
 
 static void pagetypeinfo_showblockcount_print(struct seq_file *m,
@@ -1501,7 +1501,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
 }
 
 /* Print out the number of pageblocks for each migratetype */
-static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
+static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
 {
        int mtype;
        pg_data_t *pgdat = (pg_data_t *)arg;
@@ -1512,8 +1512,6 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
        seq_putc(m, '\n');
        walk_zones_in_node(m, pgdat, true, false,
                pagetypeinfo_showblockcount_print);
-
-       return 0;
 }
 
 /*
@@ -1873,11 +1871,6 @@ static void vmstat_update(struct work_struct *w)
        }
 }
 
-/*
- * Switch off vmstat processing and then fold all the remaining differentials
- * until the diffs stay at zero. The function is used by NOHZ and can only be
- * invoked when tick processing is not active.
- */
 /*
  * Check if the diffs for a certain cpu indicate that
  * an update is needed.
@@ -1894,17 +1887,15 @@ static bool need_update(int cpu)
                /*
                 * The fast way of checking if there are any vmstat diffs.
                 */
-               if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
-                              sizeof(pzstats->vm_stat_diff[0])))
+               if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
                        return true;
 
                if (last_pgdat == zone->zone_pgdat)
                        continue;
                last_pgdat = zone->zone_pgdat;
                n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
-               if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS *
-                              sizeof(n->vm_node_stat_diff[0])))
-                   return true;
+               if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
+                       return true;
        }
        return false;
 }
index 98d9858..31af29f 100644 (file)
@@ -897,6 +897,9 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
                      struct tomoyo_page_dump *dump)
 {
        struct page *page;
+#ifdef CONFIG_MMU
+       int ret;
+#endif
 
        /* dump->data is released by tomoyo_find_next_domain(). */
        if (!dump->data) {
@@ -909,11 +912,13 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
        /*
         * This is called at execve() time in order to dig around
         * in the argv/environment of the new proceess
-        * (represented by bprm).  'current' is the process doing
-        * the execve().
+        * (represented by bprm).
         */
-       if (get_user_pages_remote(bprm->mm, pos, 1,
-                               FOLL_FORCE, &page, NULL, NULL) <= 0)
+       mmap_read_lock(bprm->mm);
+       ret = get_user_pages_remote(bprm->mm, pos, 1,
+                                   FOLL_FORCE, &page, NULL, NULL);
+       mmap_read_unlock(bprm->mm);
+       if (ret <= 0)
                return false;
 #else
        page = bprm->page[pos / PAGE_SIZE];
index f9a1200..16ec895 100644 (file)
@@ -127,7 +127,6 @@ kmalloc_array(unsigned int n, unsigned int size, unsigned int flags)
 #define kmemleak_free(a)
 
 #define PageSlab(p) (0)
-#define flush_kernel_dcache_page(p)
 
 #define MAX_ERRNO      4095
 
index f0fd80e..b02eac6 100644 (file)
@@ -27,3 +27,4 @@ hmm-tests
 memfd_secret
 local_config.*
 split_huge_page_test
+ksm_tests
index 5212437..d9605bd 100644 (file)
@@ -45,6 +45,7 @@ TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
 TEST_GEN_FILES += split_huge_page_test
+TEST_GEN_FILES += ksm_tests
 
 ifeq ($(MACHINE),x86_64)
 CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32)
@@ -145,6 +146,8 @@ $(OUTPUT)/hmm-tests: local_config.h
 # HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
 $(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS)
 
+$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
+
 local_config.mk local_config.h: check_config.sh
        /bin/sh ./check_config.sh $(CC)
 
index 18d3368..fe8fcfb 100644 (file)
@@ -1,11 +1,14 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
 set -e
 
 if [[ $(id -u) -ne 0 ]]; then
   echo "This test must be run as root. Skipping..."
-  exit 0
+  exit $ksft_skip
 fi
 
 fault_limit_file=limit_in_bytes
index d11d1fe..4a9a3af 100644 (file)
@@ -1,11 +1,14 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
 set -e
 
 if [[ $(id -u) -ne 0 ]]; then
   echo "This test must be run as root. Skipping..."
-  exit 0
+  exit $ksft_skip
 fi
 
 usage_file=usage_in_bytes
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c
new file mode 100644 (file)
index 0000000..b61dcdb
--- /dev/null
@@ -0,0 +1,662 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <time.h>
+#include <string.h>
+#include <numa.h>
+
+#include "../kselftest.h"
+#include "../../../../include/vdso/time64.h"
+
+#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
+#define KSM_FP(s) (KSM_SYSFS_PATH s)
+#define KSM_SCAN_LIMIT_SEC_DEFAULT 120
+#define KSM_PAGE_COUNT_DEFAULT 10l
+#define KSM_PROT_STR_DEFAULT "rw"
+#define KSM_USE_ZERO_PAGES_DEFAULT false
+#define KSM_MERGE_ACROSS_NODES_DEFAULT true
+#define MB (1ul << 20)
+
+/*
+ * Snapshot of the KSM tunables found under KSM_SYSFS_PATH.  Each member
+ * holds the value of the sysfs file with the same name; ksm_save_def()
+ * fills the structure and ksm_restore() writes it back.
+ */
+struct ksm_sysfs {
+       unsigned long max_page_sharing;
+       unsigned long merge_across_nodes;
+       unsigned long pages_to_scan;
+       unsigned long run;
+       unsigned long sleep_millisecs;
+       unsigned long stable_node_chains_prune_millisecs;
+       unsigned long use_zero_pages;
+};
+
+/*
+ * Identifies which test to run.
+ * NOTE(review): presumably selected from the <test type> command-line
+ * option described by print_help(); the parsing code is outside this view.
+ */
+enum ksm_test_name {
+       CHECK_KSM_MERGE,
+       CHECK_KSM_UNMERGE,
+       CHECK_KSM_ZERO_PAGE_MERGE,
+       CHECK_KSM_NUMA_MERGE,
+       KSM_MERGE_TIME,
+       KSM_COW_TIME
+};
+
+/*
+ * Write @val, formatted as an unsigned decimal, to the file @file_path.
+ *
+ * Returns 0 on success and 1 on failure, after reporting the error on
+ * stderr.  The stream is buffered, so the data may only reach the file
+ * when the stream is flushed; the fclose() result is therefore checked
+ * too, otherwise a rejected write would be reported as success.
+ */
+static int ksm_write_sysfs(const char *file_path, unsigned long val)
+{
+       FILE *f = fopen(file_path, "w");
+
+       if (!f) {
+               fprintf(stderr, "f %s\n", file_path);
+               perror("fopen");
+               return 1;
+       }
+       if (fprintf(f, "%lu", val) < 0) {
+               perror("fprintf");
+               /* report first (perror needs errno), then release the stream */
+               fclose(f);
+               return 1;
+       }
+       /* fclose() flushes the buffered value; this is where a write error shows up */
+       if (fclose(f)) {
+               fprintf(stderr, "f %s\n", file_path);
+               perror("fclose");
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Read one unsigned decimal number from the file @file_path into *@val.
+ *
+ * Returns 0 on success and 1 on failure, after reporting the error on
+ * stderr.  *@val is only meaningful when 0 is returned.  The stream is
+ * closed on every path.
+ */
+static int ksm_read_sysfs(const char *file_path, unsigned long *val)
+{
+       FILE *f = fopen(file_path, "r");
+
+       if (!f) {
+               fprintf(stderr, "f %s\n", file_path);
+               perror("fopen");
+               return 1;
+       }
+       if (fscanf(f, "%lu", val) != 1) {
+               perror("fscanf");
+               /* report first (perror needs errno), then release the stream */
+               fclose(f);
+               return 1;
+       }
+       fclose(f);
+
+       return 0;
+}
+
+/*
+ * Translate a protection string into PROT_* bits.
+ *
+ * Each of the characters 'r', 'w' and 'x' found anywhere in @prot_str turns
+ * on PROT_READ, PROT_WRITE and PROT_EXEC respectively; all other characters
+ * are ignored.  Returns the OR of the selected bits (0 if none match).
+ */
+static int str_to_prot(char *prot_str)
+{
+       int flags = strchr(prot_str, 'r') ? PROT_READ : 0;
+
+       flags |= strchr(prot_str, 'w') ? PROT_WRITE : 0;
+       flags |= strchr(prot_str, 'x') ? PROT_EXEC : 0;
+
+       return flags;
+}
+
+/*
+ * Print the usage text, including the compiled-in defaults, to stdout
+ * and terminate the process with exit status 0.  Does not return.
+ */
+static void print_help(void)
+{
+       printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n"
+              "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n");
+
+       printf("Supported <test type>:\n"
+              " -M (page merging)\n"
+              " -Z (zero pages merging)\n"
+              " -N (merging of pages in different NUMA nodes)\n"
+              " -U (page unmerging)\n"
+              " -P evaluate merging time and speed.\n"
+              "    For this test, the size of duplicated memory area (in MiB)\n"
+              "    must be provided using -s option\n"
+              " -C evaluate the time required to break COW of merged pages.\n\n");
+
+       printf(" -a: specify the access protections of pages.\n"
+              "     <prot> must be of the form [rwx].\n"
+              "     Default: %s\n", KSM_PROT_STR_DEFAULT);
+       printf(" -p: specify the number of pages to test.\n"
+              "     Default: %ld\n", KSM_PAGE_COUNT_DEFAULT);
+       printf(" -l: limit the maximum running time (in seconds) for a test.\n"
+              "     Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT);
+       printf(" -z: change use_zero_pages tunable\n"
+              "     Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
+       printf(" -m: change merge_across_nodes tunable\n"
+              "     Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
+       printf(" -s: the size of duplicated memory area (in MiB)\n");
+
+       exit(0);
+}
+
+/*
+ * Map @map_size bytes using the mmap() flags in @mapping (with @ptr passed
+ * as the address argument), fill every byte with @data and then change the
+ * protection of the region to @prot.
+ *
+ * The region is first mapped PROT_WRITE so that it can be filled whatever
+ * @prot is.  Returns the mapping on success and NULL on failure, after
+ * reporting the error on stderr.  mmap() signals failure with MAP_FAILED,
+ * not NULL; that is translated here because callers test for NULL.
+ */
+static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size)
+{
+       void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0);
+
+       if (map_ptr == MAP_FAILED) {
+               perror("mmap");
+               return NULL;
+       }
+       memset(map_ptr, data, map_size);
+       if (mprotect(map_ptr, map_size, prot)) {
+               perror("mprotect");
+               munmap(map_ptr, map_size);
+               return NULL;
+       }
+
+       return map_ptr;
+}
+
+/*
+ * Wait until the "full_scans" counter has advanced by at least @scan_count
+ * from the value it had on entry.
+ *
+ * Returns 0 once the counter has advanced far enough.  Returns 1 if the
+ * counter or the clock cannot be read, or if more than @timeout seconds
+ * (compared on whole tv_sec values) have passed since @start_time.
+ *
+ * Note: the loop re-reads the sysfs file back to back, without sleeping.
+ */
+static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
+{
+       struct timespec cur_time;
+       unsigned long cur_scan, init_scan;
+
+       if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan))
+               return 1;
+       cur_scan = init_scan;
+
+       while (cur_scan < init_scan + scan_count) {
+               if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan))
+                       return 1;
+               if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) {
+                       perror("clock_gettime");
+                       return 1;
+               }
+               /* the timeout is checked even on the iteration that reaches the target */
+               if ((cur_time.tv_sec - start_time.tv_sec) > timeout) {
+                       printf("Scan time limit exceeded\n");
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Ask KSM to merge the range starting at @addr and spanning @size bytes:
+ * mark it MADV_MERGEABLE, write 1 to the "run" knob and wait for two full
+ * scans, subject to the @start_time/@timeout limit of ksm_do_scan().
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout)
+{
+       /* merging occurs only after 2 scans, so wait for at least that many */
+       const int scans_needed = 2;
+
+       if (madvise(addr, size, MADV_MERGEABLE) != 0) {
+               perror("madvise");
+               return 1;
+       }
+
+       if (ksm_write_sysfs(KSM_FP("run"), 1) != 0)
+               return 1;
+
+       return ksm_do_scan(scans_needed, start_time, timeout) ? 1 : 0;
+}
+
+/*
+ * Check the KSM counters against what is expected for @dupl_page_count
+ * identical pages.
+ *
+ * Reads pages_shared, pages_sharing and max_page_sharing from sysfs and
+ * returns true when the two counters match the expectation computed below.
+ * Returns false on a mismatch or when any of the three files cannot be
+ * read.
+ */
+static bool assert_ksm_pages_count(long dupl_page_count)
+{
+       unsigned long max_page_sharing, pages_sharing, pages_shared;
+
+       if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
+           ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
+           ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
+               return false;
+
+       /*
+        * Since there must be at least 2 pages for merging and 1 page can be
+        * shared with the limited number of pages (max_page_sharing), sometimes
+        * there are 'leftover' pages that cannot be merged. For example, if there
+        * are 11 pages and max_page_sharing = 10, then only 10 pages will be
+        * merged and the 11th page won't be affected. As a result, when the number
+        * of duplicate pages is divided by max_page_sharing and the remainder is 1,
+        * pages_shared and pages_sharing values will be equal between dupl_page_count
+        * and dupl_page_count - 1.
+        */
+       if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) {
+               if (pages_shared == dupl_page_count / max_page_sharing &&
+                   pages_sharing == pages_shared * (max_page_sharing - 1))
+                       return true;
+       } else {
+               if (pages_shared == (dupl_page_count / max_page_sharing + 1) &&
+                   pages_sharing == dupl_page_count - pages_shared)
+                       return true;
+       }
+
+       return false;
+}
+
+/*
+ * Read the current KSM tunables from sysfs into @ksm_sysfs.
+ *
+ * The files are read in the order listed below and reading stops at the
+ * first failure.  Returns 0 when every value was read, 1 otherwise.
+ */
+static int ksm_save_def(struct ksm_sysfs *ksm_sysfs)
+{
+       const struct {
+               const char *path;
+               unsigned long *dst;
+       } knobs[] = {
+               { KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing },
+               { KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes },
+               { KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs },
+               { KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan },
+               { KSM_FP("run"), &ksm_sysfs->run },
+               { KSM_FP("stable_node_chains_prune_millisecs"),
+                 &ksm_sysfs->stable_node_chains_prune_millisecs },
+               { KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages },
+       };
+       size_t i;
+
+       for (i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
+               if (ksm_read_sysfs(knobs[i].path, knobs[i].dst))
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Write the KSM tunables stored in @ksm_sysfs back to sysfs.
+ *
+ * The files are written in the order listed below and writing stops at
+ * the first failure.  Returns 0 when every value was written, 1 otherwise.
+ */
+static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
+{
+       const struct {
+               const char *path;
+               unsigned long val;
+       } knobs[] = {
+               { KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing },
+               { KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes },
+               { KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan },
+               { KSM_FP("run"), ksm_sysfs->run },
+               { KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs },
+               { KSM_FP("stable_node_chains_prune_millisecs"),
+                 ksm_sysfs->stable_node_chains_prune_millisecs },
+               { KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages },
+       };
+       size_t i;
+
+       for (i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
+               if (ksm_write_sysfs(knobs[i].path, knobs[i].val))
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Merge test: map @page_count pages of @page_size bytes filled with the
+ * same byte, ask KSM to merge them and verify the resulting counters.
+ *
+ * Prints "OK" or "Not OK" once the mapping exists.  Returns KSFT_PASS when
+ * the counters match and KSFT_FAIL otherwise, including when the clock or
+ * the mapping cannot be obtained.
+ */
+static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size)
+{
+       const size_t map_size = page_size * page_count;
+       struct timespec start_time;
+       void *map_ptr;
+       int result = KSFT_FAIL;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               return KSFT_FAIL;
+       }
+
+       /* identical content in every page gives KSM something to merge */
+       map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+       if (!map_ptr)
+               return KSFT_FAIL;
+
+       /* pass only if merging ran and the right number of pages got merged */
+       if (!ksm_merge_pages(map_ptr, map_size, start_time, timeout) &&
+           assert_ksm_pages_count(page_count))
+               result = KSFT_PASS;
+
+       printf("%s\n", result == KSFT_PASS ? "OK" : "Not OK");
+       munmap(map_ptr, map_size);
+       return result;
+}
+
+/*
+ * Unmerge test: merge two identical pages, modify one byte in each of them
+ * and verify that, after one more full scan, no pages are reported merged.
+ *
+ * Prints "OK" or "Not OK" once the mapping exists.  Returns KSFT_PASS on
+ * success and KSFT_FAIL otherwise, including when the clock or the mapping
+ * cannot be obtained.
+ */
+static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size)
+{
+       const int page_count = 2;
+       const size_t map_size = page_size * page_count;
+       struct timespec start_time;
+       char *pages;
+       int result = KSFT_FAIL;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               return KSFT_FAIL;
+       }
+
+       /* two pages with the same content, merged by KSM */
+       pages = allocate_memory(NULL, prot, mapping, '*', map_size);
+       if (!pages)
+               return KSFT_FAIL;
+
+       if (ksm_merge_pages(pages, map_size, start_time, timeout))
+               goto out;
+
+       /* touch the first byte of each page -- KSM must unmerge them on its own */
+       memset(pages, '-', 1);
+       memset(pages + page_size, '+', 1);
+
+       /* one more full scan lets KSM notice the modification */
+       if (ksm_do_scan(1, start_time, timeout))
+               goto out;
+
+       /* nothing may be left merged */
+       if (assert_ksm_pages_count(0))
+               result = KSFT_PASS;
+
+out:
+       printf("%s\n", result == KSFT_PASS ? "OK" : "Not OK");
+       munmap(pages, map_size);
+       return result;
+}
+
+/*
+ * Zero-page test: set the use_zero_pages tunable to @use_zero_pages, map
+ * @page_count zero-filled pages, ask KSM to merge them and verify the
+ * counters.
+ *
+ * Prints "OK" or "Not OK" once the mapping exists.  Returns KSFT_PASS on
+ * success and KSFT_FAIL otherwise, including when the clock, the tunable
+ * or the mapping cannot be set up.
+ */
+static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout,
+                                    bool use_zero_pages, size_t page_size)
+{
+       const size_t map_size = page_size * page_count;
+       /*
+        * With use_zero_pages set, empty pages are merged with the kernel zero
+        * page instead of with each other, so none are counted; otherwise they
+        * are not treated specially and are merged as usual.
+        */
+       const long expected_merged = use_zero_pages ? 0 : page_count;
+       struct timespec start_time;
+       void *map_ptr;
+       int result = KSFT_FAIL;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               return KSFT_FAIL;
+       }
+
+       if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages))
+               return KSFT_FAIL;
+
+       /* zero-filled pages are the merge candidates here */
+       map_ptr = allocate_memory(NULL, prot, mapping, 0, map_size);
+       if (!map_ptr)
+               return KSFT_FAIL;
+
+       if (!ksm_merge_pages(map_ptr, map_size, start_time, timeout) &&
+           assert_ksm_pages_count(expected_merged))
+               result = KSFT_PASS;
+
+       printf("%s\n", result == KSFT_PASS ? "OK" : "Not OK");
+       munmap(map_ptr, map_size);
+       return result;
+}
+
+/*
+ * Check KSM merging of identical pages that live on two different NUMA nodes.
+ *
+ * One page is allocated on node 0 and one on node 1, both are filled with '*'
+ * and passed to ksm_merge_pages().  The expected count given to
+ * assert_ksm_pages_count() is:
+ *   - 2 when merge_across_nodes is set (the two duplicates get merged);
+ *   - 0 when it is clear (each node holds a single unique page, so nothing
+ *     can be shared).
+ *
+ * Returns KSFT_SKIP when NUMA is unavailable or fewer than two nodes exist,
+ * otherwise KSFT_PASS or KSFT_FAIL.
+ */
+static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
+                               size_t page_size)
+{
+       void *numa1_map_ptr, *numa2_map_ptr;
+       struct timespec start_time;
+       int page_count = 2;
+       int ret = KSFT_FAIL;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               return KSFT_FAIL;
+       }
+
+       if (numa_available() < 0) {
+               perror("NUMA support not enabled");
+               return KSFT_SKIP;
+       }
+       if (numa_max_node() < 1) {
+               printf("At least 2 NUMA nodes must be available\n");
+               return KSFT_SKIP;
+       }
+       if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes))
+               return KSFT_FAIL;
+
+       /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
+       numa1_map_ptr = numa_alloc_onnode(page_size, 0);
+       numa2_map_ptr = numa_alloc_onnode(page_size, 1);
+       if (!numa1_map_ptr || !numa2_map_ptr) {
+               /* report first, so the cleanup below cannot disturb errno */
+               perror("numa_alloc_onnode");
+               /* one of the two allocations may have succeeded; do not leak it */
+               if (numa1_map_ptr)
+                       numa_free(numa1_map_ptr, page_size);
+               if (numa2_map_ptr)
+                       numa_free(numa2_map_ptr, page_size);
+               return KSFT_FAIL;
+       }
+
+       memset(numa1_map_ptr, '*', page_size);
+       memset(numa2_map_ptr, '*', page_size);
+
+       /* try to merge the pages */
+       if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) ||
+           ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout))
+               goto out;
+
+       /*
+        * verify that the right number of pages are merged:
+        * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged;
+        * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is
+        *    only 1 unique page in each node and they can't be shared.
+        */
+       if (assert_ksm_pages_count(merge_across_nodes ? page_count : 0))
+               ret = KSFT_PASS;
+
+out:
+       numa_free(numa1_map_ptr, page_size);
+       numa_free(numa2_map_ptr, page_size);
+       if (ret == KSFT_PASS)
+               printf("OK\n");
+       else
+               printf("Not OK\n");
+       return ret;
+}
+
+/*
+ * Measure how long ksm_merge_pages() takes for a mapping of map_size MiB.
+ *
+ * The mapping is filled with '*', the merge is timed with
+ * CLOCK_MONOTONIC_RAW, and the total size, elapsed time and average speed
+ * are printed.
+ *
+ * Returns KSFT_PASS on success.  Returns KSFT_FAIL if the allocation fails,
+ * or (after printing "Not OK") if a clock read or the merge itself fails.
+ */
+static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
+{
+       struct timespec start_time, end_time;
+       unsigned long elapsed_ns;
+       double elapsed_s;
+       int ret = KSFT_FAIL;
+       void *map_ptr;
+
+       /* the argument is a size in MiB; everything below works in bytes */
+       map_size *= MB;
+
+       map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+       if (map_ptr == NULL)
+               return KSFT_FAIL;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time) != 0) {
+               perror("clock_gettime");
+               goto out;
+       }
+       if (ksm_merge_pages(map_ptr, map_size, start_time, timeout) != 0)
+               goto out;
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time) != 0) {
+               perror("clock_gettime");
+               goto out;
+       }
+
+       elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+                    (end_time.tv_nsec - start_time.tv_nsec);
+       elapsed_s = (double)elapsed_ns / NSEC_PER_SEC;
+
+       printf("Total size:    %lu MiB\n", map_size / MB);
+       printf("Total time:    %ld.%09ld s\n", elapsed_ns / NSEC_PER_SEC,
+              elapsed_ns % NSEC_PER_SEC);
+       printf("Average speed:  %.3f MiB/s\n", (map_size / MB) / elapsed_s);
+
+       ret = KSFT_PASS;
+
+out:
+       if (ret != KSFT_PASS)
+               printf("Not OK\n");
+       munmap(map_ptr, map_size);
+       return ret;
+}
+
+/*
+ * Compare copy-on-write cost for pages that are not merged and pages that are.
+ *
+ * A mapping of 4000 pages is created.  The first byte of every even-numbered
+ * page is written and timed ("Not merged pages").  Adjacent pages are then
+ * turned into 2000 duplicate pairs, handed to ksm_merge_pages(), and the same
+ * write pattern is timed again ("Merged pages").
+ *
+ * Returns KSFT_PASS on success.  Returns KSFT_FAIL if the allocation fails,
+ * or (after printing "Not OK") on any later failure.  The mapping is released
+ * on every path once it has been created.
+ */
+static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
+{
+       void *map_ptr;
+       struct timespec start_time, end_time;
+       unsigned long cow_time_ns;
+       double written_mib;
+
+       /*
+        * page_count must be less than 2*page_size: the pairing loop below
+        * writes up to page_count / 2 bytes into a single page.
+        */
+       size_t page_count = 4000;
+
+       map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+       if (!map_ptr)
+               return KSFT_FAIL;
+
+       /*
+        * Only every second page is written in the timed loops.  Divide in
+        * floating point so the fractional MiB is not truncated away before
+        * the speed is computed.
+        */
+       written_mib = (double)(page_size * (page_count / 2)) / MB;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               goto err_out;   /* the mapping must still be released */
+       }
+       for (size_t i = 0; i < page_count - 1; i = i + 2)
+               memset(map_ptr + page_size * i, '-', 1);
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+               perror("clock_gettime");
+               goto err_out;   /* the mapping must still be released */
+       }
+
+       cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+                      (end_time.tv_nsec - start_time.tv_nsec);
+
+       printf("Total size:    %lu MiB\n\n", (page_size * page_count) / MB);
+       printf("Not merged pages:\n");
+       printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+              cow_time_ns % NSEC_PER_SEC);
+       printf("Average speed:  %.3f MiB/s\n\n", written_mib /
+                                              ((double)cow_time_ns / NSEC_PER_SEC));
+
+       /* Create 2000 pairs of duplicate pages */
+       for (size_t i = 0; i < page_count - 1; i = i + 2) {
+               memset(map_ptr + page_size * i, '+', i / 2 + 1);
+               memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1);
+       }
+       if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+               goto err_out;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+       for (size_t i = 0; i < page_count - 1; i = i + 2)
+               memset(map_ptr + page_size * i, '-', 1);
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+
+       cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+                      (end_time.tv_nsec - start_time.tv_nsec);
+
+       printf("Merged pages:\n");
+       printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+              cow_time_ns % NSEC_PER_SEC);
+       printf("Average speed:  %.3f MiB/s\n", written_mib /
+                                              ((double)cow_time_ns / NSEC_PER_SEC));
+
+       munmap(map_ptr, page_size * page_count);
+       return KSFT_PASS;
+
+err_out:
+       printf("Not OK\n");
+       munmap(map_ptr, page_size * page_count);
+       return KSFT_FAIL;
+}
+
+/*
+ * Entry point: parse the command line, save the current KSM tunables, run the
+ * selected test and restore the tunables again.
+ *
+ * Options (from the getopt string below):
+ *   -a <prot>   protection string, converted by str_to_prot()
+ *   -p <n>      number of pages (must be > 0)
+ *   -l <sec>    scan timeout in seconds (must be > 0)
+ *   -z <0|1>    use_zero_pages value for the zero page test
+ *   -m <0|1>    merge_across_nodes value for the NUMA test
+ *   -s <MiB>    mapping size for the merge time test (must be > 0)
+ *   -M -U -Z -N -P -C   select the test to run (-M is the default)
+ *   -h          print help
+ *
+ * Returns a KSFT_* code.  Once the tunables have been saved, every exit path
+ * goes through ksm_restore() so the system is not left with modified KSM
+ * settings.
+ */
+int main(int argc, char *argv[])
+{
+       int ret = KSFT_FAIL, opt;
+       int prot = 0;
+       int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
+       long page_count = KSM_PAGE_COUNT_DEFAULT;
+       size_t page_size = sysconf(_SC_PAGESIZE);
+       struct ksm_sysfs ksm_sysfs_old;
+       int test_name = CHECK_KSM_MERGE;
+       bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT;
+       bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
+       long size_MB = 0;
+
+       while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPC")) != -1) {
+               switch (opt) {
+               case 'a':
+                       prot = str_to_prot(optarg);
+                       break;
+               case 'p':
+                       page_count = atol(optarg);
+                       if (page_count <= 0) {
+                               printf("The number of pages must be greater than 0\n");
+                               return KSFT_FAIL;
+                       }
+                       break;
+               case 'l':
+                       ksm_scan_limit_sec = atoi(optarg);
+                       if (ksm_scan_limit_sec <= 0) {
+                               printf("Timeout value must be greater than 0\n");
+                               return KSFT_FAIL;
+                       }
+                       break;
+               case 'h':
+                       print_help();
+                       break;
+               case 'z':
+                       if (strcmp(optarg, "0") == 0)
+                               use_zero_pages = 0;
+                       else
+                               use_zero_pages = 1;
+                       break;
+               case 'm':
+                       if (strcmp(optarg, "0") == 0)
+                               merge_across_nodes = 0;
+                       else
+                               merge_across_nodes = 1;
+                       break;
+               case 's':
+                       size_MB = atoi(optarg);
+                       if (size_MB <= 0) {
+                               printf("Size must be greater than 0\n");
+                               return KSFT_FAIL;
+                       }
+                       /* explicit break: do not rely on falling into 'M' */
+                       break;
+               case 'M':
+                       /* CHECK_KSM_MERGE is already the default test */
+                       break;
+               case 'U':
+                       test_name = CHECK_KSM_UNMERGE;
+                       break;
+               case 'Z':
+                       test_name = CHECK_KSM_ZERO_PAGE_MERGE;
+                       break;
+               case 'N':
+                       test_name = CHECK_KSM_NUMA_MERGE;
+                       break;
+               case 'P':
+                       test_name = KSM_MERGE_TIME;
+                       break;
+               case 'C':
+                       test_name = KSM_COW_TIME;
+                       break;
+               default:
+                       return KSFT_FAIL;
+               }
+       }
+
+       if (prot == 0)
+               prot = str_to_prot(KSM_PROT_STR_DEFAULT);
+
+       if (access(KSM_SYSFS_PATH, F_OK)) {
+               printf("Config KSM not enabled\n");
+               return KSFT_SKIP;
+       }
+
+       if (ksm_save_def(&ksm_sysfs_old)) {
+               printf("Cannot save default tunables\n");
+               return KSFT_FAIL;
+       }
+
+       /*
+        * From here on the tunables may have been changed, so failures jump to
+        * the restore step instead of returning directly.
+        */
+       if (ksm_write_sysfs(KSM_FP("run"), 2) ||
+           ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) ||
+           ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) ||
+           ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) {
+               ret = KSFT_FAIL;
+               goto restore;
+       }
+
+       switch (test_name) {
+       case CHECK_KSM_MERGE:
+               ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+                                     ksm_scan_limit_sec, page_size);
+               break;
+       case CHECK_KSM_UNMERGE:
+               ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+                                       page_size);
+               break;
+       case CHECK_KSM_ZERO_PAGE_MERGE:
+               ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+                                               ksm_scan_limit_sec, use_zero_pages, page_size);
+               break;
+       case CHECK_KSM_NUMA_MERGE:
+               ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+                                          merge_across_nodes, page_size);
+               break;
+       case KSM_MERGE_TIME:
+               if (size_MB == 0) {
+                       printf("Option '-s' is required.\n");
+                       ret = KSFT_FAIL;
+                       break;
+               }
+               ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+                                    size_MB);
+               break;
+       case KSM_COW_TIME:
+               ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+                                  page_size);
+               break;
+       default:
+               /* unknown test selector: report failure rather than an unset value */
+               ret = KSFT_FAIL;
+               break;
+       }
+
+restore:
+       if (ksm_restore(&ksm_sysfs_old)) {
+               printf("Cannot restore default tunables\n");
+               return KSFT_FAIL;
+       }
+
+       return ret;
+}
index ff4d72e..782ea94 100644 (file)
@@ -70,7 +70,7 @@ int get_proc_locked_vm_size(void)
                }
        }
 
-       perror("cann't parse VmLck in /proc/self/status\n");
+       perror("cannot parse VmLck in /proc/self/status\n");
        fclose(f);
        return -1;
 }
index d09a6b7..45e803a 100755 (executable)
@@ -377,6 +377,102 @@ else
        exitcode=1
 fi
 
+echo "-------------------------------------------------------"
+echo "running KSM MADV_MERGEABLE test with 10 identical pages"
+echo "-------------------------------------------------------"
+./ksm_tests -M -p 10
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+       echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+        echo "[SKIP]"
+        exitcode=$ksft_skip
+else
+       echo "[FAIL]"
+       exitcode=1
+fi
+
+echo "------------------------"
+echo "running KSM unmerge test"
+echo "------------------------"
+./ksm_tests -U
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+       echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+        echo "[SKIP]"
+        exitcode=$ksft_skip
+else
+       echo "[FAIL]"
+       exitcode=1
+fi
+
+echo "----------------------------------------------------------"
+echo "running KSM test with 10 zero pages and use_zero_pages = 0"
+echo "----------------------------------------------------------"
+./ksm_tests -Z -p 10 -z 0
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+       echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+        echo "[SKIP]"
+        exitcode=$ksft_skip
+else
+       echo "[FAIL]"
+       exitcode=1
+fi
+
+echo "----------------------------------------------------------"
+echo "running KSM test with 10 zero pages and use_zero_pages = 1"
+echo "----------------------------------------------------------"
+./ksm_tests -Z -p 10 -z 1
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+       echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+        echo "[SKIP]"
+        exitcode=$ksft_skip
+else
+       echo "[FAIL]"
+       exitcode=1
+fi
+
+echo "-------------------------------------------------------------"
+echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 1"
+echo "-------------------------------------------------------------"
+./ksm_tests -N -m 1
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+       echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+        echo "[SKIP]"
+        exitcode=$ksft_skip
+else
+       echo "[FAIL]"
+       exitcode=1
+fi
+
+echo "-------------------------------------------------------------"
+echo "running KSM test with 2 NUMA nodes and merge_across_nodes = 0"
+echo "-------------------------------------------------------------"
+./ksm_tests -N -m 0
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+       echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+        echo "[SKIP]"
+        exitcode=$ksft_skip
+else
+       echo "[FAIL]"
+       exitcode=1
+fi
+
 exit $exitcode
 
 exit $exitcode
index 2ea438e..10ab56c 100644 (file)
@@ -566,6 +566,18 @@ static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
        }
 }
 
+/*
+ * Issue UFFDIO_WAKE on @ufd for the range starting at @addr and spanning
+ * @len bytes.  If the ioctl fails, an error is printed to stderr and the
+ * process exits with status 1.
+ */
+static void wake_range(int ufd, unsigned long addr, unsigned long len)
+{
+       struct uffdio_range uffdio_wake = {
+               .start = addr,
+               .len = len,
+       };
+
+       if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake) != 0) {
+               fprintf(stderr, "error waking %lu\n", addr);
+               exit(1);
+       }
+}
+
 static int __copy_page(int ufd, unsigned long offset, bool retry)
 {
        struct uffdio_copy uffdio_copy;
@@ -585,6 +597,7 @@ static int __copy_page(int ufd, unsigned long offset, bool retry)
                if (uffdio_copy.copy != -EEXIST)
                        err("UFFDIO_COPY error: %"PRId64,
                            (int64_t)uffdio_copy.copy);
+               wake_range(ufd, uffdio_copy.dst, page_size);
        } else if (uffdio_copy.copy != page_size) {
                err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
        } else {