Merge branch 'akpm' (patches from Andrew)
author     Linus Torvalds <torvalds@linux-foundation.org>
           Tue, 14 May 2019 17:10:55 +0000 (10:10 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Tue, 14 May 2019 17:10:55 +0000 (10:10 -0700)
Merge misc updates from Andrew Morton:

 - a few misc things and hotfixes

 - ocfs2

 - almost all of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (139 commits)
  kernel/memremap.c: remove the unused device_private_entry_fault() export
  mm: delete find_get_entries_tag
  mm/huge_memory.c: make __thp_get_unmapped_area static
  mm/mprotect.c: fix compilation warning because of unused 'mm' variable
  mm/page-writeback: introduce tracepoint for wait_on_page_writeback()
  mm/vmscan: simplify trace_reclaim_flags and trace_shrink_flags
  mm/Kconfig: update "Memory Model" help text
  mm/vmscan.c: don't disable irq again when count pgrefill for memcg
  mm: memblock: make keeping memblock memory opt-in rather than opt-out
  hugetlbfs: always use address space in inode for resv_map pointer
  mm/z3fold.c: support page migration
  mm/z3fold.c: add structure for buddy handles
  mm/z3fold.c: improve compression by extending search
  mm/z3fold.c: introduce helper functions
  mm/page_alloc.c: remove unnecessary parameter in rmqueue_pcplist
  mm/hmm: add ARCH_HAS_HMM_MIRROR ARCH_HAS_HMM_DEVICE Kconfig
  mm/vmscan.c: simplify shrink_inactive_list()
  fs/sync.c: sync_file_range(2) may use WB_SYNC_ALL writeback
  xen/privcmd-buf.c: convert to use vm_map_pages_zero()
  xen/gntdev.c: convert to use vm_map_pages()
  ...

189 files changed:
Documentation/sysctl/vm.txt
Documentation/trace/postprocess/trace-vmscan-postprocess.pl
Documentation/vm/hmm.rst
MAINTAINERS
arch/Kconfig
arch/alpha/mm/init.c
arch/arc/mm/init.c
arch/arm/Kconfig
arch/arm/mm/dma-mapping.c
arch/arm/mm/init.c
arch/arm64/Kconfig
arch/arm64/include/asm/hugetlb.h
arch/arm64/mm/init.c
arch/arm64/mm/mmu.c
arch/c6x/mm/init.c
arch/h8300/mm/init.c
arch/hexagon/Kconfig
arch/hexagon/mm/init.c
arch/ia64/Kconfig
arch/ia64/mm/init.c
arch/m68k/Kconfig
arch/m68k/mm/init.c
arch/microblaze/mm/init.c
arch/mips/Kconfig
arch/mips/mm/gup.c
arch/mips/mm/init.c
arch/nds32/mm/init.c
arch/nios2/Kconfig
arch/nios2/mm/init.c
arch/openrisc/mm/init.c
arch/parisc/mm/init.c
arch/powerpc/Kconfig
arch/powerpc/include/asm/book3s/64/hugetlb.h
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/e500_mmu.c
arch/powerpc/mm/book3s64/iommu_api.c
arch/powerpc/mm/mem.c
arch/powerpc/platforms/Kconfig.cputype
arch/riscv/mm/init.c
arch/s390/Kconfig
arch/s390/include/asm/hugetlb.h
arch/s390/kvm/interrupt.c
arch/s390/mm/init.c
arch/sh/Kconfig
arch/sh/boards/mach-dreamcast/irq.c
arch/sh/mm/gup.c
arch/sh/mm/init.c
arch/sparc/Kconfig
arch/sparc/include/asm/pgtable_64.h
arch/sparc/mm/gup.c
arch/sparc/mm/init_32.c
arch/sparc/mm/init_64.c
arch/um/kernel/mem.c
arch/unicore32/Kconfig
arch/unicore32/mm/init.c
arch/x86/Kconfig
arch/x86/include/asm/hugetlb.h
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/mm/hugetlbpage.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/xtensa/mm/init.c
drivers/base/memory.c
drivers/dax/device.c
drivers/firewire/core-iso.c
drivers/fpga/dfl-afu-dma-region.c
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
drivers/gpu/drm/i915/i915_gem_userptr.c
drivers/gpu/drm/radeon/radeon_mn.c
drivers/gpu/drm/rockchip/rockchip_drm_gem.c
drivers/gpu/drm/via/via_dmablit.c
drivers/gpu/drm/xen/xen_drm_front_gem.c
drivers/infiniband/core/umem.c
drivers/infiniband/core/umem_odp.c
drivers/infiniband/hw/hfi1/user_pages.c
drivers/infiniband/hw/mthca/mthca_memfree.c
drivers/infiniband/hw/qib/qib_user_pages.c
drivers/infiniband/hw/qib/qib_user_sdma.c
drivers/infiniband/hw/usnic/usnic_uiom.c
drivers/iommu/dma-iommu.c
drivers/media/common/videobuf2/videobuf2-core.c
drivers/media/common/videobuf2/videobuf2-dma-contig.c
drivers/media/common/videobuf2/videobuf2-dma-sg.c
drivers/media/v4l2-core/videobuf-dma-sg.c
drivers/misc/genwqe/card_utils.c
drivers/misc/vmw_vmci/vmci_host.c
drivers/misc/vmw_vmci/vmci_queue_pair.c
drivers/platform/goldfish/goldfish_pipe.c
drivers/rapidio/devices/rio_mport_cdev.c
drivers/sbus/char/oradax.c
drivers/scsi/st.c
drivers/staging/gasket/gasket_page_table.c
drivers/tee/tee_shm.c
drivers/vfio/vfio_iommu_spapr_tce.c
drivers/vfio/vfio_iommu_type1.c
drivers/vhost/vhost.c
drivers/video/fbdev/pvr2fb.c
drivers/virt/fsl_hypervisor.c
drivers/xen/gntdev.c
drivers/xen/privcmd-buf.c
fs/dax.c
fs/hugetlbfs/inode.c
fs/io_uring.c
fs/ocfs2/dir.c
fs/ocfs2/export.c
fs/ocfs2/ocfs2_fs.h
fs/orangefs/orangefs-bufmap.c
fs/proc/task_mmu.c
fs/sync.c
fs/userfaultfd.c
include/asm-generic/hugetlb.h
include/linux/balloon_compaction.h
include/linux/gfp.h
include/linux/hmm.h
include/linux/huge_mm.h
include/linux/hugetlb.h
include/linux/list.h
include/linux/memblock.h
include/linux/memcontrol.h
include/linux/memory.h
include/linux/memory_hotplug.h
include/linux/mm.h
include/linux/mm_inline.h
include/linux/mm_types.h
include/linux/mmu_notifier.h
include/linux/mmzone.h
include/linux/pagemap.h
include/linux/userfaultfd_k.h
include/linux/vmstat.h
include/trace/events/compaction.h
include/trace/events/vmscan.h
include/trace/events/writeback.h
include/uapi/linux/fs.h
init/initramfs.c
init/main.c
kernel/events/uprobes.c
kernel/futex.c
kernel/kexec_file.c
kernel/memremap.c
kernel/sys.c
kernel/sysctl.c
lib/iov_iter.c
mm/Kconfig
mm/Kconfig.debug
mm/cma.c
mm/cma_debug.c
mm/compaction.c
mm/filemap.c
mm/gup.c
mm/gup_benchmark.c
mm/hmm.c
mm/huge_memory.c
mm/hugetlb.c
mm/khugepaged.c
mm/ksm.c
mm/madvise.c
mm/memblock.c
mm/memcontrol.c
mm/memfd.c
mm/memory.c
mm/memory_hotplug.c
mm/migrate.c
mm/mmu_notifier.c
mm/mprotect.c
mm/mremap.c
mm/nommu.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_isolation.c
mm/rmap.c
mm/shmem.c
mm/slab.c
mm/slob.c
mm/slub.c
mm/sparse.c
mm/swap.c
mm/swap_state.c
mm/userfaultfd.c
mm/util.c
mm/vmscan.c
mm/workingset.c
mm/z3fold.c
net/ceph/pagevec.c
net/rds/info.c
net/rds/rdma.c
net/xdp/xdp_umem.c
virt/kvm/kvm_main.c

index 3f13d85..7493220 100644 (file)
@@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
 - stat_refresh
 - numa_stat
 - swappiness
+- unprivileged_userfaultfd
 - user_reserve_kbytes
 - vfs_cache_pressure
 - watermark_boost_factor
@@ -818,6 +819,17 @@ The default value is 60.
 
 ==============================================================
 
+unprivileged_userfaultfd
+
+This flag controls whether unprivileged users can use the userfaultfd
+system calls.  Set this to 1 to allow unprivileged users to use the
+userfaultfd system calls, or set this to 0 to restrict userfaultfd to only
+privileged users (those with the CAP_SYS_PTRACE capability).
+
+The default value is 1.
+
+==============================================================
+
 - user_reserve_kbytes
 
 When overcommit_memory is set to 2, "never overcommit" mode, reserve
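
For context, toggling the new knob from userspace is just a write to
/proc/sys/vm/unprivileged_userfaultfd (equivalently, sysctl -w
vm.unprivileged_userfaultfd=0). A minimal sketch, assuming the standard
procfs mount point and root privileges; not part of this series:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Restrict userfaultfd to privileged (CAP_SYS_PTRACE) callers. */
    int main(void)
    {
            int fd = open("/proc/sys/vm/unprivileged_userfaultfd", O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, "0", 1) != 1) {
                    perror("write");
                    close(fd);
                    return 1;
            }
            close(fd);
            return 0;
    }
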
index 66bfd83..995da15 100644 (file)
@@ -113,7 +113,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
 my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
 my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
 my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
-my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
+my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
 my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
 my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
 
@@ -212,7 +212,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex(
                        "vmscan/mm_vmscan_lru_shrink_inactive",
                        $regex_lru_shrink_inactive_default,
                        "nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback",
-                       "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep",
+                       "nr_congested", "nr_immediate", "nr_activate_anon",
+                       "nr_activate_file", "nr_ref_keep",
                        "nr_unmap_fail", "priority", "flags");
 $regex_lru_shrink_active = generate_traceevent_regex(
                        "vmscan/mm_vmscan_lru_shrink_active",
@@ -407,7 +408,7 @@ EVENT_PROCESS:
                        }
 
                        my $nr_reclaimed = $3;
-                       my $flags = $12;
+                       my $flags = $13;
                        my $file = 0;
                        if ($flags =~ /RECLAIM_WB_FILE/) {
                                $file = 1;
index 44205f0..ec1efa3 100644 (file)
@@ -189,20 +189,10 @@ the driver callback returns.
 When the device driver wants to populate a range of virtual addresses, it can
 use either::
 
-  int hmm_vma_get_pfns(struct vm_area_struct *vma,
-                      struct hmm_range *range,
-                      unsigned long start,
-                      unsigned long end,
-                      hmm_pfn_t *pfns);
-  int hmm_vma_fault(struct vm_area_struct *vma,
-                    struct hmm_range *range,
-                    unsigned long start,
-                    unsigned long end,
-                    hmm_pfn_t *pfns,
-                    bool write,
-                    bool block);
-
-The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
+  long hmm_range_snapshot(struct hmm_range *range);
+  long hmm_range_fault(struct hmm_range *range, bool block);
+
+The first one (hmm_range_snapshot()) will only fetch present CPU page table
 entries and will not trigger a page fault on missing or non-present entries.
 The second one does trigger a page fault on missing or read-only entry if the
 write parameter is true. Page faults use the generic mm page fault code path
@@ -220,25 +210,56 @@ respect in order to keep things properly synchronized. The usage pattern is::
  {
       struct hmm_range range;
       ...
+
+      range.start = ...;
+      range.end = ...;
+      range.pfns = ...;
+      range.flags = ...;
+      range.values = ...;
+      range.pfn_shift = ...;
+      hmm_range_register(&range);
+
+      /*
+       * Just wait for range to be valid, safe to ignore return value as we
+       * will use the return value of hmm_range_snapshot() below under the
+       * mmap_sem to ascertain the validity of the range.
+       */
+      hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+
  again:
-      ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
-      if (ret)
+      down_read(&mm->mmap_sem);
+      ret = hmm_range_snapshot(&range);
+      if (ret) {
+          up_read(&mm->mmap_sem);
+          if (ret == -EAGAIN) {
+            /*
+             * No need to check hmm_range_wait_until_valid() return value;
+             * on retry we will get the proper error from hmm_range_snapshot().
+             */
+            hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+            goto again;
+          }
+          hmm_mirror_unregister(&range);
           return ret;
+      }
       take_lock(driver->update);
-      if (!hmm_vma_range_done(vma, &range)) {
+      if (!range.valid) {
           release_lock(driver->update);
+          up_read(&mm->mmap_sem);
           goto again;
       }
 
       // Use pfns array content to update device page table
 
+      hmm_mirror_unregister(&range);
       release_lock(driver->update);
+      up_read(&mm->mmap_sem);
       return 0;
  }
 
 The driver->update lock is the same lock that the driver takes inside its
-update() callback. That lock must be held before hmm_vma_range_done() to avoid
-any race with a concurrent CPU page table update.
+update() callback. That lock must be held before checking the range.valid
+field to avoid any race with a concurrent CPU page table update.
 
 HMM implements all this on top of the mmu_notifier API because we wanted a
 simpler API and also to be able to perform optimizations latter on like doing
@@ -255,6 +276,41 @@ report commands as executed is serialized (there is no point in doing this
 concurrently).
 
 
+Leverage default_flags and pfn_flags_mask
+=========================================
+
+The hmm_range struct has two fields, default_flags and pfn_flags_mask, that
+allow you to set a fault or snapshot policy for the whole range instead of
+having to set it for each entry in the range.
+
+For instance, if the device flags for device entries are:
+    VALID (1 << 63)
+    WRITE (1 << 62)
+
+Now say the device driver wants to fault in a range with at least read
+permission; it sets:
+    range->default_flags = (1 << 63)
+    range->pfn_flags_mask = 0;
+
+and calls hmm_range_fault() as described above. This will fault in all pages
+in the range with at least read permission.
+
+Now say the driver wants to do the same, except for one page in the range for
+which it wants write permission. The driver sets:
+    range->default_flags = (1 << 63);
+    range->pfn_flags_mask = (1 << 62);
+    range->pfns[index_of_write] = (1 << 62);
+
+With this, HMM will fault in all pages with at least read permission (i.e.
+valid), and for the address == range->start + (index_of_write << PAGE_SHIFT)
+it will fault with write permission, i.e. if the CPU pte does not have write
+permission set then HMM will call handle_mm_fault().
+
+Note that HMM will populate the pfns array with write permission for any entry
+that has write permission in the CPU pte, regardless of the values set in
+default_flags or pfn_flags_mask.
+
+
 Represent and manage device memory from core kernel point of view
 =================================================================
 
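
To tie the new API and the default_flags/pfn_flags_mask fields together, the
following is a sketch of a driver-side fault path based only on the calls and
fields shown above; the flag values are the example bits from the text and the
wrapper function itself is hypothetical:

    #include <linux/hmm.h>

    /* Example device pfn flag encoding taken from the documentation above. */
    #define MY_PFN_VALID (1ULL << 63)
    #define MY_PFN_WRITE (1ULL << 62)

    /* Fault a whole range readable, plus write permission for one page. */
    static long my_driver_fault_range(struct hmm_range *range,
                                      unsigned long index_of_write, bool block)
    {
            /* Every entry is faulted with at least VALID (read) permission... */
            range->default_flags = MY_PFN_VALID;
            /* ...and individual entries may additionally request WRITE. */
            range->pfn_flags_mask = MY_PFN_WRITE;
            range->pfns[index_of_write] = MY_PFN_WRITE;

            /* Caller has registered the range and holds mmap_sem, as above. */
            return hmm_range_fault(range, block);
    }
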
index fb9f9d7..372e60e 100644 (file)
@@ -11746,6 +11746,7 @@ F:      include/linux/oprofile.h
 ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
 M:     Mark Fasheh <mark@fasheh.com>
 M:     Joel Becker <jlbec@evilplan.org>
+M:     Joseph Qi <joseph.qi@linux.alibaba.com>
 L:     ocfs2-devel@oss.oracle.com (moderated for non-subscribers)
 W:     http://ocfs2.wiki.kernel.org
 S:     Supported
index 5e43fcb..f11f069 100644 (file)
@@ -245,6 +245,13 @@ config ARCH_HAS_FORTIFY_SOURCE
          An architecture should select this when it can successfully
          build and run with CONFIG_FORTIFY_SOURCE.
 
+#
+# Select if the arch provides a historic keepinit alias for the retain_initrd
+# command line option
+#
+config ARCH_HAS_KEEPINITRD
+       bool
+
 # Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h
 config ARCH_HAS_SET_MEMORY
        bool
index a42fc5c..e2cbec3 100644 (file)
@@ -285,17 +285,3 @@ mem_init(void)
        memblock_free_all();
        mem_init_print_info(NULL);
 }
-
-void
-free_initmem(void)
-{
-       free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void
-free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
index e1ab2d7..02b7a3b 100644 (file)
@@ -206,18 +206,3 @@ void __init mem_init(void)
        memblock_free_all();
        mem_init_print_info(NULL);
 }
-
-/*
- * free_initmem: Free all the __init memory.
- */
-void __ref free_initmem(void)
-{
-       free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
index dc9855c..5fd344b 100644 (file)
@@ -4,11 +4,11 @@ config ARM
        default y
        select ARCH_32BIT_OFF_T
        select ARCH_CLOCKSOURCE_DATA
-       select ARCH_DISCARD_MEMBLOCK if !HAVE_ARCH_PFN_VALID && !KEXEC
        select ARCH_HAS_DEBUG_VIRTUAL if MMU
        select ARCH_HAS_DEVMEM_IS_ALLOWED
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FORTIFY_SOURCE
+       select ARCH_HAS_KEEPINITRD
        select ARCH_HAS_KCOV
        select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_PTE_SPECIAL if ARM_LPAE
@@ -21,6 +21,7 @@ config ARM
        select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_HAVE_CUSTOM_GPIO_H
        select ARCH_HAS_GCOV_PROFILE_ALL
+       select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC
        select ARCH_MIGHT_HAVE_PC_PARPORT
        select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN
        select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
index 43f46aa..0a75058 100644 (file)
@@ -1577,31 +1577,21 @@ static int __arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma
                    void *cpu_addr, dma_addr_t dma_addr, size_t size,
                    unsigned long attrs)
 {
-       unsigned long uaddr = vma->vm_start;
-       unsigned long usize = vma->vm_end - vma->vm_start;
        struct page **pages = __iommu_get_pages(cpu_addr, attrs);
        unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
-       unsigned long off = vma->vm_pgoff;
+       int err;
 
        if (!pages)
                return -ENXIO;
 
-       if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off)
+       if (vma->vm_pgoff >= nr_pages)
                return -ENXIO;
 
-       pages += off;
-
-       do {
-               int ret = vm_insert_page(vma, uaddr, *pages++);
-               if (ret) {
-                       pr_err("Remapping memory failed: %d\n", ret);
-                       return ret;
-               }
-               uaddr += PAGE_SIZE;
-               usize -= PAGE_SIZE;
-       } while (usize > 0);
+       err = vm_map_pages(vma, pages, nr_pages);
+       if (err)
+               pr_err("Remapping memory failed: %d\n", err);
 
-       return 0;
+       return err;
 }
 static int arm_iommu_mmap_attrs(struct device *dev,
                struct vm_area_struct *vma, void *cpu_addr,
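
The conversion above is the pattern repeated across the drivers touched later
in this series (xen, videobuf2, rockchip, ...): an mmap handler that used to
loop over vm_insert_page() with its own bounds checks becomes a single
vm_map_pages() call. A hedged sketch of the generic shape, with the buffer
structure being a placeholder:

    #include <linux/mm.h>

    /* Hypothetical driver buffer: a pre-allocated array of pages. */
    struct my_drv_buf {
            struct page **pages;
            unsigned long nr_pages;
    };

    static int my_drv_mmap(struct my_drv_buf *buf, struct vm_area_struct *vma)
    {
            /*
             * vm_map_pages() checks vma->vm_pgoff and the vma size against
             * nr_pages itself, so the open-coded checks and the
             * vm_insert_page() loop both go away.
             */
            return vm_map_pages(vma, buf->pages, buf->nr_pages);
    }
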
index c2daabb..68dcd5f 100644 (file)
@@ -695,27 +695,14 @@ void free_initmem(void)
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
-
-static int keep_initrd;
-
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-       if (!keep_initrd) {
-               if (start == initrd_start)
-                       start = round_down(start, PAGE_SIZE);
-               if (end == initrd_end)
-                       end = round_up(end, PAGE_SIZE);
+       if (start == initrd_start)
+               start = round_down(start, PAGE_SIZE);
+       if (end == initrd_end)
+               end = round_up(end, PAGE_SIZE);
 
-               poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
-               free_reserved_area((void *)start, (void *)end, -1, "initrd");
-       }
+       poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
+       free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
-
-static int __init keepinitrd_setup(char *__unused)
-{
-       keep_initrd = 1;
-       return 1;
-}
-
-__setup("keepinitrd", keepinitrd_setup);
 #endif
index 3f95744..69a59a5 100644 (file)
@@ -19,8 +19,9 @@ config ARM64
        select ARCH_HAS_FAST_MULTIPLIER
        select ARCH_HAS_FORTIFY_SOURCE
        select ARCH_HAS_GCOV_PROFILE_ALL
-       select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+       select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_HAS_KCOV
+       select ARCH_HAS_KEEPINITRD
        select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_PTE_SPECIAL
        select ARCH_HAS_SETUP_DMA_OPS
@@ -59,6 +60,7 @@ config ARM64
        select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT
        select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT
        select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT
+       select ARCH_KEEP_MEMBLOCK
        select ARCH_USE_CMPXCHG_LOCKREF
        select ARCH_USE_QUEUED_RWLOCKS
        select ARCH_USE_QUEUED_SPINLOCKS
index c6a07a3..4aad638 100644 (file)
@@ -70,8 +70,4 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
 
 #include <asm-generic/hugetlb.h>
 
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
-
 #endif /* __ASM_HUGETLB_H */
index 40e2d7e..007c05a 100644 (file)
@@ -578,24 +578,11 @@ void free_initmem(void)
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
-
-static int keep_initrd __initdata;
-
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-       if (!keep_initrd) {
-               free_reserved_area((void *)start, (void *)end, 0, "initrd");
-               memblock_free(__virt_to_phys(start), end - start);
-       }
-}
-
-static int __init keepinitrd_setup(char *__unused)
-{
-       keep_initrd = 1;
-       return 1;
+       free_reserved_area((void *)start, (void *)end, 0, "initrd");
+       memblock_free(__virt_to_phys(start), end - start);
 }
-
-__setup("keepinitrd", keepinitrd_setup);
 #endif
 
 /*
index ef82312..ef32d48 100644 (file)
@@ -1065,8 +1065,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-                   bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+                       struct mhp_restrictions *restrictions)
 {
        int flags = 0;
 
@@ -1077,6 +1077,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
                             size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
 
        return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
-                          altmap, want_memblock);
+                          restrictions);
 }
 #endif
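
For readers following the arch_add_memory() signature change through this
diff: the altmap pointer and the want_memblock flag now travel together in a
single struct mhp_restrictions argument. Paraphrased from memory of the
include/linux/memory_hotplug.h change in this series (consult the header for
the authoritative definition), the structure is roughly:

    struct mhp_restrictions {
            unsigned long flags;            /* e.g. MHP_MEMBLOCK_API */
            struct vmem_altmap *altmap;     /* alternative memmap allocator */
    };
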
index fe582c3..573242b 100644 (file)
@@ -68,15 +68,3 @@ void __init mem_init(void)
 
        mem_init_print_info(NULL);
 }
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void __init free_initmem(void)
-{
-       free_initmem_default(-1);
-}
index 0f04a5e..1eab16b 100644 (file)
@@ -102,17 +102,3 @@ void __init mem_init(void)
 
        mem_init_print_info(NULL);
 }
-
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void
-free_initmem(void)
-{
-       free_initmem_default(-1);
-}
index 3e54a53..b7d404b 100644 (file)
@@ -22,7 +22,6 @@ config HEXAGON
        select GENERIC_IRQ_SHOW
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_TRACEHOOK
-       select ARCH_DISCARD_MEMBLOCK
        select NEED_SG_DMA_LENGTH
        select NO_IOPORT_MAP
        select GENERIC_IOMAP
index 1719ede..41cf342 100644 (file)
@@ -84,16 +84,6 @@ void __init mem_init(void)
        init_mm.context.ptbase = __pa(init_mm.pgd);
 }
 
-/*
- * free_initmem - frees memory used by stuff declared with __init
- *
- * Todo:  free pages between __init_begin and __init_end; possibly
- * some devtree related stuff as well.
- */
-void __ref free_initmem(void)
-{
-}
-
 /*
  * free_initrd_mem - frees...  initrd memory.
  * @start - start of init memory
index 73a26f0..7468d8e 100644 (file)
@@ -33,7 +33,6 @@ config IA64
        select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB
        select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB
        select VIRT_TO_BUS
-       select ARCH_DISCARD_MEMBLOCK
        select GENERIC_IRQ_PROBE
        select GENERIC_PENDING_IRQ if SMP
        select GENERIC_IRQ_SHOW
index e49200e..d28e291 100644 (file)
@@ -666,14 +666,14 @@ mem_init (void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-               bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+                       struct mhp_restrictions *restrictions)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;
 
-       ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+       ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
        if (ret)
                printk("%s: Problem encountered in __add_pages() as ret=%d\n",
                       __func__,  ret);
@@ -682,20 +682,15 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+                       struct vmem_altmap *altmap)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        struct zone *zone;
-       int ret;
 
        zone = page_zone(pfn_to_page(start_pfn));
-       ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
-       if (ret)
-               pr_warn("%s: Problem encountered in __remove_pages() as"
-                       " ret=%d\n", __func__,  ret);
-
-       return ret;
+       __remove_pages(zone, start_pfn, nr_pages, altmap);
 }
 #endif
 #endif
index fe5cc2d..218e037 100644 (file)
@@ -26,7 +26,6 @@ config M68K
        select MODULES_USE_ELF_RELA
        select OLD_SIGSUSPEND3
        select OLD_SIGACTION
-       select ARCH_DISCARD_MEMBLOCK
        select MMU_GATHER_NO_RANGE if MMU
 
 config CPU_BIG_ENDIAN
index 8868a4c..778cacb 100644 (file)
@@ -147,10 +147,3 @@ void __init mem_init(void)
        init_pointer_tables();
        mem_init_print_info(NULL);
 }
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
index 7e97d44..a015a95 100644 (file)
@@ -186,18 +186,6 @@ void __init setup_memory(void)
        paging_init();
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void free_initmem(void)
-{
-       free_initmem_default(-1);
-}
-
 void __init mem_init(void)
 {
        high_memory = (void *)__va(memory_start + lowmem_size - 1);
index ff8cff9..677e5bf 100644 (file)
@@ -5,7 +5,6 @@ config MIPS
        select ARCH_32BIT_OFF_T if !64BIT
        select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
        select ARCH_CLOCKSOURCE_DATA
-       select ARCH_DISCARD_MEMBLOCK
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_HAS_UBSAN_SANITIZE_ALL
index 0d14e0d..4c2b448 100644 (file)
@@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
  * get_user_pages_fast() - pin user pages in memory
  * @start:     starting user address
  * @nr_pages:  number of pages from start to pin
- * @write:     whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
  * @pages:     array that receives pointers to the pages pinned.
  *             Should be at least nr_pages long.
  *
@@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
  * requested. If nr_pages is 0 or negative, returns 0. If no pages
  * were pinned, returns -errno.
  */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
-                       struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+                       unsigned int gup_flags, struct page **pages)
 {
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
@@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        goto slow;
-               if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+               if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+                                  pages, &nr))
                        goto slow;
        } while (pgdp++, addr = next, addr != end);
        local_irq_enable();
@@ -289,7 +290,7 @@ slow_irqon:
        pages += nr;
 
        ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
-                                     pages, write ? FOLL_WRITE : 0);
+                                     pages, gup_flags);
 
        /* Have to be a bit careful with return values */
        if (nr > 0) {
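
The prototype change documented above (the write bool becoming a gup_flags
bitmask) is mechanical for callers, as the KVM and driver hunks later in this
diff show. A minimal, hypothetical kernel-side sketch of pinning one page for
writing under the new signature:

    #include <linux/mm.h>

    /* Pin a single user page for writing, then release it (sketch only). */
    static int pin_one_page(unsigned long uaddr)
    {
            struct page *page;
            int ret;

            /* Old API: get_user_pages_fast(uaddr, 1, 1, &page); */
            ret = get_user_pages_fast(uaddr, 1, FOLL_WRITE, &page);
            if (ret != 1)
                    return ret < 0 ? ret : -EFAULT;

            /* ... access the pinned page ... */

            put_page(page);
            return 0;
    }
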
index bbb196a..8a038b3 100644 (file)
@@ -504,14 +504,6 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end)
        printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
-                          "initrd");
-}
-#endif
-
 void (*free_init_pages_eva)(void *begin, void *end) = NULL;
 
 void __ref free_initmem(void)
index 1d03633..1a4ab1b 100644 (file)
@@ -252,18 +252,6 @@ void __init mem_init(void)
        return;
 }
 
-void free_initmem(void)
-{
-       free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
 void __set_fixmap(enum fixed_addresses idx,
                               phys_addr_t phys, pgprot_t flags)
 {
index ea37394..26a9c76 100644 (file)
@@ -23,7 +23,6 @@ config NIOS2
        select SPARSE_IRQ
        select USB_ARCH_HAS_HCD if USB_SUPPORT
        select CPU_NO_EFFICIENT_FFS
-       select ARCH_DISCARD_MEMBLOCK
        select MMU_GATHER_NO_RANGE if MMU
 
 config GENERIC_CSUM
index 16cea57..2c609c2 100644 (file)
@@ -82,18 +82,6 @@ void __init mmu_init(void)
        flush_tlb_all();
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void __ref free_initmem(void)
-{
-       free_initmem_default(-1);
-}
-
 #define __page_aligned(order) __aligned(PAGE_SIZE << (order))
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER);
 pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned(PTE_ORDER);
index caeb418..abe87e5 100644 (file)
@@ -223,15 +223,3 @@ void __init mem_init(void)
        mem_init_done = 1;
        return;
 }
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void free_initmem(void)
-{
-       free_initmem_default(-1);
-}
index 3b0f9ea..11ec1f1 100644 (file)
@@ -917,10 +917,3 @@ void flush_tlb_all(void)
        spin_unlock(&sid_lock);
 }
 #endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
index d7996cf..8c1c636 100644 (file)
@@ -137,6 +137,7 @@ config PPC
        select ARCH_HAS_UBSAN_SANITIZE_ALL
        select ARCH_HAS_ZONE_DEVICE             if PPC_BOOK3S_64
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
+       select ARCH_KEEP_MEMBLOCK
        select ARCH_MIGHT_HAVE_PC_PARPORT
        select ARCH_MIGHT_HAVE_PC_SERIO
        select ARCH_OPTIONAL_KERNEL_RWX         if ARCH_HAS_STRICT_KERNEL_RWX
index 56140d1..12e150e 100644 (file)
@@ -36,8 +36,8 @@ static inline int hstate_get_psize(struct hstate *hstate)
        }
 }
 
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void)
+#define __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
+static inline bool gigantic_page_runtime_supported(void)
 {
        /*
         * We used gigantic page reservation with hypervisor assist in some case.
@@ -49,7 +49,6 @@ static inline bool gigantic_page_supported(void)
 
        return true;
 }
-#endif
 
 /* hugepd entry valid bit */
 #define HUGEPD_VAL_BITS                (0x8000000000000000UL)
index be7bc07..ab3d484 100644 (file)
@@ -600,7 +600,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
        /* If writing != 0, then the HPTE must allow writing, if we get here */
        write_ok = writing;
        hva = gfn_to_hva_memslot(memslot, gfn);
-       npages = get_user_pages_fast(hva, 1, writing, pages);
+       npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages);
        if (npages < 1) {
                /* Check if it's an I/O mapping */
                down_read(&current->mm->mmap_sem);
@@ -1193,7 +1193,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
                goto err;
        hva = gfn_to_hva_memslot(memslot, gfn);
-       npages = get_user_pages_fast(hva, 1, 1, pages);
+       npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages);
        if (npages < 1)
                goto err;
        page = pages[0];
index 24296f4..e0af53f 100644 (file)
@@ -783,7 +783,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
        if (!pages)
                return -ENOMEM;
 
-       ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
+       ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages);
        if (ret < 0)
                goto free_pages;
 
index 8330f13..5c521f3 100644 (file)
@@ -141,8 +141,9 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
        for (entry = 0; entry < entries; entry += chunk) {
                unsigned long n = min(entries - entry, chunk);
 
-               ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n,
-                               FOLL_WRITE, mem->hpages + entry, NULL);
+               ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
+                               FOLL_WRITE | FOLL_LONGTERM,
+                               mem->hpages + entry, NULL);
                if (ret == n) {
                        pinned += n;
                        continue;
index cd525d7..e885fe2 100644 (file)
@@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
        return -ENODEV;
 }
 
-int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-                         bool want_memblock)
+int __ref arch_add_memory(int nid, u64 start, u64 size,
+                       struct mhp_restrictions *restrictions)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -127,11 +127,11 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altm
        }
        flush_inval_dcache_range(start, start + size);
 
-       return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+       return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int __ref arch_remove_memory(int nid, u64 start, u64 size,
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
                             struct vmem_altmap *altmap)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
@@ -147,14 +147,13 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size,
        if (altmap)
                page += vmem_altmap_offset(altmap);
 
-       ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
-       if (ret)
-               return ret;
+       __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
 
        /* Remove htab bolted mappings for this section of memory */
        start = (unsigned long)__va(start);
        flush_inval_dcache_range(start, start + size);
        ret = remove_section_mapping(start, start + size);
+       WARN_ON_ONCE(ret);
 
        /* Ensure all vmalloc mappings are flushed in case they also
         * hit that section of memory
@@ -163,8 +162,6 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size,
 
        if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
                pr_warn("Hash collision while resizing HPT\n");
-
-       return ret;
 }
 #endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
@@ -338,13 +335,6 @@ void free_initmem(void)
        free_initmem_default(POISON_FREE_INITMEM);
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
 /*
  * This is called when a page has been modified by the kernel.
  * It just marks the page as not i-cache clean.  We do the i-cache
index d0e172d..2794235 100644 (file)
@@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
 config PPC_RADIX_MMU
        bool "Radix MMU Support"
        depends on PPC_BOOK3S_64 && HUGETLB_PAGE
-       select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+       select ARCH_HAS_GIGANTIC_PAGE
        select PPC_HAVE_KUEP
        select PPC_HAVE_KUAP
        default y
index bc7b77e..8bf6f9c 100644 (file)
@@ -66,11 +66,6 @@ void __init mem_init(void)
        mem_init_print_info(NULL);
 }
 
-void free_initmem(void)
-{
-       free_initmem_default(0);
-}
-
 #ifdef CONFIG_BLK_DEV_INITRD
 static void __init setup_initrd(void)
 {
index 0748558..109243f 100644 (file)
@@ -63,7 +63,7 @@ config S390
        select ARCH_HAS_ELF_RANDOMIZE
        select ARCH_HAS_FORTIFY_SOURCE
        select ARCH_HAS_GCOV_PROFILE_ALL
-       select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+       select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_HAS_KCOV
        select ARCH_HAS_PTE_SPECIAL
        select ARCH_HAS_SET_MEMORY
@@ -100,6 +100,7 @@ config S390
        select ARCH_INLINE_WRITE_UNLOCK_BH
        select ARCH_INLINE_WRITE_UNLOCK_IRQ
        select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+       select ARCH_KEEP_MEMBLOCK
        select ARCH_SAVE_PAGE_KEYS if HIBERNATION
        select ARCH_SUPPORTS_ATOMIC_RMW
        select ARCH_SUPPORTS_NUMA_BALANCING
index 2d1afa5..bb59dd9 100644 (file)
@@ -116,7 +116,9 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
        return pte_modify(pte, newprot);
 }
 
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
+static inline bool gigantic_page_runtime_supported(void)
+{
+       return true;
+}
+
 #endif /* _ASM_S390_HUGETLB_H */
index 37503ae..1fd706f 100644 (file)
@@ -2376,7 +2376,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
                ret = -EFAULT;
                goto out;
        }
-       ret = get_user_pages_fast(map->addr, 1, 1, &map->page);
+       ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
        if (ret < 0)
                goto out;
        BUG_ON(ret != 1);
index 7cf48ee..14d1eae 100644 (file)
@@ -157,14 +157,6 @@ void free_initmem(void)
        free_initmem_default(POISON_FREE_INITMEM);
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
-                          "initrd");
-}
-#endif
-
 unsigned long memory_block_size_bytes(void)
 {
        /*
@@ -227,8 +219,8 @@ device_initcall(s390_cma_mem_init);
 
 #endif /* CONFIG_CMA */
 
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-               bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+               struct mhp_restrictions *restrictions)
 {
        unsigned long start_pfn = PFN_DOWN(start);
        unsigned long size_pages = PFN_DOWN(size);
@@ -238,21 +230,22 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
        if (rc)
                return rc;
 
-       rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock);
+       rc = __add_pages(nid, start_pfn, size_pages, restrictions);
        if (rc)
                vmem_remove_mapping(start, size);
        return rc;
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+                       struct vmem_altmap *altmap)
 {
        /*
         * There is no hardware or firmware interface which could trigger a
         * hot memory remove on s390. So there is nothing that needs to be
         * implemented.
         */
-       return -EBUSY;
+       BUG();
 }
 #endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
index 0be08d5..b77f512 100644 (file)
@@ -10,7 +10,6 @@ config SUPERH
        select DMA_DECLARE_COHERENT
        select HAVE_IDE if HAS_IOPORT_MAP
        select HAVE_MEMBLOCK_NODE_MAP
-       select ARCH_DISCARD_MEMBLOCK
        select HAVE_OPROFILE
        select HAVE_ARCH_TRACEHOOK
        select HAVE_PERF_EVENTS
@@ -53,6 +52,7 @@ config SUPERH
        select HAVE_FUTEX_CMPXCHG if FUTEX
        select HAVE_NMI
        select NEED_SG_DMA_LENGTH
+       select ARCH_HAS_GIGANTIC_PAGE
 
        help
          The SuperH is a RISC processor targeted for use in embedded systems
index a929f76..cc06e4c 100644 (file)
@@ -10,7 +10,6 @@
  */
 #include <linux/irq.h>
 #include <linux/io.h>
-#include <linux/irq.h>
 #include <linux/export.h>
 #include <linux/err.h>
 #include <mach/sysasic.h>
index 3e27f6d..277c882 100644 (file)
@@ -204,7 +204,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
  * get_user_pages_fast() - pin user pages in memory
  * @start:     starting user address
  * @nr_pages:  number of pages from start to pin
- * @write:     whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
  * @pages:     array that receives pointers to the pages pinned.
  *             Should be at least nr_pages long.
  *
@@ -216,8 +216,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
  * requested. If nr_pages is 0 or negative, returns 0. If no pages
  * were pinned, returns -errno.
  */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
-                       struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+                       unsigned int gup_flags, struct page **pages)
 {
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
@@ -241,7 +241,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        goto slow;
-               if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+               if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+                                  pages, &nr))
                        goto slow;
        } while (pgdp++, addr = next, addr != end);
        local_irq_enable();
@@ -261,7 +262,7 @@ slow_irqon:
 
                ret = get_user_pages_unlocked(start,
                        (end - start) >> PAGE_SHIFT, pages,
-                       write ? FOLL_WRITE : 0);
+                       gup_flags);
 
                /* Have to be a bit careful with return values */
                if (nr > 0) {
index 7062132..b95e343 100644 (file)
@@ -403,28 +403,16 @@ void __init mem_init(void)
        mem_init_done = 1;
 }
 
-void free_initmem(void)
-{
-       free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-               bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+                       struct mhp_restrictions *restrictions)
 {
        unsigned long start_pfn = PFN_DOWN(start);
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;
 
        /* We only have ZONE_NORMAL, so this is easy.. */
-       ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+       ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
        if (unlikely(ret))
                printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
 
@@ -441,20 +429,15 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+                       struct vmem_altmap *altmap)
 {
        unsigned long start_pfn = PFN_DOWN(start);
        unsigned long nr_pages = size >> PAGE_SHIFT;
        struct zone *zone;
-       int ret;
 
        zone = page_zone(pfn_to_page(start_pfn));
-       ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
-       if (unlikely(ret))
-               pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
-                       ret);
-
-       return ret;
+       __remove_pages(zone, start_pfn, nr_pages, altmap);
 }
 #endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
index f6421c9..7c93f31 100644 (file)
@@ -92,6 +92,7 @@ config SPARC64
        select ARCH_CLOCKSOURCE_DATA
        select ARCH_HAS_PTE_SPECIAL
        select PCI_DOMAINS if PCI
+       select ARCH_HAS_GIGANTIC_PAGE
 
 config ARCH_DEFCONFIG
        string
index 1393a8a..22500c3 100644 (file)
@@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS;
 extern struct page *mem_map_zero;
 #define ZERO_PAGE(vaddr)       (mem_map_zero)
 
-/* This macro must be updated when the size of struct page grows above 80
- * or reduces below 64.
- * The idea that compiler optimizes out switch() statement, and only
- * leaves clrx instructions
- */
-#define        mm_zero_struct_page(pp) do {                                    \
-       unsigned long *_pp = (void *)(pp);                              \
-                                                                       \
-        /* Check that struct page is either 64, 72, or 80 bytes */     \
-       BUILD_BUG_ON(sizeof(struct page) & 7);                          \
-       BUILD_BUG_ON(sizeof(struct page) < 64);                         \
-       BUILD_BUG_ON(sizeof(struct page) > 80);                         \
-                                                                       \
-       switch (sizeof(struct page)) {                                  \
-       case 80:                                                        \
-               _pp[9] = 0;     /* fallthrough */                       \
-       case 72:                                                        \
-               _pp[8] = 0;     /* fallthrough */                       \
-       default:                                                        \
-               _pp[7] = 0;                                             \
-               _pp[6] = 0;                                             \
-               _pp[5] = 0;                                             \
-               _pp[4] = 0;                                             \
-               _pp[3] = 0;                                             \
-               _pp[2] = 0;                                             \
-               _pp[1] = 0;                                             \
-               _pp[0] = 0;                                             \
-       }                                                               \
-} while (0)
-
 /* PFNs are real physical page numbers.  However, mem_map only begins to record
  * per-page information starting at pfn_base.  This is to handle systems where
  * the first physical page in the machine is at some huge physical address,
index aee6dba..1e770a5 100644 (file)
@@ -245,8 +245,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
        return nr;
 }
 
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
-                       struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+                       unsigned int gup_flags, struct page **pages)
 {
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
@@ -303,7 +303,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        goto slow;
-               if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+               if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+                                  pages, &nr))
                        goto slow;
        } while (pgdp++, addr = next, addr != end);
 
@@ -324,7 +325,7 @@ slow:
 
                ret = get_user_pages_unlocked(start,
                        (end - start) >> PAGE_SHIFT, pages,
-                       write ? FOLL_WRITE : 0);
+                       gup_flags);
 
                /* Have to be a bit careful with return values */
                if (nr > 0) {
index a8ff298..046ab11 100644 (file)
@@ -294,19 +294,6 @@ void __init mem_init(void)
        mem_init_print_info(NULL);
 }
 
-void free_initmem (void)
-{
-       free_initmem_default(POISON_FREE_INITMEM);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
-                          "initrd");
-}
-#endif
-
 void sparc_flush_page_to_ram(struct page *page)
 {
        unsigned long vaddr = (unsigned long)page_address(page);
index bc2aaa4..4b099dd 100644 (file)
@@ -2572,14 +2572,6 @@ void free_initmem(void)
        }
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
-                          "initrd");
-}
-#endif
-
 pgprot_t PAGE_KERNEL __read_mostly;
 EXPORT_SYMBOL(PAGE_KERNEL);
 
index 99aa11b..a9c9a94 100644 (file)
@@ -188,13 +188,6 @@ void free_initmem(void)
 {
 }
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
 /* Allocate and free page tables. */
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
index 2445dfc..afe4949 100644 (file)
@@ -3,6 +3,7 @@ config UNICORE32
        def_bool y
        select ARCH_32BIT_OFF_T
        select ARCH_HAS_DEVMEM_IS_ALLOWED
+       select ARCH_HAS_KEEPINITRD
        select ARCH_MIGHT_HAVE_PC_PARPORT
        select ARCH_MIGHT_HAVE_PC_SERIO
        select HAVE_KERNEL_GZIP
index 74b6a2e..b4442f3 100644 (file)
@@ -287,27 +287,3 @@ void __init mem_init(void)
                sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
        }
 }
-
-void free_initmem(void)
-{
-       free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-
-static int keep_initrd;
-
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-       if (!keep_initrd)
-               free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-
-static int __init keepinitrd_setup(char *__unused)
-{
-       keep_initrd = 1;
-       return 1;
-}
-
-__setup("keepinitrd", keepinitrd_setup);
-#endif
index e721273..818b361 100644 (file)
@@ -22,7 +22,7 @@ config X86_64
        def_bool y
        depends on 64BIT
        # Options that are inherently 64-bit kernel only:
-       select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+       select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_SUPPORTS_INT128
        select ARCH_USE_CMPXCHG_LOCKREF
        select HAVE_ARCH_SOFT_DIRTY
@@ -47,7 +47,6 @@ config X86
        select ARCH_32BIT_OFF_T                 if X86_32
        select ARCH_CLOCKSOURCE_DATA
        select ARCH_CLOCKSOURCE_INIT
-       select ARCH_DISCARD_MEMBLOCK
        select ARCH_HAS_ACPI_TABLE_UPGRADE      if ACPI
        select ARCH_HAS_DEBUG_VIRTUAL
        select ARCH_HAS_DEVMEM_IS_ALLOWED
index 7469d32..f65cfb4 100644 (file)
@@ -17,8 +17,4 @@ static inline void arch_clear_hugepage_flags(struct page *page)
 {
 }
 
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
-
 #endif /* _ASM_X86_HUGETLB_H */
index 6bdca39..0871503 100644 (file)
@@ -140,7 +140,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
        pt_element_t *table;
        struct page *page;
 
-       npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
+       npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
        /* Check if the user is doing something meaningless. */
        if (unlikely(npages != 1))
                return -EFAULT;
index 406b558..6b92eaf 100644 (file)
@@ -1805,7 +1805,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
                return NULL;
 
        /* Pin the user virtual address. */
-       npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+       npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
        if (npinned != npages) {
                pr_err("SEV: Failure locking %lu pages.\n", npages);
                goto err;
index 92e4c4b..fab0953 100644 (file)
@@ -203,7 +203,7 @@ static __init int setup_hugepagesz(char *opt)
 }
 __setup("hugepagesz=", setup_hugepagesz);
 
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
+#ifdef CONFIG_CONTIG_ALLOC
 static __init int gigantic_pages_init(void)
 {
        /* With compaction or CMA we can allocate gigantic pages at runtime */
index 85c94f9..075e568 100644 (file)
@@ -850,24 +850,25 @@ void __init mem_init(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-               bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+                       struct mhp_restrictions *restrictions)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
 
-       return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+       return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+                       struct vmem_altmap *altmap)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        struct zone *zone;
 
        zone = page_zone(pfn_to_page(start_pfn));
-       return __remove_pages(zone, start_pfn, nr_pages, altmap);
+       __remove_pages(zone, start_pfn, nr_pages, altmap);
 }
 #endif
 #endif
index bccff68..20d1425 100644 (file)
@@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
 }
 
 int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
-               struct vmem_altmap *altmap, bool want_memblock)
+                               struct mhp_restrictions *restrictions)
 {
        int ret;
 
-       ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+       ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
        WARN_ON_ONCE(ret);
 
        /* update max_pfn, max_low_pfn and high_memory */
@@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
        return ret;
 }
 
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-               bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+                       struct mhp_restrictions *restrictions)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
 
        init_memory_mapping(start, start + size);
 
-       return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+       return add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
 #define PAGE_INUSE 0xFD
@@ -1141,24 +1141,20 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
        remove_pagetable(start, end, true, NULL);
 }
 
-int __ref arch_remove_memory(int nid, u64 start, u64 size,
-                               struct vmem_altmap *altmap)
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
+                             struct vmem_altmap *altmap)
 {
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        struct page *page = pfn_to_page(start_pfn);
        struct zone *zone;
-       int ret;
 
        /* With altmap the first mapped page is offset from @start */
        if (altmap)
                page += vmem_altmap_offset(altmap);
        zone = page_zone(page);
-       ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
-       WARN_ON_ONCE(ret);
+       __remove_pages(zone, start_pfn, nr_pages, altmap);
        kernel_physical_mapping_remove(start, start + size);
-
-       return ret;
 }
 #endif
 #endif /* CONFIG_MEMORY_HOTPLUG */
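
A hedged aside on the new calling convention: the altmap pointer and the old want_memblock boolean are folded into struct mhp_restrictions, so a hotplug caller is assumed to build the restrictions once and pass them down. The field and flag names below (flags, altmap, MHP_MEMBLOCK_API) are inferred from the converted callees and are not shown in these hunks.

struct mhp_restrictions restrictions = {
	.flags	= MHP_MEMBLOCK_API,	/* assumed: previously want_memblock == true */
	.altmap	= altmap,		/* previously a bare parameter */
};

ret = arch_add_memory(nid, start, size, &restrictions);
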
index d498610..b51746f 100644 (file)
@@ -216,11 +216,6 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-void free_initmem(void)
-{
-       free_initmem_default(-1);
-}
-
 static void __init parse_memmap_one(char *p)
 {
        char *oldp;
index e49028a..f180427 100644 (file)
@@ -231,13 +231,14 @@ static bool pages_correctly_probed(unsigned long start_pfn)
  * OK to have direct references to sparsemem variables in here.
  */
 static int
-memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
+memory_block_action(unsigned long start_section_nr, unsigned long action,
+                   int online_type)
 {
        unsigned long start_pfn;
        unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
        int ret;
 
-       start_pfn = section_nr_to_pfn(phys_index);
+       start_pfn = section_nr_to_pfn(start_section_nr);
 
        switch (action) {
        case MEM_ONLINE:
@@ -251,7 +252,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
                break;
        default:
                WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
-                    "%ld\n", __func__, phys_index, action, action);
+                    "%ld\n", __func__, start_section_nr, action, action);
                ret = -EINVAL;
        }
 
@@ -733,16 +734,18 @@ unregister_memory(struct memory_block *memory)
 {
        BUG_ON(memory->dev.bus != &memory_subsys);
 
-       /* drop the ref. we got in remove_memory_section() */
+       /* drop the ref. we got via find_memory_block() */
        put_device(&memory->dev);
        device_unregister(&memory->dev);
 }
 
-static int remove_memory_section(unsigned long node_id,
-                              struct mem_section *section, int phys_device)
+void unregister_memory_section(struct mem_section *section)
 {
        struct memory_block *mem;
 
+       if (WARN_ON_ONCE(!present_section(section)))
+               return;
+
        mutex_lock(&mem_sysfs_mutex);
 
        /*
@@ -763,15 +766,6 @@ static int remove_memory_section(unsigned long node_id,
 
 out_unlock:
        mutex_unlock(&mem_sysfs_mutex);
-       return 0;
-}
-
-int unregister_memory_section(struct mem_section *section)
-{
-       if (!present_section(section))
-               return -EINVAL;
-
-       return remove_memory_section(0, section, 0);
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
index e428468..996d68f 100644 (file)
@@ -184,8 +184,7 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 
        *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-       return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
-                       vmf->flags & FAULT_FLAG_WRITE);
+       return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
@@ -235,8 +234,7 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 
        *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-       return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
-                       vmf->flags & FAULT_FLAG_WRITE);
+       return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
 }
 #else
 static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
index 35e784c..5414eb1 100644 (file)
@@ -107,19 +107,8 @@ EXPORT_SYMBOL(fw_iso_buffer_init);
 int fw_iso_buffer_map_vma(struct fw_iso_buffer *buffer,
                          struct vm_area_struct *vma)
 {
-       unsigned long uaddr;
-       int i, err;
-
-       uaddr = vma->vm_start;
-       for (i = 0; i < buffer->page_count; i++) {
-               err = vm_insert_page(vma, uaddr, buffer->pages[i]);
-               if (err)
-                       return err;
-
-               uaddr += PAGE_SIZE;
-       }
-
-       return 0;
+       return vm_map_pages_zero(vma, buffer->pages,
+                                       buffer->page_count);
 }
 
 void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer,
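
The vm_insert_page() loops removed here and in the DRM, media and Xen hunks below all collapse to the same call; a hedged sketch of the resulting mmap handler, using a hypothetical my_buf that carries a pages[] array and a page count:

struct my_buf {
	struct page **pages;
	unsigned long page_count;
};

static int my_drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct my_buf *buf = file->private_data;

	/*
	 * vm_map_pages() honours vma->vm_pgoff as an offset into pages[];
	 * vm_map_pages_zero() ignores it and always maps from the first
	 * page, which is what the firewire conversion above wants.
	 */
	return vm_map_pages_zero(vma, buf->pages, buf->page_count);
}
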
index e18a786..c438722 100644 (file)
@@ -102,7 +102,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
                goto unlock_vm;
        }
 
-       pinned = get_user_pages_fast(region->user_addr, npages, 1,
+       pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE,
                                     region->pages);
        if (pinned < 0) {
                ret = pinned;
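
The remaining get_user_pages_fast() conversions in this series follow the same shape: the third argument becomes a gup_flags bitmask instead of a write boolean. A hedged sketch with illustrative identifiers:

static int my_pin_user_buffer(unsigned long uaddr, int npages,
			      bool writable, struct page **pages)
{
	unsigned int gup_flags = writable ? FOLL_WRITE : 0;
	int pinned;

	pinned = get_user_pages_fast(uaddr, npages, gup_flags, pages);
	if (pinned < 0)
		return pinned;			/* hard error */
	if (pinned != npages) {
		/* partial pin: release what we got and report failure */
		while (pinned--)
			put_page(pages[pinned]);
		return -EFAULT;
	}
	return 0;
}
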
index 3e6823f..58ed401 100644 (file)
@@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
        /* TODO we should be able to split locking for interval tree and
         * amdgpu_mn_invalidate_node
         */
-       if (amdgpu_mn_read_lock(amn, range->blockable))
+       if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
                return -EAGAIN;
 
        it = interval_tree_iter_first(&amn->objects, range->start, end);
        while (it) {
                struct amdgpu_mn_node *node;
 
-               if (!range->blockable) {
+               if (!mmu_notifier_range_blockable(range)) {
                        amdgpu_mn_read_unlock(amn);
                        return -EAGAIN;
                }
@@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
        /* notification is exclusive, but interval is inclusive */
        end = range->end - 1;
 
-       if (amdgpu_mn_read_lock(amn, range->blockable))
+       if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
                return -EAGAIN;
 
        it = interval_tree_iter_first(&amn->objects, range->start, end);
@@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
                struct amdgpu_mn_node *node;
                struct amdgpu_bo *bo;
 
-               if (!range->blockable) {
+               if (!mmu_notifier_range_blockable(range)) {
                        amdgpu_mn_read_unlock(amn);
                        return -EAGAIN;
                }
index 215bf3f..8079ea3 100644 (file)
@@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
        while (it) {
                struct drm_i915_gem_object *obj;
 
-               if (!range->blockable) {
+               if (!mmu_notifier_range_blockable(range)) {
                        ret = -EAGAIN;
                        break;
                }
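
The notifier conversions here and in the radeon, umem_odp and gntdev hunks all read blockability through the helper instead of range->blockable. A hedged sketch with a hypothetical my_mn wrapper:

struct my_mn {
	struct mmu_notifier mn;
	struct mutex lock;
};

static int my_invalidate_range_start(struct mmu_notifier *mn,
				     const struct mmu_notifier_range *range)
{
	struct my_mn *m = container_of(mn, struct my_mn, mn);

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&m->lock);
	else if (!mutex_trylock(&m->lock))
		return -EAGAIN;		/* non-blocking caller must retry */

	/* ... tear down mappings in [range->start, range->end) ... */

	mutex_unlock(&m->lock);
	return 0;
}
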
index b301950..c9bd127 100644 (file)
@@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
        /* TODO we should be able to split locking for interval tree and
         * the tear down.
         */
-       if (range->blockable)
+       if (mmu_notifier_range_blockable(range))
                mutex_lock(&rmn->lock);
        else if (!mutex_trylock(&rmn->lock))
                return -EAGAIN;
@@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
                struct radeon_bo *bo;
                long r;
 
-               if (!range->blockable) {
+               if (!mmu_notifier_range_blockable(range)) {
                        ret = -EAGAIN;
                        goto out_unlock;
                }
index a8db758..a2ebb08 100644 (file)
@@ -221,26 +221,13 @@ static int rockchip_drm_gem_object_mmap_iommu(struct drm_gem_object *obj,
                                              struct vm_area_struct *vma)
 {
        struct rockchip_gem_object *rk_obj = to_rockchip_obj(obj);
-       unsigned int i, count = obj->size >> PAGE_SHIFT;
+       unsigned int count = obj->size >> PAGE_SHIFT;
        unsigned long user_count = vma_pages(vma);
-       unsigned long uaddr = vma->vm_start;
-       unsigned long offset = vma->vm_pgoff;
-       unsigned long end = user_count + offset;
-       int ret;
 
        if (user_count == 0)
                return -ENXIO;
-       if (end > count)
-               return -ENXIO;
 
-       for (i = offset; i < end; i++) {
-               ret = vm_insert_page(vma, uaddr, rk_obj->pages[i]);
-               if (ret)
-                       return ret;
-               uaddr += PAGE_SIZE;
-       }
-
-       return 0;
+       return vm_map_pages(vma, rk_obj->pages, count);
 }
 
 static int rockchip_drm_gem_object_mmap_dma(struct drm_gem_object *obj,
index 8bf3a7c..0620674 100644 (file)
@@ -243,7 +243,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg,  drm_via_dmablit_t *xfer)
        if (NULL == vsg->pages)
                return -ENOMEM;
        ret = get_user_pages_fast((unsigned long)xfer->mem_addr,
-                       vsg->num_pages, vsg->direction == DMA_FROM_DEVICE,
+                       vsg->num_pages,
+                       vsg->direction == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
                        vsg->pages);
        if (ret != vsg->num_pages) {
                if (ret < 0)
index 53c376d..a245484 100644 (file)
@@ -224,8 +224,7 @@ xen_drm_front_gem_import_sg_table(struct drm_device *dev,
 static int gem_mmap_obj(struct xen_gem_object *xen_obj,
                        struct vm_area_struct *vma)
 {
-       unsigned long addr = vma->vm_start;
-       int i;
+       int ret;
 
        /*
         * clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the
@@ -252,18 +251,11 @@ static int gem_mmap_obj(struct xen_gem_object *xen_obj,
         * FIXME: as we insert all the pages now then no .fault handler must
         * be called, so don't provide one
         */
-       for (i = 0; i < xen_obj->num_pages; i++) {
-               int ret;
-
-               ret = vm_insert_page(vma, addr, xen_obj->pages[i]);
-               if (ret < 0) {
-                       DRM_ERROR("Failed to insert pages into vma: %d\n", ret);
-                       return ret;
-               }
+       ret = vm_map_pages(vma, xen_obj->pages, xen_obj->num_pages);
+       if (ret < 0)
+               DRM_ERROR("Failed to map pages into vma: %d\n", ret);
 
-               addr += PAGE_SIZE;
-       }
-       return 0;
+       return ret;
 }
 
 int xen_drm_front_gem_mmap(struct file *filp, struct vm_area_struct *vma)
index 0a23048..e7ea819 100644 (file)
@@ -295,10 +295,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 
        while (npages) {
                down_read(&mm->mmap_sem);
-               ret = get_user_pages_longterm(cur_base,
+               ret = get_user_pages(cur_base,
                                     min_t(unsigned long, npages,
                                           PAGE_SIZE / sizeof (struct page *)),
-                                    gup_flags, page_list, NULL);
+                                    gup_flags | FOLL_LONGTERM,
+                                    page_list, NULL);
                if (ret < 0) {
                        up_read(&mm->mmap_sem);
                        goto umem_release;
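
get_user_pages_longterm() is removed throughout this series in favour of plain get_user_pages() with FOLL_LONGTERM or'ed into the flags; a hedged sketch (identifiers illustrative):

static long my_pin_longterm(unsigned long uaddr, unsigned long npages,
			    unsigned int gup_flags, struct page **pages)
{
	long ret;

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(uaddr, npages, gup_flags | FOLL_LONGTERM,
			     pages, NULL);
	up_read(&current->mm->mmap_sem);
	return ret;
}
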
index c7226cf..f962b5b 100644 (file)
@@ -152,7 +152,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
        struct ib_ucontext_per_mm *per_mm =
                container_of(mn, struct ib_ucontext_per_mm, mn);
 
-       if (range->blockable)
+       if (mmu_notifier_range_blockable(range))
                down_read(&per_mm->umem_rwsem);
        else if (!down_read_trylock(&per_mm->umem_rwsem))
                return -EAGAIN;
@@ -170,7 +170,8 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
        return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
                                             range->end,
                                             invalidate_range_start_trampoline,
-                                            range->blockable, NULL);
+                                            mmu_notifier_range_blockable(range),
+                                            NULL);
 }
 
 static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
index 24b592c..02eee8e 100644 (file)
@@ -104,8 +104,9 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
                            bool writable, struct page **pages)
 {
        int ret;
+       unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0);
 
-       ret = get_user_pages_fast(vaddr, npages, writable, pages);
+       ret = get_user_pages_fast(vaddr, npages, gup_flags, pages);
        if (ret < 0)
                return ret;
 
index 112d2f3..8ff0e90 100644 (file)
@@ -472,7 +472,8 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
                goto out;
        }
 
-       ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages);
+       ret = get_user_pages_fast(uaddr & PAGE_MASK, 1,
+                                 FOLL_WRITE | FOLL_LONGTERM, pages);
        if (ret < 0)
                goto out;
 
index 123ca8f..f712fb7 100644 (file)
@@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,
 
        down_read(&current->mm->mmap_sem);
        for (got = 0; got < num_pages; got += ret) {
-               ret = get_user_pages_longterm(start_page + got * PAGE_SIZE,
-                                             num_pages - got,
-                                             FOLL_WRITE | FOLL_FORCE,
-                                             p + got, NULL);
+               ret = get_user_pages(start_page + got * PAGE_SIZE,
+                                    num_pages - got,
+                                    FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE,
+                                    p + got, NULL);
                if (ret < 0) {
                        up_read(&current->mm->mmap_sem);
                        goto bail_release;
index ef19d39..0c20477 100644 (file)
@@ -670,7 +670,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
                else
                        j = npages;
 
-               ret = get_user_pages_fast(addr, j, 0, pages);
+               ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages);
                if (ret != j) {
                        i = 0;
                        j = ret;
index da35d6f..e312f52 100644 (file)
@@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
        ret = 0;
 
        while (npages) {
-               ret = get_user_pages_longterm(cur_base,
-                                       min_t(unsigned long, npages,
-                                       PAGE_SIZE / sizeof(struct page *)),
-                                       gup_flags, page_list, NULL);
+               ret = get_user_pages(cur_base,
+                                    min_t(unsigned long, npages,
+                                    PAGE_SIZE / sizeof(struct page *)),
+                                    gup_flags | FOLL_LONGTERM,
+                                    page_list, NULL);
 
                if (ret < 0)
                        goto out;
index 77aabe6..20abd19 100644 (file)
@@ -619,17 +619,7 @@ out_free_pages:
 
 int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma)
 {
-       unsigned long uaddr = vma->vm_start;
-       unsigned int i, count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-       int ret = -ENXIO;
-
-       for (i = vma->vm_pgoff; i < count && uaddr < vma->vm_end; i++) {
-               ret = vm_insert_page(vma, uaddr, pages[i]);
-               if (ret)
-                       break;
-               uaddr += PAGE_SIZE;
-       }
-       return ret;
+       return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
 }
 
 static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
index 7ebd58a..3cf25ab 100644 (file)
@@ -2201,6 +2201,13 @@ int vb2_mmap(struct vb2_queue *q, struct vm_area_struct *vma)
                goto unlock;
        }
 
+       /*
+        * vm_pgoff is treated in V4L2 API as a 'cookie' to select a buffer,
+        * not as an in-buffer offset. We always want to mmap a whole buffer
+        * from its beginning.
+        */
+       vma->vm_pgoff = 0;
+
        ret = call_memop(vb, mmap, vb->planes[plane].mem_priv, vma);
 
 unlock:
index 82389ae..ecbef26 100644 (file)
@@ -186,12 +186,6 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma)
                return -EINVAL;
        }
 
-       /*
-        * dma_mmap_* uses vm_pgoff as in-buffer offset, but we want to
-        * map whole buffer
-        */
-       vma->vm_pgoff = 0;
-
        ret = dma_mmap_attrs(buf->dev, vma, buf->cookie,
                buf->dma_addr, buf->size, buf->attrs);
 
index 270c316..4a4c49d 100644 (file)
@@ -328,28 +328,18 @@ static unsigned int vb2_dma_sg_num_users(void *buf_priv)
 static int vb2_dma_sg_mmap(void *buf_priv, struct vm_area_struct *vma)
 {
        struct vb2_dma_sg_buf *buf = buf_priv;
-       unsigned long uaddr = vma->vm_start;
-       unsigned long usize = vma->vm_end - vma->vm_start;
-       int i = 0;
+       int err;
 
        if (!buf) {
                printk(KERN_ERR "No memory to map\n");
                return -EINVAL;
        }
 
-       do {
-               int ret;
-
-               ret = vm_insert_page(vma, uaddr, buf->pages[i++]);
-               if (ret) {
-                       printk(KERN_ERR "Remapping memory, error: %d\n", ret);
-                       return ret;
-               }
-
-               uaddr += PAGE_SIZE;
-               usize -= PAGE_SIZE;
-       } while (usize > 0);
-
+       err = vm_map_pages(vma, buf->pages, buf->num_pages);
+       if (err) {
+               printk(KERN_ERR "Remapping memory, error: %d\n", err);
+               return err;
+       }
 
        /*
         * Use common vm_area operations to track buffer refcount.
index 08929c0..870a2a5 100644 (file)
@@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
        dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
                data, size, dma->nr_pages);
 
-       err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages,
-                            flags, dma->pages, NULL);
+       err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
+                            flags | FOLL_LONGTERM, dma->pages, NULL);
 
        if (err != dma->nr_pages) {
                dma->nr_pages = (err >= 0) ? err : 0;
-               dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err,
+               dprintk(1, "get_user_pages: err=%d [%d]\n", err,
                        dma->nr_pages);
                return err < 0 ? err : -EINVAL;
        }
index 25265fd..89cff9d 100644 (file)
@@ -603,7 +603,7 @@ int genwqe_user_vmap(struct genwqe_dev *cd, struct dma_mapping *m, void *uaddr,
        /* pin user pages in memory */
        rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */
                                 m->nr_pages,
-                                m->write,              /* readable/writable */
+                                m->write ? FOLL_WRITE : 0,     /* readable/writable */
                                 m->page_list); /* ptrs to pages */
        if (rc < 0)
                goto fail_get_user_pages;
index 997f925..422d08d 100644 (file)
@@ -242,7 +242,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context,
        /*
         * Lock physical page backing a given user VA.
         */
-       retval = get_user_pages_fast(uva, 1, 1, &context->notify_page);
+       retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page);
        if (retval != 1) {
                context->notify_page = NULL;
                return VMCI_ERROR_GENERIC;
index f5f1aac..1174735 100644 (file)
@@ -659,7 +659,8 @@ static int qp_host_get_user_memory(u64 produce_uva,
        int err = VMCI_SUCCESS;
 
        retval = get_user_pages_fast((uintptr_t) produce_uva,
-                                    produce_q->kernel_if->num_pages, 1,
+                                    produce_q->kernel_if->num_pages,
+                                    FOLL_WRITE,
                                     produce_q->kernel_if->u.h.header_page);
        if (retval < (int)produce_q->kernel_if->num_pages) {
                pr_debug("get_user_pages_fast(produce) failed (retval=%d)",
@@ -671,7 +672,8 @@ static int qp_host_get_user_memory(u64 produce_uva,
        }
 
        retval = get_user_pages_fast((uintptr_t) consume_uva,
-                                    consume_q->kernel_if->num_pages, 1,
+                                    consume_q->kernel_if->num_pages,
+                                    FOLL_WRITE,
                                     consume_q->kernel_if->u.h.header_page);
        if (retval < (int)consume_q->kernel_if->num_pages) {
                pr_debug("get_user_pages_fast(consume) failed (retval=%d)",
index 321bc67..cef0133 100644 (file)
@@ -274,7 +274,8 @@ static int pin_user_pages(unsigned long first_page,
                *iter_last_page_size = last_page_size;
        }
 
-       ret = get_user_pages_fast(first_page, requested_pages, !is_write,
+       ret = get_user_pages_fast(first_page, requested_pages,
+                                 !is_write ? FOLL_WRITE : 0,
                                  pages);
        if (ret <= 0)
                return -EFAULT;
index 1e1f42e..4a4a75f 100644 (file)
@@ -868,7 +868,9 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
 
                pinned = get_user_pages_fast(
                                (unsigned long)xfer->loc_addr & PAGE_MASK,
-                               nr_pages, dir == DMA_FROM_DEVICE, page_list);
+                               nr_pages,
+                               dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
+                               page_list);
 
                if (pinned != nr_pages) {
                        if (pinned < 0) {
index acd9ba4..8090dc9 100644 (file)
@@ -437,7 +437,7 @@ static int dax_lock_page(void *va, struct page **p)
 
        dax_dbg("uva %p", va);
 
-       ret = get_user_pages_fast((unsigned long)va, 1, 1, p);
+       ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p);
        if (ret == 1) {
                dax_dbg("locked page %p, for VA %p", *p, va);
                return 0;
index 19c022e..3c6a18a 100644 (file)
@@ -4922,7 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
 
         /* Try to fault in all of the necessary pages */
         /* rw==READ means read from drive, write into memory area */
-       res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages);
+       res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0,
+                                 pages);
 
        /* Errors and no page mapped should return here */
        if (res < nr_pages)
index 600928f..d35c4fb 100644 (file)
@@ -486,8 +486,8 @@ static int gasket_perform_mapping(struct gasket_page_table *pg_tbl,
                        ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr +
                                           off + i * PAGE_SIZE;
                } else {
-                       ret = get_user_pages_fast(page_addr - offset, 1, 1,
-                                                 &page);
+                       ret = get_user_pages_fast(page_addr - offset, 1,
+                                                 FOLL_WRITE, &page);
 
                        if (ret <= 0) {
                                dev_err(pg_tbl->device,
index 0b9ab1d..49fd731 100644 (file)
@@ -273,7 +273,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
                goto err;
        }
 
-       rc = get_user_pages_fast(start, num_pages, 1, shm->pages);
+       rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages);
        if (rc > 0)
                shm->num_pages = rc;
        if (rc != num_pages) {
index 6b64e45..40ddc0c 100644 (file)
@@ -532,7 +532,8 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
        enum dma_data_direction direction = iommu_tce_direction(tce);
 
        if (get_user_pages_fast(tce & PAGE_MASK, 1,
-                       direction != DMA_TO_DEVICE, &page) != 1)
+                       direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
+                       &page) != 1)
                return -EFAULT;
 
        *hpa = __pa((unsigned long) page_address(page));
index 3be1db3..3ddc375 100644 (file)
@@ -358,7 +358,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 
        down_read(&mm->mmap_sem);
        if (mm == current->mm) {
-               ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas);
+               ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
+                                    vmas);
        } else {
                ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
                                            vmas, NULL);
index 351af88..1e3ed41 100644 (file)
@@ -1704,7 +1704,7 @@ static int set_bit_to_user(int nr, void __user *addr)
        int bit = nr + (log % PAGE_SIZE) * 8;
        int r;
 
-       r = get_user_pages_fast(log, 1, 1, &page);
+       r = get_user_pages_fast(log, 1, FOLL_WRITE, &page);
        if (r < 0)
                return r;
        BUG_ON(r != 1);
index dfed532..4e4d6a0 100644 (file)
@@ -686,7 +686,7 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf,
        if (!pages)
                return -ENOMEM;
 
-       ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages);
+       ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages);
        if (ret < nr_pages) {
                nr_pages = ret;
                ret = -EINVAL;
index 8ba726e..6446bca 100644 (file)
@@ -244,7 +244,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
 
        /* Get the physical addresses of the source buffer */
        num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset,
-               num_pages, param.source != -1, pages);
+               num_pages, param.source != -1 ? FOLL_WRITE : 0, pages);
 
        if (num_pinned != num_pages) {
                /* get_user_pages() failed */
index 7cf9c51..469dfbd 100644 (file)
@@ -526,20 +526,20 @@ static int mn_invl_range_start(struct mmu_notifier *mn,
        struct gntdev_grant_map *map;
        int ret = 0;
 
-       if (range->blockable)
+       if (mmu_notifier_range_blockable(range))
                mutex_lock(&priv->lock);
        else if (!mutex_trylock(&priv->lock))
                return -EAGAIN;
 
        list_for_each_entry(map, &priv->maps, next) {
                ret = unmap_if_in_range(map, range->start, range->end,
-                                       range->blockable);
+                                       mmu_notifier_range_blockable(range));
                if (ret)
                        goto out_unlock;
        }
        list_for_each_entry(map, &priv->freeable_maps, next) {
                ret = unmap_if_in_range(map, range->start, range->end,
-                                       range->blockable);
+                                       mmu_notifier_range_blockable(range));
                if (ret)
                        goto out_unlock;
        }
@@ -852,7 +852,7 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
        unsigned long xen_pfn;
        int ret;
 
-       ret = get_user_pages_fast(addr, 1, writeable, &page);
+       ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page);
        if (ret < 0)
                return ret;
 
@@ -1084,7 +1084,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
        int index = vma->vm_pgoff;
        int count = vma_pages(vma);
        struct gntdev_grant_map *map;
-       int i, err = -EINVAL;
+       int err = -EINVAL;
 
        if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
                return -EINVAL;
@@ -1145,12 +1145,9 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
                goto out_put_map;
 
        if (!use_ptemod) {
-               for (i = 0; i < count; i++) {
-                       err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE,
-                               map->pages[i]);
-                       if (err)
-                               goto out_put_map;
-               }
+               err = vm_map_pages(vma, map->pages, map->count);
+               if (err)
+                       goto out_put_map;
        } else {
 #ifdef CONFIG_X86
                /*
index a1c61e3..dd5bbb6 100644 (file)
@@ -165,12 +165,8 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma)
        if (vma_priv->n_pages != count)
                ret = -ENOMEM;
        else
-               for (i = 0; i < vma_priv->n_pages; i++) {
-                       ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
-                                            vma_priv->pages[i]);
-                       if (ret)
-                               break;
-               }
+               ret = vm_map_pages_zero(vma, vma_priv->pages,
+                                               vma_priv->n_pages);
 
        if (ret)
                privcmd_buf_vmapriv_free(vma_priv);
index e5e54da..f743862 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -814,7 +814,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
                                goto unlock_pmd;
 
                        flush_cache_page(vma, address, pfn);
-                       pmd = pmdp_huge_clear_flush(vma, address, pmdp);
+                       pmd = pmdp_invalidate(vma, address, pmdp);
                        pmd = pmd_wrprotect(pmd);
                        pmd = pmd_mkclean(pmd);
                        set_pmd_at(vma->vm_mm, address, pmdp, pmd);
@@ -1575,8 +1575,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                }
 
                trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-               result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
-                                           write);
+               result = vmf_insert_pfn_pmd(vmf, pfn, write);
                break;
        case IOMAP_UNWRITTEN:
        case IOMAP_HOLE:
@@ -1686,8 +1685,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
                ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
 #ifdef CONFIG_FS_DAX_PMD
        else if (order == PMD_ORDER)
-               ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-                       pfn, true);
+               ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
 #endif
        else
                ret = VM_FAULT_FALLBACK;
index c74ef44..1dcc571 100644 (file)
@@ -440,9 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                        u32 hash;
 
                        index = page->index;
-                       hash = hugetlb_fault_mutex_hash(h, current->mm,
-                                                       &pseudo_vma,
-                                                       mapping, index, 0);
+                       hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
                        /*
@@ -499,8 +497,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
        struct resv_map *resv_map;
 
        remove_inode_hugepages(inode, 0, LLONG_MAX);
-       resv_map = (struct resv_map *)inode->i_mapping->private_data;
-       /* root inode doesn't have the resv_map, so we should check it */
+
+       /*
+        * Get the resv_map from the address space embedded in the inode.
+        * This is the address space which points to any resv_map allocated
+        * at inode creation time.  If this is a device special inode,
+        * i_mapping may not point to the original address space.
+        */
+       resv_map = (struct resv_map *)(&inode->i_data)->private_data;
+       /* Only regular and link inodes have associated reserve maps */
        if (resv_map)
                resv_map_release(&resv_map->refs);
        clear_inode(inode);
@@ -639,8 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                addr = index * hpage_size;
 
                /* mutex taken here, fault path and hole punch */
-               hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
-                                               index, addr);
+               hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
                mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
                /* See if already present in mapping to avoid alloc/free */
index 48ea397..fdc1832 100644 (file)
@@ -2697,8 +2697,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 
                ret = 0;
                down_read(&current->mm->mmap_sem);
-               pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
-                                               pages, vmas);
+               pret = get_user_pages(ubuf, nr_pages,
+                                     FOLL_WRITE | FOLL_LONGTERM,
+                                     pages, vmas);
                if (pret == nr_pages) {
                        /* don't support file backed memory */
                        for (j = 0; j < nr_pages; j++) {
index c121abb..85f21ca 100644 (file)
 #define NAMEI_RA_BLOCKS  4
 #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
 
-static unsigned char ocfs2_filetype_table[] = {
-       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
 static int ocfs2_do_extend_dir(struct super_block *sb,
                               handle_t *handle,
                               struct inode *dir,
@@ -1718,7 +1714,7 @@ int __ocfs2_add_entry(handle_t *handle,
                                de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
                                de = de1;
                        }
-                       de->file_type = OCFS2_FT_UNKNOWN;
+                       de->file_type = FT_UNKNOWN;
                        if (blkno) {
                                de->inode = cpu_to_le64(blkno);
                                ocfs2_set_de_type(de, inode->i_mode);
@@ -1803,13 +1799,9 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
                }
                offset += le16_to_cpu(de->rec_len);
                if (le64_to_cpu(de->inode)) {
-                       unsigned char d_type = DT_UNKNOWN;
-
-                       if (de->file_type < OCFS2_FT_MAX)
-                               d_type = ocfs2_filetype_table[de->file_type];
-
                        if (!dir_emit(ctx, de->name, de->name_len,
-                                     le64_to_cpu(de->inode), d_type))
+                                     le64_to_cpu(de->inode),
+                                     fs_ftype_to_dtype(de->file_type)))
                                goto out;
                }
                ctx->pos += le16_to_cpu(de->rec_len);
@@ -1900,14 +1892,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
                                break;
                        }
                        if (le64_to_cpu(de->inode)) {
-                               unsigned char d_type = DT_UNKNOWN;
-
-                               if (de->file_type < OCFS2_FT_MAX)
-                                       d_type = ocfs2_filetype_table[de->file_type];
                                if (!dir_emit(ctx, de->name,
                                                de->name_len,
                                                le64_to_cpu(de->inode),
-                                               d_type)) {
+                                       fs_ftype_to_dtype(de->file_type))) {
                                        brelse(bh);
                                        return 0;
                                }
index 4bf8d58..af2888d 100644 (file)
@@ -148,16 +148,24 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
        u64 blkno;
        struct dentry *parent;
        struct inode *dir = d_inode(child);
+       int set;
 
        trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
                               (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
+       status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1);
+       if (status < 0) {
+               mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+               parent = ERR_PTR(status);
+               goto bail;
+       }
+
        status = ocfs2_inode_lock(dir, NULL, 0);
        if (status < 0) {
                if (status != -ENOENT)
                        mlog_errno(status);
                parent = ERR_PTR(status);
-               goto bail;
+               goto unlock_nfs_sync;
        }
 
        status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
@@ -166,11 +174,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
                goto bail_unlock;
        }
 
+       status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set);
+       if (status < 0) {
+               if (status == -EINVAL) {
+                       status = -ESTALE;
+               } else
+                       mlog(ML_ERROR, "test inode bit failed %d\n", status);
+               parent = ERR_PTR(status);
+               goto bail_unlock;
+       }
+
+       trace_ocfs2_get_dentry_test_bit(status, set);
+       if (!set) {
+               status = -ESTALE;
+               parent = ERR_PTR(status);
+               goto bail_unlock;
+       }
+
        parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
 
 bail_unlock:
        ocfs2_inode_unlock(dir, 0);
 
+unlock_nfs_sync:
+       ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1);
+
 bail:
        trace_ocfs2_get_parent_end(parent);
 
index 7071ad0..b86bf5e 100644 (file)
@@ -391,21 +391,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 #define OCFS2_HB_LOCAL                 "heartbeat=local"
 #define OCFS2_HB_GLOBAL                        "heartbeat=global"
 
-/*
- * OCFS2 directory file types.  Only the low 3 bits are used.  The
- * other bits are reserved for now.
- */
-#define OCFS2_FT_UNKNOWN       0
-#define OCFS2_FT_REG_FILE      1
-#define OCFS2_FT_DIR           2
-#define OCFS2_FT_CHRDEV                3
-#define OCFS2_FT_BLKDEV                4
-#define OCFS2_FT_FIFO          5
-#define OCFS2_FT_SOCK          6
-#define OCFS2_FT_SYMLINK       7
-
-#define OCFS2_FT_MAX           8
-
 /*
  * OCFS2_DIR_PAD defines the directory entries boundaries
  *
@@ -424,17 +409,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 #define        OCFS2_LINKS_HI_SHIFT    16
 #define        OCFS2_DX_ENTRIES_MAX    (0xffffffffU)
 
-#define S_SHIFT                        12
-static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
-       [S_IFREG >> S_SHIFT]  = OCFS2_FT_REG_FILE,
-       [S_IFDIR >> S_SHIFT]  = OCFS2_FT_DIR,
-       [S_IFCHR >> S_SHIFT]  = OCFS2_FT_CHRDEV,
-       [S_IFBLK >> S_SHIFT]  = OCFS2_FT_BLKDEV,
-       [S_IFIFO >> S_SHIFT]  = OCFS2_FT_FIFO,
-       [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
-       [S_IFLNK >> S_SHIFT]  = OCFS2_FT_SYMLINK,
-};
-
 
 /*
  * Convenience casts
@@ -1629,7 +1603,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
 static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
                                    umode_t mode)
 {
-       de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+       de->file_type = fs_umode_to_ftype(mode);
 }
 
 static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
index d4811f9..2bb916d 100644 (file)
@@ -269,7 +269,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
 
        /* map the pages */
        ret = get_user_pages_fast((unsigned long)user_desc->ptr,
-                            bufmap->page_count, 1, bufmap->page_array);
+                            bufmap->page_count, FOLL_WRITE, bufmap->page_array);
 
        if (ret < 0)
                return ret;
index 95ca1fe..01d4eb0 100644 (file)
@@ -1169,7 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                                break;
                        }
 
-                       mmu_notifier_range_init(&range, mm, 0, -1UL);
+                       mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+                                               0, NULL, mm, 0, -1UL);
                        mmu_notifier_invalidate_range_start(&range);
                }
                walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
index 01e8217..4d1ff01 100644 (file)
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -292,8 +292,14 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
        }
 
        if (flags & SYNC_FILE_RANGE_WRITE) {
+               int sync_mode = WB_SYNC_NONE;
+
+               if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
+                            SYNC_FILE_RANGE_WRITE_AND_WAIT)
+                       sync_mode = WB_SYNC_ALL;
+
                ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
-                                                WB_SYNC_NONE);
+                                                sync_mode);
                if (ret < 0)
                        goto out;
        }
@@ -306,9 +312,9 @@ out:
 }
 
 /*
- * sys_sync_file_range() permits finely controlled syncing over a segment of
+ * ksys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
- * zero then sys_sync_file_range() will operate from offset out to EOF.
+ * zero then ksys_sync_file_range() will operate from offset out to EOF.
  *
  * The flag bits are:
  *
@@ -325,7 +331,7 @@ out:
  * Useful combinations of the flag bits are:
  *
  * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
- * in the range which were dirty on entry to sys_sync_file_range() are placed
+ * in the range which were dirty on entry to ksys_sync_file_range() are placed
  * under writeout.  This is a start-write-for-data-integrity operation.
  *
  * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
@@ -337,10 +343,13 @@ out:
  * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
  * for that operation to complete and to return the result.
  *
- * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
+ * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
+ * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
  * a traditional sync() operation.  This is a write-for-data-integrity operation
  * which will ensure that all pages in the range which were dirty on entry to
- * sys_sync_file_range() are committed to disk.
+ * ksys_sync_file_range() are written to disk.  It should be noted that disk
+ * caches are not flushed by this call, so there are no guarantees here that the
+ * data will be available on disk after a crash.
  *
  *
  * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
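
To make the flag semantics above concrete, a hedged userspace sketch of the write-and-wait combination (equivalent, per this change, to the new SYNC_FILE_RANGE_WRITE_AND_WAIT shorthand); note it still gives no crash-durability guarantee, since disk caches are not flushed:

#define _GNU_SOURCE
#include <fcntl.h>

/* Start writeback for [off, off+len) and wait for it to complete. */
static int flush_file_range(int fd, off_t off, off_t len)
{
	return sync_file_range(fd, off, len,
			       SYNC_FILE_RANGE_WAIT_BEFORE |
			       SYNC_FILE_RANGE_WRITE |
			       SYNC_FILE_RANGE_WAIT_AFTER);
}
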
index f5de1e7..3b30301 100644 (file)
@@ -30,6 +30,8 @@
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 
+int sysctl_unprivileged_userfaultfd __read_mostly = 1;
+
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
 enum userfaultfd_state {
@@ -1930,6 +1932,9 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
        struct userfaultfd_ctx *ctx;
        int fd;
 
+       if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
+               return -EPERM;
+
        BUG_ON(!current->mm);
 
        /* Check the UFFD_* constants for consistency.  */
index 71d7b77..822f433 100644 (file)
@@ -126,4 +126,11 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
 }
 #endif
 
+#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
+static inline bool gigantic_page_runtime_supported(void)
+{
+       return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE);
+}
+#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */
+
 #endif /* _ASM_GENERIC_HUGETLB_H */
index f111c78..f31521d 100644 (file)
@@ -151,21 +151,6 @@ static inline void balloon_page_delete(struct page *page)
        list_del(&page->lru);
 }
 
-static inline bool __is_movable_balloon_page(struct page *page)
-{
-       return false;
-}
-
-static inline bool balloon_page_movable(struct page *page)
-{
-       return false;
-}
-
-static inline bool isolated_balloon_page(struct page *page)
-{
-       return false;
-}
-
 static inline bool balloon_page_isolate(struct page *page)
 {
        return false;
index fdab7de..fb07b50 100644 (file)
@@ -585,12 +585,12 @@ static inline bool pm_suspended_storage(void)
 }
 #endif /* CONFIG_PM_SLEEP */
 
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
+#ifdef CONFIG_CONTIG_ALLOC
 /* The below functions must be run on a range from a single zone. */
 extern int alloc_contig_range(unsigned long start, unsigned long end,
                              unsigned migratetype, gfp_t gfp_mask);
-extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
 #endif
+void free_contig_range(unsigned long pfn, unsigned int nr_pages);
 
 #ifdef CONFIG_CMA
 /* CMA stuff */
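
A hedged illustration of the interface being regrouped here: allocation stays behind CONFIG_CONTIG_ALLOC while free_contig_range() becomes available unconditionally. The wrappers below are illustrative only; the pfn range must lie within a single zone, as the comment above requires:

#ifdef CONFIG_CONTIG_ALLOC
static struct page *grab_contig(unsigned long start_pfn, unsigned long nr_pages)
{
	if (alloc_contig_range(start_pfn, start_pfn + nr_pages,
			       MIGRATE_MOVABLE, GFP_KERNEL))
		return NULL;
	return pfn_to_page(start_pfn);
}
#endif

static void drop_contig(unsigned long start_pfn, unsigned long nr_pages)
{
	free_contig_range(start_pfn, nr_pages);
}
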
index ad50b7b..51ec27a 100644 (file)
 #include <linux/migrate.h>
 #include <linux/memremap.h>
 #include <linux/completion.h>
+#include <linux/mmu_notifier.h>
 
-struct hmm;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @ranges: list of ranges being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ * @wq: wait queue for user waiting on a range invalidation
+ * @notifiers: count of active mmu notifiers
+ * @dead: is the mm dead?
+ */
+struct hmm {
+       struct mm_struct        *mm;
+       struct kref             kref;
+       struct mutex            lock;
+       struct list_head        ranges;
+       struct list_head        mirrors;
+       struct mmu_notifier     mmu_notifier;
+       struct rw_semaphore     mirrors_sem;
+       wait_queue_head_t       wq;
+       long                    notifiers;
+       bool                    dead;
+};
 
 /*
  * hmm_pfn_flag_e - HMM flag enums
@@ -131,6 +157,7 @@ enum hmm_pfn_value_e {
 /*
  * struct hmm_range - track invalidation lock on virtual address range
  *
+ * @hmm: the core HMM structure this range is active against
  * @vma: the vm area struct for the range
  * @list: all range lock are on a list
  * @start: range virtual start address (inclusive)
@@ -138,10 +165,13 @@ enum hmm_pfn_value_e {
  * @pfns: array of pfns (big enough for the range)
  * @flags: pfn flags to match device driver page table
  * @values: pfn value for some special case (none, special, error, ...)
+ * @default_flags: default flags for the range (write, read, ... see hmm doc)
+ * @pfn_flags_mask: allows masking pfn flags so that only default_flags matter
  * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
  * @valid: pfns array did not change since it has been fill by an HMM function
  */
 struct hmm_range {
+       struct hmm              *hmm;
        struct vm_area_struct   *vma;
        struct list_head        list;
        unsigned long           start;
@@ -149,41 +179,96 @@ struct hmm_range {
        uint64_t                *pfns;
        const uint64_t          *flags;
        const uint64_t          *values;
+       uint64_t                default_flags;
+       uint64_t                pfn_flags_mask;
+       uint8_t                 page_shift;
        uint8_t                 pfn_shift;
        bool                    valid;
 };
 
 /*
- * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
- * @range: range use to decode HMM pfn value
- * @pfn: HMM pfn value to get corresponding struct page from
- * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise
+ * hmm_range_page_shift() - return the page shift for the range
+ * @range: range being queried
+ * Returns: page shift (page size = 1 << page shift) for the range
+ */
+static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
+{
+       return range->page_shift;
+}
+
+/*
+ * hmm_range_page_size() - return the page size for the range
+ * @range: range being queried
+ * Returns: page size for the range in bytes
+ */
+static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
+{
+       return 1UL << hmm_range_page_shift(range);
+}
+
+/*
+ * hmm_range_wait_until_valid() - wait for range to be valid
+ * @range: range affected by invalidation to wait on
+ * @timeout: timeout for the wait, in ms (i.e. abort the wait after that period)
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
+                                             unsigned long timeout)
+{
+       /* Check if the mm is dead */
+       if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
+               range->valid = false;
+               return false;
+       }
+       if (range->valid)
+               return true;
+       wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
+                          msecs_to_jiffies(timeout));
+       /* Return current valid status just in case we get lucky */
+       return range->valid;
+}
+
+/*
+ * hmm_range_valid() - test if a range is valid or not
+ * @range: range
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_valid(struct hmm_range *range)
+{
+       return range->valid;
+}
+
+/*
+ * hmm_device_entry_to_page() - return struct page pointed to by a device entry
+ * @range: range used to decode the device entry value
+ * @entry: device entry value to get the corresponding struct page from
+ * Returns: struct page pointer if the entry is valid, NULL otherwise
  *
- * If the HMM pfn is valid (ie valid flag set) then return the struct page
- * matching the pfn value stored in the HMM pfn. Otherwise return NULL.
+ * If the device entry is valid (i.e. valid flag set) then return the struct page
+ * matching the entry value. Otherwise return NULL.
  */
-static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
-                                          uint64_t pfn)
+static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
+                                                   uint64_t entry)
 {
-       if (pfn == range->values[HMM_PFN_NONE])
+       if (entry == range->values[HMM_PFN_NONE])
                return NULL;
-       if (pfn == range->values[HMM_PFN_ERROR])
+       if (entry == range->values[HMM_PFN_ERROR])
                return NULL;
-       if (pfn == range->values[HMM_PFN_SPECIAL])
+       if (entry == range->values[HMM_PFN_SPECIAL])
                return NULL;
-       if (!(pfn & range->flags[HMM_PFN_VALID]))
+       if (!(entry & range->flags[HMM_PFN_VALID]))
                return NULL;
-       return pfn_to_page(pfn >> range->pfn_shift);
+       return pfn_to_page(entry >> range->pfn_shift);
 }
 
 /*
- * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn
- * @range: range use to decode HMM pfn value
- * @pfn: HMM pfn value to extract pfn from
- * Returns: pfn value if HMM pfn is valid, -1UL otherwise
+ * hmm_device_entry_to_pfn() - return the pfn value stored in a device entry
+ * @range: range used to decode the device entry value
+ * @entry: device entry to extract pfn from
+ * Returns: pfn value if device entry is valid, -1UL otherwise
  */
-static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
-                                          uint64_t pfn)
+static inline unsigned long
+hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
 {
        if (pfn == range->values[HMM_PFN_NONE])
                return -1UL;
@@ -197,31 +282,66 @@ static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
 }
 
 /*
- * hmm_pfn_from_page() - create a valid HMM pfn value from struct page
+ * hmm_device_entry_from_page() - create a valid device entry for a page
  * @range: range use to encode HMM pfn value
- * @page: struct page pointer for which to create the HMM pfn
- * Returns: valid HMM pfn for the page
+ * @page: page for which to create the device entry
+ * Returns: valid device entry for the page
  */
-static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
-                                        struct page *page)
+static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
+                                                 struct page *page)
 {
        return (page_to_pfn(page) << range->pfn_shift) |
                range->flags[HMM_PFN_VALID];
 }
 
 /*
- * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn
+ * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
  * @range: range use to encode HMM pfn value
- * @pfn: pfn value for which to create the HMM pfn
- * Returns: valid HMM pfn for the pfn
+ * @pfn: pfn value for which to create the device entry
+ * Returns: valid device entry for the pfn
  */
-static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
-                                       unsigned long pfn)
+static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
+                                                unsigned long pfn)
 {
        return (pfn << range->pfn_shift) |
                range->flags[HMM_PFN_VALID];
 }
 
+/*
+ * Old API:
+ * hmm_pfn_to_page()
+ * hmm_pfn_to_pfn()
+ * hmm_pfn_from_page()
+ * hmm_pfn_from_pfn()
+ *
+ * These are the old API; please use the new API instead. They are kept here
+ * to avoid cross-tree merge painfulness, i.e. we convert things to the new
+ * API in stages.
+ */
+static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
+                                          uint64_t pfn)
+{
+       return hmm_device_entry_to_page(range, pfn);
+}
+
+static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
+                                          uint64_t pfn)
+{
+       return hmm_device_entry_to_pfn(range, pfn);
+}
+
+static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
+                                        struct page *page)
+{
+       return hmm_device_entry_from_page(range, page);
+}
+
+static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
+                                       unsigned long pfn)
+{
+       return hmm_device_entry_from_pfn(range, pfn);
+}
+
+
 
 #if IS_ENABLED(CONFIG_HMM_MIRROR)
 /*
@@ -353,43 +473,113 @@ struct hmm_mirror {
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
 
-
 /*
- * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
- * driver lock that serializes device page table updates, then call
- * hmm_vma_range_done(), to check if the snapshot is still valid. The same
- * device driver page table update lock must also be used in the
- * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
- * table invalidation serializes on it.
- *
- * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
- * hmm_vma_get_pfns() WITHOUT ERROR !
- *
- * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
+ * hmm_mirror_mm_is_alive() - test if mm is still alive
+ * @mirror: the HMM mm mirror to check
+ * Returns: false if the mm is dead, true otherwise
+ *
+ * This is an optimization: it will not always accurately report that the mm
+ * is dead, i.e. there can be false negatives (the process is being killed but
+ * HMM has not yet been informed of that). It is only intended to be used to
+ * optimize out cases where the driver is about to do something time consuming
+ * and it would be better to skip it if the mm is dead.
  */
-int hmm_vma_get_pfns(struct hmm_range *range);
-bool hmm_vma_range_done(struct hmm_range *range);
+static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
+{
+       struct mm_struct *mm;
+
+       if (!mirror || !mirror->hmm)
+               return false;
+       mm = READ_ONCE(mirror->hmm->mm);
+       if (mirror->hmm->dead || !mm)
+               return false;
+
+       return true;
+}
 
 
 /*
- * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will
- * not migrate any device memory back to system memory. The HMM pfn array will
- * be updated with the fault result and current snapshot of the CPU page table
- * for the range.
- *
- * The mmap_sem must be taken in read mode before entering and it might be
- * dropped by the function if the block argument is false. In that case, the
- * function returns -EAGAIN.
- *
- * Return value does not reflect if the fault was successful for every single
- * address or not. Therefore, the caller must to inspect the HMM pfn array to
- * determine fault status for each address.
- *
- * Trying to fault inside an invalid vma will result in -EINVAL.
+ * Please see Documentation/vm/hmm.rst for how to use the range API.
+ */
+int hmm_range_register(struct hmm_range *range,
+                      struct mm_struct *mm,
+                      unsigned long start,
+                      unsigned long end,
+                      unsigned page_shift);
+void hmm_range_unregister(struct hmm_range *range);
+long hmm_range_snapshot(struct hmm_range *range);
+long hmm_range_fault(struct hmm_range *range, bool block);
+long hmm_range_dma_map(struct hmm_range *range,
+                      struct device *device,
+                      dma_addr_t *daddrs,
+                      bool block);
+long hmm_range_dma_unmap(struct hmm_range *range,
+                        struct vm_area_struct *vma,
+                        struct device *device,
+                        dma_addr_t *daddrs,
+                        bool dirty);
+
+/*
+ * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
  *
- * See the function description in mm/hmm.c for further documentation.
+ * When waiting for mmu notifiers we need some kind of timeout, otherwise we
+ * could potentially wait forever; 1000ms, i.e. 1s, already sounds like a
+ * long time to wait.
  */
-int hmm_vma_fault(struct hmm_range *range, bool block);
+#define HMM_RANGE_DEFAULT_TIMEOUT 1000
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline bool hmm_vma_range_done(struct hmm_range *range)
+{
+       bool ret = hmm_range_valid(range);
+
+       hmm_range_unregister(range);
+       return ret;
+}
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+{
+       long ret;
+
+       /*
+        * With the old API the driver must set each individual entry with
+        * the requested flags (valid, write, ...). So here we set the mask to
+        * keep intact the entries provided by the driver and zero out the
+        * default_flags.
+        */
+       range->default_flags = 0;
+       range->pfn_flags_mask = -1UL;
+
+       ret = hmm_range_register(range, range->vma->vm_mm,
+                                range->start, range->end,
+                                PAGE_SHIFT);
+       if (ret)
+               return (int)ret;
+
+       if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
+               /*
+                * The mmap_sem was taken by the driver; we release it here
+                * and return -EAGAIN, which corresponds to the mmap_sem
+                * having been dropped in the old API.
+                */
+               up_read(&range->vma->vm_mm->mmap_sem);
+               return -EAGAIN;
+       }
+
+       ret = hmm_range_fault(range, block);
+       if (ret <= 0) {
+               if (ret == -EBUSY || !ret) {
+                       /* Same as above: drop mmap_sem to match the old API. */
+                       up_read(&range->vma->vm_mm->mmap_sem);
+                       ret = -EBUSY;
+               } else if (ret == -EAGAIN)
+                       ret = -EBUSY;
+               hmm_range_unregister(range);
+               return ret;
+       }
+       return 0;
+}
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
 void hmm_mm_destroy(struct mm_struct *mm);
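For reference, a minimal sketch (not part of the diff above) of how a driver might drive the new range API, loosely following the flow described in Documentation/vm/hmm.rst; the function name, the error handling, and the device-page-table step are illustrative assumptions:

#include <linux/hmm.h>
#include <linux/mm.h>

static int driver_mirror_range(struct hmm_range *range, struct mm_struct *mm)
{
	long ret;

	ret = hmm_range_register(range, mm, range->start, range->end,
				 PAGE_SHIFT);
	if (ret)
		return ret;

	/* Wait for any in-flight CPU page table invalidation to finish. */
	if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
		hmm_range_unregister(range);
		return -EBUSY;
	}

	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(range, true);
	up_read(&mm->mmap_sem);
	if (ret < 0) {
		hmm_range_unregister(range);
		return (int)ret;
	}

	/*
	 * Here the driver would take its own lock that serializes device
	 * page table updates, re-check hmm_range_valid(range), and program
	 * the device page table from the snapshot (omitted).
	 */
	hmm_range_unregister(range);
	return 0;
}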
index 381e872..7cd5c15 100644 (file)
@@ -47,10 +47,8 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, pgprot_t newprot,
                        int prot_numa);
-vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-                       pmd_t *pmd, pfn_t pfn, bool write);
-vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
-                       pud_t *pud, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
 enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_FLAG,
        TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
index 11943b6..edf476c 100644 (file)
@@ -123,9 +123,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
 void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-                               struct vm_area_struct *vma,
-                               struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
                                pgoff_t idx, unsigned long address);
 
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
index 58aa3ad..9e9a640 100644 (file)
@@ -270,6 +270,24 @@ static inline void list_rotate_left(struct list_head *head)
        }
 }
 
+/**
+ * list_rotate_to_front() - Rotate list to specific item.
+ * @list: The desired new front of the list.
+ * @head: The head of the list.
+ *
+ * Rotates list so that @list becomes the new front of the list.
+ */
+static inline void list_rotate_to_front(struct list_head *list,
+                                       struct list_head *head)
+{
+       /*
+        * Deletes the list head from the list denoted by @head and
+        * places it as the tail of @list; this effectively rotates the
+        * list so that @list is at the front.
+        */
+       list_move_tail(head, list);
+}
+
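A hedged usage sketch for the new helper; "struct job" and the wrapper are illustrative, only list_rotate_to_front() comes from this patch:

#include <linux/list.h>

struct job {
	struct list_head node;
	int id;
};

/* Make @j the first entry of @queue; the relative order of entries is kept. */
static void job_bring_to_front(struct list_head *queue, struct job *j)
{
	list_rotate_to_front(&j->node, queue);
}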
 /**
  * list_is_singular - tests whether a list has just one entry.
  * @head: the list to test.
index 294d5d8..676d390 100644 (file)
@@ -96,13 +96,14 @@ struct memblock {
 extern struct memblock memblock;
 extern int memblock_debug;
 
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
 #define __init_memblock __meminit
 #define __initdata_memblock __meminitdata
 void memblock_discard(void);
 #else
 #define __init_memblock
 #define __initdata_memblock
+static inline void memblock_discard(void) {}
 #endif
 
 #define memblock_dbg(fmt, ...) \
@@ -240,6 +241,47 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
             i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+                                 unsigned long *out_spfn,
+                                 unsigned long *out_epfn);
+/**
+ * for_each_free_mem_pfn_range_in_zone - iterate through zone specific free
+ * memblock areas
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to ulong for start PFN of the range, can be %NULL
+ * @p_end: ptr to ulong for end PFN of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone. Available once memblock and an empty zone are initialized. The main
+ * assumption is that the zone start, end, and pgdat have been associated.
+ * This way we can use the zone to determine NUMA node, and if a given part
+ * of the memblock is valid for the zone.
+ */
+#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end)   \
+       for (i = 0,                                                     \
+            __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end);    \
+            i != U64_MAX;                                      \
+            __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+/**
+ * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific
+ * free memblock areas from a given point
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to ulong for start PFN of the range, can be %NULL
+ * @p_end: ptr to ulong for end PFN of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone, continuing from current position. Available as soon as memblock is
+ * initialized.
+ */
+#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
+       for (; i != U64_MAX;                                      \
+            __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
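A sketch of how the new iterator might be used, assuming the caller runs while deferred struct-page init still owns these ranges; the helper name is illustrative:

#include <linux/memblock.h>
#include <linux/mmzone.h>

/* Count the PFNs in free memblock ranges that fall inside @zone. */
static unsigned long zone_free_memblock_pages(struct zone *zone)
{
	unsigned long spfn, epfn, nr = 0;
	u64 i;

	for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn)
		nr += epfn - spfn;

	return nr;
}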
 /**
  * for_each_free_mem_range - iterate through free memblock areas
  * @i: u64 used as loop variable
index dbb6118..30561a9 100644 (file)
@@ -501,22 +501,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                int zid, int nr_pages);
 
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-                                          int nid, unsigned int lru_mask);
-
-static inline
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
-       struct mem_cgroup_per_node *mz;
-       unsigned long nr_pages = 0;
-       int zid;
-
-       mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-       for (zid = 0; zid < MAX_NR_ZONES; zid++)
-               nr_pages += mz->lru_zone_size[zid][lru];
-       return nr_pages;
-}
-
 static inline
 unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
@@ -960,11 +944,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
        return true;
 }
 
-static inline unsigned long
-mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
-       return 0;
-}
 static inline
 unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
@@ -972,13 +951,6 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
        return 0;
 }
 
-static inline unsigned long
-mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-                            int nid, unsigned int lru_mask)
-{
-       return 0;
-}
-
 static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
 {
        return 0;
@@ -1117,6 +1089,12 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
 {
 }
 
+static inline void __count_memcg_events(struct mem_cgroup *memcg,
+                                       enum vm_event_item idx,
+                                       unsigned long count)
+{
+}
+
 static inline void count_memcg_page_event(struct page *page,
                                          int idx)
 {
index a6ddefc..e1dc1bb 100644 (file)
@@ -113,7 +113,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
 int hotplug_memory_register(int nid, struct mem_section *section);
 #ifdef CONFIG_MEMORY_HOTREMOVE
-extern int unregister_memory_section(struct mem_section *);
+extern void unregister_memory_section(struct mem_section *);
 #endif
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
index 8ade08c..ae892ee 100644 (file)
@@ -53,6 +53,16 @@ enum {
        MMOP_ONLINE_MOVABLE,
 };
 
+/*
+ * Restrictions for memory hotplug:
+ * flags:  MHP_ flags
+ * altmap: alternative allocator for memmap array
+ */
+struct mhp_restrictions {
+       unsigned long flags;
+       struct vmem_altmap *altmap;
+};
+
 /*
  * Zone resizing functions
  *
@@ -87,7 +97,8 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
 extern int online_pages(unsigned long, unsigned long, int);
 extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
        unsigned long *valid_start, unsigned long *valid_end);
-extern void __offline_isolated_pages(unsigned long, unsigned long);
+extern unsigned long __offline_isolated_pages(unsigned long start_pfn,
+                                               unsigned long end_pfn);
 
 typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
 
@@ -100,6 +111,8 @@ extern void __online_page_free(struct page *page);
 
 extern int try_online_node(int nid);
 
+extern int arch_add_memory(int nid, u64 start, u64 size,
+                       struct mhp_restrictions *restrictions);
 extern u64 max_mem_size;
 
 extern bool memhp_auto_online;
@@ -111,26 +124,33 @@ static inline bool movable_node_is_enabled(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-extern int arch_remove_memory(int nid, u64 start, u64 size,
-                               struct vmem_altmap *altmap);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
-       unsigned long nr_pages, struct vmem_altmap *altmap);
+extern void arch_remove_memory(int nid, u64 start, u64 size,
+                              struct vmem_altmap *altmap);
+extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
+                          unsigned long nr_pages, struct vmem_altmap *altmap);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
+/*
+ * Do we want sysfs memblock files created? This will allow userspace to online
+ * and offline memory explicitly. Lack of this bit means that the caller has to
+ * call move_pfn_range_to_zone to finish the initialization.
+ */
+
+#define MHP_MEMBLOCK_API               (1<<0)
+
 /* reasonably generic interface to expand the physical pages */
 extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
-               struct vmem_altmap *altmap, bool want_memblock);
+                      struct mhp_restrictions *restrictions);
 
 #ifndef CONFIG_ARCH_HAS_ADD_PAGES
 static inline int add_pages(int nid, unsigned long start_pfn,
-               unsigned long nr_pages, struct vmem_altmap *altmap,
-               bool want_memblock)
+               unsigned long nr_pages, struct mhp_restrictions *restrictions)
 {
-       return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+       return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 #else /* ARCH_HAS_ADD_PAGES */
 int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
-               struct vmem_altmap *altmap, bool want_memblock);
+             struct mhp_restrictions *restrictions);
 #endif /* ARCH_HAS_ADD_PAGES */
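A hedged sketch of what a caller looks like after this interface change: the former (altmap, want_memblock) pair is packed into struct mhp_restrictions. The wrapper name is an illustrative assumption:

#include <linux/memory_hotplug.h>

static int example_add_section(int nid, unsigned long start_pfn,
			       unsigned long nr_pages,
			       struct vmem_altmap *altmap)
{
	struct mhp_restrictions restrictions = {
		.flags  = MHP_MEMBLOCK_API,	/* create sysfs memblock files */
		.altmap = altmap,
	};

	return __add_pages(nid, start_pfn, nr_pages, &restrictions);
}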
 
 #ifdef CONFIG_NUMA
@@ -331,8 +351,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 extern int __add_memory(int nid, u64 start, u64 size);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int add_memory_resource(int nid, struct resource *resource);
-extern int arch_add_memory(int nid, u64 start, u64 size,
-               struct vmem_altmap *altmap, bool want_memblock);
 extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
                unsigned long nr_pages, struct vmem_altmap *altmap);
 extern bool is_memblock_offlined(struct memory_block *mem);
index 083d7b4..912614f 100644 (file)
@@ -124,10 +124,45 @@ extern int mmap_rnd_compat_bits __read_mostly;
 
 /*
  * On some architectures it is expensive to call memset() for small sizes.
- * Those architectures should provide their own implementation of "struct page"
- * zeroing by defining this macro in <asm/pgtable.h>.
+ * If an architecture decides to implement its own version of
+ * mm_zero_struct_page, it should wrap the defines below in an #ifndef and
+ * define its own version of this macro in <asm/pgtable.h>.
  */
-#ifndef mm_zero_struct_page
+#if BITS_PER_LONG == 64
+/* This function must be updated when the size of struct page grows above 80
+ * or shrinks below 56. The idea is that the compiler optimizes out the
+ * switch() statement and only leaves move/store instructions. The compiler
+ * can also combine write statements if they are both assignments and can be
+ * reordered; this can result in several of the writes here being dropped.
+ */
+#define        mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
+static inline void __mm_zero_struct_page(struct page *page)
+{
+       unsigned long *_pp = (void *)page;
+
+        /* Check that struct page is either 56, 64, 72, or 80 bytes */
+       BUILD_BUG_ON(sizeof(struct page) & 7);
+       BUILD_BUG_ON(sizeof(struct page) < 56);
+       BUILD_BUG_ON(sizeof(struct page) > 80);
+
+       switch (sizeof(struct page)) {
+       case 80:
+               _pp[9] = 0;     /* fallthrough */
+       case 72:
+               _pp[8] = 0;     /* fallthrough */
+       case 64:
+               _pp[7] = 0;     /* fallthrough */
+       case 56:
+               _pp[6] = 0;
+               _pp[5] = 0;
+               _pp[4] = 0;
+               _pp[3] = 0;
+               _pp[2] = 0;
+               _pp[1] = 0;
+               _pp[0] = 0;
+       }
+}
+#else
 #define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
 #endif
 
@@ -1007,6 +1042,30 @@ static inline void put_page(struct page *page)
                __put_page(page);
 }
 
+/**
+ * put_user_page() - release a gup-pinned page
+ * @page:            pointer to page to be released
+ *
+ * Pages that were pinned via get_user_pages*() must be released via
+ * either put_user_page(), or one of the put_user_pages*() routines
+ * below. This is so that eventually, pages that are pinned via
+ * get_user_pages*() can be separately tracked and uniquely handled. In
+ * particular, interactions with RDMA and filesystems need special
+ * handling.
+ *
+ * put_user_page() and put_page() are not interchangeable, despite this early
+ * implementation that makes them look the same. put_user_page() calls must
+ * be perfectly matched up with get_user_page() calls.
+ */
+static inline void put_user_page(struct page *page)
+{
+       put_page(page);
+}
+
+void put_user_pages_dirty(struct page **pages, unsigned long npages);
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages);
+void put_user_pages(struct page **pages, unsigned long npages);
+
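A minimal sketch of the intended pairing with get_user_pages*(); the wrapper is illustrative, only the put_user_pages*() helpers above come from this patch:

#include <linux/mm.h>

/* Release a batch of gup-pinned pages, optionally marking them dirty. */
static void release_user_pages(struct page **pages, unsigned long npages,
			       bool make_dirty)
{
	if (make_dirty)
		put_user_pages_dirty_lock(pages, npages);
	else
		put_user_pages(pages, npages);
}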
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
 #define SECTION_IN_PAGE_FLAGS
 #endif
@@ -1505,21 +1564,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
 
-#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA)
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
-                           unsigned int gup_flags, struct page **pages,
-                           struct vm_area_struct **vmas);
-#else
-static inline long get_user_pages_longterm(unsigned long start,
-               unsigned long nr_pages, unsigned int gup_flags,
-               struct page **pages, struct vm_area_struct **vmas)
-{
-       return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
-}
-#endif /* CONFIG_FS_DAX */
-
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
-                       struct page **pages);
+int get_user_pages_fast(unsigned long start, int nr_pages,
+                       unsigned int gup_flags, struct page **pages);
 
 /* Container for pinned pfns / pages */
 struct frame_vector {
@@ -2533,6 +2579,10 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                        unsigned long pfn, unsigned long size, pgprot_t);
 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+                               unsigned long num);
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+                               unsigned long num);
 vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
 vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
@@ -2583,6 +2633,34 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_REMOTE    0x2000  /* we are working on non-current tsk/mm */
 #define FOLL_COW       0x4000  /* internal GUP flag */
 #define FOLL_ANON      0x8000  /* don't do file mappings */
+#define FOLL_LONGTERM  0x10000 /* mapping lifetime is indefinite: see below */
+
+/*
+ * NOTE on FOLL_LONGTERM:
+ *
+ * FOLL_LONGTERM indicates that the page will be held for an indefinite time
+ * period _often_ under userspace control.  This is in contrast to
+ * iov_iter_get_pages(), whose usages are transient.
+ *
+ * FIXME: For pages which are part of a filesystem, mappings are subject to the
+ * lifetime enforced by the filesystem and we need guarantees that longterm
+ * users like RDMA and V4L2 only establish mappings which coordinate usage with
+ * the filesystem.  Ideas for this coordination include revoking the longterm
+ * pin, delaying writeback, bounce buffer page writeback, etc.  As FS DAX was
+ * added after the problem with filesystems was found, FS DAX VMAs are
+ * specifically failed.  Filesystem pages are still subject to bugs and use of
+ * FOLL_LONGTERM should be avoided on those pages.
+ *
+ * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call.
+ * Currently only get_user_pages() and get_user_pages_fast() support this flag
+ * and calls to get_user_pages_[un]locked are specifically not allowed.  This
+ * is due to an incompatibility with the FS DAX check and
+ * FAULT_FLAG_ALLOW_RETRY.
+ *
+ * In the CMA case: longterm pins in a CMA region would unnecessarily fragment
+ * that region.  And so CMA attempts to migrate the page before pinning when
+ * FOLL_LONGTERM is specified.
+ */
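A hedged sketch of a long-lived pin using the flag; per the note above only get_user_pages() and get_user_pages_fast() accept it. The function name is an illustrative assumption:

#include <linux/mm.h>

/* Pin a user buffer for indefinite device access (e.g. RDMA-style). */
static int pin_user_buffer(unsigned long start, int nr_pages,
			   struct page **pages)
{
	return get_user_pages_fast(start, nr_pages,
				   FOLL_WRITE | FOLL_LONGTERM, pages);
}

Pages pinned this way would eventually be released with the put_user_pages*() helpers added earlier in this series.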
 
 static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
 {
index 04ec454..6f2fef7 100644 (file)
@@ -29,7 +29,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
 {
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
-       __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
+       __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
        __mod_zone_page_state(&pgdat->node_zones[zid],
                                NR_ZONE_LRU_BASE + lru, nr_pages);
 }
index 4ef4bbe..e1f42a0 100644 (file)
@@ -103,7 +103,7 @@ struct page {
                };
                struct {        /* slab, slob and slub */
                        union {
-                               struct list_head slab_list;     /* uses lru */
+                               struct list_head slab_list;
                                struct {        /* Partial pages */
                                        struct page *next;
 #ifdef CONFIG_64BIT
index 4050ec1..b6c004b 100644 (file)
 struct mmu_notifier;
 struct mmu_notifier_ops;
 
+/**
+ * enum mmu_notifier_event - reason for the mmu notifier callback
+ * @MMU_NOTIFY_UNMAP: either a munmap() that unmaps the range or a mremap()
+ * that moves the range
+ *
+ * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this, such as
+ * madvise() or replacing a page with another one, ...).
+ *
+ * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
+ * range, i.e. using the vma access permission (vm_page_prot) to update the
+ * whole range is enough; there is no need to inspect changes to the CPU page
+ * table (mprotect() syscall)
+ *
+ * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
+ * flag for pages in the range, so to mirror those changes the user must
+ * inspect the CPU page table (from the end callback).
+ *
+ * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still the same page and the
+ * same access flags). The user should soft-dirty the page in the end
+ * callback to make sure that anyone relying on soft dirtiness catches pages
+ * that might be written through non-CPU mappings.
+ */
+enum mmu_notifier_event {
+       MMU_NOTIFY_UNMAP = 0,
+       MMU_NOTIFY_CLEAR,
+       MMU_NOTIFY_PROTECTION_VMA,
+       MMU_NOTIFY_PROTECTION_PAGE,
+       MMU_NOTIFY_SOFT_DIRTY,
+};
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
@@ -25,11 +55,15 @@ struct mmu_notifier_mm {
        spinlock_t lock;
 };
 
+#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
+
 struct mmu_notifier_range {
+       struct vm_area_struct *vma;
        struct mm_struct *mm;
        unsigned long start;
        unsigned long end;
-       bool blockable;
+       unsigned flags;
+       enum mmu_notifier_event event;
 };
 
 struct mmu_notifier_ops {
@@ -225,6 +259,14 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
                                  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
                                  unsigned long start, unsigned long end);
+extern bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);
+
+static inline bool
+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+       return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
+}
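A sketch of the intended use in an invalidate_range_start() callback, replacing direct tests of the old range->blockable field; the callback name is illustrative and the prototype is assumed to match this kernel's mmu_notifier_ops.invalidate_range_start:

#include <linux/mmu_notifier.h>

static int my_invalidate_range_start(struct mmu_notifier *mn,
				     const struct mmu_notifier_range *range)
{
	if (!mmu_notifier_range_blockable(range))
		return -EAGAIN;	/* caller is in a non-blocking context */

	/* Safe to take sleeping locks and invalidate [range->start, range->end). */
	return 0;
}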
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -269,7 +311,7 @@ static inline void
 mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
        if (mm_has_notifiers(range->mm)) {
-               range->blockable = true;
+               range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
                __mmu_notifier_invalidate_range_start(range);
        }
 }
@@ -278,7 +320,7 @@ static inline int
 mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
 {
        if (mm_has_notifiers(range->mm)) {
-               range->blockable = false;
+               range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
                return __mmu_notifier_invalidate_range_start(range);
        }
        return 0;
@@ -318,13 +360,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 
 static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
+                                          enum mmu_notifier_event event,
+                                          unsigned flags,
+                                          struct vm_area_struct *vma,
                                           struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
 {
+       range->vma = vma;
+       range->event = event;
        range->mm = mm;
        range->start = start;
        range->end = end;
+       range->flags = flags;
 }
 
 #define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
@@ -452,9 +500,14 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
        range->end = end;
 }
 
-#define mmu_notifier_range_init(range, mm, start, end) \
+#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end)  \
        _mmu_notifier_range_init(range, start, end)
 
+static inline bool
+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+       return true;
+}
 
 static inline int mm_has_notifiers(struct mm_struct *mm)
 {
@@ -517,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 {
 }
 
+#define mmu_notifier_range_update_to_read_only(r) false
+
 #define ptep_clear_flush_young_notify ptep_clear_flush_young
 #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
 #define ptep_clear_young_notify ptep_test_and_clear_young
index fba7741..5a4aedc 100644 (file)
@@ -247,11 +247,6 @@ struct lruvec {
 #endif
 };
 
-/* Mask used at gathering information at once (see memcontrol.c) */
-#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
-#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
-#define LRU_ALL             ((1 << NR_LRU_LISTS) - 1)
-
 /* Isolate unmapped file */
 #define ISOLATE_UNMAPPED       ((__force isolate_mode_t)0x2)
 /* Isolate for asynchronous migration */
index bcf909d..9ec3544 100644 (file)
@@ -333,6 +333,19 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
                        mapping_gfp_mask(mapping));
 }
 
+static inline struct page *find_subpage(struct page *page, pgoff_t offset)
+{
+       unsigned long mask;
+
+       if (PageHuge(page))
+               return page;
+
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
+       mask = (1UL << compound_order(page)) - 1;
+       return page + (offset & mask);
+}
+
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
@@ -360,9 +373,6 @@ static inline unsigned find_get_pages_tag(struct address_space *mapping,
        return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag,
                                        nr_pages, pages);
 }
-unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
-                       xa_mark_t tag, unsigned int nr_entries,
-                       struct page **entries, pgoff_t *indices);
 
 struct page *grab_cache_page_write_begin(struct address_space *mapping,
                        pgoff_t index, unsigned flags);
@@ -527,15 +537,7 @@ static inline int wait_on_page_locked_killable(struct page *page)
 
 extern void put_and_wait_on_page_locked(struct page *page);
 
-/* 
- * Wait for a page to complete writeback
- */
-static inline void wait_on_page_writeback(struct page *page)
-{
-       if (PageWriteback(page))
-               wait_on_page_bit(page, PG_writeback);
-}
-
+void wait_on_page_writeback(struct page *page);
 extern void end_page_writeback(struct page *page);
 void wait_for_stable_page(struct page *page);
 
index 37c9eba..ac9d71e 100644 (file)
@@ -28,6 +28,8 @@
 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
 
+extern int sysctl_unprivileged_userfaultfd;
+
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
index 2db8d60..bdeda4b 100644 (file)
@@ -26,7 +26,7 @@ struct reclaim_stat {
        unsigned nr_congested;
        unsigned nr_writeback;
        unsigned nr_immediate;
-       unsigned nr_activate;
+       unsigned nr_activate[2];
        unsigned nr_ref_keep;
        unsigned nr_unmap_fail;
 };
index 6074eff..e5bf6ee 100644 (file)
@@ -64,6 +64,7 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
        TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken)
 );
 
+#ifdef CONFIG_COMPACTION
 TRACE_EVENT(mm_compaction_migratepages,
 
        TP_PROTO(unsigned long nr_all,
@@ -132,7 +133,6 @@ TRACE_EVENT(mm_compaction_begin,
                __entry->sync ? "sync" : "async")
 );
 
-#ifdef CONFIG_COMPACTION
 TRACE_EVENT(mm_compaction_end,
        TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
                unsigned long free_pfn, unsigned long zone_end, bool sync,
@@ -166,7 +166,6 @@ TRACE_EVENT(mm_compaction_end,
                __entry->sync ? "sync" : "async",
                __print_symbolic(__entry->status, COMPACTION_STATUS))
 );
-#endif
 
 TRACE_EVENT(mm_compaction_try_to_compact_pages,
 
@@ -189,13 +188,12 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages,
                __entry->prio = prio;
        ),
 
-       TP_printk("order=%d gfp_mask=0x%x priority=%d",
+       TP_printk("order=%d gfp_mask=%s priority=%d",
                __entry->order,
-               __entry->gfp_mask,
+               show_gfp_flags(__entry->gfp_mask),
                __entry->prio)
 );
 
-#ifdef CONFIG_COMPACTION
 DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
 
        TP_PROTO(struct zone *zone,
@@ -296,7 +294,6 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset,
 
        TP_ARGS(zone, order)
 );
-#endif
 
 TRACE_EVENT(mm_compaction_kcompactd_sleep,
 
@@ -352,6 +349,7 @@ DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake,
 
        TP_ARGS(nid, order, classzone_idx)
 );
+#endif
 
 #endif /* _TRACE_COMPACTION_H */
 
index 252327d..a5ab297 100644 (file)
                {RECLAIM_WB_ASYNC,      "RECLAIM_WB_ASYNC"}     \
                ) : "RECLAIM_WB_NONE"
 
-#define trace_reclaim_flags(page) ( \
-       (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
+#define trace_reclaim_flags(file) ( \
+       (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
        (RECLAIM_WB_ASYNC) \
        )
 
-#define trace_shrink_flags(file) \
-       ( \
-               (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
-               (RECLAIM_WB_ASYNC) \
-       )
-
 TRACE_EVENT(mm_vmscan_kswapd_sleep,
 
        TP_PROTO(int nid),
@@ -73,7 +67,9 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
                __entry->order  = order;
        ),
 
-       TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order)
+       TP_printk("nid=%d order=%d",
+               __entry->nid,
+               __entry->order)
 );
 
 TRACE_EVENT(mm_vmscan_wakeup_kswapd,
@@ -96,60 +92,53 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
                __entry->gfp_flags      = gfp_flags;
        ),
 
-       TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
+       TP_printk("nid=%d order=%d gfp_flags=%s",
                __entry->nid,
-               __entry->zid,
                __entry->order,
                show_gfp_flags(__entry->gfp_flags))
 );
 
 DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
 
-       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+       TP_PROTO(int order, gfp_t gfp_flags),
 
-       TP_ARGS(order, may_writepage, gfp_flags, classzone_idx),
+       TP_ARGS(order, gfp_flags),
 
        TP_STRUCT__entry(
                __field(        int,    order           )
-               __field(        int,    may_writepage   )
                __field(        gfp_t,  gfp_flags       )
-               __field(        int,    classzone_idx   )
        ),
 
        TP_fast_assign(
                __entry->order          = order;
-               __entry->may_writepage  = may_writepage;
                __entry->gfp_flags      = gfp_flags;
-               __entry->classzone_idx  = classzone_idx;
        ),
 
-       TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d",
+       TP_printk("order=%d gfp_flags=%s",
                __entry->order,
-               __entry->may_writepage,
-               show_gfp_flags(__entry->gfp_flags),
-               __entry->classzone_idx)
+               show_gfp_flags(__entry->gfp_flags))
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,
 
-       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+       TP_PROTO(int order, gfp_t gfp_flags),
 
-       TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
+       TP_ARGS(order, gfp_flags)
 );
 
 #ifdef CONFIG_MEMCG
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
 
-       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+       TP_PROTO(int order, gfp_t gfp_flags),
 
-       TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
+       TP_ARGS(order, gfp_flags)
 );
 
 DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
 
-       TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+       TP_PROTO(int order, gfp_t gfp_flags),
 
-       TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
+       TP_ARGS(order, gfp_flags)
 );
 #endif /* CONFIG_MEMCG */
 
@@ -333,7 +322,8 @@ TRACE_EVENT(mm_vmscan_writepage,
 
        TP_fast_assign(
                __entry->pfn = page_to_pfn(page);
-               __entry->reclaim_flags = trace_reclaim_flags(page);
+               __entry->reclaim_flags = trace_reclaim_flags(
+                                               page_is_file_cache(page));
        ),
 
        TP_printk("page=%p pfn=%lu flags=%s",
@@ -358,7 +348,8 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
                __field(unsigned long, nr_writeback)
                __field(unsigned long, nr_congested)
                __field(unsigned long, nr_immediate)
-               __field(unsigned long, nr_activate)
+               __field(unsigned int, nr_activate0)
+               __field(unsigned int, nr_activate1)
                __field(unsigned long, nr_ref_keep)
                __field(unsigned long, nr_unmap_fail)
                __field(int, priority)
@@ -373,20 +364,22 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
                __entry->nr_writeback = stat->nr_writeback;
                __entry->nr_congested = stat->nr_congested;
                __entry->nr_immediate = stat->nr_immediate;
-               __entry->nr_activate = stat->nr_activate;
+               __entry->nr_activate0 = stat->nr_activate[0];
+               __entry->nr_activate1 = stat->nr_activate[1];
                __entry->nr_ref_keep = stat->nr_ref_keep;
                __entry->nr_unmap_fail = stat->nr_unmap_fail;
                __entry->priority = priority;
-               __entry->reclaim_flags = trace_shrink_flags(file);
+               __entry->reclaim_flags = trace_reclaim_flags(file);
        ),
 
-       TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s",
+       TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s",
                __entry->nid,
                __entry->nr_scanned, __entry->nr_reclaimed,
                __entry->nr_dirty, __entry->nr_writeback,
                __entry->nr_congested, __entry->nr_immediate,
-               __entry->nr_activate, __entry->nr_ref_keep,
-               __entry->nr_unmap_fail, __entry->priority,
+               __entry->nr_activate0, __entry->nr_activate1,
+               __entry->nr_ref_keep, __entry->nr_unmap_fail,
+               __entry->priority,
                show_reclaim_flags(__entry->reclaim_flags))
 );
 
@@ -415,7 +408,7 @@ TRACE_EVENT(mm_vmscan_lru_shrink_active,
                __entry->nr_deactivated = nr_deactivated;
                __entry->nr_referenced = nr_referenced;
                __entry->priority = priority;
-               __entry->reclaim_flags = trace_shrink_flags(file);
+               __entry->reclaim_flags = trace_reclaim_flags(file);
        ),
 
        TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s",
@@ -454,7 +447,8 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low,
                __entry->total_active = total_active;
                __entry->active = active;
                __entry->ratio = ratio;
-               __entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU;
+               __entry->reclaim_flags = trace_reclaim_flags(file) &
+                                        RECLAIM_WB_LRU;
        ),
 
        TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s",
@@ -465,6 +459,38 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low,
                __entry->ratio,
                show_reclaim_flags(__entry->reclaim_flags))
 );
+
+TRACE_EVENT(mm_vmscan_node_reclaim_begin,
+
+       TP_PROTO(int nid, int order, gfp_t gfp_flags),
+
+       TP_ARGS(nid, order, gfp_flags),
+
+       TP_STRUCT__entry(
+               __field(int, nid)
+               __field(int, order)
+               __field(gfp_t, gfp_flags)
+       ),
+
+       TP_fast_assign(
+               __entry->nid = nid;
+               __entry->order = order;
+               __entry->gfp_flags = gfp_flags;
+       ),
+
+       TP_printk("nid=%d order=%d gfp_flags=%s",
+               __entry->nid,
+               __entry->order,
+               show_gfp_flags(__entry->gfp_flags))
+);
+
+DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end,
+
+       TP_PROTO(unsigned long nr_reclaimed),
+
+       TP_ARGS(nr_reclaimed)
+);
+
 #endif /* _TRACE_VMSCAN_H */
 
 /* This part must be outside protection */
index 32db72c..aa7f3ae 100644 (file)
@@ -53,7 +53,7 @@ WB_WORK_REASON
 
 struct wb_writeback_work;
 
-TRACE_EVENT(writeback_dirty_page,
+DECLARE_EVENT_CLASS(writeback_page_template,
 
        TP_PROTO(struct page *page, struct address_space *mapping),
 
@@ -79,6 +79,20 @@ TRACE_EVENT(writeback_dirty_page,
        )
 );
 
+DEFINE_EVENT(writeback_page_template, writeback_dirty_page,
+
+       TP_PROTO(struct page *page, struct address_space *mapping),
+
+       TP_ARGS(page, mapping)
+);
+
+DEFINE_EVENT(writeback_page_template, wait_on_page_writeback,
+
+       TP_PROTO(struct page *page, struct address_space *mapping),
+
+       TP_ARGS(page, mapping)
+);
+
 DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
 
        TP_PROTO(struct inode *inode, int flags),
index 121e82c..59c71fa 100644 (file)
@@ -320,6 +320,9 @@ struct fscrypt_key {
 #define SYNC_FILE_RANGE_WAIT_BEFORE    1
 #define SYNC_FILE_RANGE_WRITE          2
 #define SYNC_FILE_RANGE_WAIT_AFTER     4
+#define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \
+                                        SYNC_FILE_RANGE_WAIT_BEFORE | \
+                                        SYNC_FILE_RANGE_WAIT_AFTER)
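A userspace sketch exercising the new combined flag (assumption: glibc exposes sync_file_range() when _GNU_SOURCE is defined); with this series such a request may be serviced with data-integrity (WB_SYNC_ALL) writeback:

#define _GNU_SOURCE
#include <fcntl.h>

/* Write out and wait on the first 1 MiB of fd. */
static int flush_first_mib(int fd)
{
	return sync_file_range(fd, 0, 1 << 20,
			       SYNC_FILE_RANGE_WRITE_AND_WAIT);
}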
 
 /*
  * Flags for preadv2/pwritev2:
index 4749e11..435a428 100644 (file)
@@ -513,42 +513,55 @@ static int __init retain_initrd_param(char *str)
 }
 __setup("retain_initrd", retain_initrd_param);
 
+#ifdef CONFIG_ARCH_HAS_KEEPINITRD
+static int __init keepinitrd_setup(char *__unused)
+{
+       do_retain_initrd = 1;
+       return 1;
+}
+__setup("keepinitrd", keepinitrd_setup);
+#endif
+
 extern char __initramfs_start[];
 extern unsigned long __initramfs_size;
 #include <linux/initrd.h>
 #include <linux/kexec.h>
 
-static void __init free_initrd(void)
+void __weak free_initrd_mem(unsigned long start, unsigned long end)
 {
+       free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+                       "initrd");
+}
+
 #ifdef CONFIG_KEXEC_CORE
+static bool kexec_free_initrd(void)
+{
        unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
        unsigned long crashk_end   = (unsigned long)__va(crashk_res.end);
-#endif
-       if (do_retain_initrd)
-               goto skip;
 
-#ifdef CONFIG_KEXEC_CORE
        /*
         * If the initrd region is overlapped with crashkernel reserved region,
         * free only memory that is not part of crashkernel region.
         */
-       if (initrd_start < crashk_end && initrd_end > crashk_start) {
-               /*
-                * Initialize initrd memory region since the kexec boot does
-                * not do.
-                */
-               memset((void *)initrd_start, 0, initrd_end - initrd_start);
-               if (initrd_start < crashk_start)
-                       free_initrd_mem(initrd_start, crashk_start);
-               if (initrd_end > crashk_end)
-                       free_initrd_mem(crashk_end, initrd_end);
-       } else
-#endif
-               free_initrd_mem(initrd_start, initrd_end);
-skip:
-       initrd_start = 0;
-       initrd_end = 0;
+       if (initrd_start >= crashk_end || initrd_end <= crashk_start)
+               return false;
+
+       /*
+        * Initialize initrd memory region since the kexec boot does not do so.
+        */
+       memset((void *)initrd_start, 0, initrd_end - initrd_start);
+       if (initrd_start < crashk_start)
+               free_initrd_mem(initrd_start, crashk_start);
+       if (initrd_end > crashk_end)
+               free_initrd_mem(crashk_end, initrd_end);
+       return true;
+}
+#else
+static inline bool kexec_free_initrd(void)
+{
+       return false;
 }
+#endif /* CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_BLK_DEV_RAM
 #define BUF_SIZE 1024
@@ -597,7 +610,38 @@ static void __init clean_rootfs(void)
        ksys_close(fd);
        kfree(buf);
 }
-#endif
+#else
+static inline void clean_rootfs(void)
+{
+}
+#endif /* CONFIG_BLK_DEV_RAM */
+
+#ifdef CONFIG_BLK_DEV_RAM
+static void populate_initrd_image(char *err)
+{
+       ssize_t written;
+       int fd;
+
+       unpack_to_rootfs(__initramfs_start, __initramfs_size);
+
+       printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n",
+                       err);
+       fd = ksys_open("/initrd.image", O_WRONLY | O_CREAT, 0700);
+       if (fd < 0)
+               return;
+
+       written = xwrite(fd, (char *)initrd_start, initrd_end - initrd_start);
+       if (written != initrd_end - initrd_start)
+               pr_err("/initrd.image: incomplete write (%zd != %ld)\n",
+                      written, initrd_end - initrd_start);
+       ksys_close(fd);
+}
+#else
+static void populate_initrd_image(char *err)
+{
+       printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
+}
+#endif /* CONFIG_BLK_DEV_RAM */
 
 static int __init populate_rootfs(void)
 {
@@ -605,46 +649,31 @@ static int __init populate_rootfs(void)
        char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
        if (err)
                panic("%s", err); /* Failed to decompress INTERNAL initramfs */
-       /* If available load the bootloader supplied initrd */
-       if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) {
-#ifdef CONFIG_BLK_DEV_RAM
-               int fd;
+
+       if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE))
+               goto done;
+
+       if (IS_ENABLED(CONFIG_BLK_DEV_RAM))
                printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
-               err = unpack_to_rootfs((char *)initrd_start,
-                       initrd_end - initrd_start);
-               if (!err) {
-                       free_initrd();
-                       goto done;
-               } else {
-                       clean_rootfs();
-                       unpack_to_rootfs(__initramfs_start, __initramfs_size);
-               }
-               printk(KERN_INFO "rootfs image is not initramfs (%s)"
-                               "; looks like an initrd\n", err);
-               fd = ksys_open("/initrd.image",
-                             O_WRONLY|O_CREAT, 0700);
-               if (fd >= 0) {
-                       ssize_t written = xwrite(fd, (char *)initrd_start,
-                                               initrd_end - initrd_start);
-
-                       if (written != initrd_end - initrd_start)
-                               pr_err("/initrd.image: incomplete write (%zd != %ld)\n",
-                                      written, initrd_end - initrd_start);
-
-                       ksys_close(fd);
-                       free_initrd();
-               }
-       done:
-               /* empty statement */;
-#else
+       else
                printk(KERN_INFO "Unpacking initramfs...\n");
-               err = unpack_to_rootfs((char *)initrd_start,
-                       initrd_end - initrd_start);
-               if (err)
-                       printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
-               free_initrd();
-#endif
+
+       err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start);
+       if (err) {
+               clean_rootfs();
+               populate_initrd_image(err);
        }
+
+done:
+       /*
+        * If the initrd region is overlapped with crashkernel reserved region,
+        * free only memory that is not part of crashkernel region.
+        */
+       if (!do_retain_initrd && !kexec_free_initrd())
+               free_initrd_mem(initrd_start, initrd_end);
+       initrd_start = 0;
+       initrd_end = 0;
+
        flush_delayed_fput();
        return 0;
 }
index 33c87e9..5a2c69b 100644 (file)
@@ -1074,6 +1074,11 @@ static inline void mark_readonly(void)
 }
 #endif
 
+void __weak free_initmem(void)
+{
+       free_initmem_default(POISON_FREE_INITMEM);
+}
+
 static int __ref kernel_init(void *unused)
 {
        int ret;
index 4ca7364..78f61bf 100644 (file)
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
        struct mmu_notifier_range range;
        struct mem_cgroup *memcg;
 
-       mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+                               addr + PAGE_SIZE);
 
        VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
 
index 6262f15..2268b97 100644 (file)
@@ -543,7 +543,7 @@ again:
        if (unlikely(should_fail_futex(fshared)))
                return -EFAULT;
 
-       err = get_user_pages_fast(address, 1, 1, &page);
+       err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
        /*
         * If write access is not required (eg. FUTEX_WAIT), try
         * and get read-only access.
index f7fb8f6..072b6ee 100644 (file)
@@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
        return locate_mem_hole_bottom_up(start, end, kbuf);
 }
 
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-static int kexec_walk_memblock(struct kexec_buf *kbuf,
-                              int (*func)(struct resource *, void *))
-{
-       return 0;
-}
-#else
+#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
 static int kexec_walk_memblock(struct kexec_buf *kbuf,
                               int (*func)(struct resource *, void *))
 {
@@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
 
        return ret;
 }
+#else
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+                              int (*func)(struct resource *, void *))
+{
+       return 0;
+}
 #endif
 
 /**
@@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
        if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
                return 0;
 
-       if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+       if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
                ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
        else
                ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback);
index a856cb5..1490e63 100644 (file)
@@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
         */
        return devmem->page_fault(vma, addr, page, flags, pmdp);
 }
-EXPORT_SYMBOL(device_private_entry_fault);
 #endif /* CONFIG_DEVICE_PRIVATE */
 
 static void pgmap_array_delete(struct resource *res)
@@ -148,6 +147,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
                        &pgmap->altmap : NULL;
        struct resource *res = &pgmap->res;
        struct dev_pagemap *conflict_pgmap;
+       struct mhp_restrictions restrictions = {
+               /*
+                * We do not want any optional features, only our own memmap.
+                */
+               .altmap = altmap,
+       };
        pgprot_t pgprot = PAGE_KERNEL;
        int error, nid, is_ram;
 
@@ -214,7 +219,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
         */
        if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
                error = add_pages(nid, align_start >> PAGE_SHIFT,
-                               align_size >> PAGE_SHIFT, NULL, false);
+                               align_size >> PAGE_SHIFT, &restrictions);
        } else {
                error = kasan_add_zero_shadow(__va(align_start), align_size);
                if (error) {
@@ -222,8 +227,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
                        goto err_kasan;
                }
 
-               error = arch_add_memory(nid, align_start, align_size, altmap,
-                               false);
+               error = arch_add_memory(nid, align_start, align_size,
+                                       &restrictions);
        }
 
        if (!error) {
index 12df0e5..bdbfe8d 100644 (file)
@@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
        ((unsigned long)prctl_map->__m1 __op                            \
         (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
        error  = __prctl_check_order(start_code, <, end_code);
-       error |= __prctl_check_order(start_data, <, end_data);
+       error |= __prctl_check_order(start_data, <=, end_data);
        error |= __prctl_check_order(start_brk, <=, brk);
        error |= __prctl_check_order(arg_start, <=, arg_end);
        error |= __prctl_check_order(env_start, <=, env_end);
index 599510a..ba158f6 100644 (file)
@@ -66,6 +66,7 @@
 #include <linux/kexec.h>
 #include <linux/bpf.h>
 #include <linux/mount.h>
+#include <linux/userfaultfd_k.h>
 
 #include "../lib/kstrtox.h"
 
@@ -1719,6 +1720,17 @@ static struct ctl_table vm_table[] = {
                .extra1         = (void *)&mmap_rnd_compat_bits_min,
                .extra2         = (void *)&mmap_rnd_compat_bits_max,
        },
+#endif
+#ifdef CONFIG_USERFAULTFD
+       {
+               .procname       = "unprivileged_userfaultfd",
+               .data           = &sysctl_unprivileged_userfaultfd,
+               .maxlen         = sizeof(sysctl_unprivileged_userfaultfd),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
 #endif
        { }
 };
index b396d32..f74fa83 100644 (file)
@@ -1293,7 +1293,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
                        len = maxpages * PAGE_SIZE;
                addr &= ~(PAGE_SIZE - 1);
                n = DIV_ROUND_UP(len, PAGE_SIZE);
-               res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages);
+               res = get_user_pages_fast(addr, n,
+                               iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
+                               pages);
                if (unlikely(res < 0))
                        return res;
                return (res == n ? len : res * PAGE_SIZE) - *start;
@@ -1374,7 +1376,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
                p = get_pages_array(n);
                if (!p)
                        return -ENOMEM;
-               res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p);
+               res = get_user_pages_fast(addr, n,
+                               iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
                if (unlikely(res < 0)) {
                        kvfree(p);
                        return res;
index 25c71eb..ee8d1f3 100644 (file)
@@ -11,23 +11,24 @@ choice
        default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
        default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
        default FLATMEM_MANUAL
+       help
+         This option allows you to change some of the ways that
+         Linux manages its memory internally. Most users will
+         only have one option here selected by the architecture
+         configuration. This is normal.
 
 config FLATMEM_MANUAL
        bool "Flat Memory"
        depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
        help
-         This option allows you to change some of the ways that
-         Linux manages its memory internally.  Most users will
-         only have one option here: FLATMEM.  This is normal
-         and a correct option.
-
-         Some users of more advanced features like NUMA and
-         memory hotplug may have different options here.
-         DISCONTIGMEM is a more mature, better tested system,
-         but is incompatible with memory hotplug and may suffer
-         decreased performance over SPARSEMEM.  If unsure between
-         "Sparse Memory" and "Discontiguous Memory", choose
-         "Discontiguous Memory".
+         This option is best suited for non-NUMA systems with
+         flat address space. FLATMEM is the most efficient memory
+         model in terms of performance and resource consumption,
+         and it is the best option for smaller systems.
+
+         For systems that have holes in their physical address
+         spaces and for features like NUMA and memory hotplug,
+         choose "Sparse Memory"
 
          If unsure, choose this option (Flat Memory) over any other.
 
@@ -38,29 +39,26 @@ config DISCONTIGMEM_MANUAL
          This option provides enhanced support for discontiguous
          memory systems, over FLATMEM.  These systems have holes
          in their physical address spaces, and this option provides
-         more efficient handling of these holes.  However, the vast
-         majority of hardware has quite flat address spaces, and
-         can have degraded performance from the extra overhead that
-         this option imposes.
+         more efficient handling of these holes.
 
-         Many NUMA configurations will have this as the only option.
+         Although "Discontiguous Memory" is still used by several
+         architectures, it is considered deprecated in favor of
+         "Sparse Memory".
 
-         If unsure, choose "Flat Memory" over this option.
+         If unsure, choose "Sparse Memory" over this option.
 
 config SPARSEMEM_MANUAL
        bool "Sparse Memory"
        depends on ARCH_SPARSEMEM_ENABLE
        help
          This will be the only option for some systems, including
-         memory hotplug systems.  This is normal.
+         memory hot-plug systems.  This is normal.
 
-         For many other systems, this will be an alternative to
-         "Discontiguous Memory".  This option provides some potential
-         performance benefits, along with decreased code complexity,
-         but it is newer, and more experimental.
+         This option provides efficient support for systems with
+         holes in their physical address space and allows memory
+         hot-plug and hot-remove.
 
-         If unsure, choose "Discontiguous Memory" or "Flat Memory"
-         over this option.
+         If unsure, choose "Flat Memory" over this option.
 
 endchoice
 
@@ -136,7 +134,7 @@ config HAVE_MEMBLOCK_PHYS_MAP
 config HAVE_GENERIC_GUP
        bool
 
-config ARCH_DISCARD_MEMBLOCK
+config ARCH_KEEP_MEMBLOCK
        bool
 
 config MEMORY_ISOLATION
@@ -161,7 +159,6 @@ config MEMORY_HOTPLUG_SPARSE
 
 config MEMORY_HOTPLUG_DEFAULT_ONLINE
         bool "Online the newly added memory blocks by default"
-        default n
         depends on MEMORY_HOTPLUG
         help
          This option sets the default policy setting for memory hotplug
@@ -258,6 +255,9 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
 config ARCH_ENABLE_THP_MIGRATION
        bool
 
+config CONTIG_ALLOC
+       def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
+
 config PHYS_ADDR_T_64BIT
        def_bool 64BIT
 
@@ -436,7 +436,6 @@ config NEED_PER_CPU_KM
 
 config CLEANCACHE
        bool "Enable cleancache driver to cache clean pages if tmem is present"
-       default n
        help
          Cleancache can be thought of as a page-granularity victim cache
          for clean pages that the kernel's pageframe replacement algorithm
@@ -460,7 +459,6 @@ config CLEANCACHE
 config FRONTSWAP
        bool "Enable frontswap to cache swap pages if tmem is present"
        depends on SWAP
-       default n
        help
          Frontswap is so named because it can be thought of as the opposite
          of a "backing" store for a swap device.  The data is stored into
@@ -532,7 +530,6 @@ config ZSWAP
        depends on FRONTSWAP && CRYPTO=y
        select CRYPTO_LZO
        select ZPOOL
-       default n
        help
          A lightweight compressed cache for swap pages.  It takes
          pages that are in the process of being swapped out and attempts to
@@ -549,14 +546,12 @@ config ZSWAP
 
 config ZPOOL
        tristate "Common API for compressed memory storage"
-       default n
        help
          Compressed memory storage API.  This allows using either zbud or
          zsmalloc.
 
 config ZBUD
        tristate "Low (Up to 2x) density storage for compressed pages"
-       default n
        help
          A special purpose allocator for storing compressed pages.
          It is designed to store up to two compressed pages per physical
@@ -567,7 +562,6 @@ config ZBUD
 config Z3FOLD
        tristate "Up to 3x density storage for compressed pages"
        depends on ZPOOL
-       default n
        help
          A special purpose allocator for storing compressed pages.
          It is designed to store up to three compressed pages per physical
@@ -577,7 +571,6 @@ config Z3FOLD
 config ZSMALLOC
        tristate "Memory allocator for compressed pages"
        depends on MMU
-       default n
        help
          zsmalloc is a slab-based memory allocator designed to store
          compressed RAM pages.  zsmalloc uses virtual memory mapping
@@ -628,7 +621,6 @@ config MAX_STACK_SIZE_MB
 
 config DEFERRED_STRUCT_PAGE_INIT
        bool "Defer initialisation of struct pages to kthreads"
-       default n
        depends on SPARSEMEM
        depends on !NEED_PER_CPU_KM
        depends on 64BIT
@@ -676,6 +668,22 @@ config ZONE_DEVICE
 
          If FS_DAX is enabled, then say Y.
 
+config ARCH_HAS_HMM_MIRROR
+       bool
+       default y
+       depends on (X86_64 || PPC64)
+       depends on MMU && 64BIT
+
+config ARCH_HAS_HMM_DEVICE
+       bool
+       default y
+       depends on (X86_64 || PPC64)
+       depends on MEMORY_HOTPLUG
+       depends on MEMORY_HOTREMOVE
+       depends on SPARSEMEM_VMEMMAP
+       depends on ARCH_HAS_ZONE_DEVICE
+       select XARRAY_MULTI
+
 config ARCH_HAS_HMM
        bool
        default y
@@ -694,12 +702,12 @@ config DEV_PAGEMAP_OPS
 
 config HMM
        bool
+       select MMU_NOTIFIER
        select MIGRATE_VMA_HELPER
 
 config HMM_MIRROR
        bool "HMM mirror CPU page table into a device page table"
        depends on ARCH_HAS_HMM
-       select MMU_NOTIFIER
        select HMM
        help
          Select HMM_MIRROR if you want to mirror range of the CPU page table of a
@@ -740,7 +748,6 @@ config ARCH_HAS_PKEYS
 
 config PERCPU_STATS
        bool "Collect percpu memory statistics"
-       default n
        help
          This feature collects and exposes statistics via debugfs. The
          information includes global and per chunk statistics, which can
@@ -748,7 +755,6 @@ config PERCPU_STATS
 
 config GUP_BENCHMARK
        bool "Enable infrastructure for get_user_pages_fast() benchmarking"
-       default n
        help
          Provides /sys/kernel/debug/gup_benchmark that helps with testing
          performance of get_user_pages_fast().
index e3df921..e980ceb 100644
@@ -33,7 +33,6 @@ config DEBUG_PAGEALLOC
 
 config DEBUG_PAGEALLOC_ENABLE_DEFAULT
        bool "Enable debug page memory allocations by default?"
-       default n
        depends on DEBUG_PAGEALLOC
        ---help---
          Enable debug page memory allocations by default? This value
index bb2d333..5e36d74 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -106,8 +106,10 @@ static int __init cma_activate_area(struct cma *cma)
 
        cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
 
-       if (!cma->bitmap)
+       if (!cma->bitmap) {
+               cma->count = 0;
                return -ENOMEM;
+       }
 
        WARN_ON_ONCE(!pfn_valid(pfn));
        zone = page_zone(pfn_to_page(pfn));
@@ -367,23 +369,26 @@ err:
 #ifdef CONFIG_CMA_DEBUG
 static void cma_debug_show_areas(struct cma *cma)
 {
-       unsigned long next_zero_bit, next_set_bit;
+       unsigned long next_zero_bit, next_set_bit, nr_zero;
        unsigned long start = 0;
-       unsigned int nr_zero, nr_total = 0;
+       unsigned long nr_part, nr_total = 0;
+       unsigned long nbits = cma_bitmap_maxno(cma);
 
        mutex_lock(&cma->lock);
        pr_info("number of available pages: ");
        for (;;) {
-               next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start);
-               if (next_zero_bit >= cma->count)
+               next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
+               if (next_zero_bit >= nbits)
                        break;
-               next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit);
+               next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
                nr_zero = next_set_bit - next_zero_bit;
-               pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit);
-               nr_total += nr_zero;
+               nr_part = nr_zero << cma->order_per_bit;
+               pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
+                       next_zero_bit);
+               nr_total += nr_part;
                start = next_zero_bit + nr_zero;
        }
-       pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count);
+       pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
        mutex_unlock(&cma->lock);
 }
 #else
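
The reporting change above scales bitmap bits into pages; a stand-alone illustration of the arithmetic with made-up values:

    #include <stdio.h>

    /* Each CMA bitmap bit covers (1 << order_per_bit) pages, so a run of
     * clear bits must be scaled before being reported as free pages. */
    int main(void)
    {
            unsigned long order_per_bit = 2;    /* example value */
            unsigned long nr_zero = 5;          /* clear bits in one run */
            unsigned long nr_part = nr_zero << order_per_bit;

            printf("%lu clear bits => %lu free pages\n", nr_zero, nr_part);
            return 0;
    }
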
index 8d7b2fd..a7dd9e8 100644
@@ -56,7 +56,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
        mutex_lock(&cma->lock);
        for (;;) {
                start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
-               if (start >= cma->count)
+               if (start >= bitmap_maxno)
                        break;
                end = find_next_bit(cma->bitmap, bitmap_maxno, start);
                maxchunk = max(end - start, maxchunk);
index 3319e08..6cc4bea 100644
@@ -1164,7 +1164,9 @@ static bool suitable_migration_target(struct compact_control *cc,
 static inline unsigned int
 freelist_scan_limit(struct compact_control *cc)
 {
-       return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
+       unsigned short shift = BITS_PER_LONG - 1;
+
+       return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1;
 }
 
 /*
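
The clamp above guards against an out-of-range shift once fast_search_fail grows large; a stand-alone sketch of the same idea (COMPACT_CLUSTER_MAX is 32 in the kernel, assumed here for illustration):

    #include <stdio.h>

    #define COMPACT_CLUSTER_MAX 32UL
    #define BITS_PER_LONG (8 * sizeof(long))

    /* Shifting by the type width or more is undefined behaviour in C,
     * so the shift count is capped at BITS_PER_LONG - 1 first. */
    static unsigned long scan_limit(unsigned long fail)
    {
            unsigned long shift = BITS_PER_LONG - 1;

            return (COMPACT_CLUSTER_MAX >> (fail < shift ? fail : shift)) + 1;
    }

    int main(void)
    {
            printf("%lu %lu\n", scan_limit(2), scan_limit(200)); /* 9 1 */
            return 0;
    }
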
index d78f577..c5af80c 100644
@@ -24,6 +24,7 @@
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/uio.h>
+#include <linux/error-injection.h>
 #include <linux/hash.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
@@ -279,11 +280,11 @@ EXPORT_SYMBOL(delete_from_page_cache);
  * @pvec: pagevec with pages to delete
  *
  * The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index.
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
  * It tolerates holes in @pvec (mapping entries at those indices are not
  * modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the
- * mapping as well.
+ * @pvec.
  *
  * The function expects the i_pages lock to be held.
  */
@@ -292,40 +293,44 @@ static void page_cache_delete_batch(struct address_space *mapping,
 {
        XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
        int total_pages = 0;
-       int i = 0, tail_pages = 0;
+       int i = 0;
        struct page *page;
 
        mapping_set_update(&xas, mapping);
        xas_for_each(&xas, page, ULONG_MAX) {
-               if (i >= pagevec_count(pvec) && !tail_pages)
+               if (i >= pagevec_count(pvec))
                        break;
+
+               /* A swap/dax/shadow entry got inserted? Skip it. */
                if (xa_is_value(page))
                        continue;
-               if (!tail_pages) {
-                       /*
-                        * Some page got inserted in our range? Skip it. We
-                        * have our pages locked so they are protected from
-                        * being removed.
-                        */
-                       if (page != pvec->pages[i]) {
-                               VM_BUG_ON_PAGE(page->index >
-                                               pvec->pages[i]->index, page);
-                               continue;
-                       }
-                       WARN_ON_ONCE(!PageLocked(page));
-                       if (PageTransHuge(page) && !PageHuge(page))
-                               tail_pages = HPAGE_PMD_NR - 1;
+               /*
+                * A page got inserted in our range? Skip it. We have our
+                * pages locked so they are protected from being removed.
+                * If we see a page whose index is higher than ours, it
+                * means our page has been removed, which shouldn't be
+                * possible because we're holding the PageLock.
+                */
+               if (page != pvec->pages[i]) {
+                       VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+                                       page);
+                       continue;
+               }
+
+               WARN_ON_ONCE(!PageLocked(page));
+
+               if (page->index == xas.xa_index)
                        page->mapping = NULL;
-                       /*
-                        * Leave page->index set: truncation lookup relies
-                        * upon it
-                        */
+               /* Leave page->index set: truncation lookup relies on it */
+
+               /*
+                * Move to the next page in the vector if this is a regular
+                * page or the index is of the last sub-page of this compound
+                * page.
+                */
+               if (page->index + (1UL << compound_order(page)) - 1 ==
+                               xas.xa_index)
                        i++;
-               } else {
-                       VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
-                                       != pvec->pages[i]->index, page);
-                       tail_pages--;
-               }
                xas_store(&xas, NULL);
                total_pages++;
        }
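
The cursor-advance test above is plain index arithmetic; a stand-alone illustration with example numbers (a PMD-sized THP on x86-64 has order 9):

    #include <stdio.h>

    /* A compound page of order N covers (1 << N) consecutive page-cache
     * indices starting at its head index; the batch cursor only advances
     * once the XArray index reaches the last of them. */
    int main(void)
    {
            unsigned long head_index = 512;     /* example head index */
            unsigned int order = 9;             /* 512 sub-pages */
            unsigned long last = head_index + (1UL << order) - 1;

            printf("sub-pages span %lu..%lu, advance at %lu\n",
                   head_index, last, last);
            return 0;
    }
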
@@ -878,6 +883,7 @@ error:
        put_page(page);
        return xas_error(&xas);
 }
+ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
 
 /**
  * add_to_page_cache_locked - add a locked page to the pagecache
@@ -1440,7 +1446,7 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
 EXPORT_SYMBOL(page_cache_next_miss);
 
 /**
- * page_cache_prev_miss() - Find the next gap in the page cache.
+ * page_cache_prev_miss() - Find the previous gap in the page cache.
  * @mapping: Mapping.
  * @index: Index.
  * @max_scan: Maximum range to search.
@@ -1491,7 +1497,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
 {
        XA_STATE(xas, &mapping->i_pages, offset);
-       struct page *head, *page;
+       struct page *page;
 
        rcu_read_lock();
 repeat:
@@ -1506,25 +1512,19 @@ repeat:
        if (!page || xa_is_value(page))
                goto out;
 
-       head = compound_head(page);
-       if (!page_cache_get_speculative(head))
+       if (!page_cache_get_speculative(page))
                goto repeat;
 
-       /* The page was split under us? */
-       if (compound_head(page) != head) {
-               put_page(head);
-               goto repeat;
-       }
-
        /*
-        * Has the page moved?
+        * Has the page moved or been split?
         * This is part of the lockless pagecache protocol. See
         * include/linux/pagemap.h for details.
         */
        if (unlikely(page != xas_reload(&xas))) {
-               put_page(head);
+               put_page(page);
                goto repeat;
        }
+       page = find_subpage(page, offset);
 out:
        rcu_read_unlock();
 
@@ -1706,7 +1706,6 @@ unsigned find_get_entries(struct address_space *mapping,
 
        rcu_read_lock();
        xas_for_each(&xas, page, ULONG_MAX) {
-               struct page *head;
                if (xas_retry(&xas, page))
                        continue;
                /*
@@ -1717,17 +1716,13 @@ unsigned find_get_entries(struct address_space *mapping,
                if (xa_is_value(page))
                        goto export;
 
-               head = compound_head(page);
-               if (!page_cache_get_speculative(head))
+               if (!page_cache_get_speculative(page))
                        goto retry;
 
-               /* The page was split under us? */
-               if (compound_head(page) != head)
-                       goto put_page;
-
-               /* Has the page moved? */
+               /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas)))
                        goto put_page;
+               page = find_subpage(page, xas.xa_index);
 
 export:
                indices[ret] = xas.xa_index;
@@ -1736,7 +1731,7 @@ export:
                        break;
                continue;
 put_page:
-               put_page(head);
+               put_page(page);
 retry:
                xas_reset(&xas);
        }
@@ -1778,33 +1773,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
 
        rcu_read_lock();
        xas_for_each(&xas, page, end) {
-               struct page *head;
                if (xas_retry(&xas, page))
                        continue;
                /* Skip over shadow, swap and DAX entries */
                if (xa_is_value(page))
                        continue;
 
-               head = compound_head(page);
-               if (!page_cache_get_speculative(head))
+               if (!page_cache_get_speculative(page))
                        goto retry;
 
-               /* The page was split under us? */
-               if (compound_head(page) != head)
-                       goto put_page;
-
-               /* Has the page moved? */
+               /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas)))
                        goto put_page;
 
-               pages[ret] = page;
+               pages[ret] = find_subpage(page, xas.xa_index);
                if (++ret == nr_pages) {
                        *start = xas.xa_index + 1;
                        goto out;
                }
                continue;
 put_page:
-               put_page(head);
+               put_page(page);
 retry:
                xas_reset(&xas);
        }
@@ -1849,7 +1838,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 
        rcu_read_lock();
        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
-               struct page *head;
                if (xas_retry(&xas, page))
                        continue;
                /*
@@ -1859,24 +1847,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
                if (xa_is_value(page))
                        break;
 
-               head = compound_head(page);
-               if (!page_cache_get_speculative(head))
+               if (!page_cache_get_speculative(page))
                        goto retry;
 
-               /* The page was split under us? */
-               if (compound_head(page) != head)
-                       goto put_page;
-
-               /* Has the page moved? */
+               /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas)))
                        goto put_page;
 
-               pages[ret] = page;
+               pages[ret] = find_subpage(page, xas.xa_index);
                if (++ret == nr_pages)
                        break;
                continue;
 put_page:
-               put_page(head);
+               put_page(page);
 retry:
                xas_reset(&xas);
        }
@@ -1912,7 +1895,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
 
        rcu_read_lock();
        xas_for_each_marked(&xas, page, end, tag) {
-               struct page *head;
                if (xas_retry(&xas, page))
                        continue;
                /*
@@ -1923,26 +1905,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
                if (xa_is_value(page))
                        continue;
 
-               head = compound_head(page);
-               if (!page_cache_get_speculative(head))
+               if (!page_cache_get_speculative(page))
                        goto retry;
 
-               /* The page was split under us? */
-               if (compound_head(page) != head)
-                       goto put_page;
-
-               /* Has the page moved? */
+               /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas)))
                        goto put_page;
 
-               pages[ret] = page;
+               pages[ret] = find_subpage(page, xas.xa_index);
                if (++ret == nr_pages) {
                        *index = xas.xa_index + 1;
                        goto out;
                }
                continue;
 put_page:
-               put_page(head);
+               put_page(page);
 retry:
                xas_reset(&xas);
        }
@@ -1964,72 +1941,6 @@ out:
 }
 EXPORT_SYMBOL(find_get_pages_range_tag);
 
-/**
- * find_get_entries_tag - find and return entries that match @tag
- * @mapping:   the address_space to search
- * @start:     the starting page cache index
- * @tag:       the tag index
- * @nr_entries:        the maximum number of entries
- * @entries:   where the resulting entries are placed
- * @indices:   the cache indices corresponding to the entries in @entries
- *
- * Like find_get_entries, except we only return entries which are tagged with
- * @tag.
- *
- * Return: the number of entries which were found.
- */
-unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
-                       xa_mark_t tag, unsigned int nr_entries,
-                       struct page **entries, pgoff_t *indices)
-{
-       XA_STATE(xas, &mapping->i_pages, start);
-       struct page *page;
-       unsigned int ret = 0;
-
-       if (!nr_entries)
-               return 0;
-
-       rcu_read_lock();
-       xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
-               struct page *head;
-               if (xas_retry(&xas, page))
-                       continue;
-               /*
-                * A shadow entry of a recently evicted page, a swap
-                * entry from shmem/tmpfs or a DAX entry.  Return it
-                * without attempting to raise page count.
-                */
-               if (xa_is_value(page))
-                       goto export;
-
-               head = compound_head(page);
-               if (!page_cache_get_speculative(head))
-                       goto retry;
-
-               /* The page was split under us? */
-               if (compound_head(page) != head)
-                       goto put_page;
-
-               /* Has the page moved? */
-               if (unlikely(page != xas_reload(&xas)))
-                       goto put_page;
-
-export:
-               indices[ret] = xas.xa_index;
-               entries[ret] = page;
-               if (++ret == nr_entries)
-                       break;
-               continue;
-put_page:
-               put_page(head);
-retry:
-               xas_reset(&xas);
-       }
-       rcu_read_unlock();
-       return ret;
-}
-EXPORT_SYMBOL(find_get_entries_tag);
-
 /*
  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
  * a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2691,7 +2602,7 @@ void filemap_map_pages(struct vm_fault *vmf,
        pgoff_t last_pgoff = start_pgoff;
        unsigned long max_idx;
        XA_STATE(xas, &mapping->i_pages, start_pgoff);
-       struct page *head, *page;
+       struct page *page;
 
        rcu_read_lock();
        xas_for_each(&xas, page, end_pgoff) {
@@ -2700,24 +2611,19 @@ void filemap_map_pages(struct vm_fault *vmf,
                if (xa_is_value(page))
                        goto next;
 
-               head = compound_head(page);
-
                /*
                 * Check for a locked page first, as a speculative
                 * reference may adversely influence page migration.
                 */
-               if (PageLocked(head))
+               if (PageLocked(page))
                        goto next;
-               if (!page_cache_get_speculative(head))
+               if (!page_cache_get_speculative(page))
                        goto next;
 
-               /* The page was split under us? */
-               if (compound_head(page) != head)
-                       goto skip;
-
-               /* Has the page moved? */
+               /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas)))
                        goto skip;
+               page = find_subpage(page, xas.xa_index);
 
                if (!PageUptodate(page) ||
                                PageReadahead(page) ||
index 91819b8..2c08248 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -28,6 +28,111 @@ struct follow_page_context {
        unsigned int page_mask;
 };
 
+typedef int (*set_dirty_func_t)(struct page *page);
+
+static void __put_user_pages_dirty(struct page **pages,
+                                  unsigned long npages,
+                                  set_dirty_func_t sdf)
+{
+       unsigned long index;
+
+       for (index = 0; index < npages; index++) {
+               struct page *page = compound_head(pages[index]);
+
+               /*
+                * Checking PageDirty at this point may race with
+                * clear_page_dirty_for_io(), but that's OK. Two key cases:
+                *
+                * 1) This code sees the page as already dirty, so it skips
+                * the call to sdf(). That could happen because
+                * clear_page_dirty_for_io() called page_mkclean(),
+                * followed by set_page_dirty(). However, now the page is
+                * going to get written back, which meets the original
+                * intention of setting it dirty, so all is well:
+                * clear_page_dirty_for_io() goes on to call
+                * TestClearPageDirty(), and write the page back.
+                *
+                * 2) This code sees the page as clean, so it calls sdf().
+                * The page stays dirty, despite being written back, so it
+                * gets written back again in the next writeback cycle.
+                * This is harmless.
+                */
+               if (!PageDirty(page))
+                       sdf(page);
+
+               put_user_page(page);
+       }
+}
+
+/**
+ * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
+ * @pages:  array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * "gup-pinned page" refers to a page that has had one of the get_user_pages()
+ * variants called on that page.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if it was previously listed as clean. Then, release
+ * the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ *
+ * set_page_dirty(), which does not lock the page, is used here.
+ * Therefore, it is the caller's responsibility to ensure that this is
+ * safe. If not, then put_user_pages_dirty_lock() should be called instead.
+ *
+ */
+void put_user_pages_dirty(struct page **pages, unsigned long npages)
+{
+       __put_user_pages_dirty(pages, npages, set_page_dirty);
+}
+EXPORT_SYMBOL(put_user_pages_dirty);
+
+/**
+ * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
+ * @pages:  array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if it was previously listed as clean. Then, release
+ * the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ *
+ * This is just like put_user_pages_dirty(), except that it invokes
+ * set_page_dirty_lock(), instead of set_page_dirty().
+ *
+ */
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
+{
+       __put_user_pages_dirty(pages, npages, set_page_dirty_lock);
+}
+EXPORT_SYMBOL(put_user_pages_dirty_lock);
+
+/**
+ * put_user_pages() - release an array of gup-pinned pages.
+ * @pages:  array of pages to be released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, release the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ */
+void put_user_pages(struct page **pages, unsigned long npages)
+{
+       unsigned long index;
+
+       /*
+        * TODO: this can be optimized for huge pages: if a series of pages is
+        * physically contiguous and part of the same compound page, then a
+        * single operation to the head page should suffice.
+        */
+       for (index = 0; index < npages; index++)
+               put_user_page(pages[index]);
+}
+EXPORT_SYMBOL(put_user_pages);
+
 static struct page *no_page_table(struct vm_area_struct *vma,
                unsigned int flags)
 {
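
The new helpers pair naturally with get_user_pages_fast(); below is a kernel-context sketch of a driver-style pin/use/release sequence (hypothetical function, simplified error handling, builds only inside a kernel tree):

    #include <linux/mm.h>

    /* Sketch only: pin a user buffer for a write-style operation, then
     * release it with the bulk helper instead of an open-coded loop of
     * set_page_dirty_lock() + put_page(). */
    static int pin_touch_release(unsigned long uaddr, int npages)
    {
            struct page **pages;
            int pinned;

            pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
            if (!pages)
                    return -ENOMEM;

            pinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
            if (pinned < 0) {
                    kvfree(pages);
                    return pinned;
            }

            /* ... device or CPU writes into the pinned pages go here ... */

            put_user_pages_dirty_lock(pages, pinned);
            kvfree(pages);
            return 0;
    }
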
@@ -1018,6 +1123,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked)
 {
+       /*
+        * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+        * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+        * vmas.  As there are no users of this flag in this call we simply
+        * disallow this option for now.
+        */
+       if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+               return -EINVAL;
+
        return __get_user_pages_locked(current, current->mm, start, nr_pages,
                                       pages, NULL, locked,
                                       gup_flags | FOLL_TOUCH);
@@ -1046,6 +1160,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
        int locked = 1;
        long ret;
 
+       /*
+        * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+        * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+        * vmas.  As there are no users of this flag in this call we simply
+        * disallow this option for now.
+        */
+       if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+               return -EINVAL;
+
        down_read(&mm->mmap_sem);
        ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
                                      &locked, gup_flags | FOLL_TOUCH);
@@ -1116,32 +1239,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
                unsigned int gup_flags, struct page **pages,
                struct vm_area_struct **vmas, int *locked)
 {
+       /*
+        * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+        * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+        * vmas.  As there are no users of this flag in this call we simply
+        * disallow this option for now.
+        */
+       if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+               return -EINVAL;
+
        return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
                                       locked,
                                       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
 }
 EXPORT_SYMBOL(get_user_pages_remote);
 
-/*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter.  We also obviously don't pass
- * FOLL_REMOTE in here.
- */
-long get_user_pages(unsigned long start, unsigned long nr_pages,
-               unsigned int gup_flags, struct page **pages,
-               struct vm_area_struct **vmas)
-{
-       return __get_user_pages_locked(current, current->mm, start, nr_pages,
-                                      pages, vmas, NULL,
-                                      gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages);
-
 #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
-
-#ifdef CONFIG_FS_DAX
 static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
 {
        long i;
@@ -1160,12 +1273,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
        }
        return false;
 }
-#else
-static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
-{
-       return false;
-}
-#endif
 
 #ifdef CONFIG_CMA
 static struct page *new_non_cma_page(struct page *page, unsigned long private)
@@ -1219,10 +1326,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private)
        return __alloc_pages_node(nid, gfp_mask, 0);
 }
 
-static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
-                                       unsigned int gup_flags,
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+                                       struct mm_struct *mm,
+                                       unsigned long start,
+                                       unsigned long nr_pages,
                                        struct page **pages,
-                                       struct vm_area_struct **vmas)
+                                       struct vm_area_struct **vmas,
+                                       unsigned int gup_flags)
 {
        long i;
        bool drain_allow = true;
@@ -1278,10 +1388,14 @@ check_again:
                                putback_movable_pages(&cma_page_list);
                }
                /*
-                * We did migrate all the pages, Try to get the page references again
-                * migrating any new CMA pages which we failed to isolate earlier.
+                * We did migrate all the pages; try to get the page references
+                * again, migrating any new CMA pages which we failed to isolate
+                * earlier.
                 */
-               nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+               nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
+                                                  pages, vmas, NULL,
+                                                  gup_flags);
+
                if ((nr_pages > 0) && migrate_allow) {
                        drain_allow = true;
                        goto check_again;
@@ -1291,66 +1405,101 @@ check_again:
        return nr_pages;
 }
 #else
-static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
-                                              unsigned int gup_flags,
-                                              struct page **pages,
-                                              struct vm_area_struct **vmas)
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+                                       struct mm_struct *mm,
+                                       unsigned long start,
+                                       unsigned long nr_pages,
+                                       struct page **pages,
+                                       struct vm_area_struct **vmas,
+                                       unsigned int gup_flags)
 {
        return nr_pages;
 }
 #endif
 
 /*
- * This is the same as get_user_pages() in that it assumes we are
- * operating on the current task's mm, but it goes further to validate
- * that the vmas associated with the address range are suitable for
- * longterm elevated page reference counts. For example, filesystem-dax
- * mappings are subject to the lifetime enforced by the filesystem and
- * we need guarantees that longterm users like RDMA and V4L2 only
- * establish mappings that have a kernel enforced revocation mechanism.
- *
- * "longterm" == userspace controlled elevated page count lifetime.
- * Contrast this to iov_iter_get_pages() usages which are transient.
+ * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
+ * allows us to process the FOLL_LONGTERM flag.
  */
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
-                            unsigned int gup_flags, struct page **pages,
-                            struct vm_area_struct **vmas_arg)
+static long __gup_longterm_locked(struct task_struct *tsk,
+                                 struct mm_struct *mm,
+                                 unsigned long start,
+                                 unsigned long nr_pages,
+                                 struct page **pages,
+                                 struct vm_area_struct **vmas,
+                                 unsigned int gup_flags)
 {
-       struct vm_area_struct **vmas = vmas_arg;
-       unsigned long flags;
+       struct vm_area_struct **vmas_tmp = vmas;
+       unsigned long flags = 0;
        long rc, i;
 
-       if (!pages)
-               return -EINVAL;
-
-       if (!vmas) {
-               vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
-                              GFP_KERNEL);
-               if (!vmas)
-                       return -ENOMEM;
+       if (gup_flags & FOLL_LONGTERM) {
+               if (!pages)
+                       return -EINVAL;
+
+               if (!vmas_tmp) {
+                       vmas_tmp = kcalloc(nr_pages,
+                                          sizeof(struct vm_area_struct *),
+                                          GFP_KERNEL);
+                       if (!vmas_tmp)
+                               return -ENOMEM;
+               }
+               flags = memalloc_nocma_save();
        }
 
-       flags = memalloc_nocma_save();
-       rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
-       memalloc_nocma_restore(flags);
-       if (rc < 0)
-               goto out;
+       rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+                                    vmas_tmp, NULL, gup_flags);
 
-       if (check_dax_vmas(vmas, rc)) {
-               for (i = 0; i < rc; i++)
-                       put_page(pages[i]);
-               rc = -EOPNOTSUPP;
-               goto out;
+       if (gup_flags & FOLL_LONGTERM) {
+               memalloc_nocma_restore(flags);
+               if (rc < 0)
+                       goto out;
+
+               if (check_dax_vmas(vmas_tmp, rc)) {
+                       for (i = 0; i < rc; i++)
+                               put_page(pages[i]);
+                       rc = -EOPNOTSUPP;
+                       goto out;
+               }
+
+               rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+                                                vmas_tmp, gup_flags);
        }
 
-       rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
 out:
-       if (vmas != vmas_arg)
-               kfree(vmas);
+       if (vmas_tmp != vmas)
+               kfree(vmas_tmp);
        return rc;
 }
-EXPORT_SYMBOL(get_user_pages_longterm);
-#endif /* CONFIG_FS_DAX */
+#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+                                                 struct mm_struct *mm,
+                                                 unsigned long start,
+                                                 unsigned long nr_pages,
+                                                 struct page **pages,
+                                                 struct vm_area_struct **vmas,
+                                                 unsigned int flags)
+{
+       return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+                                      NULL, flags);
+}
+#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter.  We also obviously don't pass
+ * FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+               unsigned int gup_flags, struct page **pages,
+               struct vm_area_struct **vmas)
+{
+       return __gup_longterm_locked(current, current->mm, start, nr_pages,
+                                    pages, vmas, gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages);
 
 /**
  * populate_vma_page_range() -  populate a range of pages in the vma.
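
With the wrapper above, long-term pinning is expressed as a gup flag rather than a separate entry point; a hedged before/after fragment (variable names hypothetical):

    /* Before this series: dedicated API for long-lived pins. */
    ret = get_user_pages_longterm(uaddr, nr, FOLL_WRITE, pages, NULL);

    /* After: same semantics via FOLL_LONGTERM on the regular call. */
    ret = get_user_pages(uaddr, nr, FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
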
@@ -1571,7 +1720,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
 
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-                        int write, struct page **pages, int *nr)
+                        unsigned int flags, struct page **pages, int *nr)
 {
        struct dev_pagemap *pgmap = NULL;
        int nr_start = *nr, ret = 0;
@@ -1589,10 +1738,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                if (pte_protnone(pte))
                        goto pte_unmap;
 
-               if (!pte_access_permitted(pte, write))
+               if (!pte_access_permitted(pte, flags & FOLL_WRITE))
                        goto pte_unmap;
 
                if (pte_devmap(pte)) {
+                       if (unlikely(flags & FOLL_LONGTERM))
+                               goto pte_unmap;
+
                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
                        if (unlikely(!pgmap)) {
                                undo_dev_pagemap(nr, nr_start, pages);
@@ -1641,7 +1793,7 @@ pte_unmap:
  * useful to have gup_huge_pmd even if we can't operate on ptes.
  */
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-                        int write, struct page **pages, int *nr)
+                        unsigned int flags, struct page **pages, int *nr)
 {
        return 0;
 }
@@ -1724,16 +1876,19 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
 #endif
 
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
-               unsigned long end, int write, struct page **pages, int *nr)
+               unsigned long end, unsigned int flags, struct page **pages, int *nr)
 {
        struct page *head, *page;
        int refs;
 
-       if (!pmd_access_permitted(orig, write))
+       if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
                return 0;
 
-       if (pmd_devmap(orig))
+       if (pmd_devmap(orig)) {
+               if (unlikely(flags & FOLL_LONGTERM))
+                       return 0;
                return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+       }
 
        refs = 0;
        page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1762,16 +1917,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 }
 
 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
-               unsigned long end, int write, struct page **pages, int *nr)
+               unsigned long end, unsigned int flags, struct page **pages, int *nr)
 {
        struct page *head, *page;
        int refs;
 
-       if (!pud_access_permitted(orig, write))
+       if (!pud_access_permitted(orig, flags & FOLL_WRITE))
                return 0;
 
-       if (pud_devmap(orig))
+       if (pud_devmap(orig)) {
+               if (unlikely(flags & FOLL_LONGTERM))
+                       return 0;
                return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
+       }
 
        refs = 0;
        page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1800,13 +1958,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 }
 
 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
-                       unsigned long end, int write,
+                       unsigned long end, unsigned int flags,
                        struct page **pages, int *nr)
 {
        int refs;
        struct page *head, *page;
 
-       if (!pgd_access_permitted(orig, write))
+       if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
                return 0;
 
        BUILD_BUG_ON(pgd_devmap(orig));
@@ -1837,7 +1995,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 }
 
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-               int write, struct page **pages, int *nr)
+               unsigned int flags, struct page **pages, int *nr)
 {
        unsigned long next;
        pmd_t *pmdp;
@@ -1860,7 +2018,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                        if (pmd_protnone(pmd))
                                return 0;
 
-                       if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+                       if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
                                pages, nr))
                                return 0;
 
@@ -1870,9 +2028,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                         * pmd format and THP pmd format
                         */
                        if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
-                                        PMD_SHIFT, next, write, pages, nr))
+                                        PMD_SHIFT, next, flags, pages, nr))
                                return 0;
-               } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+               } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);
 
@@ -1880,7 +2038,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 }
 
 static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
-                        int write, struct page **pages, int *nr)
+                        unsigned int flags, struct page **pages, int *nr)
 {
        unsigned long next;
        pud_t *pudp;
@@ -1893,14 +2051,14 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
                if (pud_none(pud))
                        return 0;
                if (unlikely(pud_huge(pud))) {
-                       if (!gup_huge_pud(pud, pudp, addr, next, write,
+                       if (!gup_huge_pud(pud, pudp, addr, next, flags,
                                          pages, nr))
                                return 0;
                } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
                        if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
-                                        PUD_SHIFT, next, write, pages, nr))
+                                        PUD_SHIFT, next, flags, pages, nr))
                                return 0;
-               } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+               } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);
 
@@ -1908,7 +2066,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
 }
 
 static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
-                        int write, struct page **pages, int *nr)
+                        unsigned int flags, struct page **pages, int *nr)
 {
        unsigned long next;
        p4d_t *p4dp;
@@ -1923,9 +2081,9 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
                BUILD_BUG_ON(p4d_huge(p4d));
                if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
                        if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
-                                        P4D_SHIFT, next, write, pages, nr))
+                                        P4D_SHIFT, next, flags, pages, nr))
                                return 0;
-               } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
+               } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
                        return 0;
        } while (p4dp++, addr = next, addr != end);
 
@@ -1933,7 +2091,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
 }
 
 static void gup_pgd_range(unsigned long addr, unsigned long end,
-               int write, struct page **pages, int *nr)
+               unsigned int flags, struct page **pages, int *nr)
 {
        unsigned long next;
        pgd_t *pgdp;
@@ -1946,14 +2104,14 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
                if (pgd_none(pgd))
                        return;
                if (unlikely(pgd_huge(pgd))) {
-                       if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+                       if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
                                          pages, nr))
                                return;
                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
-                                        PGDIR_SHIFT, next, write, pages, nr))
+                                        PGDIR_SHIFT, next, flags, pages, nr))
                                return;
-               } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+               } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
                        return;
        } while (pgdp++, addr = next, addr != end);
 }
@@ -2007,18 +2165,41 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
        if (gup_fast_permitted(start, nr_pages)) {
                local_irq_save(flags);
-               gup_pgd_range(start, end, write, pages, &nr);
+               gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
                local_irq_restore(flags);
        }
 
        return nr;
 }
 
+static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
+                                  unsigned int gup_flags, struct page **pages)
+{
+       int ret;
+
+       /*
+        * FIXME: FOLL_LONGTERM does not work with
+        * get_user_pages_unlocked() (see comments in that function)
+        */
+       if (gup_flags & FOLL_LONGTERM) {
+               down_read(&current->mm->mmap_sem);
+               ret = __gup_longterm_locked(current, current->mm,
+                                           start, nr_pages,
+                                           pages, NULL, gup_flags);
+               up_read(&current->mm->mmap_sem);
+       } else {
+               ret = get_user_pages_unlocked(start, nr_pages,
+                                             pages, gup_flags);
+       }
+
+       return ret;
+}
+
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:     starting user address
  * @nr_pages:  number of pages from start to pin
- * @write:     whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
  * @pages:     array that receives pointers to the pages pinned.
  *             Should be at least nr_pages long.
  *
@@ -2030,8 +2211,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
  * requested. If nr_pages is 0 or negative, returns 0. If no pages
  * were pinned, returns -errno.
  */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
-                       struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+                       unsigned int gup_flags, struct page **pages)
 {
        unsigned long addr, len, end;
        int nr = 0, ret = 0;
@@ -2049,7 +2230,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
        if (gup_fast_permitted(start, nr_pages)) {
                local_irq_disable();
-               gup_pgd_range(addr, end, write, pages, &nr);
+               gup_pgd_range(addr, end, gup_flags, pages, &nr);
                local_irq_enable();
                ret = nr;
        }
@@ -2059,8 +2240,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                start += nr << PAGE_SHIFT;
                pages += nr;
 
-               ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
-                               write ? FOLL_WRITE : 0);
+               ret = __gup_longterm_unlocked(start, nr_pages - nr,
+                                             gup_flags, pages);
 
                /* Have to be a bit careful with return values */
                if (nr > 0) {
index 6c0279e..7dd602d 100644
@@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
                                                 pages + i);
                        break;
                case GUP_LONGTERM_BENCHMARK:
-                       nr = get_user_pages_longterm(addr, nr, gup->flags & 1,
-                                                    pages + i, NULL);
+                       nr = get_user_pages(addr, nr,
+                                           (gup->flags & 1) | FOLL_LONGTERM,
+                                           pages + i, NULL);
                        break;
                case GUP_BENCHMARK:
                        nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
index fe1cd87..0db8491 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -30,6 +30,7 @@
 #include <linux/hugetlb.h>
 #include <linux/memremap.h>
 #include <linux/jump_label.h>
+#include <linux/dma-mapping.h>
 #include <linux/mmu_notifier.h>
 #include <linux/memory_hotplug.h>
 
 #if IS_ENABLED(CONFIG_HMM_MIRROR)
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
 
-/*
- * struct hmm - HMM per mm struct
- *
- * @mm: mm struct this HMM struct is bound to
- * @lock: lock protecting ranges list
- * @ranges: list of range being snapshotted
- * @mirrors: list of mirrors for this mm
- * @mmu_notifier: mmu notifier to track updates to CPU page table
- * @mirrors_sem: read/write semaphore protecting the mirrors list
- */
-struct hmm {
-       struct mm_struct        *mm;
-       spinlock_t              lock;
-       struct list_head        ranges;
-       struct list_head        mirrors;
-       struct mmu_notifier     mmu_notifier;
-       struct rw_semaphore     mirrors_sem;
-};
+static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
+{
+       struct hmm *hmm = READ_ONCE(mm->hmm);
 
-/*
- * hmm_register - register HMM against an mm (HMM internal)
+       if (hmm && kref_get_unless_zero(&hmm->kref))
+               return hmm;
+
+       return NULL;
+}
+
+/**
+ * hmm_get_or_create - register HMM against an mm (HMM internal)
  *
  * @mm: mm struct to attach to
+ * Returns: an HMM object, obtained either by taking a reference on the
+ *          existing (per-process) object or by creating a new one.
  *
- * This is not intended to be used directly by device drivers. It allocates an
- * HMM struct if mm does not have one, and initializes it.
+ * This is not intended to be used directly by device drivers. If mm already
+ * has an HMM struct then it gets a reference on it and returns it. Otherwise
+ * it allocates an HMM struct, initializes it, associates it with the mm and
+ * returns it.
  */
-static struct hmm *hmm_register(struct mm_struct *mm)
+static struct hmm *hmm_get_or_create(struct mm_struct *mm)
 {
-       struct hmm *hmm = READ_ONCE(mm->hmm);
+       struct hmm *hmm = mm_get_hmm(mm);
        bool cleanup = false;
 
-       /*
-        * The hmm struct can only be freed once the mm_struct goes away,
-        * hence we should always have pre-allocated an new hmm struct
-        * above.
-        */
        if (hmm)
                return hmm;
 
        hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
        if (!hmm)
                return NULL;
+       init_waitqueue_head(&hmm->wq);
        INIT_LIST_HEAD(&hmm->mirrors);
        init_rwsem(&hmm->mirrors_sem);
        hmm->mmu_notifier.ops = NULL;
        INIT_LIST_HEAD(&hmm->ranges);
-       spin_lock_init(&hmm->lock);
+       mutex_init(&hmm->lock);
+       kref_init(&hmm->kref);
+       hmm->notifiers = 0;
+       hmm->dead = false;
        hmm->mm = mm;
 
        spin_lock(&mm->page_table_lock);
@@ -106,7 +101,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
        if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
                goto error_mm;
 
-       return mm->hmm;
+       return hmm;
 
 error_mm:
        spin_lock(&mm->page_table_lock);
@@ -118,54 +113,60 @@ error:
        return NULL;
 }
 
-void hmm_mm_destroy(struct mm_struct *mm)
+static void hmm_free(struct kref *kref)
 {
-       kfree(mm->hmm);
-}
+       struct hmm *hmm = container_of(kref, struct hmm, kref);
+       struct mm_struct *mm = hmm->mm;
 
-static int hmm_invalidate_range(struct hmm *hmm, bool device,
-                               const struct hmm_update *update)
-{
-       struct hmm_mirror *mirror;
-       struct hmm_range *range;
-
-       spin_lock(&hmm->lock);
-       list_for_each_entry(range, &hmm->ranges, list) {
-               unsigned long addr, idx, npages;
+       mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
 
-               if (update->end < range->start || update->start >= range->end)
-                       continue;
+       spin_lock(&mm->page_table_lock);
+       if (mm->hmm == hmm)
+               mm->hmm = NULL;
+       spin_unlock(&mm->page_table_lock);
 
-               range->valid = false;
-               addr = max(update->start, range->start);
-               idx = (addr - range->start) >> PAGE_SHIFT;
-               npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
-               memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
-       }
-       spin_unlock(&hmm->lock);
+       kfree(hmm);
+}
 
-       if (!device)
-               return 0;
+static inline void hmm_put(struct hmm *hmm)
+{
+       kref_put(&hmm->kref, hmm_free);
+}
 
-       down_read(&hmm->mirrors_sem);
-       list_for_each_entry(mirror, &hmm->mirrors, list) {
-               int ret;
+void hmm_mm_destroy(struct mm_struct *mm)
+{
+       struct hmm *hmm;
 
-               ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
-               if (!update->blockable && ret == -EAGAIN) {
-                       up_read(&hmm->mirrors_sem);
-                       return -EAGAIN;
-               }
+       spin_lock(&mm->page_table_lock);
+       hmm = mm_get_hmm(mm);
+       mm->hmm = NULL;
+       if (hmm) {
+               hmm->mm = NULL;
+               hmm->dead = true;
+               spin_unlock(&mm->page_table_lock);
+               hmm_put(hmm);
+               return;
        }
-       up_read(&hmm->mirrors_sem);
 
-       return 0;
+       spin_unlock(&mm->page_table_lock);
 }
 
 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
+       struct hmm *hmm = mm_get_hmm(mm);
        struct hmm_mirror *mirror;
-       struct hmm *hmm = mm->hmm;
+       struct hmm_range *range;
+
+       /* Report this HMM as dying. */
+       hmm->dead = true;
+
+       /* Wake-up everyone waiting on any range. */
+       mutex_lock(&hmm->lock);
+       list_for_each_entry(range, &hmm->ranges, list) {
+               range->valid = false;
+       }
+       wake_up_all(&hmm->wq);
+       mutex_unlock(&hmm->lock);
 
        down_write(&hmm->mirrors_sem);
        mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
@@ -186,36 +187,86 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
                                                  struct hmm_mirror, list);
        }
        up_write(&hmm->mirrors_sem);
+
+       hmm_put(hmm);
 }
 
 static int hmm_invalidate_range_start(struct mmu_notifier *mn,
-                       const struct mmu_notifier_range *range)
+                       const struct mmu_notifier_range *nrange)
 {
+       struct hmm *hmm = mm_get_hmm(nrange->mm);
+       struct hmm_mirror *mirror;
        struct hmm_update update;
-       struct hmm *hmm = range->mm->hmm;
+       struct hmm_range *range;
+       int ret = 0;
 
        VM_BUG_ON(!hmm);
 
-       update.start = range->start;
-       update.end = range->end;
+       update.start = nrange->start;
+       update.end = nrange->end;
        update.event = HMM_UPDATE_INVALIDATE;
-       update.blockable = range->blockable;
-       return hmm_invalidate_range(hmm, true, &update);
+       update.blockable = mmu_notifier_range_blockable(nrange);
+
+       if (mmu_notifier_range_blockable(nrange))
+               mutex_lock(&hmm->lock);
+       else if (!mutex_trylock(&hmm->lock)) {
+               ret = -EAGAIN;
+               goto out;
+       }
+       hmm->notifiers++;
+       list_for_each_entry(range, &hmm->ranges, list) {
+               if (update.end < range->start || update.start >= range->end)
+                       continue;
+
+               range->valid = false;
+       }
+       mutex_unlock(&hmm->lock);
+
+       if (mmu_notifier_range_blockable(nrange))
+               down_read(&hmm->mirrors_sem);
+       else if (!down_read_trylock(&hmm->mirrors_sem)) {
+               ret = -EAGAIN;
+               goto out;
+       }
+       list_for_each_entry(mirror, &hmm->mirrors, list) {
+               int ret;
+
+               ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+               if (!update.blockable && ret == -EAGAIN) {
+                       up_read(&hmm->mirrors_sem);
+                       ret = -EAGAIN;
+                       goto out;
+               }
+       }
+       up_read(&hmm->mirrors_sem);
+
+out:
+       hmm_put(hmm);
+       return ret;
 }
 
 static void hmm_invalidate_range_end(struct mmu_notifier *mn,
-                       const struct mmu_notifier_range *range)
+                       const struct mmu_notifier_range *nrange)
 {
-       struct hmm_update update;
-       struct hmm *hmm = range->mm->hmm;
+       struct hmm *hmm = mm_get_hmm(nrange->mm);
 
        VM_BUG_ON(!hmm);
 
-       update.start = range->start;
-       update.end = range->end;
-       update.event = HMM_UPDATE_INVALIDATE;
-       update.blockable = true;
-       hmm_invalidate_range(hmm, false, &update);
+       mutex_lock(&hmm->lock);
+       hmm->notifiers--;
+       if (!hmm->notifiers) {
+               struct hmm_range *range;
+
+               list_for_each_entry(range, &hmm->ranges, list) {
+                       if (range->valid)
+                               continue;
+                       range->valid = true;
+               }
+               wake_up_all(&hmm->wq);
+       }
+       mutex_unlock(&hmm->lock);
+
+       hmm_put(hmm);
 }
 
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
@@ -241,24 +292,13 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
        if (!mm || !mirror || !mirror->ops)
                return -EINVAL;
 
-again:
-       mirror->hmm = hmm_register(mm);
+       mirror->hmm = hmm_get_or_create(mm);
        if (!mirror->hmm)
                return -ENOMEM;
 
        down_write(&mirror->hmm->mirrors_sem);
-       if (mirror->hmm->mm == NULL) {
-               /*
-                * A racing hmm_mirror_unregister() is about to destroy the hmm
-                * struct. Try again to allocate a new one.
-                */
-               up_write(&mirror->hmm->mirrors_sem);
-               mirror->hmm = NULL;
-               goto again;
-       } else {
-               list_add(&mirror->list, &mirror->hmm->mirrors);
-               up_write(&mirror->hmm->mirrors_sem);
-       }
+       list_add(&mirror->list, &mirror->hmm->mirrors);
+       up_write(&mirror->hmm->mirrors_sem);
 
        return 0;
 }
@@ -273,38 +313,24 @@ EXPORT_SYMBOL(hmm_mirror_register);
  */
 void hmm_mirror_unregister(struct hmm_mirror *mirror)
 {
-       bool should_unregister = false;
-       struct mm_struct *mm;
-       struct hmm *hmm;
+       struct hmm *hmm = READ_ONCE(mirror->hmm);
 
-       if (mirror->hmm == NULL)
+       if (hmm == NULL)
                return;
 
-       hmm = mirror->hmm;
        down_write(&hmm->mirrors_sem);
        list_del_init(&mirror->list);
-       should_unregister = list_empty(&hmm->mirrors);
+       /* To protect us against double unregister ... */
        mirror->hmm = NULL;
-       mm = hmm->mm;
-       hmm->mm = NULL;
        up_write(&hmm->mirrors_sem);
 
-       if (!should_unregister || mm == NULL)
-               return;
-
-       mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
-
-       spin_lock(&mm->page_table_lock);
-       if (mm->hmm == hmm)
-               mm->hmm = NULL;
-       spin_unlock(&mm->page_table_lock);
-
-       kfree(hmm);
+       hmm_put(hmm);
 }
 EXPORT_SYMBOL(hmm_mirror_unregister);
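
Taken together, a driver mirrors an address space by embedding a struct hmm_mirror and supplying a sync_cpu_device_pagetables() callback, then pairing hmm_mirror_register() with hmm_mirror_unregister(). A minimal sketch under those assumptions; my_invalidate() is a hypothetical device helper and the callback signature is the one implied by the calls above:

        /* Sketch only: hypothetical driver-side mirror. */
        static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
                                                 const struct hmm_update *update)
        {
                /* Invalidate the device page table for [update->start, update->end). */
                my_invalidate(mirror, update->start, update->end);  /* hypothetical helper */
                return 0;
        }

        static const struct hmm_mirror_ops my_mirror_ops = {
                .sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
        };

        static int my_bind_mm(struct hmm_mirror *mirror, struct mm_struct *mm)
        {
                mirror->ops = &my_mirror_ops;
                return hmm_mirror_register(mirror, mm);
        }

        /* On teardown, the matching hmm_mirror_unregister(mirror). */
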
 
 struct hmm_vma_walk {
        struct hmm_range        *range;
+       struct dev_pagemap      *pgmap;
        unsigned long           last;
        bool                    fault;
        bool                    block;
@@ -323,13 +349,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
        flags |= write_fault ? FAULT_FLAG_WRITE : 0;
        ret = handle_mm_fault(vma, addr, flags);
        if (ret & VM_FAULT_RETRY)
-               return -EBUSY;
+               return -EAGAIN;
        if (ret & VM_FAULT_ERROR) {
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        }
 
-       return -EAGAIN;
+       return -EBUSY;
 }
 
 static int hmm_pfns_bad(unsigned long addr,
@@ -355,7 +381,7 @@ static int hmm_pfns_bad(unsigned long addr,
  * @fault: should we fault or not ?
  * @write_fault: write fault ?
  * @walk: mm_walk structure
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ * Returns: 0 on success, -EBUSY after page fault, or page fault error
  *
  * This function will be called whenever pmd_none() or pte_none() returns true,
  * or whenever there is no page directory covering the virtual address range.
@@ -367,23 +393,25 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
-       unsigned long i;
+       unsigned long i, page_size;
 
        hmm_vma_walk->last = addr;
-       i = (addr - range->start) >> PAGE_SHIFT;
-       for (; addr < end; addr += PAGE_SIZE, i++) {
+       page_size = hmm_range_page_size(range);
+       i = (addr - range->start) >> range->page_shift;
+
+       for (; addr < end; addr += page_size, i++) {
                pfns[i] = range->values[HMM_PFN_NONE];
                if (fault || write_fault) {
                        int ret;
 
                        ret = hmm_vma_do_fault(walk, addr, write_fault,
                                               &pfns[i]);
-                       if (ret != -EAGAIN)
+                       if (ret != -EBUSY)
                                return ret;
                }
        }
 
-       return (fault || write_fault) ? -EAGAIN : 0;
+       return (fault || write_fault) ? -EBUSY : 0;
 }
 
 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -392,10 +420,21 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
 {
        struct hmm_range *range = hmm_vma_walk->range;
 
-       *fault = *write_fault = false;
        if (!hmm_vma_walk->fault)
                return;
 
+       /*
+        * So we not only consider the individual per-page request, we also
+        * consider the default flags requested for the range. The API can be
+        * used in two ways. In the first, the HMM user coalesces multiple
+        * page faults into one request and sets flags per pfn for each of
+        * those faults. In the second, the HMM user wants to pre-fault a
+        * range with specific flags. For the latter it is a waste to have
+        * the user pre-fill the pfn arrays with a default flags value.
+        */
+       pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+
        /* We aren't asked to do anything ... */
        if (!(pfns & range->flags[HMM_PFN_VALID]))
                return;
@@ -431,10 +470,11 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                return;
        }
 
+       *fault = *write_fault = false;
        for (i = 0; i < npages; ++i) {
                hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
                                   fault, write_fault);
-               if ((*fault) || (*write_fault))
+               if ((*write_fault))
                        return;
        }
 }
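
The comment in hmm_pte_need_fault() above describes two ways of requesting faults. A short sketch of both, assuming the driver's usual flag encoding is installed in range->flags[]:

        /* Second style: pre-fault the whole range read/write, ignore per-pfn values. */
        range->default_flags = range->flags[HMM_PFN_VALID] |
                               range->flags[HMM_PFN_WRITE];
        range->pfn_flags_mask = 0;

        /* First style: honour whatever was written per pfn into range->pfns[]. */
        range->default_flags = 0;
        range->pfn_flags_mask = ~0ULL;
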
@@ -465,12 +505,22 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
                                range->flags[HMM_PFN_VALID];
 }
 
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+       if (!pud_present(pud))
+               return 0;
+       return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+                               range->flags[HMM_PFN_WRITE] :
+                               range->flags[HMM_PFN_VALID];
+}
+
 static int hmm_vma_handle_pmd(struct mm_walk *walk,
                              unsigned long addr,
                              unsigned long end,
                              uint64_t *pfns,
                              pmd_t pmd)
 {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long pfn, npages, i;
@@ -486,10 +536,25 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
                return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
        pfn = pmd_pfn(pmd) + pte_index(addr);
-       for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
-               pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+       for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+               if (pmd_devmap(pmd)) {
+                       hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+                                             hmm_vma_walk->pgmap);
+                       if (unlikely(!hmm_vma_walk->pgmap))
+                               return -EBUSY;
+               }
+               pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
+       }
+       if (hmm_vma_walk->pgmap) {
+               put_dev_pagemap(hmm_vma_walk->pgmap);
+               hmm_vma_walk->pgmap = NULL;
+       }
        hmm_vma_walk->last = end;
        return 0;
+#else
+       /* If THP is not enabled then we should never reach this code! */
+       return -EINVAL;
+#endif
 }
 
 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
@@ -514,11 +579,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
        uint64_t orig_pfn = *pfn;
 
        *pfn = range->values[HMM_PFN_NONE];
-       cpu_flags = pte_to_hmm_pfn_flags(range, pte);
-       hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
-                          &fault, &write_fault);
+       fault = write_fault = false;
 
        if (pte_none(pte)) {
+               hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
+                                  &fault, &write_fault);
                if (fault || write_fault)
                        goto fault;
                return 0;
@@ -546,7 +611,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                                           &fault, &write_fault);
                        if (fault || write_fault)
                                goto fault;
-                       *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
+                       *pfn = hmm_device_entry_from_pfn(range,
+                                           swp_offset(entry));
                        *pfn |= cpu_flags;
                        return 0;
                }
@@ -557,7 +623,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                                hmm_vma_walk->last = addr;
                                migration_entry_wait(vma->vm_mm,
                                                     pmdp, addr);
-                               return -EAGAIN;
+                               return -EBUSY;
                        }
                        return 0;
                }
@@ -565,15 +631,33 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                /* Report error for everything else */
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
+       } else {
+               cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+               hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+                                  &fault, &write_fault);
        }
 
        if (fault || write_fault)
                goto fault;
 
-       *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+       if (pte_devmap(pte)) {
+               hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+                                             hmm_vma_walk->pgmap);
+               if (unlikely(!hmm_vma_walk->pgmap))
+                       return -EBUSY;
+       } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+               *pfn = range->values[HMM_PFN_SPECIAL];
+               return -EFAULT;
+       }
+
+       *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
        return 0;
 
 fault:
+       if (hmm_vma_walk->pgmap) {
+               put_dev_pagemap(hmm_vma_walk->pgmap);
+               hmm_vma_walk->pgmap = NULL;
+       }
        pte_unmap(ptep);
        /* Fault any virtual address we were asked to fault */
        return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -615,7 +699,7 @@ again:
                if (fault || write_fault) {
                        hmm_vma_walk->last = addr;
                        pmd_migration_entry_wait(vma->vm_mm, pmdp);
-                       return -EAGAIN;
+                       return -EBUSY;
                }
                return 0;
        } else if (!pmd_present(pmd))
@@ -661,12 +745,158 @@ again:
                        return r;
                }
        }
+       if (hmm_vma_walk->pgmap) {
+               /*
+                * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
+                * so that we can leverage get_dev_pagemap() optimization which
+                * will not re-take a reference on a pgmap if we already have
+                * one.
+                */
+               put_dev_pagemap(hmm_vma_walk->pgmap);
+               hmm_vma_walk->pgmap = NULL;
+       }
        pte_unmap(ptep - 1);
 
        hmm_vma_walk->last = addr;
        return 0;
 }
 
+static int hmm_vma_walk_pud(pud_t *pudp,
+                           unsigned long start,
+                           unsigned long end,
+                           struct mm_walk *walk)
+{
+       struct hmm_vma_walk *hmm_vma_walk = walk->private;
+       struct hmm_range *range = hmm_vma_walk->range;
+       unsigned long addr = start, next;
+       pmd_t *pmdp;
+       pud_t pud;
+       int ret;
+
+again:
+       pud = READ_ONCE(*pudp);
+       if (pud_none(pud))
+               return hmm_vma_walk_hole(start, end, walk);
+
+       if (pud_huge(pud) && pud_devmap(pud)) {
+               unsigned long i, npages, pfn;
+               uint64_t *pfns, cpu_flags;
+               bool fault, write_fault;
+
+               if (!pud_present(pud))
+                       return hmm_vma_walk_hole(start, end, walk);
+
+               i = (addr - range->start) >> PAGE_SHIFT;
+               npages = (end - addr) >> PAGE_SHIFT;
+               pfns = &range->pfns[i];
+
+               cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+               hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+                                    cpu_flags, &fault, &write_fault);
+               if (fault || write_fault)
+                       return hmm_vma_walk_hole_(addr, end, fault,
+                                               write_fault, walk);
+
+#ifdef CONFIG_HUGETLB_PAGE
+               pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+               for (i = 0; i < npages; ++i, ++pfn) {
+                       hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+                                             hmm_vma_walk->pgmap);
+                       if (unlikely(!hmm_vma_walk->pgmap))
+                               return -EBUSY;
+                       pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+                                 cpu_flags;
+               }
+               if (hmm_vma_walk->pgmap) {
+                       put_dev_pagemap(hmm_vma_walk->pgmap);
+                       hmm_vma_walk->pgmap = NULL;
+               }
+               hmm_vma_walk->last = end;
+               return 0;
+#else
+               return -EINVAL;
+#endif
+       }
+
+       split_huge_pud(walk->vma, pudp, addr);
+       if (pud_none(*pudp))
+               goto again;
+
+       pmdp = pmd_offset(pudp, addr);
+       do {
+               next = pmd_addr_end(addr, end);
+               ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+               if (ret)
+                       return ret;
+       } while (pmdp++, addr = next, addr != end);
+
+       return 0;
+}
+
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+                                     unsigned long start, unsigned long end,
+                                     struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+       unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+       struct hmm_vma_walk *hmm_vma_walk = walk->private;
+       struct hmm_range *range = hmm_vma_walk->range;
+       struct vm_area_struct *vma = walk->vma;
+       struct hstate *h = hstate_vma(vma);
+       uint64_t orig_pfn, cpu_flags;
+       bool fault, write_fault;
+       spinlock_t *ptl;
+       pte_t entry;
+       int ret = 0;
+
+       size = 1UL << huge_page_shift(h);
+       mask = size - 1;
+       if (range->page_shift != PAGE_SHIFT) {
+               /* Make sure we are looking at a full page. */
+               if (start & mask)
+                       return -EINVAL;
+               if (end < (start + size))
+                       return -EINVAL;
+               pfn_inc = size >> PAGE_SHIFT;
+       } else {
+               pfn_inc = 1;
+               size = PAGE_SIZE;
+       }
+
+       ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+       entry = huge_ptep_get(pte);
+
+       i = (start - range->start) >> range->page_shift;
+       orig_pfn = range->pfns[i];
+       range->pfns[i] = range->values[HMM_PFN_NONE];
+       cpu_flags = pte_to_hmm_pfn_flags(range, entry);
+       fault = write_fault = false;
+       hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+                          &fault, &write_fault);
+       if (fault || write_fault) {
+               ret = -ENOENT;
+               goto unlock;
+       }
+
+       pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
+       for (; addr < end; addr += size, i++, pfn += pfn_inc)
+               range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+                                cpu_flags;
+       hmm_vma_walk->last = end;
+
+unlock:
+       spin_unlock(ptl);
+
+       if (ret == -ENOENT)
+               return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+       return ret;
+#else /* CONFIG_HUGETLB_PAGE */
+       return -EINVAL;
+#endif
+}
+
 static void hmm_pfns_clear(struct hmm_range *range,
                           uint64_t *pfns,
                           unsigned long addr,
@@ -676,279 +906,437 @@ static void hmm_pfns_clear(struct hmm_range *range,
                *pfns = range->values[HMM_PFN_NONE];
 }
 
-static void hmm_pfns_special(struct hmm_range *range)
-{
-       unsigned long addr = range->start, i = 0;
-
-       for (; addr < range->end; addr += PAGE_SIZE, i++)
-               range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
 /*
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
- * @range: range being snapshotted
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- *          vma permission, 0 success
- *
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
- * validity is tracked by range struct. See hmm_vma_range_done() for further
- * information.
- *
- * The range struct is initialized here. It tracks the CPU page table, but only
- * if the function returns success (0), in which case the caller must then call
- * hmm_vma_range_done() to stop CPU page table update tracking on this range.
+ * hmm_range_register() - start tracking changes to the CPU page table over a range
+ * @range: range
+ * @mm: the mm struct for the range of virtual addresses
+ * @start: start virtual address (inclusive)
+ * @end: end virtual address (exclusive)
+ * @page_shift: expected page shift for the range
+ * Returns 0 on success, -EFAULT if the address space is no longer valid
  *
- * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
- * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
+ * Track updates to the CPU page table; see include/linux/hmm.h.
  */
-int hmm_vma_get_pfns(struct hmm_range *range)
+int hmm_range_register(struct hmm_range *range,
+                      struct mm_struct *mm,
+                      unsigned long start,
+                      unsigned long end,
+                      unsigned page_shift)
 {
-       struct vm_area_struct *vma = range->vma;
-       struct hmm_vma_walk hmm_vma_walk;
-       struct mm_walk mm_walk;
-       struct hmm *hmm;
+       unsigned long mask = ((1UL << page_shift) - 1UL);
+
+       range->valid = false;
+       range->hmm = NULL;
 
-       /* Sanity check, this really should not happen ! */
-       if (range->start < vma->vm_start || range->start >= vma->vm_end)
+       if ((start & mask) || (end & mask))
                return -EINVAL;
-       if (range->end < vma->vm_start || range->end > vma->vm_end)
+       if (start >= end)
                return -EINVAL;
 
-       hmm = hmm_register(vma->vm_mm);
-       if (!hmm)
-               return -ENOMEM;
-       /* Caller must have registered a mirror, via hmm_mirror_register() ! */
-       if (!hmm->mmu_notifier.ops)
-               return -EINVAL;
+       range->page_shift = page_shift;
+       range->start = start;
+       range->end = end;
 
-       /* FIXME support hugetlb fs */
-       if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
-                       vma_is_dax(vma)) {
-               hmm_pfns_special(range);
-               return -EINVAL;
-       }
+       range->hmm = hmm_get_or_create(mm);
+       if (!range->hmm)
+               return -EFAULT;
 
-       if (!(vma->vm_flags & VM_READ)) {
-               /*
-                * If vma do not allow read access, then assume that it does
-                * not allow write access, either. Architecture that allow
-                * write without read access are not supported by HMM, because
-                * operations such has atomic access would not work.
-                */
-               hmm_pfns_clear(range, range->pfns, range->start, range->end);
-               return -EPERM;
+       /* Check if hmm_mm_destroy() was call. */
+       if (range->hmm->mm == NULL || range->hmm->dead) {
+               hmm_put(range->hmm);
+               return -EFAULT;
        }
 
        /* Initialize range to track CPU page table update */
-       spin_lock(&hmm->lock);
-       range->valid = true;
-       list_add_rcu(&range->list, &hmm->ranges);
-       spin_unlock(&hmm->lock);
-
-       hmm_vma_walk.fault = false;
-       hmm_vma_walk.range = range;
-       mm_walk.private = &hmm_vma_walk;
-
-       mm_walk.vma = vma;
-       mm_walk.mm = vma->vm_mm;
-       mm_walk.pte_entry = NULL;
-       mm_walk.test_walk = NULL;
-       mm_walk.hugetlb_entry = NULL;
-       mm_walk.pmd_entry = hmm_vma_walk_pmd;
-       mm_walk.pte_hole = hmm_vma_walk_hole;
-
-       walk_page_range(range->start, range->end, &mm_walk);
+       mutex_lock(&range->hmm->lock);
+
+       list_add_rcu(&range->list, &range->hmm->ranges);
+
+       /*
+        * If there are any concurrent notifiers we have to wait for them
+        * before the range can be considered valid (see
+        * hmm_range_wait_until_valid()).
+        */
+       if (!range->hmm->notifiers)
+               range->valid = true;
+       mutex_unlock(&range->hmm->lock);
+
        return 0;
 }
-EXPORT_SYMBOL(hmm_vma_get_pfns);
+EXPORT_SYMBOL(hmm_range_register);
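
A hedged sketch of how a driver might set up and register a range with the new API; pfns, my_flags and my_values stand in for caller-provided arrays, everything else matches the fields used above:

        struct hmm_range range = {
                .pfns   = pfns,         /* caller-allocated array of uint64_t */
                .flags  = my_flags,     /* driver's pfn flag encoding (assumed) */
                .values = my_values,    /* driver's special pfn values (assumed) */
        };
        int ret;

        ret = hmm_range_register(&range, mm, start, end, PAGE_SHIFT);
        if (ret)
                return ret;
        /* ... hmm_range_snapshot() or hmm_range_fault() on the range ... */
        hmm_range_unregister(&range);
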
 
 /*
- * hmm_vma_range_done() - stop tracking change to CPU page table over a range
- * @range: range being tracked
- * Returns: false if range data has been invalidated, true otherwise
+ * hmm_range_unregister() - stop tracking changes to the CPU page table over a range
+ * @range: range
  *
  * Range struct is used to track updates to the CPU page table after a call to
- * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
- * using the data,  or wants to lock updates to the data it got from those
- * functions, it must call the hmm_vma_range_done() function, which will then
- * stop tracking CPU page table updates.
- *
- * Note that device driver must still implement general CPU page table update
- * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
- * the mmu_notifier API directly.
- *
- * CPU page table update tracking done through hmm_range is only temporary and
- * to be used while trying to duplicate CPU page table contents for a range of
- * virtual addresses.
- *
- * There are two ways to use this :
- * again:
- *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- *   trans = device_build_page_table_update_transaction(pfns);
- *   device_page_table_lock();
- *   if (!hmm_vma_range_done(range)) {
- *     device_page_table_unlock();
- *     goto again;
- *   }
- *   device_commit_transaction(trans);
- *   device_page_table_unlock();
+ * hmm_range_register(). See include/linux/hmm.h for how to use it.
+ */
+void hmm_range_unregister(struct hmm_range *range)
+{
+       /* Sanity check: this really should not happen. */
+       if (range->hmm == NULL || range->end <= range->start)
+               return;
+
+       mutex_lock(&range->hmm->lock);
+       list_del_rcu(&range->list);
+       mutex_unlock(&range->hmm->lock);
+
+       /* Drop reference taken by hmm_range_register() */
+       range->valid = false;
+       hmm_put(range->hmm);
+       range->hmm = NULL;
+}
+EXPORT_SYMBOL(hmm_range_unregister);
+
+/*
+ * hmm_range_snapshot() - snapshot CPU page table for a range
+ * @range: range
+ * Returns: -EINVAL if invalid argument, -ENOMEM if out of memory, -EPERM if
+ *          invalid permission (for instance asking for write and range is
+ *          read only), -EAGAIN if you need to retry, -EFAULT if invalid
+ *          (i.e. either no valid vma or it is illegal to access that range);
+ *          otherwise the number of valid pages in range->pfns[] (from the
+ *          range start address).
  *
- * Or:
- *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- *   device_page_table_lock();
- *   hmm_vma_range_done(range);
- *   device_update_page_table(range->pfns);
- *   device_page_table_unlock();
+ * This snapshots the CPU page table for a range of virtual addresses. Snapshot
+ * validity is tracked by the range struct. See include/linux/hmm.h for an
+ * example of how to use it.
  */
-bool hmm_vma_range_done(struct hmm_range *range)
+long hmm_range_snapshot(struct hmm_range *range)
 {
-       unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
-       struct hmm *hmm;
+       const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+       unsigned long start = range->start, end;
+       struct hmm_vma_walk hmm_vma_walk;
+       struct hmm *hmm = range->hmm;
+       struct vm_area_struct *vma;
+       struct mm_walk mm_walk;
 
-       if (range->end <= range->start) {
-               BUG();
-               return false;
-       }
+       /* Check if hmm_mm_destroy() was call. */
+       if (hmm->mm == NULL || hmm->dead)
+               return -EFAULT;
 
-       hmm = hmm_register(range->vma->vm_mm);
-       if (!hmm) {
-               memset(range->pfns, 0, sizeof(*range->pfns) * npages);
-               return false;
-       }
+       do {
+               /* If range is no longer valid force retry. */
+               if (!range->valid)
+                       return -EAGAIN;
 
-       spin_lock(&hmm->lock);
-       list_del_rcu(&range->list);
-       spin_unlock(&hmm->lock);
+               vma = find_vma(hmm->mm, start);
+               if (vma == NULL || (vma->vm_flags & device_vma))
+                       return -EFAULT;
+
+               if (is_vm_hugetlb_page(vma)) {
+                       struct hstate *h = hstate_vma(vma);
 
-       return range->valid;
+                       if (huge_page_shift(h) != range->page_shift &&
+                           range->page_shift != PAGE_SHIFT)
+                               return -EINVAL;
+               } else {
+                       if (range->page_shift != PAGE_SHIFT)
+                               return -EINVAL;
+               }
+
+               if (!(vma->vm_flags & VM_READ)) {
+                       /*
+                        * If the vma does not allow read access, then assume
+                        * that it does not allow write access either. HMM does
+                        * not support architectures that allow write without
+                        * read.
+                        */
+                       hmm_pfns_clear(range, range->pfns,
+                               range->start, range->end);
+                       return -EPERM;
+               }
+
+               range->vma = vma;
+               hmm_vma_walk.pgmap = NULL;
+               hmm_vma_walk.last = start;
+               hmm_vma_walk.fault = false;
+               hmm_vma_walk.range = range;
+               mm_walk.private = &hmm_vma_walk;
+               end = min(range->end, vma->vm_end);
+
+               mm_walk.vma = vma;
+               mm_walk.mm = vma->vm_mm;
+               mm_walk.pte_entry = NULL;
+               mm_walk.test_walk = NULL;
+               mm_walk.hugetlb_entry = NULL;
+               mm_walk.pud_entry = hmm_vma_walk_pud;
+               mm_walk.pmd_entry = hmm_vma_walk_pmd;
+               mm_walk.pte_hole = hmm_vma_walk_hole;
+               mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+               walk_page_range(start, end, &mm_walk);
+               start = end;
+       } while (start < range->end);
+
+       return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
 }
-EXPORT_SYMBOL(hmm_vma_range_done);
+EXPORT_SYMBOL(hmm_range_snapshot);
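
A possible retry loop around hmm_range_snapshot(), following the -EAGAIN semantics above; hmm_range_wait_until_valid() is the helper referenced earlier, and its timeout argument (milliseconds here) is an assumption:

        long ret;

again:
        /* Wait out any concurrent invalidation before snapshotting. */
        if (!hmm_range_wait_until_valid(&range, 1000))
                return -EBUSY;          /* caller unregisters the range */

        down_read(&mm->mmap_sem);
        ret = hmm_range_snapshot(&range);
        up_read(&mm->mmap_sem);
        if (ret == -EAGAIN)
                goto again;
        if (ret < 0)
                return ret;

        /*
         * range.pfns[] now holds the snapshot; take the driver lock that
         * serializes device page table updates and re-check range.valid
         * before committing it to the device.
         */
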
 
 /*
- * hmm_vma_fault() - try to fault some address in a virtual address range
+ * hmm_range_fault() - try to fault some address in a virtual address range
  * @range: range being faulted
  * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
+ * Returns: number of valid pages in range->pfns[] (from range start
+ *          address). This may be zero. If the return value is negative,
+ *          then one of the following values may be returned:
+ *
+ *           -EINVAL: Invalid arguments, or mm or virtual address is in an
+ *                    invalid vma (for instance a device file vma).
+ *           -ENOMEM: Out of memory.
+ *           -EPERM:  Invalid permission (for instance asking for write and
+ *                    range is read only).
+ *           -EAGAIN: If you need to retry and mmap_sem was dropped. This can
+ *                    only happen if the block argument is false.
+ *           -EBUSY:  If the range is being invalidated and you should wait
+ *                    for invalidation to finish.
+ *           -EFAULT: Invalid (i.e. either no valid vma or it is illegal to
+ *                    access that range).
  *
  * This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs.
+ * any memory migration if the memory being faulted is not accessible by CPUs
+ * and the caller does not ask for migration.
  *
  * On error, for one virtual address in the range, the function will mark the
  * corresponding HMM pfn entry with an error flag.
- *
- * Expected use pattern:
- * retry:
- *   down_read(&mm->mmap_sem);
- *   // Find vma and address device wants to fault, initialize hmm_pfn_t
- *   // array accordingly
- *   ret = hmm_vma_fault(range, write, block);
- *   switch (ret) {
- *   case -EAGAIN:
- *     hmm_vma_range_done(range);
- *     // You might want to rate limit or yield to play nicely, you may
- *     // also commit any valid pfn in the array assuming that you are
- *     // getting true from hmm_vma_range_monitor_end()
- *     goto retry;
- *   case 0:
- *     break;
- *   case -ENOMEM:
- *   case -EINVAL:
- *   case -EPERM:
- *   default:
- *     // Handle error !
- *     up_read(&mm->mmap_sem)
- *     return;
- *   }
- *   // Take device driver lock that serialize device page table update
- *   driver_lock_device_page_table_update();
- *   hmm_vma_range_done(range);
- *   // Commit pfns we got from hmm_vma_fault()
- *   driver_unlock_device_page_table_update();
- *   up_read(&mm->mmap_sem)
- *
- * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
- * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
- *
- * YOU HAVE BEEN WARNED !
  */
-int hmm_vma_fault(struct hmm_range *range, bool block)
+long hmm_range_fault(struct hmm_range *range, bool block)
 {
-       struct vm_area_struct *vma = range->vma;
-       unsigned long start = range->start;
+       const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+       unsigned long start = range->start, end;
        struct hmm_vma_walk hmm_vma_walk;
+       struct hmm *hmm = range->hmm;
+       struct vm_area_struct *vma;
        struct mm_walk mm_walk;
-       struct hmm *hmm;
        int ret;
 
-       /* Sanity check, this really should not happen ! */
-       if (range->start < vma->vm_start || range->start >= vma->vm_end)
-               return -EINVAL;
-       if (range->end < vma->vm_start || range->end > vma->vm_end)
-               return -EINVAL;
+       /* Check if hmm_mm_destroy() was call. */
+       if (hmm->mm == NULL || hmm->dead)
+               return -EFAULT;
 
-       hmm = hmm_register(vma->vm_mm);
-       if (!hmm) {
-               hmm_pfns_clear(range, range->pfns, range->start, range->end);
-               return -ENOMEM;
-       }
-       /* Caller must have registered a mirror using hmm_mirror_register() */
-       if (!hmm->mmu_notifier.ops)
-               return -EINVAL;
+       do {
+               /* If range is no longer valid force retry. */
+               if (!range->valid) {
+                       up_read(&hmm->mm->mmap_sem);
+                       return -EAGAIN;
+               }
 
-       /* FIXME support hugetlb fs */
-       if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
-                       vma_is_dax(vma)) {
-               hmm_pfns_special(range);
-               return -EINVAL;
-       }
+               vma = find_vma(hmm->mm, start);
+               if (vma == NULL || (vma->vm_flags & device_vma))
+                       return -EFAULT;
+
+               if (is_vm_hugetlb_page(vma)) {
+                       if (huge_page_shift(hstate_vma(vma)) !=
+                           range->page_shift &&
+                           range->page_shift != PAGE_SHIFT)
+                               return -EINVAL;
+               } else {
+                       if (range->page_shift != PAGE_SHIFT)
+                               return -EINVAL;
+               }
+
+               if (!(vma->vm_flags & VM_READ)) {
+                       /*
+                        * If the vma does not allow read access, then assume
+                        * that it does not allow write access either. HMM does
+                        * not support architectures that allow write without
+                        * read.
+                        */
+                       hmm_pfns_clear(range, range->pfns,
+                               range->start, range->end);
+                       return -EPERM;
+               }
+
+               range->vma = vma;
+               hmm_vma_walk.pgmap = NULL;
+               hmm_vma_walk.last = start;
+               hmm_vma_walk.fault = true;
+               hmm_vma_walk.block = block;
+               hmm_vma_walk.range = range;
+               mm_walk.private = &hmm_vma_walk;
+               end = min(range->end, vma->vm_end);
+
+               mm_walk.vma = vma;
+               mm_walk.mm = vma->vm_mm;
+               mm_walk.pte_entry = NULL;
+               mm_walk.test_walk = NULL;
+               mm_walk.hugetlb_entry = NULL;
+               mm_walk.pud_entry = hmm_vma_walk_pud;
+               mm_walk.pmd_entry = hmm_vma_walk_pmd;
+               mm_walk.pte_hole = hmm_vma_walk_hole;
+               mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+               do {
+                       ret = walk_page_range(start, end, &mm_walk);
+                       start = hmm_vma_walk.last;
+
+                       /* Keep trying while the range is valid. */
+               } while (ret == -EBUSY && range->valid);
+
+               if (ret) {
+                       unsigned long i;
+
+                       i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+                       hmm_pfns_clear(range, &range->pfns[i],
+                               hmm_vma_walk.last, range->end);
+                       return ret;
+               }
+               start = end;
+
+       } while (start < range->end);
+
+       return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(hmm_range_fault);
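
Correspondingly, a hedged sketch of driving hmm_range_fault(); note that on -EAGAIN the function has already dropped mmap_sem for the caller:

        long ret;

        down_read(&mm->mmap_sem);
        ret = hmm_range_fault(&range, true /* block */);
        if (ret < 0) {
                if (ret != -EAGAIN)     /* on -EAGAIN mmap_sem was already dropped */
                        up_read(&mm->mmap_sem);
                return ret;
        }
        /* ret pages starting at range.start are now valid in range.pfns[]. */
        up_read(&mm->mmap_sem);
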
+
+/**
+ * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device against to dma map page to
+ * @daddrs: dma address of mapped pages
+ * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem has been
+ *          dropped and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+                      struct device *device,
+                      dma_addr_t *daddrs,
+                      bool block)
+{
+       unsigned long i, npages, mapped;
+       long ret;
+
+       ret = hmm_range_fault(range, block);
+       if (ret <= 0)
+               return ret ? ret : -EBUSY;
+
+       npages = (range->end - range->start) >> PAGE_SHIFT;
+       for (i = 0, mapped = 0; i < npages; ++i) {
+               enum dma_data_direction dir = DMA_TO_DEVICE;
+               struct page *page;
 
-       if (!(vma->vm_flags & VM_READ)) {
                /*
-                * If vma do not allow read access, then assume that it does
-                * not allow write access, either. Architecture that allow
-                * write without read access are not supported by HMM, because
-                * operations such has atomic access would not work.
+                * FIXME: the DMA API needs to provide an invalid DMA address
+                * value instead of a function to test the dma address. This
+                * would remove a lot of duplicated code across many arches.
+                *
+                * For now setting it to 0 here is good enough as the pfns[]
+                * value is what is used to check what is valid and what isn't.
                 */
-               hmm_pfns_clear(range, range->pfns, range->start, range->end);
-               return -EPERM;
+               daddrs[i] = 0;
+
+               page = hmm_device_entry_to_page(range, range->pfns[i]);
+               if (page == NULL)
+                       continue;
+
+               /* Check if range is being invalidated */
+               if (!range->valid) {
+                       ret = -EBUSY;
+                       goto unmap;
+               }
+
+               /* If it is read and write then map bi-directional. */
+               if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+                       dir = DMA_BIDIRECTIONAL;
+
+               daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+               if (dma_mapping_error(device, daddrs[i])) {
+                       ret = -EFAULT;
+                       goto unmap;
+               }
+
+               mapped++;
        }
 
-       /* Initialize range to track CPU page table update */
-       spin_lock(&hmm->lock);
-       range->valid = true;
-       list_add_rcu(&range->list, &hmm->ranges);
-       spin_unlock(&hmm->lock);
-
-       hmm_vma_walk.fault = true;
-       hmm_vma_walk.block = block;
-       hmm_vma_walk.range = range;
-       mm_walk.private = &hmm_vma_walk;
-       hmm_vma_walk.last = range->start;
-
-       mm_walk.vma = vma;
-       mm_walk.mm = vma->vm_mm;
-       mm_walk.pte_entry = NULL;
-       mm_walk.test_walk = NULL;
-       mm_walk.hugetlb_entry = NULL;
-       mm_walk.pmd_entry = hmm_vma_walk_pmd;
-       mm_walk.pte_hole = hmm_vma_walk_hole;
+       return mapped;
 
-       do {
-               ret = walk_page_range(start, range->end, &mm_walk);
-               start = hmm_vma_walk.last;
-       } while (ret == -EAGAIN);
+unmap:
+       for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+               enum dma_data_direction dir = DMA_TO_DEVICE;
+               struct page *page;
 
-       if (ret) {
-               unsigned long i;
+               page = hmm_device_entry_to_page(range, range->pfns[i]);
+               if (page == NULL)
+                       continue;
+
+               if (dma_mapping_error(device, daddrs[i]))
+                       continue;
 
-               i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
-               hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
-                              range->end);
-               hmm_vma_range_done(range);
+               /* If it is read and write then map bi-directional. */
+               if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+                       dir = DMA_BIDIRECTIONAL;
+
+               dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+               mapped--;
        }
+
        return ret;
 }
-EXPORT_SYMBOL(hmm_vma_fault);
+EXPORT_SYMBOL(hmm_range_dma_map);
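
A minimal sketch of the combined fault-and-map path; dev and npages are assumed to be the caller's, and as with hmm_range_fault(), -EAGAIN means mmap_sem was already dropped:

        dma_addr_t *daddrs;
        long mapped;

        daddrs = kcalloc(npages, sizeof(*daddrs), GFP_KERNEL);
        if (!daddrs)
                return -ENOMEM;

        down_read(&mm->mmap_sem);
        mapped = hmm_range_dma_map(&range, dev, daddrs, true /* block */);
        if (mapped != -EAGAIN)
                up_read(&mm->mmap_sem);
        if (mapped < 0) {
                kfree(daddrs);
                return mapped;
        }
        /* Each valid range.pfns[i] now has a matching DMA address in daddrs[i]. */
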
+
+/**
+ * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
+ * @range: range being unmapped
+ * @vma: the vma against which the range was mapped (optional)
+ * @device: device against which the dma map was done
+ * @daddrs: dma addresses of the mapped pages
+ * @dirty: dirty pages if they had the write flag set
+ * Returns: number of pages unmapped on success, -EINVAL otherwise
+ *
+ * Note that the caller MUST abide by the mmu notifier or use an HMM mirror
+ * and abide by the sync_cpu_device_pagetables() callback so that it is safe
+ * here to call set_page_dirty(). The caller must also take appropriate locks
+ * to prevent concurrent mmu notifiers or sync_cpu_device_pagetables() from
+ * making progress.
+ */
+long hmm_range_dma_unmap(struct hmm_range *range,
+                        struct vm_area_struct *vma,
+                        struct device *device,
+                        dma_addr_t *daddrs,
+                        bool dirty)
+{
+       unsigned long i, npages;
+       long cpages = 0;
+
+       /* Sanity check. */
+       if (range->end <= range->start)
+               return -EINVAL;
+       if (!daddrs)
+               return -EINVAL;
+       if (!range->pfns)
+               return -EINVAL;
+
+       npages = (range->end - range->start) >> PAGE_SHIFT;
+       for (i = 0; i < npages; ++i) {
+               enum dma_data_direction dir = DMA_TO_DEVICE;
+               struct page *page;
+
+               page = hmm_device_entry_to_page(range, range->pfns[i]);
+               if (page == NULL)
+                       continue;
+
+               /* If it is read and write then map bi-directional. */
+               if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
+                       dir = DMA_BIDIRECTIONAL;
+
+                       /*
+                        * See comments in function description on why it is
+                        * safe here to call set_page_dirty()
+                        */
+                       if (dirty)
+                               set_page_dirty(page);
+               }
+
+               /* Unmap and clear pfns/dma address */
+               dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+               range->pfns[i] = range->values[HMM_PFN_NONE];
+               /* FIXME see comments in hmm_range_dma_map() */
+               daddrs[i] = 0;
+               cpages++;
+       }
+
+       return cpages;
+}
+EXPORT_SYMBOL(hmm_range_dma_unmap);
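
And the matching teardown, done under whatever driver lock keeps sync_cpu_device_pagetables() from racing, as the comment above requires:

        long unmapped;

        unmapped = hmm_range_dma_unmap(&range, NULL, dev, daddrs,
                                       true /* set_page_dirty() on written pages */);
        kfree(daddrs);
        hmm_range_unregister(&range);
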
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
index b6a34b3..9f8bce9 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -509,7 +509,7 @@ void prep_transhuge_page(struct page *page)
        set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
 }
 
-unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
                loff_t off, unsigned long flags, unsigned long size)
 {
        unsigned long addr;
@@ -793,11 +793,13 @@ out_unlock:
                pte_free(mm, pgtable);
 }
 
-vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-                       pmd_t *pmd, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 {
+       unsigned long addr = vmf->address & PMD_MASK;
+       struct vm_area_struct *vma = vmf->vma;
        pgprot_t pgprot = vma->vm_page_prot;
        pgtable_t pgtable = NULL;
+
        /*
         * If we had pmd_special, we could avoid all these restrictions,
         * but we need to be consistent with PTEs and architectures that
@@ -820,7 +822,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 
        track_pfn_insert(vma, &pgprot, pfn);
 
-       insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
+       insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
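
With the new signature, a driver's huge_fault handler can pass its struct vm_fault straight through. A hedged sketch; my_lookup_pfn() is a hypothetical helper standing in for whatever pfn the driver's fault path computes:

        static vm_fault_t my_huge_fault(struct vm_fault *vmf,
                                        enum page_entry_size pe_size)
        {
                pfn_t pfn = my_lookup_pfn(vmf);         /* hypothetical helper */

                if (pe_size == PE_SIZE_PMD)
                        return vmf_insert_pfn_pmd(vmf, pfn,
                                                  vmf->flags & FAULT_FLAG_WRITE);
                return VM_FAULT_FALLBACK;
        }
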
@@ -869,10 +871,12 @@ out_unlock:
        spin_unlock(ptl);
 }
 
-vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
-                       pud_t *pud, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
 {
+       unsigned long addr = vmf->address & PUD_MASK;
+       struct vm_area_struct *vma = vmf->vma;
        pgprot_t pgprot = vma->vm_page_prot;
+
        /*
         * If we had pud_special, we could avoid all these restrictions,
         * but we need to be consistent with PTEs and architectures that
@@ -889,7 +893,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 
        track_pfn_insert(vma, &pgprot, pfn);
 
-       insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+       insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
        return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -1220,8 +1224,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
                cond_resched();
        }
 
-       mmu_notifier_range_init(&range, vma->vm_mm, haddr,
-                               haddr + HPAGE_PMD_SIZE);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               haddr, haddr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
 
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1384,8 +1388,8 @@ alloc:
                                    vma, HPAGE_PMD_NR);
        __SetPageUptodate(new_page);
 
-       mmu_notifier_range_init(&range, vma->vm_mm, haddr,
-                               haddr + HPAGE_PMD_SIZE);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               haddr, haddr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
 
        spin_lock(vmf->ptl);
@@ -2060,7 +2064,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
        spinlock_t *ptl;
        struct mmu_notifier_range range;
 
-       mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               address & HPAGE_PUD_MASK,
                                (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
        ptl = pud_lock(vma->vm_mm, pud);
@@ -2278,7 +2283,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        spinlock_t *ptl;
        struct mmu_notifier_range range;
 
-       mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               address & HPAGE_PMD_MASK,
                                (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
        ptl = pmd_lock(vma->vm_mm, pmd);
@@ -2492,6 +2498,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
                        if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
                                shmem_uncharge(head->mapping->host, 1);
                        put_page(head + i);
+               } else if (!PageAnon(page)) {
+                       __xa_store(&head->mapping->i_pages, head[i].index,
+                                       head + i, 0);
                }
        }
 
index 641cedf..81718c5 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -740,7 +740,15 @@ void resv_map_release(struct kref *ref)
 
 static inline struct resv_map *inode_resv_map(struct inode *inode)
 {
-       return inode->i_mapping->private_data;
+       /*
+        * At inode evict time, i_mapping may not point to the original
+        * address space within the inode.  This original address space
+        * contains the pointer to the resv_map.  So, always use the
+        * address space embedded within the inode.
+        * The VERY common case is inode->mapping == &inode->i_data, but
+        * this may not be true for device special inodes.
+        */
+       return (struct resv_map *)(&inode->i_data)->private_data;
 }
 
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
@@ -1059,6 +1067,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
        free_contig_range(page_to_pfn(page), 1 << order);
 }
 
+#ifdef CONFIG_CONTIG_ALLOC
 static int __alloc_gigantic_page(unsigned long start_pfn,
                                unsigned long nr_pages, gfp_t gfp_mask)
 {
@@ -1143,11 +1152,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
 static void prep_compound_gigantic_page(struct page *page, unsigned int order);
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+                                       int nid, nodemask_t *nodemask)
+{
+       return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
 
 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
-               int nid, nodemask_t *nodemask) { return NULL; }
+                                       int nid, nodemask_t *nodemask)
+{
+       return NULL;
+}
 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
@@ -1157,7 +1175,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
 
-       if (hstate_is_gigantic(h) && !gigantic_page_supported())
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
        h->nr_huge_pages--;
@@ -1258,12 +1276,23 @@ void free_huge_page(struct page *page)
        ClearPagePrivate(page);
 
        /*
-        * A return code of zero implies that the subpool will be under its
-        * minimum size if the reservation is not restored after page is free.
-        * Therefore, force restore_reserve operation.
+        * If PagePrivate() was set on page, page allocation consumed a
+        * reservation.  If the page was associated with a subpool, there
+        * would have been a page reserved in the subpool before allocation
+        * via hugepage_subpool_get_pages().  Since we are 'restoring' the
+        * reservtion, do not call hugepage_subpool_put_pages() as this will
+        * reservation, do not call hugepage_subpool_put_pages() as this will
         */
-       if (hugepage_subpool_put_pages(spool, 1) == 0)
-               restore_reserve = true;
+       if (!restore_reserve) {
+               /*
+                * A return code of zero implies that the subpool will be
+                * under its minimum size if the reservation is not restored
+                * after page is free.  Therefore, force restore_reserve
+                * operation.
+                */
+               if (hugepage_subpool_put_pages(spool, 1) == 0)
+                       restore_reserve = true;
+       }
 
        spin_lock(&hugetlb_lock);
        clear_page_huge_active(page);
@@ -1574,8 +1603,9 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
         */
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                SetPageHugeTemporary(page);
+               spin_unlock(&hugetlb_lock);
                put_page(page);
-               page = NULL;
+               return NULL;
        } else {
                h->surplus_huge_pages++;
                h->surplus_huge_pages_node[page_to_nid(page)]++;
@@ -2277,13 +2307,47 @@ found:
 }
 
 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
-                                               nodemask_t *nodes_allowed)
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+                             nodemask_t *nodes_allowed)
 {
        unsigned long min_count, ret;
 
-       if (hstate_is_gigantic(h) && !gigantic_page_supported())
-               return h->max_huge_pages;
+       spin_lock(&hugetlb_lock);
+
+       /*
+        * Check for a node specific request.
+        * Changing node specific huge page count may require a corresponding
+        * change to the global count.  In any case, the passed node mask
+        * (nodes_allowed) will restrict alloc/free to the specified node.
+        */
+       if (nid != NUMA_NO_NODE) {
+               unsigned long old_count = count;
+
+               count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+               /*
+                * User may have specified a large count value which caused the
+                * above calculation to overflow.  In this case, they wanted
+                * to allocate as many huge pages as possible.  Set count to
+                * largest possible value to align with their intention.
+                */
+               if (count < old_count)
+                       count = ULONG_MAX;
+       }
+
+       /*
+        * Gigantic pages runtime allocation depends on the capability for large
+        * page range allocation.
+        * If the system does not provide this feature, return an error when
+        * the user tries to allocate gigantic pages but let the user free the
+        * boottime allocated gigantic pages.
+        */
+       if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
+               if (count > persistent_huge_pages(h)) {
+                       spin_unlock(&hugetlb_lock);
+                       return -EINVAL;
+               }
+               /* Fall through to decrease pool */
+       }
 
        /*
         * Increase the pool size
@@ -2296,7 +2360,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
         * pool might be one hugepage larger than it needs to be, but
         * within all the constraints specified by the sysctls.
         */
-       spin_lock(&hugetlb_lock);
        while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
                if (!adjust_pool_surplus(h, nodes_allowed, -1))
                        break;
@@ -2351,9 +2414,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
                        break;
        }
 out:
-       ret = persistent_huge_pages(h);
+       h->max_huge_pages = persistent_huge_pages(h);
        spin_unlock(&hugetlb_lock);
-       return ret;
+
+       return 0;
 }
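
To make the overflow clamp above concrete, here is a minimal, self-contained sketch (plain userspace C with made-up pool sizes; nothing below is taken from the patch) of how converting a per-node request into a global target can wrap around, and why the code then falls back to ULONG_MAX:

/* Hypothetical, userspace-only illustration of the overflow clamp. */
#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long nr_huge_pages = 100;       /* made-up global pool size */
	unsigned long nr_huge_pages_node = 25;   /* made-up pages on this node */
	unsigned long count = ULONG_MAX - 10;    /* absurd per-node request */
	unsigned long old_count = count;

	count += nr_huge_pages - nr_huge_pages_node; /* unsigned wrap-around */
	if (count < old_count)
		count = ULONG_MAX;  /* treat as "as many pages as possible" */

	printf("effective global target: %lu\n", count);
	return 0;
}
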
 
 #define HSTATE_ATTR_RO(_name) \
@@ -2403,41 +2467,32 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
                                           unsigned long count, size_t len)
 {
        int err;
-       NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+       nodemask_t nodes_allowed, *n_mask;
 
-       if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
-               err = -EINVAL;
-               goto out;
-       }
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return -EINVAL;
 
        if (nid == NUMA_NO_NODE) {
                /*
                 * global hstate attribute
                 */
                if (!(obey_mempolicy &&
-                               init_nodemask_of_mempolicy(nodes_allowed))) {
-                       NODEMASK_FREE(nodes_allowed);
-                       nodes_allowed = &node_states[N_MEMORY];
-               }
-       } else if (nodes_allowed) {
+                               init_nodemask_of_mempolicy(&nodes_allowed)))
+                       n_mask = &node_states[N_MEMORY];
+               else
+                       n_mask = &nodes_allowed;
+       } else {
                /*
-                * per node hstate attribute: adjust count to global,
-                * but restrict alloc/free to the specified node.
+                * Node specific request.  count adjustment happens in
+                * set_max_huge_pages() after acquiring hugetlb_lock.
                 */
-               count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
-               init_nodemask_of_node(nodes_allowed, nid);
-       } else
-               nodes_allowed = &node_states[N_MEMORY];
-
-       h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+               init_nodemask_of_node(&nodes_allowed, nid);
+               n_mask = &nodes_allowed;
+       }
 
-       if (nodes_allowed != &node_states[N_MEMORY])
-               NODEMASK_FREE(nodes_allowed);
+       err = set_max_huge_pages(h, count, nid, n_mask);
 
-       return len;
-out:
-       NODEMASK_FREE(nodes_allowed);
-       return err;
+       return err ? err : len;
 }
 
 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
@@ -3247,7 +3302,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
        if (cow) {
-               mmu_notifier_range_init(&range, src, vma->vm_start,
+               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
+                                       vma->vm_start,
                                        vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
        }
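
For readers tracking the interface change: a minimal sketch, assuming only the extended mmu_notifier_range_init() arguments shown above (event, flags, vma, mm, start, end); the wrapper function below is hypothetical, not part of this series.

/* Hypothetical wrapper; only the mmu_notifier_range_init() call reflects
 * the new interface. */
#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void example_clear_range(struct vm_area_struct *vma,
				unsigned long start, unsigned long end)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma,
				vma->vm_mm, start, end);
	mmu_notifier_invalidate_range_start(&range);
	/* ... tear down PTEs in [start, end) here ... */
	mmu_notifier_invalidate_range_end(&range);
}
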
@@ -3359,7 +3415,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
        /*
         * If sharing possible, alert mmu notifiers of worst case.
         */
-       mmu_notifier_range_init(&range, mm, start, end);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
+                               end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
        mmu_notifier_invalidate_range_start(&range);
        address = start;
@@ -3626,7 +3683,8 @@ retry_avoidcopy:
                            pages_per_huge_page(h));
        __SetPageUptodate(new_page);
 
-       mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+                               haddr + huge_page_size(h));
        mmu_notifier_invalidate_range_start(&range);
 
        /*
@@ -3777,8 +3835,7 @@ retry:
                         * handling userfault.  Reacquire after handling
                         * fault to make calling code simpler.
                         */
-                       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
-                                                       idx, haddr);
+                       hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        ret = handle_userfault(&vmf, VM_UFFD_MISSING);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3886,21 +3943,14 @@ backout_unlocked:
 }
 
 #ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-                           struct vm_area_struct *vma,
-                           struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
                            pgoff_t idx, unsigned long address)
 {
        unsigned long key[2];
        u32 hash;
 
-       if (vma->vm_flags & VM_SHARED) {
-               key[0] = (unsigned long) mapping;
-               key[1] = idx;
-       } else {
-               key[0] = (unsigned long) mm;
-               key[1] = address >> huge_page_shift(h);
-       }
+       key[0] = (unsigned long) mapping;
+       key[1] = idx;
 
        hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
 
@@ -3911,9 +3961,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
  * For uniprocessor systems we always use a single mutex, so just
  * return 0 and avoid the hashing overhead.
  */
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-                           struct vm_area_struct *vma,
-                           struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
                            pgoff_t idx, unsigned long address)
 {
        return 0;
@@ -3958,7 +4006,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
-       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+       hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
        mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
        entry = huge_ptep_get(ptep);
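
A minimal sketch of the simplified locking pattern this enables, assuming the new (mapping, idx) keyed hash above; the helper below is hypothetical and only illustrates that callers no longer need the mm or vma.

/* Hypothetical helper; shows only the new two-key calling convention. */
#include <linux/hugetlb.h>
#include <linux/mutex.h>

static void example_serialize_fault(struct hstate *h,
				    struct address_space *mapping,
				    pgoff_t idx, unsigned long haddr)
{
	u32 hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);

	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	/* ... fault or hole-punch work serialized on this (mapping, idx) ... */
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
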
@@ -4371,7 +4419,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
         * start/end.  Set range.start/range.end to cover the maximum possible
         * range if PMD sharing is possible.
         */
-       mmu_notifier_range_init(&range, mm, start, end);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
+                               0, vma, mm, start, end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 
        BUG_ON(address >= end);
@@ -4477,6 +4526,11 @@ int hugetlb_reserve_pages(struct inode *inode,
         * called to make the mapping read-write. Assume !vma is a shm mapping
         */
        if (!vma || vma->vm_flags & VM_MAYSHARE) {
+               /*
+                * resv_map can not be NULL as hugetlb_reserve_pages is only
+                * called for inodes for which resv_maps were created (see
+                * hugetlbfs_get_inode).
+                */
                resv_map = inode_resv_map(inode);
 
                chg = region_chg(resv_map, from, to);
@@ -4568,6 +4622,10 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
        struct hugepage_subpool *spool = subpool_inode(inode);
        long gbl_reserve;
 
+       /*
+        * Since this routine can be called in the evict inode path for all
+        * hugetlbfs inodes, resv_map could be NULL.
+        */
        if (resv_map) {
                chg = region_del(resv_map, start, end);
                /*
index 4490443..a335f7c 100644 (file)
@@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm,
        pte = pte_offset_map(pmd, address);
        pte_ptl = pte_lockptr(mm, pmd);
 
-       mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+                               address, address + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
        pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
        /*
@@ -1374,7 +1375,7 @@ static void collapse_shmem(struct mm_struct *mm,
                                result = SCAN_FAIL;
                                goto xa_locked;
                        }
-                       xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+                       xas_store(&xas, new_page);
                        nr_none++;
                        continue;
                }
@@ -1450,7 +1451,7 @@ static void collapse_shmem(struct mm_struct *mm,
                list_add_tail(&page->lru, &pagelist);
 
                /* Finally, replace with the new page. */
-               xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+               xas_store(&xas, new_page);
                continue;
 out_unlock:
                unlock_page(page);
index fc64874..81c20ed 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1066,7 +1066,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 
        BUG_ON(PageTransCompound(page));
 
-       mmu_notifier_range_init(&range, mm, pvmw.address,
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+                               pvmw.address,
                                pvmw.address + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);
 
@@ -1154,7 +1155,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
        if (!pmd)
                goto out;
 
-       mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+                               addr + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);
 
        ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
index bb3a455..628022e 100644 (file)
@@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
        range.end = min(vma->vm_end, end_addr);
        if (range.end <= vma->vm_start)
                return -EINVAL;
-       mmu_notifier_range_init(&range, mm, range.start, range.end);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+                               range.start, range.end);
 
        lru_add_drain();
        tlb_gather_mmu(&tlb, mm, range.start, range.end);
index a48f520..6bbad46 100644 (file)
@@ -94,7 +94,7 @@
  * :c:func:`mem_init` function frees all the memory to the buddy page
  * allocator.
  *
- * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the
+ * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
  * memblock data structures will be discarded after the system
  * initialization completes.
  */
@@ -375,7 +375,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
        }
 }
 
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
 /**
  * memblock_discard - discard memory and reserved arrays if they were allocated
  */
@@ -1255,6 +1255,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
        return 0;
 }
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/**
+ * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
+ *
+ * @idx: pointer to u64 loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
+ * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * This function is meant to be a zone/pfn specific wrapper for the
+ * for_each_mem_range type iterators. Specifically they are used in the
+ * deferred memory init routines and as such we were duplicating much of
+ * this logic throughout the code. So instead of having it in multiple
+ * locations it seemed like it would make more sense to centralize this to
+ * one new iterator that does everything they need.
+ */
+void __init_memblock
+__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+                            unsigned long *out_spfn, unsigned long *out_epfn)
+{
+       int zone_nid = zone_to_nid(zone);
+       phys_addr_t spa, epa;
+       int nid;
+
+       __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+                        &memblock.memory, &memblock.reserved,
+                        &spa, &epa, &nid);
+
+       while (*idx != U64_MAX) {
+               unsigned long epfn = PFN_DOWN(epa);
+               unsigned long spfn = PFN_UP(spa);
+
+               /*
+                * Verify the end is at least past the start of the zone and
+                * that we have at least one PFN to initialize.
+                */
+               if (zone->zone_start_pfn < epfn && spfn < epfn) {
+                       /* if we went too far just stop searching */
+                       if (zone_end_pfn(zone) <= spfn) {
+                               *idx = U64_MAX;
+                               break;
+                       }
+
+                       if (out_spfn)
+                               *out_spfn = max(zone->zone_start_pfn, spfn);
+                       if (out_epfn)
+                               *out_epfn = min(zone_end_pfn(zone), epfn);
+
+                       return;
+               }
+
+               __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+                                &memblock.memory, &memblock.reserved,
+                                &spa, &epa, &nid);
+       }
+
+       /* signal end of iteration */
+       if (out_spfn)
+               *out_spfn = ULONG_MAX;
+       if (out_epfn)
+               *out_epfn = 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 /**
  * memblock_alloc_range_nid - allocate boot memory block
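
A minimal sketch, assuming CONFIG_DEFERRED_STRUCT_PAGE_INIT and the for_each_free_mem_pfn_range_in_zone() wrapper built on this iterator, of how a consumer walks the free PFN ranges of one zone; the counting function below is hypothetical.

/* Hypothetical consumer; counts the free PFNs that intersect one zone. */
#include <linux/memblock.h>
#include <linux/mmzone.h>

static unsigned long __init example_count_free_pfns(struct zone *zone)
{
	unsigned long spfn, epfn, nr = 0;
	u64 i;

	for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn)
		nr += epfn - spfn;  /* [spfn, epfn) is already clipped to the zone */

	return nr;
}
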
@@ -1923,7 +1987,7 @@ unsigned long __init memblock_free_all(void)
        return pages;
 }
 
-#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
 
 static int memblock_debug_show(struct seq_file *m, void *private)
 {
index 81a0d39..2879330 100644 (file)
@@ -725,34 +725,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
        __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
 }
 
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-                                          int nid, unsigned int lru_mask)
-{
-       struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
-       unsigned long nr = 0;
-       enum lru_list lru;
-
-       VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
-       for_each_lru(lru) {
-               if (!(BIT(lru) & lru_mask))
-                       continue;
-               nr += mem_cgroup_get_lru_size(lruvec, lru);
-       }
-       return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
-                       unsigned int lru_mask)
-{
-       unsigned long nr = 0;
-       int nid;
-
-       for_each_node_state(nid, N_MEMORY)
-               nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
-       return nr;
-}
-
 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
                                       enum mem_cgroup_events_target target)
 {
@@ -1358,7 +1330,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
 
                for (i = 0; i < NR_LRU_LISTS; i++)
                        pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
-                               K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+                               K(memcg_page_state(iter, NR_LRU_BASE + i)));
 
                pr_cont("\n");
        }
@@ -1422,11 +1394,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
                int nid, bool noswap)
 {
-       if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
+       struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+
+       if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
+           lruvec_page_state(lruvec, NR_ACTIVE_FILE))
                return true;
        if (noswap || !total_swap_pages)
                return false;
-       if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
+       if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
+           lruvec_page_state(lruvec, NR_ACTIVE_ANON))
                return true;
        return false;
 
@@ -2990,8 +2966,8 @@ static void accumulate_memcg_tree(struct mem_cgroup *memcg,
                                acc->events_array ? acc->events_array[i] : i);
 
                for (i = 0; i < NR_LRU_LISTS; i++)
-                       acc->lru_pages[i] +=
-                               mem_cgroup_nr_lru_pages(mi, BIT(i));
+                       acc->lru_pages[i] += memcg_page_state(mi,
+                                                             NR_LRU_BASE + i);
        }
 }
 
@@ -3331,6 +3307,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 #endif
 
 #ifdef CONFIG_NUMA
+
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
+#define LRU_ALL             ((1 << NR_LRU_LISTS) - 1)
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+                                          int nid, unsigned int lru_mask)
+{
+       struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+       unsigned long nr = 0;
+       enum lru_list lru;
+
+       VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+       for_each_lru(lru) {
+               if (!(BIT(lru) & lru_mask))
+                       continue;
+               nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+       }
+       return nr;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+                                            unsigned int lru_mask)
+{
+       unsigned long nr = 0;
+       enum lru_list lru;
+
+       for_each_lru(lru) {
+               if (!(BIT(lru) & lru_mask))
+                       continue;
+               nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+       }
+       return nr;
+}
+
 static int memcg_numa_stat_show(struct seq_file *m, void *v)
 {
        struct numa_stat {
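
A minimal sketch of the replacement accounting pattern, assuming the lruvec_page_state()/memcg_page_state() interfaces used above; the helper below is hypothetical.

/* Hypothetical helper; per-node LRU sizes now come from lruvec_page_state(). */
#include <linux/memcontrol.h>
#include <linux/mmzone.h>

static unsigned long example_node_file_pages(struct mem_cgroup *memcg, int nid)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);

	return lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
	       lruvec_page_state(lruvec, NR_ACTIVE_FILE);
}
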
@@ -3421,7 +3433,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 
        for (i = 0; i < NR_LRU_LISTS; i++)
                seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
-                          mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+                          memcg_page_state(memcg, NR_LRU_BASE + i) *
+                          PAGE_SIZE);
 
        /* Hierarchical information */
        memory = memsw = PAGE_COUNTER_MAX;
@@ -3927,8 +3940,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 
        /* this should eventually include NR_UNSTABLE_NFS */
        *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
-       *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
-                                                    (1 << LRU_ACTIVE_FILE));
+       *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
+                       memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
        *pheadroom = PAGE_COUNTER_MAX;
 
        while ((parent = parent_mem_cgroup(memcg))) {
index 650e65a..2647c89 100644 (file)
@@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas)
        xas_for_each(xas, page, ULONG_MAX) {
                if (xa_is_value(page))
                        continue;
+               page = find_subpage(page, xas->xa_index);
                if (page_count(page) - page_mapcount(page) > 1)
                        xas_set_mark(xas, MEMFD_TAG_PINNED);
 
@@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
                        bool clear = true;
                        if (xa_is_value(page))
                                continue;
+                       page = find_subpage(page, xas.xa_index);
                        if (page_count(page) - page_mapcount(page) != 1) {
                                /*
                                 * On the last scan, we clean up all those tags
index f7d962d..96f1d47 100644 (file)
@@ -1010,7 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        is_cow = is_cow_mapping(vma->vm_flags);
 
        if (is_cow) {
-               mmu_notifier_range_init(&range, src_mm, addr, end);
+               mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+                                       0, vma, src_mm, addr, end);
                mmu_notifier_invalidate_range_start(&range);
        }
 
@@ -1334,7 +1335,8 @@ void unmap_vmas(struct mmu_gather *tlb,
 {
        struct mmu_notifier_range range;
 
-       mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+                               start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);
        for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
                unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
@@ -1356,7 +1358,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
        struct mmu_gather tlb;
 
        lru_add_drain();
-       mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               start, start + size);
        tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
@@ -1382,7 +1385,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
        struct mmu_gather tlb;
 
        lru_add_drain();
-       mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               address, address + size);
        tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
@@ -1523,6 +1527,87 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_insert_page);
 
+/*
+ * __vm_map_pages - map a range of kernel pages into a user vma
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ * @offset: user's requested vm_pgoff
+ *
+ * This allows drivers to map a range of kernel pages into a user vma.
+ *
+ * Return: 0 on success and error code otherwise.
+ */
+static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+                               unsigned long num, unsigned long offset)
+{
+       unsigned long count = vma_pages(vma);
+       unsigned long uaddr = vma->vm_start;
+       int ret, i;
+
+       /* Fail if the user requested offset is beyond the end of the object */
+       if (offset > num)
+               return -ENXIO;
+
+       /* Fail if the user requested size exceeds available object size */
+       if (count > num - offset)
+               return -ENXIO;
+
+       for (i = 0; i < count; i++) {
+               ret = vm_insert_page(vma, uaddr, pages[offset + i]);
+               if (ret < 0)
+                       return ret;
+               uaddr += PAGE_SIZE;
+       }
+
+       return 0;
+}
+
+/**
+ * vm_map_pages - map a range of kernel pages, honouring the vma's vm_pgoff
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Maps an object consisting of @num pages, catering for the user's
+ * requested vm_pgoff
+ *
+ * If we fail to insert any page into the vma, the function will return
+ * immediately leaving any previously inserted pages present.  Callers
+ * from the mmap handler may immediately return the error as their caller
+ * will destroy the vma, removing any successfully inserted pages. Other
+ * callers should make their own arrangements for calling unmap_region().
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+                               unsigned long num)
+{
+       return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+/**
+ * vm_map_pages_zero - map a range of kernel pages at offset zero
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Similar to vm_map_pages(), except that it explicitly sets the offset
+ * to 0. This function is intended for the drivers that did not consider
+ * vm_pgoff.
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+                               unsigned long num)
+{
+       return __vm_map_pages(vma, pages, num, 0);
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
 static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, pgprot_t prot, bool mkwrite)
 {
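
A minimal sketch of the intended caller, assuming only vm_map_pages() as exported above; the driver structure, field names and mmap handler below are hypothetical.

/* Hypothetical driver; only the vm_map_pages() call is from this series. */
#include <linux/fs.h>
#include <linux/mm.h>

struct example_buf {
	struct page **pages;      /* preallocated pages backing the buffer */
	unsigned long num_pages;
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct example_buf *buf = file->private_data;

	/* honours vma->vm_pgoff and fails with -ENXIO if it is out of range */
	return vm_map_pages(vma, buf->pages, buf->num_pages);
}
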
@@ -2279,7 +2364,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 
        __SetPageUptodate(new_page);
 
-       mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+                               vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);
 
@@ -4104,8 +4190,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
                        goto out;
 
                if (range) {
-                       mmu_notifier_range_init(range, mm, address & PMD_MASK,
-                                            (address & PMD_MASK) + PMD_SIZE);
+                       mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
+                                               NULL, mm, address & PMD_MASK,
+                                               (address & PMD_MASK) + PMD_SIZE);
                        mmu_notifier_invalidate_range_start(range);
                }
                *ptlp = pmd_lock(mm, pmd);
@@ -4122,8 +4209,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
                goto out;
 
        if (range) {
-               mmu_notifier_range_init(range, mm, address & PAGE_MASK,
-                                    (address & PAGE_MASK) + PAGE_SIZE);
+               mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+                                       address & PAGE_MASK,
+                                       (address & PAGE_MASK) + PAGE_SIZE);
                mmu_notifier_invalidate_range_start(range);
        }
        ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
index b236069..6c0c4f4 100644 (file)
@@ -273,12 +273,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
  * add the new pages.
  */
 int __ref __add_pages(int nid, unsigned long phys_start_pfn,
-               unsigned long nr_pages, struct vmem_altmap *altmap,
-               bool want_memblock)
+               unsigned long nr_pages, struct mhp_restrictions *restrictions)
 {
        unsigned long i;
        int err = 0;
        int start_sec, end_sec;
+       struct vmem_altmap *altmap = restrictions->altmap;
 
        /* during initialize mem_map, align hot-added range to section */
        start_sec = pfn_to_section_nr(phys_start_pfn);
@@ -299,7 +299,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
 
        for (i = start_sec; i <= end_sec; i++) {
                err = __add_section(nid, section_nr_to_pfn(i), altmap,
-                               want_memblock);
+                               restrictions->flags & MHP_MEMBLOCK_API);
 
                /*
                 * EEXIST is finally dealt with by ioresource collision
@@ -516,26 +516,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
        pgdat_resize_unlock(zone->zone_pgdat, &flags);
 }
 
-static int __remove_section(struct zone *zone, struct mem_section *ms,
-               unsigned long map_offset, struct vmem_altmap *altmap)
+static void __remove_section(struct zone *zone, struct mem_section *ms,
+                            unsigned long map_offset,
+                            struct vmem_altmap *altmap)
 {
        unsigned long start_pfn;
        int scn_nr;
-       int ret = -EINVAL;
 
-       if (!valid_section(ms))
-               return ret;
+       if (WARN_ON_ONCE(!valid_section(ms)))
+               return;
 
-       ret = unregister_memory_section(ms);
-       if (ret)
-               return ret;
+       unregister_memory_section(ms);
 
        scn_nr = __section_nr(ms);
        start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
        __remove_zone(zone, start_pfn);
 
        sparse_remove_one_section(zone, ms, map_offset, altmap);
-       return 0;
 }
 
 /**
@@ -550,31 +547,17 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
  * sure that pages are marked reserved and zones are adjusted properly by
  * calling offline_pages().
  */
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
-                unsigned long nr_pages, struct vmem_altmap *altmap)
+void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+                   unsigned long nr_pages, struct vmem_altmap *altmap)
 {
        unsigned long i;
        unsigned long map_offset = 0;
-       int sections_to_remove, ret = 0;
+       int sections_to_remove;
 
        /* In the ZONE_DEVICE case device driver owns the memory region */
        if (is_dev_zone(zone)) {
                if (altmap)
                        map_offset = vmem_altmap_offset(altmap);
-       } else {
-               resource_size_t start, size;
-
-               start = phys_start_pfn << PAGE_SHIFT;
-               size = nr_pages * PAGE_SIZE;
-
-               ret = release_mem_region_adjustable(&iomem_resource, start,
-                                       size);
-               if (ret) {
-                       resource_size_t endres = start + size - 1;
-
-                       pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
-                                       &start, &endres, ret);
-               }
        }
 
        clear_zone_contiguous(zone);
@@ -590,16 +573,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
                unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
 
                cond_resched();
-               ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
-                               altmap);
+               __remove_section(zone, __pfn_to_section(pfn), map_offset,
+                                altmap);
                map_offset = 0;
-               if (ret)
-                       break;
        }
 
        set_zone_contiguous(zone);
-
-       return ret;
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
@@ -714,7 +693,7 @@ static void node_states_check_changes_online(unsigned long nr_pages,
        if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
                arg->status_change_nid_normal = nid;
 #ifdef CONFIG_HIGHMEM
-       if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY))
+       if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
                arg->status_change_nid_high = nid;
 #endif
 }
@@ -1097,6 +1076,9 @@ static int online_memory_block(struct memory_block *mem, void *arg)
  */
 int __ref add_memory_resource(int nid, struct resource *res)
 {
+       struct mhp_restrictions restrictions = {
+               .flags = MHP_MEMBLOCK_API,
+       };
        u64 start, size;
        bool new_node = false;
        int ret;
@@ -1124,7 +1106,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
        new_node = ret;
 
        /* call arch's memory hotadd */
-       ret = arch_add_memory(nid, start, size, NULL, true);
+       ret = arch_add_memory(nid, start, size, &restrictions);
        if (ret < 0)
                goto error;
 
@@ -1341,8 +1323,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
                if (!PageHuge(page))
                        continue;
                head = compound_head(page);
-               if (hugepage_migration_supported(page_hstate(head)) &&
-                   page_huge_active(head))
+               if (page_huge_active(head))
                        return pfn;
                skip = (1 << compound_order(head)) - (page - head);
                pfn += skip - 1;
@@ -1382,10 +1363,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 
                if (PageHuge(page)) {
                        struct page *head = compound_head(page);
-                       if (compound_order(head) > PFN_SECTION_SHIFT) {
-                               ret = -EBUSY;
-                               break;
-                       }
                        pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
                        isolate_huge_page(head, &source);
                        continue;
@@ -1454,15 +1431,10 @@ static int
 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
                        void *data)
 {
-       __offline_isolated_pages(start, start + nr_pages);
-       return 0;
-}
+       unsigned long *offlined_pages = (unsigned long *)data;
 
-static void
-offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
-{
-       walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
-                               offline_isolated_pages_cb);
+       *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
+       return 0;
 }
 
 /*
@@ -1472,26 +1444,7 @@ static int
 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
                        void *data)
 {
-       int ret;
-       long offlined = *(long *)data;
-       ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
-       offlined = nr_pages;
-       if (!ret)
-               *(long *)data += offlined;
-       return ret;
-}
-
-static long
-check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
-{
-       long offlined = 0;
-       int ret;
-
-       ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
-                       check_pages_isolated_cb);
-       if (ret < 0)
-               offlined = (long)ret;
-       return offlined;
+       return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
 }
 
 static int __init cmdline_parse_movable_node(char *p)
@@ -1576,7 +1529,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
                  unsigned long end_pfn)
 {
        unsigned long pfn, nr_pages;
-       long offlined_pages;
+       unsigned long offlined_pages = 0;
        int ret, node, nr_isolate_pageblock;
        unsigned long flags;
        unsigned long valid_start, valid_end;
@@ -1652,14 +1605,15 @@ static int __ref __offline_pages(unsigned long start_pfn,
                        goto failed_removal_isolated;
                }
                /* check again */
-               offlined_pages = check_pages_isolated(start_pfn, end_pfn);
-       } while (offlined_pages < 0);
+               ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+                                           NULL, check_pages_isolated_cb);
+       } while (ret);
 
-       pr_info("Offlined Pages %ld\n", offlined_pages);
        /* Ok, all of our target is isolated.
           We cannot do rollback at this point. */
-       offline_isolated_pages(start_pfn, end_pfn);
-
+       walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+                             &offlined_pages, offline_isolated_pages_cb);
+       pr_info("Offlined Pages %ld\n", offlined_pages);
        /*
         * Onlining will reset pagetype flags and makes migrate type
         * MOVABLE, so just need to decrease the number of isolated
@@ -1843,6 +1797,26 @@ void try_offline_node(int nid)
 }
 EXPORT_SYMBOL(try_offline_node);
 
+static void __release_memory_resource(resource_size_t start,
+                                     resource_size_t size)
+{
+       int ret;
+
+       /*
+        * When removing memory at the same granularity as it was added,
+        * this function never fails. It might only fail if resources
+        * have to be adjusted or split. We'll ignore the error, as
+        * removing memory cannot fail.
+        */
+       ret = release_mem_region_adjustable(&iomem_resource, start, size);
+       if (ret) {
+               resource_size_t endres = start + size - 1;
+
+               pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+                       &start, &endres, ret);
+       }
+}
+
 /**
  * remove_memory
  * @nid: the node ID
@@ -1877,6 +1851,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
        memblock_remove(start, size);
 
        arch_remove_memory(nid, start, size, NULL);
+       __release_memory_resource(start, size);
 
        try_offline_node(nid);
 
index 663a544..f2ecc28 100644 (file)
@@ -463,7 +463,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 
                for (i = 1; i < HPAGE_PMD_NR; i++) {
                        xas_next(&xas);
-                       xas_store(&xas, newpage + i);
+                       xas_store(&xas, newpage);
                }
        }
 
@@ -2356,7 +2356,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
        mm_walk.mm = migrate->vma->vm_mm;
        mm_walk.private = migrate;
 
-       mmu_notifier_range_init(&range, mm_walk.mm, migrate->start,
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm,
+                               migrate->start,
                                migrate->end);
        mmu_notifier_invalidate_range_start(&range);
        walk_page_range(migrate->start, migrate->end, &mm_walk);
@@ -2764,6 +2765,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
                                notified = true;
 
                                mmu_notifier_range_init(&range,
+                                                       MMU_NOTIFY_CLEAR, 0,
+                                                       NULL,
                                                        migrate->vma->vm_mm,
                                                        addr, migrate->end);
                                mmu_notifier_invalidate_range_start(&range);
index 9c884ab..ee36068 100644 (file)
@@ -180,7 +180,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
                        if (_ret) {
                                pr_info("%pS callback failed with %d in %sblockable context.\n",
                                        mn->ops->invalidate_range_start, _ret,
-                                       !range->blockable ? "non-" : "");
+                                       !mmu_notifier_range_blockable(range) ? "non-" : "");
                                ret = _ret;
                        }
                }
@@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
        mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+       if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+               return false;
+       /* Return true if the vma still has the read flag set. */
+       return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
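
A minimal sketch of a notifier user, assuming the mmu_notifier_range_blockable() and mmu_notifier_range_update_to_read_only() helpers above; the callback below is hypothetical driver code.

/* Hypothetical driver callback; only the two range helpers are from here. */
#include <linux/mmu_notifier.h>

static int example_invalidate_range_start(struct mmu_notifier *mn,
					  const struct mmu_notifier_range *range)
{
	if (!mmu_notifier_range_blockable(range))
		return -EAGAIN;  /* cannot sleep here, ask the core to retry */

	if (mmu_notifier_range_update_to_read_only(range)) {
		/* CPU mapping stays readable: downgrade device mappings only */
	} else {
		/* fully invalidate device mappings for [range->start, range->end) */
	}
	return 0;
}
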
index 028c724..bf38dfb 100644 (file)
@@ -39,7 +39,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, unsigned long end, pgprot_t newprot,
                int dirty_accountable, int prot_numa)
 {
-       struct mm_struct *mm = vma->vm_mm;
        pte_t *pte, oldpte;
        spinlock_t *ptl;
        unsigned long pages = 0;
@@ -136,7 +135,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                newpte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(oldpte))
                                        newpte = pte_swp_mksoft_dirty(newpte);
-                               set_pte_at(mm, addr, pte, newpte);
+                               set_pte_at(vma->vm_mm, addr, pte, newpte);
 
                                pages++;
                        }
@@ -150,7 +149,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                 */
                                make_device_private_entry_read(&entry);
                                newpte = swp_entry_to_pte(entry);
-                               set_pte_at(mm, addr, pte, newpte);
+                               set_pte_at(vma->vm_mm, addr, pte, newpte);
 
                                pages++;
                        }
@@ -185,7 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 
                /* invoke the mmu notifier if the pmd is populated */
                if (!range.start) {
-                       mmu_notifier_range_init(&range, vma->vm_mm, addr, end);
+                       mmu_notifier_range_init(&range,
+                               MMU_NOTIFY_PROTECTION_VMA, 0,
+                               vma, vma->vm_mm, addr, end);
                        mmu_notifier_invalidate_range_start(&range);
                }
 
index e3edef6..fc241d2 100644 (file)
@@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
        old_end = old_addr + len;
        flush_cache_range(vma, old_addr, old_end);
 
-       mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+                               old_addr, old_end);
        mmu_notifier_invalidate_range_start(&range);
 
        for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
index 749276b..b492fd1 100644 (file)
@@ -473,6 +473,20 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_insert_page);
 
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+                       unsigned long num)
+{
+       return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+                               unsigned long num)
+{
+       return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
 /*
  *  sys_brk() for the most part doesn't need the global kernel
  *  lock, except when an application is doing something nasty
index 3a24848..539c91d 100644 (file)
@@ -531,7 +531,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
                        struct mmu_notifier_range range;
                        struct mmu_gather tlb;
 
-                       mmu_notifier_range_init(&range, mm, vma->vm_start,
+                       mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
+                                               vma, mm, vma->vm_start,
                                                vma->vm_end);
                        tlb_gather_mmu(&tlb, mm, range.start, range.end);
                        if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
index 9f61dfe..0765648 100644 (file)
@@ -2808,6 +2808,18 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 }
 EXPORT_SYMBOL(__test_set_page_writeback);
 
+/*
+ * Wait for a page to complete writeback
+ */
+void wait_on_page_writeback(struct page *page)
+{
+       if (PageWriteback(page)) {
+               trace_wait_on_page_writeback(page, page_mapping(page));
+               wait_on_page_bit(page, PG_writeback);
+       }
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+
 /**
  * wait_for_stable_page() - wait for writeback to finish, if necessary.
  * @page:      The page to wait on.
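
A minimal sketch of a typical caller now that wait_on_page_writeback() is out of line and traced; the filesystem-style helper below is hypothetical.

/* Hypothetical helper around the now out-of-line wait_on_page_writeback(). */
#include <linux/pagemap.h>

static void example_stabilize_page(struct page *page)
{
	lock_page(page);
	/* the new tracepoint fires when the page is still under writeback */
	wait_on_page_writeback(page);
	/* ... safe to modify the page contents here ... */
	unlock_page(page);
}
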
index 5966110..f2f3fb4 100644 (file)
@@ -1416,36 +1416,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
 #endif
 
 #ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
-                  struct mminit_pfnnid_cache *state)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 {
        int nid;
 
-       nid = __early_pfn_to_nid(pfn, state);
+       nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
        if (nid >= 0 && nid != node)
                return false;
        return true;
 }
 
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
-       return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
 #else
-
 static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 {
        return true;
 }
-static inline bool __meminit  __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
-                  struct mminit_pfnnid_cache *state)
-{
-       return true;
-}
 #endif
 
 
@@ -1574,21 +1560,13 @@ static inline void __init pgdat_init_report_one_done(void)
  *
  * Then, we check if a current large page is valid by only checking the validity
  * of the head pfn.
- *
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not belong
- * to this memory node.
  */
-static inline bool __init
-deferred_pfn_valid(int nid, unsigned long pfn,
-                  struct mminit_pfnnid_cache *nid_init_state)
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
 {
        if (!pfn_valid_within(pfn))
                return false;
        if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
                return false;
-       if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
-               return false;
        return true;
 }
 
@@ -1596,15 +1574,14 @@ deferred_pfn_valid(int nid, unsigned long pfn,
  * Free pages to buddy allocator. Try to free aligned pages in
  * pageblock_nr_pages sizes.
  */
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+static void __init deferred_free_pages(unsigned long pfn,
                                       unsigned long end_pfn)
 {
-       struct mminit_pfnnid_cache nid_init_state = { };
        unsigned long nr_pgmask = pageblock_nr_pages - 1;
        unsigned long nr_free = 0;
 
        for (; pfn < end_pfn; pfn++) {
-               if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+               if (!deferred_pfn_valid(pfn)) {
                        deferred_free_range(pfn - nr_free, nr_free);
                        nr_free = 0;
                } else if (!(pfn & nr_pgmask)) {
@@ -1624,17 +1601,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
  * by performing it only once every pageblock_nr_pages.
  * Return number of pages initialized.
  */
-static unsigned long  __init deferred_init_pages(int nid, int zid,
+static unsigned long  __init deferred_init_pages(struct zone *zone,
                                                 unsigned long pfn,
                                                 unsigned long end_pfn)
 {
-       struct mminit_pfnnid_cache nid_init_state = { };
        unsigned long nr_pgmask = pageblock_nr_pages - 1;
+       int nid = zone_to_nid(zone);
        unsigned long nr_pages = 0;
+       int zid = zone_idx(zone);
        struct page *page = NULL;
 
        for (; pfn < end_pfn; pfn++) {
-               if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+               if (!deferred_pfn_valid(pfn)) {
                        page = NULL;
                        continue;
                } else if (!page || !(pfn & nr_pgmask)) {
@@ -1649,18 +1627,100 @@ static unsigned long  __init deferred_init_pages(int nid, int zid,
        return (nr_pages);
 }
 
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+                                   unsigned long *spfn, unsigned long *epfn,
+                                   unsigned long first_init_pfn)
+{
+       u64 j;
+
+       /*
+        * Start out by walking through the ranges in this zone that have
+        * already been initialized. We don't need to do anything with them
+        * so we just need to flush them out of the system.
+        */
+       for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+               if (*epfn <= first_init_pfn)
+                       continue;
+               if (*spfn < first_init_pfn)
+                       *spfn = first_init_pfn;
+               *i = j;
+               return true;
+       }
+
+       return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+                      unsigned long *end_pfn)
+{
+       unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+       unsigned long spfn = *start_pfn, epfn = *end_pfn;
+       unsigned long nr_pages = 0;
+       u64 j = *i;
+
+       /* First we loop through and initialize the page values */
+       for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+               unsigned long t;
+
+               if (mo_pfn <= *start_pfn)
+                       break;
+
+               t = min(mo_pfn, *end_pfn);
+               nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+               if (mo_pfn < *end_pfn) {
+                       *start_pfn = mo_pfn;
+                       break;
+               }
+       }
+
+       /* Reset values and now loop through freeing pages as needed */
+       swap(j, *i);
+
+       for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+               unsigned long t;
+
+               if (mo_pfn <= spfn)
+                       break;
+
+               t = min(mo_pfn, epfn);
+               deferred_free_pages(spfn, t);
+
+               if (mo_pfn <= epfn)
+                       break;
+       }
+
+       return nr_pages;
+}
+
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
 {
        pg_data_t *pgdat = data;
-       int nid = pgdat->node_id;
+       const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+       unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+       unsigned long first_init_pfn, flags;
        unsigned long start = jiffies;
-       unsigned long nr_pages = 0;
-       unsigned long spfn, epfn, first_init_pfn, flags;
-       phys_addr_t spa, epa;
-       int zid;
        struct zone *zone;
-       const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+       int zid;
        u64 i;
 
        /* Bind memory initialisation thread to a local node if possible */
@@ -1686,31 +1746,27 @@ static int __init deferred_init_memmap(void *data)
                if (first_init_pfn < zone_end_pfn(zone))
                        break;
        }
-       first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+
+       /* If the zone is empty somebody else may have cleared out the zone */
+       if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+                                                first_init_pfn))
+               goto zone_empty;
 
        /*
-        * Initialize and free pages. We do it in two loops: first we initialize
-        * struct page, than free to buddy allocator, because while we are
-        * freeing pages we can access pages that are ahead (computing buddy
-        * page in __free_one_page()).
+        * Initialize and free pages in MAX_ORDER sized increments so
+        * that we can avoid introducing any issues with the buddy
+        * allocator.
         */
-       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
-               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
-               epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
-               nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
-       }
-       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
-               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
-               epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
-               deferred_free_pages(nid, zid, spfn, epfn);
-       }
+       while (spfn < epfn)
+               nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+zone_empty:
        pgdat_resize_unlock(pgdat, &flags);
 
        /* Sanity check that the next zone really is unpopulated */
        WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
 
-       pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
-                                       jiffies_to_msecs(jiffies - start));
+       pr_info("node %d initialised, %lu pages in %ums\n",
+               pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
 
        pgdat_init_report_one_done();
        return 0;
@@ -1734,14 +1790,11 @@ static int __init deferred_init_memmap(void *data)
 static noinline bool __init
 deferred_grow_zone(struct zone *zone, unsigned int order)
 {
-       int zid = zone_idx(zone);
-       int nid = zone_to_nid(zone);
-       pg_data_t *pgdat = NODE_DATA(nid);
        unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
-       unsigned long nr_pages = 0;
-       unsigned long first_init_pfn, spfn, epfn, t, flags;
+       pg_data_t *pgdat = zone->zone_pgdat;
        unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
-       phys_addr_t spa, epa;
+       unsigned long spfn, epfn, flags;
+       unsigned long nr_pages = 0;
        u64 i;
 
        /* Only the last zone may have deferred pages */
@@ -1770,38 +1823,35 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
                return true;
        }
 
-       first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
-
-       if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+       /* If the zone is empty, somebody else may already have cleared it out */
+       if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+                                                first_deferred_pfn)) {
+               pgdat->first_deferred_pfn = ULONG_MAX;
                pgdat_resize_unlock(pgdat, &flags);
-               return false;
+               return true;
        }
 
-       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
-               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
-               epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+       /*
+        * Initialize and free pages in MAX_ORDER sized increments so
+        * that we can avoid introducing any issues with the buddy
+        * allocator.
+        */
+       while (spfn < epfn) {
+               /* update our first deferred PFN for this section */
+               first_deferred_pfn = spfn;
 
-               while (spfn < epfn && nr_pages < nr_pages_needed) {
-                       t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
-                       first_deferred_pfn = min(t, epfn);
-                       nr_pages += deferred_init_pages(nid, zid, spfn,
-                                                       first_deferred_pfn);
-                       spfn = first_deferred_pfn;
-               }
+               nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+
+               /* We should only stop along section boundaries */
+               if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+                       continue;
 
+               /* If our quota has been met we can stop here */
                if (nr_pages >= nr_pages_needed)
                        break;
        }
 
-       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
-               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
-               epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
-               deferred_free_pages(nid, zid, spfn, epfn);
-
-               if (first_deferred_pfn == epfn)
-                       break;
-       }
-       pgdat->first_deferred_pfn = first_deferred_pfn;
+       pgdat->first_deferred_pfn = spfn;
        pgdat_resize_unlock(pgdat, &flags);
 
        return nr_pages > 0;
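
A minimal sketch (not part of the patch) of the XOR test above: assuming
SPARSEMEM, PAGES_PER_SECTION is a power of two, so two PFNs lie in the same
memory section exactly when their XOR has no bit set at or above
log2(PAGES_PER_SECTION). The helper name is hypothetical:

#include <linux/mmzone.h>	/* PAGES_PER_SECTION */
#include <linux/types.h>	/* bool */

/* True when both PFNs fall into the same memory section. */
static bool same_section(unsigned long a, unsigned long b)
{
	return (a ^ b) < PAGES_PER_SECTION;
}

This is why the quota check above only runs once spfn has crossed into a new
section.
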
@@ -1846,10 +1896,9 @@ void __init page_alloc_init_late(void)
        /* Reinit limits that are based on free pages after the kernel is up */
        files_maxfiles_init();
 #endif
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
        /* Discard memblock private memory */
        memblock_discard();
-#endif
 
        for_each_populated_zone(zone)
                set_zone_contiguous(zone);
@@ -3120,9 +3169,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 
 /* Lock and remove page from the per-cpu list */
 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
-                       struct zone *zone, unsigned int order,
-                       gfp_t gfp_flags, int migratetype,
-                       unsigned int alloc_flags)
+                       struct zone *zone, gfp_t gfp_flags,
+                       int migratetype, unsigned int alloc_flags)
 {
        struct per_cpu_pages *pcp;
        struct list_head *list;
@@ -3134,7 +3182,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
        list = &pcp->lists[migratetype];
        page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
        if (page) {
-               __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+               __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
                zone_statistics(preferred_zone, zone);
        }
        local_irq_restore(flags);
@@ -3154,8 +3202,8 @@ struct page *rmqueue(struct zone *preferred_zone,
        struct page *page;
 
        if (likely(order == 0)) {
-               page = rmqueue_pcplist(preferred_zone, zone, order,
-                               gfp_flags, migratetype, alloc_flags);
+               page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+                                       migratetype, alloc_flags);
                goto out;
        }
 
@@ -4821,7 +4869,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
 /**
  * alloc_pages_exact - allocate an exact number physically-contiguous pages.
  * @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
  *
  * This function is similar to alloc_pages(), except that it allocates the
  * minimum number of pages to satisfy the request.  alloc_pages() can only
@@ -4838,6 +4886,9 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
        unsigned int order = get_order(size);
        unsigned long addr;
 
+       if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+               gfp_mask &= ~__GFP_COMP;
+
        addr = __get_free_pages(gfp_mask, order);
        return make_alloc_exact(addr, order, size);
 }
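
A minimal usage sketch (not part of the patch) of a caller that follows the
new rule; the function name and the 48 KiB size are illustrative only.
__GFP_COMP would break the tail-page freeing done by make_alloc_exact(),
which is why it is now warned about and masked off:

#include <linux/errno.h>
#include <linux/gfp.h>		/* alloc_pages_exact(), free_pages_exact() */
#include <linux/string.h>	/* memset() */

static int use_exact_buffer(void)
{
	size_t sz = 48 * 1024;	/* 12 pages with 4K pages; not a power of two */
	void *buf = alloc_pages_exact(sz, GFP_KERNEL);	/* no __GFP_COMP here */

	if (!buf)
		return -ENOMEM;
	memset(buf, 0, sz);
	free_pages_exact(buf, sz);
	return 0;
}
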
@@ -4848,7 +4899,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
  *                        pages on a node.
  * @nid: the preferred node ID where memory should be allocated
  * @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
  *
  * Like alloc_pages_exact(), but try to allocate on node nid first before falling
  * back.
@@ -4858,7 +4909,12 @@ EXPORT_SYMBOL(alloc_pages_exact);
 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 {
        unsigned int order = get_order(size);
-       struct page *p = alloc_pages_node(nid, gfp_mask, order);
+       struct page *p;
+
+       if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+               gfp_mask &= ~__GFP_COMP;
+
+       p = alloc_pages_node(nid, gfp_mask, order);
        if (!p)
                return NULL;
        return make_alloc_exact((unsigned long)page_address(p), order, size);
@@ -6247,13 +6303,15 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
                                        unsigned long *zone_end_pfn,
                                        unsigned long *ignored)
 {
+       unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+       unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
        /* When hotadd a new node from cpu_up(), the node should be empty */
        if (!node_start_pfn && !node_end_pfn)
                return 0;
 
        /* Get the start and end of the zone */
-       *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-       *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+       *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+       *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
        adjust_zone_range_for_zone_movable(nid, zone_type,
                                node_start_pfn, node_end_pfn,
                                zone_start_pfn, zone_end_pfn);
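
A minimal sketch (not part of the patch) of what the clamp() above computes;
the zone bounds and the helper name are hypothetical:

#include <linux/kernel.h>	/* clamp() */

static unsigned long node_zone_start(unsigned long node_start_pfn)
{
	/* hypothetical zone bounds: the zone spans PFNs [0x1000, 0x80000) */
	unsigned long zone_low = 0x1000, zone_high = 0x80000;

	/* 0x40000 stays 0x40000; anything below 0x1000 is raised to 0x1000 */
	return clamp(node_start_pfn, zone_low, zone_high);
}
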
@@ -8129,8 +8187,7 @@ unmovable:
        return true;
 }
 
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
-
+#ifdef CONFIG_CONTIG_ALLOC
 static unsigned long pfn_max_align_down(unsigned long pfn)
 {
        return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
@@ -8339,8 +8396,9 @@ done:
                                pfn_max_align_up(end), migratetype);
        return ret;
 }
+#endif /* CONFIG_CONTIG_ALLOC */
 
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
 {
        unsigned int count = 0;
 
@@ -8352,7 +8410,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
        }
        WARN(count != 0, "%d pages are still in use!\n", count);
 }
-#endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
@@ -8394,7 +8451,7 @@ void zone_pcp_reset(struct zone *zone)
  * All pages in the range must be in a single zone and isolated
  * before calling this.
  */
-void
+unsigned long
 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
        struct page *page;
@@ -8402,12 +8459,15 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
        unsigned int order, i;
        unsigned long pfn;
        unsigned long flags;
+       unsigned long offlined_pages = 0;
+
        /* find the first valid pfn */
        for (pfn = start_pfn; pfn < end_pfn; pfn++)
                if (pfn_valid(pfn))
                        break;
        if (pfn == end_pfn)
-               return;
+               return offlined_pages;
+
        offline_mem_sections(pfn, end_pfn);
        zone = page_zone(pfn_to_page(pfn));
        spin_lock_irqsave(&zone->lock, flags);
@@ -8425,12 +8485,14 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
                if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
                        pfn++;
                        SetPageReserved(page);
+                       offlined_pages++;
                        continue;
                }
 
                BUG_ON(page_count(page));
                BUG_ON(!PageBuddy(page));
                order = page_order(page);
+               offlined_pages += 1 << order;
 #ifdef CONFIG_DEBUG_VM
                pr_info("remove from free list %lx %d %lx\n",
                        pfn, 1 << order, end_pfn);
@@ -8443,6 +8505,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
                pfn += (1 << order);
        }
        spin_unlock_irqrestore(&zone->lock, flags);
+
+       return offlined_pages;
 }
 #endif
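
A minimal sketch (not part of the patch) of how a hotplug caller can consume
the new return value instead of re-counting the offlined range afterwards.
The wrapper name is hypothetical and start_pfn is assumed to be valid:

#include <linux/memory_hotplug.h>	/* __offline_isolated_pages() */
#include <linux/mm.h>			/* adjust_managed_page_count() */

static unsigned long offline_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long offlined = __offline_isolated_pages(start_pfn, end_pfn);

	/* keep the zone's managed-page accounting in sync with what was removed */
	adjust_managed_page_count(pfn_to_page(start_pfn), -(long)offlined);
	return offlined;
}
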
 
index 0192807..e3638a5 100644 (file)
@@ -151,8 +151,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
        for (i = 0; i < nr_pages; i++) {
                struct page *page;
 
-               if (!pfn_valid_within(pfn + i))
-                       continue;
                page = pfn_to_online_page(pfn + i);
                if (!page)
                        continue;
index b30c7c7..e5dfe2a 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -850,7 +850,7 @@ int page_referenced(struct page *page,
        };
 
        *vm_flags = 0;
-       if (!page_mapped(page))
+       if (!pra.mapcount)
                return 0;
 
        if (!page_rmapping(page))
@@ -896,7 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
         * We have to assume the worse case ie pmd for invalidation. Note that
         * the page can not be free from this function.
         */
-       mmu_notifier_range_init(&range, vma->vm_mm, address,
+       mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+                               0, vma, vma->vm_mm, address,
                                min(vma->vm_end, address +
                                    (PAGE_SIZE << compound_order(page))));
        mmu_notifier_invalidate_range_start(&range);
@@ -928,7 +929,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                                continue;
 
                        flush_cache_page(vma, address, page_to_pfn(page));
-                       entry = pmdp_huge_clear_flush(vma, address, pmd);
+                       entry = pmdp_invalidate(vma, address, pmd);
                        entry = pmd_wrprotect(entry);
                        entry = pmd_mkclean(entry);
                        set_pmd_at(vma->vm_mm, address, pmd, entry);
@@ -1371,7 +1372,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         * Note that the page can not be free in this function as call of
         * try_to_unmap() must hold a reference on the page.
         */
-       mmu_notifier_range_init(&range, vma->vm_mm, address,
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+                               address,
                                min(vma->vm_end, address +
                                    (PAGE_SIZE << compound_order(page))));
        if (PageHuge(page)) {
index f4dce9c..1bb3b8d 100644 (file)
@@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page,
                if (xas_error(&xas))
                        goto unlock;
 next:
-               xas_store(&xas, page + i);
+               xas_store(&xas, page);
                if (++i < nr) {
                        xas_next(&xas);
                        goto next;
index 284ab73..2915d91 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -990,10 +990,8 @@ static void cpuup_canceled(long cpu)
 
                /* cpu is dead; no one can alloc from it. */
                nc = per_cpu_ptr(cachep->cpu_cache, cpu);
-               if (nc) {
-                       free_block(cachep, nc->entry, nc->avail, node, &list);
-                       nc->avail = 0;
-               }
+               free_block(cachep, nc->entry, nc->avail, node, &list);
+               nc->avail = 0;
 
                if (!cpumask_empty(mask)) {
                        spin_unlock_irq(&n->list_lock);
@@ -1674,8 +1672,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
 {
        struct page *page, *n;
 
-       list_for_each_entry_safe(page, n, list, lru) {
-               list_del(&page->lru);
+       list_for_each_entry_safe(page, n, list, slab_list) {
+               list_del(&page->slab_list);
                slab_destroy(cachep, page);
        }
 }
@@ -2231,8 +2229,8 @@ static int drain_freelist(struct kmem_cache *cache,
                        goto out;
                }
 
-               page = list_entry(p, struct page, lru);
-               list_del(&page->lru);
+               page = list_entry(p, struct page, slab_list);
+               list_del(&page->slab_list);
                n->free_slabs--;
                n->total_slabs--;
                /*
@@ -2691,13 +2689,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
        if (!page)
                return;
 
-       INIT_LIST_HEAD(&page->lru);
+       INIT_LIST_HEAD(&page->slab_list);
        n = get_node(cachep, page_to_nid(page));
 
        spin_lock(&n->list_lock);
        n->total_slabs++;
        if (!page->active) {
-               list_add_tail(&page->lru, &(n->slabs_free));
+               list_add_tail(&page->slab_list, &n->slabs_free);
                n->free_slabs++;
        } else
                fixup_slab_list(cachep, n, page, &list);
@@ -2806,9 +2804,9 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
                                void **list)
 {
        /* move slabp to correct slabp list: */
-       list_del(&page->lru);
+       list_del(&page->slab_list);
        if (page->active == cachep->num) {
-               list_add(&page->lru, &n->slabs_full);
+               list_add(&page->slab_list, &n->slabs_full);
                if (OBJFREELIST_SLAB(cachep)) {
 #if DEBUG
                        /* Poisoning will be done without holding the lock */
@@ -2822,7 +2820,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
                        page->freelist = NULL;
                }
        } else
-               list_add(&page->lru, &n->slabs_partial);
+               list_add(&page->slab_list, &n->slabs_partial);
 }
 
 /* Try to find non-pfmemalloc slab if needed */
@@ -2845,20 +2843,20 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
        }
 
        /* Move pfmemalloc slab to the end of list to speed up next search */
-       list_del(&page->lru);
+       list_del(&page->slab_list);
        if (!page->active) {
-               list_add_tail(&page->lru, &n->slabs_free);
+               list_add_tail(&page->slab_list, &n->slabs_free);
                n->free_slabs++;
        } else
-               list_add_tail(&page->lru, &n->slabs_partial);
+               list_add_tail(&page->slab_list, &n->slabs_partial);
 
-       list_for_each_entry(page, &n->slabs_partial, lru) {
+       list_for_each_entry(page, &n->slabs_partial, slab_list) {
                if (!PageSlabPfmemalloc(page))
                        return page;
        }
 
        n->free_touched = 1;
-       list_for_each_entry(page, &n->slabs_free, lru) {
+       list_for_each_entry(page, &n->slabs_free, slab_list) {
                if (!PageSlabPfmemalloc(page)) {
                        n->free_slabs--;
                        return page;
@@ -2873,11 +2871,12 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
        struct page *page;
 
        assert_spin_locked(&n->list_lock);
-       page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
+       page = list_first_entry_or_null(&n->slabs_partial, struct page,
+                                       slab_list);
        if (!page) {
                n->free_touched = 1;
                page = list_first_entry_or_null(&n->slabs_free, struct page,
-                                               lru);
+                                               slab_list);
                if (page)
                        n->free_slabs--;
        }
@@ -3378,29 +3377,29 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
                objp = objpp[i];
 
                page = virt_to_head_page(objp);
-               list_del(&page->lru);
+               list_del(&page->slab_list);
                check_spinlock_acquired_node(cachep, node);
                slab_put_obj(cachep, page, objp);
                STATS_DEC_ACTIVE(cachep);
 
                /* fixup slab chains */
                if (page->active == 0) {
-                       list_add(&page->lru, &n->slabs_free);
+                       list_add(&page->slab_list, &n->slabs_free);
                        n->free_slabs++;
                } else {
                        /* Unconditionally move a slab to the end of the
                         * partial list on free - maximum time for the
                         * other objects to be freed, too.
                         */
-                       list_add_tail(&page->lru, &n->slabs_partial);
+                       list_add_tail(&page->slab_list, &n->slabs_partial);
                }
        }
 
        while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
                n->free_objects -= cachep->num;
 
-               page = list_last_entry(&n->slabs_free, struct page, lru);
-               list_move(&page->lru, list);
+               page = list_last_entry(&n->slabs_free, struct page, slab_list);
+               list_move(&page->slab_list, list);
                n->free_slabs--;
                n->total_slabs--;
        }
@@ -3438,7 +3437,7 @@ free_done:
                int i = 0;
                struct page *page;
 
-               list_for_each_entry(page, &n->slabs_free, lru) {
+               list_for_each_entry(page, &n->slabs_free, slab_list) {
                        BUG_ON(page->active);
 
                        i++;
@@ -4292,8 +4291,12 @@ static int leaks_show(struct seq_file *m, void *p)
         * whole processing.
         */
        do {
-               set_store_user_clean(cachep);
                drain_cpu_caches(cachep);
+               /*
+                * drain_cpu_caches() could make kmemleak_object and
+                * debug_objects_cache dirty, so reset afterwards.
+                */
+               set_store_user_clean(cachep);
 
                x[1] = 0;
 
@@ -4302,9 +4305,9 @@ static int leaks_show(struct seq_file *m, void *p)
                        check_irq_on();
                        spin_lock_irq(&n->list_lock);
 
-                       list_for_each_entry(page, &n->slabs_full, lru)
+                       list_for_each_entry(page, &n->slabs_full, slab_list)
                                handle_slab(x, cachep, page);
-                       list_for_each_entry(page, &n->slabs_partial, lru)
+                       list_for_each_entry(page, &n->slabs_partial, slab_list)
                                handle_slab(x, cachep, page);
                        spin_unlock_irq(&n->list_lock);
                }
index 307c2c9..84aefd9 100644 (file)
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -112,13 +112,13 @@ static inline int slob_page_free(struct page *sp)
 
 static void set_slob_page_free(struct page *sp, struct list_head *list)
 {
-       list_add(&sp->lru, list);
+       list_add(&sp->slab_list, list);
        __SetPageSlobFree(sp);
 }
 
 static inline void clear_slob_page_free(struct page *sp)
 {
-       list_del(&sp->lru);
+       list_del(&sp->slab_list);
        __ClearPageSlobFree(sp);
 }
 
@@ -213,13 +213,26 @@ static void slob_free_pages(void *b, int order)
 }
 
 /*
- * Allocate a slob block within a given slob_page sp.
+ * slob_page_alloc() - Allocate a slob block within a given slob_page sp.
+ * @sp: Page to look in.
+ * @size: Size of the allocation.
+ * @align: Allocation alignment.
+ * @page_removed_from_list: Return parameter, see below.
+ *
+ * Tries to find a chunk of memory at least @size bytes big within @sp.
+ *
+ * Return: Pointer to memory if allocated, %NULL otherwise.  If the
+ *         allocation fills up @sp then it is removed from the freelist;
+ *         in that case @page_removed_from_list is set to true (and to
+ *         false otherwise).
  */
-static void *slob_page_alloc(struct page *sp, size_t size, int align)
+static void *slob_page_alloc(struct page *sp, size_t size, int align,
+                            bool *page_removed_from_list)
 {
        slob_t *prev, *cur, *aligned = NULL;
        int delta = 0, units = SLOB_UNITS(size);
 
+       *page_removed_from_list = false;
        for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
                slobidx_t avail = slob_units(cur);
 
@@ -254,8 +267,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align)
                        }
 
                        sp->units -= units;
-                       if (!sp->units)
+                       if (!sp->units) {
                                clear_slob_page_free(sp);
+                               *page_removed_from_list = true;
+                       }
                        return cur;
                }
                if (slob_last(cur))
@@ -269,10 +284,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align)
 static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 {
        struct page *sp;
-       struct list_head *prev;
        struct list_head *slob_list;
        slob_t *b = NULL;
        unsigned long flags;
+       bool _unused;
 
        if (size < SLOB_BREAK1)
                slob_list = &free_slob_small;
@@ -283,7 +298,8 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 
        spin_lock_irqsave(&slob_lock, flags);
        /* Iterate through each partially free page, try to find room */
-       list_for_each_entry(sp, slob_list, lru) {
+       list_for_each_entry(sp, slob_list, slab_list) {
+               bool page_removed_from_list = false;
 #ifdef CONFIG_NUMA
                /*
                 * If there's a node specification, search for a partial
@@ -296,18 +312,25 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
                if (sp->units < SLOB_UNITS(size))
                        continue;
 
-               /* Attempt to alloc */
-               prev = sp->lru.prev;
-               b = slob_page_alloc(sp, size, align);
+               b = slob_page_alloc(sp, size, align, &page_removed_from_list);
                if (!b)
                        continue;
 
-               /* Improve fragment distribution and reduce our average
-                * search time by starting our next search here. (see
-                * Knuth vol 1, sec 2.5, pg 449) */
-               if (prev != slob_list->prev &&
-                               slob_list->next != prev->next)
-                       list_move_tail(slob_list, prev->next);
+               /*
+                * If slob_page_alloc() removed sp from the list then we
+                * cannot call list functions on sp.  In that case the
+                * allocation did not fragment the page anyway, so the
+                * optimisation below is unnecessary.
+                */
+               if (!page_removed_from_list) {
+                       /*
+                        * Improve fragment distribution and reduce our average
+                        * search time by starting our next search here. (see
+                        * Knuth vol 1, sec 2.5, pg 449)
+                        */
+                       if (!list_is_first(&sp->slab_list, slob_list))
+                               list_rotate_to_front(&sp->slab_list, slob_list);
+               }
                break;
        }
        spin_unlock_irqrestore(&slob_lock, flags);
@@ -323,10 +346,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
                spin_lock_irqsave(&slob_lock, flags);
                sp->units = SLOB_UNITS(PAGE_SIZE);
                sp->freelist = b;
-               INIT_LIST_HEAD(&sp->lru);
+               INIT_LIST_HEAD(&sp->slab_list);
                set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
                set_slob_page_free(sp, slob_list);
-               b = slob_page_alloc(sp, size, align);
+               b = slob_page_alloc(sp, size, align, &_unused);
                BUG_ON(!b);
                spin_unlock_irqrestore(&slob_lock, flags);
        }
index 6b28cd2..cd04dbd 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
  *     D. page->frozen         -> frozen state
  *
  *   If a slab is frozen then it is exempt from list management. It is not
- *   on any list. The processor that froze the slab is the one who can
- *   perform list operations on the page. Other processors may put objects
- *   onto the freelist but the processor that froze the slab is the only
- *   one that can retrieve the objects from the page's freelist.
+ *   on any list except per cpu partial list. The processor that froze the
+ *   slab is the one who can perform list operations on the page. Other
+ *   processors may put objects onto the freelist but the processor that
+ *   froze the slab is the only one that can retrieve the objects from the
+ *   page's freelist.
  *
  *   The list_lock protects the partial and full list on each node and
  *   the partial slab counter. If taken then no new slabs may be added or
@@ -1014,7 +1015,7 @@ static void add_full(struct kmem_cache *s,
                return;
 
        lockdep_assert_held(&n->list_lock);
-       list_add(&page->lru, &n->full);
+       list_add(&page->slab_list, &n->full);
 }
 
 static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
@@ -1023,7 +1024,7 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct
                return;
 
        lockdep_assert_held(&n->list_lock);
-       list_del(&page->lru);
+       list_del(&page->slab_list);
 }
 
 /* Tracking of the number of slabs for debugging purposes */
@@ -1764,9 +1765,9 @@ __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
 {
        n->nr_partial++;
        if (tail == DEACTIVATE_TO_TAIL)
-               list_add_tail(&page->lru, &n->partial);
+               list_add_tail(&page->slab_list, &n->partial);
        else
-               list_add(&page->lru, &n->partial);
+               list_add(&page->slab_list, &n->partial);
 }
 
 static inline void add_partial(struct kmem_cache_node *n,
@@ -1780,7 +1781,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
                                        struct page *page)
 {
        lockdep_assert_held(&n->list_lock);
-       list_del(&page->lru);
+       list_del(&page->slab_list);
        n->nr_partial--;
 }
 
@@ -1854,7 +1855,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
                return NULL;
 
        spin_lock(&n->list_lock);
-       list_for_each_entry_safe(page, page2, &n->partial, lru) {
+       list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
                void *t;
 
                if (!pfmemalloc_match(page, flags))
@@ -1942,7 +1943,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
                        }
                }
        } while (read_mems_allowed_retry(cpuset_mems_cookie));
-#endif
+#endif /* CONFIG_NUMA */
        return NULL;
 }
 
@@ -2240,7 +2241,7 @@ static void unfreeze_partials(struct kmem_cache *s,
                discard_slab(s, page);
                stat(s, FREE_SLAB);
        }
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
 }
 
 /*
@@ -2299,7 +2300,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
                local_irq_restore(flags);
        }
        preempt_enable();
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
 }
 
 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2398,7 +2399,7 @@ static unsigned long count_partial(struct kmem_cache_node *n,
        struct page *page;
 
        spin_lock_irqsave(&n->list_lock, flags);
-       list_for_each_entry(page, &n->partial, lru)
+       list_for_each_entry(page, &n->partial, slab_list)
                x += get_count(page);
        spin_unlock_irqrestore(&n->list_lock, flags);
        return x;
@@ -2804,7 +2805,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
 #endif
-#endif
+#endif /* CONFIG_NUMA */
 
 /*
  * Slow path handling. This may still be called frequently since objects
@@ -2903,8 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
         * then add it.
         */
        if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
-               if (kmem_cache_debug(s))
-                       remove_full(s, n, page);
+               remove_full(s, n, page);
                add_partial(n, page, DEACTIVATE_TO_TAIL);
                stat(s, FREE_ADD_PARTIAL);
        }
@@ -3696,10 +3696,10 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 
        BUG_ON(irqs_disabled());
        spin_lock_irq(&n->list_lock);
-       list_for_each_entry_safe(page, h, &n->partial, lru) {
+       list_for_each_entry_safe(page, h, &n->partial, slab_list) {
                if (!page->inuse) {
                        remove_partial(n, page);
-                       list_add(&page->lru, &discard);
+                       list_add(&page->slab_list, &discard);
                } else {
                        list_slab_objects(s, page,
                        "Objects remaining in %s on __kmem_cache_shutdown()");
@@ -3707,7 +3707,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
        }
        spin_unlock_irq(&n->list_lock);
 
-       list_for_each_entry_safe(page, h, &discard, lru)
+       list_for_each_entry_safe(page, h, &discard, slab_list)
                discard_slab(s, page);
 }
 
@@ -3839,7 +3839,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
        return ret;
 }
 EXPORT_SYMBOL(__kmalloc_node);
-#endif
+#endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_HARDENED_USERCOPY
 /*
@@ -3987,7 +3987,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
                 * Note that concurrent frees may occur while we hold the
                 * list_lock. page->inuse here is the upper limit.
                 */
-               list_for_each_entry_safe(page, t, &n->partial, lru) {
+               list_for_each_entry_safe(page, t, &n->partial, slab_list) {
                        int free = page->objects - page->inuse;
 
                        /* Do not reread page->inuse */
@@ -3997,10 +3997,10 @@ int __kmem_cache_shrink(struct kmem_cache *s)
                        BUG_ON(free <= 0);
 
                        if (free == page->objects) {
-                               list_move(&page->lru, &discard);
+                               list_move(&page->slab_list, &discard);
                                n->nr_partial--;
                        } else if (free <= SHRINK_PROMOTE_MAX)
-                               list_move(&page->lru, promote + free - 1);
+                               list_move(&page->slab_list, promote + free - 1);
                }
 
                /*
@@ -4013,7 +4013,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
                spin_unlock_irqrestore(&n->list_lock, flags);
 
                /* Release empty slabs */
-               list_for_each_entry_safe(page, t, &discard, lru)
+               list_for_each_entry_safe(page, t, &discard, slab_list)
                        discard_slab(s, page);
 
                if (slabs_node(s, node))
@@ -4057,7 +4057,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)
         */
        slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
 }
-#endif
+#endif /* CONFIG_MEMCG */
 
 static int slab_mem_going_offline_callback(void *arg)
 {
@@ -4205,11 +4205,11 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
        for_each_kmem_cache_node(s, node, n) {
                struct page *p;
 
-               list_for_each_entry(p, &n->partial, lru)
+               list_for_each_entry(p, &n->partial, slab_list)
                        p->slab_cache = s;
 
 #ifdef CONFIG_SLUB_DEBUG
-               list_for_each_entry(p, &n->full, lru)
+               list_for_each_entry(p, &n->full, slab_list)
                        p->slab_cache = s;
 #endif
        }
@@ -4426,7 +4426,7 @@ static int validate_slab_node(struct kmem_cache *s,
 
        spin_lock_irqsave(&n->list_lock, flags);
 
-       list_for_each_entry(page, &n->partial, lru) {
+       list_for_each_entry(page, &n->partial, slab_list) {
                validate_slab_slab(s, page, map);
                count++;
        }
@@ -4437,7 +4437,7 @@ static int validate_slab_node(struct kmem_cache *s,
        if (!(s->flags & SLAB_STORE_USER))
                goto out;
 
-       list_for_each_entry(page, &n->full, lru) {
+       list_for_each_entry(page, &n->full, slab_list) {
                validate_slab_slab(s, page, map);
                count++;
        }
@@ -4633,9 +4633,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
                        continue;
 
                spin_lock_irqsave(&n->list_lock, flags);
-               list_for_each_entry(page, &n->partial, lru)
+               list_for_each_entry(page, &n->partial, slab_list)
                        process_slab(&t, s, page, alloc, map);
-               list_for_each_entry(page, &n->full, lru)
+               list_for_each_entry(page, &n->full, slab_list)
                        process_slab(&t, s, page, alloc, map);
                spin_unlock_irqrestore(&n->list_lock, flags);
        }
@@ -4690,7 +4690,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
                len += sprintf(buf, "No data\n");
        return len;
 }
-#endif
+#endif /* CONFIG_SLUB_DEBUG */
 
 #ifdef SLUB_RESILIENCY_TEST
 static void __init resiliency_test(void)
@@ -4750,7 +4750,7 @@ static void __init resiliency_test(void)
 #ifdef CONFIG_SYSFS
 static void resiliency_test(void) {};
 #endif
-#endif
+#endif /* SLUB_RESILIENCY_TEST */
 
 #ifdef CONFIG_SYSFS
 enum slab_stat_type {
@@ -5407,7 +5407,7 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
 STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
 STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
-#endif
+#endif /* CONFIG_SLUB_STATS */
 
 static struct attribute *slab_attrs[] = {
        &slab_size_attr.attr,
@@ -5608,7 +5608,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 
        if (buffer)
                free_page((unsigned long)buffer);
-#endif
+#endif /* CONFIG_MEMCG */
 }
 
 static void kmem_cache_release(struct kobject *k)
index 56e057c..fd13166 100644 (file)
@@ -684,10 +684,18 @@ static void free_map_bootmem(struct page *memmap)
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
-/*
- * returns the number of sections whose mem_maps were properly
- * set.  If this is <=0, then that means that the passed-in
- * map was not consumed and must be freed.
+/**
+ * sparse_add_one_section - add a memory section
+ * @nid: The node to add the section to
+ * @start_pfn: start pfn of the memory range
+ * @altmap: device page map
+ *
+ * This is only intended for hotplug.
+ *
+ * Return:
+ * * 0         - On success.
+ * * -EEXIST   - Section is already present.
+ * * -ENOMEM   - Out of memory.
  */
 int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
                                     struct vmem_altmap *altmap)
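
A minimal sketch (not part of the patch) of a caller acting on the return
codes documented above; the wrapper name is hypothetical and a NULL altmap
is passed:

#include <linux/errno.h>
#include <linux/memory_hotplug.h>	/* sparse_add_one_section() */

static int add_section_checked(int nid, unsigned long start_pfn)
{
	int ret = sparse_add_one_section(nid, start_pfn, NULL);

	if (ret == -EEXIST)
		return 0;	/* the section already existed; nothing to undo */
	return ret;		/* 0 on success, -ENOMEM on allocation failure */
}
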
index 301ed4e..3a75722 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -867,7 +867,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
        SetPageLRU(page);
        /*
         * Page becomes evictable in two ways:
-        * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()].
+        * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
         * 2) Before acquiring LRU lock to put the page to correct LRU and then
         *   a) do PageLRU check with lock [check_move_unevictable_pages]
         *   b) do PageLRU check before lock [clear_page_mlock]
index 85245fd..eb71416 100644 (file)
@@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
                for (i = 0; i < nr; i++) {
                        VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
                        set_page_private(page + i, entry.val + i);
-                       xas_store(&xas, page + i);
+                       xas_store(&xas, page);
                        xas_next(&xas);
                }
                address_space->nrpages += nr;
@@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
 
        for (i = 0; i < nr; i++) {
                void *entry = xas_store(&xas, NULL);
-               VM_BUG_ON_PAGE(entry != page + i, entry);
+               VM_BUG_ON_PAGE(entry != page, entry);
                set_page_private(page + i, 0);
                xas_next(&xas);
        }
index d59b5a7..9932d57 100644 (file)
@@ -271,8 +271,7 @@ retry:
                 */
                idx = linear_page_index(dst_vma, dst_addr);
                mapping = dst_vma->vm_file->f_mapping;
-               hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
-                                                               idx, dst_addr);
+               hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
                mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
                err = -ENOMEM;
index 43a2984..e2e4f8c 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
  * get_user_pages_fast() - pin user pages in memory
  * @start:     starting user address
  * @nr_pages:  number of pages from start to pin
- * @write:     whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
  * @pages:     array that receives pointers to the pages pinned.
  *             Should be at least nr_pages long.
  *
@@ -339,10 +339,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
  * were pinned, returns -errno.
  */
 int __weak get_user_pages_fast(unsigned long start,
-                               int nr_pages, int write, struct page **pages)
+                               int nr_pages, unsigned int gup_flags,
+                               struct page **pages)
 {
-       return get_user_pages_unlocked(start, nr_pages, pages,
-                                      write ? FOLL_WRITE : 0);
+       return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
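
A minimal sketch (not part of the patch) of a caller updated for the new
prototype: the third argument is now a FOLL_* mask, so what used to be
write = 1 becomes FOLL_WRITE. The function name is hypothetical:

#include <linux/errno.h>
#include <linux/mm.h>	/* get_user_pages_fast(), FOLL_WRITE, put_page() */

static int touch_one_user_page(unsigned long addr)
{
	struct page *page;
	int ret = get_user_pages_fast(addr, 1, FOLL_WRITE, &page);

	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;
	/* ... read or modify the pinned page here ... */
	put_page(page);		/* drop the reference taken by the pin */
	return 0;
}
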
 
@@ -652,7 +652,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
  */
 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
-       long free, allowed, reserve;
+       long allowed;
 
        VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
                        -(s64)vm_committed_as_batch * num_online_cpus(),
@@ -667,51 +667,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
                return 0;
 
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
-               free = global_zone_page_state(NR_FREE_PAGES);
-               free += global_node_page_state(NR_FILE_PAGES);
-
-               /*
-                * shmem pages shouldn't be counted as free in this
-                * case, they can't be purged, only swapped out, and
-                * that won't affect the overall amount of available
-                * memory in the system.
-                */
-               free -= global_node_page_state(NR_SHMEM);
-
-               free += get_nr_swap_pages();
-
-               /*
-                * Any slabs which are created with the
-                * SLAB_RECLAIM_ACCOUNT flag claim to have contents
-                * which are reclaimable, under pressure.  The dentry
-                * cache and most inode caches should fall into this
-                */
-               free += global_node_page_state(NR_SLAB_RECLAIMABLE);
-
-               /*
-                * Part of the kernel memory, which can be released
-                * under memory pressure.
-                */
-               free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
-
-               /*
-                * Leave reserved pages. The pages are not for anonymous pages.
-                */
-               if (free <= totalreserve_pages)
+               if (pages > totalram_pages() + total_swap_pages)
                        goto error;
-               else
-                       free -= totalreserve_pages;
-
-               /*
-                * Reserve some for root
-                */
-               if (!cap_sys_admin)
-                       free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
-               if (free > pages)
-                       return 0;
-
-               goto error;
+               return 0;
        }
 
        allowed = vm_commit_limit();
@@ -725,7 +683,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
         * Don't let a single process grow so big a user can't recover
         */
        if (mm) {
-               reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+               long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+
                allowed -= min_t(long, mm->total_vm / 32, reserve);
        }
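
A minimal sketch (not part of the patch) of the simplified OVERCOMMIT_GUESS
test above; the helper name is hypothetical. On a machine with 4 GiB of RAM
and 2 GiB of swap, any single request up to 6 GiB worth of pages still passes:

#include <linux/mm.h>	/* totalram_pages() */
#include <linux/swap.h>	/* total_swap_pages */

static bool guess_would_allow(long pages)
{
	/* refuse only what could never be satisfied by RAM plus swap combined */
	return pages <= totalram_pages() + total_swap_pages;
}
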
 
index fd9de50..d96c547 100644 (file)
@@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
        int zid;
 
        if (!mem_cgroup_disabled())
-               lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+               lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru);
        else
                lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 
@@ -1107,6 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
        unsigned nr_reclaimed = 0;
+       unsigned pgactivate = 0;
 
        memset(stat, 0, sizeof(*stat));
        cond_resched();
@@ -1466,8 +1467,10 @@ activate_locked:
                        try_to_free_swap(page);
                VM_BUG_ON_PAGE(PageActive(page), page);
                if (!PageMlocked(page)) {
+                       int type = page_is_file_cache(page);
                        SetPageActive(page);
-                       stat->nr_activate++;
+                       pgactivate++;
+                       stat->nr_activate[type] += hpage_nr_pages(page);
                        count_memcg_page_event(page, PGACTIVATE);
                }
 keep_locked:
@@ -1482,7 +1485,7 @@ keep:
        free_unref_page_list(&free_pages);
 
        list_splice(&ret_pages, page_list);
-       count_vm_events(PGACTIVATE, stat->nr_activate);
+       count_vm_events(PGACTIVATE, pgactivate);
 
        return nr_reclaimed;
 }
@@ -1804,40 +1807,54 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
        return isolated > inactive;
 }
 
-static noinline_for_stack void
-putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
+/*
+ * This moves pages from @list to corresponding LRU list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold pgdat->lru_lock across the whole operation.  But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop pgdat->lru_lock around each page.  It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_refcount against each page.
+ * But we had to alter page->flags anyway.
+ *
+ * Returns the number of pages moved to the given lruvec.
+ */
+
+static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
+                                                    struct list_head *list)
 {
-       struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       int nr_pages, nr_moved = 0;
        LIST_HEAD(pages_to_free);
+       struct page *page;
+       enum lru_list lru;
 
-       /*
-        * Put back any unfreeable pages.
-        */
-       while (!list_empty(page_list)) {
-               struct page *page = lru_to_page(page_list);
-               int lru;
-
+       while (!list_empty(list)) {
+               page = lru_to_page(list);
                VM_BUG_ON_PAGE(PageLRU(page), page);
-               list_del(&page->lru);
                if (unlikely(!page_evictable(page))) {
+                       list_del(&page->lru);
                        spin_unlock_irq(&pgdat->lru_lock);
                        putback_lru_page(page);
                        spin_lock_irq(&pgdat->lru_lock);
                        continue;
                }
-
                lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
                SetPageLRU(page);
                lru = page_lru(page);
-               add_page_to_lru_list(page, lruvec, lru);
 
-               if (is_active_lru(lru)) {
-                       int file = is_file_lru(lru);
-                       int numpages = hpage_nr_pages(page);
-                       reclaim_stat->recent_rotated[file] += numpages;
-               }
+               nr_pages = hpage_nr_pages(page);
+               update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
+               list_move(&page->lru, &lruvec->lists[lru]);
+
                if (put_page_testzero(page)) {
                        __ClearPageLRU(page);
                        __ClearPageActive(page);
@@ -1850,13 +1867,17 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
                                spin_lock_irq(&pgdat->lru_lock);
                        } else
                                list_add(&page->lru, &pages_to_free);
+               } else {
+                       nr_moved += nr_pages;
                }
        }
 
        /*
         * To save our caller's stack, now use input list for pages to free.
         */
-       list_splice(&pages_to_free, page_list);
+       list_splice(&pages_to_free, list);
+
+       return nr_moved;
 }
 
 /*
@@ -1886,6 +1907,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        unsigned long nr_taken;
        struct reclaim_stat stat;
        int file = is_file_lru(lru);
+       enum vm_event_item item;
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
        bool stalled = false;
@@ -1913,17 +1935,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
        reclaim_stat->recent_scanned[file] += nr_taken;
 
-       if (current_is_kswapd()) {
-               if (global_reclaim(sc))
-                       __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
-               count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
-                                  nr_scanned);
-       } else {
-               if (global_reclaim(sc))
-                       __count_vm_events(PGSCAN_DIRECT, nr_scanned);
-               count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
-                                  nr_scanned);
-       }
+       item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+       if (global_reclaim(sc))
+               __count_vm_events(item, nr_scanned);
+       __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
        spin_unlock_irq(&pgdat->lru_lock);
 
        if (nr_taken == 0)
@@ -1934,19 +1949,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
        spin_lock_irq(&pgdat->lru_lock);
 
-       if (current_is_kswapd()) {
-               if (global_reclaim(sc))
-                       __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
-               count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
-                                  nr_reclaimed);
-       } else {
-               if (global_reclaim(sc))
-                       __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
-               count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
-                                  nr_reclaimed);
-       }
+       item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+       if (global_reclaim(sc))
+               __count_vm_events(item, nr_reclaimed);
+       __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+       reclaim_stat->recent_rotated[0] = stat.nr_activate[0];
+       reclaim_stat->recent_rotated[1] = stat.nr_activate[1];
 
-       putback_inactive_pages(lruvec, &page_list);
+       move_pages_to_lru(lruvec, &page_list);
 
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 
@@ -1983,73 +1993,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        return nr_reclaimed;
 }
 
-/*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold pgdat->lru_lock across the whole operation.  But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop pgdat->lru_lock around each page.  It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
- *
- * Returns the number of pages moved to the given lru.
- */
-
-static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
-                                    struct list_head *list,
-                                    struct list_head *pages_to_free,
-                                    enum lru_list lru)
-{
-       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-       struct page *page;
-       int nr_pages;
-       int nr_moved = 0;
-
-       while (!list_empty(list)) {
-               page = lru_to_page(list);
-               lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
-               VM_BUG_ON_PAGE(PageLRU(page), page);
-               SetPageLRU(page);
-
-               nr_pages = hpage_nr_pages(page);
-               update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
-               list_move(&page->lru, &lruvec->lists[lru]);
-
-               if (put_page_testzero(page)) {
-                       __ClearPageLRU(page);
-                       __ClearPageActive(page);
-                       del_page_from_lru_list(page, lruvec, lru);
-
-                       if (unlikely(PageCompound(page))) {
-                               spin_unlock_irq(&pgdat->lru_lock);
-                               mem_cgroup_uncharge(page);
-                               (*get_compound_page_dtor(page))(page);
-                               spin_lock_irq(&pgdat->lru_lock);
-                       } else
-                               list_add(&page->lru, pages_to_free);
-               } else {
-                       nr_moved += nr_pages;
-               }
-       }
-
-       if (!is_active_lru(lru)) {
-               __count_vm_events(PGDEACTIVATE, nr_moved);
-               count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
-                                  nr_moved);
-       }
-
-       return nr_moved;
-}
-
 static void shrink_active_list(unsigned long nr_to_scan,
                               struct lruvec *lruvec,
                               struct scan_control *sc,
@@ -2079,7 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
        reclaim_stat->recent_scanned[file] += nr_taken;
 
        __count_vm_events(PGREFILL, nr_scanned);
-       count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+       __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
 
        spin_unlock_irq(&pgdat->lru_lock);
 
@@ -2136,13 +2079,19 @@ static void shrink_active_list(unsigned long nr_to_scan,
         */
        reclaim_stat->recent_rotated[file] += nr_rotated;
 
-       nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
-       nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+       nr_activate = move_pages_to_lru(lruvec, &l_active);
+       nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
+       /* Keep all free pages in l_active list */
+       list_splice(&l_inactive, &l_active);
+
+       __count_vm_events(PGDEACTIVATE, nr_deactivate);
+       __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&pgdat->lru_lock);
 
-       mem_cgroup_uncharge_list(&l_hold);
-       free_unref_page_list(&l_hold);
+       mem_cgroup_uncharge_list(&l_active);
+       free_unref_page_list(&l_active);
        trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
                        nr_deactivate, nr_rotated, sc->priority, file);
 }
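The rewritten tail above leans on the new move_pages_to_lru() helper leaving any page whose reference count dropped to zero on its input list; splicing l_inactive into l_active then yields one combined list of leftovers that can be uncharged and freed in a single batch once the LRU lock is released. A condensed sketch of that pattern (hypothetical helper assuming the mm/vmscan.c context and that contract of move_pages_to_lru(), which is not itself shown in this hunk; isolation accounting and tracing omitted):

static void move_and_batch_free(struct lruvec *lruvec,
				struct list_head *l_active,
				struct list_head *l_inactive)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	unsigned nr_deactivate;

	spin_lock_irq(&pgdat->lru_lock);
	move_pages_to_lru(lruvec, l_active);
	nr_deactivate = move_pages_to_lru(lruvec, l_inactive);
	/* pages that were not put back are still on the two input lists */
	list_splice(l_inactive, l_active);
	__count_vm_events(PGDEACTIVATE, nr_deactivate);
	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
	spin_unlock_irq(&pgdat->lru_lock);

	mem_cgroup_uncharge_list(l_active);	/* batch uncharge ... */
	free_unref_page_list(l_active);		/* ... and batch free, unlocked */
}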
@@ -3212,10 +3161,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
        if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
                return 1;
 
-       trace_mm_vmscan_direct_reclaim_begin(order,
-                               sc.may_writepage,
-                               sc.gfp_mask,
-                               sc.reclaim_idx);
+       trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
@@ -3246,9 +3192,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
        trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
-                                                     sc.may_writepage,
-                                                     sc.gfp_mask,
-                                                     sc.reclaim_idx);
+                                                     sc.gfp_mask);
 
        /*
         * NOTE: Although we can get the priority field, using it
@@ -3297,10 +3241,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 
        zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
 
-       trace_mm_vmscan_memcg_reclaim_begin(0,
-                                           sc.may_writepage,
-                                           sc.gfp_mask,
-                                           sc.reclaim_idx);
+       trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
 
        psi_memstall_enter(&pflags);
        noreclaim_flag = memalloc_noreclaim_save();
@@ -4149,6 +4090,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
                .reclaim_idx = gfp_zone(gfp_mask),
        };
 
+       trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
+                                          sc.gfp_mask);
+
        cond_resched();
        fs_reclaim_acquire(sc.gfp_mask);
        /*
@@ -4175,6 +4119,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
        current->flags &= ~PF_SWAPWRITE;
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(sc.gfp_mask);
+
+       trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+
        return sc.nr_reclaimed >= nr_pages;
 }
 
index 0bedf67..6419bae 100644
@@ -426,10 +426,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 #ifdef CONFIG_MEMCG
        if (sc->memcg) {
                struct lruvec *lruvec;
+               int i;
 
-               pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
-                                                    LRU_ALL);
                lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
+               for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
+                       pages += lruvec_page_state(lruvec, NR_LRU_BASE + i);
                pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
                pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
        } else
index aee9b0b..1ffecd6 100644
 
 #include <linux/atomic.h>
 #include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/dcache.h>
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/migrate.h>
+#include <linux/node.h>
+#include <linux/compaction.h>
 #include <linux/percpu.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
 #include <linux/preempt.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/zpool.h>
 
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation.  It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
+ * in the beginning of an allocated page are occupied by z3fold header, so
+ * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
+ * which shows the max number of free chunks in z3fold page, also there will
+ * be 63, or 62, respectively, freelists per pool.
+ */
+#define NCHUNKS_ORDER  6
+
+#define CHUNK_SHIFT    (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE     (1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
+#define ZHDR_CHUNKS    (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
+#define TOTAL_CHUNKS   (PAGE_SIZE >> CHUNK_SHIFT)
+#define NCHUNKS                ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+#define BUDDY_MASK     (0x3)
+#define BUDDY_SHIFT    2
+#define SLOTS_ALIGN    (0x40)
+
 /*****************
  * Structures
 *****************/
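For a concrete feel of the chunk arithmetic defined above: on a 4 KiB page with NCHUNKS_ORDER of 6, a chunk is 64 bytes, the header occupies one chunk, and 63 chunks (hence 63 per-pool freelists) remain for objects. A standalone sketch of that calculation, where the 56-byte header size is an assumption standing in for sizeof(struct z3fold_header):

#include <stdio.h>

#define PAGE_SHIFT	12			/* assumed: 4 KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define NCHUNKS_ORDER	6
#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE	(1UL << CHUNK_SHIFT)
#define ZHDR_SIZE	56			/* assumed header size in bytes */
#define ROUND_UP(x, a)	(((x) + (a) - 1) / (a) * (a))
#define ZHDR_SIZE_ALIGNED ROUND_UP(ZHDR_SIZE, CHUNK_SIZE)
#define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)

int main(void)
{
	printf("chunk size:       %lu bytes\n", CHUNK_SIZE);		      /* 64 */
	printf("header chunks:    %lu\n", ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT); /* 1 */
	printf("free chunks/page: %lu\n", NCHUNKS);			      /* 63 */
	return 0;
}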
@@ -47,8 +78,18 @@ enum buddy {
        FIRST,
        MIDDLE,
        LAST,
-       BUDDIES_MAX
+       BUDDIES_MAX = LAST
+};
+
+struct z3fold_buddy_slots {
+       /*
+        * we are using BUDDY_MASK in handle_to_buddy etc. so there should
+        * be enough slots to hold all possible variants
+        */
+       unsigned long slot[BUDDY_MASK + 1];
+       unsigned long pool; /* back link + flags */
 };
+#define HANDLE_FLAG_MASK       (0x03)
 
 /*
  * struct z3fold_header - z3fold page metadata occupying first chunks of each
@@ -58,49 +99,29 @@ enum buddy {
  * @page_lock:         per-page lock
  * @refcount:          reference count for the z3fold page
  * @work:              work_struct for page layout optimization
- * @pool:              pointer to the pool which this page belongs to
+ * @slots:             pointer to the structure holding buddy slots
  * @cpu:               CPU which this page "belongs" to
  * @first_chunks:      the size of the first buddy in chunks, 0 if free
  * @middle_chunks:     the size of the middle buddy in chunks, 0 if free
  * @last_chunks:       the size of the last buddy in chunks, 0 if free
  * @first_num:         the starting number (for the first handle)
+ * @mapped_count:      the number of objects currently mapped
  */
 struct z3fold_header {
        struct list_head buddy;
        spinlock_t page_lock;
        struct kref refcount;
        struct work_struct work;
-       struct z3fold_pool *pool;
+       struct z3fold_buddy_slots *slots;
        short cpu;
        unsigned short first_chunks;
        unsigned short middle_chunks;
        unsigned short last_chunks;
        unsigned short start_middle;
        unsigned short first_num:2;
+       unsigned short mapped_count:2;
 };
 
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation.  It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
- * in the beginning of an allocated page are occupied by z3fold header, so
- * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
- * which shows the max number of free chunks in z3fold page, also there will
- * be 63, or 62, respectively, freelists per pool.
- */
-#define NCHUNKS_ORDER  6
-
-#define CHUNK_SHIFT    (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE     (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
-#define ZHDR_CHUNKS    (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
-#define TOTAL_CHUNKS   (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS                ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
-
-#define BUDDY_MASK     (0x3)
-#define BUDDY_SHIFT    2
-
 /**
  * struct z3fold_pool - stores metadata for each z3fold pool
  * @name:      pool name
@@ -113,11 +134,13 @@ struct z3fold_header {
  *             added buddy.
  * @stale:     list of pages marked for freeing
  * @pages_nr:  number of z3fold pages in the pool.
+ * @c_handle:  cache for z3fold_buddy_slots allocation
  * @ops:       pointer to a structure of user defined operations specified at
  *             pool creation time.
  * @compact_wq:        workqueue for page layout background optimization
  * @release_wq:        workqueue for safe page release
  * @work:      work_struct for safe page release
+ * @inode:     inode for z3fold pseudo filesystem
  *
  * This structure is allocated at pool creation time and maintains metadata
  * pertaining to a particular z3fold pool.
@@ -130,12 +153,14 @@ struct z3fold_pool {
        struct list_head lru;
        struct list_head stale;
        atomic64_t pages_nr;
+       struct kmem_cache *c_handle;
        const struct z3fold_ops *ops;
        struct zpool *zpool;
        const struct zpool_ops *zpool_ops;
        struct workqueue_struct *compact_wq;
        struct workqueue_struct *release_wq;
        struct work_struct work;
+       struct inode *inode;
 };
 
 /*
@@ -164,11 +189,118 @@ static int size_to_chunks(size_t size)
 
 static void compact_page_work(struct work_struct *w);
 
+static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool)
+{
+       struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
+                                                       GFP_KERNEL);
+
+       if (slots) {
+               memset(slots->slot, 0, sizeof(slots->slot));
+               slots->pool = (unsigned long)pool;
+       }
+
+       return slots;
+}
+
+static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
+{
+       return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
+}
+
+static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
+{
+       return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
+}
+
+static inline void free_handle(unsigned long handle)
+{
+       struct z3fold_buddy_slots *slots;
+       int i;
+       bool is_free;
+
+       if (handle & (1 << PAGE_HEADLESS))
+               return;
+
+       WARN_ON(*(unsigned long *)handle == 0);
+       *(unsigned long *)handle = 0;
+       slots = handle_to_slots(handle);
+       is_free = true;
+       for (i = 0; i <= BUDDY_MASK; i++) {
+               if (slots->slot[i]) {
+                       is_free = false;
+                       break;
+               }
+       }
+
+       if (is_free) {
+               struct z3fold_pool *pool = slots_to_pool(slots);
+
+               kmem_cache_free(pool->c_handle, slots);
+       }
+}
+
+static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
+                               int flags, const char *dev_name, void *data)
+{
+       static const struct dentry_operations ops = {
+               .d_dname = simple_dname,
+       };
+
+       return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33);
+}
+
+static struct file_system_type z3fold_fs = {
+       .name           = "z3fold",
+       .mount          = z3fold_do_mount,
+       .kill_sb        = kill_anon_super,
+};
+
+static struct vfsmount *z3fold_mnt;
+static int z3fold_mount(void)
+{
+       int ret = 0;
+
+       z3fold_mnt = kern_mount(&z3fold_fs);
+       if (IS_ERR(z3fold_mnt))
+               ret = PTR_ERR(z3fold_mnt);
+
+       return ret;
+}
+
+static void z3fold_unmount(void)
+{
+       kern_unmount(z3fold_mnt);
+}
+
+static const struct address_space_operations z3fold_aops;
+static int z3fold_register_migration(struct z3fold_pool *pool)
+{
+       pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
+       if (IS_ERR(pool->inode)) {
+               pool->inode = NULL;
+               return 1;
+       }
+
+       pool->inode->i_mapping->private_data = pool;
+       pool->inode->i_mapping->a_ops = &z3fold_aops;
+       return 0;
+}
+
+static void z3fold_unregister_migration(struct z3fold_pool *pool)
+{
+       if (pool->inode)
+               iput(pool->inode);
+}
+
 /* Initializes the z3fold header of a newly allocated z3fold page */
 static struct z3fold_header *init_z3fold_page(struct page *page,
                                        struct z3fold_pool *pool)
 {
        struct z3fold_header *zhdr = page_address(page);
+       struct z3fold_buddy_slots *slots = alloc_slots(pool);
+
+       if (!slots)
+               return NULL;
 
        INIT_LIST_HEAD(&page->lru);
        clear_bit(PAGE_HEADLESS, &page->private);
@@ -185,15 +317,21 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
        zhdr->first_num = 0;
        zhdr->start_middle = 0;
        zhdr->cpu = -1;
-       zhdr->pool = pool;
+       zhdr->slots = slots;
        INIT_LIST_HEAD(&zhdr->buddy);
        INIT_WORK(&zhdr->work, compact_page_work);
        return zhdr;
 }
 
 /* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct page *page)
+static void free_z3fold_page(struct page *page, bool headless)
 {
+       if (!headless) {
+               lock_page(page);
+               __ClearPageMovable(page);
+               unlock_page(page);
+       }
+       ClearPagePrivate(page);
        __free_page(page);
 }
 
@@ -215,33 +353,57 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
        spin_unlock(&zhdr->page_lock);
 }
 
+/* Helper function to build the index */
+static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
+{
+       return (bud + zhdr->first_num) & BUDDY_MASK;
+}
+
 /*
  * Encodes the handle of a particular buddy within a z3fold page
  * Pool lock should be held as this function accesses first_num
  */
 static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
 {
-       unsigned long handle;
+       struct z3fold_buddy_slots *slots;
+       unsigned long h = (unsigned long)zhdr;
+       int idx = 0;
 
-       handle = (unsigned long)zhdr;
-       if (bud != HEADLESS) {
-               handle |= (bud + zhdr->first_num) & BUDDY_MASK;
-               if (bud == LAST)
-                       handle |= (zhdr->last_chunks << BUDDY_SHIFT);
-       }
-       return handle;
+       /*
+        * For a headless page, its handle is its pointer with the extra
+        * PAGE_HEADLESS bit set
+        */
+       if (bud == HEADLESS)
+               return h | (1 << PAGE_HEADLESS);
+
+       /* otherwise, return pointer to encoded handle */
+       idx = __idx(zhdr, bud);
+       h += idx;
+       if (bud == LAST)
+               h |= (zhdr->last_chunks << BUDDY_SHIFT);
+
+       slots = zhdr->slots;
+       slots->slot[idx] = h;
+       return (unsigned long)&slots->slot[idx];
 }
 
 /* Returns the z3fold page where a given handle is stored */
-static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
 {
-       return (struct z3fold_header *)(handle & PAGE_MASK);
+       unsigned long addr = h;
+
+       if (!(addr & (1 << PAGE_HEADLESS)))
+               addr = *(unsigned long *)h;
+
+       return (struct z3fold_header *)(addr & PAGE_MASK);
 }
 
 /* only for LAST bud, returns zero otherwise */
 static unsigned short handle_to_chunks(unsigned long handle)
 {
-       return (handle & ~PAGE_MASK) >> BUDDY_SHIFT;
+       unsigned long addr = *(unsigned long *)handle;
+
+       return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
 }
 
 /*
@@ -251,21 +413,31 @@ static unsigned short handle_to_chunks(unsigned long handle)
  */
 static enum buddy handle_to_buddy(unsigned long handle)
 {
-       struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
-       return (handle - zhdr->first_num) & BUDDY_MASK;
+       struct z3fold_header *zhdr;
+       unsigned long addr;
+
+       WARN_ON(handle & (1 << PAGE_HEADLESS));
+       addr = *(unsigned long *)handle;
+       zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+       return (addr - zhdr->first_num) & BUDDY_MASK;
+}
+
+static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
+{
+       return slots_to_pool(zhdr->slots);
 }
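Taken together, the encode/decode changes above add one level of indirection: a non-headless handle is now the address of an entry in the page's z3fold_buddy_slots, and that entry holds the old-style encoded value (header address, buddy index and, for LAST, the chunk count), while headless handles stay a tagged page address. A stripped-down userspace model of the round trip, with simplified structures and bit 0 assumed as the headless tag:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define BUDDY_MASK	0x3
#define BUDDY_SHIFT	2
#define PAGE_HEADLESS	0	/* headless handles are tagged in bit 0 */

enum buddy { HEADLESS = 0, FIRST, MIDDLE, LAST };

struct buddy_slots { unsigned long slot[BUDDY_MASK + 1]; };

struct header {			/* stands in for z3fold_header, page aligned */
	struct buddy_slots *slots;
	unsigned short first_num;
	unsigned short last_chunks;
};

static unsigned long encode(struct header *zhdr, enum buddy bud)
{
	unsigned long h = (unsigned long)zhdr;
	int idx;

	if (bud == HEADLESS)
		return h | (1UL << PAGE_HEADLESS);	/* no slot needed */

	idx = (bud + zhdr->first_num) & BUDDY_MASK;
	h += idx;
	if (bud == LAST)
		h |= zhdr->last_chunks << BUDDY_SHIFT;
	zhdr->slots->slot[idx] = h;			/* store encoded value */
	return (unsigned long)&zhdr->slots->slot[idx];	/* handle = slot address */
}

static struct header *decode(unsigned long handle)
{
	unsigned long addr = handle;

	if (!(addr & (1UL << PAGE_HEADLESS)))
		addr = *(unsigned long *)handle;	/* follow the slot */
	return (struct header *)(addr & PAGE_MASK);
}

int main(void)
{
	struct buddy_slots slots = { { 0 } };
	struct header *zhdr = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
	unsigned long handle;

	if (!zhdr)
		return 1;
	zhdr->slots = &slots;
	zhdr->first_num = 0;
	zhdr->last_chunks = 5;

	handle = encode(zhdr, LAST);
	printf("handle round trip ok: %d\n", decode(handle) == zhdr);
	free(zhdr);
	return 0;
}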
 
 static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
 {
        struct page *page = virt_to_page(zhdr);
-       struct z3fold_pool *pool = zhdr->pool;
+       struct z3fold_pool *pool = zhdr_to_pool(zhdr);
 
        WARN_ON(!list_empty(&zhdr->buddy));
        set_bit(PAGE_STALE, &page->private);
        clear_bit(NEEDS_COMPACTING, &page->private);
        spin_lock(&pool->lock);
        if (!list_empty(&page->lru))
-               list_del(&page->lru);
+               list_del_init(&page->lru);
        spin_unlock(&pool->lock);
        if (locked)
                z3fold_page_unlock(zhdr);
@@ -295,9 +467,10 @@ static void release_z3fold_page_locked_list(struct kref *ref)
 {
        struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
                                               refcount);
-       spin_lock(&zhdr->pool->lock);
+       struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+       spin_lock(&pool->lock);
        list_del_init(&zhdr->buddy);
-       spin_unlock(&zhdr->pool->lock);
+       spin_unlock(&pool->lock);
 
        WARN_ON(z3fold_page_trylock(zhdr));
        __release_z3fold_page(zhdr, true);
@@ -318,7 +491,7 @@ static void free_pages_work(struct work_struct *w)
                        continue;
                spin_unlock(&pool->stale_lock);
                cancel_work_sync(&zhdr->work);
-               free_z3fold_page(page);
+               free_z3fold_page(page, false);
                cond_resched();
                spin_lock(&pool->stale_lock);
        }
@@ -349,6 +522,23 @@ static int num_free_chunks(struct z3fold_header *zhdr)
        return nfree;
 }
 
+/* Add to the appropriate unbuddied list */
+static inline void add_to_unbuddied(struct z3fold_pool *pool,
+                               struct z3fold_header *zhdr)
+{
+       if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
+                       zhdr->middle_chunks == 0) {
+               struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+
+               int freechunks = num_free_chunks(zhdr);
+               spin_lock(&pool->lock);
+               list_add(&zhdr->buddy, &unbuddied[freechunks]);
+               spin_unlock(&pool->lock);
+               zhdr->cpu = smp_processor_id();
+               put_cpu_ptr(pool->unbuddied);
+       }
+}
+
 static inline void *mchunk_memmove(struct z3fold_header *zhdr,
                                unsigned short dst_chunk)
 {
@@ -367,6 +557,9 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
        if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
                return 0; /* can't move middle chunk, it's used */
 
+       if (unlikely(PageIsolated(page)))
+               return 0;
+
        if (zhdr->middle_chunks == 0)
                return 0; /* nothing to compact */
 
@@ -406,10 +599,8 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
 
 static void do_compact_page(struct z3fold_header *zhdr, bool locked)
 {
-       struct z3fold_pool *pool = zhdr->pool;
+       struct z3fold_pool *pool = zhdr_to_pool(zhdr);
        struct page *page;
-       struct list_head *unbuddied;
-       int fchunks;
 
        page = virt_to_page(zhdr);
        if (locked)
@@ -429,19 +620,14 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
                return;
        }
 
-       z3fold_compact_page(zhdr);
-       unbuddied = get_cpu_ptr(pool->unbuddied);
-       fchunks = num_free_chunks(zhdr);
-       if (fchunks < NCHUNKS &&
-           (!zhdr->first_chunks || !zhdr->middle_chunks ||
-                       !zhdr->last_chunks)) {
-               /* the page's not completely free and it's unbuddied */
-               spin_lock(&pool->lock);
-               list_add(&zhdr->buddy, &unbuddied[fchunks]);
-               spin_unlock(&pool->lock);
-               zhdr->cpu = smp_processor_id();
+       if (unlikely(PageIsolated(page) ||
+                    test_bit(PAGE_STALE, &page->private))) {
+               z3fold_page_unlock(zhdr);
+               return;
        }
-       put_cpu_ptr(pool->unbuddied);
+
+       z3fold_compact_page(zhdr);
+       add_to_unbuddied(pool, zhdr);
        z3fold_page_unlock(zhdr);
 }
 
@@ -453,6 +639,103 @@ static void compact_page_work(struct work_struct *w)
        do_compact_page(zhdr, false);
 }
 
+/* returns _locked_ z3fold page header or NULL */
+static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
+                                               size_t size, bool can_sleep)
+{
+       struct z3fold_header *zhdr = NULL;
+       struct page *page;
+       struct list_head *unbuddied;
+       int chunks = size_to_chunks(size), i;
+
+lookup:
+       /* First, try to find an unbuddied z3fold page. */
+       unbuddied = get_cpu_ptr(pool->unbuddied);
+       for_each_unbuddied_list(i, chunks) {
+               struct list_head *l = &unbuddied[i];
+
+               zhdr = list_first_entry_or_null(READ_ONCE(l),
+                                       struct z3fold_header, buddy);
+
+               if (!zhdr)
+                       continue;
+
+               /* Re-check under lock. */
+               spin_lock(&pool->lock);
+               l = &unbuddied[i];
+               if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
+                                               struct z3fold_header, buddy)) ||
+                   !z3fold_page_trylock(zhdr)) {
+                       spin_unlock(&pool->lock);
+                       zhdr = NULL;
+                       put_cpu_ptr(pool->unbuddied);
+                       if (can_sleep)
+                               cond_resched();
+                       goto lookup;
+               }
+               list_del_init(&zhdr->buddy);
+               zhdr->cpu = -1;
+               spin_unlock(&pool->lock);
+
+               page = virt_to_page(zhdr);
+               if (test_bit(NEEDS_COMPACTING, &page->private)) {
+                       z3fold_page_unlock(zhdr);
+                       zhdr = NULL;
+                       put_cpu_ptr(pool->unbuddied);
+                       if (can_sleep)
+                               cond_resched();
+                       goto lookup;
+               }
+
+               /*
+                * this page could not be removed from its unbuddied
+                * list while pool lock was held, and then we've taken
+                * page lock so kref_put could not be called before
+                * we got here, so it's safe to just call kref_get()
+                */
+               kref_get(&zhdr->refcount);
+               break;
+       }
+       put_cpu_ptr(pool->unbuddied);
+
+       if (!zhdr) {
+               int cpu;
+
+               /* look for _exact_ match on other cpus' lists */
+               for_each_online_cpu(cpu) {
+                       struct list_head *l;
+
+                       unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
+                       spin_lock(&pool->lock);
+                       l = &unbuddied[chunks];
+
+                       zhdr = list_first_entry_or_null(READ_ONCE(l),
+                                               struct z3fold_header, buddy);
+
+                       if (!zhdr || !z3fold_page_trylock(zhdr)) {
+                               spin_unlock(&pool->lock);
+                               zhdr = NULL;
+                               continue;
+                       }
+                       list_del_init(&zhdr->buddy);
+                       zhdr->cpu = -1;
+                       spin_unlock(&pool->lock);
+
+                       page = virt_to_page(zhdr);
+                       if (test_bit(NEEDS_COMPACTING, &page->private)) {
+                               z3fold_page_unlock(zhdr);
+                               zhdr = NULL;
+                               if (can_sleep)
+                                       cond_resched();
+                               continue;
+                       }
+                       kref_get(&zhdr->refcount);
+                       break;
+               }
+       }
+
+       return zhdr;
+}
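The lookup above walks per-CPU "unbuddied" freelists indexed by how many free chunks a page still has, trying the current CPU first and falling back to an exact-size match on other CPUs. A toy, userspace-only model of that index structure (sizes and names are illustrative):

#include <stdio.h>

#define NR_CPUS	4
#define NCHUNKS	63	/* see the chunk arithmetic sketch further up */

/* each entry: how many pages currently sit on that CPU's freelist */
static int unbuddied[NR_CPUS][NCHUNKS];

/* mirrors the for_each_unbuddied_list() walk: an allocation needing
 * "chunks" chunks can use any page with at least that many free chunks */
static int find_unbuddied(int cpu, int chunks)
{
	int i;

	for (i = chunks; i < NCHUNKS; i++)
		if (unbuddied[cpu][i])
			return i;
	return -1;	/* fall back to other CPUs or a fresh page */
}

int main(void)
{
	unbuddied[0][10] = 1;	/* one page with 10 free chunks on CPU 0 */

	printf("need 7 chunks  -> list %d\n", find_unbuddied(0, 7));	/* 10 */
	printf("need 20 chunks -> list %d\n", find_unbuddied(0, 20));	/* -1 */
	return 0;
}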
 
 /*
  * API Functions
@@ -476,6 +759,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
        pool = kzalloc(sizeof(struct z3fold_pool), gfp);
        if (!pool)
                goto out;
+       pool->c_handle = kmem_cache_create("z3fold_handle",
+                               sizeof(struct z3fold_buddy_slots),
+                               SLOTS_ALIGN, 0, NULL);
+       if (!pool->c_handle)
+               goto out_c;
        spin_lock_init(&pool->lock);
        spin_lock_init(&pool->stale_lock);
        pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
@@ -497,15 +785,21 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
        pool->release_wq = create_singlethread_workqueue(pool->name);
        if (!pool->release_wq)
                goto out_wq;
+       if (z3fold_register_migration(pool))
+               goto out_rwq;
        INIT_WORK(&pool->work, free_pages_work);
        pool->ops = ops;
        return pool;
 
+out_rwq:
+       destroy_workqueue(pool->release_wq);
 out_wq:
        destroy_workqueue(pool->compact_wq);
 out_unbuddied:
        free_percpu(pool->unbuddied);
 out_pool:
+       kmem_cache_destroy(pool->c_handle);
+out_c:
        kfree(pool);
 out:
        return NULL;
@@ -519,6 +813,8 @@ out:
  */
 static void z3fold_destroy_pool(struct z3fold_pool *pool)
 {
+       kmem_cache_destroy(pool->c_handle);
+       z3fold_unregister_migration(pool);
        destroy_workqueue(pool->release_wq);
        destroy_workqueue(pool->compact_wq);
        kfree(pool);
@@ -546,7 +842,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
 static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
                        unsigned long *handle)
 {
-       int chunks = 0, i, freechunks;
+       int chunks = size_to_chunks(size);
        struct z3fold_header *zhdr = NULL;
        struct page *page = NULL;
        enum buddy bud;
@@ -561,56 +857,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
        if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
                bud = HEADLESS;
        else {
-               struct list_head *unbuddied;
-               chunks = size_to_chunks(size);
-
-lookup:
-               /* First, try to find an unbuddied z3fold page. */
-               unbuddied = get_cpu_ptr(pool->unbuddied);
-               for_each_unbuddied_list(i, chunks) {
-                       struct list_head *l = &unbuddied[i];
-
-                       zhdr = list_first_entry_or_null(READ_ONCE(l),
-                                               struct z3fold_header, buddy);
-
-                       if (!zhdr)
-                               continue;
-
-                       /* Re-check under lock. */
-                       spin_lock(&pool->lock);
-                       l = &unbuddied[i];
-                       if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
-                                       struct z3fold_header, buddy)) ||
-                           !z3fold_page_trylock(zhdr)) {
-                               spin_unlock(&pool->lock);
-                               put_cpu_ptr(pool->unbuddied);
-                               goto lookup;
-                       }
-                       list_del_init(&zhdr->buddy);
-                       zhdr->cpu = -1;
-                       spin_unlock(&pool->lock);
-
-                       page = virt_to_page(zhdr);
-                       if (test_bit(NEEDS_COMPACTING, &page->private)) {
-                               z3fold_page_unlock(zhdr);
-                               zhdr = NULL;
-                               put_cpu_ptr(pool->unbuddied);
-                               if (can_sleep)
-                                       cond_resched();
-                               goto lookup;
-                       }
-
-                       /*
-                        * this page could not be removed from its unbuddied
-                        * list while pool lock was held, and then we've taken
-                        * page lock so kref_put could not be called before
-                        * we got here, so it's safe to just call kref_get()
-                        */
-                       kref_get(&zhdr->refcount);
-                       break;
-               }
-               put_cpu_ptr(pool->unbuddied);
-
+retry:
+               zhdr = __z3fold_alloc(pool, size, can_sleep);
                if (zhdr) {
                        if (zhdr->first_chunks == 0) {
                                if (zhdr->middle_chunks != 0 &&
@@ -630,8 +878,9 @@ lookup:
                                        z3fold_page_unlock(zhdr);
                                pr_err("No free chunks in unbuddied\n");
                                WARN_ON(1);
-                               goto lookup;
+                               goto retry;
                        }
+                       page = virt_to_page(zhdr);
                        goto found;
                }
                bud = FIRST;
@@ -662,13 +911,18 @@ lookup:
        if (!page)
                return -ENOMEM;
 
-       atomic64_inc(&pool->pages_nr);
        zhdr = init_z3fold_page(page, pool);
+       if (!zhdr) {
+               __free_page(page);
+               return -ENOMEM;
+       }
+       atomic64_inc(&pool->pages_nr);
 
        if (bud == HEADLESS) {
                set_bit(PAGE_HEADLESS, &page->private);
                goto headless;
        }
+       __SetPageMovable(page, pool->inode->i_mapping);
        z3fold_page_lock(zhdr);
 
 found:
@@ -680,19 +934,7 @@ found:
                zhdr->middle_chunks = chunks;
                zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
        }
-
-       if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
-                       zhdr->middle_chunks == 0) {
-               struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
-               /* Add to unbuddied list */
-               freechunks = num_free_chunks(zhdr);
-               spin_lock(&pool->lock);
-               list_add(&zhdr->buddy, &unbuddied[freechunks]);
-               spin_unlock(&pool->lock);
-               zhdr->cpu = smp_processor_id();
-               put_cpu_ptr(pool->unbuddied);
-       }
+       add_to_unbuddied(pool, zhdr);
 
 headless:
        spin_lock(&pool->lock);
@@ -739,7 +981,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
                        spin_lock(&pool->lock);
                        list_del(&page->lru);
                        spin_unlock(&pool->lock);
-                       free_z3fold_page(page);
+                       free_z3fold_page(page, true);
                        atomic64_dec(&pool->pages_nr);
                }
                return;
@@ -766,6 +1008,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
                return;
        }
 
+       free_handle(handle);
        if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
                atomic64_dec(&pool->pages_nr);
                return;
@@ -774,7 +1017,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
                z3fold_page_unlock(zhdr);
                return;
        }
-       if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+       if (unlikely(PageIsolated(page)) ||
+           test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
                z3fold_page_unlock(zhdr);
                return;
        }
@@ -855,10 +1099,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
                        if (test_and_set_bit(PAGE_CLAIMED, &page->private))
                                continue;
 
-                       zhdr = page_address(page);
+                       if (unlikely(PageIsolated(page)))
+                               continue;
                        if (test_bit(PAGE_HEADLESS, &page->private))
                                break;
 
+                       zhdr = page_address(page);
                        if (!z3fold_page_trylock(zhdr)) {
                                zhdr = NULL;
                                continue; /* can't evict at this point */
@@ -919,7 +1165,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 next:
                if (test_bit(PAGE_HEADLESS, &page->private)) {
                        if (ret == 0) {
-                               free_z3fold_page(page);
+                               free_z3fold_page(page, true);
                                atomic64_dec(&pool->pages_nr);
                                return 0;
                        }
@@ -996,6 +1242,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
                break;
        }
 
+       if (addr)
+               zhdr->mapped_count++;
        z3fold_page_unlock(zhdr);
 out:
        return addr;
@@ -1022,6 +1270,7 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
        buddy = handle_to_buddy(handle);
        if (buddy == MIDDLE)
                clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+       zhdr->mapped_count--;
        z3fold_page_unlock(zhdr);
 }
 
@@ -1036,6 +1285,128 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
        return atomic64_read(&pool->pages_nr);
 }
 
+static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
+{
+       struct z3fold_header *zhdr;
+       struct z3fold_pool *pool;
+
+       VM_BUG_ON_PAGE(!PageMovable(page), page);
+       VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+       if (test_bit(PAGE_HEADLESS, &page->private))
+               return false;
+
+       zhdr = page_address(page);
+       z3fold_page_lock(zhdr);
+       if (test_bit(NEEDS_COMPACTING, &page->private) ||
+           test_bit(PAGE_STALE, &page->private))
+               goto out;
+
+       pool = zhdr_to_pool(zhdr);
+
+       if (zhdr->mapped_count == 0) {
+               kref_get(&zhdr->refcount);
+               if (!list_empty(&zhdr->buddy))
+                       list_del_init(&zhdr->buddy);
+               spin_lock(&pool->lock);
+               if (!list_empty(&page->lru))
+                       list_del(&page->lru);
+               spin_unlock(&pool->lock);
+               z3fold_page_unlock(zhdr);
+               return true;
+       }
+out:
+       z3fold_page_unlock(zhdr);
+       return false;
+}
+
+static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
+                              struct page *page, enum migrate_mode mode)
+{
+       struct z3fold_header *zhdr, *new_zhdr;
+       struct z3fold_pool *pool;
+       struct address_space *new_mapping;
+
+       VM_BUG_ON_PAGE(!PageMovable(page), page);
+       VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+       zhdr = page_address(page);
+       pool = zhdr_to_pool(zhdr);
+
+       if (!trylock_page(page))
+               return -EAGAIN;
+
+       if (!z3fold_page_trylock(zhdr)) {
+               unlock_page(page);
+               return -EAGAIN;
+       }
+       if (zhdr->mapped_count != 0) {
+               z3fold_page_unlock(zhdr);
+               unlock_page(page);
+               return -EBUSY;
+       }
+       new_zhdr = page_address(newpage);
+       memcpy(new_zhdr, zhdr, PAGE_SIZE);
+       newpage->private = page->private;
+       page->private = 0;
+       z3fold_page_unlock(zhdr);
+       spin_lock_init(&new_zhdr->page_lock);
+       new_mapping = page_mapping(page);
+       __ClearPageMovable(page);
+       ClearPagePrivate(page);
+
+       get_page(newpage);
+       z3fold_page_lock(new_zhdr);
+       if (new_zhdr->first_chunks)
+               encode_handle(new_zhdr, FIRST);
+       if (new_zhdr->last_chunks)
+               encode_handle(new_zhdr, LAST);
+       if (new_zhdr->middle_chunks)
+               encode_handle(new_zhdr, MIDDLE);
+       set_bit(NEEDS_COMPACTING, &newpage->private);
+       new_zhdr->cpu = smp_processor_id();
+       spin_lock(&pool->lock);
+       list_add(&newpage->lru, &pool->lru);
+       spin_unlock(&pool->lock);
+       __SetPageMovable(newpage, new_mapping);
+       z3fold_page_unlock(new_zhdr);
+
+       queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
+
+       page_mapcount_reset(page);
+       unlock_page(page);
+       put_page(page);
+       return 0;
+}
+
+static void z3fold_page_putback(struct page *page)
+{
+       struct z3fold_header *zhdr;
+       struct z3fold_pool *pool;
+
+       zhdr = page_address(page);
+       pool = zhdr_to_pool(zhdr);
+
+       z3fold_page_lock(zhdr);
+       if (!list_empty(&zhdr->buddy))
+               list_del_init(&zhdr->buddy);
+       INIT_LIST_HEAD(&page->lru);
+       if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+               atomic64_dec(&pool->pages_nr);
+               return;
+       }
+       spin_lock(&pool->lock);
+       list_add(&page->lru, &pool->lru);
+       spin_unlock(&pool->lock);
+       z3fold_page_unlock(zhdr);
+}
+
+static const struct address_space_operations z3fold_aops = {
+       .isolate_page = z3fold_page_isolate,
+       .migratepage = z3fold_page_migrate,
+       .putback_page = z3fold_page_putback,
+};
+
 /*****************
  * zpool
  ****************/
@@ -1133,8 +1504,14 @@ MODULE_ALIAS("zpool-z3fold");
 
 static int __init init_z3fold(void)
 {
+       int ret;
+
        /* Make sure the z3fold header is not larger than the page size */
        BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
+       ret = z3fold_mount();
+       if (ret)
+               return ret;
+
        zpool_register_driver(&z3fold_zpool_driver);
 
        return 0;
@@ -1142,6 +1519,7 @@ static int __init init_z3fold(void)
 
 static void __exit exit_z3fold(void)
 {
+       z3fold_unmount();
        zpool_unregister_driver(&z3fold_zpool_driver);
 }
 
index d3736f5..74cafc0 100644
@@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
        while (got < num_pages) {
                rc = get_user_pages_fast(
                    (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
-                   num_pages - got, write_page, pages + got);
+                   num_pages - got, write_page ? FOLL_WRITE : 0, pages + got);
                if (rc < 0)
                        break;
                BUG_ON(rc == 0);
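This hunk, together with the rds and xdp_umem hunks below, adapts callers to the reworked get_user_pages_fast() interface: the old int write parameter becomes an unsigned int gup_flags word, and get_user_pages_longterm() is folded into get_user_pages() with FOLL_LONGTERM. A minimal sketch of the new calling convention (the wrapper name is hypothetical):

#include <linux/mm.h>

/* illustrative wrapper: callers now pass FOLL_WRITE (or 0) rather than
 * the former write = 1/0 argument */
static int pin_user_buffer(unsigned long start, int nr_pages,
			   bool write, struct page **pages)
{
	unsigned int gup_flags = write ? FOLL_WRITE : 0;

	return get_user_pages_fast(start, nr_pages, gup_flags, pages);
}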
index e367a97..03f6fd5 100644
@@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
                ret = -ENOMEM;
                goto out;
        }
-       ret = get_user_pages_fast(start, nr_pages, 1, pages);
+       ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
        if (ret != nr_pages) {
                if (ret > 0)
                        nr_pages = ret;
index 182ab84..b340ed4 100644
@@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
 {
        int ret;
 
-       ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+       ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0,
+                                 pages);
 
        if (ret >= 0 && ret < nr_pages) {
                while (ret--)
index 989e523..2b18223 100644
@@ -253,8 +253,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem)
                return -ENOMEM;
 
        down_read(&current->mm->mmap_sem);
-       npgs = get_user_pages_longterm(umem->address, umem->npgs,
-                                      gup_flags, &umem->pgs[0], NULL);
+       npgs = get_user_pages(umem->address, umem->npgs,
+                             gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
        up_read(&current->mm->mmap_sem);
 
        if (npgs != umem->npgs) {
index a704d1f..5fb0f16 100644
@@ -391,7 +391,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
        spin_unlock(&kvm->mmu_lock);
 
        ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
-                                       range->end, range->blockable);
+                                       range->end,
+                                       mmu_notifier_range_blockable(range));
 
        srcu_read_unlock(&kvm->srcu, idx);