Merge branch 'akpm' (patches from Andrew)
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Feb 2018 02:46:22 +0000 (18:46 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Feb 2018 02:46:22 +0000 (18:46 -0800)
Merge updates from Andrew Morton:

 - misc fixes

 - ocfs2 updates

 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
  mm: remove PG_highmem description
  tools, vm: new option to specify kpageflags file
  mm/swap.c: make functions and their kernel-doc agree
  mm, memory_hotplug: fix memmap initialization
  mm: correct comments regarding do_fault_around()
  mm: numa: do not trap faults on shared data section pages.
  hugetlb, mbind: fall back to default policy if vma is NULL
  hugetlb, mempolicy: fix the mbind hugetlb migration
  mm, hugetlb: further simplify hugetlb allocation API
  mm, hugetlb: get rid of surplus page accounting tricks
  mm, hugetlb: do not rely on overcommit limit during migration
  mm, hugetlb: integrate giga hugetlb more naturally to the allocation path
  mm, hugetlb: unify core page allocation accounting and initialization
  mm/memcontrol.c: try harder to decrease [memory,memsw].limit_in_bytes
  mm/memcontrol.c: make local symbol static
  mm/hmm: fix uninitialized use of 'entry' in hmm_vma_walk_pmd()
  include/linux/mmzone.h: fix explanation of lower bits in the SPARSEMEM mem_map pointer
  mm/compaction.c: fix comment for try_to_compact_pages()
  mm/page_ext.c: make page_ext_init a noop when CONFIG_PAGE_EXTENSION but nothing uses it
  zsmalloc: use U suffix for negative literals being shifted
  ...

118 files changed:
Documentation/sysctl/vm.txt
Documentation/vm/hugetlbpage.txt
arch/arc/include/asm/hugepage.h
arch/arm/include/asm/pgtable-3level.h
arch/arm64/include/asm/pgtable.h
arch/m32r/kernel/traps.c
arch/mips/include/asm/pgtable.h
arch/powerpc/Kconfig
arch/powerpc/include/asm/book3s/64/hash-4k.h
arch/powerpc/include/asm/book3s/64/hash-64k.h
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/powerpc/include/asm/book3s/64/radix.h
arch/powerpc/mm/pgtable-book3s64.c
arch/powerpc/mm/pgtable-hash64.c
arch/s390/Kconfig
arch/s390/include/asm/pgtable.h
arch/sparc/include/asm/pgtable_64.h
arch/sparc/mm/tlb.c
arch/x86/Kconfig
arch/x86/include/asm/pgtable-3level.h
arch/x86/include/asm/pgtable.h
drivers/infiniband/hw/hfi1/mmu_rb.c
drivers/iommu/amd_iommu_v2.c
drivers/iommu/intel-svm.c
drivers/misc/sgi-gru/grutlbpurge.c
fs/dax.c
fs/fcntl.c
fs/hugetlbfs/inode.c
fs/ocfs2/acl.c
fs/ocfs2/alloc.c
fs/ocfs2/alloc.h
fs/ocfs2/aops.c
fs/ocfs2/cluster/quorum.c
fs/ocfs2/cluster/tcp_internal.h
fs/ocfs2/dir.c
fs/ocfs2/dlm/dlmmaster.c
fs/ocfs2/dlmglue.c
fs/ocfs2/dlmglue.h
fs/ocfs2/extent_map.c
fs/ocfs2/extent_map.h
fs/ocfs2/file.c
fs/ocfs2/journal.c
fs/ocfs2/mmap.c
fs/ocfs2/ocfs2.h
fs/ocfs2/ocfs2_lockid.h
fs/ocfs2/ocfs2_trace.h
fs/ocfs2/suballoc.c
fs/ocfs2/super.c
fs/ocfs2/xattr.c
fs/proc/task_mmu.c
fs/userfaultfd.c
include/asm-generic/pgtable.h
include/linux/hugetlb.h
include/linux/memcontrol.h
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmu_notifier.h
include/linux/mmzone.h
include/linux/page-flags.h
include/linux/pagevec.h
include/linux/sched/mm.h
include/linux/shmem_fs.h
include/linux/swap.h
include/linux/vmstat.h
include/linux/zpool.h
include/trace/events/vmscan.h
kernel/fork.c
kernel/sysctl.c
mm/Kconfig
mm/compaction.c
mm/fadvise.c
mm/filemap.c
mm/hmm.c
mm/huge_memory.c
mm/hugetlb.c
mm/interval_tree.c
mm/khugepaged.c
mm/kmemleak.c
mm/memcontrol.c
mm/memory.c
mm/memory_hotplug.c
mm/mempolicy.c
mm/migrate.c
mm/mmu_notifier.c
mm/mprotect.c
mm/nommu.c
mm/oom_kill.c
mm/page_alloc.c
mm/page_ext.c
mm/page_owner.c
mm/pgtable-generic.c
mm/shmem.c
mm/slab.c
mm/slab.h
mm/slab_common.c
mm/slub.c
mm/sparse.c
mm/swap.c
mm/truncate.c
mm/vmscan.c
mm/zpool.c
mm/zsmalloc.c
mm/zswap.c
scripts/decodecode
scripts/tags.sh
tools/testing/selftests/memfd/Makefile
tools/testing/selftests/memfd/common.c [new file with mode: 0644]
tools/testing/selftests/memfd/common.h [new file with mode: 0644]
tools/testing/selftests/memfd/fuse_test.c
tools/testing/selftests/memfd/memfd_test.c
tools/testing/selftests/memfd/run_fuse_test.sh
tools/testing/selftests/memfd/run_tests.sh
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/run_vmtests
tools/testing/selftests/vm/va_128TBswitch.c [new file with mode: 0644]
tools/testing/selftests/x86/5lvl.c [deleted file]
tools/vm/page-types.c
virt/kvm/kvm_main.c

index 5025ff9..ff234d2 100644 (file)
@@ -30,7 +30,6 @@ Currently, these files are in /proc/sys/vm:
 - dirty_writeback_centisecs
 - drop_caches
 - extfrag_threshold
-- hugepages_treat_as_movable
 - hugetlb_shm_group
 - laptop_mode
 - legacy_va_layout
@@ -261,30 +260,6 @@ any throttling.
 
 ==============================================================
 
-hugepages_treat_as_movable
-
-This parameter controls whether we can allocate hugepages from ZONE_MOVABLE
-or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE.
-ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified,
-so this parameter has no effect if used without kernelcore=.
-
-Hugepage migration is now available in some situations which depend on the
-architecture and/or the hugepage size. If a hugepage supports migration,
-allocation from ZONE_MOVABLE is always enabled for the hugepage regardless
-of the value of this parameter.
-IOW, this parameter affects only non-migratable hugepages.
-
-Assuming that hugepages are not migratable in your system, one usecase of
-this parameter is that users can make hugepage pool more extensible by
-enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE
-page reclaim/migration/compaction work more and you can get contiguous
-memory more likely. Note that using ZONE_MOVABLE for non-migratable
-hugepages can do harm to other features like memory hotremove (because
-memory hotremove expects that memory blocks on ZONE_MOVABLE are always
-removable,) so it's a trade-off responsible for the users.
-
-==============================================================
-
 hugetlb_shm_group
 
 hugetlb_shm_group contains group id that is allowed to create SysV
index 59cbc80..faf077d 100644 (file)
@@ -20,19 +20,20 @@ options.
 
 The /proc/meminfo file provides information about the total number of
 persistent hugetlb pages in the kernel's huge page pool.  It also displays
-information about the number of free, reserved and surplus huge pages and the
-default huge page size.  The huge page size is needed for generating the
-proper alignment and size of the arguments to system calls that map huge page
-regions.
+default huge page size and information about the number of free, reserved
+and surplus huge pages in the pool of huge pages of default size.
+The huge page size is needed for generating the proper alignment and
+size of the arguments to system calls that map huge page regions.
 
 The output of "cat /proc/meminfo" will include lines like:
 
 .....
-HugePages_Total: vvv
-HugePages_Free:  www
-HugePages_Rsvd:  xxx
-HugePages_Surp:  yyy
-Hugepagesize:    zzz kB
+HugePages_Total: uuu
+HugePages_Free:  vvv
+HugePages_Rsvd:  www
+HugePages_Surp:  xxx
+Hugepagesize:    yyy kB
+Hugetlb:         zzz kB
 
 where:
 HugePages_Total is the size of the pool of huge pages.
@@ -47,6 +48,14 @@ HugePages_Surp  is short for "surplus," and is the number of huge pages in
                 the pool above the value in /proc/sys/vm/nr_hugepages. The
                 maximum number of surplus huge pages is controlled by
                 /proc/sys/vm/nr_overcommit_hugepages.
+Hugepagesize    is the default hugepage size (in kB).
+Hugetlb         is the total amount of memory (in kB) consumed by huge
+                pages of all sizes.
+                If huge pages of different sizes are in use, this number
+                will exceed HugePages_Total * Hugepagesize. To get more
+                detailed information, please refer to
+                /sys/kernel/mm/hugepages (described below).
+
 
 /proc/filesystems should also show a filesystem of type "hugetlbfs" configured
 in the kernel.
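
For readers who want to check these counters programmatically, a minimal
userspace sketch that assumes nothing beyond the /proc/meminfo field names
documented above:

  /* Print the hugepage-related lines from /proc/meminfo, including the
   * new "Hugetlb" total. Purely illustrative. */
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          FILE *f = fopen("/proc/meminfo", "r");
          char line[256];

          if (!f) {
                  perror("fopen /proc/meminfo");
                  return 1;
          }
          while (fgets(line, sizeof(line), f)) {
                  if (!strncmp(line, "HugePages_", 10) ||
                      !strncmp(line, "Hugepagesize", 12) ||
                      !strncmp(line, "Hugetlb:", 8))
                          fputs(line, stdout);
          }
          fclose(f);
          return 0;
  }
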
index b18fcb6..dc8ee01 100644 (file)
@@ -74,4 +74,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
                                unsigned long end);
 
+/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/
+#define pmdp_establish generic_pmdp_establish
+
 #endif
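
The arc, arm and mips hunks in this series all point pmdp_establish at a
generic_pmdp_establish fallback whose definition is not shown here (it lands
in include/asm-generic/pgtable.h, per the file list above). A sketch of what
that fallback plausibly looks like for architectures without hardware
dirty/accessed bits (the name suffix marks this as an illustration, not a
quote from the series):

  /* Install a new pmd and return the old one. This is only safe when the
   * hardware cannot set dirty/accessed bits behind our back, which is the
   * condition the comments above state. */
  static inline pmd_t generic_pmdp_establish_sketch(struct vm_area_struct *vma,
                  unsigned long address, pmd_t *pmdp, pmd_t pmd)
  {
          pmd_t old_pmd = *pmdp;

          set_pmd_at(vma->vm_mm, address, pmdp, pmd);
          return old_pmd;
  }
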
index 1a7a17b..2a48360 100644 (file)
@@ -249,6 +249,9 @@ PMD_BIT_FUNC(mkyoung,   |= PMD_SECT_AF);
 #define pfn_pmd(pfn,prot)      (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
 #define mk_pmd(page,prot)      pfn_pmd(page_to_pfn(page),prot)
 
+/* No hardware dirty/accessed bits -- generic_pmdp_establish() fits */
+#define pmdp_establish generic_pmdp_establish
+
 /* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
index 89167c4..094374c 100644 (file)
@@ -706,6 +706,13 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 {
        ptep_set_wrprotect(mm, address, (pte_t *)pmdp);
 }
+
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+       return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd)));
+}
 #endif
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
index b88a8dd..a6f300a 100644 (file)
@@ -115,14 +115,6 @@ static void set_eit_vector_entries(void)
        _flush_cache_copyback_all();
 }
 
-void abort(void)
-{
-       BUG();
-
-       /* if that doesn't kill us, halt */
-       panic("Oops failed to kill thread");
-}
-
 void __init trap_init(void)
 {
        set_eit_vector_entries();
index 1a508a7..129e032 100644 (file)
@@ -534,6 +534,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
+/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/
+#define pmdp_establish generic_pmdp_establish
+
 #define has_transparent_hugepage has_transparent_hugepage
 extern int has_transparent_hugepage(void);
 
index e92432a..73fcf59 100644 (file)
@@ -151,7 +151,6 @@ config PPC
        select ARCH_MIGHT_HAVE_PC_PARPORT
        select ARCH_MIGHT_HAVE_PC_SERIO
        select ARCH_SUPPORTS_ATOMIC_RMW
-       select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
        select ARCH_USE_BUILTIN_BSWAP
        select ARCH_USE_CMPXCHG_LOCKREF         if PPC64
        select ARCH_WANT_IPC_PARSE_VERSION
index 197ced1..2d9df40 100644 (file)
@@ -101,8 +101,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
 extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                         pgtable_t pgtable);
 extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
-extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-                                     unsigned long address, pmd_t *pmdp);
 extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
                                       unsigned long addr, pmd_t *pmdp);
 extern int hash__has_transparent_hugepage(void);
index 8d40cf0..cb46d10 100644 (file)
@@ -203,8 +203,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
 extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                         pgtable_t pgtable);
 extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
-extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-                                     unsigned long address, pmd_t *pmdp);
 extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
                                       unsigned long addr, pmd_t *pmdp);
 extern int hash__has_transparent_hugepage(void);
index 4469781..6ca1208 100644 (file)
@@ -1137,17 +1137,8 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm,
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
-extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
-                           pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
-static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
-                                          unsigned long address, pmd_t *pmdp)
-{
-       if (radix_enabled())
-               return radix__pmdp_huge_split_prepare(vma, address, pmdp);
-       return hash__pmdp_huge_split_prepare(vma, address, pmdp);
-}
+extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                            pmd_t *pmdp);
 
 #define pmd_move_must_withdraw pmd_move_must_withdraw
 struct spinlock;
index 19c44e1..365010f 100644 (file)
@@ -269,12 +269,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
                return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
        return __pmd(pmd_val(pmd) | _PAGE_PTE);
 }
-static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-                                           unsigned long address, pmd_t *pmdp)
-{
-       /* Nothing to do for radix. */
-       return;
-}
 
 extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
                                          pmd_t *pmdp, unsigned long clr,
index 3b65917..422e802 100644 (file)
@@ -90,16 +90,19 @@ void serialize_against_pte_lookup(struct mm_struct *mm)
  * We use this to invalidate a pmdp entry before switching from a
  * hugepte to regular pmd entry.
  */
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
 {
-       pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+       unsigned long old_pmd;
+
+       old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        /*
        * This ensures that generic code that relies on IRQ disabling
        * to prevent a parallel THP split works as expected.
         */
        serialize_against_pte_lookup(vma->vm_mm);
+       return __pmd(old_pmd);
 }
 
 static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
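
The powerpc change above, together with the s390 and sparc hunks below, makes
pmdp_invalidate() return the previous entry instead of void. A hedged sketch
of the caller-side benefit (the function and variable names here are
illustrative, not taken from this merge):

  /* Taking dirty/young state from the value returned by pmdp_invalidate()
   * avoids re-reading *pmdp, which could race against hardware setting
   * accessed/dirty bits after the invalidation. */
  static void thp_collect_pmd_state(struct vm_area_struct *vma,
                                    unsigned long haddr, pmd_t *pmdp,
                                    bool *dirty, bool *young)
  {
          pmd_t old = pmdp_invalidate(vma, haddr, pmdp);

          *dirty = pmd_dirty(old);
          *young = pmd_young(old);
  }
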
index ec27791..469808e 100644 (file)
@@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
        return pgtable;
 }
 
-void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-                              unsigned long address, pmd_t *pmdp)
-{
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-       VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
-       VM_BUG_ON(pmd_devmap(*pmdp));
-
-       /*
-        * We can't mark the pmd none here, because that will cause a race
-        * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
-        * we spilt, but at the same time we wan't rest of the ppc64 code
-        * not to insert hash pte on this, because we will be modifying
-        * the deposited pgtable in the caller of this function. Hence
-        * clear the _PAGE_USER so that we move the fault handling to
-        * higher level function and that will serialize against ptl.
-        * We need to flush existing hash pte entries here even though,
-        * the translation is still valid, because we will withdraw
-        * pgtable_t after this.
-        */
-       pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
-}
-
 /*
  * A linux hugepage PMD was changed and the corresponding hash table entries
  * need to be flushed.
index 9376637..0105ce2 100644 (file)
@@ -108,7 +108,6 @@ config S390
        select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
        select ARCH_SAVE_PAGE_KEYS if HIBERNATION
        select ARCH_SUPPORTS_ATOMIC_RMW
-       select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
        select ARCH_SUPPORTS_NUMA_BALANCING
        select ARCH_USE_BUILTIN_BSWAP
        select ARCH_USE_CMPXCHG_LOCKREF
index 0a6b028..2d24d33 100644 (file)
@@ -1505,12 +1505,12 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
-static inline void pmdp_invalidate(struct vm_area_struct *vma,
+static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
                                   unsigned long addr, pmd_t *pmdp)
 {
        pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
 
-       pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
+       return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
 }
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT
index 9937c5f..339920f 100644 (file)
@@ -1010,7 +1010,7 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
                          pmd_t *pmd);
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
-extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp);
 
 #define __HAVE_ARCH_PGTABLE_DEPOSIT
index 4ae86bc..847ddff 100644 (file)
@@ -219,17 +219,28 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
        }
 }
 
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+       pmd_t old;
+
+       do {
+               old = *pmdp;
+       } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);
+
+       return old;
+}
+
 /*
  * This routine is only called when splitting a THP
  */
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
 {
-       pmd_t entry = *pmdp;
-
-       pmd_val(entry) &= ~_PAGE_VALID;
+       pmd_t old, entry;
 
-       set_pmd_at(vma->vm_mm, address, pmdp, entry);
+       entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
+       old = pmdp_establish(vma, address, pmdp, entry);
        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 
        /*
@@ -240,6 +251,8 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
        if ((pmd_val(entry) & _PAGE_PMD_HUGE) &&
            !is_huge_zero_page(pmd_page(entry)))
                (vma->vm_mm)->context.thp_pte_count--;
+
+       return old;
 }
 
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
index fcd3b4d..d416fd9 100644 (file)
@@ -69,7 +69,6 @@ config X86
        select ARCH_MIGHT_HAVE_PC_PARPORT
        select ARCH_MIGHT_HAVE_PC_SERIO
        select ARCH_SUPPORTS_ATOMIC_RMW
-       select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
        select ARCH_SUPPORTS_NUMA_BALANCING     if X86_64
        select ARCH_USE_BUILTIN_BSWAP
        select ARCH_USE_QUEUED_RWLOCKS
index bc4af54..f24df59 100644 (file)
@@ -158,7 +158,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
 #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
 #endif
 
-#ifdef CONFIG_SMP
 union split_pmd {
        struct {
                u32 pmd_low;
@@ -166,6 +165,8 @@ union split_pmd {
        };
        pmd_t pmd;
 };
+
+#ifdef CONFIG_SMP
 static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 {
        union split_pmd res, *orig = (union split_pmd *)pmdp;
@@ -181,6 +182,40 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+       pmd_t old;
+
+       /*
+        * If pmd has present bit cleared we can get away without expensive
+        * cmpxchg64: we can update pmdp half-by-half without racing with
+        * anybody.
+        */
+       if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+               union split_pmd old, new, *ptr;
+
+               ptr = (union split_pmd *)pmdp;
+
+               new.pmd = pmd;
+
+               /* xchg acts as a barrier before setting of the high bits */
+               old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low);
+               old.pmd_high = ptr->pmd_high;
+               ptr->pmd_high = new.pmd_high;
+               return old.pmd;
+       }
+
+       do {
+               old = *pmdp;
+       } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);
+
+       return old;
+}
+#endif
+
 #ifdef CONFIG_SMP
 union split_pud {
        struct {
index e42b894..63c2552 100644 (file)
@@ -1109,6 +1109,21 @@ static inline int pud_write(pud_t pud)
        return pud_flags(pud) & _PAGE_RW;
 }
 
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+       if (IS_ENABLED(CONFIG_SMP)) {
+               return xchg(pmdp, pmd);
+       } else {
+               pmd_t old = *pmdp;
+               *pmdp = pmd;
+               return old;
+       }
+}
+#endif
+
 /*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
  *
index e7b3ce1..70aceef 100644 (file)
@@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
 static void handle_remove(struct work_struct *work);
 
 static const struct mmu_notifier_ops mn_opts = {
+       .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
        .invalidate_range_start = mmu_notifier_range_start,
 };
 
index 7d94e1d..df72493 100644 (file)
@@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
 }
 
 static const struct mmu_notifier_ops iommu_mn = {
+       .flags                  = MMU_INVALIDATE_DOES_NOT_BLOCK,
        .release                = mn_release,
        .clear_flush_young      = mn_clear_flush_young,
        .invalidate_range       = mn_invalidate_range,
index ed1cf7c..0a826eb 100644 (file)
@@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 }
 
 static const struct mmu_notifier_ops intel_mmuops = {
+       .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
        .release = intel_mm_release,
        .change_pte = intel_change_pte,
        .invalidate_range = intel_invalidate_range,
index 9918eda..a3454eb 100644 (file)
@@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
 
 
 static const struct mmu_notifier_ops gru_mmuops = {
+       .flags                  = MMU_INVALIDATE_DOES_NOT_BLOCK,
        .invalidate_range_start = gru_invalidate_range_start,
        .invalidate_range_end   = gru_invalidate_range_end,
        .release                = gru_release,
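
The four driver hunks above only declare the new MMU_INVALIDATE_DOES_NOT_BLOCK
flag; the consumer side lives elsewhere in this series (mm/mmu_notifier.c and
mm/oom_kill.c in the file list). A hedged sketch of how such a flag is
typically consumed, with the helper name assumed rather than quoted from this
merge:

  /* A context that must not sleep (an asynchronous reaper, for example)
   * skips any mm whose registered notifiers might block on invalidation. */
  static bool can_reap_mm_sketch(struct mm_struct *mm)
  {
          if (mm_has_blockable_invalidate_notifiers(mm))
                  return false;   /* a notifier may sleep; leave this mm alone */
          return true;
  }
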
index 9598159..6ee6f7e 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -44,6 +44,7 @@
 
 /* The 'colour' (ie low bits) within a PMD of a page offset.  */
 #define PG_PMD_COLOUR  ((PMD_SIZE >> PAGE_SHIFT) - 1)
+#define PG_PMD_NR      (PMD_SIZE >> PAGE_SHIFT)
 
 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
 
@@ -375,8 +376,8 @@ restart:
                 * unmapped.
                 */
                if (pmd_downgrade && dax_is_zero_entry(entry))
-                       unmap_mapping_range(mapping,
-                               (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+                       unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
+                                                       PG_PMD_NR, false);
 
                err = radix_tree_preload(
                                mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
@@ -538,12 +539,10 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
        if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
                /* we are replacing a zero page with block mapping */
                if (dax_is_pmd_entry(entry))
-                       unmap_mapping_range(mapping,
-                                       (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
-                                       PMD_SIZE, 0);
+                       unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
+                                                       PG_PMD_NR, false);
                else /* pte entry */
-                       unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-                                       PAGE_SIZE, 0);
+                       unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
        }
 
        spin_lock_irq(&mapping->tree_lock);
@@ -636,8 +635,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
                        pmd = pmd_mkclean(pmd);
                        set_pmd_at(vma->vm_mm, address, pmdp, pmd);
 unlock_pmd:
-                       spin_unlock(ptl);
 #endif
+                       spin_unlock(ptl);
                } else {
                        if (pfn != pte_pfn(*ptep))
                                goto unlock_pte;
@@ -1269,12 +1268,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
- * more often than one might expect in the below functions.
- */
-#define PG_PMD_COLOUR  ((PMD_SIZE >> PAGE_SHIFT) - 1)
-
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
                void *entry)
 {
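
The dax.c hunks above replace byte-offset unmap_mapping_range() calls with a
page-index based unmap_mapping_pages() helper added elsewhere in this series
(mm/memory.c in the file list). Semantically the new calls map onto the old
interface roughly as the following sketch suggests (illustrative only; the
real helper walks the i_mmap tree directly rather than wrapping the old call):

  static inline void unmap_mapping_pages_sketch(struct address_space *mapping,
                  pgoff_t start, pgoff_t nr, bool even_cows)
  {
          unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
                              (loff_t)nr << PAGE_SHIFT, even_cows);
  }
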
index c7b9e09..e95fa0a 100644 (file)
@@ -418,7 +418,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
                break;
        case F_ADD_SEALS:
        case F_GET_SEALS:
-               err = shmem_fcntl(filp, cmd, arg);
+               err = memfd_fcntl(filp, cmd, arg);
                break;
        case F_GET_RW_HINT:
        case F_SET_RW_HINT:
index 8a85f3f..8fe1b0a 100644 (file)
@@ -55,16 +55,6 @@ struct hugetlbfs_config {
        umode_t                 mode;
 };
 
-struct hugetlbfs_inode_info {
-       struct shared_policy policy;
-       struct inode vfs_inode;
-};
-
-static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
-{
-       return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
-}
-
 int sysctl_hugetlb_shm_group;
 
 enum {
@@ -520,8 +510,16 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
        if (hole_end > hole_start) {
                struct address_space *mapping = inode->i_mapping;
+               struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
                inode_lock(inode);
+
+               /* protected by i_mutex */
+               if (info->seals & F_SEAL_WRITE) {
+                       inode_unlock(inode);
+                       return -EPERM;
+               }
+
                i_mmap_lock_write(mapping);
                if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
                        hugetlb_vmdelete_list(&mapping->i_mmap,
@@ -539,6 +537,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                                loff_t len)
 {
        struct inode *inode = file_inode(file);
+       struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
        struct address_space *mapping = inode->i_mapping;
        struct hstate *h = hstate_inode(inode);
        struct vm_area_struct pseudo_vma;
@@ -570,6 +569,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
        if (error)
                goto out;
 
+       if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
+               error = -EPERM;
+               goto out;
+       }
+
        /*
         * Initialize a pseudo vma as this is required by the huge page
         * allocation routines.  If NUMA is configured, use page index
@@ -660,6 +664,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
        struct hstate *h = hstate_inode(inode);
        int error;
        unsigned int ia_valid = attr->ia_valid;
+       struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
        BUG_ON(!inode);
 
@@ -668,9 +673,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
                return error;
 
        if (ia_valid & ATTR_SIZE) {
-               if (attr->ia_size & ~huge_page_mask(h))
+               loff_t oldsize = inode->i_size;
+               loff_t newsize = attr->ia_size;
+
+               if (newsize & ~huge_page_mask(h))
                        return -EINVAL;
-               error = hugetlb_vmtruncate(inode, attr->ia_size);
+               /* protected by i_mutex */
+               if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
+                   (newsize > oldsize && (info->seals & F_SEAL_GROW)))
+                       return -EPERM;
+               error = hugetlb_vmtruncate(inode, newsize);
                if (error)
                        return error;
        }
@@ -722,6 +734,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 
        inode = new_inode(sb);
        if (inode) {
+               struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+
                inode->i_ino = get_next_ino();
                inode_init_owner(inode, dir, mode);
                lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
@@ -729,6 +743,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
                inode->i_mapping->a_ops = &hugetlbfs_aops;
                inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
                inode->i_mapping->private_data = resv_map;
+               info->seals = F_SEAL_SEAL;
                switch (mode & S_IFMT) {
                default:
                        init_special_inode(inode, mode, dev);
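
With the hunks above, hugetlbfs enforces F_SEAL_WRITE, F_SEAL_GROW and
F_SEAL_SHRINK, and the fs/fcntl.c change routes F_ADD_SEALS/F_GET_SEALS
through memfd_fcntl(). A minimal userspace sketch of what this enables
(assumes a kernel with this series, huge pages available, and a libc that
exposes memfd_create() and the sealing constants):

  /* Create a hugetlb-backed memfd, size it to one 2 MB huge page, then seal
   * it against growing or shrinking. Before this series, sealing was limited
   * to shmem-backed memfds. */
  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = memfd_create("sealed-huge", MFD_HUGETLB | MFD_ALLOW_SEALING);

          if (fd < 0) {
                  perror("memfd_create");
                  return 1;
          }
          if (ftruncate(fd, 2 * 1024 * 1024) < 0) {
                  perror("ftruncate");
                  return 1;
          }
          if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK) < 0) {
                  perror("F_ADD_SEALS");
                  return 1;
          }
          printf("seals now: 0x%x\n", fcntl(fd, F_GET_SEALS));
          close(fd);
          return 0;
  }
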
index 40b5cc9..917fadc 100644 (file)
@@ -311,7 +311,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
        if (had_lock < 0)
                return ERR_PTR(had_lock);
 
+       down_read(&OCFS2_I(inode)->ip_xattr_sem);
        acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+       up_read(&OCFS2_I(inode)->ip_xattr_sem);
 
        ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
        brelse(di_bh);
@@ -330,7 +332,9 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
                return 0;
 
+       down_read(&OCFS2_I(inode)->ip_xattr_sem);
        acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
+       up_read(&OCFS2_I(inode)->ip_xattr_sem);
        if (IS_ERR(acl) || !acl)
                return PTR_ERR(acl);
        ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
@@ -361,8 +365,10 @@ int ocfs2_init_acl(handle_t *handle,
 
        if (!S_ISLNK(inode->i_mode)) {
                if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+                       down_read(&OCFS2_I(dir)->ip_xattr_sem);
                        acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
                                                   dir_bh);
+                       up_read(&OCFS2_I(dir)->ip_xattr_sem);
                        if (IS_ERR(acl))
                                return PTR_ERR(acl);
                }
index ab5105f..9a876bb 100644 (file)
@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
                                     struct ocfs2_extent_rec *rec);
 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+                                       struct ocfs2_extent_tree *et,
+                                       struct buffer_head **new_eb_bh,
+                                       int blk_wanted, int *blk_given);
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
+
 static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
        .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
        .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
        if (!obj)
                obj = (void *)bh->b_data;
        et->et_object = obj;
+       et->et_dealloc = NULL;
 
        et->et_ops->eo_fill_root_el(et);
        if (!et->et_ops->eo_fill_max_leaf_clusters)
@@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle,
                            struct buffer_head **last_eb_bh,
                            struct ocfs2_alloc_context *meta_ac)
 {
-       int status, new_blocks, i;
+       int status, new_blocks, i, block_given = 0;
        u64 next_blkno, new_last_eb_blk;
        struct buffer_head *bh;
        struct buffer_head **new_eb_bhs = NULL;
@@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle,
                goto bail;
        }
 
-       status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
-                                          meta_ac, new_eb_bhs);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
+       /* Firstly, try to reuse dealloc since we have already estimated how
+        * many extent blocks we may use.
+        */
+       if (!ocfs2_is_dealloc_empty(et)) {
+               status = ocfs2_reuse_blk_from_dealloc(handle, et,
+                                                     new_eb_bhs, new_blocks,
+                                                     &block_given);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+       }
+
+       BUG_ON(block_given > new_blocks);
+
+       if (block_given < new_blocks) {
+               BUG_ON(!meta_ac);
+               status = ocfs2_create_new_meta_bhs(handle, et,
+                                                  new_blocks - block_given,
+                                                  meta_ac,
+                                                  &new_eb_bhs[block_given]);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
        }
 
        /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
                                  struct ocfs2_alloc_context *meta_ac,
                                  struct buffer_head **ret_new_eb_bh)
 {
-       int status, i;
+       int status, i, block_given = 0;
        u32 new_clusters;
        struct buffer_head *new_eb_bh = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *root_el;
        struct ocfs2_extent_list  *eb_el;
 
-       status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
-                                          &new_eb_bh);
+       if (!ocfs2_is_dealloc_empty(et)) {
+               status = ocfs2_reuse_blk_from_dealloc(handle, et,
+                                                     &new_eb_bh, 1,
+                                                     &block_given);
+       } else if (meta_ac) {
+               status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+                                                  &new_eb_bh);
+
+       } else {
+               BUG();
+       }
+
        if (status < 0) {
                mlog_errno(status);
                goto bail;
@@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
        int depth = le16_to_cpu(el->l_tree_depth);
        struct buffer_head *bh = NULL;
 
-       BUG_ON(meta_ac == NULL);
+       BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
 
        shift = ocfs2_find_branch_target(et, &bh);
        if (shift < 0) {
@@ -2598,11 +2636,8 @@ static void ocfs2_unlink_subtree(handle_t *handle,
        int i;
        struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
        struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
-       struct ocfs2_extent_list *el;
        struct ocfs2_extent_block *eb;
 
-       el = path_leaf_el(left_path);
-
        eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
 
        for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
@@ -3938,7 +3973,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
                                           struct ocfs2_path *path,
                                           struct ocfs2_extent_rec *insert_rec)
 {
-       int ret, i, next_free;
+       int i, next_free;
        struct buffer_head *bh;
        struct ocfs2_extent_list *el;
        struct ocfs2_extent_rec *rec;
@@ -3955,7 +3990,6 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
                                    "Owner %llu has a bad extent list\n",
                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
-                       ret = -EIO;
                        return;
                }
 
@@ -5057,7 +5091,6 @@ int ocfs2_split_extent(handle_t *handle,
        struct buffer_head *last_eb_bh = NULL;
        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
        struct ocfs2_merge_ctxt ctxt;
-       struct ocfs2_extent_list *rightmost_el;
 
        if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
            ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
@@ -5093,9 +5126,7 @@ int ocfs2_split_extent(handle_t *handle,
                }
 
                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-               rightmost_el = &eb->h_list;
-       } else
-               rightmost_el = path_root_el(path);
+       }
 
        if (rec->e_cpos == split_rec->e_cpos &&
            rec->e_leaf_clusters == split_rec->e_leaf_clusters)
@@ -6585,6 +6616,154 @@ ocfs2_find_per_slot_free_list(int type,
        return fl;
 }
 
+static struct ocfs2_per_slot_free_list *
+ocfs2_find_preferred_free_list(int type,
+                              int preferred_slot,
+                              int *real_slot,
+                              struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+       struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
+
+       while (fl) {
+               if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
+                       *real_slot = fl->f_slot;
+                       return fl;
+               }
+
+               fl = fl->f_next_suballocator;
+       }
+
+       /* If we can't find any free list matching preferred slot, just use
+        * the first one.
+        */
+       fl = ctxt->c_first_suballocator;
+       *real_slot = fl->f_slot;
+
+       return fl;
+}
+
+/* Return Value 1 indicates empty */
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
+{
+       struct ocfs2_per_slot_free_list *fl = NULL;
+
+       if (!et->et_dealloc)
+               return 1;
+
+       fl = et->et_dealloc->c_first_suballocator;
+       if (!fl)
+               return 1;
+
+       if (!fl->f_first)
+               return 1;
+
+       return 0;
+}
+
+/* If extents were deleted from the tree due to extent rotation and merging,
+ * and no metadata was reserved ahead of time, try to reuse some of the
+ * extents just deleted. This is only used to reuse extent blocks.
+ * It is supposed to find enough extent blocks in dealloc if our estimation
+ * of the metadata is accurate.
+ */
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+                                       struct ocfs2_extent_tree *et,
+                                       struct buffer_head **new_eb_bh,
+                                       int blk_wanted, int *blk_given)
+{
+       int i, status = 0, real_slot;
+       struct ocfs2_cached_dealloc_ctxt *dealloc;
+       struct ocfs2_per_slot_free_list *fl;
+       struct ocfs2_cached_block_free *bf;
+       struct ocfs2_extent_block *eb;
+       struct ocfs2_super *osb =
+               OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+
+       *blk_given = 0;
+
+       /* If the extent tree doesn't have a dealloc context, this is not an
+        * error. Just tell the upper caller that dealloc can't provide any
+        * blocks and that it should ask the allocator to claim more space.
+        */
+       dealloc = et->et_dealloc;
+       if (!dealloc)
+               goto bail;
+
+       for (i = 0; i < blk_wanted; i++) {
+               /* Prefer to use local slot */
+               fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
+                                                   osb->slot_num, &real_slot,
+                                                   dealloc);
+               /* If no more blocks can be reused, we should claim more
+                * from alloc. Just return here normally.
+                */
+               if (!fl) {
+                       status = 0;
+                       break;
+               }
+
+               bf = fl->f_first;
+               fl->f_first = bf->free_next;
+
+               new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
+               if (new_eb_bh[i] == NULL) {
+                       status = -ENOMEM;
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               mlog(0, "Reusing block(%llu) from "
+                    "dealloc(local slot:%d, real slot:%d)\n",
+                    bf->free_blk, osb->slot_num, real_slot);
+
+               ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
+
+               status = ocfs2_journal_access_eb(handle, et->et_ci,
+                                                new_eb_bh[i],
+                                                OCFS2_JOURNAL_ACCESS_CREATE);
+               if (status < 0) {
+                       mlog_errno(status);
+                       goto bail;
+               }
+
+               memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
+               eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
+
+               /* We can't guarantee that the buffer head is still cached, so
+                * populate the extent block again.
+                */
+               strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+               eb->h_blkno = cpu_to_le64(bf->free_blk);
+               eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+               eb->h_suballoc_slot = cpu_to_le16(real_slot);
+               eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
+               eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
+               eb->h_list.l_count =
+                       cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+               /* The buffer will also be dirtied by the caller, so
+                * this isn't absolutely necessary.
+                */
+               ocfs2_journal_dirty(handle, new_eb_bh[i]);
+
+               if (!fl->f_first) {
+                       dealloc->c_first_suballocator = fl->f_next_suballocator;
+                       kfree(fl);
+               }
+               kfree(bf);
+       }
+
+       *blk_given = i;
+
+bail:
+       if (unlikely(status < 0)) {
+               for (i = 0; i < blk_wanted; i++)
+                       brelse(new_eb_bh[i]);
+       }
+
+       return status;
+}
+
 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
                              int type, int slot, u64 suballoc,
                              u64 blkno, unsigned int bit)
@@ -7382,6 +7561,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
        struct buffer_head *gd_bh = NULL;
        struct ocfs2_dinode *main_bm;
        struct ocfs2_group_desc *gd = NULL;
+       struct ocfs2_trim_fs_info info, *pinfo = NULL;
 
        start = range->start >> osb->s_clustersize_bits;
        len = range->len >> osb->s_clustersize_bits;
@@ -7419,6 +7599,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 
        trace_ocfs2_trim_fs(start, len, minlen);
 
+       ocfs2_trim_fs_lock_res_init(osb);
+       ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+       if (ret < 0) {
+               if (ret != -EAGAIN) {
+                       mlog_errno(ret);
+                       ocfs2_trim_fs_lock_res_uninit(osb);
+                       goto out_unlock;
+               }
+
+               mlog(ML_NOTICE, "Waiting for a trim running from another "
+                    "node on device (%s) to finish.\n",
+                    osb->dev_str);
+               ret = ocfs2_trim_fs_lock(osb, &info, 0);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       ocfs2_trim_fs_lock_res_uninit(osb);
+                       goto out_unlock;
+               }
+
+               if (info.tf_valid && info.tf_success &&
+                   info.tf_start == start && info.tf_len == len &&
+                   info.tf_minlen == minlen) {
+                       /* Avoid sending duplicated trim to a shared device */
+                       mlog(ML_NOTICE, "The same trim on device (%s) was "
+                            "just done from node (%u), return.\n",
+                            osb->dev_str, info.tf_nodenum);
+                       range->len = info.tf_trimlen;
+                       goto out_trimunlock;
+               }
+       }
+
+       info.tf_nodenum = osb->node_num;
+       info.tf_start = start;
+       info.tf_len = len;
+       info.tf_minlen = minlen;
+
        /* Determine first and last group to examine based on start and len */
        first_group = ocfs2_which_cluster_group(main_bm_inode, start);
        if (first_group == osb->first_cluster_group_blkno)
@@ -7463,6 +7679,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
                        group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
        }
        range->len = trimmed * sb->s_blocksize;
+
+       info.tf_trimlen = range->len;
+       info.tf_success = (ret ? 0 : 1);
+       pinfo = &info;
+out_trimunlock:
+       ocfs2_trim_fs_unlock(osb, pinfo);
+       ocfs2_trim_fs_lock_res_uninit(osb);
 out_unlock:
        ocfs2_inode_unlock(main_bm_inode, 0);
        brelse(main_bm_bh);
index 27b75cf..250bcac 100644 (file)
@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
        ocfs2_journal_access_func               et_root_journal_access;
        void                                    *et_object;
        unsigned int                            et_max_leaf_clusters;
+       struct ocfs2_cached_dealloc_ctxt        *et_dealloc;
 };
 
 /*
index d151632..e8e205b 100644 (file)
@@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
        struct ocfs2_cached_dealloc_ctxt w_dealloc;
 
        struct list_head                w_unwritten_list;
+       unsigned int                    w_unwritten_count;
 };
 
 void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1386,6 +1387,7 @@ retry:
        desc->c_clear_unwritten = 0;
        list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
        list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+       wc->w_unwritten_count++;
        new = NULL;
 unlock:
        spin_unlock(&oi->ip_lock);
@@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
                ue->ue_phys = desc->c_phys;
 
                list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
-               dwc->dw_zero_count++;
+               dwc->dw_zero_count += wc->w_unwritten_count;
        }
 
        ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
@@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
 
        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 
+       /* Attach the dealloc context to the extent tree in case we can reuse
+        * extents which were already unlinked from the current extent tree
+        * due to extent rotation and merging.
+        */
+       et.et_dealloc = &dealloc;
+
        ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
                                    &data_ac, &meta_ac);
        if (ret) {
index 62e8ec6..af2e747 100644 (file)
@@ -314,12 +314,13 @@ void o2quo_conn_err(u8 node)
                                node, qs->qs_connected);
 
                clear_bit(node, qs->qs_conn_bm);
+
+               if (test_bit(node, qs->qs_hb_bm))
+                       o2quo_set_hold(qs, node);
        }
 
        mlog(0, "node %u, %d total\n", node, qs->qs_connected);
 
-       if (test_bit(node, qs->qs_hb_bm))
-               o2quo_set_hold(qs, node);
 
        spin_unlock(&qs->qs_lock);
 }
index b95e7df..0276f7f 100644 (file)
@@ -196,7 +196,7 @@ struct o2net_msg_handler {
        u32                     nh_msg_type;
        u32                     nh_key;
        o2net_msg_handler_func  *nh_func;
-       o2net_msg_handler_func  *nh_func_data;
+       void                    *nh_func_data;
        o2net_post_msg_handler_func
                                *nh_post_func;
        struct kref             nh_kref;
index 32f9c72..b7520e2 100644 (file)
@@ -1958,7 +1958,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx)
 
        trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-       error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
+       error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1);
        if (lock_level && error >= 0) {
                /* We release EX lock which used to update atime
                 * and get PR lock again to reduce contention
index 9c3e0f1..a7df226 100644 (file)
@@ -1122,13 +1122,6 @@ recheck:
        /* sleep if we haven't finished voting yet */
        if (sleep) {
                unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
-
-               /*
-               if (kref_read(&mle->mle_refs) < 2)
-                       mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
-                       kref_read(&mle->mle_refs),
-                       res->lockname.len, res->lockname.name);
-               */
                atomic_set(&mle->woken, 0);
                (void)wait_event_timeout(mle->wq,
                                         (atomic_read(&mle->woken) == 1),
index 4689940..9479f99 100644 (file)
@@ -259,6 +259,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
        .flags          = 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = {
+       .flags          = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
        .flags          = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
 };
@@ -676,6 +680,24 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
                                   &ocfs2_nfs_sync_lops, osb);
 }
 
+void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
+{
+       struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+       ocfs2_lock_res_init_once(lockres);
+       ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
+       ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
+                                  &ocfs2_trim_fs_lops, osb);
+}
+
+void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
+{
+       struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+       ocfs2_simple_drop_lockres(osb, lockres);
+       ocfs2_lock_res_free(lockres);
+}
+
 static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
                                            struct ocfs2_super *osb)
 {
@@ -1742,6 +1764,27 @@ int ocfs2_rw_lock(struct inode *inode, int write)
        return status;
 }
 
+int ocfs2_try_rw_lock(struct inode *inode, int write)
+{
+       int status, level;
+       struct ocfs2_lock_res *lockres;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       mlog(0, "inode %llu trying to take %s RW lock\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+            write ? "EXMODE" : "PRMODE");
+
+       if (ocfs2_mount_local(osb))
+               return 0;
+
+       lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+       level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+       status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
+       return status;
+}
+
 void ocfs2_rw_unlock(struct inode *inode, int write)
 {
        int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
@@ -2486,6 +2529,15 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
        ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
        if (ret == -EAGAIN) {
                unlock_page(page);
+               /*
+                * If we can't get the inode lock immediately, we should not
+                * return directly here, since that would lead to a softlockup.
+                * Instead, take a blocking lock and immediately unlock it
+                * before returning; this avoids wasting CPU on lots of
+                * retries and improves fairness in acquiring the lock.
+                */
+               if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
+                       ocfs2_inode_unlock(inode, ex);
                ret = AOP_TRUNCATED_PAGE;
        }
 
@@ -2494,13 +2546,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
 
 int ocfs2_inode_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
-                         int *level)
+                         int *level, int wait)
 {
        int ret;
 
-       ret = ocfs2_inode_lock(inode, NULL, 0);
+       if (wait)
+               ret = ocfs2_inode_lock(inode, NULL, 0);
+       else
+               ret = ocfs2_try_inode_lock(inode, NULL, 0);
+
        if (ret < 0) {
-               mlog_errno(ret);
+               if (ret != -EAGAIN)
+                       mlog_errno(ret);
                return ret;
        }
 
@@ -2512,9 +2569,14 @@ int ocfs2_inode_lock_atime(struct inode *inode,
                struct buffer_head *bh = NULL;
 
                ocfs2_inode_unlock(inode, 0);
-               ret = ocfs2_inode_lock(inode, &bh, 1);
+               if (wait)
+                       ret = ocfs2_inode_lock(inode, &bh, 1);
+               else
+                       ret = ocfs2_try_inode_lock(inode, &bh, 1);
+
                if (ret < 0) {
-                       mlog_errno(ret);
+                       if (ret != -EAGAIN)
+                               mlog_errno(ret);
                        return ret;
                }
                *level = 1;
@@ -2745,6 +2807,70 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
                                     ex ? LKM_EXMODE : LKM_PRMODE);
 }
 
+int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+                      struct ocfs2_trim_fs_info *info, int trylock)
+{
+       int status;
+       struct ocfs2_trim_fs_lvb *lvb;
+       struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+       if (info)
+               info->tf_valid = 0;
+
+       if (ocfs2_is_hard_readonly(osb))
+               return -EROFS;
+
+       if (ocfs2_mount_local(osb))
+               return 0;
+
+       status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX,
+                                   trylock ? DLM_LKF_NOQUEUE : 0, 0);
+       if (status < 0) {
+               if (status != -EAGAIN)
+                       mlog_errno(status);
+               return status;
+       }
+
+       if (info) {
+               lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+               if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+                   lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) {
+                       info->tf_valid = 1;
+                       info->tf_success = lvb->lvb_success;
+                       info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum);
+                       info->tf_start = be64_to_cpu(lvb->lvb_start);
+                       info->tf_len = be64_to_cpu(lvb->lvb_len);
+                       info->tf_minlen = be64_to_cpu(lvb->lvb_minlen);
+                       info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen);
+               }
+       }
+
+       return status;
+}
+
+void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
+                         struct ocfs2_trim_fs_info *info)
+{
+       struct ocfs2_trim_fs_lvb *lvb;
+       struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+       if (ocfs2_mount_local(osb))
+               return;
+
+       if (info) {
+               lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+               lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION;
+               lvb->lvb_success = info->tf_success;
+               lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum);
+               lvb->lvb_start = cpu_to_be64(info->tf_start);
+               lvb->lvb_len = cpu_to_be64(info->tf_len);
+               lvb->lvb_minlen = cpu_to_be64(info->tf_minlen);
+               lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen);
+       }
+
+       ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
+}
+
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
        int ret;
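
The two new helpers above implement a cluster-wide trim_fs lock whose LVB caches the parameters and outcome of the last fstrim, so other nodes can skip a trim that has already been done. A minimal caller sketch, loosely modeled on an fstrim entry point; 'osb', the struct fstrim_range 'range' and the label are illustrative assumptions, not the actual ocfs2 fitrim code:

	struct ocfs2_trim_fs_info info;
	int ret;

	/* Trylock: -EAGAIN means another node holds the lock and is
	 * presumably trimming the filesystem right now. */
	ret = ocfs2_trim_fs_lock(osb, &info, 1);
	if (ret < 0)
		return (ret == -EAGAIN) ? 0 : ret;

	if (info.tf_valid && info.tf_success &&
	    info.tf_start == range->start && info.tf_len == range->len &&
	    info.tf_minlen == range->minlen) {
		/* Another node already trimmed this exact range; reuse
		 * the result cached in the LVB instead of trimming again. */
		range->len = info.tf_trimlen;
		goto out_unlock;
	}

	/* ... perform the actual trim here ... */

	/* Publish the result for the other nodes via the LVB. */
	info.tf_valid = 1;
	info.tf_success = 1;
	info.tf_nodenum = osb->node_num;
	info.tf_start = range->start;
	info.tf_len = range->len;
	info.tf_minlen = range->minlen;
	info.tf_trimlen = range->len;	/* bytes actually trimmed */

out_unlock:
	ocfs2_trim_fs_unlock(osb, &info);
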
index a7fc18b..256e0a9 100644 (file)
@@ -70,6 +70,29 @@ struct ocfs2_orphan_scan_lvb {
        __be32  lvb_os_seqno;
 };
 
+#define OCFS2_TRIMFS_LVB_VERSION 1
+
+struct ocfs2_trim_fs_lvb {
+       __u8    lvb_version;
+       __u8    lvb_success;
+       __u8    lvb_reserved[2];
+       __be32  lvb_nodenum;
+       __be64  lvb_start;
+       __be64  lvb_len;
+       __be64  lvb_minlen;
+       __be64  lvb_trimlen;
+};
+
+struct ocfs2_trim_fs_info {
+       u8      tf_valid;       /* lvb is valid, or not */
+       u8      tf_success;     /* trim is successful, or not */
+       u32     tf_nodenum;     /* osb node number */
+       u64     tf_start;       /* trim start offset in clusters */
+       u64     tf_len;         /* trim end offset in clusters */
+       u64     tf_minlen;      /* trim minimum contiguous free clusters */
+       u64     tf_trimlen;     /* trimmed length in bytes */
+};
+
 struct ocfs2_lock_holder {
        struct list_head oh_list;
        struct pid *oh_owner_pid;
@@ -116,13 +139,14 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
 int ocfs2_rw_lock(struct inode *inode, int write);
+int ocfs2_try_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
 int ocfs2_open_lock(struct inode *inode);
 int ocfs2_try_open_lock(struct inode *inode, int write);
 void ocfs2_open_unlock(struct inode *inode);
 int ocfs2_inode_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
-                         int *level);
+                         int *level, int wait);
 int ocfs2_inode_lock_full_nested(struct inode *inode,
                         struct buffer_head **ret_bh,
                         int ex,
@@ -140,6 +164,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
 /* 99% of the time we don't want to supply any additional flags --
  * those are for very specific cases only. */
 #define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
+#define ocfs2_try_inode_lock(i, b, e)\
+               ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\
+               OI_LS_NORMAL)
 void ocfs2_inode_unlock(struct inode *inode,
                       int ex);
 int ocfs2_super_lock(struct ocfs2_super *osb,
@@ -153,6 +180,12 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
 int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
 void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
+void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb);
+void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb);
+int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+                      struct ocfs2_trim_fs_info *info, int trylock);
+void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
+                         struct ocfs2_trim_fs_info *info);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
index e4719e0..06cb964 100644 (file)
@@ -38,6 +38,7 @@
 #include "inode.h"
 #include "super.h"
 #include "symlink.h"
+#include "aops.h"
 #include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
@@ -832,6 +833,50 @@ out:
        return ret;
 }
 
+/* Is IO overwriting allocated blocks? */
+int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh,
+                      u64 map_start, u64 map_len)
+{
+       int ret = 0, is_last;
+       u32 mapping_end, cpos;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_extent_rec rec;
+
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len))
+                       return ret;
+               else
+                       return -EAGAIN;
+       }
+
+       cpos = map_start >> osb->s_clustersize_bits;
+       mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+                                              map_start + map_len);
+       is_last = 0;
+       while (cpos < mapping_end && !is_last) {
+               ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+                                                NULL, &rec, &is_last);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               if (rec.e_blkno == 0ULL)
+                       break;
+
+               if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
+                       break;
+
+               cpos = le32_to_cpu(rec.e_cpos) +
+                       le16_to_cpu(rec.e_leaf_clusters);
+       }
+
+       if (cpos < mapping_end)
+               ret = -EAGAIN;
+out:
+       return ret;
+}
+
 int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
 {
        struct inode *inode = file->f_mapping->host;
index 67ea57d..1057586 100644 (file)
@@ -53,6 +53,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 u64 map_start, u64 map_len);
 
+int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh,
+                      u64 map_start, u64 map_len);
+
 int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
 
 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
index a1d0510..5d1784a 100644 (file)
@@ -140,6 +140,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
                spin_unlock(&oi->ip_lock);
        }
 
+       file->f_mode |= FMODE_NOWAIT;
+
 leave:
        return status;
 }
@@ -2132,12 +2134,12 @@ out:
 }
 
 static int ocfs2_prepare_inode_for_write(struct file *file,
-                                        loff_t pos,
-                                        size_t count)
+                                        loff_t pos, size_t count, int wait)
 {
-       int ret = 0, meta_level = 0;
+       int ret = 0, meta_level = 0, overwrite_io = 0;
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = d_inode(dentry);
+       struct buffer_head *di_bh = NULL;
        loff_t end;
 
        /*
@@ -2145,13 +2147,40 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
         * if we need to make modifications here.
         */
        for(;;) {
-               ret = ocfs2_inode_lock(inode, NULL, meta_level);
+               if (wait)
+                       ret = ocfs2_inode_lock(inode, NULL, meta_level);
+               else
+                       ret = ocfs2_try_inode_lock(inode,
+                               overwrite_io ? NULL : &di_bh, meta_level);
                if (ret < 0) {
                        meta_level = -1;
-                       mlog_errno(ret);
+                       if (ret != -EAGAIN)
+                               mlog_errno(ret);
                        goto out;
                }
 
+               /*
+                * Check whether the IO will overwrite allocated blocks when
+                * the IOCB_NOWAIT flag is set.
+                */
+               if (!wait && !overwrite_io) {
+                       overwrite_io = 1;
+                       if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
+                               ret = -EAGAIN;
+                               goto out_unlock;
+                       }
+
+                       ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
+                       brelse(di_bh);
+                       di_bh = NULL;
+                       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+                       if (ret < 0) {
+                               if (ret != -EAGAIN)
+                                       mlog_errno(ret);
+                               goto out_unlock;
+                       }
+               }
+
                /* Clear suid / sgid if necessary. We do this here
                 * instead of later in the write path because
                 * remove_suid() calls ->setattr without any hint that
@@ -2199,7 +2228,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 
 out_unlock:
        trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
-                                           pos, count);
+                                           pos, count, wait);
+
+       brelse(di_bh);
 
        if (meta_level >= 0)
                ocfs2_inode_unlock(inode, meta_level);
@@ -2211,7 +2242,7 @@ out:
 static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                                    struct iov_iter *from)
 {
-       int direct_io, rw_level;
+       int rw_level;
        ssize_t written = 0;
        ssize_t ret;
        size_t count = iov_iter_count(from);
@@ -2223,6 +2254,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
        void *saved_ki_complete = NULL;
        int append_write = ((iocb->ki_pos + count) >=
                        i_size_read(inode) ? 1 : 0);
+       int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
+       int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
 
        trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
                (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2230,12 +2263,17 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                file->f_path.dentry->d_name.name,
                (unsigned int)from->nr_segs);   /* GRRRRR */
 
+       if (!direct_io && nowait)
+               return -EOPNOTSUPP;
+
        if (count == 0)
                return 0;
 
-       direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
-
-       inode_lock(inode);
+       if (nowait) {
+               if (!inode_trylock(inode))
+                       return -EAGAIN;
+       } else
+               inode_lock(inode);
 
        /*
         * Concurrent O_DIRECT writes are allowed with
@@ -2244,9 +2282,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
         */
        rw_level = (!direct_io || full_coherency || append_write);
 
-       ret = ocfs2_rw_lock(inode, rw_level);
+       if (nowait)
+               ret = ocfs2_try_rw_lock(inode, rw_level);
+       else
+               ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
-               mlog_errno(ret);
+               if (ret != -EAGAIN)
+                       mlog_errno(ret);
                goto out_mutex;
        }
 
@@ -2260,9 +2302,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                 * other nodes to drop their caches.  Buffered I/O
                 * already does this in write_begin().
                 */
-               ret = ocfs2_inode_lock(inode, NULL, 1);
+               if (nowait)
+                       ret = ocfs2_try_inode_lock(inode, NULL, 1);
+               else
+                       ret = ocfs2_inode_lock(inode, NULL, 1);
                if (ret < 0) {
-                       mlog_errno(ret);
+                       if (ret != -EAGAIN)
+                               mlog_errno(ret);
                        goto out;
                }
 
@@ -2277,9 +2323,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
        }
        count = ret;
 
-       ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
+       ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
        if (ret < 0) {
-               mlog_errno(ret);
+               if (ret != -EAGAIN)
+                       mlog_errno(ret);
                goto out;
        }
 
@@ -2355,6 +2402,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
        int ret = 0, rw_level = -1, lock_level = 0;
        struct file *filp = iocb->ki_filp;
        struct inode *inode = file_inode(filp);
+       int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
+       int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
 
        trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2369,14 +2418,22 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
                goto bail;
        }
 
+       if (!direct_io && nowait)
+               return -EOPNOTSUPP;
+
        /*
         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
         * need locks to protect pending reads from racing with truncate.
         */
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               ret = ocfs2_rw_lock(inode, 0);
+       if (direct_io) {
+               if (nowait)
+                       ret = ocfs2_try_rw_lock(inode, 0);
+               else
+                       ret = ocfs2_rw_lock(inode, 0);
+
                if (ret < 0) {
-                       mlog_errno(ret);
+                       if (ret != -EAGAIN)
+                               mlog_errno(ret);
                        goto bail;
                }
                rw_level = 0;
@@ -2393,9 +2450,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
         * like i_size. This allows the checks down below
         * generic_file_aio_read() a chance of actually working.
         */
-       ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
+       ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
+                                    !nowait);
        if (ret < 0) {
-               mlog_errno(ret);
+               if (ret != -EAGAIN)
+                       mlog_errno(ret);
                goto bail;
        }
        ocfs2_inode_unlock(inode, lock_level);
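
With FMODE_NOWAIT set in ocfs2_file_open() and the trylock paths above, an O_DIRECT writer can request non-blocking submission from user space and gets -EAGAIN instead of sleeping on cluster locks (and -EOPNOTSUPP for buffered I/O). A small user-space sketch, assuming a kernel and glibc recent enough to provide pwritev2() and RWF_NOWAIT:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	void *buf;
	struct iovec iov;
	int fd;
	ssize_t ret;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file on ocfs2>\n", argv[0]);
		return 1;
	}

	/* O_DIRECT usually wants sector/page aligned buffers. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 'x', 4096);
	iov.iov_base = buf;
	iov.iov_len = 4096;

	fd = open(argv[1], O_WRONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* RWF_NOWAIT: fail with EAGAIN rather than blocking on locks. */
	ret = pwritev2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (ret < 0 && errno == EAGAIN)
		printf("write would block, try again later\n");
	else if (ret < 0)
		perror("pwritev2");
	else
		printf("wrote %zd bytes without blocking\n", ret);

	close(fd);
	free(buf);
	return 0;
}
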
index 3630443..e5dcea6 100644 (file)
@@ -666,23 +666,24 @@ static int __ocfs2_journal_access(handle_t *handle,
        /* we can safely remove this assertion after testing. */
        if (!buffer_uptodate(bh)) {
                mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
-               mlog(ML_ERROR, "b_blocknr=%llu\n",
-                    (unsigned long long)bh->b_blocknr);
+               mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n",
+                    (unsigned long long)bh->b_blocknr, bh->b_state);
 
                lock_buffer(bh);
                /*
-                * A previous attempt to write this buffer head failed.
-                * Nothing we can do but to retry the write and hope for
-                * the best.
+                * A previous transaction with a couple of buffer heads failed
+                * to checkpoint, so all of its bhs are marked as BH_Write_EIO.
+                * For the current transaction, this bh is just one of the
+                * error bhs that the previous transaction handled. We can't
+                * simply clear its BH_Write_EIO and reuse it, since the other
+                * bhs have not been written to disk yet and that would cause
+                * metadata inconsistency. So set the fs read-only to avoid
+                * further damage.
                 */
                if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
-                       clear_buffer_write_io_error(bh);
-                       set_buffer_uptodate(bh);
-               }
-
-               if (!buffer_uptodate(bh)) {
                        unlock_buffer(bh);
-                       return -EIO;
+                       return ocfs2_error(osb->sb, "A previous attempt to "
+                                       "write this buffer head failed\n");
                }
                unlock_buffer(bh);
        }
index 098f5c7..fb9a20e 100644 (file)
@@ -184,7 +184,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
        int ret = 0, lock_level = 0;
 
        ret = ocfs2_inode_lock_atime(file_inode(file),
-                                   file->f_path.mnt, &lock_level);
+                                   file->f_path.mnt, &lock_level, 1);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
index 9a50f22..6867eef 100644 (file)
@@ -404,6 +404,7 @@ struct ocfs2_super
        struct ocfs2_lock_res osb_super_lockres;
        struct ocfs2_lock_res osb_rename_lockres;
        struct ocfs2_lock_res osb_nfs_sync_lockres;
+       struct ocfs2_lock_res osb_trim_fs_lockres;
        struct ocfs2_dlm_debug *osb_dlm_debug;
 
        struct dentry *osb_debug_root;
index d277aab..7051b99 100644 (file)
@@ -50,6 +50,7 @@ enum ocfs2_lock_type {
        OCFS2_LOCK_TYPE_NFS_SYNC,
        OCFS2_LOCK_TYPE_ORPHAN_SCAN,
        OCFS2_LOCK_TYPE_REFCOUNT,
+       OCFS2_LOCK_TYPE_TRIM_FS,
        OCFS2_NUM_LOCK_TYPES
 };
 
@@ -93,6 +94,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
                case OCFS2_LOCK_TYPE_REFCOUNT:
                        c = 'T';
                        break;
+               case OCFS2_LOCK_TYPE_TRIM_FS:
+                       c = 'I';
+                       break;
                default:
                        c = '\0';
        }
@@ -115,6 +119,7 @@ static char *ocfs2_lock_type_strings[] = {
        [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
        [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
        [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
+       [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
index a0b5d00..e2a11aa 100644 (file)
@@ -1449,20 +1449,22 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
 
 TRACE_EVENT(ocfs2_prepare_inode_for_write,
        TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
-                unsigned long count),
-       TP_ARGS(ino, saved_pos, count),
+                unsigned long count, int wait),
+       TP_ARGS(ino, saved_pos, count, wait),
        TP_STRUCT__entry(
                __field(unsigned long long, ino)
                __field(unsigned long long, saved_pos)
                __field(unsigned long, count)
+               __field(int, wait)
        ),
        TP_fast_assign(
                __entry->ino = ino;
                __entry->saved_pos = saved_pos;
                __entry->count = count;
+               __entry->wait = wait;
        ),
-       TP_printk("%llu %llu %lu", __entry->ino,
-                 __entry->saved_pos, __entry->count)
+       TP_printk("%llu %llu %lu %d", __entry->ino,
+                 __entry->saved_pos, __entry->count, __entry->wait)
 );
 
 DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
index 9f0b95a..d8f5f6c 100644 (file)
@@ -2438,6 +2438,8 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
        }
        le16_add_cpu(&bg->bg_free_bits_count, num_bits);
        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+               if (undo_fn)
+                       jbd_unlock_bh_state(group_bh);
                return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
                                   (unsigned long long)le64_to_cpu(bg->bg_blkno),
                                   le16_to_cpu(bg->bg_bits),
@@ -2563,16 +2565,16 @@ static int _ocfs2_free_clusters(handle_t *handle,
        int status;
        u16 bg_start_bit;
        u64 bg_blkno;
-       struct ocfs2_dinode *fe;
 
        /* You can't ever have a contiguous set of clusters
         * bigger than a block group bitmap so we never have to worry
         * about looping on them.
         * This is expensive. We can safely remove once this stuff has
         * gotten tested really well. */
-       BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
+       BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
+                               ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
+                                                        start_blk)));
 
-       fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
 
        ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
                                     &bg_start_bit);
index 80efa56..ffa4952 100644 (file)
@@ -474,9 +474,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
                new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
                if (!new) {
                        ocfs2_release_system_inodes(osb);
-                       status = -EINVAL;
+                       status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL;
                        mlog_errno(status);
-                       /* FIXME: Should ERROR_RO_FS */
                        mlog(ML_ERROR, "Unable to load system inode %d, "
                             "possibly corrupt fs?", i);
                        goto bail;
@@ -505,7 +504,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
                new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
                if (!new) {
                        ocfs2_release_system_inodes(osb);
-                       status = -EINVAL;
+                       status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL;
                        mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
                             status, i, osb->slot_num);
                        goto bail;
@@ -1208,14 +1207,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 read_super_error:
        brelse(bh);
 
+       if (status)
+               mlog_errno(status);
+
        if (osb) {
                atomic_set(&osb->vol_state, VOLUME_DISABLED);
                wake_up(&osb->osb_mount_event);
                ocfs2_dismount_volume(sb, 1);
        }
 
-       if (status)
-               mlog_errno(status);
        return status;
 }
 
@@ -1843,6 +1843,9 @@ static int ocfs2_mount_volume(struct super_block *sb)
        status = ocfs2_dlm_init(osb);
        if (status < 0) {
                mlog_errno(status);
+               if (status == -EBADR && ocfs2_userspace_stack(osb))
+                       mlog(ML_ERROR, "couldn't mount because cluster name on"
+                       " disk does not match the running cluster name.\n");
                goto leave;
        }
 
index c5898c5..c261c1d 100644 (file)
@@ -638,14 +638,17 @@ int ocfs2_calc_xattr_init(struct inode *dir,
                                                     si->value_len);
 
        if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+               down_read(&OCFS2_I(dir)->ip_xattr_sem);
                acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
                                        OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
                                        "", NULL, 0);
+               up_read(&OCFS2_I(dir)->ip_xattr_sem);
                if (acl_len > 0) {
                        a_size = ocfs2_xattr_entry_real_size(0, acl_len);
                        if (S_ISDIR(mode))
                                a_size <<= 1;
                } else if (acl_len != 0 && acl_len != -ENODATA) {
+                       ret = acl_len;
                        mlog_errno(ret);
                        return ret;
                }
@@ -6415,7 +6418,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
                 * and then insert the extents one by one.
                 */
                if (xv->xr_list.l_tree_depth) {
-                       memcpy(new_xv, &def_xv, sizeof(def_xv));
+                       memcpy(new_xv, &def_xv, OCFS2_XATTR_ROOT_SIZE);
                        vb->vb_xv = new_xv;
                        vb->vb_bh = value_bh;
                        ocfs2_init_xattr_value_extent_tree(&data_et,
index 339e4c1..ec6d298 100644 (file)
@@ -47,8 +47,11 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
        if (hiwater_rss < mm->hiwater_rss)
                hiwater_rss = mm->hiwater_rss;
 
-       text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
-       lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+       /* split executable areas between text and lib */
+       text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
+       text = min(text, mm->exec_vm << PAGE_SHIFT);
+       lib = (mm->exec_vm << PAGE_SHIFT) - text;
+
        swap = get_mm_counter(mm, MM_SWAPENTS);
        seq_printf(m,
                "VmPeak:\t%8lu kB\n"
@@ -76,7 +79,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                file << (PAGE_SHIFT-10),
                shmem << (PAGE_SHIFT-10),
                mm->data_vm << (PAGE_SHIFT-10),
-               mm->stack_vm << (PAGE_SHIFT-10), text, lib,
+               mm->stack_vm << (PAGE_SHIFT-10),
+               text >> 10,
+               lib >> 10,
                mm_pgtables_bytes(mm) >> 10,
                swap << (PAGE_SHIFT-10));
        hugetlb_report_usage(m, mm);
@@ -977,14 +982,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
                unsigned long addr, pmd_t *pmdp)
 {
-       pmd_t pmd = *pmdp;
+       pmd_t old, pmd = *pmdp;
 
        if (pmd_present(pmd)) {
                /* See comment in change_huge_pmd() */
-               pmdp_invalidate(vma, addr, pmdp);
-               if (pmd_dirty(*pmdp))
+               old = pmdp_invalidate(vma, addr, pmdp);
+               if (pmd_dirty(old))
                        pmd = pmd_mkdirty(pmd);
-               if (pmd_young(*pmdp))
+               if (pmd_young(old))
                        pmd = pmd_mkyoung(pmd);
 
                pmd = pmd_wrprotect(pmd);
index 743eaa6..87a13a7 100644 (file)
@@ -294,10 +294,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
         * pmd_trans_unstable) of the pmd.
         */
        _pmd = READ_ONCE(*pmd);
-       if (!pmd_present(_pmd))
+       if (pmd_none(_pmd))
                goto out;
 
        ret = false;
+       if (!pmd_present(_pmd))
+               goto out;
+
        if (pmd_trans_huge(_pmd))
                goto out;
 
@@ -985,24 +988,14 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
                                  struct uffd_msg *msg)
 {
        int fd;
-       struct file *file;
-       unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
 
-       fd = get_unused_fd_flags(flags);
+       fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
+                             O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
        if (fd < 0)
                return fd;
 
-       file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
-                                 O_RDWR | flags);
-       if (IS_ERR(file)) {
-               put_unused_fd(fd);
-               return PTR_ERR(file);
-       }
-
-       fd_install(fd, file);
        msg->arg.reserved.reserved1 = 0;
        msg->arg.fork.ufd = fd;
-
        return 0;
 }
 
@@ -1884,24 +1877,10 @@ static void init_once_userfaultfd_ctx(void *mem)
        seqcount_init(&ctx->refile_seq);
 }
 
-/**
- * userfaultfd_file_create - Creates a userfaultfd file pointer.
- * @flags: Flags for the userfaultfd file.
- *
- * This function creates a userfaultfd file pointer, w/out installing
- * it into the fd table. This is useful when the userfaultfd file is
- * used during the initialization of data structures that require
- * extra setup after the userfaultfd creation. So the userfaultfd
- * creation is split into the file pointer creation phase, and the
- * file descriptor installation phase.  In this way races with
- * userspace closing the newly installed file descriptor can be
- * avoided.  Returns a userfaultfd file pointer, or a proper error
- * pointer.
- */
-static struct file *userfaultfd_file_create(int flags)
+SYSCALL_DEFINE1(userfaultfd, int, flags)
 {
-       struct file *file;
        struct userfaultfd_ctx *ctx;
+       int fd;
 
        BUG_ON(!current->mm);
 
@@ -1909,14 +1888,12 @@ static struct file *userfaultfd_file_create(int flags)
        BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
 
-       file = ERR_PTR(-EINVAL);
        if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
-               goto out;
+               return -EINVAL;
 
-       file = ERR_PTR(-ENOMEM);
        ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
        if (!ctx)
-               goto out;
+               return -ENOMEM;
 
        atomic_set(&ctx->refcount, 1);
        ctx->flags = flags;
@@ -1927,39 +1904,13 @@ static struct file *userfaultfd_file_create(int flags)
        /* prevent the mm struct to be freed */
        mmgrab(ctx->mm);
 
-       file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
-                                 O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
-       if (IS_ERR(file)) {
+       fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
+                             O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+       if (fd < 0) {
                mmdrop(ctx->mm);
                kmem_cache_free(userfaultfd_ctx_cachep, ctx);
        }
-out:
-       return file;
-}
-
-SYSCALL_DEFINE1(userfaultfd, int, flags)
-{
-       int fd, error;
-       struct file *file;
-
-       error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
-       if (error < 0)
-               return error;
-       fd = error;
-
-       file = userfaultfd_file_create(flags);
-       if (IS_ERR(file)) {
-               error = PTR_ERR(file);
-               goto err_put_unused_fd;
-       }
-       fd_install(fd, file);
-
        return fd;
-
-err_put_unused_fd:
-       put_unused_fd(fd);
-
-       return error;
 }
 
 static int __init userfaultfd_init(void)
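
User space still reaches the converted path above through the userfaultfd() syscall. A minimal sketch of creating and handshaking an fd, assuming an architecture that defines __NR_userfaultfd and the uapi <linux/userfaultfd.h>; registering ranges and handling faults is left out:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
	int ufd;

	/* UFFD_CLOEXEC/UFFD_NONBLOCK are defined to equal the O_* flags,
	 * as the BUILD_BUG_ONs in the syscall above assert. */
	ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (ufd < 0) {
		perror("userfaultfd");
		return 1;
	}

	/* API handshake must happen before any UFFDIO_REGISTER. */
	if (ioctl(ufd, UFFDIO_API, &api) < 0) {
		perror("UFFDIO_API");
		return 1;
	}

	printf("userfaultfd ready, features=0x%llx\n",
	       (unsigned long long)api.features);
	close(ufd);
	return 0;
}
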
index 868e685..2cfa307 100644 (file)
@@ -309,19 +309,26 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_INVALIDATE
-extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
-                           pmd_t *pmdp);
-#endif
-
-#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
-static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
-                                          unsigned long address, pmd_t *pmdp)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * This is an implementation of pmdp_establish() that is only suitable for an
+ * architecture that doesn't have hardware dirty/accessed bits. In this case we
+ * can't race with a CPU which sets these bits, and a non-atomic approach is fine.
+ */
+static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmdp, pmd_t pmd)
 {
-
+       pmd_t old_pmd = *pmdp;
+       set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+       return old_pmd;
 }
 #endif
 
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+                           pmd_t *pmdp);
+#endif
+
 #ifndef __HAVE_ARCH_PTE_SAME
 static inline int pte_same(pte_t pte_a, pte_t pte_b)
 {
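
Two related changes meet in this hunk: pmdp_invalidate() now returns the old pmd, so callers read the dirty/accessed bits from the return value instead of racily re-reading *pmdp (as the clear_soft_dirty_pmd() hunk earlier already does), and an architecture without hardware dirty/accessed bits can reuse the generic non-atomic helper for pmdp_establish(). A sketch of how such an architecture opts in (the header path is illustrative; arm's pgtable-3level.h in this series does exactly this):

/* arch/<arch>/include/asm/pgtable.h, for an MMU whose dirty/accessed
 * bits are software-managed: the generic helper is sufficient. */
#define pmdp_establish generic_pmdp_establish
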
index 82a2588..36fa6a2 100644 (file)
@@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                long freed);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
 void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
@@ -129,7 +130,6 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
 
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
 
-extern int hugepages_treat_as_movable;
 extern int sysctl_hugetlb_shm_group;
 extern struct list_head huge_boot_pages;
 
@@ -158,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot);
 
 bool is_hugetlb_entry_migration(pte_t pte);
+
 #else /* !CONFIG_HUGETLB_PAGE */
 
 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
@@ -198,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
        return false;
 }
 #define putback_active_hugepage(p)     do {} while (0)
+#define move_hugetlb_state(old, new, reason)   do {} while (0)
 
 static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
@@ -271,6 +273,17 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
        return sb->s_fs_info;
 }
 
+struct hugetlbfs_inode_info {
+       struct shared_policy policy;
+       struct inode vfs_inode;
+       unsigned int seals;
+};
+
+static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
+{
+       return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
+}
+
 extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
@@ -343,10 +356,10 @@ struct huge_bootmem_page {
 struct page *alloc_huge_page(struct vm_area_struct *vma,
                                unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_node(struct hstate *h, int nid);
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
-                               unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                                nodemask_t *nmask);
+struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+                               unsigned long address);
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
                        pgoff_t idx);
 
@@ -524,7 +537,7 @@ struct hstate {};
 #define alloc_huge_page(v, a, r) NULL
 #define alloc_huge_page_node(h, nid) NULL
 #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
-#define alloc_huge_page_noerr(v, a, r) NULL
+#define alloc_huge_page_vma(h, vma, address) NULL
 #define alloc_bootmem_huge_page(h) NULL
 #define hstate_file(f) NULL
 #define hstate_sizelog(s) NULL
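
alloc_huge_page_noerr() is gone; its replacement alloc_huge_page_vma() derives the node and nodemask from the VMA's mempolicy instead of ignoring it. A hedged sketch of a migration new-page callback using it, loosely modeled on the mbind/NUMA hugetlb migration path; the function name and argument list here are illustrative:

static struct page *new_page(struct page *page, unsigned long address, int node)
{
	struct vm_area_struct *vma = find_vma(current->mm, address);

	if (PageHuge(page))
		return alloc_huge_page_vma(page_hstate(compound_head(page)),
					   vma, address);

	/* order-0 (and THP) cases elided */
	return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}
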
index 69966c4..8820468 100644 (file)
@@ -108,7 +108,10 @@ struct lruvec_stat {
  */
 struct mem_cgroup_per_node {
        struct lruvec           lruvec;
-       struct lruvec_stat __percpu *lruvec_stat;
+
+       struct lruvec_stat __percpu *lruvec_stat_cpu;
+       atomic_long_t           lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+
        unsigned long           lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
        struct mem_cgroup_reclaim_iter  iter[DEF_PRIORITY + 1];
@@ -227,10 +230,10 @@ struct mem_cgroup {
        spinlock_t              move_lock;
        struct task_struct      *move_lock_task;
        unsigned long           move_lock_flags;
-       /*
-        * percpu counter.
-        */
-       struct mem_cgroup_stat_cpu __percpu *stat;
+
+       struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+       atomic_long_t           stat[MEMCG_NR_STAT];
+       atomic_long_t           events[MEMCG_NR_EVENTS];
 
        unsigned long           socket_pressure;
 
@@ -265,6 +268,12 @@ struct mem_cgroup {
        /* WARNING: nodeinfo must be the last member here */
 };
 
+/*
+ * Size of the first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: bigger batches may be needed on larger systems ("big irons").
+ */
+#define MEMCG_CHARGE_BATCH 32U
+
 extern struct mem_cgroup *root_mem_cgroup;
 
 static inline bool mem_cgroup_disabled(void)
@@ -272,13 +281,6 @@ static inline bool mem_cgroup_disabled(void)
        return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-static inline void mem_cgroup_event(struct mem_cgroup *memcg,
-                                   enum memcg_event_item event)
-{
-       this_cpu_inc(memcg->stat->events[event]);
-       cgroup_file_notify(&memcg->events_file);
-}
-
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
@@ -492,32 +494,38 @@ void unlock_page_memcg(struct page *page);
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
                                             int idx)
 {
-       long val = 0;
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               val += per_cpu(memcg->stat->count[idx], cpu);
-
-       if (val < 0)
-               val = 0;
-
-       return val;
+       long x = atomic_long_read(&memcg->stat[idx]);
+#ifdef CONFIG_SMP
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
                                     int idx, int val)
 {
-       if (!mem_cgroup_disabled())
-               __this_cpu_add(memcg->stat->count[idx], val);
+       long x;
+
+       if (mem_cgroup_disabled())
+               return;
+
+       x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);
+       if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+               atomic_long_add(x, &memcg->stat[idx]);
+               x = 0;
+       }
+       __this_cpu_write(memcg->stat_cpu->count[idx], x);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   int idx, int val)
 {
-       if (!mem_cgroup_disabled())
-               this_cpu_add(memcg->stat->count[idx], val);
+       preempt_disable();
+       __mod_memcg_state(memcg, idx, val);
+       preempt_enable();
 }
 
 /**
@@ -555,87 +563,108 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
 {
        struct mem_cgroup_per_node *pn;
-       long val = 0;
-       int cpu;
+       long x;
 
        if (mem_cgroup_disabled())
                return node_page_state(lruvec_pgdat(lruvec), idx);
 
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-       for_each_possible_cpu(cpu)
-               val += per_cpu(pn->lruvec_stat->count[idx], cpu);
-
-       if (val < 0)
-               val = 0;
-
-       return val;
+       x = atomic_long_read(&pn->lruvec_stat[idx]);
+#ifdef CONFIG_SMP
+       if (x < 0)
+               x = 0;
+#endif
+       return x;
 }
 
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
                                      enum node_stat_item idx, int val)
 {
        struct mem_cgroup_per_node *pn;
+       long x;
 
+       /* Update node */
        __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
        if (mem_cgroup_disabled())
                return;
+
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+
+       /* Update memcg */
        __mod_memcg_state(pn->memcg, idx, val);
-       __this_cpu_add(pn->lruvec_stat->count[idx], val);
+
+       /* Update lruvec */
+       x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+       if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+               atomic_long_add(x, &pn->lruvec_stat[idx]);
+               x = 0;
+       }
+       __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 }
 
 static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
 {
-       struct mem_cgroup_per_node *pn;
-
-       mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
-       if (mem_cgroup_disabled())
-               return;
-       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-       mod_memcg_state(pn->memcg, idx, val);
-       this_cpu_add(pn->lruvec_stat->count[idx], val);
+       preempt_disable();
+       __mod_lruvec_state(lruvec, idx, val);
+       preempt_enable();
 }
 
 static inline void __mod_lruvec_page_state(struct page *page,
                                           enum node_stat_item idx, int val)
 {
-       struct mem_cgroup_per_node *pn;
+       pg_data_t *pgdat = page_pgdat(page);
+       struct lruvec *lruvec;
 
-       __mod_node_page_state(page_pgdat(page), idx, val);
-       if (mem_cgroup_disabled() || !page->mem_cgroup)
+       /* Untracked pages have no memcg, no lruvec. Update only the node */
+       if (!page->mem_cgroup) {
+               __mod_node_page_state(pgdat, idx, val);
                return;
-       __mod_memcg_state(page->mem_cgroup, idx, val);
-       pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
-       __this_cpu_add(pn->lruvec_stat->count[idx], val);
+       }
+
+       lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup);
+       __mod_lruvec_state(lruvec, idx, val);
 }
 
 static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
 {
-       struct mem_cgroup_per_node *pn;
-
-       mod_node_page_state(page_pgdat(page), idx, val);
-       if (mem_cgroup_disabled() || !page->mem_cgroup)
-               return;
-       mod_memcg_state(page->mem_cgroup, idx, val);
-       pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
-       this_cpu_add(pn->lruvec_stat->count[idx], val);
+       preempt_disable();
+       __mod_lruvec_page_state(page, idx, val);
+       preempt_enable();
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                                gfp_t gfp_mask,
                                                unsigned long *total_scanned);
 
+/* idx can be of type enum memcg_event_item or vm_event_item */
+static inline void __count_memcg_events(struct mem_cgroup *memcg,
+                                       int idx, unsigned long count)
+{
+       unsigned long x;
+
+       if (mem_cgroup_disabled())
+               return;
+
+       x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
+       if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+               atomic_long_add(x, &memcg->events[idx]);
+               x = 0;
+       }
+       __this_cpu_write(memcg->stat_cpu->events[idx], x);
+}
+
 static inline void count_memcg_events(struct mem_cgroup *memcg,
-                                     enum vm_event_item idx,
-                                     unsigned long count)
+                                     int idx, unsigned long count)
 {
-       if (!mem_cgroup_disabled())
-               this_cpu_add(memcg->stat->events[idx], count);
+       preempt_disable();
+       __count_memcg_events(memcg, idx, count);
+       preempt_enable();
 }
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
+/* idx can be of type enum memcg_event_item or vm_event_item */
 static inline void count_memcg_page_event(struct page *page,
                                          int idx)
 {
@@ -654,12 +683,20 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
        rcu_read_lock();
        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (likely(memcg)) {
-               this_cpu_inc(memcg->stat->events[idx]);
+               count_memcg_events(memcg, idx, 1);
                if (idx == OOM_KILL)
                        cgroup_file_notify(&memcg->events_file);
        }
        rcu_read_unlock();
 }
+
+static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+                                   enum memcg_event_item event)
+{
+       count_memcg_events(memcg, event, 1);
+       cgroup_file_notify(&memcg->events_file);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
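
The memcg statistics now keep a signed per-CPU delta that is folded into a shared atomic only when its absolute value exceeds MEMCG_CHARGE_BATCH, so the common case avoids an atomic RMW while readers see at most roughly nr_cpus * MEMCG_CHARGE_BATCH of error. A standalone user-space sketch of the same batching idea (C11 atomics; the type and function names are invented for illustration):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define CHARGE_BATCH 32			/* mirrors MEMCG_CHARGE_BATCH */

struct counter {
	atomic_long shared;		/* globally visible, updated rarely */
	long percpu_delta;		/* CPU-local scratch (one per CPU in the kernel) */
};

/* Fold the local delta into the shared counter only once it exceeds the
 * batch size; this mirrors the __mod_memcg_state() logic above. */
static void counter_add(struct counter *c, int val)
{
	long x = c->percpu_delta + val;

	if (labs(x) > CHARGE_BATCH) {
		atomic_fetch_add(&c->shared, x);
		x = 0;
	}
	c->percpu_delta = x;
}

int main(void)
{
	struct counter c = { 0 };

	for (int i = 0; i < 100; i++)
		counter_add(&c, 1);

	/* Readers see an approximation: up to CHARGE_BATCH per CPU may
	 * still be parked in the local delta. Prints "shared=99 local=1". */
	printf("shared=%ld local=%ld\n",
	       (long)atomic_load(&c.shared), c.percpu_delta);
	return 0;
}
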
index 7fc9238..173d248 100644 (file)
@@ -1312,8 +1312,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                unsigned long end, unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                        struct vm_area_struct *vma);
-void unmap_mapping_range(struct address_space *mapping,
-               loff_t const holebegin, loff_t const holelen, int even_cows);
 int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
                             unsigned long *start, unsigned long *end,
                             pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
@@ -1324,12 +1322,6 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address,
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);
 
-static inline void unmap_shared_mapping_range(struct address_space *mapping,
-               loff_t const holebegin, loff_t const holelen)
-{
-       unmap_mapping_range(mapping, holebegin, holelen, 0);
-}
-
 extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
@@ -1344,6 +1336,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
+void unmap_mapping_pages(struct address_space *mapping,
+               pgoff_t start, pgoff_t nr, bool even_cows);
+void unmap_mapping_range(struct address_space *mapping,
+               loff_t const holebegin, loff_t const holelen, int even_cows);
 #else
 static inline int handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
@@ -1360,10 +1356,20 @@ static inline int fixup_user_fault(struct task_struct *tsk,
        BUG();
        return -EFAULT;
 }
+static inline void unmap_mapping_pages(struct address_space *mapping,
+               pgoff_t start, pgoff_t nr, bool even_cows) { }
+static inline void unmap_mapping_range(struct address_space *mapping,
+               loff_t const holebegin, loff_t const holelen, int even_cows) { }
 #endif
 
-extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
-               unsigned int gup_flags);
+static inline void unmap_shared_mapping_range(struct address_space *mapping,
+               loff_t const holebegin, loff_t const holelen)
+{
+       unmap_mapping_range(mapping, holebegin, holelen, 0);
+}
+
+extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
+               void *buf, int len, unsigned int gup_flags);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
 extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
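
unmap_mapping_pages() takes a page-index range directly, so callers that already think in pgoff_t (DAX, truncation, hole punching) no longer have to encode their range as byte offsets for unmap_mapping_range(). A rough sketch of the correspondence, assuming 'mapping' and page-aligned byte offsets; this approximates, rather than copies, the mm/memory.c conversion:

	/* Byte-based interface; holelen == 0 means "to end of file". */
	unmap_mapping_range(mapping, holebegin, holelen, 0);

	/* Approximately the page-index form it resolves to internally: */
	unmap_mapping_pages(mapping, holebegin >> PAGE_SHIFT,
			    holelen ? (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT
				    : ULONG_MAX,
			    false);
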
index cfd0ac4..fd1af6b 100644 (file)
@@ -31,28 +31,56 @@ struct hmm;
  * it to keep track of whatever it is we are using the page for at the
  * moment. Note that we have no way to track which tasks are using
  * a page, though if it is a pagecache page, rmap structures can tell us
- * who is mapping it.
+ * who is mapping it. If you allocate the page using alloc_pages(), you
+ * can use some of the space in struct page for your own purposes.
  *
- * The objects in struct page are organized in double word blocks in
- * order to allows us to use atomic double word operations on portions
- * of struct page. That is currently only used by slub but the arrangement
- * allows the use of atomic double word operations on the flags/mapping
- * and lru list pointers also.
+ * Pages that were once in the page cache may be found under the RCU lock
+ * even after they have been recycled to a different purpose.  The page
+ * cache reads and writes some of the fields in struct page to pin the
+ * page before checking that it's still in the page cache.  It is vital
+ * that all users of struct page:
+ * 1. Use the first word as PageFlags.
+ * 2. Clear or preserve bit 0 of page->compound_head.  It is used as
+ *    PageTail for compound pages, and the page cache must not see false
+ *    positives.  Some users put a pointer here (guaranteed to be at least
+ *    4-byte aligned), other users avoid using the field altogether.
+ * 3. page->_refcount must either not be used, or must be used in such a
+ *    way that other CPUs temporarily incrementing and then decrementing the
+ *    refcount does not cause problems.  On receiving the page from
+ *    alloc_pages(), the refcount will be positive.
+ * 4. Either preserve page->_mapcount or restore it to -1 before freeing it.
+ *
+ * If you allocate pages of order > 0, you can use the fields in the struct
+ * page associated with each page, but bear in mind that the pages may have
+ * been inserted individually into the page cache, so you must use the above
+ * four fields in a compatible way for each struct page.
+ *
+ * SLUB uses cmpxchg_double() to atomically update its freelist and
+ * counters.  That requires that freelist & counters be adjacent and
+ * double-word aligned.  We align all struct pages to double-word
+ * boundaries, and ensure that 'freelist' is aligned within the
+ * struct.
  */
+#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
+#define _struct_page_alignment __aligned(2 * sizeof(unsigned long))
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE)
+#define _slub_counter_t                unsigned long
+#else
+#define _slub_counter_t                unsigned int
+#endif
+#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */
+#define _struct_page_alignment
+#define _slub_counter_t                unsigned int
+#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */
+
 struct page {
        /* First double word block */
        unsigned long flags;            /* Atomic flags, some possibly
                                         * updated asynchronously */
        union {
-               struct address_space *mapping;  /* If low bit clear, points to
-                                                * inode address_space, or NULL.
-                                                * If page mapped as anonymous
-                                                * memory, low bit is set, and
-                                                * it points to anon_vma object
-                                                * or KSM private structure. See
-                                                * PAGE_MAPPING_ANON and
-                                                * PAGE_MAPPING_KSM.
-                                                */
+               /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */
+               struct address_space *mapping;
+
                void *s_mem;                    /* slab first object */
                atomic_t compound_mapcount;     /* first tail page */
                /* page_deferred_list().next     -- second tail page */
@@ -66,40 +94,27 @@ struct page {
        };
 
        union {
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
-       defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
-               /* Used for cmpxchg_double in slub */
-               unsigned long counters;
-#else
-               /*
-                * Keep _refcount separate from slub cmpxchg_double data.
-                * As the rest of the double word is protected by slab_lock
-                * but _refcount is not.
-                */
-               unsigned counters;
-#endif
-               struct {
+               _slub_counter_t counters;
+               unsigned int active;            /* SLAB */
+               struct {                        /* SLUB */
+                       unsigned inuse:16;
+                       unsigned objects:15;
+                       unsigned frozen:1;
+               };
+               int units;                      /* SLOB */
+
+               struct {                        /* Page cache */
+                       /*
+                        * Count of ptes mapped in mms, to show when
+                        * page is mapped & limit reverse map searches.
+                        *
+                        * Extra information about page type may be
+                        * stored here for pages that are never mapped,
+                        * in which case the value MUST BE <= -2.
+                        * See page-flags.h for more details.
+                        */
+                       atomic_t _mapcount;
 
-                       union {
-                               /*
-                                * Count of ptes mapped in mms, to show when
-                                * page is mapped & limit reverse map searches.
-                                *
-                                * Extra information about page type may be
-                                * stored here for pages that are never mapped,
-                                * in which case the value MUST BE <= -2.
-                                * See page-flags.h for more details.
-                                */
-                               atomic_t _mapcount;
-
-                               unsigned int active;            /* SLAB */
-                               struct {                        /* SLUB */
-                                       unsigned inuse:16;
-                                       unsigned objects:15;
-                                       unsigned frozen:1;
-                               };
-                               int units;                      /* SLOB */
-                       };
                        /*
                         * Usage count, *USE WRAPPER FUNCTION* when manual
                         * accounting. See page_ref.h
@@ -109,8 +124,6 @@ struct page {
        };
 
        /*
-        * Third double word block
-        *
         * WARNING: bit 0 of the first word encode PageTail(). That means
         * the rest users of the storage space MUST NOT use the bit to
         * avoid collision and false-positive PageTail().
@@ -145,19 +158,9 @@ struct page {
                        unsigned long compound_head; /* If bit zero is set */
 
                        /* First tail page only */
-#ifdef CONFIG_64BIT
-                       /*
-                        * On 64 bit system we have enough space in struct page
-                        * to encode compound_dtor and compound_order with
-                        * unsigned int. It can help compiler generate better or
-                        * smaller code on some archtectures.
-                        */
-                       unsigned int compound_dtor;
-                       unsigned int compound_order;
-#else
-                       unsigned short int compound_dtor;
-                       unsigned short int compound_order;
-#endif
+                       unsigned char compound_dtor;
+                       unsigned char compound_order;
+                       /* two/six bytes available here */
                };
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
@@ -171,15 +174,14 @@ struct page {
 #endif
        };
 
-       /* Remainder is not double word aligned */
        union {
-               unsigned long private;          /* Mapping-private opaque data:
-                                                * usually used for buffer_heads
-                                                * if PagePrivate set; used for
-                                                * swp_entry_t if PageSwapCache;
-                                                * indicates order in the buddy
-                                                * system if PG_buddy is set.
-                                                */
+               /*
+                * Mapping-private opaque data:
+                * Usually used for buffer_heads if PagePrivate
+                * Used for swp_entry_t if PageSwapCache
+                * Indicates order in the buddy system if PageBuddy
+                */
+               unsigned long private;
 #if USE_SPLIT_PTE_PTLOCKS
 #if ALLOC_SPLIT_PTLOCKS
                spinlock_t *ptl;
@@ -212,15 +214,7 @@ struct page {
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
        int _last_cpupid;
 #endif
-}
-/*
- * The struct page can be forced to be double word aligned so that atomic ops
- * on double words work. The SLUB allocator can make use of such a feature.
- */
-#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
-       __aligned(2 * sizeof(unsigned long))
-#endif
-;
+} _struct_page_alignment;
 
 #define PAGE_FRAG_CACHE_MAX_SIZE       __ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER      get_order(PAGE_FRAG_CACHE_MAX_SIZE)
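
The #ifdef soup for alignment and the SLUB counter type is now factored into _struct_page_alignment and _slub_counter_t ahead of the struct, rather than being repeated inside it. The same pattern in isolation, as a tiny user-space sketch with invented names (compile with -DHAVE_ALIGNED_STRUCT to see the alignment double):

#include <stdio.h>

#ifdef HAVE_ALIGNED_STRUCT
#define _demo_alignment __attribute__((aligned(2 * sizeof(unsigned long))))
#else
#define _demo_alignment
#endif

struct demo {
	unsigned long flags;
	void *freelist;
	unsigned long counters;
} _demo_alignment;

int main(void)
{
	/* cmpxchg_double-style users rely on this double-word alignment
	 * so that freelist/counters can be updated as one unit. */
	printf("sizeof=%zu alignof=%zu\n",
	       sizeof(struct demo), _Alignof(struct demo));
	return 0;
}
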
index b25dc9d..2d07a1e 100644 (file)
@@ -2,6 +2,7 @@
 #ifndef _LINUX_MMU_NOTIFIER_H
 #define _LINUX_MMU_NOTIFIER_H
 
+#include <linux/types.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
@@ -10,6 +11,9 @@
 struct mmu_notifier;
 struct mmu_notifier_ops;
 
+/* mmu_notifier_ops flags */
+#define MMU_INVALIDATE_DOES_NOT_BLOCK  (0x01)
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
@@ -26,6 +30,15 @@ struct mmu_notifier_mm {
 };
 
 struct mmu_notifier_ops {
+       /*
+        * Flags to specify behavior of callbacks for this MMU notifier.
+        * Used to determine in which context an operation may be called.
+        *
+        * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
+        *      block
+        */
+       int flags;
+
        /*
         * Called either by mmu_notifier_unregister or when the mm is
         * being destroyed by exit_mmap, always before all pages are
@@ -137,6 +150,10 @@ struct mmu_notifier_ops {
         * page. Pages will no longer be referenced by the linux
         * address space but may still be referenced by sptes until
         * the last refcount is dropped.
+        *
+        * If neither of these callbacks can block, and invalidate_range
+        * cannot block either, mmu_notifier_ops.flags should have
+        * MMU_INVALIDATE_DOES_NOT_BLOCK set.
         */
        void (*invalidate_range_start)(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
@@ -159,12 +176,13 @@ struct mmu_notifier_ops {
         * external TLB range needs to be flushed. For more in depth
         * discussion on this see Documentation/vm/mmu_notifier.txt
         *
-        * The invalidate_range() function is called under the ptl
-        * spin-lock and not allowed to sleep.
-        *
         * Note that this function might be called with just a sub-range
         * of what was passed to invalidate_range_start()/end(), if
         * called between those functions.
+        *
+        * If this callback cannot block, and invalidate_range_{start,end}
+        * cannot block, mmu_notifier_ops.flags should have
+        * MMU_INVALIDATE_DOES_NOT_BLOCK set.
         */
        void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
                                 unsigned long start, unsigned long end);
@@ -218,6 +236,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
                                  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
                                  unsigned long start, unsigned long end);
+extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -457,6 +476,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 {
 }
 
+static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+       return false;
+}
+
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 }
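
As a hedged illustration of the new flag: a notifier whose invalidate callbacks
never sleep could advertise that as below, which
mm_has_blockable_invalidate_notifiers() can then take into account. The driver
and callback names are made up; only MMU_INVALIDATE_DOES_NOT_BLOCK, the .flags
field and the invalidate_range signature come from the header above.

	static void example_invalidate_range(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long start,
					     unsigned long end)
	{
		/* purely atomic shadow-mapping teardown, nothing that sleeps */
	}

	static const struct mmu_notifier_ops example_mn_ops = {
		.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
		.invalidate_range	= example_invalidate_range,
	};
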
index 67f2e3c..7522a69 100644 (file)
@@ -1166,8 +1166,16 @@ extern unsigned long usemap_size(void);
 
 /*
  * We use the lower bits of the mem_map pointer to store
- * a little bit of information.  There should be at least
- * 3 bits here due to 32-bit alignment.
+ * a little bit of information.  The pointer is calculated
+ * as mem_map - section_nr_to_pfn(pnum).  The result is
+ * aligned to the minimum alignment of the two values:
+ *   1. All mem_map arrays are page-aligned.
+ *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
+ *      lowest bits.  PFN_SECTION_SHIFT is arch-specific
+ *      (equal to SECTION_SIZE_BITS - PAGE_SHIFT), and the
+ *      worst combination is powerpc with 256k pages,
+ *      which results in PFN_SECTION_SHIFT equal to 6.
+ * To sum it up, at least 6 bits are available.
  */
 #define        SECTION_MARKED_PRESENT  (1UL<<0)
 #define SECTION_HAS_MEM_MAP    (1UL<<1)
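
A worked instance of the arithmetic above, assuming the worst case named in the
comment (powerpc with 256k pages, i.e. PAGE_SHIFT = 18 and SECTION_SIZE_BITS = 24):

	PFN_SECTION_SHIFT = SECTION_SIZE_BITS - PAGE_SHIFT = 24 - 18 = 6

so section_nr_to_pfn(pnum) is a multiple of 2^6 and mem_map is page-aligned,
which means the encoded difference has at least its 6 low bits clear; that is
what leaves room for the SECTION_* flag bits defined here.
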
index 3ec44e2..50c2b87 100644 (file)
  * guarantees that this bit is cleared for a page when it first is entered into
  * the page cache.
  *
- * PG_highmem pages are not permanently mapped into the kernel virtual address
- * space, they need to be kmapped separately for doing IO on the pages.  The
- * struct page (these bits with information) are always mapped into kernel
- * address space...
- *
  * PG_hwpoison indicates that a page got corrupted in hardware and contains
  * data with incorrect ECC bits that triggered a machine check. Accessing is
  * not safe since it may cause another machine check. Don't touch!
index 5fb6580..6dc456a 100644 (file)
@@ -9,14 +9,14 @@
 #ifndef _LINUX_PAGEVEC_H
 #define _LINUX_PAGEVEC_H
 
-/* 14 pointers + two long's align the pagevec structure to a power of two */
-#define PAGEVEC_SIZE   14
+/* 15 pointers + header align the pagevec structure to a power of two */
+#define PAGEVEC_SIZE   15
 
 struct page;
 struct address_space;
 
 struct pagevec {
-       unsigned long nr;
+       unsigned char nr;
        bool percpu_pvec_drained;
        struct page *pages[PAGEVEC_SIZE];
 };
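
A quick size check of the new layout, assuming a typical 64-bit build with
8-byte pointers and a 1-byte bool:

	1 (nr) + 1 (percpu_pvec_drained) + 6 (padding) + 15 * 8 (pages[])
	  = 128 bytes

The old layout reached the same 128-byte power of two with an unsigned long nr,
the bool, padding and only 14 page pointers, so shrinking nr to an unsigned
char buys one extra page pointer per pagevec without growing the structure.
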
index 3d49b91..bd42256 100644 (file)
@@ -11,7 +11,7 @@
 /*
  * Routines for handling mm_structs
  */
-extern struct mm_struct * mm_alloc(void);
+extern struct mm_struct *mm_alloc(void);
 
 /**
  * mmgrab() - Pin a &struct mm_struct.
@@ -35,27 +35,7 @@ static inline void mmgrab(struct mm_struct *mm)
        atomic_inc(&mm->mm_count);
 }
 
-/* mmdrop drops the mm and the page tables */
-extern void __mmdrop(struct mm_struct *);
-static inline void mmdrop(struct mm_struct *mm)
-{
-       if (unlikely(atomic_dec_and_test(&mm->mm_count)))
-               __mmdrop(mm);
-}
-
-static inline void mmdrop_async_fn(struct work_struct *work)
-{
-       struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
-       __mmdrop(mm);
-}
-
-static inline void mmdrop_async(struct mm_struct *mm)
-{
-       if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
-               INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
-               schedule_work(&mm->async_put_work);
-       }
-}
+extern void mmdrop(struct mm_struct *mm);
 
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
index 06b295b..73b5e65 100644 (file)
@@ -112,13 +112,11 @@ extern void shmem_uncharge(struct inode *inode, long pages);
 
 #ifdef CONFIG_TMPFS
 
-extern int shmem_add_seals(struct file *file, unsigned int seals);
-extern int shmem_get_seals(struct file *file);
-extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
+extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
 
 #else
 
-static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a)
+static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
 {
        return -EINVAL;
 }
index c2b8128..7b6a59f 100644 (file)
@@ -332,7 +332,6 @@ extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
-extern void lru_add_drain_all_cpuslocked(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
@@ -345,7 +344,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
-extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                        gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
index 1779c98..a4c2317 100644 (file)
@@ -216,23 +216,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
        return x;
 }
 
-static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
-                                       enum node_stat_item item)
-{
-       long x = atomic_long_read(&pgdat->vm_stat[item]);
-
-#ifdef CONFIG_SMP
-       int cpu;
-       for_each_online_cpu(cpu)
-               x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item];
-
-       if (x < 0)
-               x = 0;
-#endif
-       return x;
-}
-
-
 #ifdef CONFIG_NUMA
 extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,
index 004ba80..7238865 100644 (file)
@@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver);
 
 int zpool_unregister_driver(struct zpool_driver *driver);
 
+bool zpool_evictable(struct zpool *pool);
+
 #endif
index d70b53e..e0b8b91 100644 (file)
@@ -192,12 +192,12 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re
 
 TRACE_EVENT(mm_shrink_slab_start,
        TP_PROTO(struct shrinker *shr, struct shrink_control *sc,
-               long nr_objects_to_shrink, unsigned long pgs_scanned,
-               unsigned long lru_pgs, unsigned long cache_items,
-               unsigned long long delta, unsigned long total_scan),
+               long nr_objects_to_shrink, unsigned long cache_items,
+               unsigned long long delta, unsigned long total_scan,
+               int priority),
 
-       TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs,
-               cache_items, delta, total_scan),
+       TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan,
+               priority),
 
        TP_STRUCT__entry(
                __field(struct shrinker *, shr)
@@ -205,11 +205,10 @@ TRACE_EVENT(mm_shrink_slab_start,
                __field(int, nid)
                __field(long, nr_objects_to_shrink)
                __field(gfp_t, gfp_flags)
-               __field(unsigned long, pgs_scanned)
-               __field(unsigned long, lru_pgs)
                __field(unsigned long, cache_items)
                __field(unsigned long long, delta)
                __field(unsigned long, total_scan)
+               __field(int, priority)
        ),
 
        TP_fast_assign(
@@ -218,24 +217,22 @@ TRACE_EVENT(mm_shrink_slab_start,
                __entry->nid = sc->nid;
                __entry->nr_objects_to_shrink = nr_objects_to_shrink;
                __entry->gfp_flags = sc->gfp_mask;
-               __entry->pgs_scanned = pgs_scanned;
-               __entry->lru_pgs = lru_pgs;
                __entry->cache_items = cache_items;
                __entry->delta = delta;
                __entry->total_scan = total_scan;
+               __entry->priority = priority;
        ),
 
-       TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
+       TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d",
                __entry->shrink,
                __entry->shr,
                __entry->nid,
                __entry->nr_objects_to_shrink,
                show_gfp_flags(__entry->gfp_flags),
-               __entry->pgs_scanned,
-               __entry->lru_pgs,
                __entry->cache_items,
                __entry->delta,
-               __entry->total_scan)
+               __entry->total_scan,
+               __entry->priority)
 );
 
 TRACE_EVENT(mm_shrink_slab_end,
index 2295fc6..5e6cf0d 100644 (file)
@@ -77,6 +77,7 @@
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
+#include <linux/sched/mm.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
@@ -390,6 +391,241 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
+#ifdef CONFIG_MMU
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+                                       struct mm_struct *oldmm)
+{
+       struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
+       struct rb_node **rb_link, *rb_parent;
+       int retval;
+       unsigned long charge;
+       LIST_HEAD(uf);
+
+       uprobe_start_dup_mmap();
+       if (down_write_killable(&oldmm->mmap_sem)) {
+               retval = -EINTR;
+               goto fail_uprobe_end;
+       }
+       flush_cache_dup_mm(oldmm);
+       uprobe_dup_mmap(oldmm, mm);
+       /*
+        * Not linked in yet - no deadlock potential:
+        */
+       down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+       /* No ordering required: file already has been exposed. */
+       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
+       mm->total_vm = oldmm->total_vm;
+       mm->data_vm = oldmm->data_vm;
+       mm->exec_vm = oldmm->exec_vm;
+       mm->stack_vm = oldmm->stack_vm;
+
+       rb_link = &mm->mm_rb.rb_node;
+       rb_parent = NULL;
+       pprev = &mm->mmap;
+       retval = ksm_fork(mm, oldmm);
+       if (retval)
+               goto out;
+       retval = khugepaged_fork(mm, oldmm);
+       if (retval)
+               goto out;
+
+       prev = NULL;
+       for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+               struct file *file;
+
+               if (mpnt->vm_flags & VM_DONTCOPY) {
+                       vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+                       continue;
+               }
+               charge = 0;
+               if (mpnt->vm_flags & VM_ACCOUNT) {
+                       unsigned long len = vma_pages(mpnt);
+
+                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+                               goto fail_nomem;
+                       charge = len;
+               }
+               tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+               if (!tmp)
+                       goto fail_nomem;
+               *tmp = *mpnt;
+               INIT_LIST_HEAD(&tmp->anon_vma_chain);
+               retval = vma_dup_policy(mpnt, tmp);
+               if (retval)
+                       goto fail_nomem_policy;
+               tmp->vm_mm = mm;
+               retval = dup_userfaultfd(tmp, &uf);
+               if (retval)
+                       goto fail_nomem_anon_vma_fork;
+               if (tmp->vm_flags & VM_WIPEONFORK) {
+                       /* VM_WIPEONFORK gets a clean slate in the child. */
+                       tmp->anon_vma = NULL;
+                       if (anon_vma_prepare(tmp))
+                               goto fail_nomem_anon_vma_fork;
+               } else if (anon_vma_fork(tmp, mpnt))
+                       goto fail_nomem_anon_vma_fork;
+               tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+               tmp->vm_next = tmp->vm_prev = NULL;
+               file = tmp->vm_file;
+               if (file) {
+                       struct inode *inode = file_inode(file);
+                       struct address_space *mapping = file->f_mapping;
+
+                       get_file(file);
+                       if (tmp->vm_flags & VM_DENYWRITE)
+                               atomic_dec(&inode->i_writecount);
+                       i_mmap_lock_write(mapping);
+                       if (tmp->vm_flags & VM_SHARED)
+                               atomic_inc(&mapping->i_mmap_writable);
+                       flush_dcache_mmap_lock(mapping);
+                       /* insert tmp into the share list, just after mpnt */
+                       vma_interval_tree_insert_after(tmp, mpnt,
+                                       &mapping->i_mmap);
+                       flush_dcache_mmap_unlock(mapping);
+                       i_mmap_unlock_write(mapping);
+               }
+
+               /*
+                * Clear hugetlb-related page reserves for children. This only
+                * affects MAP_PRIVATE mappings. Faults generated by the child
+                * are not guaranteed to succeed, even if read-only
+                */
+               if (is_vm_hugetlb_page(tmp))
+                       reset_vma_resv_huge_pages(tmp);
+
+               /*
+                * Link in the new vma and copy the page table entries.
+                */
+               *pprev = tmp;
+               pprev = &tmp->vm_next;
+               tmp->vm_prev = prev;
+               prev = tmp;
+
+               __vma_link_rb(mm, tmp, rb_link, rb_parent);
+               rb_link = &tmp->vm_rb.rb_right;
+               rb_parent = &tmp->vm_rb;
+
+               mm->map_count++;
+               if (!(tmp->vm_flags & VM_WIPEONFORK))
+                       retval = copy_page_range(mm, oldmm, mpnt);
+
+               if (tmp->vm_ops && tmp->vm_ops->open)
+                       tmp->vm_ops->open(tmp);
+
+               if (retval)
+                       goto out;
+       }
+       /* a new mm has just been created */
+       arch_dup_mmap(oldmm, mm);
+       retval = 0;
+out:
+       up_write(&mm->mmap_sem);
+       flush_tlb_mm(oldmm);
+       up_write(&oldmm->mmap_sem);
+       dup_userfaultfd_complete(&uf);
+fail_uprobe_end:
+       uprobe_end_dup_mmap();
+       return retval;
+fail_nomem_anon_vma_fork:
+       mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+       kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+       retval = -ENOMEM;
+       vm_unacct_memory(charge);
+       goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct *mm)
+{
+       mm->pgd = pgd_alloc(mm);
+       if (unlikely(!mm->pgd))
+               return -ENOMEM;
+       return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct *mm)
+{
+       pgd_free(mm, mm->pgd);
+}
+#else
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+       down_write(&oldmm->mmap_sem);
+       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+       up_write(&oldmm->mmap_sem);
+       return 0;
+}
+#define mm_alloc_pgd(mm)       (0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+static void check_mm(struct mm_struct *mm)
+{
+       int i;
+
+       for (i = 0; i < NR_MM_COUNTERS; i++) {
+               long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+               if (unlikely(x))
+                       printk(KERN_ALERT "BUG: Bad rss-counter state "
+                                         "mm:%p idx:%d val:%ld\n", mm, i, x);
+       }
+
+       if (mm_pgtables_bytes(mm))
+               pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+                               mm_pgtables_bytes(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+       VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
+
+#define allocate_mm()  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm)    (kmem_cache_free(mm_cachep, (mm)))
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+static void __mmdrop(struct mm_struct *mm)
+{
+       BUG_ON(mm == &init_mm);
+       mm_free_pgd(mm);
+       destroy_context(mm);
+       hmm_mm_destroy(mm);
+       mmu_notifier_mm_destroy(mm);
+       check_mm(mm);
+       put_user_ns(mm->user_ns);
+       free_mm(mm);
+}
+
+void mmdrop(struct mm_struct *mm)
+{
+       if (unlikely(atomic_dec_and_test(&mm->mm_count)))
+               __mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmdrop);
+
+static void mmdrop_async_fn(struct work_struct *work)
+{
+       struct mm_struct *mm;
+
+       mm = container_of(work, struct mm_struct, async_put_work);
+       __mmdrop(mm);
+}
+
+static void mmdrop_async(struct mm_struct *mm)
+{
+       if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+               INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+               schedule_work(&mm->async_put_work);
+       }
+}
+
 static inline void free_signal_struct(struct signal_struct *sig)
 {
        taskstats_tgid_free(sig);
@@ -594,181 +830,8 @@ free_tsk:
        return NULL;
 }
 
-#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
-                                       struct mm_struct *oldmm)
-{
-       struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
-       struct rb_node **rb_link, *rb_parent;
-       int retval;
-       unsigned long charge;
-       LIST_HEAD(uf);
-
-       uprobe_start_dup_mmap();
-       if (down_write_killable(&oldmm->mmap_sem)) {
-               retval = -EINTR;
-               goto fail_uprobe_end;
-       }
-       flush_cache_dup_mm(oldmm);
-       uprobe_dup_mmap(oldmm, mm);
-       /*
-        * Not linked in yet - no deadlock potential:
-        */
-       down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
-
-       /* No ordering required: file already has been exposed. */
-       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-
-       mm->total_vm = oldmm->total_vm;
-       mm->data_vm = oldmm->data_vm;
-       mm->exec_vm = oldmm->exec_vm;
-       mm->stack_vm = oldmm->stack_vm;
-
-       rb_link = &mm->mm_rb.rb_node;
-       rb_parent = NULL;
-       pprev = &mm->mmap;
-       retval = ksm_fork(mm, oldmm);
-       if (retval)
-               goto out;
-       retval = khugepaged_fork(mm, oldmm);
-       if (retval)
-               goto out;
-
-       prev = NULL;
-       for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
-               struct file *file;
-
-               if (mpnt->vm_flags & VM_DONTCOPY) {
-                       vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
-                       continue;
-               }
-               charge = 0;
-               if (mpnt->vm_flags & VM_ACCOUNT) {
-                       unsigned long len = vma_pages(mpnt);
-
-                       if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
-                               goto fail_nomem;
-                       charge = len;
-               }
-               tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-               if (!tmp)
-                       goto fail_nomem;
-               *tmp = *mpnt;
-               INIT_LIST_HEAD(&tmp->anon_vma_chain);
-               retval = vma_dup_policy(mpnt, tmp);
-               if (retval)
-                       goto fail_nomem_policy;
-               tmp->vm_mm = mm;
-               retval = dup_userfaultfd(tmp, &uf);
-               if (retval)
-                       goto fail_nomem_anon_vma_fork;
-               if (tmp->vm_flags & VM_WIPEONFORK) {
-                       /* VM_WIPEONFORK gets a clean slate in the child. */
-                       tmp->anon_vma = NULL;
-                       if (anon_vma_prepare(tmp))
-                               goto fail_nomem_anon_vma_fork;
-               } else if (anon_vma_fork(tmp, mpnt))
-                       goto fail_nomem_anon_vma_fork;
-               tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
-               tmp->vm_next = tmp->vm_prev = NULL;
-               file = tmp->vm_file;
-               if (file) {
-                       struct inode *inode = file_inode(file);
-                       struct address_space *mapping = file->f_mapping;
-
-                       get_file(file);
-                       if (tmp->vm_flags & VM_DENYWRITE)
-                               atomic_dec(&inode->i_writecount);
-                       i_mmap_lock_write(mapping);
-                       if (tmp->vm_flags & VM_SHARED)
-                               atomic_inc(&mapping->i_mmap_writable);
-                       flush_dcache_mmap_lock(mapping);
-                       /* insert tmp into the share list, just after mpnt */
-                       vma_interval_tree_insert_after(tmp, mpnt,
-                                       &mapping->i_mmap);
-                       flush_dcache_mmap_unlock(mapping);
-                       i_mmap_unlock_write(mapping);
-               }
-
-               /*
-                * Clear hugetlb-related page reserves for children. This only
-                * affects MAP_PRIVATE mappings. Faults generated by the child
-                * are not guaranteed to succeed, even if read-only
-                */
-               if (is_vm_hugetlb_page(tmp))
-                       reset_vma_resv_huge_pages(tmp);
-
-               /*
-                * Link in the new vma and copy the page table entries.
-                */
-               *pprev = tmp;
-               pprev = &tmp->vm_next;
-               tmp->vm_prev = prev;
-               prev = tmp;
-
-               __vma_link_rb(mm, tmp, rb_link, rb_parent);
-               rb_link = &tmp->vm_rb.rb_right;
-               rb_parent = &tmp->vm_rb;
-
-               mm->map_count++;
-               if (!(tmp->vm_flags & VM_WIPEONFORK))
-                       retval = copy_page_range(mm, oldmm, mpnt);
-
-               if (tmp->vm_ops && tmp->vm_ops->open)
-                       tmp->vm_ops->open(tmp);
-
-               if (retval)
-                       goto out;
-       }
-       /* a new mm has just been created */
-       retval = arch_dup_mmap(oldmm, mm);
-out:
-       up_write(&mm->mmap_sem);
-       flush_tlb_mm(oldmm);
-       up_write(&oldmm->mmap_sem);
-       dup_userfaultfd_complete(&uf);
-fail_uprobe_end:
-       uprobe_end_dup_mmap();
-       return retval;
-fail_nomem_anon_vma_fork:
-       mpol_put(vma_policy(tmp));
-fail_nomem_policy:
-       kmem_cache_free(vm_area_cachep, tmp);
-fail_nomem:
-       retval = -ENOMEM;
-       vm_unacct_memory(charge);
-       goto out;
-}
-
-static inline int mm_alloc_pgd(struct mm_struct *mm)
-{
-       mm->pgd = pgd_alloc(mm);
-       if (unlikely(!mm->pgd))
-               return -ENOMEM;
-       return 0;
-}
-
-static inline void mm_free_pgd(struct mm_struct *mm)
-{
-       pgd_free(mm, mm->pgd);
-}
-#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-       down_write(&oldmm->mmap_sem);
-       RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-       up_write(&oldmm->mmap_sem);
-       return 0;
-}
-#define mm_alloc_pgd(mm)       (0)
-#define mm_free_pgd(mm)
-#endif /* CONFIG_MMU */
-
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm)    (kmem_cache_free(mm_cachep, (mm)))
-
 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
 
 static int __init coredump_filter_setup(char *s)
@@ -858,27 +921,6 @@ fail_nopgd:
        return NULL;
 }
 
-static void check_mm(struct mm_struct *mm)
-{
-       int i;
-
-       for (i = 0; i < NR_MM_COUNTERS; i++) {
-               long x = atomic_long_read(&mm->rss_stat.count[i]);
-
-               if (unlikely(x))
-                       printk(KERN_ALERT "BUG: Bad rss-counter state "
-                                         "mm:%p idx:%d val:%ld\n", mm, i, x);
-       }
-
-       if (mm_pgtables_bytes(mm))
-               pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
-                               mm_pgtables_bytes(mm));
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-       VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
-#endif
-}
-
 /*
  * Allocate and initialize an mm_struct.
  */
@@ -894,24 +936,6 @@ struct mm_struct *mm_alloc(void)
        return mm_init(mm, current, current_user_ns());
 }
 
-/*
- * Called when the last reference to the mm
- * is dropped: either by a lazy thread or by
- * mmput. Free the page directory and the mm.
- */
-void __mmdrop(struct mm_struct *mm)
-{
-       BUG_ON(mm == &init_mm);
-       mm_free_pgd(mm);
-       destroy_context(mm);
-       hmm_mm_destroy(mm);
-       mmu_notifier_mm_destroy(mm);
-       check_mm(mm);
-       put_user_ns(mm->user_ns);
-       free_mm(mm);
-}
-EXPORT_SYMBOL_GPL(__mmdrop);
-
 static inline void __mmput(struct mm_struct *mm)
 {
        VM_BUG_ON(atomic_read(&mm->mm_users));
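
For the mmgrab()/mmdrop() pair that fork.c now implements out of line, a
minimal usage sketch; the function below is hypothetical, only mmgrab() and
mmdrop() come from the code above:

	static void inspect_mm(struct mm_struct *mm)
	{
		mmgrab(mm);	/* pin mm_count so the mm_struct itself stays */
		/* ... read fields that need the struct, not the address space ... */
		mmdrop(mm);	/* last reference frees it via __mmdrop() */
	}
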
index 557d467..2fb4e27 100644 (file)
@@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
         },
-        {
-               .procname       = "hugepages_treat_as_movable",
-               .data           = &hugepages_treat_as_movable,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
        {
                .procname       = "nr_overcommit_hugepages",
                .data           = NULL,
index 03ff770..c782e8f 100644 (file)
@@ -639,15 +639,10 @@ config MAX_STACK_SIZE_MB
 
          A sane initial value is 80 MB.
 
-# For architectures that support deferred memory initialisation
-config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
-       bool
-
 config DEFERRED_STRUCT_PAGE_INIT
        bool "Defer initialisation of struct pages to kthreads"
        default n
-       depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
-       depends on NO_BOOTMEM && MEMORY_HOTPLUG
+       depends on NO_BOOTMEM
        depends on !FLATMEM
        help
          Ordinarily all struct pages are initialised during early boot in a
index 10cd757..2c8999d 100644 (file)
@@ -1738,7 +1738,7 @@ int sysctl_extfrag_threshold = 500;
  * @order: The order of the current allocation
  * @alloc_flags: The allocation flags of the current allocation
  * @ac: The context of current allocation
- * @mode: The migration mode for async, sync light, or sync migration
+ * @prio: Determines how hard direct compaction should try to succeed
  *
  * This is the main entry point for direct page compaction.
  */
index ec70d6e..767887f 100644 (file)
@@ -127,7 +127,15 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
                 */
                start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
                end_index = (endbyte >> PAGE_SHIFT);
-               if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) {
+               /*
+                * The page at end_index will be discarded inclusively by
+                * invalidate_mapping_pages(), so subtracting 1 from
+                * end_index means we will skip the last page.  But if endbyte
+                * is page aligned or is at the end of the file, we should not
+                * skip that page - discarding the last page is safe enough.
+                */
+               if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK &&
+                               endbyte != inode->i_size - 1) {
                        /* First page is tricky as 0 - 1 = -1, but pgoff_t
                         * is unsigned, so the end_index >= start_index
                         * check below would be true and we'll discard the whole
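
A worked instance of the index math in this hunk, with 4 KiB pages and made-up
numbers (only the shift and mask expressions come from the code above; endbyte
stands for the inclusive last byte of the requested range):

	offset = 1024, len = 8192  =>  endbyte = 9215
	start_index = (1024 + 4095) >> 12 = 1
	end_index   = 9215 >> 12          = 2
	(9215 & ~PAGE_MASK) = 1023 != 4095, and (assuming a larger file)
	endbyte != i_size - 1, so end_index is pulled back to 1 and the
	partially covered last page stays in the page cache.
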
index ee83baa..693f622 100644 (file)
@@ -31,7 +31,6 @@
 #include <linux/blkdev.h>
 #include <linux/security.h>
 #include <linux/cpuset.h>
-#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/hugetlb.h>
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
index ea19742..979211c 100644 (file)
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -418,7 +418,7 @@ again:
                }
 
                if (!pte_present(pte)) {
-                       swp_entry_t entry;
+                       swp_entry_t entry = pte_to_swp_entry(pte);
 
                        if (!non_swap_entry(entry)) {
                                if (hmm_vma_walk->fault)
@@ -426,8 +426,6 @@ again:
                                continue;
                        }
 
-                       entry = pte_to_swp_entry(pte);
-
                        /*
                         * This is a special swap entry, ignore migration, use
                         * device and report anything else as error.
index 0e7ded9..87ab9b8 100644 (file)
@@ -1910,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
         * pmdp_invalidate() is required to make sure we don't miss
         * dirty/young flags set by hardware.
         */
-       entry = *pmd;
-       pmdp_invalidate(vma, addr, pmd);
-
-       /*
-        * Recover dirty/young flags.  It relies on pmdp_invalidate to not
-        * corrupt them.
-        */
-       if (pmd_dirty(*pmd))
-               entry = pmd_mkdirty(entry);
-       if (pmd_young(*pmd))
-               entry = pmd_mkyoung(entry);
+       entry = pmdp_invalidate(vma, addr, pmd);
 
        entry = pmd_modify(entry, newprot);
        if (preserve_write)
@@ -2073,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
        pgtable_t pgtable;
-       pmd_t _pmd;
-       bool young, write, dirty, soft_dirty, pmd_migration = false;
+       pmd_t old_pmd, _pmd;
+       bool young, write, soft_dirty, pmd_migration = false;
        unsigned long addr;
        int i;
 
@@ -2116,24 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                return __split_huge_zero_page_pmd(vma, haddr, pmd);
        }
 
+       /*
+        * Up to this point the pmd is present and huge and userland has the
+        * whole access to the hugepage during the split (which happens in
+        * place). If we overwrite the pmd with the not-huge version pointing
+        * to the pte here (which of course we could if all CPUs were bug
+        * free), userland could trigger a small page size TLB miss on the
+        * small sized TLB while the hugepage TLB entry is still established in
+        * the huge TLB. Some CPUs don't like that.
+        * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+        * 383 on page 93. Intel should be safe but also warns that it's
+        * only safe if the permission and cache attributes of the two entries
+        * loaded in the two TLBs are identical (which should be the case here).
+        * But it is generally safer to never allow small and huge TLB entries
+        * for the same virtual address to be loaded simultaneously. So instead
+        * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+        * current pmd notpresent (atomically because here the pmd_trans_huge
+        * must remain set at all times on the pmd until the split is complete
+        * for this pmd), then we flush the SMP TLB and finally we write the
+        * non-huge version of the pmd entry with pmd_populate.
+        */
+       old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-       pmd_migration = is_pmd_migration_entry(*pmd);
+       pmd_migration = is_pmd_migration_entry(old_pmd);
        if (pmd_migration) {
                swp_entry_t entry;
 
-               entry = pmd_to_swp_entry(*pmd);
+               entry = pmd_to_swp_entry(old_pmd);
                page = pfn_to_page(swp_offset(entry));
        } else
 #endif
-               page = pmd_page(*pmd);
+               page = pmd_page(old_pmd);
        VM_BUG_ON_PAGE(!page_count(page), page);
        page_ref_add(page, HPAGE_PMD_NR - 1);
-       write = pmd_write(*pmd);
-       young = pmd_young(*pmd);
-       dirty = pmd_dirty(*pmd);
-       soft_dirty = pmd_soft_dirty(*pmd);
+       if (pmd_dirty(old_pmd))
+               SetPageDirty(page);
+       write = pmd_write(old_pmd);
+       young = pmd_young(old_pmd);
+       soft_dirty = pmd_soft_dirty(old_pmd);
 
-       pmdp_huge_split_prepare(vma, haddr, pmd);
+       /*
+        * Withdraw the table only after we mark the pmd entry invalid.
+        * This is critical for some architectures (Power).
+        */
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);
 
@@ -2160,8 +2176,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        if (soft_dirty)
                                entry = pte_mksoft_dirty(entry);
                }
-               if (dirty)
-                       SetPageDirty(page + i);
                pte = pte_offset_map(&_pmd, addr);
                BUG_ON(!pte_none(*pte));
                set_pte_at(mm, addr, pte, entry);
@@ -2189,28 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
        }
 
        smp_wmb(); /* make pte visible before pmd */
-       /*
-        * Up to this point the pmd is present and huge and userland has the
-        * whole access to the hugepage during the split (which happens in
-        * place). If we overwrite the pmd with the not-huge version pointing
-        * to the pte here (which of course we could if all CPUs were bug
-        * free), userland could trigger a small page size TLB miss on the
-        * small sized TLB while the hugepage TLB entry is still established in
-        * the huge TLB. Some CPU doesn't like that.
-        * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
-        * 383 on page 93. Intel should be safe but is also warns that it's
-        * only safe if the permission and cache attributes of the two entries
-        * loaded in the two TLB is identical (which should be the case here).
-        * But it is generally safer to never allow small and huge TLB entries
-        * for the same virtual address to be loaded simultaneously. So instead
-        * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
-        * current pmd notpresent (atomically because here the pmd_trans_huge
-        * and pmd_trans_splitting must remain set at all times on the pmd
-        * until the split is complete for this pmd), then we flush the SMP TLB
-        * and finally we write the non-huge version of the pmd entry with
-        * pmd_populate.
-        */
-       pmdp_invalidate(vma, haddr, pmd);
        pmd_populate(mm, pmd, pgtable);
 
        if (freeze) {
index 9a334f5..7c204e3 100644 (file)
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
 #include "internal.h"
 
-int hugepages_treat_as_movable;
-
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
@@ -926,7 +925,7 @@ retry_cpuset:
 /* Movability of hugepages depends on migration support. */
 static inline gfp_t htlb_alloc_mask(struct hstate *h)
 {
-       if (hugepages_treat_as_movable || hugepage_migration_supported(h))
+       if (hugepage_migration_supported(h))
                return GFP_HIGHUSER_MOVABLE;
        else
                return GFP_HIGHUSER;
@@ -1108,7 +1107,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
        return zone_spans_pfn(zone, last_pfn);
 }
 
-static struct page *alloc_gigantic_page(int nid, struct hstate *h)
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nodemask)
 {
        unsigned int order = huge_page_order(h);
        unsigned long nr_pages = 1 << order;
@@ -1116,11 +1116,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
        struct zonelist *zonelist;
        struct zone *zone;
        struct zoneref *z;
-       gfp_t gfp_mask;
 
-       gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
        zonelist = node_zonelist(nid, gfp_mask);
-       for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
                spin_lock_irqsave(&zone->lock, flags);
 
                pfn = ALIGN(zone->zone_start_pfn, nr_pages);
@@ -1151,41 +1149,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
 static void prep_compound_gigantic_page(struct page *page, unsigned int order);
 
-static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
-{
-       struct page *page;
-
-       page = alloc_gigantic_page(nid, h);
-       if (page) {
-               prep_compound_gigantic_page(page, huge_page_order(h));
-               prep_new_huge_page(h, page, nid);
-       }
-
-       return page;
-}
-
-static int alloc_fresh_gigantic_page(struct hstate *h,
-                               nodemask_t *nodes_allowed)
-{
-       struct page *page = NULL;
-       int nr_nodes, node;
-
-       for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-               page = alloc_fresh_gigantic_page_node(h, node);
-               if (page)
-                       return 1;
-       }
-
-       return 0;
-}
-
 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
 static inline bool gigantic_page_supported(void) { return false; }
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nodemask) { return NULL; }
 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
-static inline int alloc_fresh_gigantic_page(struct hstate *h,
-                                       nodemask_t *nodes_allowed) { return 0; }
 #endif
 
 static void update_and_free_page(struct hstate *h, struct page *page)
@@ -1250,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
        ClearPagePrivate(&page[1]);
 }
 
+/*
+ * Internal hugetlb-specific page flag. Do not use outside of the hugetlb
+ * code.
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+       if (!PageHuge(page))
+               return false;
+
+       return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+       page[2].mapping = (void *)-1U;
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+       page[2].mapping = NULL;
+}
+
 void free_huge_page(struct page *page)
 {
        /*
@@ -1284,7 +1276,11 @@ void free_huge_page(struct page *page)
        if (restore_reserve)
                h->resv_huge_pages++;
 
-       if (h->surplus_huge_pages_node[nid]) {
+       if (PageHugeTemporary(page)) {
+               list_del(&page->lru);
+               ClearPageHugeTemporary(page);
+               update_and_free_page(h, page);
+       } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
                list_del(&page->lru);
                update_and_free_page(h, page);
@@ -1306,7 +1302,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
        h->nr_huge_pages++;
        h->nr_huge_pages_node[nid]++;
        spin_unlock(&hugetlb_lock);
-       put_page(page); /* free it into the hugepage allocator */
 }
 
 static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1383,41 +1378,70 @@ pgoff_t __basepage_index(struct page *page)
        return (index << compound_order(page_head)) + compound_idx;
 }
 
-static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+               gfp_t gfp_mask, int nid, nodemask_t *nmask)
 {
+       int order = huge_page_order(h);
        struct page *page;
 
-       page = __alloc_pages_node(nid,
-               htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-                                               __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
-               huge_page_order(h));
-       if (page) {
-               prep_new_huge_page(h, page, nid);
-       }
+       gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+       if (nid == NUMA_NO_NODE)
+               nid = numa_mem_id();
+       page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+       if (page)
+               __count_vm_event(HTLB_BUDDY_PGALLOC);
+       else
+               __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+       return page;
+}
+
+/*
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ */
+static struct page *alloc_fresh_huge_page(struct hstate *h,
+               gfp_t gfp_mask, int nid, nodemask_t *nmask)
+{
+       struct page *page;
+
+       if (hstate_is_gigantic(h))
+               page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+       else
+               page = alloc_buddy_huge_page(h, gfp_mask,
+                               nid, nmask);
+       if (!page)
+               return NULL;
+
+       if (hstate_is_gigantic(h))
+               prep_compound_gigantic_page(page, huge_page_order(h));
+       prep_new_huge_page(h, page, page_to_nid(page));
 
        return page;
 }
 
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+/*
+ * Allocates a fresh page to the hugetlb allocator pool in a node-interleaved
+ * manner.
+ */
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
        struct page *page;
        int nr_nodes, node;
-       int ret = 0;
+       gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 
        for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-               page = alloc_fresh_huge_page_node(h, node);
-               if (page) {
-                       ret = 1;
+               page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+               if (page)
                        break;
-               }
        }
 
-       if (ret)
-               count_vm_event(HTLB_BUDDY_PGALLOC);
-       else
-               count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+       if (!page)
+               return 0;
 
-       return ret;
+       put_page(page); /* free it into the hugepage allocator */
+
+       return 1;
 }
 
 /*
@@ -1525,79 +1549,66 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
        return rc;
 }
 
-static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
-               gfp_t gfp_mask, int nid, nodemask_t *nmask)
-{
-       int order = huge_page_order(h);
-
-       gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
-       if (nid == NUMA_NO_NODE)
-               nid = numa_mem_id();
-       return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
-}
-
-static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nmask)
 {
-       struct page *page;
-       unsigned int r_nid;
+       struct page *page = NULL;
 
        if (hstate_is_gigantic(h))
                return NULL;
 
+       spin_lock(&hugetlb_lock);
+       if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
+               goto out_unlock;
+       spin_unlock(&hugetlb_lock);
+
+       page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+       if (!page)
+               return NULL;
+
+       spin_lock(&hugetlb_lock);
        /*
-        * Assume we will successfully allocate the surplus page to
-        * prevent racing processes from causing the surplus to exceed
-        * overcommit
-        *
-        * This however introduces a different race, where a process B
-        * tries to grow the static hugepage pool while alloc_pages() is
-        * called by process A. B will only examine the per-node
-        * counters in determining if surplus huge pages can be
-        * converted to normal huge pages in adjust_pool_surplus(). A
-        * won't be able to increment the per-node counter, until the
-        * lock is dropped by B, but B doesn't drop hugetlb_lock until
-        * no more huge pages can be converted from surplus to normal
-        * state (and doesn't try to convert again). Thus, we have a
-        * case where a surplus huge page exists, the pool is grown, and
-        * the surplus huge page still exists after, even though it
-        * should just have been converted to a normal huge page. This
-        * does not leak memory, though, as the hugepage will be freed
-        * once it is out of use. It also does not allow the counters to
-        * go out of whack in adjust_pool_surplus() as we don't modify
-        * the node values until we've gotten the hugepage and only the
-        * per-node value is checked there.
+        * We could have raced with the pool size change.
+        * Double check that and simply deallocate the new page
+        * if we would end up overcommitting the surpluses. Abuse the
+        * temporary page to work around the nasty free_huge_page
+        * code flow.
         */
-       spin_lock(&hugetlb_lock);
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
-               spin_unlock(&hugetlb_lock);
-               return NULL;
+               SetPageHugeTemporary(page);
+               put_page(page);
+               page = NULL;
        } else {
-               h->nr_huge_pages++;
                h->surplus_huge_pages++;
+               h->nr_huge_pages_node[page_to_nid(page)]++;
        }
+
+out_unlock:
        spin_unlock(&hugetlb_lock);
 
-       page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+       return page;
+}
 
-       spin_lock(&hugetlb_lock);
-       if (page) {
-               INIT_LIST_HEAD(&page->lru);
-               r_nid = page_to_nid(page);
-               set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-               set_hugetlb_cgroup(page, NULL);
-               /*
-                * We incremented the global counters already
-                */
-               h->nr_huge_pages_node[r_nid]++;
-               h->surplus_huge_pages_node[r_nid]++;
-               __count_vm_event(HTLB_BUDDY_PGALLOC);
-       } else {
-               h->nr_huge_pages--;
-               h->surplus_huge_pages--;
-               __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
-       }
-       spin_unlock(&hugetlb_lock);
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+               int nid, nodemask_t *nmask)
+{
+       struct page *page;
+
+       if (hstate_is_gigantic(h))
+               return NULL;
+
+       page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+       if (!page)
+               return NULL;
+
+       /*
+        * We do not account these pages as surplus because they are only
+        * temporary and will be released properly on the last reference
+        */
+       SetPageHugeTemporary(page);
 
        return page;
 }
@@ -1606,7 +1617,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
  * Use the VMA's mpolicy to allocate a huge page from the buddy.
  */
 static
-struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
 {
        struct page *page;
@@ -1616,17 +1627,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
        nodemask_t *nodemask;
 
        nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
-       page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+       page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
        mpol_cond_put(mpol);
 
        return page;
 }
 
-/*
- * This allocation function is useful in the context where vma is irrelevant.
- * E.g. soft-offlining uses this function because it only cares physical
- * address of error page.
- */
+/* page migration callback function */
 struct page *alloc_huge_page_node(struct hstate *h, int nid)
 {
        gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1641,12 +1648,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
        spin_unlock(&hugetlb_lock);
 
        if (!page)
-               page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
+               page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
 
        return page;
 }
 
-
+/* page migration callback function */
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                nodemask_t *nmask)
 {
@@ -1664,9 +1671,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
        }
        spin_unlock(&hugetlb_lock);
 
-       /* No reservations, try to overcommit */
+       return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
+/* mempolicy aware migration callback */
+struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+               unsigned long address)
+{
+       struct mempolicy *mpol;
+       nodemask_t *nodemask;
+       struct page *page;
+       gfp_t gfp_mask;
+       int node;
+
+       gfp_mask = htlb_alloc_mask(h);
+       node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+       page = alloc_huge_page_nodemask(h, node, nodemask);
+       mpol_cond_put(mpol);
 
-       return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+       return page;
 }
 
 /*
@@ -1694,7 +1717,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
-               page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+               page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
                                NUMA_NO_NODE, NULL);
                if (!page) {
                        alloc_ok = false;
@@ -2031,7 +2054,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
        if (!page) {
                spin_unlock(&hugetlb_lock);
-               page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
+               page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
                if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
@@ -2074,20 +2097,6 @@ out_subpool_put:
        return ERR_PTR(-ENOSPC);
 }
 
-/*
- * alloc_huge_page()'s wrapper which simply returns the page if allocation
- * succeeds, otherwise NULL. This function is called from new_vma_page(),
- * where no ERR_VALUE is expected to be returned.
- */
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
-                               unsigned long addr, int avoid_reserve)
-{
-       struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
-       if (IS_ERR(page))
-               page = NULL;
-       return page;
-}
-
 int alloc_bootmem_huge_page(struct hstate *h)
        __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
 int __alloc_bootmem_huge_page(struct hstate *h)
@@ -2150,6 +2159,8 @@ static void __init gather_bootmem_prealloc(void)
                prep_compound_huge_page(page, h->order);
                WARN_ON(PageReserved(page));
                prep_new_huge_page(h, page, page_to_nid(page));
+               put_page(page); /* free it into the hugepage allocator */
+
                /*
                 * If we had gigantic hugepages allocated at boot time, we need
                 * to restore the 'stolen' pages to totalram_pages in order to
@@ -2169,7 +2180,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
                if (hstate_is_gigantic(h)) {
                        if (!alloc_bootmem_huge_page(h))
                                break;
-               } else if (!alloc_fresh_huge_page(h,
+               } else if (!alloc_pool_huge_page(h,
                                         &node_states[N_MEMORY]))
                        break;
                cond_resched();
@@ -2289,7 +2300,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
         * First take pages out of surplus state.  Then make up the
         * remaining difference by allocating fresh huge pages.
         *
-        * We might race with __alloc_buddy_huge_page() here and be unable
+        * We might race with alloc_surplus_huge_page() here and be unable
         * to convert a surplus huge page to a normal huge page. That is
         * not critical, though, it just means the overall size of the
         * pool might be one hugepage larger than it needs to be, but
@@ -2312,10 +2323,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
                /* yield cpu to avoid soft lockup */
                cond_resched();
 
-               if (hstate_is_gigantic(h))
-                       ret = alloc_fresh_gigantic_page(h, nodes_allowed);
-               else
-                       ret = alloc_fresh_huge_page(h, nodes_allowed);
+               ret = alloc_pool_huge_page(h, nodes_allowed);
                spin_lock(&hugetlb_lock);
                if (!ret)
                        goto out;
@@ -2335,7 +2343,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
         * By placing pages into the surplus state independent of the
         * overcommit value, we are allowing the surplus pool size to
         * exceed overcommit. There are few sane options here. Since
-        * __alloc_buddy_huge_page() is checking the global counter,
+        * alloc_surplus_huge_page() is checking the global counter,
         * though, we'll note that we're not allowed to exceed surplus
         * and won't grow the pool anywhere else. Not until one of the
         * sysctls are changed, or the surplus pages go out of use.
@@ -2975,20 +2983,32 @@ out:
 
 void hugetlb_report_meminfo(struct seq_file *m)
 {
-       struct hstate *h = &default_hstate;
+       struct hstate *h;
+       unsigned long total = 0;
+
        if (!hugepages_supported())
                return;
-       seq_printf(m,
-                       "HugePages_Total:   %5lu\n"
-                       "HugePages_Free:    %5lu\n"
-                       "HugePages_Rsvd:    %5lu\n"
-                       "HugePages_Surp:    %5lu\n"
-                       "Hugepagesize:   %8lu kB\n",
-                       h->nr_huge_pages,
-                       h->free_huge_pages,
-                       h->resv_huge_pages,
-                       h->surplus_huge_pages,
-                       1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+
+       for_each_hstate(h) {
+               unsigned long count = h->nr_huge_pages;
+
+               total += (PAGE_SIZE << huge_page_order(h)) * count;
+
+               if (h == &default_hstate)
+                       seq_printf(m,
+                                  "HugePages_Total:   %5lu\n"
+                                  "HugePages_Free:    %5lu\n"
+                                  "HugePages_Rsvd:    %5lu\n"
+                                  "HugePages_Surp:    %5lu\n"
+                                  "Hugepagesize:   %8lu kB\n",
+                                  count,
+                                  h->free_huge_pages,
+                                  h->resv_huge_pages,
+                                  h->surplus_huge_pages,
+                                  (PAGE_SIZE << huge_page_order(h)) / 1024);
+       }
+
+       seq_printf(m, "Hugetlb:        %8lu kB\n", total / 1024);
 }
 
 int hugetlb_report_node_meminfo(int nid, char *buf)
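
For reference, the hugetlb_report_meminfo() rework above adds a cumulative "Hugetlb:" line that spans all hstates. A minimal userspace sketch that reads the new field back (illustrative only; it assumes nothing beyond the field name shown in the hunk):

/* Print the cumulative "Hugetlb:" line introduced above (userspace sketch). */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/meminfo", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (!strncmp(line, "Hugetlb:", 8))
                        fputs(line, stdout);    /* e.g. "Hugetlb:     2048 kB" */
        fclose(f);
        return 0;
}
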
@@ -4799,3 +4819,36 @@ void putback_active_hugepage(struct page *page)
        spin_unlock(&hugetlb_lock);
        put_page(page);
 }
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+       struct hstate *h = page_hstate(oldpage);
+
+       hugetlb_cgroup_migrate(oldpage, newpage);
+       set_page_owner_migrate_reason(newpage, reason);
+
+       /*
+        * transfer the temporary state of the new huge page. This is the
+        * reverse of other transitions because the new page is going to
+        * be final while the old one will be freed, so it takes over
+        * the temporary status.
+        *
+        * Also note that we have to transfer the per-node surplus state
+        * here as well otherwise the global surplus count will not match
+        * the per-node's.
+        */
+       if (PageHugeTemporary(newpage)) {
+               int old_nid = page_to_nid(oldpage);
+               int new_nid = page_to_nid(newpage);
+
+               SetPageHugeTemporary(oldpage);
+               ClearPageHugeTemporary(newpage);
+
+               spin_lock(&hugetlb_lock);
+               if (h->surplus_huge_pages_node[old_nid]) {
+                       h->surplus_huge_pages_node[old_nid]--;
+                       h->surplus_huge_pages_node[new_nid]++;
+               }
+               spin_unlock(&hugetlb_lock);
+       }
+}
index b476643..27ddfd2 100644 (file)
@@ -18,7 +18,7 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
 
 static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
 {
-       return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
+       return v->vm_pgoff + vma_pages(v) - 1;
 }
 
 INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
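
vma_pages() is simply the VMA length in pages, so the interval-tree change above is a no-op cleanup. A standalone sketch of the identity, with a toy struct standing in for vm_area_struct and an assumed PAGE_SHIFT:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages for the example */

struct vma { unsigned long vm_start, vm_end, vm_pgoff; };

/* same definition as the kernel helper: length of the VMA in pages */
static unsigned long vma_pages(const struct vma *v)
{
        return (v->vm_end - v->vm_start) >> PAGE_SHIFT;
}

int main(void)
{
        struct vma v = { .vm_start = 0x1000, .vm_end = 0x5000, .vm_pgoff = 3 };

        /* both forms yield 3 + 4 - 1 = 6 */
        printf("%lu == %lu\n",
               v.vm_pgoff + ((v.vm_end - v.vm_start) >> PAGE_SHIFT) - 1,
               v.vm_pgoff + vma_pages(&v) - 1);
        return 0;
}
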
index ea4ff25..b7e2268 100644 (file)
@@ -1399,8 +1399,7 @@ static void collapse_shmem(struct mm_struct *mm,
                }
 
                if (page_mapped(page))
-                       unmap_mapping_range(mapping, index << PAGE_SHIFT,
-                                       PAGE_SIZE, 0);
+                       unmap_mapping_pages(mapping, index, 1, false);
 
                spin_lock_irq(&mapping->tree_lock);
 
@@ -1674,10 +1673,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
        spin_unlock(&khugepaged_mm_lock);
 
        mm = mm_slot->mm;
-       down_read(&mm->mmap_sem);
-       if (unlikely(khugepaged_test_exit(mm)))
-               vma = NULL;
-       else
+       /*
+        * Don't wait for the semaphore (to avoid long wait times).  Just move
+        * to the next mm on the list.
+        */
+       vma = NULL;
+       if (unlikely(!down_read_trylock(&mm->mmap_sem)))
+               goto breakouterloop_mmap_sem;
+       if (likely(!khugepaged_test_exit(mm)))
                vma = find_vma(mm, khugepaged_scan.address);
 
        progress++;
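
The khugepaged hunk above replaces a blocking down_read() with down_read_trylock(): if mmap_sem is contended, the scanner simply moves on to the next mm instead of sleeping. A conceptual userspace analogue of that try-then-skip pattern, with a pthread rwlock standing in for mmap_sem (names and values are purely illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;  /* stand-in only */

static void scan_one_mm(int id)
{
        if (pthread_rwlock_tryrdlock(&mmap_sem) != 0) {
                printf("mm %d busy, skipping\n", id);   /* like goto breakouterloop_mmap_sem */
                return;
        }
        printf("scanning mm %d\n", id);
        pthread_rwlock_unlock(&mmap_sem);
}

int main(void)
{
        for (int i = 0; i < 3; i++)
                scan_one_mm(i);
        return 0;
}

Build with -lpthread; the point is only the control flow, not the locking primitive itself.
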
index f656ca2..e83987c 100644 (file)
@@ -91,7 +91,6 @@
 #include <linux/stacktrace.h>
 #include <linux/cache.h>
 #include <linux/percpu.h>
-#include <linux/hardirq.h>
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
 #include <linux/mmzone.h>
index 9011997..0ae2dc3 100644 (file)
@@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
        return mz;
 }
 
-/*
- * Return page count for single (non recursive) @memcg.
- *
- * Implementation Note: reading percpu statistics for memcg.
- *
- * Both of vmstat[] and percpu_counter has threshold and do periodic
- * synchronization to implement "quick" read. There are trade-off between
- * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronization of counter in memcg's counter.
- *
- * But this _read() function is used for user interface now. The user accounts
- * memory usage by memory cgroup and he _always_ requires exact value because
- * he accounts memory. Even if we provide quick-and-fuzzy read, we always
- * have to visit all online cpus and make sum. So, for now, unnecessary
- * synchronization is not implemented. (just implemented for cpu hotplug)
- *
- * If there are kernel internal actions which can make use of some not-exact
- * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threshold and synchronization as vmstat[] should be
- * implemented.
- *
- * The parameter idx can be of type enum memcg_event_item or vm_event_item.
- */
-
 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
                                      int event)
 {
-       unsigned long val = 0;
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               val += per_cpu(memcg->stat->events[event], cpu);
-       return val;
+       return atomic_long_read(&memcg->events[event]);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -586,27 +557,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
         * counted as CACHE even if it's on ANON LRU.
         */
        if (PageAnon(page))
-               __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
+               __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
        else {
-               __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
+               __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
                if (PageSwapBacked(page))
-                       __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
+                       __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
        }
 
        if (compound) {
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-               __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
+               __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
        }
 
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
-               __this_cpu_inc(memcg->stat->events[PGPGIN]);
+               __count_memcg_events(memcg, PGPGIN, 1);
        else {
-               __this_cpu_inc(memcg->stat->events[PGPGOUT]);
+               __count_memcg_events(memcg, PGPGOUT, 1);
                nr_pages = -nr_pages; /* for event */
        }
 
-       __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+       __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
 }
 
 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
@@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
        unsigned long val, next;
 
-       val = __this_cpu_read(memcg->stat->nr_page_events);
-       next = __this_cpu_read(memcg->stat->targets[target]);
+       val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
+       next = __this_cpu_read(memcg->stat_cpu->targets[target]);
        /* from time_after() in jiffies.h */
        if ((long)(next - val) < 0) {
                switch (target) {
@@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
                default:
                        break;
                }
-               __this_cpu_write(memcg->stat->targets[target], next);
+               __this_cpu_write(memcg->stat_cpu->targets[target], next);
                return true;
        }
        return false;
@@ -1124,7 +1095,7 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
        return false;
 }
 
-unsigned int memcg1_stats[] = {
+static const unsigned int memcg1_stats[] = {
        MEMCG_CACHE,
        MEMCG_RSS,
        MEMCG_RSS_HUGE,
@@ -1205,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
        }
 }
 
-/*
- * This function returns the number of memcg under hierarchy tree. Returns
- * 1(self count) if no children.
- */
-static int mem_cgroup_count_children(struct mem_cgroup *memcg)
-{
-       int num = 0;
-       struct mem_cgroup *iter;
-
-       for_each_mem_cgroup_tree(iter, memcg)
-               num++;
-       return num;
-}
-
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
@@ -1707,11 +1664,6 @@ void unlock_page_memcg(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page_memcg);
 
-/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
- */
-#define CHARGE_BATCH   32U
 struct memcg_stock_pcp {
        struct mem_cgroup *cached; /* this never be root cgroup */
        unsigned int nr_pages;
@@ -1739,7 +1691,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
        unsigned long flags;
        bool ret = false;
 
-       if (nr_pages > CHARGE_BATCH)
+       if (nr_pages > MEMCG_CHARGE_BATCH)
                return ret;
 
        local_irq_save(flags);
@@ -1808,7 +1760,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
        }
        stock->nr_pages += nr_pages;
 
-       if (stock->nr_pages > CHARGE_BATCH)
+       if (stock->nr_pages > MEMCG_CHARGE_BATCH)
                drain_stock(stock);
 
        local_irq_restore(flags);
@@ -1858,9 +1810,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
        struct memcg_stock_pcp *stock;
+       struct mem_cgroup *memcg;
 
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
+
+       for_each_mem_cgroup(memcg) {
+               int i;
+
+               for (i = 0; i < MEMCG_NR_STAT; i++) {
+                       int nid;
+                       long x;
+
+                       x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+                       if (x)
+                               atomic_long_add(x, &memcg->stat[i]);
+
+                       if (i >= NR_VM_NODE_STAT_ITEMS)
+                               continue;
+
+                       for_each_node(nid) {
+                               struct mem_cgroup_per_node *pn;
+
+                               pn = mem_cgroup_nodeinfo(memcg, nid);
+                               x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
+                               if (x)
+                                       atomic_long_add(x, &pn->lruvec_stat[i]);
+                       }
+               }
+
+               for (i = 0; i < MEMCG_NR_EVENTS; i++) {
+                       long x;
+
+                       x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+                       if (x)
+                               atomic_long_add(x, &memcg->events[i]);
+               }
+       }
+
        return 0;
 }
 
@@ -1881,7 +1868,7 @@ static void high_work_func(struct work_struct *work)
        struct mem_cgroup *memcg;
 
        memcg = container_of(work, struct mem_cgroup, high_work);
-       reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
+       reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
 }
 
 /*
@@ -1905,7 +1892,7 @@ void mem_cgroup_handle_over_high(void)
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
 {
-       unsigned int batch = max(CHARGE_BATCH, nr_pages);
+       unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct mem_cgroup *mem_over_limit;
        struct page_counter *counter;
@@ -2415,18 +2402,11 @@ void mem_cgroup_split_huge_fixup(struct page *head)
        for (i = 1; i < HPAGE_PMD_NR; i++)
                head[i].mem_cgroup = head->mem_cgroup;
 
-       __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE],
-                      HPAGE_PMD_NR);
+       __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_MEMCG_SWAP
-static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
-                                      int nr_entries)
-{
-       this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries);
-}
-
 /**
  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
  * @entry: swap entry to be moved
@@ -2450,8 +2430,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
        new_id = mem_cgroup_id(to);
 
        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
-               mem_cgroup_swap_statistics(from, -1);
-               mem_cgroup_swap_statistics(to, 1);
+               mod_memcg_state(from, MEMCG_SWAP, -1);
+               mod_memcg_state(to, MEMCG_SWAP, 1);
                return 0;
        }
        return -EINVAL;
@@ -2467,23 +2447,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 static DEFINE_MUTEX(memcg_limit_mutex);
 
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
-                                  unsigned long limit)
+                                  unsigned long limit, bool memsw)
 {
-       unsigned long curusage;
-       unsigned long oldusage;
        bool enlarge = false;
-       int retry_count;
        int ret;
-
-       /*
-        * For keeping hierarchical_reclaim simple, how long we should retry
-        * is depends on callers. We set our retry-count to be function
-        * of # of children which we should visit in this loop.
-        */
-       retry_count = MEM_CGROUP_RECLAIM_RETRIES *
-                     mem_cgroup_count_children(memcg);
-
-       oldusage = page_counter_read(&memcg->memory);
+       bool limits_invariant;
+       struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
 
        do {
                if (signal_pending(current)) {
@@ -2492,79 +2461,31 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                }
 
                mutex_lock(&memcg_limit_mutex);
-               if (limit > memcg->memsw.limit) {
+               /*
+                * Make sure that the new limit (memsw or memory limit) doesn't
+                * break our basic invariant rule memory.limit <= memsw.limit.
+                */
+               limits_invariant = memsw ? limit >= memcg->memory.limit :
+                                          limit <= memcg->memsw.limit;
+               if (!limits_invariant) {
                        mutex_unlock(&memcg_limit_mutex);
                        ret = -EINVAL;
                        break;
                }
-               if (limit > memcg->memory.limit)
+               if (limit > counter->limit)
                        enlarge = true;
-               ret = page_counter_limit(&memcg->memory, limit);
+               ret = page_counter_limit(counter, limit);
                mutex_unlock(&memcg_limit_mutex);
 
                if (!ret)
                        break;
 
-               try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
-
-               curusage = page_counter_read(&memcg->memory);
-               /* Usage is reduced ? */
-               if (curusage >= oldusage)
-                       retry_count--;
-               else
-                       oldusage = curusage;
-       } while (retry_count);
-
-       if (!ret && enlarge)
-               memcg_oom_recover(memcg);
-
-       return ret;
-}
-
-static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
-                                        unsigned long limit)
-{
-       unsigned long curusage;
-       unsigned long oldusage;
-       bool enlarge = false;
-       int retry_count;
-       int ret;
-
-       /* see mem_cgroup_resize_res_limit */
-       retry_count = MEM_CGROUP_RECLAIM_RETRIES *
-                     mem_cgroup_count_children(memcg);
-
-       oldusage = page_counter_read(&memcg->memsw);
-
-       do {
-               if (signal_pending(current)) {
-                       ret = -EINTR;
+               if (!try_to_free_mem_cgroup_pages(memcg, 1,
+                                       GFP_KERNEL, !memsw)) {
+                       ret = -EBUSY;
                        break;
                }
-
-               mutex_lock(&memcg_limit_mutex);
-               if (limit < memcg->memory.limit) {
-                       mutex_unlock(&memcg_limit_mutex);
-                       ret = -EINVAL;
-                       break;
-               }
-               if (limit > memcg->memsw.limit)
-                       enlarge = true;
-               ret = page_counter_limit(&memcg->memsw, limit);
-               mutex_unlock(&memcg_limit_mutex);
-
-               if (!ret)
-                       break;
-
-               try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
-
-               curusage = page_counter_read(&memcg->memsw);
-               /* Usage is reduced ? */
-               if (curusage >= oldusage)
-                       retry_count--;
-               else
-                       oldusage = curusage;
-       } while (retry_count);
+       } while (true);
 
        if (!ret && enlarge)
                memcg_oom_recover(memcg);
@@ -3020,10 +2941,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                }
                switch (MEMFILE_TYPE(of_cft(of)->private)) {
                case _MEM:
-                       ret = mem_cgroup_resize_limit(memcg, nr_pages);
+                       ret = mem_cgroup_resize_limit(memcg, nr_pages, false);
                        break;
                case _MEMSWAP:
-                       ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
+                       ret = mem_cgroup_resize_limit(memcg, nr_pages, true);
                        break;
                case _KMEM:
                        ret = memcg_update_kmem_limit(memcg, nr_pages);
@@ -4168,8 +4089,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
        if (!pn)
                return 1;
 
-       pn->lruvec_stat = alloc_percpu(struct lruvec_stat);
-       if (!pn->lruvec_stat) {
+       pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+       if (!pn->lruvec_stat_cpu) {
                kfree(pn);
                return 1;
        }
@@ -4187,7 +4108,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
        struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
 
-       free_percpu(pn->lruvec_stat);
+       free_percpu(pn->lruvec_stat_cpu);
        kfree(pn);
 }
 
@@ -4197,7 +4118,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
        for_each_node(node)
                free_mem_cgroup_per_node_info(memcg, node);
-       free_percpu(memcg->stat);
+       free_percpu(memcg->stat_cpu);
        kfree(memcg);
 }
 
@@ -4226,8 +4147,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        if (memcg->id.id < 0)
                goto fail;
 
-       memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
-       if (!memcg->stat)
+       memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
+       if (!memcg->stat_cpu)
                goto fail;
 
        for_each_node(node)
@@ -4584,8 +4505,8 @@ static int mem_cgroup_move_account(struct page *page,
        spin_lock_irqsave(&from->move_lock, flags);
 
        if (!anon && page_mapped(page)) {
-               __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages);
-               __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages);
+               __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
+               __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
        }
 
        /*
@@ -4597,16 +4518,14 @@ static int mem_cgroup_move_account(struct page *page,
                struct address_space *mapping = page_mapping(page);
 
                if (mapping_cap_account_dirty(mapping)) {
-                       __this_cpu_sub(from->stat->count[NR_FILE_DIRTY],
-                                      nr_pages);
-                       __this_cpu_add(to->stat->count[NR_FILE_DIRTY],
-                                      nr_pages);
+                       __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
+                       __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
                }
        }
 
        if (PageWriteback(page)) {
-               __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages);
-               __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages);
+               __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
+               __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
        }
 
        /*
@@ -5642,12 +5561,12 @@ static void uncharge_batch(const struct uncharge_gather *ug)
        }
 
        local_irq_save(flags);
-       __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
-       __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
-       __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
-       __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
-       __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
-       __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+       __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
+       __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
+       __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
+       __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
+       __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
+       __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
        memcg_check_events(ug->memcg, ug->dummy_page);
        local_irq_restore(flags);
 
@@ -5874,7 +5793,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
        if (in_softirq())
                gfp_mask = GFP_NOWAIT;
 
-       this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
+       mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
 
        if (try_charge(memcg, gfp_mask, nr_pages) == 0)
                return true;
@@ -5895,7 +5814,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
                return;
        }
 
-       this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
+       mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
 
        refill_stock(memcg, nr_pages);
 }
@@ -6019,7 +5938,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
        oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
                                   nr_entries);
        VM_BUG_ON_PAGE(oldid, page);
-       mem_cgroup_swap_statistics(swap_memcg, nr_entries);
+       mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
 
        page->mem_cgroup = NULL;
 
@@ -6085,7 +6004,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
                mem_cgroup_id_get_many(memcg, nr_pages - 1);
        oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
        VM_BUG_ON_PAGE(oldid, page);
-       mem_cgroup_swap_statistics(memcg, nr_pages);
+       mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
 
        return 0;
 }
@@ -6113,7 +6032,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
                        else
                                page_counter_uncharge(&memcg->memsw, nr_pages);
                }
-               mem_cgroup_swap_statistics(memcg, -nr_pages);
+               mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
                mem_cgroup_id_put_many(memcg, nr_pages);
        }
        rcu_read_unlock();
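
Several memcontrol.c hunks above switch from summing raw per-cpu counters on every read to keeping small per-cpu deltas that get folded into global atomics (see memcg_hotplug_cpu_dead()). A conceptual, userspace-only sketch of that fold, using plain arrays and C11 atomics (all names are illustrative, none of this is kernel API):

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4

static long percpu_events[NR_CPUS];     /* stands in for memcg->stat_cpu deltas */
static atomic_long global_events;       /* stands in for memcg->events[] */

/* fold one departing CPU's pending delta into the global counter */
static void cpu_dead(int cpu)
{
        long x = percpu_events[cpu];    /* this_cpu_xchg(..., 0) analogue */

        percpu_events[cpu] = 0;
        if (x)
                atomic_fetch_add(&global_events, x);
}

int main(void)
{
        percpu_events[1] = 5;
        percpu_events[3] = 7;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                cpu_dead(cpu);
        printf("events = %ld\n", atomic_load(&global_events));  /* 12 */
        return 0;
}
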
index 7930046..53373b7 100644 (file)
@@ -400,10 +400,17 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 
 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 
-/* tlb_gather_mmu
- *     Called to initialize an (on-stack) mmu_gather structure for page-table
- *     tear-down from @mm. The @fullmm argument is used when @mm is without
- *     users and we're going to destroy the full address space (exit/execve).
+/**
+ * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @start and @end are set to 0 and -1
+ * respectively when @mm is without users and we're going to destroy
+ * the full address space (exit/execve).
  */
 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                        unsigned long start, unsigned long end)
@@ -2791,9 +2798,38 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
        }
 }
 
+/**
+ * unmap_mapping_pages() - Unmap pages from processes.
+ * @mapping: The address space containing pages to be unmapped.
+ * @start: Index of first page to be unmapped.
+ * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
+ * @even_cows: Whether to unmap even private COWed pages.
+ *
+ * Unmap the pages in this address space from any userspace process which
+ * has them mmaped.  Generally, you want to remove COWed pages as well when
+ * a file is being truncated, but not when invalidating pages from the page
+ * cache.
+ */
+void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
+               pgoff_t nr, bool even_cows)
+{
+       struct zap_details details = { };
+
+       details.check_mapping = even_cows ? NULL : mapping;
+       details.first_index = start;
+       details.last_index = start + nr - 1;
+       if (details.last_index < details.first_index)
+               details.last_index = ULONG_MAX;
+
+       i_mmap_lock_write(mapping);
+       if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+               unmap_mapping_range_tree(&mapping->i_mmap, &details);
+       i_mmap_unlock_write(mapping);
+}
+
 /**
  * unmap_mapping_range - unmap the portion of all mmaps in the specified
- * address_space corresponding to the specified page range in the underlying
+ * address_space corresponding to the specified byte range in the underlying
  * file.
  *
  * @mapping: the address space containing mmaps to be unmapped.
@@ -2811,7 +2847,6 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
 void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows)
 {
-       struct zap_details details = { };
        pgoff_t hba = holebegin >> PAGE_SHIFT;
        pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
@@ -2823,16 +2858,7 @@ void unmap_mapping_range(struct address_space *mapping,
                        hlen = ULONG_MAX - hba + 1;
        }
 
-       details.check_mapping = even_cows ? NULL : mapping;
-       details.first_index = hba;
-       details.last_index = hba + hlen - 1;
-       if (details.last_index < details.first_index)
-               details.last_index = ULONG_MAX;
-
-       i_mmap_lock_write(mapping);
-       if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
-               unmap_mapping_range_tree(&mapping->i_mmap, &details);
-       i_mmap_unlock_write(mapping);
+       unmap_mapping_pages(mapping, hba, hlen, even_cows);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
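
unmap_mapping_range() above is reduced to converting its byte range into a page range and delegating to the new unmap_mapping_pages(). The conversion it performs, shown as a standalone sketch (a 4 KiB page size is assumed):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        unsigned long holebegin = 5000, holelen = 10000;        /* arbitrary byte range */
        unsigned long hba  = holebegin >> PAGE_SHIFT;           /* first page index */
        unsigned long hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;  /* page count, rounded up */

        printf("unmap pages [%lu, %lu)\n", hba, hba + hlen);    /* [1, 4) with these values */
        return 0;
}
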
 
@@ -3485,9 +3511,8 @@ static int fault_around_bytes_get(void *data, u64 *val)
 }
 
 /*
- * fault_around_pages() and fault_around_mask() expects fault_around_bytes
- * rounded down to nearest page order. It's what do_fault_around() expects to
- * see.
+ * fault_around_bytes must be rounded down to the nearest page order as it's
+ * what do_fault_around() expects to see.
  */
 static int fault_around_bytes_set(void *data, u64 val)
 {
@@ -3530,13 +3555,14 @@ late_initcall(fault_around_debugfs);
  * This function doesn't cross the VMA boundaries, in order to call map_pages()
  * only once.
  *
- * fault_around_pages() defines how many pages we'll try to map.
- * do_fault_around() expects it to return a power of two less than or equal to
- * PTRS_PER_PTE.
+ * fault_around_bytes defines how many bytes we'll try to map.
+ * do_fault_around() expects it to be set to a power of two less than or equal
+ * to PTRS_PER_PTE.
  *
- * The virtual address of the area that we map is naturally aligned to the
- * fault_around_pages() value (and therefore to page order).  This way it's
- * easier to guarantee that we don't cross page table boundaries.
+ * The virtual address of the area that we map is naturally aligned to
+ * fault_around_bytes rounded down to the machine page size
+ * (and therefore to page order).  This way it's easier to guarantee
+ * that we don't cross page table boundaries.
  */
 static int do_fault_around(struct vm_fault *vmf)
 {
@@ -3553,8 +3579,8 @@ static int do_fault_around(struct vm_fault *vmf)
        start_pgoff -= off;
 
        /*
-        *  end_pgoff is either end of page table or end of vma
-        *  or fault_around_pages() from start_pgoff, depending what is nearest.
+        *  end_pgoff is either the end of the page table, the end of
+        *  the vma, or nr_pages from start_pgoff, depending on what is nearest.
         */
        end_pgoff = start_pgoff -
                ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
index c52aa05..9bbd698 100644 (file)
@@ -184,7 +184,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
        for (i = 0; i < mapsize; i++, page++)
                get_page_bootmem(section_nr, page, SECTION_INFO);
 
-       usemap = __nr_to_section(section_nr)->pageblock_flags;
+       usemap = ms->pageblock_flags;
        page = virt_to_page(usemap);
 
        mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
@@ -200,9 +200,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
        struct mem_section *ms;
        struct page *page, *memmap;
 
-       if (!pfn_valid(start_pfn))
-               return;
-
        section_nr = pfn_to_section_nr(start_pfn);
        ms = __nr_to_section(section_nr);
 
@@ -210,7 +207,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 
        register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
 
-       usemap = __nr_to_section(section_nr)->pageblock_flags;
+       usemap = ms->pageblock_flags;
        page = virt_to_page(usemap);
 
        mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
@@ -1637,7 +1634,7 @@ repeat:
                goto failed_removal;
 
        cond_resched();
-       lru_add_drain_all_cpuslocked();
+       lru_add_drain_all();
        drain_all_pages(zone);
 
        pfn = scan_movable_pages(start_pfn, end_pfn);
index 4ce44d3..d879f1d 100644 (file)
@@ -1121,8 +1121,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
        }
 
        if (PageHuge(page)) {
-               BUG_ON(!vma);
-               return alloc_huge_page_noerr(vma, address, 1);
+               return alloc_huge_page_vma(page_hstate(compound_head(page)),
+                               vma, address);
        } else if (thp_migration_supported() && PageTransHuge(page)) {
                struct page *thp;
 
@@ -1263,6 +1263,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
 {
        unsigned long k;
+       unsigned long t;
        unsigned long nlongs;
        unsigned long endmask;
 
@@ -1279,13 +1280,17 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
        else
                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 
-       /* When the user specified more nodes than supported just check
-          if the non supported part is all zero. */
+       /*
+        * When the user specified more nodes than supported, just check
+        * that the unsupported part is all zero.
+        *
+        * If maxnode has more longs than MAX_NUMNODES, check the bits in
+        * that area first, and then go through the remaining bits, which
+        * are equal to or bigger than MAX_NUMNODES.
+        * Otherwise, just check bits [MAX_NUMNODES, maxnode).
+        */
        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
-               if (nlongs > PAGE_SIZE/sizeof(long))
-                       return -EINVAL;
                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
-                       unsigned long t;
                        if (get_user(t, nmask + k))
                                return -EFAULT;
                        if (k == nlongs - 1) {
@@ -1298,6 +1303,16 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                endmask = ~0UL;
        }
 
+       if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
+               unsigned long valid_mask = endmask;
+
+               valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
+               if (get_user(t, nmask + nlongs - 1))
+                       return -EFAULT;
+               if (t & valid_mask)
+                       return -EINVAL;
+       }
+
        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
                return -EFAULT;
        nodes_addr(*nodes)[nlongs-1] &= endmask;
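
The check added to get_nodes() above rejects user bitmaps that set bits at or above MAX_NUMNODES inside the last in-range long when MAX_NUMNODES is not long-aligned. A toy sketch of the mask arithmetic (the constants are chosen purely for illustration):

#include <stdio.h>

#define BITS_PER_LONG 64UL      /* assumed LP64 for the example */
#define MAX_NUMNODES  10UL      /* deliberately not long-aligned */

int main(void)
{
        unsigned long maxnode = 64;     /* bitmap width the caller claims */
        unsigned long endmask = (maxnode % BITS_PER_LONG) ?
                (1UL << (maxnode % BITS_PER_LONG)) - 1 : ~0UL;
        /* mirrors the patch's valid_mask: bits of the last word that must be clear */
        unsigned long valid_mask = endmask &
                ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
        unsigned long last_word = 1UL << 12;    /* node 12: beyond MAX_NUMNODES */

        puts((last_word & valid_mask) ? "EINVAL" : "ok");
        return 0;
}
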
@@ -1418,10 +1433,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                goto out_put;
        }
 
-       if (!nodes_subset(*new, node_states[N_MEMORY])) {
-               err = -EINVAL;
+       task_nodes = cpuset_mems_allowed(current);
+       nodes_and(*new, *new, task_nodes);
+       if (nodes_empty(*new))
+               goto out_put;
+
+       nodes_and(*new, *new, node_states[N_MEMORY]);
+       if (nodes_empty(*new))
                goto out_put;
-       }
 
        err = security_task_movememory(task);
        if (err)
index 4d0be47..1e5525a 100644 (file)
@@ -1323,9 +1323,8 @@ put_anon:
                put_anon_vma(anon_vma);
 
        if (rc == MIGRATEPAGE_SUCCESS) {
-               hugetlb_cgroup_migrate(hpage, new_hpage);
+               move_hugetlb_state(hpage, new_hpage, reason);
                put_new_page = NULL;
-               set_page_owner_migrate_reason(new_hpage, reason);
        }
 
        unlock_page(hpage);
index 96edb33..eff6b88 100644 (file)
@@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
 
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+       struct mmu_notifier *mn;
+       int id;
+       bool ret = false;
+
+       WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+
+       if (!mm_has_notifiers(mm))
+               return ret;
+
+       id = srcu_read_lock(&srcu);
+       hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+               if (!mn->ops->invalidate_range &&
+                   !mn->ops->invalidate_range_start &&
+                   !mn->ops->invalidate_range_end)
+                               continue;
+
+               if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
+                       ret = true;
+                       break;
+               }
+       }
+       srcu_read_unlock(&srcu, id);
+       return ret;
+}
+
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
                                    struct mm_struct *mm,
                                    int take_mmap_sem)
index 58b629b..e3309fc 100644 (file)
@@ -84,6 +84,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                if (!page || PageKsm(page))
                                        continue;
 
+                               /* Also skip shared copy-on-write pages */
+                               if (is_cow_mapping(vma->vm_flags) &&
+                                   page_mapcount(page) != 1)
+                                       continue;
+
                                /* Avoid TLB flush if possible */
                                if (pte_protnone(oldpte))
                                        continue;
index 17c00d9..4b9864b 100644 (file)
@@ -1788,13 +1788,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
        return -ENOMEM;
 }
 
-void unmap_mapping_range(struct address_space *mapping,
-                        loff_t const holebegin, loff_t const holelen,
-                        int even_cows)
-{
-}
-EXPORT_SYMBOL(unmap_mapping_range);
-
 int filemap_fault(struct vm_fault *vmf)
 {
        BUG();
index 29f8555..f2e7dfb 100644 (file)
@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
        }
 
        /*
-        * If the mm has notifiers then we would need to invalidate them around
-        * unmap_page_range and that is risky because notifiers can sleep and
-        * what they do is basically undeterministic.  So let's have a short
+        * If the mm has invalidate_{start,end}() notifiers that could block,
         * sleep to give the oom victim some more time.
         * TODO: we really want to get rid of this ugly hack and make sure that
-        * notifiers cannot block for unbounded amount of time and add
-        * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+        * notifiers cannot block for an unbounded amount of time.
         */
-       if (mm_has_notifiers(mm)) {
+       if (mm_has_blockable_invalidate_notifiers(mm)) {
                up_read(&mm->mmap_sem);
                schedule_timeout_idle(HZ);
                goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
                 * count elevated without a good reason.
                 */
                if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-                       tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
-                       unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
-                                        NULL);
-                       tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+                       const unsigned long start = vma->vm_start;
+                       const unsigned long end = vma->vm_end;
+
+                       tlb_gather_mmu(&tlb, mm, start, end);
+                       mmu_notifier_invalidate_range_start(mm, start, end);
+                       unmap_page_range(&tlb, vma, start, end, NULL);
+                       mmu_notifier_invalidate_range_end(mm, start, end);
+                       tlb_finish_mmu(&tlb, start, end);
                }
        }
        pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
index 76c9688..c7dd9c8 100644 (file)
@@ -293,7 +293,7 @@ int page_group_by_mobility_disabled __read_mostly;
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 
 /*
- * Determine how many pages need to be initialized durig early boot
+ * Determine how many pages need to be initialized during early boot
  * (non-deferred initialization).
  * The value of first_deferred_pfn will be set later, once non-deferred pages
  * are initialized, but for now set it ULONG_MAX.
@@ -344,7 +344,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
                                unsigned long pfn, unsigned long zone_end,
                                unsigned long *nr_initialised)
 {
-       /* Always populate low zones for address-contrained allocations */
+       /* Always populate low zones for address-constrained allocations */
        if (zone_end < pgdat_end_pfn(pgdat))
                return true;
        (*nr_initialised)++;
@@ -1177,9 +1177,10 @@ static void free_one_page(struct zone *zone,
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
-                               unsigned long zone, int nid)
+                               unsigned long zone, int nid, bool zero)
 {
-       mm_zero_struct_page(page);
+       if (zero)
+               mm_zero_struct_page(page);
        set_page_links(page, zone, nid, pfn);
        init_page_count(page);
        page_mapcount_reset(page);
@@ -1194,9 +1195,9 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 }
 
 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
-                                       int nid)
+                                       int nid, bool zero)
 {
-       return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+       return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1217,7 +1218,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
                if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
                        break;
        }
-       __init_single_pfn(pfn, zid, nid);
+       __init_single_pfn(pfn, zid, nid, true);
 }
 #else
 static inline void init_reserved_page(unsigned long pfn)
@@ -1457,92 +1458,87 @@ static inline void __init pgdat_init_report_one_done(void)
 }
 
 /*
- * Helper for deferred_init_range, free the given range, reset the counters, and
- * return number of pages freed.
+ * Returns true if page needs to be initialized or freed to buddy allocator.
+ *
+ * First we check if pfn is valid on architectures where it is possible to have
+ * holes within pageblock_nr_pages. On systems where it is not possible, this
+ * function is optimized out.
+ *
+ * Then, we check if a current large page is valid by only checking the validity
+ * of the head pfn.
+ *
+ * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
+ * within a node: a pfn is between start and end of a node, but does not belong
+ * to this memory node.
  */
-static inline unsigned long __init __def_free(unsigned long *nr_free,
-                                             unsigned long *free_base_pfn,
-                                             struct page **page)
+static inline bool __init
+deferred_pfn_valid(int nid, unsigned long pfn,
+                  struct mminit_pfnnid_cache *nid_init_state)
 {
-       unsigned long nr = *nr_free;
+       if (!pfn_valid_within(pfn))
+               return false;
+       if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
+               return false;
+       if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
+               return false;
+       return true;
+}
 
-       deferred_free_range(*free_base_pfn, nr);
-       *free_base_pfn = 0;
-       *nr_free = 0;
-       *page = NULL;
+/*
+ * Free pages to buddy allocator. Try to free aligned pages in
+ * pageblock_nr_pages sizes.
+ */
+static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+                                      unsigned long end_pfn)
+{
+       struct mminit_pfnnid_cache nid_init_state = { };
+       unsigned long nr_pgmask = pageblock_nr_pages - 1;
+       unsigned long nr_free = 0;
 
-       return nr;
+       for (; pfn < end_pfn; pfn++) {
+               if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+                       deferred_free_range(pfn - nr_free, nr_free);
+                       nr_free = 0;
+               } else if (!(pfn & nr_pgmask)) {
+                       deferred_free_range(pfn - nr_free, nr_free);
+                       nr_free = 1;
+                       cond_resched();
+               } else {
+                       nr_free++;
+               }
+       }
+       /* Free the last block of pages to allocator */
+       deferred_free_range(pfn - nr_free, nr_free);
 }
 
-static unsigned long __init deferred_init_range(int nid, int zid,
-                                               unsigned long start_pfn,
-                                               unsigned long end_pfn)
+/*
+ * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
+ * by performing them only once per pageblock_nr_pages.
+ * Return number of pages initialized.
+ */
+static unsigned long  __init deferred_init_pages(int nid, int zid,
+                                                unsigned long pfn,
+                                                unsigned long end_pfn)
 {
        struct mminit_pfnnid_cache nid_init_state = { };
        unsigned long nr_pgmask = pageblock_nr_pages - 1;
-       unsigned long free_base_pfn = 0;
        unsigned long nr_pages = 0;
-       unsigned long nr_free = 0;
        struct page *page = NULL;
-       unsigned long pfn;
 
-       /*
-        * First we check if pfn is valid on architectures where it is possible
-        * to have holes within pageblock_nr_pages. On systems where it is not
-        * possible, this function is optimized out.
-        *
-        * Then, we check if a current large page is valid by only checking the
-        * validity of the head pfn.
-        *
-        * meminit_pfn_in_nid is checked on systems where pfns can interleave
-        * within a node: a pfn is between start and end of a node, but does not
-        * belong to this memory node.
-        *
-        * Finally, we minimize pfn page lookups and scheduler checks by
-        * performing it only once every pageblock_nr_pages.
-        *
-        * We do it in two loops: first we initialize struct page, than free to
-        * buddy allocator, becuse while we are freeing pages we can access
-        * pages that are ahead (computing buddy page in __free_one_page()).
-        */
-       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-               if (!pfn_valid_within(pfn))
+       for (; pfn < end_pfn; pfn++) {
+               if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+                       page = NULL;
                        continue;
-               if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
-                       if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
-                               if (page && (pfn & nr_pgmask))
-                                       page++;
-                               else
-                                       page = pfn_to_page(pfn);
-                               __init_single_page(page, pfn, zid, nid);
-                               cond_resched();
-                       }
-               }
-       }
-
-       page = NULL;
-       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-               if (!pfn_valid_within(pfn)) {
-                       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-               } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
-                       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-               } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
-                       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-               } else if (page && (pfn & nr_pgmask)) {
-                       page++;
-                       nr_free++;
-               } else {
-                       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+               } else if (!page || !(pfn & nr_pgmask)) {
                        page = pfn_to_page(pfn);
-                       free_base_pfn = pfn;
-                       nr_free = 1;
                        cond_resched();
+               } else {
+                       page++;
                }
+               __init_single_page(page, pfn, zid, nid, true);
+               nr_pages++;
        }
-       /* Free the last block of pages to allocator */
-       nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-
-       return nr_pages;
+       return nr_pages;
 }
 
 /* Initialise remaining memory on a node */
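
deferred_free_pages() above accumulates runs of contiguous valid pfns and flushes a run either when it hits an invalid pfn or when it crosses a pageblock boundary. A toy, userspace-only walk-through of that batching loop (the block size and the position of the hole are made up):

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 4UL          /* toy block size */

static bool toy_pfn_valid(unsigned long pfn)
{
        return pfn != 6;                /* pretend pfn 6 is a hole */
}

static void flush(unsigned long pfn, unsigned long nr)
{
        if (nr)
                printf("free [%lu, %lu)\n", pfn, pfn + nr);
}

int main(void)
{
        unsigned long nr_pgmask = PAGEBLOCK_NR_PAGES - 1;
        unsigned long nr_free = 0, pfn;

        for (pfn = 0; pfn < 10; pfn++) {
                if (!toy_pfn_valid(pfn)) {              /* invalid pfn: flush the run */
                        flush(pfn - nr_free, nr_free);
                        nr_free = 0;
                } else if (!(pfn & nr_pgmask)) {        /* block boundary: flush, start new run */
                        flush(pfn - nr_free, nr_free);
                        nr_free = 1;
                } else {
                        nr_free++;
                }
        }
        flush(pfn - nr_free, nr_free);                  /* free the last block */
        return 0;
}
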
@@ -1582,10 +1578,21 @@ static int __init deferred_init_memmap(void *data)
        }
        first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
 
+       /*
+        * Initialize and free pages. We do it in two loops: first we initialize
+        * struct page, then free them to the buddy allocator, because while
+        * we are freeing pages we can access pages that are ahead (computing
+        * the buddy page in __free_one_page()).
+        */
+       for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+               spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+               epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+               nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
+       }
        for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
                spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
                epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
-               nr_pages += deferred_init_range(nid, zid, spfn, epfn);
+               deferred_free_pages(nid, zid, spfn, epfn);
        }
 
        /* Sanity check that the next zone really is unpopulated */
@@ -3391,7 +3398,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        if (gfp_mask & __GFP_THISNODE)
                goto out;
 
-       /* Exhausted what can be done so it's blamo time */
+       /* Exhausted what can be done so it's blame time */
        if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
                *did_some_progress = 1;
 
@@ -4272,7 +4279,7 @@ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
        struct page *page;
 
        /*
-        * __get_free_pages() returns a 32-bit address, which cannot represent
+        * __get_free_pages() returns a virtual address, which cannot represent
         * a highmem page
         */
        VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
@@ -5393,15 +5400,20 @@ not_early:
                 * can be created for invalid pages (for alignment)
                 * check here not to call set_pageblock_migratetype() against
                 * pfn out of zone.
+                *
+                * Please note that the MEMMAP_HOTPLUG path doesn't clear the
+                * memmap because this is done early in sparse_add_one_section().
                 */
                if (!(pfn & (pageblock_nr_pages - 1))) {
                        struct page *page = pfn_to_page(pfn);
 
-                       __init_single_page(page, pfn, zone, nid);
+                       __init_single_page(page, pfn, zone, nid,
+                                       context != MEMMAP_HOTPLUG);
                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);
                        cond_resched();
                } else {
-                       __init_single_pfn(pfn, zone, nid);
+                       __init_single_pfn(pfn, zone, nid,
+                                       context != MEMMAP_HOTPLUG);
                }
        }
 }
index 2c16216..5295ef3 100644 (file)
@@ -59,7 +59,9 @@
  */
 
 static struct page_ext_operations *page_ext_ops[] = {
+#ifdef CONFIG_DEBUG_PAGEALLOC
        &debug_guardpage_ops,
+#endif
 #ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
 #endif
index 270a821..9886c60 100644 (file)
@@ -528,21 +528,18 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 
 static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 {
-       struct page *page;
-       struct page_ext *page_ext;
-       unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
-       unsigned long end_pfn = pfn + zone->spanned_pages;
+       unsigned long pfn = zone->zone_start_pfn;
+       unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long count = 0;
 
-       /* Scan block by block. First and last block may be incomplete */
-       pfn = zone->zone_start_pfn;
-
        /*
         * Walk the zone in pageblock_nr_pages steps. If a page block spans
         * a zone boundary, it will be double counted between zones. This does
         * not matter as the mixed block count will still be correct
         */
        for (; pfn < end_pfn; ) {
+               unsigned long block_end_pfn;
+
                if (!pfn_valid(pfn)) {
                        pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
                        continue;
@@ -551,9 +548,10 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
                block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
                block_end_pfn = min(block_end_pfn, end_pfn);
 
-               page = pfn_to_page(pfn);
-
                for (; pfn < block_end_pfn; pfn++) {
+                       struct page *page;
+                       struct page_ext *page_ext;
+
                        if (!pfn_valid_within(pfn))
                                continue;
 
@@ -635,9 +633,7 @@ static int __init pageowner_init(void)
 
        dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
                        NULL, &proc_page_owner_operations);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
 
-       return 0;
+       return PTR_ERR_OR_ZERO(dentry);
 }
 late_initcall(pageowner_init)
index 1e4ee76..cf2af04 100644 (file)
@@ -181,12 +181,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 #endif
 
 #ifndef __HAVE_ARCH_PMDP_INVALIDATE
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
 {
-       pmd_t entry = *pmdp;
-       set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
+       pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       return old;
 }
 #endif
 
index 7fbe67b..1907688 100644 (file)
@@ -2717,15 +2717,28 @@ continue_resched:
        return error;
 }
 
+static unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+       if (file->f_op == &shmem_file_operations)
+               return &SHMEM_I(file_inode(file))->seals;
+
+#ifdef CONFIG_HUGETLBFS
+       if (file->f_op == &hugetlbfs_file_operations)
+               return &HUGETLBFS_I(file_inode(file))->seals;
+#endif
+
+       return NULL;
+}
+
 #define F_ALL_SEALS (F_SEAL_SEAL | \
                     F_SEAL_SHRINK | \
                     F_SEAL_GROW | \
                     F_SEAL_WRITE)
 
-int shmem_add_seals(struct file *file, unsigned int seals)
+static int memfd_add_seals(struct file *file, unsigned int seals)
 {
        struct inode *inode = file_inode(file);
-       struct shmem_inode_info *info = SHMEM_I(inode);
+       unsigned int *file_seals;
        int error;
 
        /*
@@ -2758,8 +2771,6 @@ int shmem_add_seals(struct file *file, unsigned int seals)
         * other file types.
         */
 
-       if (file->f_op != &shmem_file_operations)
-               return -EINVAL;
        if (!(file->f_mode & FMODE_WRITE))
                return -EPERM;
        if (seals & ~(unsigned int)F_ALL_SEALS)
@@ -2767,12 +2778,18 @@ int shmem_add_seals(struct file *file, unsigned int seals)
 
        inode_lock(inode);
 
-       if (info->seals & F_SEAL_SEAL) {
+       file_seals = memfd_file_seals_ptr(file);
+       if (!file_seals) {
+               error = -EINVAL;
+               goto unlock;
+       }
+
+       if (*file_seals & F_SEAL_SEAL) {
                error = -EPERM;
                goto unlock;
        }
 
-       if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
+       if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
                error = mapping_deny_writable(file->f_mapping);
                if (error)
                        goto unlock;
@@ -2784,25 +2801,22 @@ int shmem_add_seals(struct file *file, unsigned int seals)
                }
        }
 
-       info->seals |= seals;
+       *file_seals |= seals;
        error = 0;
 
 unlock:
        inode_unlock(inode);
        return error;
 }
-EXPORT_SYMBOL_GPL(shmem_add_seals);
 
-int shmem_get_seals(struct file *file)
+static int memfd_get_seals(struct file *file)
 {
-       if (file->f_op != &shmem_file_operations)
-               return -EINVAL;
+       unsigned int *seals = memfd_file_seals_ptr(file);
 
-       return SHMEM_I(file_inode(file))->seals;
+       return seals ? *seals : -EINVAL;
 }
-EXPORT_SYMBOL_GPL(shmem_get_seals);
 
-long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        long error;
 
@@ -2812,10 +2826,10 @@ long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
                if (arg > UINT_MAX)
                        return -EINVAL;
 
-               error = shmem_add_seals(file, arg);
+               error = memfd_add_seals(file, arg);
                break;
        case F_GET_SEALS:
-               error = shmem_get_seals(file);
+               error = memfd_get_seals(file);
                break;
        default:
                error = -EINVAL;
@@ -3657,7 +3671,7 @@ SYSCALL_DEFINE2(memfd_create,
                const char __user *, uname,
                unsigned int, flags)
 {
-       struct shmem_inode_info *info;
+       unsigned int *file_seals;
        struct file *file;
        int fd, error;
        char *name;
@@ -3667,9 +3681,6 @@ SYSCALL_DEFINE2(memfd_create,
                if (flags & ~(unsigned int)MFD_ALL_FLAGS)
                        return -EINVAL;
        } else {
-               /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
-               if (flags & MFD_ALLOW_SEALING)
-                       return -EINVAL;
                /* Allow huge page size encoding in flags. */
                if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
                                (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
@@ -3722,12 +3733,8 @@ SYSCALL_DEFINE2(memfd_create,
        file->f_flags |= O_RDWR | O_LARGEFILE;
 
        if (flags & MFD_ALLOW_SEALING) {
-               /*
-                * flags check at beginning of function ensures
-                * this is not a hugetlbfs (MFD_HUGETLB) file.
-                */
-               info = SHMEM_I(file_inode(file));
-               info->seals &= ~F_SEAL_SEAL;
+               file_seals = memfd_file_seals_ptr(file);
+               *file_seals &= ~F_SEAL_SEAL;
        }
 
        fd_install(fd, file);
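
A short userspace sketch of what this enables, assuming a libc new enough to expose memfd_create() and the MFD_*/F_SEAL_* constants (otherwise include <linux/memfd.h> and call syscall(__NR_memfd_create, ...) as the selftests' common.c below does): sealing a hugetlbfs-backed memfd, which the removed check used to reject at create time.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* A sealable hugetlbfs memfd: rejected with -EINVAL before this change. */
        int fd = memfd_create("huge-sealed", MFD_HUGETLB | MFD_ALLOW_SEALING);

        if (fd < 0) {
                perror("memfd_create");
                return 1;
        }
        if (fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW) < 0)
                perror("F_ADD_SEALS");
        printf("seals: 0x%x\n", fcntl(fd, F_GET_SEALS));
        close(fd);
        return 0;
}
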
index 4e51ef9..2269062 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1316,8 +1316,6 @@ void __init kmem_cache_init_late(void)
 {
        struct kmem_cache *cachep;
 
-       slab_state = UP;
-
        /* 6) resize the head arrays to their final sizes */
        mutex_lock(&slab_mutex);
        list_for_each_entry(cachep, &slab_caches, list)
@@ -1353,8 +1351,6 @@ static int __init cpucache_init(void)
                                slab_online_cpu, slab_offline_cpu);
        WARN_ON(ret < 0);
 
-       /* Done! */
-       slab_state = FULL;
        return 0;
 }
 __initcall(cpucache_init);
index ad657ff..e8e2095 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -78,9 +78,6 @@ extern const struct kmalloc_info_struct {
        unsigned long size;
 } kmalloc_info[];
 
-unsigned long calculate_alignment(slab_flags_t flags,
-               unsigned long align, unsigned long size);
-
 #ifndef CONFIG_SLOB
 /* Kmalloc array related functions */
 void setup_kmalloc_cache_index_table(void);
index c8cb367..deeddf9 100644 (file)
@@ -267,6 +267,35 @@ static inline void memcg_unlink_cache(struct kmem_cache *s)
 }
 #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
 
+/*
+ * Figure out what the alignment of the objects will be given a set of
+ * flags, a user specified alignment and the size of the objects.
+ */
+static unsigned long calculate_alignment(unsigned long flags,
+               unsigned long align, unsigned long size)
+{
+       /*
+        * If the user wants hardware cache aligned objects then follow that
+        * suggestion if the object is sufficiently large.
+        *
+        * The hardware cache alignment cannot override the specified
+        * alignment though. If that is greater then use it.
+        */
+       if (flags & SLAB_HWCACHE_ALIGN) {
+               unsigned long ralign;
+
+               ralign = cache_line_size();
+               while (size <= ralign / 2)
+                       ralign /= 2;
+               align = max(align, ralign);
+       }
+
+       if (align < ARCH_SLAB_MINALIGN)
+               align = ARCH_SLAB_MINALIGN;
+
+       return ALIGN(align, sizeof(void *));
+}
+
 /*
  * Find a mergeable slab cache
  */
@@ -337,33 +366,6 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
        return NULL;
 }
 
-/*
- * Figure out what the alignment of the objects will be given a set of
- * flags, a user specified alignment and the size of the objects.
- */
-unsigned long calculate_alignment(slab_flags_t flags,
-               unsigned long align, unsigned long size)
-{
-       /*
-        * If the user wants hardware cache aligned objects then follow that
-        * suggestion if the object is sufficiently large.
-        *
-        * The hardware cache alignment cannot override the specified
-        * alignment though. If that is greater then use it.
-        */
-       if (flags & SLAB_HWCACHE_ALIGN) {
-               unsigned long ralign = cache_line_size();
-               while (size <= ralign / 2)
-                       ralign /= 2;
-               align = max(align, ralign);
-       }
-
-       if (align < ARCH_SLAB_MINALIGN)
-               align = ARCH_SLAB_MINALIGN;
-
-       return ALIGN(align, sizeof(void *));
-}
-
 static struct kmem_cache *create_cache(const char *name,
                size_t object_size, size_t size, size_t align,
                slab_flags_t flags, void (*ctor)(void *),
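
A worked example of the calculation moved above (logic unchanged): with cache_line_size() == 64, object size 24 and requested align 8, SLAB_HWCACHE_ALIGN halves ralign while the object still fits in half of it (64 -> 32), so the effective alignment is 32. A standalone userspace sketch of the same arithmetic:

#include <stdio.h>

/* Standalone copy of the calculation, for illustration only. */
static unsigned long calc_align(int hwcache_align, unsigned long cache_line,
                                unsigned long align, unsigned long size)
{
        if (hwcache_align) {
                unsigned long ralign = cache_line;

                while (size <= ralign / 2)
                        ralign /= 2;
                if (ralign > align)
                        align = ralign;
        }
        if (align < sizeof(void *))     /* stand-in for ARCH_SLAB_MINALIGN */
                align = sizeof(void *);
        return (align + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
}

int main(void)
{
        /* 24-byte objects, 64-byte cache lines, requested align 8 -> 32 */
        printf("%lu\n", calc_align(1, 64, 8, 24));
        return 0;
}
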
index cfd56e5..693b707 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -838,6 +838,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
        u8 *start;
        u8 *fault;
        u8 *end;
+       u8 *pad;
        int length;
        int remainder;
 
@@ -851,8 +852,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
        if (!remainder)
                return 1;
 
+       pad = end - remainder;
        metadata_access_enable();
-       fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+       fault = memchr_inv(pad, POISON_INUSE, remainder);
        metadata_access_disable();
        if (!fault)
                return 1;
@@ -860,9 +862,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
                end--;
 
        slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
-       print_section(KERN_ERR, "Padding ", end - remainder, remainder);
+       print_section(KERN_ERR, "Padding ", pad, remainder);
 
-       restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
+       restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
        return 0;
 }
 
@@ -2220,9 +2222,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 
 /*
  * Put a page that was just frozen (in __slab_free) into a partial page
- * slot if available. This is done without interrupts disabled and without
- * preemption disabled. The cmpxchg is racy and may put the partial page
- * onto a random cpus partial slot.
+ * slot if available.
  *
  * If we did not find a slot then simply move all the partials to the
  * per node partial list.
index 2609aba..6b8b5e9 100644 (file)
@@ -264,7 +264,11 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
  */
 static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
 {
-       return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+       unsigned long coded_mem_map =
+               (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+       BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+       BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
+       return coded_mem_map;
 }
 
 /*
index 38e1b63..10568b1 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -411,7 +411,7 @@ static void __lru_cache_add(struct page *page)
 }
 
 /**
- * lru_cache_add: add a page to the page lists
+ * lru_cache_add_anon - add a page to the page lists
  * @page: the page to add
  */
 void lru_cache_add_anon(struct page *page)
@@ -688,7 +688,14 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 
 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 
-void lru_add_drain_all_cpuslocked(void)
+/*
+ * Doesn't need any cpu hotplug locking because we do rely on per-cpu
+ * kworkers being shut down before our page_alloc_cpu_dead callback is
+ * executed on the offlined cpu.
+ * Calling this function with cpu hotplug locks held can actually lead
+ * to obscure indirect dependencies via WQ context.
+ */
+void lru_add_drain_all(void)
 {
        static DEFINE_MUTEX(lock);
        static struct cpumask has_work;
@@ -724,13 +731,6 @@ void lru_add_drain_all_cpuslocked(void)
        mutex_unlock(&lock);
 }
 
-void lru_add_drain_all(void)
-{
-       get_online_cpus();
-       lru_add_drain_all_cpuslocked();
-       put_online_cpus();
-}
-
 /**
  * release_pages - batched put_page()
  * @pages: array of pages to release
@@ -930,10 +930,10 @@ EXPORT_SYMBOL(__pagevec_lru_add);
  */
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
                                struct address_space *mapping,
-                               pgoff_t start, unsigned nr_pages,
+                               pgoff_t start, unsigned nr_entries,
                                pgoff_t *indices)
 {
-       pvec->nr = find_get_entries(mapping, start, nr_pages,
+       pvec->nr = find_get_entries(mapping, start, nr_entries,
                                    pvec->pages, indices);
        return pagevec_count(pvec);
 }
@@ -965,9 +965,8 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
  * @mapping:   The address_space to search
  * @start:     The starting page index
  * @end:       The final page index
- * @nr_pages:  The maximum number of pages
  *
- * pagevec_lookup_range() will search for and return a group of up to @nr_pages
+ * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
  * pages in the mapping starting from index @start and up to index @end
  * (inclusive).  The pages are placed in @pvec.  pagevec_lookup() takes a
  * reference against the pages in @pvec.
@@ -977,7 +976,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
  * also update @start to index the next page for the traversal.
  *
  * pagevec_lookup_range() returns the number of pages which were found. If this
- * number is smaller than @nr_pages, the end of specified range has been
+ * number is smaller than PAGEVEC_SIZE, the end of the specified range has been
  * reached.
  */
 unsigned pagevec_lookup_range(struct pagevec *pvec,
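
For context, the usual caller pattern around pagevec_lookup_range() (a generic sketch, not taken from this patch), which is what the PAGEVEC_SIZE wording above refers to:

/* Generic iteration sketch: walk a mapping in PAGEVEC_SIZE batches. */
static void walk_range(struct address_space *mapping, pgoff_t index, pgoff_t end)
{
        struct pagevec pvec;
        unsigned int i;

        pagevec_init(&pvec);
        while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        /* ... operate on page ... */
                }
                pagevec_release(&pvec);
                cond_resched();
        }
}
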
index e4b4cf0..c34e2fd 100644 (file)
@@ -179,12 +179,8 @@ static void
 truncate_cleanup_page(struct address_space *mapping, struct page *page)
 {
        if (page_mapped(page)) {
-               loff_t holelen;
-
-               holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
-               unmap_mapping_range(mapping,
-                                  (loff_t)page->index << PAGE_SHIFT,
-                                  holelen, 0);
+               pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
+               unmap_mapping_pages(mapping, page->index, nr, false);
        }
 
        if (page_has_private(page))
@@ -715,19 +711,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
                                        /*
                                         * Zap the rest of the file in one hit.
                                         */
-                                       unmap_mapping_range(mapping,
-                                          (loff_t)index << PAGE_SHIFT,
-                                          (loff_t)(1 + end - index)
-                                                        << PAGE_SHIFT,
-                                                        0);
+                                       unmap_mapping_pages(mapping, index,
+                                               (1 + end - index), false);
                                        did_range_unmap = 1;
                                } else {
                                        /*
                                         * Just zap this page
                                         */
-                                       unmap_mapping_range(mapping,
-                                          (loff_t)index << PAGE_SHIFT,
-                                          PAGE_SIZE, 0);
+                                       unmap_mapping_pages(mapping, index,
+                                                               1, false);
                                }
                        }
                        BUG_ON(page_mapped(page));
@@ -753,8 +745,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
         * get remapped later.
         */
        if (dax_mapping(mapping)) {
-               unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
-                                   (loff_t)(end - start + 1) << PAGE_SHIFT, 0);
+               unmap_mapping_pages(mapping, start, end - start + 1, false);
        }
 out:
        cleancache_invalidate_inode(mapping);
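
The conversion used throughout this file, shown side by side as a sketch (the wrapper names are hypothetical): unmap_mapping_pages() takes a page index and a page count instead of byte offsets, and the final bool replaces the old even_cows argument.

/* Sketch of the conversion: same region, byte-based vs page-based interface. */
static void zap_pages_old(struct address_space *mapping, pgoff_t index, pgoff_t nr)
{
        unmap_mapping_range(mapping, (loff_t)index << PAGE_SHIFT,
                            (loff_t)nr << PAGE_SHIFT, 0);
}

static void zap_pages_new(struct address_space *mapping, pgoff_t index, pgoff_t nr)
{
        unmap_mapping_pages(mapping, index, nr, false); /* false: leave CoWed copies alone */
}
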
index 47d5ced..fdd3fc6 100644 (file)
@@ -220,22 +220,6 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
        return nr;
 }
 
-unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
-{
-       unsigned long nr;
-
-       nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
-            node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
-            node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
-
-       if (get_nr_swap_pages() > 0)
-               nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
-                     node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
-                     node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
-
-       return nr;
-}
-
 /**
  * lruvec_lru_size -  Returns the number of pages on the given LRU list.
  * @lruvec: lru vector
@@ -310,9 +294,7 @@ EXPORT_SYMBOL(unregister_shrinker);
 #define SHRINK_BATCH 128
 
 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
-                                   struct shrinker *shrinker,
-                                   unsigned long nr_scanned,
-                                   unsigned long nr_eligible)
+                                   struct shrinker *shrinker, int priority)
 {
        unsigned long freed = 0;
        unsigned long long delta;
@@ -337,9 +319,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
        nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
 
        total_scan = nr;
-       delta = (4 * nr_scanned) / shrinker->seeks;
-       delta *= freeable;
-       do_div(delta, nr_eligible + 1);
+       delta = freeable >> priority;
+       delta *= 4;
+       do_div(delta, shrinker->seeks);
        total_scan += delta;
        if (total_scan < 0) {
                pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -373,8 +355,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                total_scan = freeable * 2;
 
        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-                                  nr_scanned, nr_eligible,
-                                  freeable, delta, total_scan);
+                                  freeable, delta, total_scan, priority);
 
        /*
         * Normally, we should not scan less than batch_size objects in one
@@ -434,8 +415,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  * @gfp_mask: allocation context
  * @nid: node whose slab caches to target
  * @memcg: memory cgroup whose slab caches to target
- * @nr_scanned: pressure numerator
- * @nr_eligible: pressure denominator
+ * @priority: the reclaim priority
  *
  * Call the shrink functions to age shrinkable caches.
  *
@@ -447,20 +427,14 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  * objects from the memory cgroup specified. Otherwise, only unaware
  * shrinkers are called.
  *
- * @nr_scanned and @nr_eligible form a ratio that indicate how much of
- * the available objects should be scanned.  Page reclaim for example
- * passes the number of pages scanned and the number of pages on the
- * LRU lists that it considered on @nid, plus a bias in @nr_scanned
- * when it encountered mapped pages.  The ratio is further biased by
- * the ->seeks setting of the shrink function, which indicates the
- * cost to recreate an object relative to that of an LRU page.
+ * @priority is sc->priority; the number of freeable objects is shifted right
+ * by @priority to get the scan target.
  *
  * Returns the number of reclaimed slab objects.
  */
 static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                                 struct mem_cgroup *memcg,
-                                unsigned long nr_scanned,
-                                unsigned long nr_eligible)
+                                int priority)
 {
        struct shrinker *shrinker;
        unsigned long freed = 0;
@@ -468,9 +442,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
        if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
                return 0;
 
-       if (nr_scanned == 0)
-               nr_scanned = SWAP_CLUSTER_MAX;
-
        if (!down_read_trylock(&shrinker_rwsem)) {
                /*
                 * If we would return 0, our callers would understand that we
@@ -501,7 +472,16 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
                        sc.nid = 0;
 
-               freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+               freed += do_shrink_slab(&sc, shrinker, priority);
+               /*
+                * Bail out if someone wants to register a new shrinker to
+                * prevent the registration from being stalled for long periods
+                * by parallel ongoing shrinking.
+                */
+               if (rwsem_is_contended(&shrinker_rwsem)) {
+                       freed = freed ? : 1;
+                       break;
+               }
        }
 
        up_read(&shrinker_rwsem);
@@ -519,8 +499,7 @@ void drop_slab_node(int nid)
 
                freed = 0;
                do {
-                       freed += shrink_slab(GFP_KERNEL, nid, memcg,
-                                            1000, 1000);
+                       freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
                } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
        } while (freed > 10);
 }
@@ -1436,14 +1415,24 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 
                if (PageDirty(page)) {
                        struct address_space *mapping;
+                       bool migrate_dirty;
 
                        /*
                         * Only pages without mappings or that have a
                         * ->migratepage callback are possible to migrate
-                        * without blocking
+                        * without blocking. However, we can be racing with
+                        * truncation so it's necessary to lock the page
+                        * to stabilise the mapping as truncation holds
+                        * the page lock until after the page is removed
+                        * from the page cache.
                         */
+                       if (!trylock_page(page))
+                               return ret;
+
                        mapping = page_mapping(page);
-                       if (mapping && !mapping->a_ops->migratepage)
+                       migrate_dirty = mapping && mapping->a_ops->migratepage;
+                       unlock_page(page);
+                       if (!migrate_dirty)
                                return ret;
                }
        }
@@ -2615,14 +2604,12 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 
                        reclaimed = sc->nr_reclaimed;
                        scanned = sc->nr_scanned;
-
                        shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
                        node_lru_pages += lru_pages;
 
                        if (memcg)
                                shrink_slab(sc->gfp_mask, pgdat->node_id,
-                                           memcg, sc->nr_scanned - scanned,
-                                           lru_pages);
+                                           memcg, sc->priority);
 
                        /* Record the group's reclaim efficiency */
                        vmpressure(sc->gfp_mask, memcg, false,
@@ -2646,14 +2633,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                        }
                } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 
-               /*
-                * Shrink the slab caches in the same proportion that
-                * the eligible LRU pages were scanned.
-                */
                if (global_reclaim(sc))
                        shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
-                                   sc->nr_scanned - nr_scanned,
-                                   node_lru_pages);
+                                   sc->priority);
 
                if (reclaim_state) {
                        sc->nr_reclaimed += reclaim_state->reclaimed_slab;
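
A worked example of the new scan-target formula (numbers are illustrative only): with freeable = 10000 objects, priority = DEF_PRIORITY (12) and shrinker->seeks = DEFAULT_SEEKS (2), delta = ((10000 >> 12) * 4) / 2 = 4, so an idle system barely touches the cache, while priority = 1 scans essentially all of it. A standalone sketch:

#include <stdio.h>

/* Illustrative copy of the delta calculation in do_shrink_slab(). */
static unsigned long long scan_delta(unsigned long long freeable,
                                     int priority, int seeks)
{
        unsigned long long delta = freeable >> priority;

        delta *= 4;
        return delta / seeks;           /* do_div() in the kernel */
}

int main(void)
{
        printf("%llu\n", scan_delta(10000, 12, 2)); /* light pressure: 4 */
        printf("%llu\n", scan_delta(10000, 1, 2));  /* heavy pressure: 10000 */
        return 0;
}
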
index fd3ff71..e1e7aa6 100644 (file)
@@ -21,6 +21,7 @@ struct zpool {
        struct zpool_driver *driver;
        void *pool;
        const struct zpool_ops *ops;
+       bool evictable;
 
        struct list_head list;
 };
@@ -142,7 +143,7 @@ EXPORT_SYMBOL(zpool_has_pool);
  *
  * This creates a new zpool of the specified type.  The gfp flags will be
  * used when allocating memory, if the implementation supports it.  If the
- * ops param is NULL, then the created zpool will not be shrinkable.
+ * ops param is NULL, then the created zpool will not be evictable.
  *
  * Implementations must guarantee this to be thread-safe.
  *
@@ -180,6 +181,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
        zpool->driver = driver;
        zpool->pool = driver->create(name, gfp, ops, zpool);
        zpool->ops = ops;
+       zpool->evictable = driver->shrink && ops && ops->evict;
 
        if (!zpool->pool) {
                pr_err("couldn't create %s pool\n", type);
@@ -296,7 +298,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
 int zpool_shrink(struct zpool *zpool, unsigned int pages,
                        unsigned int *reclaimed)
 {
-       return zpool->driver->shrink(zpool->pool, pages, reclaimed);
+       return zpool->driver->shrink ?
+              zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
 }
 
 /**
@@ -355,6 +358,24 @@ u64 zpool_get_total_size(struct zpool *zpool)
        return zpool->driver->total_size(zpool->pool);
 }
 
+/**
+ * zpool_evictable() - Test if zpool is potentially evictable
+ * @zpool:     The zpool to test
+ *
+ * Zpool is only potentially evictable when it's created with struct
+ * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
+ *
+ * However, it doesn't necessarily mean the driver will use zpool_ops.evict
+ * in its implementation of zpool_driver.shrink. It could do internal
+ * defragmentation instead.
+ *
+ * Returns: true if potentially evictable; false otherwise.
+ */
+bool zpool_evictable(struct zpool *zpool)
+{
+       return zpool->evictable;
+}
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
 MODULE_DESCRIPTION("Common API for compressed memory storage");
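
A consumer-side sketch (hypothetical helper, not from this patch) of how the two APIs above compose: callers that want writeback check zpool_evictable() first, and zpool_shrink() now fails cleanly for backends that leave ->shrink unset instead of each backend carrying an -EINVAL stub like the zs_zpool_shrink() removed below.

/* Hypothetical consumer helper: writeback only makes sense on an evictable pool. */
static int try_writeback(struct zpool *zpool, unsigned int pages)
{
        unsigned int reclaimed = 0;

        if (!zpool_evictable(zpool))
                return -EINVAL;         /* backend has no ->shrink */

        return zpool_shrink(zpool, pages, &reclaimed);
}
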
index 683c065..c301350 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/vmalloc.h>
 #include <linux/preempt.h>
 #include <linux/spinlock.h>
+#include <linux/shrinker.h>
 #include <linux/types.h>
 #include <linux/debugfs.h>
 #include <linux/zsmalloc.h>
@@ -257,11 +258,7 @@ struct zs_pool {
 
        /* Compact classes */
        struct shrinker shrinker;
-       /*
-        * To signify that register_shrinker() was successful
-        * and unregister_shrinker() will not Oops.
-        */
-       bool shrinker_enabled;
+
 #ifdef CONFIG_ZSMALLOC_STAT
        struct dentry *stat_dentry;
 #endif
@@ -407,12 +404,6 @@ static void zs_zpool_free(void *pool, unsigned long handle)
        zs_free(pool, handle);
 }
 
-static int zs_zpool_shrink(void *pool, unsigned int pages,
-                       unsigned int *reclaimed)
-{
-       return -EINVAL;
-}
-
 static void *zs_zpool_map(void *pool, unsigned long handle,
                        enum zpool_mapmode mm)
 {
@@ -450,7 +441,6 @@ static struct zpool_driver zs_zpool_driver = {
        .destroy =      zs_zpool_destroy,
        .malloc =       zs_zpool_malloc,
        .free =         zs_zpool_free,
-       .shrink =       zs_zpool_shrink,
        .map =          zs_zpool_map,
        .unmap =        zs_zpool_unmap,
        .total_size =   zs_zpool_total_size,
@@ -1057,7 +1047,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
                         * Reset OBJ_TAG_BITS bit to last link to tell
                         * whether it's allocated object or not.
                         */
-                       link->next = -1 << OBJ_TAG_BITS;
+                       link->next = -1UL << OBJ_TAG_BITS;
                }
                kunmap_atomic(vaddr);
                page = next_page;
@@ -2324,10 +2314,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
 
 static void zs_unregister_shrinker(struct zs_pool *pool)
 {
-       if (pool->shrinker_enabled) {
-               unregister_shrinker(&pool->shrinker);
-               pool->shrinker_enabled = false;
-       }
+       unregister_shrinker(&pool->shrinker);
 }
 
 static int zs_register_shrinker(struct zs_pool *pool)
@@ -2426,11 +2413,13 @@ struct zs_pool *zs_create_pool(const char *name)
                goto err;
 
        /*
-        * Not critical, we still can use the pool
-        * and user can trigger compaction manually.
+        * Not critical since the shrinker is only used to trigger internal
+        * defragmentation of the pool, which is entirely optional.  If
+        * registration fails we can still use the pool normally and the user
+        * can trigger compaction manually. Thus, ignore the return code.
         */
-       if (zs_register_shrinker(pool) == 0)
-               pool->shrinker_enabled = true;
+       zs_register_shrinker(pool);
+
        return pool;
 
 err:
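
On the -1UL change in init_zspage() above: left-shifting the signed literal -1 is undefined behaviour in C, while -1UL is a well-defined all-ones value, so the shift cleanly clears only the low tag bits. A tiny standalone illustration (the OBJ_TAG_BITS value here is chosen for the example only):

#include <stdio.h>

#define OBJ_TAG_BITS 1  /* illustrative value; the real one lives in zsmalloc.c */

int main(void)
{
        unsigned long link_next = -1UL << OBJ_TAG_BITS;

        printf("%#lx\n", link_next);    /* 0xfffffffffffffffe on 64-bit */
        return 0;
}
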
index d39581a..c004aa4 100644 (file)
@@ -49,6 +49,8 @@
 static u64 zswap_pool_total_size;
 /* The number of compressed pages currently stored in zswap */
 static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+/* The number of same-value filled pages currently stored in zswap */
+static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
 
 /*
  * The statistics below are not protected from concurrent access for
@@ -116,6 +118,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
 static unsigned int zswap_max_pool_percent = 20;
 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
 
+/* Enable/disable handling same-value filled pages (enabled by default) */
+static bool zswap_same_filled_pages_enabled = true;
+module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
+                  bool, 0644);
+
 /*********************************
 * data structures
 **********************************/
@@ -145,9 +152,10 @@ struct zswap_pool {
  *            be held while changing the refcount.  Since the lock must
  *            be held, there is no reason to also make refcount atomic.
  * length - the length in bytes of the compressed page data.  Needed during
- *          decompression
+ *          decompression. For a same-value filled page, length is 0.
  * pool - the zswap_pool the entry's data is in
  * handle - zpool allocation handle that stores the compressed page data
+ * value - the value that a same-value filled page is filled with
  */
 struct zswap_entry {
        struct rb_node rbnode;
@@ -155,7 +163,10 @@ struct zswap_entry {
        int refcount;
        unsigned int length;
        struct zswap_pool *pool;
-       unsigned long handle;
+       union {
+               unsigned long handle;
+               unsigned long value;
+       };
 };
 
 struct zswap_header {
@@ -320,8 +331,12 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
  */
 static void zswap_free_entry(struct zswap_entry *entry)
 {
-       zpool_free(entry->pool->zpool, entry->handle);
-       zswap_pool_put(entry->pool);
+       if (!entry->length)
+               atomic_dec(&zswap_same_filled_pages);
+       else {
+               zpool_free(entry->pool->zpool, entry->handle);
+               zswap_pool_put(entry->pool);
+       }
        zswap_entry_cache_free(entry);
        atomic_dec(&zswap_stored_pages);
        zswap_update_total_size();
@@ -953,6 +968,28 @@ static int zswap_shrink(void)
        return ret;
 }
 
+static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
+{
+       unsigned int pos;
+       unsigned long *page;
+
+       page = (unsigned long *)ptr;
+       for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
+               if (page[pos] != page[0])
+                       return 0;
+       }
+       *value = page[0];
+       return 1;
+}
+
+static void zswap_fill_page(void *ptr, unsigned long value)
+{
+       unsigned long *page;
+
+       page = (unsigned long *)ptr;
+       memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
+}
+
 /*********************************
 * frontswap hooks
 **********************************/
@@ -964,11 +1001,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
        struct zswap_entry *entry, *dupentry;
        struct crypto_comp *tfm;
        int ret;
-       unsigned int dlen = PAGE_SIZE, len;
-       unsigned long handle;
+       unsigned int hlen, dlen = PAGE_SIZE;
+       unsigned long handle, value;
        char *buf;
        u8 *src, *dst;
-       struct zswap_header *zhdr;
+       struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
 
        if (!zswap_enabled || !tree) {
                ret = -ENODEV;
@@ -993,6 +1030,19 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
                goto reject;
        }
 
+       if (zswap_same_filled_pages_enabled) {
+               src = kmap_atomic(page);
+               if (zswap_is_page_same_filled(src, &value)) {
+                       kunmap_atomic(src);
+                       entry->offset = offset;
+                       entry->length = 0;
+                       entry->value = value;
+                       atomic_inc(&zswap_same_filled_pages);
+                       goto insert_entry;
+               }
+               kunmap_atomic(src);
+       }
+
        /* if entry is successfully added, it keeps the reference */
        entry->pool = zswap_pool_current_get();
        if (!entry->pool) {
@@ -1013,8 +1063,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
        }
 
        /* store */
-       len = dlen + sizeof(struct zswap_header);
-       ret = zpool_malloc(entry->pool->zpool, len,
+       hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
+       ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
                           __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
                           &handle);
        if (ret == -ENOSPC) {
@@ -1025,10 +1075,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
                zswap_reject_alloc_fail++;
                goto put_dstmem;
        }
-       zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
-       zhdr->swpentry = swp_entry(type, offset);
-       buf = (u8 *)(zhdr + 1);
-       memcpy(buf, dst, dlen);
+       buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
+       memcpy(buf, &zhdr, hlen);
+       memcpy(buf + hlen, dst, dlen);
        zpool_unmap_handle(entry->pool->zpool, handle);
        put_cpu_var(zswap_dstmem);
 
@@ -1037,6 +1086,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
        entry->handle = handle;
        entry->length = dlen;
 
+insert_entry:
        /* map */
        spin_lock(&tree->lock);
        do {
@@ -1089,10 +1139,18 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
        }
        spin_unlock(&tree->lock);
 
+       if (!entry->length) {
+               dst = kmap_atomic(page);
+               zswap_fill_page(dst, entry->value);
+               kunmap_atomic(dst);
+               goto freeentry;
+       }
+
        /* decompress */
        dlen = PAGE_SIZE;
-       src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
-                       ZPOOL_MM_RO) + sizeof(struct zswap_header);
+       src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
+       if (zpool_evictable(entry->pool->zpool))
+               src += sizeof(struct zswap_header);
        dst = kmap_atomic(page);
        tfm = *get_cpu_ptr(entry->pool->tfm);
        ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
@@ -1101,6 +1159,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
        zpool_unmap_handle(entry->pool->zpool, entry->handle);
        BUG_ON(ret);
 
+freeentry:
        spin_lock(&tree->lock);
        zswap_entry_put(tree, entry);
        spin_unlock(&tree->lock);
@@ -1209,6 +1268,8 @@ static int __init zswap_debugfs_init(void)
                        zswap_debugfs_root, &zswap_pool_total_size);
        debugfs_create_atomic_t("stored_pages", S_IRUGO,
                        zswap_debugfs_root, &zswap_stored_pages);
+       debugfs_create_atomic_t("same_filled_pages", 0444,
+                       zswap_debugfs_root, &zswap_same_filled_pages);
 
        return 0;
 }
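
A standalone userspace sketch of the same-value detection added above: the page is scanned as an array of longs and compared word-by-word against its first word, so zero pages and other constant-filled pages are stored as a single value with entry->length == 0.

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Same check as zswap_is_page_same_filled(), outside the kernel. */
static int page_same_filled(const void *ptr, unsigned long *value)
{
        const unsigned long *page = ptr;
        size_t pos;

        for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++)
                if (page[pos] != page[0])
                        return 0;
        *value = page[0];
        return 1;
}

int main(void)
{
        static unsigned long buf[PAGE_SIZE / sizeof(unsigned long)];
        unsigned long value;

        memset(buf, 0xaa, sizeof(buf));
        printf("same-filled: %d\n", page_same_filled(buf, &value)); /* 1 */
        buf[17] = 0;
        printf("same-filled: %d\n", page_same_filled(buf, &value)); /* 0 */
        return 0;
}
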
index 5ea0710..9cef558 100755 (executable)
@@ -21,12 +21,24 @@ trap cleanup EXIT
 
 T=`mktemp` || die "cannot create temp file"
 code=
+cont=
 
 while read i ; do
 
 case "$i" in
 *Code:*)
        code=$i
+       cont=yes
+       ;;
+*)
+       [ -n "$cont" ] && {
+               xdump="$(echo $i | grep '^[[:xdigit:]<>[:space:]]\+$')"
+               if [ -n "$xdump" ]; then
+                       code="$code $xdump"
+               else
+                       cont=
+               fi
+       }
        ;;
 esac
 
index d23dcbf..78e546f 100755 (executable)
@@ -77,7 +77,7 @@ find_include_sources()
 find_other_sources()
 {
        find ${tree}* $ignore \
-            \( -name include -o -name arch -o -name '.tmp_*' \) -prune -o \
+            \( -path ${tree}include -o -path ${tree}arch -o -name '.tmp_*' \) -prune -o \
               -name "$1" -not -type l -print;
 }
 
index 3926a04..a5276a9 100644 (file)
@@ -12,3 +12,8 @@ fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags)
 include ../lib.mk
 
 $(OUTPUT)/fuse_mnt: LDLIBS += $(shell pkg-config fuse --libs)
+
+$(OUTPUT)/memfd_test: memfd_test.c common.o
+$(OUTPUT)/fuse_test: fuse_test.c common.o
+
+EXTRA_CLEAN = common.o
diff --git a/tools/testing/selftests/memfd/common.c b/tools/testing/selftests/memfd/common.c
new file mode 100644 (file)
index 0000000..8eb3d75
--- /dev/null
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/fcntl.h>
+#include <linux/memfd.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "common.h"
+
+int hugetlbfs_test = 0;
+
+/*
+ * Copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+       unsigned long hps = 0;
+       char *line = NULL;
+       size_t linelen = 0;
+       FILE *f = fopen("/proc/meminfo", "r");
+
+       if (!f)
+               return 0;
+       while (getline(&line, &linelen, f) > 0) {
+               if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
+                       hps <<= 10;
+                       break;
+               }
+       }
+
+       free(line);
+       fclose(f);
+       return hps;
+}
+
+int sys_memfd_create(const char *name, unsigned int flags)
+{
+       if (hugetlbfs_test)
+               flags |= MFD_HUGETLB;
+
+       return syscall(__NR_memfd_create, name, flags);
+}
diff --git a/tools/testing/selftests/memfd/common.h b/tools/testing/selftests/memfd/common.h
new file mode 100644 (file)
index 0000000..522d2c6
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef COMMON_H_
+#define COMMON_H_
+
+extern int hugetlbfs_test;
+
+unsigned long default_huge_page_size(void);
+int sys_memfd_create(const char *name, unsigned int flags);
+
+#endif
index 1ccb7a3..b018e83 100644 (file)
 #include <sys/wait.h>
 #include <unistd.h>
 
+#include "common.h"
+
 #define MFD_DEF_SIZE 8192
 #define STACK_SIZE 65536
 
-static int sys_memfd_create(const char *name,
-                           unsigned int flags)
-{
-       return syscall(__NR_memfd_create, name, flags);
-}
+static size_t mfd_def_size = MFD_DEF_SIZE;
 
 static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
 {
@@ -127,7 +125,7 @@ static void *mfd_assert_mmap_shared(int fd)
        void *p;
 
        p = mmap(NULL,
-                MFD_DEF_SIZE,
+                mfd_def_size,
                 PROT_READ | PROT_WRITE,
                 MAP_SHARED,
                 fd,
@@ -145,7 +143,7 @@ static void *mfd_assert_mmap_private(int fd)
        void *p;
 
        p = mmap(NULL,
-                MFD_DEF_SIZE,
+                mfd_def_size,
                 PROT_READ | PROT_WRITE,
                 MAP_PRIVATE,
                 fd,
@@ -178,7 +176,7 @@ static int sealing_thread_fn(void *arg)
        usleep(200000);
 
        /* unmount mapping before sealing to avoid i_mmap_writable failures */
-       munmap(global_p, MFD_DEF_SIZE);
+       munmap(global_p, mfd_def_size);
 
        /* Try sealing the global file; expect EBUSY or success. Current
         * kernels will never succeed, but in the future, kernels might
@@ -228,7 +226,7 @@ static void join_sealing_thread(pid_t pid)
 
 int main(int argc, char **argv)
 {
-       static const char zero[MFD_DEF_SIZE];
+       char *zero;
        int fd, mfd, r;
        void *p;
        int was_sealed;
@@ -239,6 +237,25 @@ int main(int argc, char **argv)
                abort();
        }
 
+       if (argc >= 3) {
+               if (!strcmp(argv[2], "hugetlbfs")) {
+                       unsigned long hpage_size = default_huge_page_size();
+
+                       if (!hpage_size) {
+                               printf("Unable to determine huge page size\n");
+                               abort();
+                       }
+
+                       hugetlbfs_test = 1;
+                       mfd_def_size = hpage_size * 2;
+               } else {
+                       printf("Unknown option: %s\n", argv[2]);
+                       abort();
+               }
+       }
+
+       zero = calloc(sizeof(*zero), mfd_def_size);
+
        /* open FUSE memfd file for GUP testing */
        printf("opening: %s\n", argv[1]);
        fd = open(argv[1], O_RDONLY | O_CLOEXEC);
@@ -249,7 +266,7 @@ int main(int argc, char **argv)
 
        /* create new memfd-object */
        mfd = mfd_assert_new("kern_memfd_fuse",
-                            MFD_DEF_SIZE,
+                            mfd_def_size,
                             MFD_CLOEXEC | MFD_ALLOW_SEALING);
 
        /* mmap memfd-object for writing */
@@ -268,7 +285,7 @@ int main(int argc, char **argv)
         * This guarantees that the receive-buffer is pinned for 1s until the
         * data is written into it. The racing ADD_SEALS should thus fail as
         * the pages are still pinned. */
-       r = read(fd, p, MFD_DEF_SIZE);
+       r = read(fd, p, mfd_def_size);
        if (r < 0) {
                printf("read() failed: %m\n");
                abort();
@@ -295,10 +312,10 @@ int main(int argc, char **argv)
         * enough to avoid any in-flight writes. */
 
        p = mfd_assert_mmap_private(mfd);
-       if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) {
+       if (was_sealed && memcmp(p, zero, mfd_def_size)) {
                printf("memfd sealed during read() but data not discarded\n");
                abort();
-       } else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) {
+       } else if (!was_sealed && !memcmp(p, zero, mfd_def_size)) {
                printf("memfd sealed after read() but data discarded\n");
                abort();
        }
@@ -307,6 +324,7 @@ int main(int argc, char **argv)
        close(fd);
 
        printf("fuse: DONE\n");
+       free(zero);
 
        return 0;
 }
index 132a54f..10baa16 100644 (file)
 #include <sys/wait.h>
 #include <unistd.h>
 
+#include "common.h"
+
 #define MEMFD_STR      "memfd:"
+#define MEMFD_HUGE_STR "memfd-hugetlb:"
 #define SHARED_FT_STR  "(shared file-table)"
 
 #define MFD_DEF_SIZE 8192
 /*
  * Default is not to test hugetlbfs
  */
-static int hugetlbfs_test;
 static size_t mfd_def_size = MFD_DEF_SIZE;
-
-/*
- * Copied from mlock2-tests.c
- */
-static unsigned long default_huge_page_size(void)
-{
-       unsigned long hps = 0;
-       char *line = NULL;
-       size_t linelen = 0;
-       FILE *f = fopen("/proc/meminfo", "r");
-
-       if (!f)
-               return 0;
-       while (getline(&line, &linelen, f) > 0) {
-               if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
-                       hps <<= 10;
-                       break;
-               }
-       }
-
-       free(line);
-       fclose(f);
-       return hps;
-}
-
-static int sys_memfd_create(const char *name,
-                           unsigned int flags)
-{
-       if (hugetlbfs_test)
-               flags |= MFD_HUGETLB;
-
-       return syscall(__NR_memfd_create, name, flags);
-}
+static const char *memfd_str = MEMFD_STR;
 
 static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
 {
@@ -513,6 +483,10 @@ static void mfd_assert_grow_write(int fd)
        static char *buf;
        ssize_t l;
 
+       /* hugetlbfs does not support write */
+       if (hugetlbfs_test)
+               return;
+
        buf = malloc(mfd_def_size * 8);
        if (!buf) {
                printf("malloc(%zu) failed: %m\n", mfd_def_size * 8);
@@ -533,6 +507,10 @@ static void mfd_fail_grow_write(int fd)
        static char *buf;
        ssize_t l;
 
+       /* hugetlbfs does not support write */
+       if (hugetlbfs_test)
+               return;
+
        buf = malloc(mfd_def_size * 8);
        if (!buf) {
                printf("malloc(%zu) failed: %m\n", mfd_def_size * 8);
@@ -598,7 +576,7 @@ static void test_create(void)
        char buf[2048];
        int fd;
 
-       printf("%s CREATE\n", MEMFD_STR);
+       printf("%s CREATE\n", memfd_str);
 
        /* test NULL name */
        mfd_fail_new(NULL, 0);
@@ -627,18 +605,13 @@ static void test_create(void)
        fd = mfd_assert_new("", 0, MFD_CLOEXEC);
        close(fd);
 
-       if (!hugetlbfs_test) {
-               /* verify MFD_ALLOW_SEALING is allowed */
-               fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING);
-               close(fd);
-
-               /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */
-               fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC);
-               close(fd);
-       } else {
-               /* sealing is not supported on hugetlbfs */
-               mfd_fail_new("", MFD_ALLOW_SEALING);
-       }
+       /* verify MFD_ALLOW_SEALING is allowed */
+       fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING);
+       close(fd);
+
+       /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */
+       fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC);
+       close(fd);
 }
 
 /*
@@ -649,11 +622,7 @@ static void test_basic(void)
 {
        int fd;
 
-       /* hugetlbfs does not contain sealing support */
-       if (hugetlbfs_test)
-               return;
-
-       printf("%s BASIC\n", MEMFD_STR);
+       printf("%s BASIC\n", memfd_str);
 
        fd = mfd_assert_new("kern_memfd_basic",
                            mfd_def_size,
@@ -697,28 +666,6 @@ static void test_basic(void)
        close(fd);
 }
 
-/*
- * hugetlbfs doesn't support seals or write, so just verify grow and shrink
- * on a hugetlbfs file created via memfd_create.
- */
-static void test_hugetlbfs_grow_shrink(void)
-{
-       int fd;
-
-       printf("%s HUGETLBFS-GROW-SHRINK\n", MEMFD_STR);
-
-       fd = mfd_assert_new("kern_memfd_seal_write",
-                           mfd_def_size,
-                           MFD_CLOEXEC);
-
-       mfd_assert_read(fd);
-       mfd_assert_write(fd);
-       mfd_assert_shrink(fd);
-       mfd_assert_grow(fd);
-
-       close(fd);
-}
-
 /*
  * Test SEAL_WRITE
  * Test whether SEAL_WRITE actually prevents modifications.
@@ -727,14 +674,7 @@ static void test_seal_write(void)
 {
        int fd;
 
-       /*
-        * hugetlbfs does not contain sealing or write support.  Just test
-        * basic grow and shrink via test_hugetlbfs_grow_shrink.
-        */
-       if (hugetlbfs_test)
-               return test_hugetlbfs_grow_shrink();
-
-       printf("%s SEAL-WRITE\n", MEMFD_STR);
+       printf("%s SEAL-WRITE\n", memfd_str);
 
        fd = mfd_assert_new("kern_memfd_seal_write",
                            mfd_def_size,
@@ -760,11 +700,7 @@ static void test_seal_shrink(void)
 {
        int fd;
 
-       /* hugetlbfs does not contain sealing support */
-       if (hugetlbfs_test)
-               return;
-
-       printf("%s SEAL-SHRINK\n", MEMFD_STR);
+       printf("%s SEAL-SHRINK\n", memfd_str);
 
        fd = mfd_assert_new("kern_memfd_seal_shrink",
                            mfd_def_size,
@@ -790,11 +726,7 @@ static void test_seal_grow(void)
 {
        int fd;
 
-       /* hugetlbfs does not contain sealing support */
-       if (hugetlbfs_test)
-               return;
-
-       printf("%s SEAL-GROW\n", MEMFD_STR);
+       printf("%s SEAL-GROW\n", memfd_str);
 
        fd = mfd_assert_new("kern_memfd_seal_grow",
                            mfd_def_size,
@@ -820,11 +752,7 @@ static void test_seal_resize(void)
 {
        int fd;
 
-       /* hugetlbfs does not contain sealing support */
-       if (hugetlbfs_test)
-               return;
-
-       printf("%s SEAL-RESIZE\n", MEMFD_STR);
+       printf("%s SEAL-RESIZE\n", memfd_str);
 
        fd = mfd_assert_new("kern_memfd_seal_resize",
                            mfd_def_size,
@@ -842,32 +770,6 @@ static void test_seal_resize(void)
        close(fd);
 }
 
-/*
- * hugetlbfs does not support seals.  Basic test to dup the memfd created
- * fd and perform some basic operations on it.
- */
-static void hugetlbfs_dup(char *b_suffix)
-{
-       int fd, fd2;
-
-       printf("%s HUGETLBFS-DUP %s\n", MEMFD_STR, b_suffix);
-
-       fd = mfd_assert_new("kern_memfd_share_dup",
-                           mfd_def_size,
-                           MFD_CLOEXEC);
-
-       fd2 = mfd_assert_dup(fd);
-
-       mfd_assert_read(fd);
-       mfd_assert_write(fd);
-
-       mfd_assert_shrink(fd2);
-       mfd_assert_grow(fd2);
-
-       close(fd2);
-       close(fd);
-}
-
 /*
  * Test sharing via dup()
  * Test that seals are shared between dupped FDs and they're all equal.
@@ -876,16 +778,7 @@ static void test_share_dup(char *banner, char *b_suffix)
 {
        int fd, fd2;
 
-       /*
-        * hugetlbfs does not contain sealing support.  Perform some
-        * basic testing on dup'ed fd instead via hugetlbfs_dup.
-        */
-       if (hugetlbfs_test) {
-               hugetlbfs_dup(b_suffix);
-               return;
-       }
-
-       printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
+       printf("%s %s %s\n", memfd_str, banner, b_suffix);
 
        fd = mfd_assert_new("kern_memfd_share_dup",
                            mfd_def_size,
@@ -927,11 +820,7 @@ static void test_share_mmap(char *banner, char *b_suffix)
        int fd;
        void *p;
 
-       /* hugetlbfs does not contain sealing support */
-       if (hugetlbfs_test)
-               return;
-
-       printf("%s %s %s\n", MEMFD_STR,  banner, b_suffix);
+       printf("%s %s %s\n", memfd_str,  banner, b_suffix);
 
        fd = mfd_assert_new("kern_memfd_share_mmap",
                            mfd_def_size,
@@ -955,32 +844,6 @@ static void test_share_mmap(char *banner, char *b_suffix)
        close(fd);
 }
 
-/*
- * Basic test to make sure we can open the hugetlbfs fd via /proc and
- * perform some simple operations on it.
- */
-static void hugetlbfs_proc_open(char *b_suffix)
-{
-       int fd, fd2;
-
-       printf("%s HUGETLBFS-PROC-OPEN %s\n", MEMFD_STR, b_suffix);
-
-       fd = mfd_assert_new("kern_memfd_share_open",
-                           mfd_def_size,
-                           MFD_CLOEXEC);
-
-       fd2 = mfd_assert_open(fd, O_RDWR, 0);
-
-       mfd_assert_read(fd);
-       mfd_assert_write(fd);
-
-       mfd_assert_shrink(fd2);
-       mfd_assert_grow(fd2);
-
-       close(fd2);
-       close(fd);
-}
-
 /*
  * Test sealing with open(/proc/self/fd/%d)
  * Via /proc we can get access to a separate file-context for the same memfd.
@@ -991,16 +854,7 @@ static void test_share_open(char *banner, char *b_suffix)
 {
        int fd, fd2;
 
-       /*
-        * hugetlbfs does not contain sealing support.  So test basic
-        * functionality of using /proc fd via hugetlbfs_proc_open
-        */
-       if (hugetlbfs_test) {
-               hugetlbfs_proc_open(b_suffix);
-               return;
-       }
-
-       printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
+       printf("%s %s %s\n", memfd_str, banner, b_suffix);
 
        fd = mfd_assert_new("kern_memfd_share_open",
                            mfd_def_size,
@@ -1043,11 +897,7 @@ static void test_share_fork(char *banner, char *b_suffix)
        int fd;
        pid_t pid;
 
-       /* hugetlbfs does not contain sealing support */
-       if (hugetlbfs_test)
-               return;
-
-       printf("%s %s %s\n", MEMFD_STR, banner, b_suffix);
+       printf("%s %s %s\n", memfd_str, banner, b_suffix);
 
        fd = mfd_assert_new("kern_memfd_share_fork",
                            mfd_def_size,
@@ -1083,7 +933,11 @@ int main(int argc, char **argv)
                        }
 
                        hugetlbfs_test = 1;
+                       memfd_str = MEMFD_HUGE_STR;
                        mfd_def_size = hpage_size * 2;
+               } else {
+                       printf("Unknown option: %s\n", argv[1]);
+                       abort();
                }
        }
 
index 407df68..22e572e 100755 (executable)
@@ -10,6 +10,6 @@ set -e
 
 mkdir mnt
 ./fuse_mnt ./mnt
-./fuse_test ./mnt/memfd
+./fuse_test ./mnt/memfd $@
 fusermount -u ./mnt
 rmdir ./mnt
index daabb35..c2d41ed 100755 (executable)
@@ -60,6 +60,7 @@ fi
 # Run the hugetlbfs test
 #
 ./memfd_test hugetlbfs
+./run_fuse_test.sh hugetlbfs
 
 #
 # Give back any huge pages allocated for the test
index 7f45806..fdefa22 100644 (file)
@@ -8,17 +8,18 @@ endif
 CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
 LDLIBS = -lrt
 TEST_GEN_FILES = compaction_test
+TEST_GEN_FILES += gup_benchmark
 TEST_GEN_FILES += hugepage-mmap
 TEST_GEN_FILES += hugepage-shm
 TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += mlock-random-test
 TEST_GEN_FILES += mlock2-tests
 TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
-TEST_GEN_FILES += mlock-random-test
+TEST_GEN_FILES += va_128TBswitch
 TEST_GEN_FILES += virtual_address_range
-TEST_GEN_FILES += gup_benchmark
 
 TEST_PROGS := run_vmtests
 
index cc82632..d256189 100755 (executable)
@@ -177,4 +177,15 @@ else
        echo "[PASS]"
 fi
 
+echo "-----------------------------"
+echo "running virtual address 128TB switch test"
+echo "-----------------------------"
+./va_128TBswitch
+if [ $? -ne 0 ]; then
+    echo "[FAIL]"
+    exitcode=1
+else
+    echo "[PASS]"
+fi
+
 exit $exitcode
diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c
new file mode 100644 (file)
index 0000000..e7fe734
--- /dev/null
@@ -0,0 +1,297 @@
+/*
+ *
+ * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+ * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+#ifdef __powerpc64__
+#define PAGE_SIZE      (64 << 10)
+/*
+ * This will work with 16M and 2M hugepage size
+ */
+#define HUGETLB_SIZE   (16 << 20)
+#else
+#define PAGE_SIZE      (4 << 10)
+#define HUGETLB_SIZE   (2 << 20)
+#endif
+
+/*
+ * >= 128TB is the hint addr value we used to select
+ * large address space.
+ */
+#define ADDR_SWITCH_HINT (1UL << 47)
+#define LOW_ADDR       ((void *) (1UL << 30))
+#define HIGH_ADDR      ((void *) (1UL << 48))
+
+struct testcase {
+       void *addr;
+       unsigned long size;
+       unsigned long flags;
+       const char *msg;
+       unsigned int low_addr_required:1;
+       unsigned int keep_mapped:1;
+};
+
+static struct testcase testcases[] = {
+       {
+               /*
+                * If the stack is moved, we could possibly allocate
+                * this at the requested address.
+                */
+               .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+               .size = PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+               .low_addr_required = 1,
+       },
+       {
+               /*
+                * We should never allocate at the requested address or above it.
+                * The len crosses the 128TB boundary. Without MAP_FIXED
+                * we will always search in the lower address space.
+                */
+               .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))",
+               .low_addr_required = 1,
+       },
+       {
+                * Exact mapping at 128TB, the area is free so we should get it
+                * even without MAP_FIXED.
+                * even without MAP_FIXED.
+                */
+               .addr = ((void *)(ADDR_SWITCH_HINT)),
+               .size = PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+       },
+       {
+               .addr = NULL,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(NULL)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = LOW_ADDR,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(LOW_ADDR)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(HIGH_ADDR)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(HIGH_ADDR) again",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(HIGH_ADDR, MAP_FIXED)",
+       },
+       {
+               .addr = (void *) -1,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(-1)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *) -1,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(-1) again",
+       },
+       {
+               .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+               .size = PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)",
+               .low_addr_required = 1,
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2, 2 * PAGE_SIZE)",
+               .low_addr_required = 1,
+               .keep_mapped = 1,
+       },
+       {
+               .addr = ((void *)(ADDR_SWITCH_HINT)),
+               .size = PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+       },
+};
+
+static struct testcase hugetlb_testcases[] = {
+       {
+               .addr = NULL,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(NULL, MAP_HUGETLB)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = LOW_ADDR,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
+       },
+       {
+               .addr = (void *) -1,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(-1, MAP_HUGETLB)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *) -1,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(-1, MAP_HUGETLB) again",
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+               .size = 2 * HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)",
+               .low_addr_required = 1,
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT),
+               .size = 2 * HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(ADDR_SWITCH_HINT, 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)",
+       },
+};
+
+static int run_test(struct testcase *test, int count)
+{
+       void *p;
+       int i, ret = 0;
+
+       for (i = 0; i < count; i++) {
+               struct testcase *t = test + i;
+
+               p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0);
+
+               printf("%s: %p - ", t->msg, p);
+
+               if (p == MAP_FAILED) {
+                       printf("FAILED\n");
+                       ret = 1;
+                       continue;
+               }
+
+               if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) {
+                       printf("FAILED\n");
+                       ret = 1;
+               } else {
+                       /*
+                        * Dereference the returned address so that we catch
+                        * bugs in page fault handling.
+                        */
+                       memset(p, 0, t->size);
+                       printf("OK\n");
+               }
+               if (!t->keep_mapped)
+                       munmap(p, t->size);
+       }
+
+       return ret;
+}
+
+static int supported_arch(void)
+{
+#if defined(__powerpc64__)
+       return 1;
+#elif defined(__x86_64__)
+       return 1;
+#else
+       return 0;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+       int ret;
+
+       if (!supported_arch())
+               return 0;
+
+       ret = run_test(testcases, ARRAY_SIZE(testcases));
+       if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
+               ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases));
+       return ret;
+}
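
For reference, the behaviour that the testcases above encode can be reproduced with a minimal standalone sketch. This is not part of the patch; it assumes a 4K base page size and the same 1UL << 47 switch point used above. A mapping hinted just below ADDR_SWITCH_HINT is expected to come back in the lower address space, mirroring the low_addr_required checks in run_test():

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
            unsigned long hint = 1UL << 47;         /* ADDR_SWITCH_HINT, 128TB */
            /* Hint one page below the switch point; the result should stay below it. */
            void *p = mmap((void *)(hint - 4096), 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            printf("%p: %s\n", p,
                   (unsigned long)p < hint ? "low address" : "high address");
            munmap(p, 4096);
            return 0;
    }

Built with a plain cc invocation and run on x86_64 or powerpc64, the printed address should fall below 128TB; passing a hint at or above the switch point (as the later testcases do) opts in to the larger address space.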
diff --git a/tools/testing/selftests/x86/5lvl.c b/tools/testing/selftests/x86/5lvl.c
deleted file mode 100644 (file)
index 2eafdcd..0000000
+++ /dev/null
@@ -1,177 +0,0 @@
-#include <stdio.h>
-#include <sys/mman.h>
-
-#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
-
-#define PAGE_SIZE      4096
-#define LOW_ADDR       ((void *) (1UL << 30))
-#define HIGH_ADDR      ((void *) (1UL << 50))
-
-struct testcase {
-       void *addr;
-       unsigned long size;
-       unsigned long flags;
-       const char *msg;
-       unsigned int low_addr_required:1;
-       unsigned int keep_mapped:1;
-};
-
-static struct testcase testcases[] = {
-       {
-               .addr = NULL,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(NULL)",
-               .low_addr_required = 1,
-       },
-       {
-               .addr = LOW_ADDR,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(LOW_ADDR)",
-               .low_addr_required = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(HIGH_ADDR)",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(HIGH_ADDR) again",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-               .msg = "mmap(HIGH_ADDR, MAP_FIXED)",
-       },
-       {
-               .addr = (void*) -1,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(-1)",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void*) -1,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(-1) again",
-       },
-       {
-               .addr = (void *)((1UL << 47) - PAGE_SIZE),
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap((1UL << 47), 2 * PAGE_SIZE)",
-               .low_addr_required = 1,
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void *)((1UL << 47) - PAGE_SIZE / 2),
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap((1UL << 47), 2 * PAGE_SIZE / 2)",
-               .low_addr_required = 1,
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void *)((1UL << 47) - PAGE_SIZE),
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-               .msg = "mmap((1UL << 47) - PAGE_SIZE, 2 * PAGE_SIZE, MAP_FIXED)",
-       },
-       {
-               .addr = NULL,
-               .size = 2UL << 20,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(NULL, MAP_HUGETLB)",
-               .low_addr_required = 1,
-       },
-       {
-               .addr = LOW_ADDR,
-               .size = 2UL << 20,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
-               .low_addr_required = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = 2UL << 20,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = 2UL << 20,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = 2UL << 20,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-               .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
-       },
-       {
-               .addr = (void*) -1,
-               .size = 2UL << 20,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(-1, MAP_HUGETLB)",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void*) -1,
-               .size = 2UL << 20,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(-1, MAP_HUGETLB) again",
-       },
-       {
-               .addr = (void *)((1UL << 47) - PAGE_SIZE),
-               .size = 4UL << 20,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap((1UL << 47), 4UL << 20, MAP_HUGETLB)",
-               .low_addr_required = 1,
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void *)((1UL << 47) - (2UL << 20)),
-               .size = 4UL << 20,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-               .msg = "mmap((1UL << 47) - (2UL << 20), 4UL << 20, MAP_FIXED | MAP_HUGETLB)",
-       },
-};
-
-int main(int argc, char **argv)
-{
-       int i;
-       void *p;
-
-       for (i = 0; i < ARRAY_SIZE(testcases); i++) {
-               struct testcase *t = testcases + i;
-
-               p = mmap(t->addr, t->size, PROT_NONE, t->flags, -1, 0);
-
-               printf("%s: %p - ", t->msg, p);
-
-               if (p == MAP_FAILED) {
-                       printf("FAILED\n");
-                       continue;
-               }
-
-               if (t->low_addr_required && p >= (void *)(1UL << 47))
-                       printf("FAILED\n");
-               else
-                       printf("OK\n");
-               if (!t->keep_mapped)
-                       munmap(p, t->size);
-       }
-       return 0;
-}
index e92903f..a8783f4 100644 (file)
@@ -169,9 +169,10 @@ static int         opt_raw;        /* for kernel developers */
 static int             opt_list;       /* list pages (in ranges) */
 static int             opt_no_summary; /* don't show summary */
 static pid_t           opt_pid;        /* process to walk */
-const char *           opt_file;       /* file or directory path */
+const char             *opt_file;      /* file or directory path */
 static uint64_t                opt_cgroup;     /* cgroup inode */
 static int             opt_list_cgroup;/* list page cgroup */
+static const char      *opt_kpageflags;/* kpageflags file to parse */
 
 #define MAX_ADDR_RANGES        1024
 static int             nr_addr_ranges;
@@ -258,7 +259,7 @@ static int checked_open(const char *pathname, int flags)
  * pagemap/kpageflags routines
  */
 
-static unsigned long do_u64_read(int fd, char *name,
+static unsigned long do_u64_read(int fd, const char *name,
                                 uint64_t *buf,
                                 unsigned long index,
                                 unsigned long count)
@@ -283,7 +284,7 @@ static unsigned long kpageflags_read(uint64_t *buf,
                                     unsigned long index,
                                     unsigned long pages)
 {
-       return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages);
+       return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages);
 }
 
 static unsigned long kpagecgroup_read(uint64_t *buf,
@@ -293,7 +294,7 @@ static unsigned long kpagecgroup_read(uint64_t *buf,
        if (kpagecgroup_fd < 0)
                return pages;
 
-       return do_u64_read(kpagecgroup_fd, PROC_KPAGEFLAGS, buf, index, pages);
+       return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages);
 }
 
 static unsigned long pagemap_read(uint64_t *buf,
@@ -743,7 +744,7 @@ static void walk_addr_ranges(void)
 {
        int i;
 
-       kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY);
+       kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
 
        if (!nr_addr_ranges)
                add_addr_range(0, ULONG_MAX);
@@ -790,6 +791,7 @@ static void usage(void)
 "            -N|--no-summary            Don't show summary info\n"
 "            -X|--hwpoison              hwpoison pages\n"
 "            -x|--unpoison              unpoison pages\n"
+"            -F|--kpageflags filename   kpageflags file to parse\n"
 "            -h|--help                  Show this usage message\n"
 "flags:\n"
 "            0x10                       bitfield format, e.g.\n"
@@ -1013,7 +1015,7 @@ static void walk_page_cache(void)
 {
        struct stat st;
 
-       kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY);
+       kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY);
        pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
        sigaction(SIGBUS, &sigbus_action, NULL);
 
@@ -1164,6 +1166,11 @@ static void parse_bits_mask(const char *optarg)
        add_bits_filter(mask, bits);
 }
 
+static void parse_kpageflags(const char *name)
+{
+       opt_kpageflags = name;
+}
+
 static void describe_flags(const char *optarg)
 {
        uint64_t flags = parse_flag_names(optarg, 0);
@@ -1188,6 +1195,7 @@ static const struct option opts[] = {
        { "no-summary", 0, NULL, 'N' },
        { "hwpoison"  , 0, NULL, 'X' },
        { "unpoison"  , 0, NULL, 'x' },
+       { "kpageflags", 1, NULL, 'F' },
        { "help"      , 0, NULL, 'h' },
        { NULL        , 0, NULL, 0 }
 };
@@ -1199,7 +1207,7 @@ int main(int argc, char *argv[])
        page_size = getpagesize();
 
        while ((c = getopt_long(argc, argv,
-                               "rp:f:a:b:d:c:ClLNXxh", opts, NULL)) != -1) {
+                               "rp:f:a:b:d:c:ClLNXxF:h", opts, NULL)) != -1) {
                switch (c) {
                case 'r':
                        opt_raw = 1;
@@ -1242,6 +1250,9 @@ int main(int argc, char *argv[])
                        opt_unpoison = 1;
                        prepare_hwpoison_fd();
                        break;
+               case 'F':
+                       parse_kpageflags(optarg);
+                       break;
                case 'h':
                        usage();
                        exit(0);
@@ -1251,6 +1262,9 @@ int main(int argc, char *argv[])
                }
        }
 
+       if (!opt_kpageflags)
+               opt_kpageflags = PROC_KPAGEFLAGS;
+
        if (opt_cgroup || opt_list_cgroup)
                kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY);
 
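
The new -F option only changes the path handed to checked_open(); the file format is unchanged, i.e. one 64-bit flags word per page frame, exactly as in /proc/kpageflags. A minimal sketch of a reader for such a file follows (not part of the patch; the path argument and PFN 0 are illustrative):

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            const char *path = argc > 1 ? argv[1] : "/proc/kpageflags";
            unsigned long pfn = 0;                  /* illustrative PFN */
            uint64_t flags;
            int fd = open(path, O_RDONLY);

            if (fd < 0)
                    return 1;
            /* Each PFN has an 8-byte flags entry at offset pfn * 8. */
            if (pread(fd, &flags, sizeof(flags), pfn * sizeof(flags)) == sizeof(flags))
                    printf("pfn %lu: flags 0x%llx\n", pfn, (unsigned long long)flags);
            close(fd);
            return 0;
    }

This is the same layout page-types itself reads through do_u64_read(), so -F may point at any file that follows it, with /proc/kpageflags remaining the default when the option is not given.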
index d6b9370..35db929 100644 (file)
@@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+       .flags                  = MMU_INVALIDATE_DOES_NOT_BLOCK,
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,