Merge tag 'acpi-4.15-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael...

[linux-2.6-microblaze.git] / mm / huge_memory.c
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 1981ed6..0e7ded9 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -39,10 +39,10 @@
  #include "internal.h"
  
  /*
- * By default transparent hugepage support is disabled in order that avoid
- * to risk increase the memory footprint of applications without a guaranteed
- * benefit. When transparent hugepage support is enabled, is for all mappings,
- * and khugepaged scans all mappings.
+ * By default, transparent hugepage support is disabled in order to avoid
+ * risking an increased memory footprint for applications that are not
+ * guaranteed to benefit from it. When transparent hugepage support is
+ * enabled, it is for all mappings, and khugepaged scans all mappings.
   * Defrag is invoked by khugepaged hugepage allocations and by page faults
   * for all hugepage allocations.
   */
@@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
                 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
                 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-               atomic_long_inc(&vma->vm_mm->nr_ptes);
+               mm_inc_nr_ptes(vma->vm_mm);
                 spin_unlock(vmf->ptl);
                 count_vm_event(THP_FAULT_ALLOC);
         }
@@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
         if (pgtable)
                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
         set_pmd_at(mm, haddr, pmd, entry);
-       atomic_long_inc(&mm->nr_ptes);
+       mm_inc_nr_ptes(mm);
         return true;
  }
  
@@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
  
         if (pgtable) {
                 pgtable_trans_huge_deposit(mm, pmd, pgtable);
-               atomic_long_inc(&mm->nr_ptes);
+               mm_inc_nr_ptes(mm);
         }
  
         set_pmd_at(mm, addr, pmd, entry);
@@ -842,20 +842,15 @@ EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  
  static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
-               pmd_t *pmd)
+               pmd_t *pmd, int flags)
  {
         pmd_t _pmd;
  
-       /*
-        * We should set the dirty bit only for FOLL_WRITE but for now
-        * the dirty bit in the pmd is meaningless.  And if the dirty
-        * bit will become meaningful and we'll only set it with
-        * FOLL_WRITE, an atomic set_bit will be required on the pmd to
-        * set the young bit, instead of the current set_pmd_at.
-        */
-       _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+       _pmd = pmd_mkyoung(*pmd);
+       if (flags & FOLL_WRITE)
+               _pmd = pmd_mkdirty(_pmd);
         if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-                               pmd, _pmd,  1))
+                               pmd, _pmd, flags & FOLL_WRITE))
                 update_mmu_cache_pmd(vma, addr, pmd);
  }
  
@@ -884,7 +879,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                 return NULL;
  
         if (flags & FOLL_TOUCH)
-               touch_pmd(vma, addr, pmd);
+               touch_pmd(vma, addr, pmd, flags);
  
         /*
          * device mapped pages can only be returned if the
@@ -942,7 +937,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                         set_pmd_at(src_mm, addr, src_pmd, pmd);
                 }
                 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-               atomic_long_inc(&dst_mm->nr_ptes);
+               mm_inc_nr_ptes(dst_mm);
                 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
                 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
                 ret = 0;
@@ -978,7 +973,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         get_page(src_page);
         page_dup_rmap(src_page, true);
         add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-       atomic_long_inc(&dst_mm->nr_ptes);
+       mm_inc_nr_ptes(dst_mm);
         pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
  
         pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -995,20 +990,15 @@ out:
  
  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
  static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
-               pud_t *pud)
+               pud_t *pud, int flags)
  {
         pud_t _pud;
  
-       /*
-        * We should set the dirty bit only for FOLL_WRITE but for now
-        * the dirty bit in the pud is meaningless.  And if the dirty
-        * bit will become meaningful and we'll only set it with
-        * FOLL_WRITE, an atomic set_bit will be required on the pud to
-        * set the young bit, instead of the current set_pud_at.
-        */
-       _pud = pud_mkyoung(pud_mkdirty(*pud));
+       _pud = pud_mkyoung(*pud);
+       if (flags & FOLL_WRITE)
+               _pud = pud_mkdirty(_pud);
         if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
-                               pud, _pud,  1))
+                               pud, _pud, flags & FOLL_WRITE))
                 update_mmu_cache_pud(vma, addr, pud);
  }
  
@@ -1031,7 +1021,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
                 return NULL;
  
         if (flags & FOLL_TOUCH)
-               touch_pud(vma, addr, pud);
+               touch_pud(vma, addr, pud, flags);
  
         /*
          * device mapped pages can only be returned if the
@@ -1189,8 +1179,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
                 goto out_free_pages;
         VM_BUG_ON_PAGE(!PageHead(page), page);
  
+       /*
+        * Leave pmd empty until pte is filled note we must notify here as
+        * concurrent CPU thread might write to new page before the call to
+        * mmu_notifier_invalidate_range_end() happens which can lead to a
+        * device seeing memory write in different order than CPU.
+        *
+        * See Documentation/vm/mmu_notifier.txt
+        */
         pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-       /* leave pmd empty until pte is filled */
  
         pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
         pmd_populate(vma->vm_mm, &_pmd, pgtable);
@@ -1216,7 +1213,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
         page_remove_rmap(page, true);
         spin_unlock(vmf->ptl);
  
-       mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+       /*
+        * No need to double call mmu_notifier->invalidate_range() callback as
+        * the above pmdp_huge_clear_flush_notify() did already call it.
+        */
+       mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+                                               mmun_end);
  
         ret |= VM_FAULT_WRITE;
         put_page(page);
@@ -1365,7 +1367,12 @@ alloc:
         }
         spin_unlock(vmf->ptl);
  out_mn:
-       mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+       /*
+        * No need to double call mmu_notifier->invalidate_range() callback as
+        * the above pmdp_huge_clear_flush_notify() did already call it.
+        */
+       mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+                                              mmun_end);
  out:
         return ret;
  out_unlock:
@@ -1407,7 +1414,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
         page = pmd_page(*pmd);
         VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
         if (flags & FOLL_TOUCH)
-               touch_pmd(vma, addr, pmd);
+               touch_pmd(vma, addr, pmd, flags);
         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                 /*
                  * We don't mlock() pte-mapped THPs. This way we can avoid
@@ -1678,7 +1685,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
  
         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
         pte_free(mm, pgtable);
-       atomic_long_dec(&mm->nr_ptes);
+       mm_dec_nr_ptes(mm);
  }
  
  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
@@ -2017,7 +2024,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
  
  out:
         spin_unlock(ptl);
-       mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
+       /*
+        * No need to double call mmu_notifier->invalidate_range() callback as
+        * the above pudp_huge_clear_flush_notify() did already call it.
+        */
+       mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
+                                              HPAGE_PUD_SIZE);
  }
  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  
@@ -2029,8 +2041,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
         pmd_t _pmd;
         int i;
  
-       /* leave pmd empty until pte is filled */
-       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+       /*
+        * Leave pmd empty until pte is filled note that it is fine to delay
+        * notification until mmu_notifier_invalidate_range_end() as we are
+        * replacing a zero pmd write protected page with a zero pte write
+        * protected page.
+        *
+        * See Documentation/vm/mmu_notifier.txt
+        */
+       pmdp_huge_clear_flush(vma, haddr, pmd);
  
         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
         pmd_populate(mm, &_pmd, pgtable);
@@ -2085,6 +2104,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                 add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
                 return;
         } else if (is_huge_zero_pmd(*pmd)) {
+               /*
+                * FIXME: Do we want to invalidate secondary mmu by calling
+                * mmu_notifier_invalidate_range() see comments below inside
+                * __split_huge_pmd() ?
+                *
+                * We are going from a zero huge page write protected to zero
+                * small page also write protected so it does not seems useful
+                * to invalidate secondary mmu at this time.
+                */
                 return __split_huge_zero_page_pmd(vma, haddr, pmd);
         }
  
@@ -2220,7 +2248,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
         __split_huge_pmd_locked(vma, pmd, haddr, freeze);
  out:
         spin_unlock(ptl);
-       mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       /*
+        * No need to double call mmu_notifier->invalidate_range() callback.
+        * They are 3 cases to consider inside __split_huge_pmd_locked():
+        *  1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
+        *  2) __split_huge_zero_page_pmd() read only zero page and any write
+        *    fault will trigger a flush_notify before pointing to a new page
+        *    (it is fine if the secondary mmu keeps pointing to the old zero
+        *    page in the meantime)
+        *  3) Split a huge pmd into pte pointing to the same page. No need
+        *     to invalidate secondary tlb entry they are all still valid.
+        *     any further changes to individual pte will notify. So no need
+        *     to call mmu_notifier->invalidate_range()
+        */
+       mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
+                                              HPAGE_PMD_SIZE);
  }
  
  void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
@@ -2718,7 +2760,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
                 struct shrink_control *sc)
  {
         struct pglist_data *pgdata = NODE_DATA(sc->nid);
-       return ACCESS_ONCE(pgdata->split_queue_len);
+       return READ_ONCE(pgdata->split_queue_len);
  }
  
  static unsigned long deferred_split_scan(struct shrinker *shrink,