Merge branch 'ptp-virtual-clocks-and-timestamping'

[linux-2.6-microblaze.git] / mm / huge_memory.c
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index ae907a9..6d2a011 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -7,6 +7,7 @@
  
  #include <linux/mm.h>
  #include <linux/sched.h>
+#include <linux/sched/mm.h>
  #include <linux/sched/coredump.h>
  #include <linux/sched/numa_balancing.h>
  #include <linux/highmem.h>
@@ -61,6 +62,7 @@ static struct shrinker deferred_split_shrinker;
  
  static atomic_t huge_zero_refcount;
  struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
  
  bool transparent_hugepage_enabled(struct vm_area_struct *vma)
  {
@@ -77,18 +79,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
         return false;
  }
  
-static struct page *get_huge_zero_page(void)
+static bool get_huge_zero_page(void)
  {
         struct page *zero_page;
  retry:
         if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
-               return READ_ONCE(huge_zero_page);
+               return true;
  
         zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                         HPAGE_PMD_ORDER);
         if (!zero_page) {
                 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
-               return NULL;
+               return false;
         }
         count_vm_event(THP_ZERO_PAGE_ALLOC);
         preempt_disable();
@@ -97,11 +99,12 @@ retry:
                 __free_pages(zero_page, compound_order(zero_page));
                 goto retry;
         }
+       WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
  
         /* We take additional reference here. It will be put back by shrinker */
         atomic_set(&huge_zero_refcount, 2);
         preempt_enable();
-       return READ_ONCE(huge_zero_page);
+       return true;
  }
  
  static void put_huge_zero_page(void)
@@ -146,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
         if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
                 struct page *zero_page = xchg(&huge_zero_page, NULL);
                 BUG_ON(zero_page == NULL);
+               WRITE_ONCE(huge_zero_pfn, ~0UL);
                 __free_pages(zero_page, compound_order(zero_page));
                 return HPAGE_PMD_NR;
         }
@@ -624,14 +628,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
  
                 /* Deliver the page fault to userland */
                 if (userfaultfd_missing(vma)) {
-                       vm_fault_t ret2;
-
                         spin_unlock(vmf->ptl);
                         put_page(page);
                         pte_free(vma->vm_mm, pgtable);
-                       ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
-                       VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
-                       return ret2;
+                       ret = handle_userfault(vmf, VM_UFFD_MISSING);
+                       VM_BUG_ON(ret & VM_FAULT_FALLBACK);
+                       return ret;
                 }
  
                 entry = mk_huge_pmd(page, vma->vm_page_prot);
@@ -1293,7 +1295,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
         }
  
         page = pmd_page(orig_pmd);
-       VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+       VM_BUG_ON_PAGE(!PageHead(page), page);
  
         /* Lock page for reuse_swap_page() */
         if (!trylock_page(page)) {
@@ -1464,12 +1466,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
          */
         page_locked = trylock_page(page);
         target_nid = mpol_misplaced(page, vma, haddr);
-       if (target_nid == NUMA_NO_NODE) {
-               /* If the page was locked, there are no parallel migrations */
-               if (page_locked)
-                       goto clear_pmdnuma;
-       }
-
         /* Migration could have started since the pmd_trans_migrating check */
         if (!page_locked) {
                 page_nid = NUMA_NO_NODE;
@@ -1478,6 +1474,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
                 spin_unlock(vmf->ptl);
                 put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
                 goto out;
+       } else if (target_nid == NUMA_NO_NODE) {
+               /* There are no parallel migrations and page is in the right
+                * node. Clear the numa hinting info in this pmd.
+                */
+               goto clear_pmdnuma;
         }
  
         /*
@@ -1696,7 +1697,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
  
                         VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
                         entry = pmd_to_swp_entry(orig_pmd);
-                       page = pfn_to_page(swp_offset(entry));
+                       page = migration_entry_to_page(entry);
                         flush_needed = 0;
                 } else
                         WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1794,8 +1795,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
  /*
   * Returns
   *  - 0 if PMD could not be locked
- *  - 1 if PMD was locked but protections unchange and TLB flush unnecessary
- *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
   */
  int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
@@ -2046,7 +2047,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
         count_vm_event(THP_SPLIT_PMD);
  
         if (!vma_is_anonymous(vma)) {
-               _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+               old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
                 /*
                  * We are going to unmap this huge page. So
                  * just go ahead and zap it
@@ -2055,16 +2056,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                         zap_deposited_table(mm, pmd);
                 if (vma_is_special_huge(vma))
                         return;
-               page = pmd_page(_pmd);
-               if (!PageDirty(page) && pmd_dirty(_pmd))
-                       set_page_dirty(page);
-               if (!PageReferenced(page) && pmd_young(_pmd))
-                       SetPageReferenced(page);
-               page_remove_rmap(page, true);
-               put_page(page);
+               if (unlikely(is_pmd_migration_entry(old_pmd))) {
+                       swp_entry_t entry;
+
+                       entry = pmd_to_swp_entry(old_pmd);
+                       page = migration_entry_to_page(entry);
+               } else {
+                       page = pmd_page(old_pmd);
+                       if (!PageDirty(page) && pmd_dirty(old_pmd))
+                               set_page_dirty(page);
+                       if (!PageReferenced(page) && pmd_young(old_pmd))
+                               SetPageReferenced(page);
+                       page_remove_rmap(page, true);
+                       put_page(page);
+               }
                 add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
                 return;
-       } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+       }
+
+       if (is_huge_zero_pmd(*pmd)) {
                 /*
                  * FIXME: Do we want to invalidate secondary mmu by calling
                  * mmu_notifier_invalidate_range() see comments below inside
@@ -2104,7 +2114,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                 swp_entry_t entry;
  
                 entry = pmd_to_swp_entry(old_pmd);
-               page = pfn_to_page(swp_offset(entry));
+               page = migration_entry_to_page(entry);
                 write = is_write_migration_entry(entry);
                 young = false;
                 soft_dirty = pmd_swp_soft_dirty(old_pmd);
@@ -2303,60 +2313,54 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
         __split_huge_pmd(vma, pmd, address, freeze, page);
  }
  
+/*
+ * Split the huge pmd covering @address in @vma, if there can be one.
+ */
+static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
+{
+       unsigned long hpage_start, hpage_end;
+
+       /* Nothing to do when the address is already hpage aligned. */
+       if (IS_ALIGNED(address, HPAGE_PMD_SIZE))
+               return;
+
+       /*
+        * The hpage-sized range around the address could previously contain
+        * a hugepage only if it lies within the vma: check if we need to
+        * split a huge pmd in that case.
+        */
+       hpage_start = ALIGN_DOWN(address, HPAGE_PMD_SIZE);
+       hpage_end = ALIGN(address, HPAGE_PMD_SIZE);
+       if (range_in_vma(vma, hpage_start, hpage_end))
+               split_huge_pmd_address(vma, address, false, NULL);
+}
+
  void vma_adjust_trans_huge(struct vm_area_struct *vma,
                              unsigned long start,
                              unsigned long end,
                              long adjust_next)
  {
-       /*
-        * If the new start address isn't hpage aligned and it could
-        * previously contain an hugepage: check if we need to split
-        * an huge pmd.
-        */
-       if (start & ~HPAGE_PMD_MASK &&
-           (start & HPAGE_PMD_MASK) >= vma->vm_start &&
-           (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_pmd_address(vma, start, false, NULL);
+       /* Check if we need to split start first. */
+       split_huge_pmd_if_needed(vma, start);
  
-       /*
-        * If the new end address isn't hpage aligned and it could
-        * previously contain an hugepage: check if we need to split
-        * an huge pmd.
-        */
-       if (end & ~HPAGE_PMD_MASK &&
-           (end & HPAGE_PMD_MASK) >= vma->vm_start &&
-           (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
-               split_huge_pmd_address(vma, end, false, NULL);
+       /* Check if we need to split end next. */
+       split_huge_pmd_if_needed(vma, end);
  
         /*
-        * If we're also updating the vma->vm_next->vm_start, if the new
-        * vm_next->vm_start isn't hpage aligned and it could previously
-        * contain an hugepage: check if we need to split an huge pmd.
+        * If adjust_next > 0, vma->vm_next->vm_start is being updated too:
+        * check if a huge pmd at its new value needs to be split.
          */
         if (adjust_next > 0) {
                 struct vm_area_struct *next = vma->vm_next;
                 unsigned long nstart = next->vm_start;
                 nstart += adjust_next;
-               if (nstart & ~HPAGE_PMD_MASK &&
-                   (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
-                   (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
-                       split_huge_pmd_address(next, nstart, false, NULL);
+               split_huge_pmd_if_needed(next, nstart);
         }
  }
  
  static void unmap_page(struct page *page)
  {
-       enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
+       enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
                 TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
-       bool unmap_success;
  
         VM_BUG_ON_PAGE(!PageHead(page), page);
  
         if (PageAnon(page))
                 ttu_flags |= TTU_SPLIT_FREEZE;
  
-       unmap_success = try_to_unmap(page, ttu_flags);
-       VM_BUG_ON_PAGE(!unmap_success, page);
+       try_to_unmap(page, ttu_flags);
+
+       VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
  }
  
  static void remap_page(struct page *page, unsigned int nr)
@@ -2477,7 +2481,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
                 xa_lock(&swap_cache->i_pages);
         }
  
-       /* lock lru list/PageCompound, ref freezed by page_ref_freeze */
+       /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
         lruvec = lock_page_lruvec(head);
  
         for (i = nr - 1; i >= 1; i--) {
@@ -2667,7 +2671,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         struct deferred_split *ds_queue = get_deferred_split_queue(head);
         struct anon_vma *anon_vma = NULL;
         struct address_space *mapping = NULL;
-       int count, mapcount, extra_pins, ret;
+       int extra_pins, ret;
         pgoff_t end;
  
         VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@ -2726,7 +2730,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         }
  
         unmap_page(head);
-       VM_BUG_ON_PAGE(compound_mapcount(head), head);
  
         /* block interrupt reentry in xa_lock and spinlock */
         local_irq_disable();
@@ -2744,9 +2747,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
  
         /* Prevent deferred_split_scan() touching ->_refcount */
         spin_lock(&ds_queue->split_queue_lock);
-       count = page_count(head);
-       mapcount = total_mapcount(head);
-       if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
+       if (page_ref_freeze(head, 1 + extra_pins)) {
                 if (!list_empty(page_deferred_list(head))) {
                         ds_queue->split_queue_len--;
                         list_del(page_deferred_list(head));
@@ -2766,16 +2767,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                 __split_huge_page(page, list, end);
                 ret = 0;
         } else {
-               if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
-                       pr_alert("total_mapcount: %u, page_count(): %u\n",
-                                       mapcount, count);
-                       if (PageTail(page))
-                               dump_page(head, NULL);
-                       dump_page(page, "total_mapcount(head) > 0");
-                       BUG();
-               }
                 spin_unlock(&ds_queue->split_queue_lock);
-fail:          if (mapping)
+fail:
+               if (mapping)
                         xa_unlock(&mapping->i_pages);
                 local_irq_enable();
                 remap_page(head, thp_nr_pages(head));
@@ -2838,8 +2832,8 @@ void deferred_split_huge_page(struct page *page)
                 ds_queue->split_queue_len++;
  #ifdef CONFIG_MEMCG
                 if (memcg)
-                       memcg_set_shrinker_bit(memcg, page_to_nid(page),
-                                              deferred_split_shrinker.id);
+                       set_shrinker_bit(memcg, page_to_nid(page),
+                                        deferred_split_shrinker.id);
  #endif
         }
         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
@@ -2924,16 +2918,14 @@ static struct shrinker deferred_split_shrinker = {
  };
  
  #ifdef CONFIG_DEBUG_FS
-static int split_huge_pages_set(void *data, u64 val)
+static void split_huge_pages_all(void)
  {
         struct zone *zone;
         struct page *page;
         unsigned long pfn, max_zone_pfn;
         unsigned long total = 0, split = 0;
  
-       if (val != 1)
-               return -EINVAL;
-
+       pr_debug("Split all THPs\n");
         for_each_populated_zone(zone) {
                 max_zone_pfn = zone_end_pfn(zone);
                 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
@@ -2957,15 +2949,243 @@ static int split_huge_pages_set(void *data, u64 val)
                         unlock_page(page);
  next:
                         put_page(page);
+                       cond_resched();
                 }
         }
  
-       pr_info("%lu of %lu THP split\n", split, total);
+       pr_debug("%lu of %lu THP split\n", split, total);
+}
  
-       return 0;
+/*
+ * True for the VMAs that the per-pid split walk skips: special huge
+ * mappings, VM_IO mappings and hugetlb VMAs.
+ */
+static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
+{
+       if (vma_is_special_huge(vma))
+               return true;
+       if (vma->vm_flags & VM_IO)
+               return true;
+       return is_vm_hugetlb_page(vma);
  }
-DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
-               "%llu\n");
+
+/*
+ * Try to split the THPs mapped in [vaddr_start, vaddr_end) of the task @pid.
+ *
+ * Both bounds are masked with PAGE_MASK first. The walk stops at the first
+ * address that no VMA covers; pages that are not THPs, cannot be split, or
+ * whose page lock cannot be taken immediately are left alone.
+ *
+ * Returns 0 after walking the range, -ESRCH if no task is found for @pid,
+ * or -EINVAL if that task has no mm.
+ */
+static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
+                               unsigned long vaddr_end)
+{
+       struct task_struct *task;
+       struct mm_struct *mm;
+       unsigned long total = 0, split = 0;
+       unsigned long addr;
+
+       vaddr_start &= PAGE_MASK;
+       vaddr_end &= PAGE_MASK;
+
+       /* Find the task_struct from pid */
+       rcu_read_lock();
+       task = find_task_by_vpid(pid);
+       if (!task) {
+               rcu_read_unlock();
+               return -ESRCH;
+       }
+       get_task_struct(task);
+       rcu_read_unlock();
+
+       /* Find the mm_struct; the task reference is not needed afterwards */
+       mm = get_task_mm(task);
+       put_task_struct(task);
+       if (!mm)
+               return -EINVAL;
+
+       pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
+                pid, vaddr_start, vaddr_end);
+
+       mmap_read_lock(mm);
+       /*
+        * always increase addr by PAGE_SIZE, since we could have a PTE page
+        * table filled with PTE-mapped THPs, each of which is distinct.
+        */
+       for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
+               struct vm_area_struct *vma = find_vma(mm, addr);
+               struct page *page;
+
+               if (!vma || addr < vma->vm_start)
+                       break;
+
+               /*
+                * Skip special VMA and hugetlb VMA. The loop increment adds
+                * PAGE_SIZE, so step back one page to resume exactly at
+                * vma->vm_end instead of one page past it.
+                */
+               if (vma_not_suitable_for_thp_split(vma)) {
+                       addr = vma->vm_end - PAGE_SIZE;
+                       continue;
+               }
+
+               /* FOLL_DUMP to ignore special (like zero) pages */
+               page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+
+               /* No page reference was taken: nothing to put back */
+               if (IS_ERR(page) || !page)
+                       goto resched;
+
+               if (!is_transparent_hugepage(page))
+                       goto next;
+
+               total++;
+               if (!can_split_huge_page(compound_head(page), NULL))
+                       goto next;
+
+               if (!trylock_page(page))
+                       goto next;
+
+               /* split_huge_page() returns 0 on success */
+               if (!split_huge_page(page))
+                       split++;
+
+               unlock_page(page);
+next:
+               put_page(page);
+resched:
+               /* Also reached for unpopulated addresses in a sparse range */
+               cond_resched();
+       }
+       mmap_read_unlock(mm);
+       mmput(mm);
+
+       pr_debug("%lu of %lu THP split\n", split, total);
+
+       return 0;
+}
+
+/*
+ * Try to split the THPs found in the page cache of @file_path for the page
+ * offsets [off_start, off_end).
+ *
+ * Returns 0 once the range has been walked, or -EINVAL when the file name
+ * cannot be set up or the file cannot be opened.
+ */
+static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
+                               pgoff_t off_end)
+{
+       struct filename *name;
+       struct file *filp;
+       struct address_space *mapping;
+       unsigned long nr_seen = 0, nr_split = 0;
+       pgoff_t index;
+       int step = 1;
+       int ret = -EINVAL;
+
+       name = getname_kernel(file_path);
+       if (IS_ERR(name))
+               return ret;
+
+       filp = file_open_name(name, O_RDONLY, 0);
+       if (IS_ERR(filp))
+               goto out;
+
+       pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
+                file_path, off_start, off_end);
+
+       mapping = filp->f_mapping;
+
+       for (index = off_start; index < off_end; index += step) {
+               struct page *fpage = pagecache_get_page(mapping, index,
+                                               FGP_ENTRY | FGP_HEAD, 0);
+
+               /* Advance one page unless a THP is found at this index */
+               step = 1;
+               if (!fpage || xa_is_value(fpage))
+                       continue;
+
+               if (is_transparent_hugepage(fpage)) {
+                       nr_seen++;
+                       /* Read the size before a split can change it */
+                       step = thp_nr_pages(fpage);
+
+                       if (trylock_page(fpage)) {
+                               if (!split_huge_page(fpage))
+                                       nr_split++;
+                               unlock_page(fpage);
+                       }
+               }
+
+               put_page(fpage);
+               cond_resched();
+       }
+
+       filp_close(filp, NULL);
+       ret = 0;
+
+       pr_debug("%lu of %lu file-backed THP split\n", nr_split, nr_seen);
+out:
+       putname(name);
+       return ret;
+}
+
+#define MAX_INPUT_BUF_SZ 255
+
+/*
+ * debugfs write handler. At most MAX_INPUT_BUF_SZ - 1 bytes of input are
+ * used, in one of these forms:
+ *   "1"                                    split all THPs
+ *   "<pid>,0x<vaddr_start>,0x<vaddr_end>"  split THPs mapped by a task
+ *   "/<path>,0x<off_start>,0x<off_end>"    split THPs of a file
+ *
+ * Returns the length of the input string on success, -EFAULT if the user
+ * buffer cannot be copied, -EINVAL on malformed input, the error from
+ * mutex_lock_interruptible(), or the error of the selected split helper.
+ */
+static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppops)
+{
+       static DEFINE_MUTEX(split_debug_mutex);
+       ssize_t ret;
+       /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */
+       char input_buf[MAX_INPUT_BUF_SZ];
+       int pid;
+       unsigned long vaddr_start, vaddr_end;
+
+       ret = mutex_lock_interruptible(&split_debug_mutex);
+       if (ret)
+               return ret;
+
+       ret = -EFAULT;
+
+       memset(input_buf, 0, MAX_INPUT_BUF_SZ);
+       if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
+               goto out;
+
+       /* Longer input is truncated; the buffer is always NUL-terminated */
+       input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
+
+       if (input_buf[0] == '/') {
+               char *args = input_buf;
+               char *file_path;
+               pgoff_t off_start = 0, off_end = 0;
+               /* Measure before strsep() cuts the string at the first ',' */
+               size_t input_len = strlen(input_buf);
+
+               /*
+                * strsep() leaves args NULL when the input contains no ','.
+                * Reject that instead of passing a NULL string to sscanf().
+                * The path stays in input_buf, NUL-terminated by strsep().
+                */
+               file_path = strsep(&args, ",");
+               if (!file_path || !args) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               ret = sscanf(args, "0x%lx,0x%lx", &off_start, &off_end);
+               if (ret != 2) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               ret = split_huge_pages_in_file(file_path, off_start, off_end);
+               if (!ret)
+                       ret = input_len;
+
+               goto out;
+       }
+
+       ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end);
+       if (ret == 1 && pid == 1) {
+               /* A lone "1" asks for every THP to be split */
+               split_huge_pages_all();
+               ret = strlen(input_buf);
+               goto out;
+       } else if (ret != 3) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end);
+       if (!ret)
+               ret = strlen(input_buf);
+out:
+       mutex_unlock(&split_debug_mutex);
+       return ret;
+}
+
+static const struct file_operations split_huge_pages_fops = {
+       .owner   = THIS_MODULE,
+       .write   = split_huge_pages_write,      /* no .read handler is set */
+       .llseek  = no_llseek,
+};
  
  static int __init split_huge_pages_debugfs(void)
  {