diff --git a/mm/madvise.c b/mm/madvise.c
index 5f0f094..44a498c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior)
        case MADV_FREE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
+       case MADV_COLLAPSE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
@@ -94,9 +95,6 @@ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
 {
        mmap_assert_locked(vma->vm_mm);
 
-       if (vma->vm_file)
-               return NULL;
-
        return vma->anon_name;
 }
 
@@ -132,7 +130,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
 #endif /* CONFIG_ANON_VMA_NAME */
 /*
  * Update the vm_flags on region of a vma, splitting it or merging it as
- * necessary.  Must be called with mmap_sem held for writing;
+ * necessary.  Must be called with mmap_lock held for writing;
  * Caller should ensure anon_name stability by raising its refcount even when
  * anon_name belongs to a valid vma because this function might free that vma.
  */
@@ -143,46 +141,24 @@ static int madvise_update_vma(struct vm_area_struct *vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        int error;
-       pgoff_t pgoff;
+       VMA_ITERATOR(vmi, mm, start);
 
        if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
                *prev = vma;
                return 0;
        }
 
-       pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
-       *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
-                         vma->vm_file, pgoff, vma_policy(vma),
-                         vma->vm_userfaultfd_ctx, anon_name);
-       if (*prev) {
-               vma = *prev;
-               goto success;
-       }
+       vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
+                                   anon_name);
+       if (IS_ERR(vma))
+               return PTR_ERR(vma);
 
        *prev = vma;
 
-       if (start != vma->vm_start) {
-               if (unlikely(mm->map_count >= sysctl_max_map_count))
-                       return -ENOMEM;
-               error = __split_vma(mm, vma, start, 1);
-               if (error)
-                       return error;
-       }
-
-       if (end != vma->vm_end) {
-               if (unlikely(mm->map_count >= sysctl_max_map_count))
-                       return -ENOMEM;
-               error = __split_vma(mm, vma, end, 0);
-               if (error)
-                       return error;
-       }
-
-success:
-       /*
-        * vm_flags is protected by the mmap_lock held in write mode.
-        */
-       vma->vm_flags = new_flags;
-       if (!vma->vm_file) {
+       /* vm_flags is protected by the mmap_lock held in write mode. */
+       vma_start_write(vma);
+       vm_flags_reset(vma, new_flags);
+       if (!vma->vm_file || vma_is_anon_shmem(vma)) {
                error = replace_anon_vma_name(vma, anon_name);
                if (error)
                        return error;
@@ -193,79 +169,89 @@ success:
 
 #ifdef CONFIG_SWAP
 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
-       unsigned long end, struct mm_walk *walk)
+               unsigned long end, struct mm_walk *walk)
 {
        struct vm_area_struct *vma = walk->private;
-       unsigned long index;
        struct swap_iocb *splug = NULL;
+       pte_t *ptep = NULL;
+       spinlock_t *ptl;
+       unsigned long addr;
 
-       if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-               return 0;
-
-       for (index = start; index != end; index += PAGE_SIZE) {
+       for (addr = start; addr < end; addr += PAGE_SIZE) {
                pte_t pte;
                swp_entry_t entry;
-               struct page *page;
-               spinlock_t *ptl;
-               pte_t *ptep;
+               struct folio *folio;
 
-               ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
-               pte = *ptep;
-               pte_unmap_unlock(ptep, ptl);
+               if (!ptep++) {
+                       ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+                       if (!ptep)
+                               break;
+               }
 
+               pte = ptep_get(ptep);
                if (!is_swap_pte(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                if (unlikely(non_swap_entry(entry)))
                        continue;
 
-               page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
-                                            vma, index, false, &splug);
-               if (page)
-                       put_page(page);
+               pte_unmap_unlock(ptep, ptl);
+               ptep = NULL;
+
+               folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+                                            vma, addr, &splug);
+               if (folio)
+                       folio_put(folio);
        }
+
+       if (ptep)
+               pte_unmap_unlock(ptep, ptl);
        swap_read_unplug(splug);
+       cond_resched();
 
        return 0;
 }
 
 static const struct mm_walk_ops swapin_walk_ops = {
        .pmd_entry              = swapin_walk_pmd_entry,
+       .walk_lock              = PGWALK_RDLOCK,
 };
 
-static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+static void shmem_swapin_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct address_space *mapping)
 {
        XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
-       pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
-       struct page *page;
+       pgoff_t end_index = linear_page_index(vma, end) - 1;
+       struct folio *folio;
        struct swap_iocb *splug = NULL;
 
        rcu_read_lock();
-       xas_for_each(&xas, page, end_index) {
-               swp_entry_t swap;
+       xas_for_each(&xas, folio, end_index) {
+               unsigned long addr;
+               swp_entry_t entry;
 
-               if (!xa_is_value(page))
+               if (!xa_is_value(folio))
                        continue;
-               swap = radix_to_swp_entry(page);
+               entry = radix_to_swp_entry(folio);
                /* There might be swapin error entries in shmem mapping. */
-               if (non_swap_entry(swap))
+               if (non_swap_entry(entry))
                        continue;
+
+               addr = vma->vm_start +
+                       ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
                xas_pause(&xas);
                rcu_read_unlock();
 
-               page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
-                                            NULL, 0, false, &splug);
-               if (page)
-                       put_page(page);
+               folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
+                                            vma, addr, &splug);
+               if (folio)
+                       folio_put(folio);
 
                rcu_read_lock();
        }
        rcu_read_unlock();
        swap_read_unplug(splug);
-
-       lru_add_drain();        /* Push any new pages onto the LRU now */
 }
 #endif         /* CONFIG_SWAP */
 
@@ -289,8 +275,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
        }
 
        if (shmem_mapping(file->f_mapping)) {
-               force_shm_swapin_readahead(vma, start, end,
-                                       file->f_mapping);
+               shmem_swapin_range(vma, start, end, file->f_mapping);
+               lru_add_drain(); /* Push any new pages onto the LRU now */
                return 0;
        }
 #else
@@ -320,6 +306,21 @@ static long madvise_willneed(struct vm_area_struct *vma,
        return 0;
 }
 
+static inline bool can_do_file_pageout(struct vm_area_struct *vma)
+{
+       if (!vma->vm_file)
+               return false;
+       /*
+        * paging out pagecache only for non-anonymous mappings that correspond
+        * to the files the calling process could (if tried) open for writing;
+        * otherwise we'd be including shared non-exclusive mappings, which
+        * opens a side channel.
+        */
+       return inode_owner_or_capable(&nop_mnt_idmap,
+                                     file_inode(vma->vm_file)) ||
+              file_permission(vma->vm_file, MAY_WRITE) == 0;
+}
+
 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
@@ -329,14 +330,19 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
        bool pageout = private->pageout;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
-       pte_t *orig_pte, *pte, ptent;
+       pte_t *start_pte, *pte, ptent;
        spinlock_t *ptl;
-       struct page *page = NULL;
-       LIST_HEAD(page_list);
+       struct folio *folio = NULL;
+       LIST_HEAD(folio_list);
+       bool pageout_anon_only_filter;
+       unsigned int batch_count = 0;
 
        if (fatal_signal_pending(current))
                return -EINTR;
 
+       pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
+                                       !can_do_file_pageout(vma);
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(*pmd)) {
                pmd_t orig_pmd;
@@ -357,27 +363,30 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                        goto huge_unlock;
                }
 
-               page = pmd_page(orig_pmd);
+               folio = pfn_folio(pmd_pfn(orig_pmd));
 
-               /* Do not interfere with other mappings of this page */
-               if (page_mapcount(page) != 1)
+               /* Do not interfere with other mappings of this folio */
+               if (folio_estimated_sharers(folio) != 1)
+                       goto huge_unlock;
+
+               if (pageout_anon_only_filter && !folio_test_anon(folio))
                        goto huge_unlock;
 
                if (next - addr != HPAGE_PMD_SIZE) {
                        int err;
 
-                       get_page(page);
+                       folio_get(folio);
                        spin_unlock(ptl);
-                       lock_page(page);
-                       err = split_huge_page(page);
-                       unlock_page(page);
-                       put_page(page);
+                       folio_lock(folio);
+                       err = split_folio(folio);
+                       folio_unlock(folio);
+                       folio_put(folio);
                        if (!err)
-                               goto regular_page;
+                               goto regular_folio;
                        return 0;
                }
 
-               if (pmd_young(orig_pmd)) {
+               if (!pageout && pmd_young(orig_pmd)) {
                        pmdp_invalidate(vma, addr, pmd);
                        orig_pmd = pmd_mkold(orig_pmd);
 
@@ -385,34 +394,47 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                }
 
-               ClearPageReferenced(page);
-               test_and_clear_page_young(page);
+               folio_clear_referenced(folio);
+               folio_test_clear_young(folio);
+               if (folio_test_active(folio))
+                       folio_set_workingset(folio);
                if (pageout) {
-                       if (!isolate_lru_page(page)) {
-                               if (PageUnevictable(page))
-                                       putback_lru_page(page);
+                       if (folio_isolate_lru(folio)) {
+                               if (folio_test_unevictable(folio))
+                                       folio_putback_lru(folio);
                                else
-                                       list_add(&page->lru, &page_list);
+                                       list_add(&folio->lru, &folio_list);
                        }
                } else
-                       deactivate_page(page);
+                       folio_deactivate(folio);
 huge_unlock:
                spin_unlock(ptl);
                if (pageout)
-                       reclaim_pages(&page_list);
+                       reclaim_pages(&folio_list, true);
                return 0;
        }
 
-regular_page:
-       if (pmd_trans_unstable(pmd))
-               return 0;
+regular_folio:
 #endif
        tlb_change_page_size(tlb, PAGE_SIZE);
-       orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+restart:
+       start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+       if (!start_pte)
+               return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr < end; pte++, addr += PAGE_SIZE) {
-               ptent = *pte;
+               ptent = ptep_get(pte);
+
+               if (++batch_count == SWAP_CLUSTER_MAX) {
+                       batch_count = 0;
+                       if (need_resched()) {
+                               arch_leave_lazy_mmu_mode();
+                               pte_unmap_unlock(start_pte, ptl);
+                               cond_resched();
+                               goto restart;
+                       }
+               }
 
                if (pte_none(ptent))
                        continue;
@@ -420,44 +442,55 @@ regular_page:
                if (!pte_present(ptent))
                        continue;
 
-               page = vm_normal_page(vma, addr, ptent);
-               if (!page || is_zone_device_page(page))
+               folio = vm_normal_folio(vma, addr, ptent);
+               if (!folio || folio_is_zone_device(folio))
                        continue;
 
                /*
                 * Creating a THP page is expensive so split it only if we
                 * are sure it's worth. Split it if we are only owner.
                 */
-               if (PageTransCompound(page)) {
-                       if (page_mapcount(page) != 1)
+               if (folio_test_large(folio)) {
+                       int err;
+
+                       if (folio_estimated_sharers(folio) > 1)
                                break;
-                       get_page(page);
-                       if (!trylock_page(page)) {
-                               put_page(page);
+                       if (pageout_anon_only_filter && !folio_test_anon(folio))
                                break;
-                       }
-                       pte_unmap_unlock(orig_pte, ptl);
-                       if (split_huge_page(page)) {
-                               unlock_page(page);
-                               put_page(page);
-                               orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       if (!folio_trylock(folio))
                                break;
-                       }
-                       unlock_page(page);
-                       put_page(page);
-                       orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       folio_get(folio);
+                       arch_leave_lazy_mmu_mode();
+                       pte_unmap_unlock(start_pte, ptl);
+                       start_pte = NULL;
+                       err = split_folio(folio);
+                       folio_unlock(folio);
+                       folio_put(folio);
+                       if (err)
+                               break;
+                       start_pte = pte =
+                               pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       if (!start_pte)
+                               break;
+                       arch_enter_lazy_mmu_mode();
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }
 
-               /* Do not interfere with other mappings of this page */
-               if (page_mapcount(page) != 1)
+               /*
+                * Do not interfere with other mappings of this folio and
+                * non-LRU folio.
+                */
+               if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+                       continue;
+
+               if (pageout_anon_only_filter && !folio_test_anon(folio))
                        continue;
 
-               VM_BUG_ON_PAGE(PageTransCompound(page), page);
+               VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
 
-               if (pte_young(ptent)) {
+               if (!pageout && pte_young(ptent)) {
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        ptent = pte_mkold(ptent);
@@ -466,28 +499,32 @@ regular_page:
                }
 
                /*
-                * We are deactivating a page for accelerating reclaiming.
-                * VM couldn't reclaim the page unless we clear PG_young.
+                * We are deactivating a folio for accelerating reclaiming.
+                * VM couldn't reclaim the folio unless we clear PG_young.
                 * As a side effect, it makes confuse idle-page tracking
                 * because they will miss recent referenced history.
                 */
-               ClearPageReferenced(page);
-               test_and_clear_page_young(page);
+               folio_clear_referenced(folio);
+               folio_test_clear_young(folio);
+               if (folio_test_active(folio))
+                       folio_set_workingset(folio);
                if (pageout) {
-                       if (!isolate_lru_page(page)) {
-                               if (PageUnevictable(page))
-                                       putback_lru_page(page);
+                       if (folio_isolate_lru(folio)) {
+                               if (folio_test_unevictable(folio))
+                                       folio_putback_lru(folio);
                                else
-                                       list_add(&page->lru, &page_list);
+                                       list_add(&folio->lru, &folio_list);
                        }
                } else
-                       deactivate_page(page);
+                       folio_deactivate(folio);
        }
 
-       arch_leave_lazy_mmu_mode();
-       pte_unmap_unlock(orig_pte, ptl);
+       if (start_pte) {
+               arch_leave_lazy_mmu_mode();
+               pte_unmap_unlock(start_pte, ptl);
+       }
        if (pageout)
-               reclaim_pages(&page_list);
+               reclaim_pages(&folio_list, true);
        cond_resched();
 
        return 0;
@@ -495,6 +532,7 @@ regular_page:
 
 static const struct mm_walk_ops cold_walk_ops = {
        .pmd_entry = madvise_cold_or_pageout_pte_range,
+       .walk_lock = PGWALK_RDLOCK,
 };
 
 static void madvise_cold_page_range(struct mmu_gather *tlb,
@@ -549,23 +587,6 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
        tlb_end_vma(tlb, vma);
 }
 
-static inline bool can_do_pageout(struct vm_area_struct *vma)
-{
-       if (vma_is_anonymous(vma))
-               return true;
-       if (!vma->vm_file)
-               return false;
-       /*
-        * paging out pagecache only for non-anonymous mappings that correspond
-        * to the files the calling process could (if tried) open for writing;
-        * otherwise we'd be including shared non-exclusive mappings, which
-        * opens a side channel.
-        */
-       return inode_owner_or_capable(&init_user_ns,
-                                     file_inode(vma->vm_file)) ||
-              file_permission(vma->vm_file, MAY_WRITE) == 0;
-}
-
 static long madvise_pageout(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
@@ -577,7 +598,14 @@ static long madvise_pageout(struct vm_area_struct *vma,
        if (!can_madv_lru_vma(vma))
                return -EINVAL;
 
-       if (!can_do_pageout(vma))
+       /*
+        * If the VMA belongs to a private file mapping, there can be private
+        * dirty pages which can be paged out if even this process is neither
+        * owner nor write capable of the file. We allow private file mappings
+        * further to pageout dirty anon pages.
+        */
+       if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
+                               (vma->vm_flags & VM_MAYSHARE)))
                return 0;
 
        lru_add_drain();
@@ -596,25 +624,24 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
-       pte_t *orig_pte, *pte, ptent;
-       struct page *page;
+       pte_t *start_pte, *pte, ptent;
+       struct folio *folio;
        int nr_swap = 0;
        unsigned long next;
 
        next = pmd_addr_end(addr, end);
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
-                       goto next;
-
-       if (pmd_trans_unstable(pmd))
-               return 0;
+                       return 0;
 
        tlb_change_page_size(tlb, PAGE_SIZE);
-       orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       if (!start_pte)
+               return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr != end; pte++, addr += PAGE_SIZE) {
-               ptent = *pte;
+               ptent = ptep_get(pte);
 
                if (pte_none(ptent))
                        continue;
@@ -632,65 +659,67 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                free_swap_and_cache(entry);
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        } else if (is_hwpoison_entry(entry) ||
-                                  is_swapin_error_entry(entry)) {
+                                  is_poisoned_swp_entry(entry)) {
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        }
                        continue;
                }
 
-               page = vm_normal_page(vma, addr, ptent);
-               if (!page || is_zone_device_page(page))
+               folio = vm_normal_folio(vma, addr, ptent);
+               if (!folio || folio_is_zone_device(folio))
                        continue;
 
                /*
-                * If pmd isn't transhuge but the page is THP and
+                * If pmd isn't transhuge but the folio is large and
                 * is owned by only this process, split it and
                 * deactivate all pages.
                 */
-               if (PageTransCompound(page)) {
-                       if (page_mapcount(page) != 1)
-                               goto out;
-                       get_page(page);
-                       if (!trylock_page(page)) {
-                               put_page(page);
-                               goto out;
-                       }
-                       pte_unmap_unlock(orig_pte, ptl);
-                       if (split_huge_page(page)) {
-                               unlock_page(page);
-                               put_page(page);
-                               orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-                               goto out;
-                       }
-                       unlock_page(page);
-                       put_page(page);
-                       orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+               if (folio_test_large(folio)) {
+                       int err;
+
+                       if (folio_estimated_sharers(folio) != 1)
+                               break;
+                       if (!folio_trylock(folio))
+                               break;
+                       folio_get(folio);
+                       arch_leave_lazy_mmu_mode();
+                       pte_unmap_unlock(start_pte, ptl);
+                       start_pte = NULL;
+                       err = split_folio(folio);
+                       folio_unlock(folio);
+                       folio_put(folio);
+                       if (err)
+                               break;
+                       start_pte = pte =
+                               pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       if (!start_pte)
+                               break;
+                       arch_enter_lazy_mmu_mode();
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }
 
-               VM_BUG_ON_PAGE(PageTransCompound(page), page);
-
-               if (PageSwapCache(page) || PageDirty(page)) {
-                       if (!trylock_page(page))
+               if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
+                       if (!folio_trylock(folio))
                                continue;
                        /*
-                        * If page is shared with others, we couldn't clear
-                        * PG_dirty of the page.
+                        * If folio is shared with others, we mustn't clear
+                        * the folio's dirty flag.
                         */
-                       if (page_mapcount(page) != 1) {
-                               unlock_page(page);
+                       if (folio_mapcount(folio) != 1) {
+                               folio_unlock(folio);
                                continue;
                        }
 
-                       if (PageSwapCache(page) && !try_to_free_swap(page)) {
-                               unlock_page(page);
+                       if (folio_test_swapcache(folio) &&
+                           !folio_free_swap(folio)) {
+                               folio_unlock(folio);
                                continue;
                        }
 
-                       ClearPageDirty(page);
-                       unlock_page(page);
+                       folio_clear_dirty(folio);
+                       folio_unlock(folio);
                }
 
                if (pte_young(ptent) || pte_dirty(ptent)) {
@@ -708,24 +737,23 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                        set_pte_at(mm, addr, pte, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }
-               mark_page_lazyfree(page);
+               folio_mark_lazyfree(folio);
        }
-out:
-       if (nr_swap) {
-               if (current->mm == mm)
-                       sync_mm_rss(mm);
 
+       if (nr_swap)
                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+       if (start_pte) {
+               arch_leave_lazy_mmu_mode();
+               pte_unmap_unlock(start_pte, ptl);
        }
-       arch_leave_lazy_mmu_mode();
-       pte_unmap_unlock(orig_pte, ptl);
        cond_resched();
-next:
+
        return 0;
 }
 
 static const struct mm_walk_ops madvise_free_walk_ops = {
        .pmd_entry              = madvise_free_pte_range,
+       .walk_lock              = PGWALK_RDLOCK,
 };
 
 static int madvise_free_single_vma(struct vm_area_struct *vma,
@@ -745,7 +773,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
        range.end = min(vma->vm_end, end_addr);
        if (range.end <= vma->vm_start)
                return -EINVAL;
-       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                range.start, range.end);
 
        lru_add_drain();
@@ -767,8 +795,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
  * Application no longer needs these pages.  If the pages are dirty,
  * it's OK to just throw them away.  The app will be more careful about
  * data it wants to keep.  Be sure to free swap resources too.  The
- * zap_page_range call sets things up for shrink_active_list to actually free
- * these pages later if no one else has touched them in the meantime,
+ * zap_page_range_single call sets things up for shrink_active_list to actually
+ * free these pages later if no one else has touched them in the meantime,
  * although we could add these pages to a global reuse list for
  * shrink_active_list to pick up before reclaiming other pages.
  *
@@ -785,7 +813,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
 {
-       zap_page_range(vma, start, end - start);
+       zap_page_range_single(vma, start, end - start, NULL);
        return 0;
 }
 
@@ -808,7 +836,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
        if (start & ~huge_page_mask(hstate_vma(vma)))
                return false;
 
-       *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+       /*
+        * Madvise callers expect the length to be rounded up to PAGE_SIZE
+        * boundaries, and may be unaware that this VMA uses huge pages.
+        * Avoid unexpected data loss by rounding down the number of
+        * huge pages freed.
+        */
+       *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+
        return true;
 }
 
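
For context, a worked example of the rounding change above (not part of the patch), assuming a 2 MiB default huge page size; together with the new `start == end` early return added to madvise_dontneed_free() earlier in this diff, a sub-hugepage length now becomes a no-op instead of discarding a whole huge page:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t huge = 2UL << 20;	/* assume 2 MiB default huge pages */
	char *buf = mmap(NULL, huge, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");	/* needs reserved huge pages */
		return 1;
	}
	buf[0] = 1;	/* populate the huge page */

	/*
	 * Old behaviour: *end was rounded up to buf + 2 MiB, so the whole
	 * huge page was discarded despite the 4 KiB length.
	 * New behaviour: *end is rounded down to buf, start == end, and
	 * madvise_dontneed_free() returns 0 without touching the data.
	 */
	madvise(buf, 4096, MADV_DONTNEED);

	printf("buf[0] = %d\n", buf[0]);	/* still 1 on new kernels */
	munmap(buf, huge);
	return 0;
}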
@@ -823,25 +858,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
        if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
                return -EINVAL;
 
+       if (start == end)
+               return 0;
+
        if (!userfaultfd_remove(vma, start, end)) {
                *prev = NULL; /* mmap_lock has been dropped, prev is stale */
 
                mmap_read_lock(mm);
-               vma = find_vma(mm, start);
+               vma = vma_lookup(mm, start);
                if (!vma)
                        return -ENOMEM;
-               if (start < vma->vm_start) {
-                       /*
-                        * This "vma" under revalidation is the one
-                        * with the lowest vma->vm_start where start
-                        * is also < vma->vm_end. If start <
-                        * vma->vm_start it means an hole materialized
-                        * in the user address space within the
-                        * virtual range passed to MADV_DONTNEED
-                        * or MADV_FREE.
-                        */
-                       return -ENOMEM;
-               }
                /*
                 * Potential end adjustment for hugetlb vma is OK as
                 * the check below keeps end within vma.
@@ -956,7 +982,7 @@ static long madvise_remove(struct vm_area_struct *vma,
                        return -EINVAL;
        }
 
-       if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
+       if (!vma_is_shared_maywrite(vma))
                return -EACCES;
 
        offset = (loff_t)(start - vma->vm_start)
@@ -1057,6 +1083,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
                if (error)
                        goto out;
                break;
+       case MADV_COLLAPSE:
+               return madvise_collapse(vma, prev, start, end);
        }
 
        anon_name = anon_vma_name(vma);
@@ -1150,6 +1178,7 @@ madvise_behavior_valid(int behavior)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
+       case MADV_COLLAPSE:
 #endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
@@ -1166,13 +1195,13 @@ madvise_behavior_valid(int behavior)
        }
 }
 
-static bool
-process_madvise_behavior_valid(int behavior)
+static bool process_madvise_behavior_valid(int behavior)
 {
        switch (behavior) {
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_WILLNEED:
+       case MADV_COLLAPSE:
                return true;
        default:
                return false;
@@ -1238,7 +1267,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
                if (start >= end)
                        break;
                if (prev)
-                       vma = prev->vm_next;
+                       vma = find_vma(mm, prev->vm_end);
                else    /* madvise_remove dropped mmap_lock */
                        vma = find_vma(mm, start);
        }
@@ -1255,7 +1284,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
        int error;
 
        /* Only anonymous mappings can be named */
-       if (vma->vm_file)
+       if (vma->vm_file && !vma_is_anon_shmem(vma))
                return -EBADF;
 
        error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
@@ -1339,6 +1368,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
  *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
  *             transparent huge pages so the existing pages will not be
  *             coalesced into THP and new pages will not be allocated as THP.
+ *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
  *             from being included in its core dump.
  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
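
For context, a minimal userspace sketch of the hint documented above (not part of the patch). It assumes the installed libc headers define MADV_COLLAPSE (falling back to the uapi value otherwise) and that the running kernel was built with CONFIG_TRANSPARENT_HUGEPAGE:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* value from uapi/asm-generic/mman-common.h */
#endif

int main(void)
{
	size_t len = 2UL << 20;	/* one PMD-sized region on x86-64 */
	void *buf = aligned_alloc(len, len);

	if (!buf)
		return 1;
	memset(buf, 0, len);	/* fault the pages in first */

	/* Ask the kernel to synchronously collapse the range into a THP. */
	if (madvise(buf, len, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");

	free(buf);
	return 0;
}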
@@ -1373,8 +1403,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
        size_t len;
        struct blk_plug plug;
 
-       start = untagged_addr(start);
-
        if (!madvise_behavior_valid(behavior))
                return -EINVAL;
 
@@ -1406,6 +1434,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
                mmap_read_lock(mm);
        }
 
+       start = untagged_addr_remote(mm, start);
+       end = start + len;
+
        blk_start_plug(&plug);
        error = madvise_walk_vmas(mm, start, end, behavior,
                        madvise_vma_behavior);
@@ -1427,7 +1458,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                size_t, vlen, int, behavior, unsigned int, flags)
 {
        ssize_t ret;
-       struct iovec iovstack[UIO_FASTIOV], iovec;
+       struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct task_struct *task;
@@ -1440,7 +1471,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                goto out;
        }
 
-       ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+       ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
        if (ret < 0)
                goto out;
 
@@ -1474,12 +1505,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
        total_len = iov_iter_count(&iter);
 
        while (iov_iter_count(&iter)) {
-               iovec = iov_iter_iovec(&iter);
-               ret = do_madvise(mm, (unsigned long)iovec.iov_base,
-                                       iovec.iov_len, behavior);
+               ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
+                                       iter_iov_len(&iter), behavior);
                if (ret < 0)
                        break;
-               iov_iter_advance(&iter, iovec.iov_len);
+               iov_iter_advance(&iter, iter_iov_len(&iter));
        }
 
        ret = (total_len - iov_iter_count(&iter)) ? : ret;
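
For context, a rough userspace counterpart to the iovec loop above (not part of the patch). It assumes kernel headers that define SYS_pidfd_open and SYS_process_madvise (Linux 5.10+) and a libc exposing MADV_PAGEOUT; the caller also needs CAP_SYS_NICE plus ptrace-read access to the target:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

/* Hint another process to page out the ranges described by vec[]. */
static int pageout_remote(pid_t pid, struct iovec *vec, size_t vlen)
{
	int pidfd = syscall(SYS_pidfd_open, pid, 0);
	long ret;

	if (pidfd < 0) {
		perror("pidfd_open");
		return -1;
	}

	/* flags must currently be 0; the advice applies to every iovec. */
	ret = syscall(SYS_process_madvise, pidfd, vec, vlen, MADV_PAGEOUT, 0);
	if (ret < 0)
		perror("process_madvise");
	else
		printf("advised %ld bytes\n", ret);

	close(pidfd);
	return ret < 0 ? -1 : 0;
}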