diff --git a/mm/madvise.c b/mm/madvise.c
index 5f0f094..44a498c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior)
        case MADV_FREE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
+       case MADV_COLLAPSE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
@@ -94,9 +95,6 @@ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
 {
        mmap_assert_locked(vma->vm_mm);
 
-       if (vma->vm_file)
-               return NULL;
-
        return vma->anon_name;
 }
 
@@ -132,7 +130,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
 #endif /* CONFIG_ANON_VMA_NAME */
 /*
  * Update the vm_flags on region of a vma, splitting it or merging it as
- * necessary.  Must be called with mmap_sem held for writing;
+ * necessary.  Must be called with mmap_lock held for writing;
  * Caller should ensure anon_name stability by raising its refcount even when
  * anon_name belongs to a valid vma because this function might free that vma.
  */
@@ -143,46 +141,24 @@ static int madvise_update_vma(struct vm_area_struct *vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        int error;
-       pgoff_t pgoff;
+       VMA_ITERATOR(vmi, mm, start);
 
        if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
                *prev = vma;
                return 0;
        }
 
-       pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
-       *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
-                         vma->vm_file, pgoff, vma_policy(vma),
-                         vma->vm_userfaultfd_ctx, anon_name);
-       if (*prev) {
-               vma = *prev;
-               goto success;
-       }
+       vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
+                                   anon_name);
+       if (IS_ERR(vma))
+               return PTR_ERR(vma);
 
        *prev = vma;
 
-       if (start != vma->vm_start) {
-               if (unlikely(mm->map_count >= sysctl_max_map_count))
-                       return -ENOMEM;
-               error = __split_vma(mm, vma, start, 1);
-               if (error)
-                       return error;
-       }
-
-       if (end != vma->vm_end) {
-               if (unlikely(mm->map_count >= sysctl_max_map_count))
-                       return -ENOMEM;
-               error = __split_vma(mm, vma, end, 0);
-               if (error)
-                       return error;
-       }
-
-success:
-       /*
-        * vm_flags is protected by the mmap_lock held in write mode.
-        */
-       vma->vm_flags = new_flags;
-       if (!vma->vm_file) {
+       /* vm_flags is protected by the mmap_lock held in write mode. */
+       vma_start_write(vma);
+       vm_flags_reset(vma, new_flags);
+       if (!vma->vm_file || vma_is_anon_shmem(vma)) {
                error = replace_anon_vma_name(vma, anon_name);
                if (error)
                        return error;
@@ -193,79 +169,89 @@ success:
 
 #ifdef CONFIG_SWAP
 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
-       unsigned long end, struct mm_walk *walk)
+               unsigned long end, struct mm_walk *walk)
 {
        struct vm_area_struct *vma = walk->private;
-       unsigned long index;
        struct swap_iocb *splug = NULL;
+       pte_t *ptep = NULL;
+       spinlock_t *ptl;
+       unsigned long addr;
 
-       if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-               return 0;
-
-       for (index = start; index != end; index += PAGE_SIZE) {
+       for (addr = start; addr < end; addr += PAGE_SIZE) {
                pte_t pte;
                swp_entry_t entry;
-               struct page *page;
-               spinlock_t *ptl;
-               pte_t *ptep;
+               struct folio *folio;
 
-               ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
-               pte = *ptep;
-               pte_unmap_unlock(ptep, ptl);
+               if (!ptep++) {
+                       ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+                       if (!ptep)
+                               break;
+               }
 
+               pte = ptep_get(ptep);
                if (!is_swap_pte(pte))
                        continue;
                entry = pte_to_swp_entry(pte);
                if (unlikely(non_swap_entry(entry)))
                        continue;
 
-               page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
-                                            vma, index, false, &splug);
-               if (page)
-                       put_page(page);
+               pte_unmap_unlock(ptep, ptl);
+               ptep = NULL;
+
+               folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+                                            vma, addr, &splug);
+               if (folio)
+                       folio_put(folio);
        }
+
+       if (ptep)
+               pte_unmap_unlock(ptep, ptl);
        swap_read_unplug(splug);
+       cond_resched();
 
        return 0;
 }
 
 static const struct mm_walk_ops swapin_walk_ops = {
        .pmd_entry              = swapin_walk_pmd_entry,
+       .walk_lock              = PGWALK_RDLOCK,
 };
 
-static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+static void shmem_swapin_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct address_space *mapping)
 {
        XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
-       pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
-       struct page *page;
+       pgoff_t end_index = linear_page_index(vma, end) - 1;
+       struct folio *folio;
        struct swap_iocb *splug = NULL;
 
        rcu_read_lock();
-       xas_for_each(&xas, page, end_index) {
-               swp_entry_t swap;
+       xas_for_each(&xas, folio, end_index) {
+               unsigned long addr;
+               swp_entry_t entry;
 
-               if (!xa_is_value(page))
+               if (!xa_is_value(folio))
                        continue;
-               swap = radix_to_swp_entry(page);
+               entry = radix_to_swp_entry(folio);
                /* There might be swapin error entries in shmem mapping. */
-               if (non_swap_entry(swap))
+               if (non_swap_entry(entry))
                        continue;
+
+               addr = vma->vm_start +
+                       ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
                xas_pause(&xas);
                rcu_read_unlock();
 
-               page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
-                                            NULL, 0, false, &splug);
-               if (page)
-                       put_page(page);
+               folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
+                                            vma, addr, &splug);
+               if (folio)
+                       folio_put(folio);
 
                rcu_read_lock();
        }
        rcu_read_unlock();
        swap_read_unplug(splug);
-
-       lru_add_drain();        /* Push any new pages onto the LRU now */
 }
 #endif         /* CONFIG_SWAP */
 
@@ -289,8 +275,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
        }
 
        if (shmem_mapping(file->f_mapping)) {
-               force_shm_swapin_readahead(vma, start, end,
-                                       file->f_mapping);
+               shmem_swapin_range(vma, start, end, file->f_mapping);
+               lru_add_drain(); /* Push any new pages onto the LRU now */
                return 0;
        }
 #else
@@ -320,6 +306,21 @@ static long madvise_willneed(struct vm_area_struct *vma,
        return 0;
 }
 
+static inline bool can_do_file_pageout(struct vm_area_struct *vma)
+{
+       if (!vma->vm_file)
+               return false;
+       /*
+        * paging out pagecache only for non-anonymous mappings that correspond
+        * to the files the calling process could (if tried) open for writing;
+        * otherwise we'd be including shared non-exclusive mappings, which
+        * opens a side channel.
+        */
+       return inode_owner_or_capable(&nop_mnt_idmap,
+                                     file_inode(vma->vm_file)) ||
+              file_permission(vma->vm_file, MAY_WRITE) == 0;
+}
+
 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
@@ -329,14 +330,19 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
        bool pageout = private->pageout;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
-       pte_t *orig_pte, *pte, ptent;
+       pte_t *start_pte, *pte, ptent;
        spinlock_t *ptl;
-       struct page *page = NULL;
-       LIST_HEAD(page_list);
+       struct folio *folio = NULL;
+       LIST_HEAD(folio_list);
+       bool pageout_anon_only_filter;
+       unsigned int batch_count = 0;
 
        if (fatal_signal_pending(current))
                return -EINTR;
 
+       pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
+                                       !can_do_file_pageout(vma);
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(*pmd)) {
                pmd_t orig_pmd;
@@ -357,27 +363,30 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                        goto huge_unlock;
                }
 
-               page = pmd_page(orig_pmd);
+               folio = pfn_folio(pmd_pfn(orig_pmd));
 
-               /* Do not interfere with other mappings of this page */
-               if (page_mapcount(page) != 1)
+               /* Do not interfere with other mappings of this folio */
+               if (folio_estimated_sharers(folio) != 1)
+                       goto huge_unlock;
+
+               if (pageout_anon_only_filter && !folio_test_anon(folio))
                        goto huge_unlock;
 
                if (next - addr != HPAGE_PMD_SIZE) {
                        int err;
 
-                       get_page(page);
+                       folio_get(folio);
                        spin_unlock(ptl);
-                       lock_page(page);
-                       err = split_huge_page(page);
-                       unlock_page(page);
-                       put_page(page);
+                       folio_lock(folio);
+                       err = split_folio(folio);
+                       folio_unlock(folio);
+                       folio_put(folio);
                        if (!err)
-                               goto regular_page;
+                               goto regular_folio;
                        return 0;
                }
 
-               if (pmd_young(orig_pmd)) {
+               if (!pageout && pmd_young(orig_pmd)) {
                        pmdp_invalidate(vma, addr, pmd);
                        orig_pmd = pmd_mkold(orig_pmd);
 
@@ -385,34 +394,47 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                }
 
-               ClearPageReferenced(page);
-               test_and_clear_page_young(page);
+               folio_clear_referenced(folio);
+               folio_test_clear_young(folio);
+               if (folio_test_active(folio))
+                       folio_set_workingset(folio);
                if (pageout) {
-                       if (!isolate_lru_page(page)) {
-                               if (PageUnevictable(page))
-                                       putback_lru_page(page);
+                       if (folio_isolate_lru(folio)) {
+                               if (folio_test_unevictable(folio))
+                                       folio_putback_lru(folio);
                                else
-                                       list_add(&page->lru, &page_list);
+                                       list_add(&folio->lru, &folio_list);
                        }
                } else
-                       deactivate_page(page);
+                       folio_deactivate(folio);
 huge_unlock:
                spin_unlock(ptl);
                if (pageout)
-                       reclaim_pages(&page_list);
+                       reclaim_pages(&folio_list, true);
                return 0;
        }
 
-regular_page:
-       if (pmd_trans_unstable(pmd))
-               return 0;
+regular_folio:
 #endif
        tlb_change_page_size(tlb, PAGE_SIZE);
-       orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+restart:
+       start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+       if (!start_pte)
+               return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr < end; pte++, addr += PAGE_SIZE) {
-               ptent = *pte;
+               ptent = ptep_get(pte);
+
+               if (++batch_count == SWAP_CLUSTER_MAX) {
+                       batch_count = 0;
+                       if (need_resched()) {
+                               arch_leave_lazy_mmu_mode();
+                               pte_unmap_unlock(start_pte, ptl);
+                               cond_resched();
+                               goto restart;
+                       }
+               }
 
                if (pte_none(ptent))
                        continue;
@@ -420,44 +442,55 @@ regular_page:
                if (!pte_present(ptent))
                        continue;
 
-               page = vm_normal_page(vma, addr, ptent);
-               if (!page || is_zone_device_page(page))
+               folio = vm_normal_folio(vma, addr, ptent);
+               if (!folio || folio_is_zone_device(folio))
                        continue;
 
                /*
                 * Creating a THP page is expensive so split it only if we
                 * are sure it's worth. Split it if we are only owner.
                 */
-               if (PageTransCompound(page)) {
-                       if (page_mapcount(page) != 1)
+               if (folio_test_large(folio)) {
+                       int err;
+
+                       if (folio_estimated_sharers(folio) > 1)
                                break;
-                       get_page(page);
-                       if (!trylock_page(page)) {
-                               put_page(page);
+                       if (pageout_anon_only_filter && !folio_test_anon(folio))
                                break;
-                       }
-                       pte_unmap_unlock(orig_pte, ptl);
-                       if (split_huge_page(page)) {
-                               unlock_page(page);
-                               put_page(page);
-                               orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       if (!folio_trylock(folio))
                                break;
-                       }
-                       unlock_page(page);
-                       put_page(page);
-                       orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       folio_get(folio);
+                       arch_leave_lazy_mmu_mode();
+                       pte_unmap_unlock(start_pte, ptl);
+                       start_pte = NULL;
+                       err = split_folio(folio);
+                       folio_unlock(folio);
+                       folio_put(folio);
+                       if (err)
+                               break;
+                       start_pte = pte =
+                               pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       if (!start_pte)
+                               break;
+                       arch_enter_lazy_mmu_mode();
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }
 
-               /* Do not interfere with other mappings of this page */
-               if (page_mapcount(page) != 1)
+               /*
+                * Do not interfere with other mappings of this folio and
+                * non-LRU folio.
+                */
+               if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+                       continue;
+
+               if (pageout_anon_only_filter && !folio_test_anon(folio))
                        continue;
 
-               VM_BUG_ON_PAGE(PageTransCompound(page), page);
+               VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
 
-               if (pte_young(ptent)) {
+               if (!pageout && pte_young(ptent)) {
                        ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                        tlb->fullmm);
                        ptent = pte_mkold(ptent);
@@ -466,28 +499,32 @@ regular_page:
                }
 
                /*
-                * We are deactivating a page for accelerating reclaiming.
-                * VM couldn't reclaim the page unless we clear PG_young.
+                * We are deactivating a folio for accelerating reclaiming.
+                * VM couldn't reclaim the folio unless we clear PG_young.
                 * As a side effect, it makes confuse idle-page tracking
                 * because they will miss recent referenced history.
                 */
-               ClearPageReferenced(page);
-               test_and_clear_page_young(page);
+               folio_clear_referenced(folio);
+               folio_test_clear_young(folio);
+               if (folio_test_active(folio))
+                       folio_set_workingset(folio);
                if (pageout) {
-                       if (!isolate_lru_page(page)) {
-                               if (PageUnevictable(page))
-                                       putback_lru_page(page);
+                       if (folio_isolate_lru(folio)) {
+                               if (folio_test_unevictable(folio))
+                                       folio_putback_lru(folio);
                                else
-                                       list_add(&page->lru, &page_list);
+                                       list_add(&folio->lru, &folio_list);
                        }
                } else
-                       deactivate_page(page);
+                       folio_deactivate(folio);
        }
 
-       arch_leave_lazy_mmu_mode();
-       pte_unmap_unlock(orig_pte, ptl);
+       if (start_pte) {
+               arch_leave_lazy_mmu_mode();
+               pte_unmap_unlock(start_pte, ptl);
+       }
        if (pageout)
-               reclaim_pages(&page_list);
+               reclaim_pages(&folio_list, true);
        cond_resched();
 
        return 0;
@@ -495,6 +532,7 @@ regular_page:
 
 static const struct mm_walk_ops cold_walk_ops = {
        .pmd_entry = madvise_cold_or_pageout_pte_range,
+       .walk_lock = PGWALK_RDLOCK,
 };
 
 static void madvise_cold_page_range(struct mmu_gather *tlb,
@@ -549,23 +587,6 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
        tlb_end_vma(tlb, vma);
 }
 
-static inline bool can_do_pageout(struct vm_area_struct *vma)
-{
-       if (vma_is_anonymous(vma))
-               return true;
-       if (!vma->vm_file)
-               return false;
-       /*
-        * paging out pagecache only for non-anonymous mappings that correspond
-        * to the files the calling process could (if tried) open for writing;
-        * otherwise we'd be including shared non-exclusive mappings, which
-        * opens a side channel.
-        */
-       return inode_owner_or_capable(&init_user_ns,
-                                     file_inode(vma->vm_file)) ||
-              file_permission(vma->vm_file, MAY_WRITE) == 0;
-}
-
 static long madvise_pageout(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
@@ -577,7 +598,14 @@ static long madvise_pageout(struct vm_area_struct *vma,
        if (!can_madv_lru_vma(vma))
                return -EINVAL;
 
-       if (!can_do_pageout(vma))
+       /*
+        * If the VMA belongs to a private file mapping, there can be private
+        * dirty pages which can be paged out if even this process is neither
+        * owner nor write capable of the file. We allow private file mappings
+        * further to pageout dirty anon pages.
+        */
+       if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
+                               (vma->vm_flags & VM_MAYSHARE)))
                return 0;
 
        lru_add_drain();
@@ -596,25 +624,24 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
-       pte_t *orig_pte, *pte, ptent;
-       struct page *page;
+       pte_t *start_pte, *pte, ptent;
+       struct folio *folio;
        int nr_swap = 0;
        unsigned long next;
 
        next = pmd_addr_end(addr, end);
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
-                       goto next;
-
-       if (pmd_trans_unstable(pmd))
-               return 0;
+                       return 0;
 
        tlb_change_page_size(tlb, PAGE_SIZE);
-       orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       if (!start_pte)
+               return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr != end; pte++, addr += PAGE_SIZE) {
-               ptent = *pte;
+               ptent = ptep_get(pte);
 
                if (pte_none(ptent))
                        continue;
@@ -632,65 +659,67 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                free_swap_and_cache(entry);
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        } else if (is_hwpoison_entry(entry) ||
-                                  is_swapin_error_entry(entry)) {
+                                  is_poisoned_swp_entry(entry)) {
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        }
                        continue;
                }
 
-               page = vm_normal_page(vma, addr, ptent);
-               if (!page || is_zone_device_page(page))
+               folio = vm_normal_folio(vma, addr, ptent);
+               if (!folio || folio_is_zone_device(folio))
                        continue;
 
                /*
-                * If pmd isn't transhuge but the page is THP and
+                * If pmd isn't transhuge but the folio is large and
                 * is owned by only this process, split it and
                 * deactivate all pages.
                 */
-               if (PageTransCompound(page)) {
-                       if (page_mapcount(page) != 1)
-                               goto out;
-                       get_page(page);
-                       if (!trylock_page(page)) {
-                               put_page(page);
-                               goto out;
-                       }
-                       pte_unmap_unlock(orig_pte, ptl);
-                       if (split_huge_page(page)) {
-                               unlock_page(page);
-                               put_page(page);
-                               orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-                               goto out;
-                       }
-                       unlock_page(page);
-                       put_page(page);
-                       orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+               if (folio_test_large(folio)) {
+                       int err;
+
+                       if (folio_estimated_sharers(folio) != 1)
+                               break;
+                       if (!folio_trylock(folio))
+                               break;
+                       folio_get(folio);
+                       arch_leave_lazy_mmu_mode();
+                       pte_unmap_unlock(start_pte, ptl);
+                       start_pte = NULL;
+                       err = split_folio(folio);
+                       folio_unlock(folio);
+                       folio_put(folio);
+                       if (err)
+                               break;
+                       start_pte = pte =
+                               pte_offset_map_lock(mm, pmd, addr, &ptl);
+                       if (!start_pte)
+                               break;
+                       arch_enter_lazy_mmu_mode();
                        pte--;
                        addr -= PAGE_SIZE;
                        continue;
                }
 
-               VM_BUG_ON_PAGE(PageTransCompound(page), page);
-
-               if (PageSwapCache(page) || PageDirty(page)) {
-                       if (!trylock_page(page))
+               if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
+                       if (!folio_trylock(folio))
                                continue;
                        /*
-                        * If page is shared with others, we couldn't clear
-                        * PG_dirty of the page.
+                        * If folio is shared with others, we mustn't clear
+                        * the folio's dirty flag.
                         */
-                       if (page_mapcount(page) != 1) {
-                               unlock_page(page);
+                       if (folio_mapcount(folio) != 1) {
+                               folio_unlock(folio);
                                continue;
                        }
 
-                       if (PageSwapCache(page) && !try_to_free_swap(page)) {
-                               unlock_page(page);
+                       if (folio_test_swapcache(folio) &&
+                           !folio_free_swap(folio)) {
+                               folio_unlock(folio);
                                continue;
                        }
 
-                       ClearPageDirty(page);
-                       unlock_page(page);
+                       folio_clear_dirty(folio);
+                       folio_unlock(folio);
                }
 
                if (pte_young(ptent) || pte_dirty(ptent)) {
@@ -708,24 +737,23 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                        set_pte_at(mm, addr, pte, ptent);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }
-               mark_page_lazyfree(page);
+               folio_mark_lazyfree(folio);
        }
-out:
-       if (nr_swap) {
-               if (current->mm == mm)
-                       sync_mm_rss(mm);
 
+       if (nr_swap)
                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+       if (start_pte) {
+               arch_leave_lazy_mmu_mode();
+               pte_unmap_unlock(start_pte, ptl);
        }
-       arch_leave_lazy_mmu_mode();
-       pte_unmap_unlock(orig_pte, ptl);
        cond_resched();
-next:
+
        return 0;
 }
 
 static const struct mm_walk_ops madvise_free_walk_ops = {
        .pmd_entry              = madvise_free_pte_range,
+       .walk_lock              = PGWALK_RDLOCK,
 };
 
 static int madvise_free_single_vma(struct vm_area_struct *vma,
@@ -745,7 +773,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
        range.end = min(vma->vm_end, end_addr);
        if (range.end <= vma->vm_start)
                return -EINVAL;
-       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                range.start, range.end);
 
        lru_add_drain();
@@ -767,8 +795,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
  * Application no longer needs these pages.  If the pages are dirty,
  * it's OK to just throw them away.  The app will be more careful about
  * data it wants to keep.  Be sure to free swap resources too.  The
- * zap_page_range call sets things up for shrink_active_list to actually free
- * these pages later if no one else has touched them in the meantime,
+ * zap_page_range_single call sets things up for shrink_active_list to actually
+ * free these pages later if no one else has touched them in the meantime,
  * although we could add these pages to a global reuse list for
  * shrink_active_list to pick up before reclaiming other pages.
  *
@@ -785,7 +813,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
 {
-       zap_page_range(vma, start, end - start);
+       zap_page_range_single(vma, start, end - start, NULL);
        return 0;
 }
 
@@ -808,7 +836,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
        if (start & ~huge_page_mask(hstate_vma(vma)))
                return false;
 
-       *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+       /*
+        * Madvise callers expect the length to be rounded up to PAGE_SIZE
+        * boundaries, and may be unaware that this VMA uses huge pages.
+        * Avoid unexpected data loss by rounding down the number of
+        * huge pages freed.
+        */
+       *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+
        return true;
 }
 
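
For context, a worked example of the rounding change above (not part of the patch), assuming a 2 MiB default huge page size; together with the new `start == end` early return added to madvise_dontneed_free() earlier in this diff, a sub-hugepage length now becomes a no-op instead of discarding a whole huge page:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t huge = 2UL << 20;	/* assume 2 MiB default huge pages */
	char *buf = mmap(NULL, huge, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");	/* needs reserved huge pages */
		return 1;
	}
	buf[0] = 1;	/* populate the huge page */

	/*
	 * Old behaviour: *end was rounded up to buf + 2 MiB, so the whole
	 * huge page was discarded despite the 4 KiB length.
	 * New behaviour: *end is rounded down to buf, start == end, and
	 * madvise_dontneed_free() returns 0 without touching the data.
	 */
	madvise(buf, 4096, MADV_DONTNEED);

	printf("buf[0] = %d\n", buf[0]);	/* still 1 on new kernels */
	munmap(buf, huge);
	return 0;
}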
@@ -823,25 +858,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
        if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
                return -EINVAL;
 
+       if (start == end)
+               return 0;
+
        if (!userfaultfd_remove(vma, start, end)) {
                *prev = NULL; /* mmap_lock has been dropped, prev is stale */
 
                mmap_read_lock(mm);
-               vma = find_vma(mm, start);
+               vma = vma_lookup(mm, start);
                if (!vma)
                        return -ENOMEM;
-               if (start < vma->vm_start) {
-                       /*
-                        * This "vma" under revalidation is the one
-                        * with the lowest vma->vm_start where start
-                        * is also < vma->vm_end. If start <
-                        * vma->vm_start it means an hole materialized
-                        * in the user address space within the
-                        * virtual range passed to MADV_DONTNEED
-                        * or MADV_FREE.
-                        */
-                       return -ENOMEM;
-               }
                /*
                 * Potential end adjustment for hugetlb vma is OK as
                 * the check below keeps end within vma.
@@ -956,7 +982,7 @@ static long madvise_remove(struct vm_area_struct *vma,
                        return -EINVAL;
        }
 
-       if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
+       if (!vma_is_shared_maywrite(vma))
                return -EACCES;
 
        offset = (loff_t)(start - vma->vm_start)
@@ -1057,6 +1083,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
                if (error)
                        goto out;
                break;
+       case MADV_COLLAPSE:
+               return madvise_collapse(vma, prev, start, end);
        }
 
        anon_name = anon_vma_name(vma);
@@ -1150,6 +1178,7 @@ madvise_behavior_valid(int behavior)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
+       case MADV_COLLAPSE:
 #endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
@@ -1166,13 +1195,13 @@ madvise_behavior_valid(int behavior)
        }
 }
 
-static bool
-process_madvise_behavior_valid(int behavior)
+static bool process_madvise_behavior_valid(int behavior)
 {
        switch (behavior) {
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_WILLNEED:
+       case MADV_COLLAPSE:
                return true;
        default:
                return false;
@@ -1238,7 +1267,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
                if (start >= end)
                        break;
                if (prev)
-                       vma = prev->vm_next;
+                       vma = find_vma(mm, prev->vm_end);
                else    /* madvise_remove dropped mmap_lock */
                        vma = find_vma(mm, start);
        }
@@ -1255,7 +1284,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
        int error;
 
        /* Only anonymous mappings can be named */
-       if (vma->vm_file)
+       if (vma->vm_file && !vma_is_anon_shmem(vma))
                return -EBADF;
 
        error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
@@ -1339,6 +1368,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
  *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
  *             transparent huge pages so the existing pages will not be
  *             coalesced into THP and new pages will not be allocated as THP.
+ *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
  *             from being included in its core dump.
  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
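
For context, a minimal userspace sketch of the hint documented above (not part of the patch). It assumes the installed libc headers define MADV_COLLAPSE (falling back to the uapi value otherwise) and that the running kernel was built with CONFIG_TRANSPARENT_HUGEPAGE:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* value from uapi/asm-generic/mman-common.h */
#endif

int main(void)
{
	size_t len = 2UL << 20;	/* one PMD-sized region on x86-64 */
	void *buf = aligned_alloc(len, len);

	if (!buf)
		return 1;
	memset(buf, 0, len);	/* fault the pages in first */

	/* Ask the kernel to synchronously collapse the range into a THP. */
	if (madvise(buf, len, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");

	free(buf);
	return 0;
}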
@@ -1373,8 +1403,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
        size_t len;
        struct blk_plug plug;
 
-       start = untagged_addr(start);
-
        if (!madvise_behavior_valid(behavior))
                return -EINVAL;
 
@@ -1406,6 +1434,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
                mmap_read_lock(mm);
        }
 
+       start = untagged_addr_remote(mm, start);
+       end = start + len;
+
        blk_start_plug(&plug);
        error = madvise_walk_vmas(mm, start, end, behavior,
                        madvise_vma_behavior);
@@ -1427,7 +1458,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                size_t, vlen, int, behavior, unsigned int, flags)
 {
        ssize_t ret;
-       struct iovec iovstack[UIO_FASTIOV], iovec;
+       struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct task_struct *task;
@@ -1440,7 +1471,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                goto out;
        }
 
-       ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+       ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
        if (ret < 0)
                goto out;
 
@@ -1474,12 +1505,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
        total_len = iov_iter_count(&iter);
 
        while (iov_iter_count(&iter)) {
-               iovec = iov_iter_iovec(&iter);
-               ret = do_madvise(mm, (unsigned long)iovec.iov_base,
-                                       iovec.iov_len, behavior);
+               ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
+                                       iter_iov_len(&iter), behavior);
                if (ret < 0)
                        break;
-               iov_iter_advance(&iter, iovec.iov_len);
+               iov_iter_advance(&iter, iter_iov_len(&iter));
        }
 
        ret = (total_len - iov_iter_count(&iter)) ? : ret;
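
For context, a rough userspace counterpart to the iovec loop above (not part of the patch). It assumes kernel headers that define SYS_pidfd_open and SYS_process_madvise (Linux 5.10+) and a libc exposing MADV_PAGEOUT; the caller also needs CAP_SYS_NICE plus ptrace-read access to the target:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

/* Hint another process to page out the ranges described by vec[]. */
static int pageout_remote(pid_t pid, struct iovec *vec, size_t vlen)
{
	int pidfd = syscall(SYS_pidfd_open, pid, 0);
	long ret;

	if (pidfd < 0) {
		perror("pidfd_open");
		return -1;
	}

	/* flags must currently be 0; the advice applies to every iovec. */
	ret = syscall(SYS_process_madvise, pidfd, vec, vlen, MADV_PAGEOUT, 0);
	if (ret < 0)
		perror("process_madvise");
	else
		printf("advised %ld bytes\n", ret);

	close(pidfd);
	return ret < 0 ? -1 : 0;
}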