Merge tag 'locking-urgent-2020-12-27' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7b4e31e..257cba7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- *  linux/mm/vmscan.c
- *
  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  *
  *  Swap reorganised 29.12.95, Stephen Tweedie.
@@ -1072,7 +1070,6 @@ static void page_check_dirty_writeback(struct page *page,
 static unsigned int shrink_page_list(struct list_head *page_list,
                                     struct pglist_data *pgdat,
                                     struct scan_control *sc,
-                                    enum ttu_flags ttu_flags,
                                     struct reclaim_stat *stat,
                                     bool ignore_references)
 {
@@ -1297,7 +1294,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page)) {
-                       enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+                       enum ttu_flags flags = TTU_BATCH_FLUSH;
                        bool was_swapbacked = PageSwapBacked(page);
 
                        if (unlikely(PageTransHuge(page)))
@@ -1372,6 +1369,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                                if (PageDirty(page) || PageWriteback(page))
                                        goto keep_locked;
                                mapping = page_mapping(page);
+                               fallthrough;
                        case PAGE_CLEAN:
                                ; /* try to free the page below */
                        }
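
The new fallthrough; statement is the pseudo-keyword from <linux/compiler_attributes.h>: it documents that the PAGE_DIRTY case is meant to continue into PAGE_CLEAN and keeps -Wimplicit-fallthrough quiet. A trivial, self-contained illustration of the annotation (count_item() and its arguments are made up, not from vmscan.c):

	static int count_item(int kind, int *primary, int *total)
	{
		switch (kind) {
		case 1:
			(*primary)++;
			fallthrough;	/* kind 1 is counted in the total as well */
		case 2:
			(*total)++;
			break;
		default:
			return -EINVAL;
		}
		return 0;
	}
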
@@ -1393,7 +1391,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                 *
                 * Rarely, pages can have buffers and no ->mapping.  These are
                 * the pages which were not successfully invalidated in
-                * truncate_complete_page().  We try to drop those buffers here
+                * truncate_cleanup_page().  We try to drop those buffers here
                 * and if that worked, and the page is no longer mapped into
                 * process address space (page_count == 1) it can be freed.
                 * Otherwise, leave the page on the LRU so it is swappable.
@@ -1514,7 +1512,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
        }
 
        nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
-                       TTU_IGNORE_ACCESS, &stat, true);
+                                       &stat, true);
        list_splice(&clean_pages, page_list);
        mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
                            -(long)nr_reclaimed);
@@ -1541,9 +1539,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, isolate_mode_t mode)
+int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
 {
-       int ret = -EINVAL;
+       int ret = -EBUSY;
 
        /* Only take pages on the LRU. */
        if (!PageLRU(page))
@@ -1553,8 +1551,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
        if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
                return ret;
 
-       ret = -EBUSY;
-
        /*
         * To minimise LRU disruption, the caller can indicate that it only
         * wants to isolate pages it will be able to operate on without
@@ -1595,20 +1591,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
        if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
                return ret;
 
-       if (likely(get_page_unless_zero(page))) {
-               /*
-                * Be careful not to clear PageLRU until after we're
-                * sure the page is not being freed elsewhere -- the
-                * page release code relies on it.
-                */
-               ClearPageLRU(page);
-               ret = 0;
-       }
-
-       return ret;
+       return 0;
 }
 
-
 /*
  * Update LRU sizes after isolating pages. The LRU size updates must
  * be complete before mem_cgroup_update_lru_size due to a sanity check.
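
With the get_page_unless_zero()/ClearPageLRU() block gone, __isolate_lru_page_prepare() only answers "is this page eligible for this isolation mode?"; taking the reference and clearing the LRU flag are now the caller's job, done while still holding lruvec->lru_lock. A minimal sketch of that pairing under those assumptions (try_isolate_one() is a made-up name; isolate_lru_pages() further down open-codes the same steps):

	/* Hypothetical helper for illustration; caller holds lruvec->lru_lock. */
	static bool try_isolate_one(struct page *page, isolate_mode_t mode,
				    struct list_head *dst)
	{
		if (__isolate_lru_page_prepare(page, mode) != 0)
			return false;		/* not eligible for this mode */

		/* Don't clear PageLRU until the page is known not to be freed. */
		if (unlikely(!get_page_unless_zero(page)))
			return false;

		if (!TestClearPageLRU(page)) {
			/* Lost the race with another isolation path. */
			put_page(page);
			return false;
		}

		list_move(&page->lru, dst);
		return true;
	}
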
@@ -1628,14 +1613,16 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
 }
 
 /**
- * pgdat->lru_lock is heavily contended.  Some of the functions that
+ * Isolate pages from the lruvec into the @dst list, scanning up to nr_to_scan pages.
+ *
+ * lruvec->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
  * and working on them outside the LRU lock.
  *
  * For pagecache intensive workloads, this function is the hottest
  * spot in the kernel (apart from copy_*_user functions).
  *
- * Appropriate locks must be held before calling this function.
+ * lruvec->lru_lock must be held before calling this function.
  *
  * @nr_to_scan:        The number of eligible pages to look through on the list.
  * @lruvec:    The LRU vector to pull pages from.
@@ -1668,8 +1655,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                page = lru_to_page(src);
                prefetchw_prev_lru_page(page, src, flags);
 
-               VM_BUG_ON_PAGE(!PageLRU(page), page);
-
                nr_pages = compound_nr(page);
                total_scan += nr_pages;
 
@@ -1690,20 +1675,34 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 * only when the page is being freed somewhere else.
                 */
                scan += nr_pages;
-               switch (__isolate_lru_page(page, mode)) {
+               switch (__isolate_lru_page_prepare(page, mode)) {
                case 0:
+                       /*
+                        * Be careful not to clear PageLRU until after we're
+                        * sure the page is not being freed elsewhere -- the
+                        * page release code relies on it.
+                        */
+                       if (unlikely(!get_page_unless_zero(page)))
+                               goto busy;
+
+                       if (!TestClearPageLRU(page)) {
+                               /*
+                                * This page may be in another isolation
+                                * path, but we still hold lru_lock.
+                                */
+                               put_page(page);
+                               goto busy;
+                       }
+
                        nr_taken += nr_pages;
                        nr_zone_taken[page_zonenum(page)] += nr_pages;
                        list_move(&page->lru, dst);
                        break;
 
-               case -EBUSY:
+               default:
+busy:
                        /* else it is being freed elsewhere */
                        list_move(&page->lru, src);
-                       continue;
-
-               default:
-                       BUG();
                }
        }
 
@@ -1766,21 +1765,16 @@ int isolate_lru_page(struct page *page)
        VM_BUG_ON_PAGE(!page_count(page), page);
        WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
 
-       if (PageLRU(page)) {
-               pg_data_t *pgdat = page_pgdat(page);
+       if (TestClearPageLRU(page)) {
                struct lruvec *lruvec;
 
-               spin_lock_irq(&pgdat->lru_lock);
-               lruvec = mem_cgroup_page_lruvec(page, pgdat);
-               if (PageLRU(page)) {
-                       int lru = page_lru(page);
-                       get_page(page);
-                       ClearPageLRU(page);
-                       del_page_from_lru_list(page, lruvec, lru);
-                       ret = 0;
-               }
-               spin_unlock_irq(&pgdat->lru_lock);
+               get_page(page);
+               lruvec = lock_page_lruvec_irq(page);
+               del_page_from_lru_list(page, lruvec, page_lru(page));
+               unlock_page_lruvec_irq(lruvec);
+               ret = 0;
        }
+
        return ret;
 }
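
isolate_lru_page() now gates on TestClearPageLRU() and takes the lock through the page's lruvec rather than its pgdat. Roughly, the new helper introduced elsewhere in this series (mm/memcontrol.c) resolves page -> memcg -> per-node lruvec and locks that; a from-memory sketch, omitting the RCU protection and memcg debug checks of the real version:

	struct lruvec *lock_page_lruvec_irq(struct page *page)
	{
		struct lruvec *lruvec;

		/* page -> memcg -> per-node lruvec, then take that lruvec's lock */
		lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
		spin_lock_irq(&lruvec->lru_lock);

		return lruvec;
	}

Clearing PageLRU before taking the lock is what makes the lookup stable: isolation blocks memcg migration, so the lruvec found here cannot change underneath us.
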
 
@@ -1822,29 +1816,14 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
 }
 
 /*
- * This moves pages from @list to corresponding LRU list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone_lru_lock across the whole operation.  But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone_lru_lock around each page.  It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
+ * move_pages_to_lru() moves pages from the private @list to the appropriate LRU lists.
+ * On return, @list is reused as a list of pages to be freed by the caller.
  *
  * Returns the number of pages moved to the given lruvec.
  */
-
 static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
                                                     struct list_head *list)
 {
-       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        int nr_pages, nr_moved = 0;
        LIST_HEAD(pages_to_free);
        struct page *page;
@@ -1853,38 +1832,54 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
        while (!list_empty(list)) {
                page = lru_to_page(list);
                VM_BUG_ON_PAGE(PageLRU(page), page);
+               list_del(&page->lru);
                if (unlikely(!page_evictable(page))) {
-                       list_del(&page->lru);
-                       spin_unlock_irq(&pgdat->lru_lock);
+                       spin_unlock_irq(&lruvec->lru_lock);
                        putback_lru_page(page);
-                       spin_lock_irq(&pgdat->lru_lock);
+                       spin_lock_irq(&lruvec->lru_lock);
                        continue;
                }
-               lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
+               /*
+                * The SetPageLRU needs to be kept here for list integrity.
+                * Otherwise:
+                *   #0 move_pages_to_lru             #1 release_pages
+                *   if !put_page_testzero
+                *                                    if (put_page_testzero())
+                *                                      !PageLRU //skip lru_lock
+                *     SetPageLRU()
+                *     list_add(&page->lru,)
+                *                                        list_add(&page->lru,)
+                */
                SetPageLRU(page);
-               lru = page_lru(page);
 
-               nr_pages = thp_nr_pages(page);
-               update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
-               list_move(&page->lru, &lruvec->lists[lru]);
-
-               if (put_page_testzero(page)) {
+               if (unlikely(put_page_testzero(page))) {
                        __ClearPageLRU(page);
                        __ClearPageActive(page);
-                       del_page_from_lru_list(page, lruvec, lru);
 
                        if (unlikely(PageCompound(page))) {
-                               spin_unlock_irq(&pgdat->lru_lock);
+                               spin_unlock_irq(&lruvec->lru_lock);
                                destroy_compound_page(page);
-                               spin_lock_irq(&pgdat->lru_lock);
+                               spin_lock_irq(&lruvec->lru_lock);
                        } else
                                list_add(&page->lru, &pages_to_free);
-               } else {
-                       nr_moved += nr_pages;
-                       if (PageActive(page))
-                               workingset_age_nonresident(lruvec, nr_pages);
+
+                       continue;
                }
+
+               /*
+                * All pages were isolated from the same lruvec (and isolation
+                * inhibits memcg migration).
+                */
+               VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
+               lru = page_lru(page);
+               nr_pages = thp_nr_pages(page);
+
+               update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
+               list_add(&page->lru, &lruvec->lists[lru]);
+               nr_moved += nr_pages;
+               if (PageActive(page))
+                       workingset_age_nonresident(lruvec, nr_pages);
        }
 
        /*
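
The rewritten loop makes the contract in the new comment concrete: the caller holds lruvec->lru_lock, move_pages_to_lru() re-links every evictable page that still has references, and whatever remains on @list are dead pages for the caller to free outside the lock. A hedged sketch of that caller shape (putback_and_free() is a made-up wrapper; shrink_inactive_list() and shrink_active_list() below do this inline):

	/* Hypothetical wrapper mirroring the callers further down. */
	static void putback_and_free(struct lruvec *lruvec, struct list_head *list)
	{
		spin_lock_irq(&lruvec->lru_lock);
		move_pages_to_lru(lruvec, list);
		spin_unlock_irq(&lruvec->lru_lock);

		/* @list now only holds pages whose refcount dropped to zero. */
		mem_cgroup_uncharge_list(list);
		free_unref_page_list(list);
	}
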
@@ -1941,7 +1936,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
        lru_add_drain();
 
-       spin_lock_irq(&pgdat->lru_lock);
+       spin_lock_irq(&lruvec->lru_lock);
 
        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
                                     &nr_scanned, sc, lru);
@@ -1953,28 +1948,25 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
        __count_vm_events(PGSCAN_ANON + file, nr_scanned);
 
-       spin_unlock_irq(&pgdat->lru_lock);
+       spin_unlock_irq(&lruvec->lru_lock);
 
        if (nr_taken == 0)
                return 0;
 
-       nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
-                               &stat, false);
-
-       spin_lock_irq(&pgdat->lru_lock);
+       nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
 
+       spin_lock_irq(&lruvec->lru_lock);
        move_pages_to_lru(lruvec, &page_list);
 
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-       lru_note_cost(lruvec, file, stat.nr_pageout);
        item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
        if (!cgroup_reclaim(sc))
                __count_vm_events(item, nr_reclaimed);
        __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
        __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
+       spin_unlock_irq(&lruvec->lru_lock);
 
-       spin_unlock_irq(&pgdat->lru_lock);
-
+       lru_note_cost(lruvec, file, stat.nr_pageout);
        mem_cgroup_uncharge_list(&page_list);
        free_unref_page_list(&page_list);
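
Note that lru_note_cost() moved below the unlock: in this series it acquires the per-lruvec lock itself while walking up the memcg hierarchy, so it must not be called with lruvec->lru_lock already held. A simplified, from-memory paraphrase of its shape (the real helper in mm/swap.c also decays both counters):

	void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
	{
		do {
			spin_lock_irq(&lruvec->lru_lock);
			if (file)
				lruvec->file_cost += nr_pages;
			else
				lruvec->anon_cost += nr_pages;
			/* (cost decay omitted here) */
			spin_unlock_irq(&lruvec->lru_lock);
		} while ((lruvec = parent_lruvec(lruvec)));
	}
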
 
@@ -2006,6 +1998,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        return nr_reclaimed;
 }
 
+/*
+ * shrink_active_list() moves pages from the active LRU to the inactive LRU.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold lru_lock across the whole operation.  But if
+ * the pages are mapped, the processing is slow (page_referenced()), so
+ * we should drop lru_lock around each page.  It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_refcount against each page.
+ * But we had to alter page->flags anyway.
+ */
 static void shrink_active_list(unsigned long nr_to_scan,
                               struct lruvec *lruvec,
                               struct scan_control *sc,
@@ -2025,7 +2034,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
        lru_add_drain();
 
-       spin_lock_irq(&pgdat->lru_lock);
+       spin_lock_irq(&lruvec->lru_lock);
 
        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                                     &nr_scanned, sc, lru);
@@ -2036,7 +2045,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
                __count_vm_events(PGREFILL, nr_scanned);
        __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
 
-       spin_unlock_irq(&pgdat->lru_lock);
+       spin_unlock_irq(&lruvec->lru_lock);
 
        while (!list_empty(&l_hold)) {
                cond_resched();
@@ -2082,7 +2091,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
        /*
         * Move pages back to the lru list.
         */
-       spin_lock_irq(&pgdat->lru_lock);
+       spin_lock_irq(&lruvec->lru_lock);
 
        nr_activate = move_pages_to_lru(lruvec, &l_active);
        nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2093,7 +2102,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
 
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-       spin_unlock_irq(&pgdat->lru_lock);
+       spin_unlock_irq(&lruvec->lru_lock);
 
        mem_cgroup_uncharge_list(&l_active);
        free_unref_page_list(&l_active);
@@ -2131,8 +2140,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
 
                nr_reclaimed += shrink_page_list(&node_page_list,
                                                NODE_DATA(nid),
-                                               &sc, 0,
-                                               &dummy_stat, false);
+                                               &sc, &dummy_stat, false);
                while (!list_empty(&node_page_list)) {
                        page = lru_to_page(&node_page_list);
                        list_del(&page->lru);
@@ -2145,8 +2153,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
        if (!list_empty(&node_page_list)) {
                nr_reclaimed += shrink_page_list(&node_page_list,
                                                NODE_DATA(nid),
-                                               &sc, 0,
-                                               &dummy_stat, false);
+                                               &sc, &dummy_stat, false);
                while (!list_empty(&node_page_list)) {
                        page = lru_to_page(&node_page_list);
                        list_del(&page->lru);
@@ -2683,10 +2690,10 @@ again:
        /*
         * Determine the scan balance between anon and file LRUs.
         */
-       spin_lock_irq(&pgdat->lru_lock);
+       spin_lock_irq(&target_lruvec->lru_lock);
        sc->anon_cost = target_lruvec->anon_cost;
        sc->file_cost = target_lruvec->file_cost;
-       spin_unlock_irq(&pgdat->lru_lock);
+       spin_unlock_irq(&target_lruvec->lru_lock);
 
        /*
         * Target desirable inactive:active list ratios for the anon
@@ -3899,7 +3906,7 @@ kswapd_try_sleep:
                                        highest_zoneidx);
 
                /* Read the new order and highest_zoneidx */
-               alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
+               alloc_order = READ_ONCE(pgdat->kswapd_order);
                highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                                                        highest_zoneidx);
                WRITE_ONCE(pgdat->kswapd_order, 0);
@@ -4262,15 +4269,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
  */
 void check_move_unevictable_pages(struct pagevec *pvec)
 {
-       struct lruvec *lruvec;
-       struct pglist_data *pgdat = NULL;
+       struct lruvec *lruvec = NULL;
        int pgscanned = 0;
        int pgrescued = 0;
        int i;
 
        for (i = 0; i < pvec->nr; i++) {
                struct page *page = pvec->pages[i];
-               struct pglist_data *pagepgdat = page_pgdat(page);
                int nr_pages;
 
                if (PageTransTail(page))
@@ -4279,18 +4284,12 @@ void check_move_unevictable_pages(struct pagevec *pvec)
                nr_pages = thp_nr_pages(page);
                pgscanned += nr_pages;
 
-               if (pagepgdat != pgdat) {
-                       if (pgdat)
-                               spin_unlock_irq(&pgdat->lru_lock);
-                       pgdat = pagepgdat;
-                       spin_lock_irq(&pgdat->lru_lock);
-               }
-               lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
-               if (!PageLRU(page) || !PageUnevictable(page))
+               /* block memcg migration while the page moves between LRUs */
+               if (!TestClearPageLRU(page))
                        continue;
 
-               if (page_evictable(page)) {
+               lruvec = relock_page_lruvec_irq(page, lruvec);
+               if (page_evictable(page) && PageUnevictable(page)) {
                        enum lru_list lru = page_lru_base_type(page);
 
                        VM_BUG_ON_PAGE(PageActive(page), page);
@@ -4299,12 +4298,15 @@ void check_move_unevictable_pages(struct pagevec *pvec)
                        add_page_to_lru_list(page, lruvec, lru);
                        pgrescued += nr_pages;
                }
+               SetPageLRU(page);
        }
 
-       if (pgdat) {
+       if (lruvec) {
                __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
                __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
-               spin_unlock_irq(&pgdat->lru_lock);
+               unlock_page_lruvec_irq(lruvec);
+       } else if (pgscanned) {
+               count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
        }
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
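
check_move_unevictable_pages() now walks a pagevec whose pages may belong to different memcgs, so the lock is switched per page with relock_page_lruvec_irq(). For reference, that helper (a static inline in include/linux/memcontrol.h as of this series) is approximately:

	static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
							    struct lruvec *locked)
	{
		if (locked) {
			/* Already holding the right lruvec's lock? Keep it. */
			if (lruvec_holds_page_lru_lock(page, locked))
				return locked;

			unlock_page_lruvec_irq(locked);
		}

		return lock_page_lruvec_irq(page);
	}

Callers start with a NULL lruvec, feed the previous return value back in for each page, and unlock whatever is still held at the end, exactly as the hunk above does.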