mm: vmscan: correct some vmscan counters for THP swapout
authorYang Shi <yang.shi@linux.alibaba.com>
Fri, 12 Jul 2019 03:59:30 +0000 (20:59 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 12 Jul 2019 18:05:46 +0000 (11:05 -0700)
Commit bd4c82c22c36 ("mm, THP, swap: delay splitting THP after swapped
out"), THP can be swapped out in a whole.  But, nr_reclaimed and some
other vm counters still get inc'ed by one even though a whole THP (512
pages) gets swapped out.

This doesn't make too much sense to memory reclaim.

For example, direct reclaim may just need reclaim SWAP_CLUSTER_MAX
pages, reclaiming one THP could fulfill it.  But, if nr_reclaimed is not
increased correctly, direct reclaim may just waste time to reclaim more
pages, SWAP_CLUSTER_MAX * 512 pages in worst case.

And, it may cause pgsteal_{kswapd|direct} is greater than
pgscan_{kswapd|direct}, like the below:

pgsteal_kswapd 122933
pgsteal_direct 26600225
pgscan_kswapd 174153
pgscan_direct 14678312

nr_reclaimed and nr_scanned must be fixed in parallel otherwise it would
break some page reclaim logic, e.g.

vmpressure: this looks at the scanned/reclaimed ratio so it won't change
semantics as long as scanned & reclaimed are fixed in parallel.

compaction/reclaim: compaction wants a certain number of physical pages
freed up before going back to compacting.

kswapd priority raising: kswapd raises priority if we scan fewer pages
than the reclaim target (which itself is obviously expressed in order-0
pages).  As a result, kswapd can falsely raise its aggressiveness even
when it's making great progress.

Other than nr_scanned and nr_reclaimed, some other counters, e.g.
pgactivate, nr_skipped, nr_ref_keep and nr_unmap_fail need to be fixed too
since they are user visible via cgroup, /proc/vmstat or trace points,
otherwise they would be underreported.

When isolating pages from LRUs, nr_taken has been accounted in base page,
but nr_scanned and nr_skipped are still accounted in THP.  It doesn't make
too much sense too since this may cause trace point underreport the
numbers as well.

So accounting those counters in base page instead of accounting THP as one
page.

nr_dirty, nr_unqueued_dirty, nr_congested and nr_writeback are used by
file cache, so they are not impacted by THP swap.

This change may result in lower steal/scan ratio in some cases since THP
may get split during page reclaim, then a part of tail pages get reclaimed
instead of the whole 512 pages, but nr_scanned is accounted by 512,
particularly for direct reclaim.  But, this should be not a significant
issue.

Link: http://lkml.kernel.org/r/1559025859-72759-2-git-send-email-yang.shi@linux.alibaba.com
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/vmscan.c

index 277a36d..f8e3dcd 100644 (file)
@@ -1118,6 +1118,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
                bool dirty, writeback;
+               unsigned int nr_pages;
 
                cond_resched();
 
@@ -1129,7 +1130,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
                VM_BUG_ON_PAGE(PageActive(page), page);
 
-               sc->nr_scanned++;
+               nr_pages = 1 << compound_order(page);
+
+               /* Account the number of base pages even though THP */
+               sc->nr_scanned += nr_pages;
 
                if (unlikely(!page_evictable(page)))
                        goto activate_locked;
@@ -1250,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                case PAGEREF_ACTIVATE:
                        goto activate_locked;
                case PAGEREF_KEEP:
-                       stat->nr_ref_keep++;
+                       stat->nr_ref_keep += nr_pages;
                        goto keep_locked;
                case PAGEREF_RECLAIM:
                case PAGEREF_RECLAIM_CLEAN:
@@ -1282,7 +1286,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                }
                                if (!add_to_swap(page)) {
                                        if (!PageTransHuge(page))
-                                               goto activate_locked;
+                                               goto activate_locked_split;
                                        /* Fallback to swap normal pages */
                                        if (split_huge_page_to_list(page,
                                                                    page_list))
@@ -1291,7 +1295,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                        count_vm_event(THP_SWPOUT_FALLBACK);
 #endif
                                        if (!add_to_swap(page))
-                                               goto activate_locked;
+                                               goto activate_locked_split;
                                }
 
                                may_enter_fs = 1;
@@ -1305,6 +1309,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                goto keep_locked;
                }
 
+               /*
+                * THP may get split above, need minus tail pages and update
+                * nr_pages to avoid accounting tail pages twice.
+                *
+                * The tail pages that are added into swap cache successfully
+                * reach here.
+                */
+               if ((nr_pages > 1) && !PageTransHuge(page)) {
+                       sc->nr_scanned -= (nr_pages - 1);
+                       nr_pages = 1;
+               }
+
                /*
                 * The page is mapped into the page tables of one or more
                 * processes. Try to unmap it here.
@@ -1315,7 +1331,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        if (unlikely(PageTransHuge(page)))
                                flags |= TTU_SPLIT_HUGE_PMD;
                        if (!try_to_unmap(page, flags)) {
-                               stat->nr_unmap_fail++;
+                               stat->nr_unmap_fail += nr_pages;
                                goto activate_locked;
                        }
                }
@@ -1442,7 +1458,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
                unlock_page(page);
 free_it:
-               nr_reclaimed++;
+               /*
+                * THP may get swapped out in a whole, need account
+                * all base pages.
+                */
+               nr_reclaimed += nr_pages;
 
                /*
                 * Is there need to periodically free_page_list? It would
@@ -1455,6 +1475,15 @@ free_it:
                        list_add(&page->lru, &free_pages);
                continue;
 
+activate_locked_split:
+               /*
+                * The tail pages that are failed to add into swap cache
+                * reach here.  Fixup nr_scanned and nr_pages.
+                */
+               if (nr_pages > 1) {
+                       sc->nr_scanned -= (nr_pages - 1);
+                       nr_pages = 1;
+               }
 activate_locked:
                /* Not a candidate for swapping, so reclaim swap space. */
                if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
@@ -1464,8 +1493,7 @@ activate_locked:
                if (!PageMlocked(page)) {
                        int type = page_is_file_cache(page);
                        SetPageActive(page);
-                       pgactivate++;
-                       stat->nr_activate[type] += hpage_nr_pages(page);
+                       stat->nr_activate[type] += nr_pages;
                        count_memcg_page_event(page, PGACTIVATE);
                }
 keep_locked:
@@ -1475,6 +1503,8 @@ keep:
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
 
+       pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
+
        mem_cgroup_uncharge_list(&free_pages);
        try_to_unmap_flush();
        free_unref_page_list(&free_pages);
@@ -1646,10 +1676,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        LIST_HEAD(pages_skipped);
        isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
 
+       total_scan = 0;
        scan = 0;
-       for (total_scan = 0;
-            scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
-            total_scan++) {
+       while (scan < nr_to_scan && !list_empty(src)) {
                struct page *page;
 
                page = lru_to_page(src);
@@ -1657,9 +1686,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
                VM_BUG_ON_PAGE(!PageLRU(page), page);
 
+               nr_pages = 1 << compound_order(page);
+               total_scan += nr_pages;
+
                if (page_zonenum(page) > sc->reclaim_idx) {
                        list_move(&page->lru, &pages_skipped);
-                       nr_skipped[page_zonenum(page)]++;
+                       nr_skipped[page_zonenum(page)] += nr_pages;
                        continue;
                }
 
@@ -1668,11 +1700,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 * return with no isolated pages if the LRU mostly contains
                 * ineligible pages.  This causes the VM to not reclaim any
                 * pages, triggering a premature OOM.
+                *
+                * Account all tail pages of THP.  This would not cause
+                * premature OOM since __isolate_lru_page() returns -EBUSY
+                * only when the page is being freed somewhere else.
                 */
-               scan++;
+               scan += nr_pages;
                switch (__isolate_lru_page(page, mode)) {
                case 0:
-                       nr_pages = hpage_nr_pages(page);
                        nr_taken += nr_pages;
                        nr_zone_taken[page_zonenum(page)] += nr_pages;
                        list_move(&page->lru, dst);