mm,hwpoison: take free pages off the buddy freelists
authorOscar Salvador <osalvador@suse.de>
Tue, 15 Dec 2020 03:11:32 +0000 (19:11 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 15 Dec 2020 20:13:44 +0000 (12:13 -0800)
The crux of the matter is that historically we left poisoned pages in the
buddy system because we have some checks in place when allocating a page
that are gatekeeper for poisoned pages.  Unfortunately, we do have other
users (e.g: compaction [1]) that scan buddy freelists and try to get a
page from there without checking whether the page is HWPoison.

As I stated already, I think it is fundamentally wrong to keep HWPoison
pages within the buddy systems, checks in place or not.

Let us fix this the same way we did for soft_offline [2], taking the page
off the buddy freelist so it is completely unreachable.

Note that this is fairly simple to trigger, as we only need to poison free
buddy pages (madvise MADV_HWPOISON) and then run some sort of memory
stress system.

Just for a matter of reference, I put a dump_page() in compaction_alloc()
to trigger for HWPoison patches:

    page:0000000012b2982b refcount:1 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1d5db
    flags: 0xfffffc0800000(hwpoison)
    raw: 000fffffc0800000 ffffea00007573c8 ffffc90000857de0 0000000000000000
    raw: 0000000000000001 0000000000000000 00000001ffffffff 0000000000000000
    page dumped because: compaction_alloc

    CPU: 4 PID: 123 Comm: kcompactd0 Tainted: G            E     5.9.0-rc2-mm1-1-default+ #5
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
    Call Trace:
     dump_stack+0x6d/0x8b
     compaction_alloc+0xb2/0xc0
     migrate_pages+0x2a6/0x12a0
     compact_zone+0x5eb/0x11c0
     proactive_compact_node+0x89/0xf0
     kcompactd+0x2d0/0x3a0
     kthread+0x118/0x130
     ret_from_fork+0x22/0x30

After that, if e.g: a process faults in the page,  it will get killed
unexpectedly.
Fix it by containing the page immediatelly.

Besides that, two more changes can be noticed:

* MF_DELAYED no longer suits as we are fixing the issue by containing
  the page immediately, so it does no longer rely on the allocation-time
  checks to stop HWPoison to be handed over.
  gain unless it is unpoisoned, so we fixed the situation.
  Because of that, let us use MF_RECOVERED from now on.

* The second block that handles PageBuddy pages is no longer needed:
  We call shake_page and then check whether the page is Buddy
  because shake_page calls drain_all_pages, which sends pcp-pages back to
  the buddy freelists, so we could have a chance to handle free pages.
  Currently, get_hwpoison_page already calls drain_all_pages, and we call
  get_hwpoison_page right before coming here, so we should be on the safe
  side.

[1] https://lore.kernel.org/linux-mm/20190826104144.GA7849@linux/T/#u
[2] https://patchwork.kernel.org/cover/11792607/

[osalvador@suse.de: take the poisoned subpage off the buddy frelists]
Link: https://lkml.kernel.org/r/20201013144447.6706-4-osalvador@suse.de
Link: https://lkml.kernel.org/r/20201013144447.6706-3-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/memory-failure.c

index 5bcb619..0b8a101 100644 (file)
@@ -809,7 +809,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  */
 static int me_huge_page(struct page *p, unsigned long pfn)
 {
-       int res = 0;
+       int res;
        struct page *hpage = compound_head(p);
        struct address_space *mapping;
 
@@ -820,6 +820,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
        if (mapping) {
                res = truncate_error_page(hpage, pfn, mapping);
        } else {
+               res = MF_FAILED;
                unlock_page(hpage);
                /*
                 * migration entry prevents later access on error anonymous
@@ -828,8 +829,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
                 */
                if (PageAnon(hpage))
                        put_page(hpage);
-               dissolve_free_huge_page(p);
-               res = MF_RECOVERED;
+               if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
+                       page_ref_inc(p);
+                       res = MF_RECOVERED;
+               }
                lock_page(hpage);
        }
 
@@ -1196,9 +1199,13 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
                        }
                }
                unlock_page(head);
-               dissolve_free_huge_page(p);
-               action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
-               return 0;
+               res = MF_FAILED;
+               if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
+                       page_ref_inc(p);
+                       res = MF_RECOVERED;
+               }
+               action_result(pfn, MF_MSG_FREE_HUGE, res);
+               return res == MF_RECOVERED ? 0 : -EBUSY;
        }
 
        lock_page(head);
@@ -1339,6 +1346,7 @@ int memory_failure(unsigned long pfn, int flags)
        struct dev_pagemap *pgmap;
        int res;
        unsigned long page_flags;
+       bool retry = true;
 
        if (!sysctl_memory_failure_recovery)
                panic("Memory failure on page %lx", pfn);
@@ -1356,6 +1364,7 @@ int memory_failure(unsigned long pfn, int flags)
                return -ENXIO;
        }
 
+try_again:
        if (PageHuge(p))
                return memory_failure_hugetlb(pfn, flags);
        if (TestSetPageHWPoison(p)) {
@@ -1380,8 +1389,21 @@ int memory_failure(unsigned long pfn, int flags)
         */
        if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
                if (is_free_buddy_page(p)) {
-                       action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
-                       return 0;
+                       if (take_page_off_buddy(p)) {
+                               page_ref_inc(p);
+                               res = MF_RECOVERED;
+                       } else {
+                               /* We lost the race, try again */
+                               if (retry) {
+                                       ClearPageHWPoison(p);
+                                       num_poisoned_pages_dec();
+                                       retry = false;
+                                       goto try_again;
+                               }
+                               res = MF_FAILED;
+                       }
+                       action_result(pfn, MF_MSG_BUDDY, res);
+                       return res == MF_RECOVERED ? 0 : -EBUSY;
                } else {
                        action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
                        return -EBUSY;
@@ -1405,14 +1427,6 @@ int memory_failure(unsigned long pfn, int flags)
         * walked by the page reclaim code, however that's not a big loss.
         */
        shake_page(p, 0);
-       /* shake_page could have turned it free. */
-       if (!PageLRU(p) && is_free_buddy_page(p)) {
-               if (flags & MF_COUNT_INCREASED)
-                       action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
-               else
-                       action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
-               return 0;
-       }
 
        lock_page(p);