mm, hwpoison: enable memory error handling on 1GB hugepage
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index da39ec8..1443980 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -33,6 +33,9 @@
  * are rare we hope to get away with this. This avoids impacting the core 
  * VM.
  */
+
+#define pr_fmt(fmt) "Memory failure: " fmt
+
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
@@ -71,7 +74,13 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
 
 static bool hw_memory_failure __read_mostly = false;
 
-static bool __page_handle_poison(struct page *page)
+/*
+ * Return values:
+ *   1:   the page is dissolved (if needed) and taken off from buddy,
+ *   0:   the page is dissolved (if needed) and not taken off from buddy,
+ *   < 0: failed to dissolve.
+ */
+static int __page_handle_poison(struct page *page)
 {
        int ret;
 
@@ -81,7 +90,7 @@ static bool __page_handle_poison(struct page *page)
                ret = take_page_off_buddy(page);
        zone_pcp_enable(page_zone(page));
 
-       return ret > 0;
+       return ret;
 }
 
 static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
@@ -91,7 +100,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
                 * Doing this check for free pages is also fine since dissolve_free_huge_page
                 * returns 0 for non-hugetlb pages as well.
                 */
-               if (!__page_handle_poison(page))
+               if (__page_handle_poison(page) <= 0)
                        /*
                         * We could fail to take off the target page from buddy
                         * for example due to racy page allocation, but that's
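
The tri-state return is consumed differently by the two kinds of call sites touched above and below; as a compact recap (a summary sketch of those callers, not additional patch content):

	/* Soft-offline path (page_handle_poison()): the page really must come
	 * off the buddy list, so "dissolved but not taken off buddy" (0) is
	 * still treated as failure. */
	if (__page_handle_poison(page) <= 0)
		return false;

	/* Hard-offline paths for free hugepages (me_huge_page(),
	 * try_memory_failure_hugetlb()): a successful dissolve is enough,
	 * so only a negative return counts as failure. */
	if (__page_handle_poison(p) >= 0)
		res = MF_RECOVERED;
	else
		res = MF_FAILED;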
@@ -252,7 +261,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
        short addr_lsb = tk->size_shift;
        int ret = 0;
 
-       pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+       pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
                        pfn, t->comm, t->pid);
 
        if ((flags & MF_ACTION_REQUIRED) && (t == current))
@@ -270,7 +279,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
                ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
                                      addr_lsb, t);  /* synchronous? */
        if (ret < 0)
-               pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
+               pr_info("Error sending signal to %s:%d: %d\n",
                        t->comm, t->pid, ret);
        return ret;
 }
@@ -297,10 +306,9 @@ void shake_page(struct page *p)
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
-               struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+               unsigned long address)
 {
-       unsigned long address = vma_address(page, vma);
        unsigned long ret = 0;
        pgd_t *pgd;
        p4d_t *p4d;
@@ -340,23 +348,33 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
 /*
  * Schedule a process for later kill.
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Notice: @fsdax_pgoff is used only when @p is an fsdax page.
+ *   In other cases, such as anonymous and file-backed pages, the address to be
+ *   killed can be calculated from @p itself.
  */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
-                      struct vm_area_struct *vma,
-                      struct list_head *to_kill)
+                       pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
+                       struct list_head *to_kill)
 {
        struct to_kill *tk;
 
        tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
        if (!tk) {
-               pr_err("Memory failure: Out of memory while machine check handling\n");
+               pr_err("Out of memory while machine check handling\n");
                return;
        }
 
        tk->addr = page_address_in_vma(p, vma);
-       if (is_zone_device_page(p))
-               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-       else
+       if (is_zone_device_page(p)) {
+               /*
+                * Since page->mapping is not used for fsdax, we need to
+                * calculate the address based on the vma.
+                */
+               if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+                       tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+               tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+       } else
                tk->size_shift = page_shift(compound_head(p));
 
        /*
@@ -370,7 +388,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
         * has a mapping for the page.
         */
        if (tk->addr == -EFAULT) {
-               pr_info("Memory failure: Unable to find user space address %lx in %s\n",
+               pr_info("Unable to find user space address %lx in %s\n",
                        page_to_pfn(p), tsk->comm);
        } else if (tk->size_shift == 0) {
                kfree(tk);
@@ -403,7 +421,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
                         * signal and then access the memory. Just kill it.
                         */
                        if (fail || tk->addr == -EFAULT) {
-                               pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+                               pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
                                       pfn, tk->tsk->comm, tk->tsk->pid);
                                do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
                                                 tk->tsk, PIDTYPE_PID);
@@ -416,7 +434,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
                         * process anyways.
                         */
                        else if (kill_proc(tk, pfn, flags) < 0)
-                               pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
+                               pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
                                       pfn, tk->tsk->comm, tk->tsk->pid);
                }
                put_task_struct(tk->tsk);
@@ -505,7 +523,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
                        if (!page_mapped_in_vma(page, vma))
                                continue;
                        if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, 0, vma, to_kill);
                }
        }
        read_unlock(&tasklist_lock);
@@ -541,13 +559,41 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
                         * to be informed of all such data corruptions.
                         */
                        if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, 0, vma, to_kill);
                }
        }
        read_unlock(&tasklist_lock);
        i_mmap_unlock_read(mapping);
 }
 
+#ifdef CONFIG_FS_DAX
+/*
+ * Collect processes when the error hit a fsdax page.
+ */
+static void collect_procs_fsdax(struct page *page,
+               struct address_space *mapping, pgoff_t pgoff,
+               struct list_head *to_kill)
+{
+       struct vm_area_struct *vma;
+       struct task_struct *tsk;
+
+       i_mmap_lock_read(mapping);
+       read_lock(&tasklist_lock);
+       for_each_process(tsk) {
+               struct task_struct *t = task_early_kill(tsk, true);
+
+               if (!t)
+                       continue;
+               vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+                       if (vma->vm_mm == t->mm)
+                               add_to_kill(t, page, pgoff, vma, to_kill);
+               }
+       }
+       read_unlock(&tasklist_lock);
+       i_mmap_unlock_read(mapping);
+}
+#endif /* CONFIG_FS_DAX */
+
 /*
  * Collect the processes who have the corrupted page mapped to kill.
  */
@@ -722,7 +768,6 @@ static const char * const action_page_types[] = {
        [MF_MSG_DIFFERENT_COMPOUND]     = "different compound page after locking",
        [MF_MSG_HUGE]                   = "huge page",
        [MF_MSG_FREE_HUGE]              = "free huge page",
-       [MF_MSG_NON_PMD_HUGE]           = "non-pmd-sized huge page",
        [MF_MSG_UNMAP_FAILED]           = "unmapping failed page",
        [MF_MSG_DIRTY_SWAPCACHE]        = "dirty swapcache page",
        [MF_MSG_CLEAN_SWAPCACHE]        = "clean swapcache page",
@@ -779,12 +824,10 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
                int err = mapping->a_ops->error_remove_page(mapping, p);
 
                if (err != 0) {
-                       pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
-                               pfn, err);
+                       pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
                } else if (page_has_private(p) &&
                           !try_to_release_page(p, GFP_NOIO)) {
-                       pr_info("Memory failure: %#lx: failed to release buffers\n",
-                               pfn);
+                       pr_info("%#lx: failed to release buffers\n", pfn);
                } else {
                        ret = MF_RECOVERED;
                }
@@ -796,8 +839,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
                if (invalidate_inode_page(p))
                        ret = MF_RECOVERED;
                else
-                       pr_info("Memory failure: %#lx: Failed to invalidate\n",
-                               pfn);
+                       pr_info("%#lx: Failed to invalidate\n", pfn);
        }
 
        return ret;
@@ -827,7 +869,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
                count -= 1;
 
        if (count > 0) {
-               pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+               pr_err("%#lx: %s still referenced by %d users\n",
                       page_to_pfn(p), action_page_types[ps->type], count);
                return true;
        }
@@ -851,7 +893,7 @@ static int me_kernel(struct page_state *ps, struct page *p)
  */
 static int me_unknown(struct page_state *ps, struct page *p)
 {
-       pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
+       pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
        unlock_page(p);
        return MF_FAILED;
 }
@@ -1007,12 +1049,13 @@ static int me_swapcache_dirty(struct page_state *ps, struct page *p)
 
 static int me_swapcache_clean(struct page_state *ps, struct page *p)
 {
+       struct folio *folio = page_folio(p);
        int ret;
 
-       delete_from_swap_cache(p);
+       delete_from_swap_cache(folio);
 
        ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
-       unlock_page(p);
+       folio_unlock(folio);
 
        if (has_extra_refcount(ps, p, false))
                ret = MF_FAILED;
@@ -1040,7 +1083,6 @@ static int me_huge_page(struct page_state *ps, struct page *p)
                res = truncate_error_page(hpage, page_to_pfn(p), mapping);
                unlock_page(hpage);
        } else {
-               res = MF_FAILED;
                unlock_page(hpage);
                /*
                 * migration entry prevents later access on error hugepage,
@@ -1048,9 +1090,11 @@ static int me_huge_page(struct page_state *ps, struct page *p)
                 * subpages.
                 */
                put_page(hpage);
-               if (__page_handle_poison(p)) {
+               if (__page_handle_poison(p) >= 0) {
                        page_ref_inc(p);
                        res = MF_RECOVERED;
+               } else {
+                       res = MF_FAILED;
                }
        }
 
@@ -1135,7 +1179,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
        trace_memory_failure_event(pfn, type, result);
 
        num_poisoned_pages_inc();
-       pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
+       pr_err("%#lx: recovery action for %s: %s\n",
                pfn, action_page_types[type], action_name[result]);
 }
 
@@ -1210,8 +1254,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
                if (head == compound_head(page))
                        return 1;
 
-               pr_info("Memory failure: %#lx cannot catch tail\n",
-                       page_to_pfn(page));
+               pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
                put_page(head);
        }
 
@@ -1274,7 +1317,7 @@ try_again:
        }
 out:
        if (ret == -EIO)
-               pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p));
+               pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
 
        return ret;
 }
@@ -1373,13 +1416,12 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
                return true;
 
        if (PageKsm(p)) {
-               pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
+               pr_err("%#lx: can't handle KSM pages.\n", pfn);
                return false;
        }
 
        if (PageSwapCache(p)) {
-               pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
-                       pfn);
+               pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
                ttu |= TTU_IGNORE_HWPOISON;
        }
 
@@ -1397,7 +1439,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
                } else {
                        kill = 0;
                        ttu |= TTU_IGNORE_HWPOISON;
-                       pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
+                       pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
                                pfn);
                }
        }
@@ -1426,14 +1468,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
                        try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
                        i_mmap_unlock_write(mapping);
                } else
-                       pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+                       pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
        } else {
                try_to_unmap(folio, ttu);
        }
 
        unmap_success = !page_mapped(hpage);
        if (!unmap_success)
-               pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
+               pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
                       pfn, page_mapcount(hpage));
 
        /*
@@ -1498,6 +1540,241 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
        return 0;
 }
 
+static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
+               struct address_space *mapping, pgoff_t index, int flags)
+{
+       struct to_kill *tk;
+       unsigned long size = 0;
+
+       list_for_each_entry(tk, to_kill, nd)
+               if (tk->size_shift)
+                       size = max(size, 1UL << tk->size_shift);
+
+       if (size) {
+               /*
+                * Unmap the largest mapping to avoid breaking up device-dax
+                * mappings which are constant size. The actual size of the
+                * mapping being torn down is communicated in siginfo, see
+                * kill_proc()
+                */
+               loff_t start = (index << PAGE_SHIFT) & ~(size - 1);
+
+               unmap_mapping_range(mapping, start, size, 0);
+       }
+
+       kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
+}
+
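
As a worked example of the alignment above (illustration only, assuming 4 KiB pages so PAGE_SHIFT == 12, and a 2 MiB device-dax mapping so the largest tk->size_shift is 21):

	size  = 1UL << 21                       = 0x200000
	start = (0x12345 << PAGE_SHIFT) & ~(size - 1)
	      = 0x12345000 & ~0x1fffff          = 0x12200000

so unmap_mapping_range() tears down the whole aligned 2 MiB extent containing the poisoned page instead of a single 4 KiB page, while kill_proc() still reports the per-task granularity via siginfo (si_addr_lsb).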
+static int mf_generic_kill_procs(unsigned long long pfn, int flags,
+               struct dev_pagemap *pgmap)
+{
+       struct page *page = pfn_to_page(pfn);
+       LIST_HEAD(to_kill);
+       dax_entry_t cookie;
+       int rc = 0;
+
+       /*
+        * Pages instantiated by device-dax (not filesystem-dax)
+        * may be compound pages.
+        */
+       page = compound_head(page);
+
+       /*
+        * Prevent the inode from being freed while we are interrogating
+        * the address_space, typically this would be handled by
+        * lock_page(), but dax pages do not use the page lock. This
+        * also prevents changes to the mapping of this pfn until
+        * poison signaling is complete.
+        */
+       cookie = dax_lock_page(page);
+       if (!cookie)
+               return -EBUSY;
+
+       if (hwpoison_filter(page)) {
+               rc = -EOPNOTSUPP;
+               goto unlock;
+       }
+
+       switch (pgmap->type) {
+       case MEMORY_DEVICE_PRIVATE:
+       case MEMORY_DEVICE_COHERENT:
+               /*
+                * TODO: Handle device pages which may need coordination
+                * with device-side memory.
+                */
+               rc = -ENXIO;
+               goto unlock;
+       default:
+               break;
+       }
+
+       /*
+        * Use this flag as an indication that the dax page has been
+        * remapped UC to prevent speculative consumption of poison.
+        */
+       SetPageHWPoison(page);
+
+       /*
+        * Unlike System-RAM there is no possibility to swap in a
+        * different physical page at a given virtual address, so all
+        * userspace consumption of ZONE_DEVICE memory necessitates
+        * SIGBUS (i.e. MF_MUST_KILL)
+        */
+       flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+       collect_procs(page, &to_kill, true);
+
+       unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
+unlock:
+       dax_unlock_page(page, cookie);
+       return rc;
+}
+
+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping:   address_space of the file in use
+ * @index:     start pgoff of the range within the file
+ * @count:     length of the range, in unit of PAGE_SIZE
+ * @mf_flags:  memory failure flags
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+               unsigned long count, int mf_flags)
+{
+       LIST_HEAD(to_kill);
+       dax_entry_t cookie;
+       struct page *page;
+       size_t end = index + count;
+
+       mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+       for (; index < end; index++) {
+               page = NULL;
+               cookie = dax_lock_mapping_entry(mapping, index, &page);
+               if (!cookie)
+                       return -EBUSY;
+               if (!page)
+                       goto unlock;
+
+               SetPageHWPoison(page);
+
+               collect_procs_fsdax(page, mapping, index, &to_kill);
+               unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
+                               index, mf_flags);
+unlock:
+               dax_unlock_mapping_entry(mapping, index, cookie);
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
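
For context, a minimal sketch of the intended consumer of mf_dax_kill_procs(): a filesystem that has already translated a failed pmem range into (mapping, byte offset, length) on one of its files. The helper name example_fs_notify_failure() and its argument convention are hypothetical (and assume len > 0); only mf_dax_kill_procs() itself comes from this patch.

	static int example_fs_notify_failure(struct address_space *mapping,
					     loff_t offset, size_t len, int mf_flags)
	{
		pgoff_t first = offset >> PAGE_SHIFT;
		pgoff_t last = (offset + len - 1) >> PAGE_SHIFT;

		/* mf_dax_kill_procs() ORs in MF_ACTION_REQUIRED | MF_MUST_KILL itself. */
		return mf_dax_kill_procs(mapping, first, last - first + 1, mf_flags);
	}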
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Struct raw_hwp_page represents information about a "raw error page",
+ * forming a singly linked list that originates from the ->private field
+ * of the SUBPAGE_INDEX_HWPOISON-th tail page.
+ */
+struct raw_hwp_page {
+       struct llist_node node;
+       struct page *page;
+};
+
+static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
+{
+       return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
+}
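
The cast above relies on struct llist_head being a single pointer, i.e. the same width as the unsigned long ->private field it aliases; a one-line sanity check expressing that assumption (not part of the patch) would be:

	static_assert(sizeof(struct llist_head) == sizeof(unsigned long));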
+
+static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+       struct llist_head *head;
+       struct llist_node *t, *tnode;
+       unsigned long count = 0;
+
+       head = raw_hwp_list_head(hpage);
+       llist_for_each_safe(tnode, t, head->first) {
+               struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+               if (move_flag)
+                       SetPageHWPoison(p->page);
+               kfree(p);
+               count++;
+       }
+       llist_del_all(head);
+       return count;
+}
+
+static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
+{
+       struct llist_head *head;
+       struct raw_hwp_page *raw_hwp;
+       struct llist_node *t, *tnode;
+       int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;
+
+       /*
+        * Once the hwpoison hugepage has lost reliable raw error info,
+        * there is little point in keeping further error info precisely,
+        * so skip adding any additional raw error info.
+        */
+       if (HPageRawHwpUnreliable(hpage))
+               return -EHWPOISON;
+       head = raw_hwp_list_head(hpage);
+       llist_for_each_safe(tnode, t, head->first) {
+               struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+               if (p->page == page)
+                       return -EHWPOISON;
+       }
+
+       raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
+       if (raw_hwp) {
+               raw_hwp->page = page;
+               llist_add(&raw_hwp->node, head);
+               /* the first error event will be counted in action_result(). */
+               if (ret)
+                       num_poisoned_pages_inc();
+       } else {
+               /*
+                * Failed to save raw error info.  We no longer trace all
+                * hwpoisoned subpages, so we have to refuse to free/dissolve
+                * this hwpoisoned hugepage.
+                */
+               SetHPageRawHwpUnreliable(hpage);
+               /*
+                * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
+                * used any more, so free it.
+                */
+               __free_raw_hwp_pages(hpage, false);
+       }
+       return ret;
+}
+
+static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+       /*
+        * HPageVmemmapOptimized hugepages can't have the hwpoison flag
+        * moved onto their tail pages, whose struct pages don't exist.
+        */
+       if (move_flag && HPageVmemmapOptimized(hpage))
+               return 0;
+
+       /*
+        * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
+        * definition.
+        */
+       if (HPageRawHwpUnreliable(hpage))
+               return 0;
+
+       return __free_raw_hwp_pages(hpage, move_flag);
+}
+
+void hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+       if (HPageRawHwpUnreliable(hpage))
+               return;
+       ClearPageHWPoison(hpage);
+       free_raw_hwp_pages(hpage, true);
+}
+
 /*
  * Called from hugetlb code with hugetlb_lock held.
  *
@@ -1529,10 +1806,11 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
                        count_increased = true;
        } else {
                ret = -EBUSY;
-               goto out;
+               if (!(flags & MF_NO_RETRY))
+                       goto out;
        }
 
-       if (TestSetPageHWPoison(head)) {
+       if (hugetlb_set_page_hwpoison(head, page)) {
                ret = -EHWPOISON;
                goto out;
        }
@@ -1544,7 +1822,6 @@ out:
        return ret;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
 /*
  * Taking refcount of hugetlb pages needs extra care about race conditions
  * with basic operations like hugepage allocation/free/demotion.
@@ -1557,7 +1834,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
        struct page *p = pfn_to_page(pfn);
        struct page *head;
        unsigned long page_flags;
-       bool retry = true;
 
        *hugetlb = 1;
 retry:
@@ -1566,15 +1842,15 @@ retry:
                *hugetlb = 0;
                return 0;
        } else if (res == -EHWPOISON) {
-               pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+               pr_err("%#lx: already hardware poisoned\n", pfn);
                if (flags & MF_ACTION_REQUIRED) {
                        head = compound_head(p);
                        res = kill_accessing_process(current, page_to_pfn(head), flags);
                }
                return res;
        } else if (res == -EBUSY) {
-               if (retry) {
-                       retry = false;
+               if (!(flags & MF_NO_RETRY)) {
+                       flags |= MF_NO_RETRY;
                        goto retry;
                }
                action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
@@ -1585,7 +1861,7 @@ retry:
        lock_page(head);
 
        if (hwpoison_filter(p)) {
-               ClearPageHWPoison(head);
+               hugetlb_clear_page_hwpoison(head);
                res = -EOPNOTSUPP;
                goto out;
        }
@@ -1596,10 +1872,11 @@ retry:
         */
        if (res == 0) {
                unlock_page(head);
-               res = MF_FAILED;
-               if (__page_handle_poison(p)) {
+               if (__page_handle_poison(p) >= 0) {
                        page_ref_inc(p);
                        res = MF_RECOVERED;
+               } else {
+                       res = MF_FAILED;
                }
                action_result(pfn, MF_MSG_FREE_HUGE, res);
                return res == MF_RECOVERED ? 0 : -EBUSY;
@@ -1607,21 +1884,6 @@ retry:
 
        page_flags = head->flags;
 
-       /*
-        * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
-        * simply disable it. In order to make it work properly, we need
-        * make sure that:
-        *  - conversion of a pud that maps an error hugetlb into hwpoison
-        *    entry properly works, and
-        *  - other mm code walking over page table is aware of pud-aligned
-        *    hwpoison entries.
-        */
-       if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
-               action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
-               res = -EBUSY;
-               goto out;
-       }
-
        if (!hwpoison_user_mappings(p, pfn, flags, head)) {
                action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
                res = -EBUSY;
@@ -1633,23 +1895,24 @@ out:
        unlock_page(head);
        return res;
 }
+
 #else
 static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
 {
        return 0;
 }
-#endif
+
+static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
+{
+       return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
 
 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                struct dev_pagemap *pgmap)
 {
        struct page *page = pfn_to_page(pfn);
-       unsigned long size = 0;
-       struct to_kill *tk;
-       LIST_HEAD(tokill);
-       int rc = -EBUSY;
-       loff_t start;
-       dax_entry_t cookie;
+       int rc = -ENXIO;
 
        if (flags & MF_COUNT_INCREASED)
                /*
@@ -1658,73 +1921,24 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                put_page(page);
 
        /* device metadata space is not recoverable */
-       if (!pgmap_pfn_valid(pgmap, pfn)) {
-               rc = -ENXIO;
+       if (!pgmap_pfn_valid(pgmap, pfn))
                goto out;
-       }
 
        /*
-        * Pages instantiated by device-dax (not filesystem-dax)
-        * may be compound pages.
+        * Call driver's implementation to handle the memory failure, otherwise
+        * fall back to generic handler.
         */
-       page = compound_head(page);
-
-       /*
-        * Prevent the inode from being freed while we are interrogating
-        * the address_space, typically this would be handled by
-        * lock_page(), but dax pages do not use the page lock. This
-        * also prevents changes to the mapping of this pfn until
-        * poison signaling is complete.
-        */
-       cookie = dax_lock_page(page);
-       if (!cookie)
-               goto out;
-
-       if (hwpoison_filter(page)) {
-               rc = -EOPNOTSUPP;
-               goto unlock;
-       }
-
-       if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+       if (pgmap->ops && pgmap->ops->memory_failure) {
+               rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
                /*
-                * TODO: Handle HMM pages which may need coordination
-                * with device-side memory.
+                * Fall back to generic handler too if operation is not
+                * supported inside the driver/device/filesystem.
                 */
-               goto unlock;
+               if (rc != -EOPNOTSUPP)
+                       goto out;
        }
 
-       /*
-        * Use this flag as an indication that the dax page has been
-        * remapped UC to prevent speculative consumption of poison.
-        */
-       SetPageHWPoison(page);
-
-       /*
-        * Unlike System-RAM there is no possibility to swap in a
-        * different physical page at a given virtual address, so all
-        * userspace consumption of ZONE_DEVICE memory necessitates
-        * SIGBUS (i.e. MF_MUST_KILL)
-        */
-       flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-       collect_procs(page, &tokill, true);
-
-       list_for_each_entry(tk, &tokill, nd)
-               if (tk->size_shift)
-                       size = max(size, 1UL << tk->size_shift);
-       if (size) {
-               /*
-                * Unmap the largest mapping to avoid breaking up
-                * device-dax mappings which are constant size. The
-                * actual size of the mapping being torn down is
-                * communicated in siginfo, see kill_proc()
-                */
-               start = (page->index << PAGE_SHIFT) & ~(size - 1);
-               unmap_mapping_range(page->mapping, start, size, 0);
-       }
-       kill_procs(&tokill, true, false, pfn, flags);
-       rc = 0;
-unlock:
-       dax_unlock_page(page, cookie);
+       rc = mf_generic_kill_procs(pfn, flags, pgmap);
 out:
        /* drop pgmap ref acquired in caller */
        put_dev_pagemap(pgmap);
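
A minimal sketch of a driver-side ->memory_failure() handler that plugs into the dispatch above; the example_* names are hypothetical and the parameter list simply mirrors the call made here, pgmap->ops->memory_failure(pgmap, pfn, 1, flags). Returning -EOPNOTSUPP hands the pfn back to mf_generic_kill_procs().

	static int example_pgmap_memory_failure(struct dev_pagemap *pgmap,
			unsigned long pfn, unsigned long nr_pages, int mf_flags)
	{
		/*
		 * A device that cannot map the failed pfns back to its users
		 * declines, so the generic handler runs instead.
		 */
		if (!example_can_translate(pgmap, pfn, nr_pages))
			return -EOPNOTSUPP;

		/* Notify the holder (e.g. a filesystem) of the failed range. */
		return example_notify_users(pgmap, pfn, nr_pages, mf_flags);
	}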
@@ -1787,8 +2001,7 @@ int memory_failure(unsigned long pfn, int flags)
                                goto unlock_mutex;
                        }
                }
-               pr_err("Memory failure: %#lx: memory outside kernel control\n",
-                       pfn);
+               pr_err("%#lx: memory outside kernel control\n", pfn);
                res = -ENXIO;
                goto unlock_mutex;
        }
@@ -1799,8 +2012,7 @@ try_again:
                goto unlock_mutex;
 
        if (TestSetPageHWPoison(p)) {
-               pr_err("Memory failure: %#lx: already hardware poisoned\n",
-                       pfn);
+               pr_err("%#lx: already hardware poisoned\n", pfn);
                res = -EHWPOISON;
                if (flags & MF_ACTION_REQUIRED)
                        res = kill_accessing_process(current, pfn, flags);
@@ -1940,7 +2152,7 @@ try_again:
 
        /*
         * Now take care of user space mappings.
-        * Abort on fail: __delete_from_page_cache() assumes unmapped page.
+        * Abort on fail: __filemap_remove_folio() assumes unmapped page.
         */
        if (!hwpoison_user_mappings(p, pfn, flags, p)) {
                action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
@@ -2016,7 +2228,7 @@ void memory_failure_queue(unsigned long pfn, int flags)
        if (kfifo_put(&mf_cpu->fifo, entry))
                schedule_work_on(smp_processor_id(), &mf_cpu->work);
        else
-               pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+               pr_err("buffer overflow when queuing memory failure at %#lx\n",
                       pfn);
        spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
        put_cpu_var(memory_failure_cpu);
@@ -2073,6 +2285,8 @@ static int __init memory_failure_init(void)
 }
 core_initcall(memory_failure_init);
 
+#undef pr_fmt
+#define pr_fmt(fmt)    "" fmt
 #define unpoison_pr_info(fmt, pfn, rs)                 \
 ({                                                     \
        if (__ratelimit(rs))                            \
@@ -2097,6 +2311,7 @@ int unpoison_memory(unsigned long pfn)
        struct page *p;
        int ret = -EBUSY;
        int freeit = 0;
+       unsigned long count = 1;
        static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
                                        DEFAULT_RATELIMIT_BURST);
 
@@ -2144,6 +2359,13 @@ int unpoison_memory(unsigned long pfn)
 
        ret = get_hwpoison_page(p, MF_UNPOISON);
        if (!ret) {
+               if (PageHuge(p)) {
+                       count = free_raw_hwp_pages(page, false);
+                       if (count == 0) {
+                               ret = -EBUSY;
+                               goto unlock_mutex;
+                       }
+               }
                ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
        } else if (ret < 0) {
                if (ret == -EHWPOISON) {
@@ -2152,6 +2374,13 @@ int unpoison_memory(unsigned long pfn)
                        unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
                                         pfn, &unpoison_rs);
        } else {
+               if (PageHuge(p)) {
+                       count = free_raw_hwp_pages(page, false);
+                       if (count == 0) {
+                               ret = -EBUSY;
+                               goto unlock_mutex;
+                       }
+               }
                freeit = !!TestClearPageHWPoison(p);
 
                put_page(page);
@@ -2164,7 +2393,7 @@ int unpoison_memory(unsigned long pfn)
 unlock_mutex:
        mutex_unlock(&mf_mutex);
        if (!ret || freeit) {
-               num_poisoned_pages_dec();
+               num_poisoned_pages_sub(count);
                unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
                                 page_to_pfn(p), &unpoison_rs);
        }
@@ -2178,7 +2407,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
        bool lru = PageLRU(page);
 
        if (PageHuge(page)) {
-               isolated = isolate_huge_page(page, pagelist);
+               isolated = !isolate_hugetlb(page, pagelist);
        } else {
                if (lru)
                        isolated = !isolate_lru_page(page);