mm, hwpoison: enable memory error handling on 1GB hugepage

[linux-2.6-microblaze.git] / mm / memory-failure.c
diff --git a/mm/memory-failure.c b/mm/memory-failure.c

index b864c2e..1443980 100644 (file)
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -33,6 +33,9 @@
   * are rare we hope to get away with this. This avoids impacting the core 
   * VM.
   */
+
+#define pr_fmt(fmt) "Memory failure: " fmt
+
  #include <linux/kernel.h>
  #include <linux/mm.h>
  #include <linux/page-flags.h>
@@ -71,7 +74,13 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
  
  static bool hw_memory_failure __read_mostly = false;
  
-static bool __page_handle_poison(struct page *page)
+/*
+ * Return values:
+ *   1:   the page is dissolved (if needed) and taken off from buddy,
+ *   0:   the page is dissolved (if needed) and not taken off from buddy,
+ *   < 0: failed to dissolve.
+ */
+static int __page_handle_poison(struct page *page)
  {
         int ret;
  
@@ -81,7 +90,7 @@ static bool __page_handle_poison(struct page *page)
                 ret = take_page_off_buddy(page);
         zone_pcp_enable(page_zone(page));
  
-       return ret > 0;
+       return ret;
  }
  
  static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
@@ -91,7 +100,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
                  * Doing this check for free pages is also fine since dissolve_free_huge_page
                  * returns 0 for non-hugetlb pages as well.
                  */
-               if (!__page_handle_poison(page))
+               if (__page_handle_poison(page) <= 0)
                         /*
                          * We could fail to take off the target page from buddy
                          * for example due to racy page allocation, but that's
@@ -252,7 +261,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
         short addr_lsb = tk->size_shift;
         int ret = 0;
  
-       pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+       pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
                         pfn, t->comm, t->pid);
  
         if ((flags & MF_ACTION_REQUIRED) && (t == current))
@@ -270,7 +279,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
                 ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
                                       addr_lsb, t);  /* synchronous? */
         if (ret < 0)
-               pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
+               pr_info("Error sending signal to %s:%d: %d\n",
                         t->comm, t->pid, ret);
         return ret;
  }
@@ -297,10 +306,9 @@ void shake_page(struct page *p)
  }
  EXPORT_SYMBOL_GPL(shake_page);
  
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
-               struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+               unsigned long address)
  {
-       unsigned long address = vma_address(page, vma);
         unsigned long ret = 0;
         pgd_t *pgd;
         p4d_t *p4d;
@@ -340,23 +348,33 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
  /*
   * Schedule a process for later kill.
   * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
+ *   In other cases, such as anonymous and file-backed pages, the address to be
+ *   killed can be calculated from @p itself.
   */
  static void add_to_kill(struct task_struct *tsk, struct page *p,
-                      struct vm_area_struct *vma,
-                      struct list_head *to_kill)
+                       pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
+                       struct list_head *to_kill)
  {
         struct to_kill *tk;
  
         tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
         if (!tk) {
-               pr_err("Memory failure: Out of memory while machine check handling\n");
+               pr_err("Out of memory while machine check handling\n");
                 return;
         }
  
         tk->addr = page_address_in_vma(p, vma);
-       if (is_zone_device_page(p))
-               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
-       else
+       if (is_zone_device_page(p)) {
+               /*
+                * Since page->mapping is not used for fsdax, we need
+                * calculate the address based on the vma.
+                */
+               if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+                       tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+               tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+       } else
                 tk->size_shift = page_shift(compound_head(p));
  
         /*
@@ -370,7 +388,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
          * has a mapping for the page.
          */
         if (tk->addr == -EFAULT) {
-               pr_info("Memory failure: Unable to find user space address %lx in %s\n",
+               pr_info("Unable to find user space address %lx in %s\n",
                         page_to_pfn(p), tsk->comm);
         } else if (tk->size_shift == 0) {
                 kfree(tk);
@@ -403,7 +421,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
                          * signal and then access the memory. Just kill it.
                          */
                         if (fail || tk->addr == -EFAULT) {
-                               pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+                               pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
                                        pfn, tk->tsk->comm, tk->tsk->pid);
                                 do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
                                                  tk->tsk, PIDTYPE_PID);
@@ -416,7 +434,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
                          * process anyways.
                          */
                         else if (kill_proc(tk, pfn, flags) < 0)
-                               pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
+                               pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
                                        pfn, tk->tsk->comm, tk->tsk->pid);
                 }
                 put_task_struct(tk->tsk);
@@ -505,7 +523,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
                         if (!page_mapped_in_vma(page, vma))
                                 continue;
                         if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, 0, vma, to_kill);
                 }
         }
         read_unlock(&tasklist_lock);
@@ -541,13 +559,41 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
                          * to be informed of all such data corruptions.
                          */
                         if (vma->vm_mm == t->mm)
-                               add_to_kill(t, page, vma, to_kill);
+                               add_to_kill(t, page, 0, vma, to_kill);
                 }
         }
         read_unlock(&tasklist_lock);
         i_mmap_unlock_read(mapping);
  }
  
+#ifdef CONFIG_FS_DAX
+/*
+ * Build the kill list for an error that hit a fsdax page.
+ *
+ * Every process is visited; for each one where task_early_kill() yields a
+ * task, the vmas returned by the interval-tree lookup of @pgoff in @mapping
+ * that belong to that task's mm are handed to add_to_kill(), with @pgoff as
+ * the fsdax file offset.  The i_mmap read lock is taken outside
+ * tasklist_lock and the two are released in reverse order.
+ */
+static void collect_procs_fsdax(struct page *page,
+               struct address_space *mapping, pgoff_t pgoff,
+               struct list_head *to_kill)
+{
+       struct task_struct *proc;
+       struct vm_area_struct *area;
+
+       i_mmap_lock_read(mapping);
+       read_lock(&tasklist_lock);
+       for_each_process(proc) {
+               struct task_struct *victim = task_early_kill(proc, true);
+
+               if (victim) {
+                       vma_interval_tree_foreach(area, &mapping->i_mmap,
+                                                 pgoff, pgoff) {
+                               /* only mappings owned by this task count */
+                               if (area->vm_mm != victim->mm)
+                                       continue;
+                               add_to_kill(victim, page, pgoff, area, to_kill);
+                       }
+               }
+       }
+       read_unlock(&tasklist_lock);
+       i_mmap_unlock_read(mapping);
+}
+#endif /* CONFIG_FS_DAX */
+
  /*
   * Collect the processes who have the corrupted page mapped to kill.
   */
@@ -722,7 +768,6 @@ static const char * const action_page_types[] = {
         [MF_MSG_DIFFERENT_COMPOUND]     = "different compound page after locking",
         [MF_MSG_HUGE]                   = "huge page",
         [MF_MSG_FREE_HUGE]              = "free huge page",
-       [MF_MSG_NON_PMD_HUGE]           = "non-pmd-sized huge page",
         [MF_MSG_UNMAP_FAILED]           = "unmapping failed page",
         [MF_MSG_DIRTY_SWAPCACHE]        = "dirty swapcache page",
         [MF_MSG_CLEAN_SWAPCACHE]        = "clean swapcache page",
@@ -779,12 +824,10 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
                 int err = mapping->a_ops->error_remove_page(mapping, p);
  
                 if (err != 0) {
-                       pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
-                               pfn, err);
+                       pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
                 } else if (page_has_private(p) &&
                            !try_to_release_page(p, GFP_NOIO)) {
-                       pr_info("Memory failure: %#lx: failed to release buffers\n",
-                               pfn);
+                       pr_info("%#lx: failed to release buffers\n", pfn);
                 } else {
                         ret = MF_RECOVERED;
                 }
@@ -796,8 +839,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
                 if (invalidate_inode_page(p))
                         ret = MF_RECOVERED;
                 else
-                       pr_info("Memory failure: %#lx: Failed to invalidate\n",
-                               pfn);
+                       pr_info("%#lx: Failed to invalidate\n", pfn);
         }
  
         return ret;
@@ -827,7 +869,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
                 count -= 1;
  
         if (count > 0) {
-               pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+               pr_err("%#lx: %s still referenced by %d users\n",
                        page_to_pfn(p), action_page_types[ps->type], count);
                 return true;
         }
@@ -851,7 +893,7 @@ static int me_kernel(struct page_state *ps, struct page *p)
   */
  static int me_unknown(struct page_state *ps, struct page *p)
  {
-       pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
+       pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
         unlock_page(p);
         return MF_FAILED;
  }
@@ -1007,12 +1049,13 @@ static int me_swapcache_dirty(struct page_state *ps, struct page *p)
  
  static int me_swapcache_clean(struct page_state *ps, struct page *p)
  {
+       struct folio *folio = page_folio(p);
         int ret;
  
-       delete_from_swap_cache(p);
+       delete_from_swap_cache(folio);
  
         ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
-       unlock_page(p);
+       folio_unlock(folio);
  
         if (has_extra_refcount(ps, p, false))
                 ret = MF_FAILED;
@@ -1040,7 +1083,6 @@ static int me_huge_page(struct page_state *ps, struct page *p)
                 res = truncate_error_page(hpage, page_to_pfn(p), mapping);
                 unlock_page(hpage);
         } else {
-               res = MF_FAILED;
                 unlock_page(hpage);
                 /*
                  * migration entry prevents later access on error hugepage,
@@ -1048,9 +1090,11 @@ static int me_huge_page(struct page_state *ps, struct page *p)
                  * subpages.
                  */
                 put_page(hpage);
-               if (__page_handle_poison(p)) {
+               if (__page_handle_poison(p) >= 0) {
                         page_ref_inc(p);
                         res = MF_RECOVERED;
+               } else {
+                       res = MF_FAILED;
                 }
         }
  
@@ -1135,7 +1179,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
         trace_memory_failure_event(pfn, type, result);
  
         num_poisoned_pages_inc();
-       pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
+       pr_err("%#lx: recovery action for %s: %s\n",
                 pfn, action_page_types[type], action_name[result]);
  }
  
@@ -1210,8 +1254,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
                 if (head == compound_head(page))
                         return 1;
  
-               pr_info("Memory failure: %#lx cannot catch tail\n",
-                       page_to_pfn(page));
+               pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
                 put_page(head);
         }
  
@@ -1274,7 +1317,7 @@ try_again:
         }
  out:
         if (ret == -EIO)
-               pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p));
+               pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
  
         return ret;
  }
@@ -1373,13 +1416,12 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
                 return true;
  
         if (PageKsm(p)) {
-               pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
+               pr_err("%#lx: can't handle KSM pages.\n", pfn);
                 return false;
         }
  
         if (PageSwapCache(p)) {
-               pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
-                       pfn);
+               pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
                 ttu |= TTU_IGNORE_HWPOISON;
         }
  
@@ -1397,7 +1439,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
                 } else {
                         kill = 0;
                         ttu |= TTU_IGNORE_HWPOISON;
-                       pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
+                       pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
                                 pfn);
                 }
         }
@@ -1426,14 +1468,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
                         try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
                         i_mmap_unlock_write(mapping);
                 } else
-                       pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+                       pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
         } else {
                 try_to_unmap(folio, ttu);
         }
  
         unmap_success = !page_mapped(hpage);
         if (!unmap_success)
-               pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
+               pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
                        pfn, page_mapcount(hpage));
  
         /*
@@ -1498,6 +1540,241 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
         return 0;
  }
  
+/*
+ * Unmap the poisoned range from user mappings and signal the queued tasks.
+ *
+ * @to_kill: list of struct to_kill entries (linked through ->nd), passed on
+ *           to kill_procs()
+ * @pfn:     page frame number handed to kill_procs()
+ * @mapping: address_space whose mappings are torn down
+ * @index:   page offset of the poisoned page within @mapping
+ * @flags:   MF_* flags; the MF_MUST_KILL bit becomes kill_procs()' forcekill
+ *
+ * Nothing is unmapped when no entry on @to_kill carries a size_shift.
+ */
+static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
+               struct address_space *mapping, pgoff_t index, int flags)
+{
+       struct to_kill *tk;
+       unsigned long size = 0;
+
+       list_for_each_entry(tk, to_kill, nd)
+               if (tk->size_shift)
+                       size = max(size, 1UL << tk->size_shift);
+
+       if (size) {
+               /*
+                * Unmap the largest mapping to avoid breaking up device-dax
+                * mappings which are constant size. The actual size of the
+                * mapping being torn down is communicated in siginfo, see
+                * kill_proc()
+                *
+                * Both the offset and the alignment mask are widened to
+                * loff_t before use: pgoff_t and size are unsigned long, so
+                * on 32-bit the shift would drop the high bits of the byte
+                * offset and a zero-extended mask would clear them again.
+                */
+               loff_t start = ((loff_t)index << PAGE_SHIFT) &
+                              ~((loff_t)size - 1);
+
+               unmap_mapping_range(mapping, start, size, 0);
+       }
+
+       kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
+}
+
+/*
+ * Generic memory-failure handling for a pfn that belongs to @pgmap.
+ *
+ * Returns 0 after the tasks collected for the page have been passed to
+ * unmap_and_kill(), -EBUSY when dax_lock_page() yields no cookie,
+ * -EOPNOTSUPP when hwpoison_filter() returns nonzero for the page, and
+ * -ENXIO for MEMORY_DEVICE_PRIVATE / MEMORY_DEVICE_COHERENT pagemaps.
+ */
+static int mf_generic_kill_procs(unsigned long long pfn, int flags,
+               struct dev_pagemap *pgmap)
+{
+       /*
+        * Pages instantiated by device-dax (not filesystem-dax)
+        * may be compound pages, so always work on the head page.
+        */
+       struct page *head = compound_head(pfn_to_page(pfn));
+       LIST_HEAD(victims);
+       dax_entry_t entry;
+       int err;
+
+       /*
+        * Prevent the inode from being freed while we are interrogating
+        * the address_space, typically this would be handled by
+        * lock_page(), but dax pages do not use the page lock. This
+        * also prevents changes to the mapping of this pfn until
+        * poison signaling is complete.
+        */
+       entry = dax_lock_page(head);
+       if (!entry)
+               return -EBUSY;
+
+       if (hwpoison_filter(head)) {
+               err = -EOPNOTSUPP;
+       } else if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+                  pgmap->type == MEMORY_DEVICE_COHERENT) {
+               /*
+                * TODO: Handle device pages which may need coordination
+                * with device-side memory.
+                */
+               err = -ENXIO;
+       } else {
+               /*
+                * Use this flag as an indication that the dax page has been
+                * remapped UC to prevent speculative consumption of poison.
+                */
+               SetPageHWPoison(head);
+
+               collect_procs(head, &victims, true);
+
+               /*
+                * Unlike System-RAM there is no possibility to swap in a
+                * different physical page at a given virtual address, so all
+                * userspace consumption of ZONE_DEVICE memory necessitates
+                * SIGBUS (i.e. MF_MUST_KILL)
+                */
+               unmap_and_kill(&victims, pfn, head->mapping, head->index,
+                              flags | MF_ACTION_REQUIRED | MF_MUST_KILL);
+               err = 0;
+       }
+
+       dax_unlock_page(head, entry);
+       return err;
+}
+
+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping:   address_space of the file in use
+ * @index:     start pgoff of the range within the file
+ * @count:     length of the range, in unit of PAGE_SIZE
+ * @mf_flags:  memory failure flags
+ *
+ * Each offset of the range is processed under its own dax mapping-entry
+ * lock.  Offsets for which dax_lock_mapping_entry() reports no page are
+ * skipped.
+ *
+ * Return: 0 once the whole range has been walked, -EBUSY as soon as a
+ * mapping entry cannot be locked.
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+               unsigned long count, int mf_flags)
+{
+       LIST_HEAD(victims);
+       size_t stop = index + count;
+
+       /* the whole range is handled with these two flags forced on */
+       mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+       while (index < stop) {
+               struct page *page = NULL;
+               dax_entry_t entry;
+
+               entry = dax_lock_mapping_entry(mapping, index, &page);
+               if (!entry)
+                       return -EBUSY;
+
+               if (page) {
+                       SetPageHWPoison(page);
+
+                       collect_procs_fsdax(page, mapping, index, &victims);
+                       unmap_and_kill(&victims, page_to_pfn(page), mapping,
+                                       index, mf_flags);
+               }
+
+               dax_unlock_mapping_entry(mapping, index, entry);
+               index++;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Struct raw_hwp_page represents information about "raw error page",
+ * constructing singly linked list originated from ->private field of
+ * SUBPAGE_INDEX_HWPOISON-th tail page.
+ */
+struct raw_hwp_page {
+       struct llist_node node; /* link in the hugepage's raw error list */
+       struct page *page; /* the raw error page this entry records */
+};
+
+/*
+ * Return the head of the raw error page list of @hpage.  The head is
+ * overlaid on the ->private field of the SUBPAGE_INDEX_HWPOISON-th subpage.
+ */
+static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
+{
+       struct page *holder = hpage + SUBPAGE_INDEX_HWPOISON;
+
+       return (struct llist_head *)&page_private(holder);
+}
+
+/*
+ * Free every raw_hwp_page entry recorded for @hpage and empty the list.
+ *
+ * With @move_flag set, PageHWPoison is set on each recorded page before its
+ * entry is freed.  Returns the number of entries that were freed.
+ */
+static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+       struct llist_head *list = raw_hwp_list_head(hpage);
+       struct llist_node *cur = list->first;
+       unsigned long freed = 0;
+
+       while (cur) {
+               struct raw_hwp_page *entry;
+
+               entry = container_of(cur, struct raw_hwp_page, node);
+               /* step forward first: the entry is gone after kfree() */
+               cur = cur->next;
+
+               if (move_flag)
+                       SetPageHWPoison(entry->page);
+               kfree(entry);
+               freed++;
+       }
+       llist_del_all(list);
+       return freed;
+}
+
+/*
+ * Flag @hpage as hwpoisoned and record @page on its raw error page list.
+ *
+ * Returns -EHWPOISON when @hpage already had PageHWPoison set, when its raw
+ * error info is marked unreliable, or when @page is already on the list.
+ * Returns 0 otherwise, which includes the case where the new list entry
+ * could not be allocated on a hugepage that was not yet flagged.
+ */
+static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
+{
+       struct llist_head *list;
+       struct llist_node *cur;
+       struct raw_hwp_page *entry;
+       int ret = 0;
+
+       if (TestSetPageHWPoison(hpage))
+               ret = -EHWPOISON;
+
+       /*
+        * Once the hwpoison hugepage has lost reliable raw error info,
+        * there is little meaning to keep additional error info precisely,
+        * so skip to add additional raw error info.
+        */
+       if (HPageRawHwpUnreliable(hpage))
+               return -EHWPOISON;
+
+       /* a page that is already recorded must not be added twice */
+       list = raw_hwp_list_head(hpage);
+       for (cur = list->first; cur; cur = cur->next) {
+               entry = container_of(cur, struct raw_hwp_page, node);
+               if (entry->page == page)
+                       return -EHWPOISON;
+       }
+
+       entry = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
+       if (!entry) {
+               /*
+                * Failed to save raw error info.  We no longer trace all
+                * hwpoisoned subpages, and we need refuse to free/dissolve
+                * this hwpoisoned hugepage.
+                */
+               SetHPageRawHwpUnreliable(hpage);
+               /*
+                * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
+                * used any more, so free it.
+                */
+               __free_raw_hwp_pages(hpage, false);
+               return ret;
+       }
+
+       entry->page = page;
+       llist_add(&entry->node, list);
+       /* the first error event will be counted in action_result(). */
+       if (ret)
+               num_poisoned_pages_inc();
+       return ret;
+}
+
+/*
+ * Free the raw error page list of @hpage when that is permitted.
+ *
+ * Returns 0 without touching the list when @move_flag is set on a
+ * HPageVmemmapOptimized hugepage or when the raw error info is marked
+ * unreliable; otherwise returns the result of __free_raw_hwp_pages().
+ */
+static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+       /*
+        * HPageVmemmapOptimized hugepages can't be freed because struct
+        * pages for tail pages are required but they don't exist.
+        */
+       bool no_tail_pages = move_flag && HPageVmemmapOptimized(hpage);
+
+       /*
+        * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
+        * definition.
+        */
+       if (no_tail_pages || HPageRawHwpUnreliable(hpage))
+               return 0;
+
+       return __free_raw_hwp_pages(hpage, move_flag);
+}
+
+/*
+ * Clear PageHWPoison on @hpage and release its raw error page list via
+ * free_raw_hwp_pages() with move_flag set.  Nothing is done when the raw
+ * error info of @hpage is marked unreliable.
+ */
+void hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+       if (!HPageRawHwpUnreliable(hpage)) {
+               ClearPageHWPoison(hpage);
+               free_raw_hwp_pages(hpage, true);
+       }
+}
+
  /*
   * Called from hugetlb code with hugetlb_lock held.
   *
@@ -1529,10 +1806,11 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
                         count_increased = true;
         } else {
                 ret = -EBUSY;
-               goto out;
+               if (!(flags & MF_NO_RETRY))
+                       goto out;
         }
  
-       if (TestSetPageHWPoison(head)) {
+       if (hugetlb_set_page_hwpoison(head, page)) {
                 ret = -EHWPOISON;
                 goto out;
         }
@@ -1544,7 +1822,6 @@ out:
         return ret;
  }
  
-#ifdef CONFIG_HUGETLB_PAGE
  /*
   * Taking refcount of hugetlb pages needs extra care about race conditions
   * with basic operations like hugepage allocation/free/demotion.
@@ -1557,7 +1834,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
         struct page *p = pfn_to_page(pfn);
         struct page *head;
         unsigned long page_flags;
-       bool retry = true;
  
         *hugetlb = 1;
  retry:
@@ -1566,15 +1842,15 @@ retry:
                 *hugetlb = 0;
                 return 0;
         } else if (res == -EHWPOISON) {
-               pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+               pr_err("%#lx: already hardware poisoned\n", pfn);
                 if (flags & MF_ACTION_REQUIRED) {
                         head = compound_head(p);
                         res = kill_accessing_process(current, page_to_pfn(head), flags);
                 }
                 return res;
         } else if (res == -EBUSY) {
-               if (retry) {
-                       retry = false;
+               if (!(flags & MF_NO_RETRY)) {
+                       flags |= MF_NO_RETRY;
                         goto retry;
                 }
                 action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
@@ -1585,7 +1861,7 @@ retry:
         lock_page(head);
  
         if (hwpoison_filter(p)) {
-               ClearPageHWPoison(head);
+               hugetlb_clear_page_hwpoison(head);
                 res = -EOPNOTSUPP;
                 goto out;
         }
@@ -1596,10 +1872,11 @@ retry:
          */
         if (res == 0) {
                 unlock_page(head);
-               res = MF_FAILED;
-               if (__page_handle_poison(p)) {
+               if (__page_handle_poison(p) >= 0) {
                         page_ref_inc(p);
                         res = MF_RECOVERED;
+               } else {
+                       res = MF_FAILED;
                 }
                 action_result(pfn, MF_MSG_FREE_HUGE, res);
                 return res == MF_RECOVERED ? 0 : -EBUSY;
@@ -1607,21 +1884,6 @@ retry:
  
         page_flags = head->flags;
  
-       /*
-        * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
-        * simply disable it. In order to make it work properly, we need
-        * make sure that:
-        *  - conversion of a pud that maps an error hugetlb into hwpoison
-        *    entry properly works, and
-        *  - other mm code walking over page table is aware of pud-aligned
-        *    hwpoison entries.
-        */
-       if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
-               action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
-               res = -EBUSY;
-               goto out;
-       }
-
         if (!hwpoison_user_mappings(p, pfn, flags, head)) {
                 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
                 res = -EBUSY;
@@ -1633,23 +1895,24 @@ out:
         unlock_page(head);
         return res;
  }
+
  #else
  static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
  {
         return 0;
  }
-#endif
+
+/* !CONFIG_HUGETLB_PAGE stub: always reports 0 entries freed. */
+static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
+{
+       return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
  
  static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                 struct dev_pagemap *pgmap)
  {
         struct page *page = pfn_to_page(pfn);
-       unsigned long size = 0;
-       struct to_kill *tk;
-       LIST_HEAD(tokill);
-       int rc = -EBUSY;
-       loff_t start;
-       dax_entry_t cookie;
+       int rc = -ENXIO;
  
         if (flags & MF_COUNT_INCREASED)
                 /*
@@ -1658,73 +1921,24 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                 put_page(page);
  
         /* device metadata space is not recoverable */
-       if (!pgmap_pfn_valid(pgmap, pfn)) {
-               rc = -ENXIO;
+       if (!pgmap_pfn_valid(pgmap, pfn))
                 goto out;
-       }
  
         /*
-        * Pages instantiated by device-dax (not filesystem-dax)
-        * may be compound pages.
+        * Call driver's implementation to handle the memory failure, otherwise
+        * fall back to generic handler.
          */
-       page = compound_head(page);
-
-       /*
-        * Prevent the inode from being freed while we are interrogating
-        * the address_space, typically this would be handled by
-        * lock_page(), but dax pages do not use the page lock. This
-        * also prevents changes to the mapping of this pfn until
-        * poison signaling is complete.
-        */
-       cookie = dax_lock_page(page);
-       if (!cookie)
-               goto out;
-
-       if (hwpoison_filter(page)) {
-               rc = -EOPNOTSUPP;
-               goto unlock;
-       }
-
-       if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+       if (pgmap->ops->memory_failure) {
+               rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
                 /*
-                * TODO: Handle HMM pages which may need coordination
-                * with device-side memory.
+                * Fall back to generic handler too if operation is not
+                * supported inside the driver/device/filesystem.
                  */
-               goto unlock;
+               if (rc != -EOPNOTSUPP)
+                       goto out;
         }
  
-       /*
-        * Use this flag as an indication that the dax page has been
-        * remapped UC to prevent speculative consumption of poison.
-        */
-       SetPageHWPoison(page);
-
-       /*
-        * Unlike System-RAM there is no possibility to swap in a
-        * different physical page at a given virtual address, so all
-        * userspace consumption of ZONE_DEVICE memory necessitates
-        * SIGBUS (i.e. MF_MUST_KILL)
-        */
-       flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
-       collect_procs(page, &tokill, true);
-
-       list_for_each_entry(tk, &tokill, nd)
-               if (tk->size_shift)
-                       size = max(size, 1UL << tk->size_shift);
-       if (size) {
-               /*
-                * Unmap the largest mapping to avoid breaking up
-                * device-dax mappings which are constant size. The
-                * actual size of the mapping being torn down is
-                * communicated in siginfo, see kill_proc()
-                */
-               start = (page->index << PAGE_SHIFT) & ~(size - 1);
-               unmap_mapping_range(page->mapping, start, size, 0);
-       }
-       kill_procs(&tokill, true, false, pfn, flags);
-       rc = 0;
-unlock:
-       dax_unlock_page(page, cookie);
+       rc = mf_generic_kill_procs(pfn, flags, pgmap);
  out:
         /* drop pgmap ref acquired in caller */
         put_dev_pagemap(pgmap);
@@ -1787,8 +2001,7 @@ int memory_failure(unsigned long pfn, int flags)
                                 goto unlock_mutex;
                         }
                 }
-               pr_err("Memory failure: %#lx: memory outside kernel control\n",
-                       pfn);
+               pr_err("%#lx: memory outside kernel control\n", pfn);
                 res = -ENXIO;
                 goto unlock_mutex;
         }
@@ -1799,8 +2012,7 @@ try_again:
                 goto unlock_mutex;
  
         if (TestSetPageHWPoison(p)) {
-               pr_err("Memory failure: %#lx: already hardware poisoned\n",
-                       pfn);
+               pr_err("%#lx: already hardware poisoned\n", pfn);
                 res = -EHWPOISON;
                 if (flags & MF_ACTION_REQUIRED)
                         res = kill_accessing_process(current, pfn, flags);
@@ -2016,7 +2228,7 @@ void memory_failure_queue(unsigned long pfn, int flags)
         if (kfifo_put(&mf_cpu->fifo, entry))
                 schedule_work_on(smp_processor_id(), &mf_cpu->work);
         else
-               pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+               pr_err("buffer overflow when queuing memory failure at %#lx\n",
                        pfn);
         spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
         put_cpu_var(memory_failure_cpu);
@@ -2073,6 +2285,8 @@ static int __init memory_failure_init(void)
  }
  core_initcall(memory_failure_init);
  
+#undef pr_fmt
+#define pr_fmt(fmt)    "" fmt
  #define unpoison_pr_info(fmt, pfn, rs)                 \
  ({                                                     \
         if (__ratelimit(rs))                            \
@@ -2097,6 +2311,7 @@ int unpoison_memory(unsigned long pfn)
         struct page *p;
         int ret = -EBUSY;
         int freeit = 0;
+       unsigned long count = 1;
         static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
                                         DEFAULT_RATELIMIT_BURST);
  
@@ -2144,6 +2359,13 @@ int unpoison_memory(unsigned long pfn)
  
         ret = get_hwpoison_page(p, MF_UNPOISON);
         if (!ret) {
+               if (PageHuge(p)) {
+                       count = free_raw_hwp_pages(page, false);
+                       if (count == 0) {
+                               ret = -EBUSY;
+                               goto unlock_mutex;
+                       }
+               }
                 ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
         } else if (ret < 0) {
                 if (ret == -EHWPOISON) {
@@ -2152,6 +2374,13 @@ int unpoison_memory(unsigned long pfn)
                         unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
                                          pfn, &unpoison_rs);
         } else {
+               if (PageHuge(p)) {
+                       count = free_raw_hwp_pages(page, false);
+                       if (count == 0) {
+                               ret = -EBUSY;
+                               goto unlock_mutex;
+                       }
+               }
                 freeit = !!TestClearPageHWPoison(p);
  
                 put_page(page);
@@ -2164,7 +2393,7 @@ int unpoison_memory(unsigned long pfn)
  unlock_mutex:
         mutex_unlock(&mf_mutex);
         if (!ret || freeit) {
-               num_poisoned_pages_dec();
+               num_poisoned_pages_sub(count);
                 unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
                                  page_to_pfn(p), &unpoison_rs);
         }
@@ -2178,7 +2407,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
         bool lru = PageLRU(page);
  
         if (PageHuge(page)) {
-               isolated = isolate_huge_page(page, pagelist);
+               isolated = !isolate_hugetlb(page, pagelist);
         } else {
                 if (lru)
                         isolated = !isolate_lru_page(page);