Merge branch 'akpm' (patches from Andrew)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 22 Apr 2022 17:10:43 +0000 (10:10 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 22 Apr 2022 17:10:43 +0000 (10:10 -0700)
Merge misc fixes from Andrew Morton:
 "13 patches.

  Subsystems affected by this patch series: mm (memory-failure, memcg,
  userfaultfd, hugetlbfs, mremap, oom-kill, kasan, hmm), and kcov"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove()
  kcov: don't generate a warning on vm_insert_page()'s failure
  MAINTAINERS: add Vincenzo Frascino to KASAN reviewers
  oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup
  selftest/vm: add skip support to mremap_test
  selftest/vm: support xfail in mremap_test
  selftest/vm: verify remap destination address in mremap_test
  selftest/vm: verify mmap addr in mremap_test
  mm, hugetlb: allow for "high" userspace addresses
  userfaultfd: mark uffd_wp regardless of VM_WRITE flag
  memcg: sync flush only if periodic flush is delayed
  mm/memory-failure.c: skip huge_zero_page in memory_failure()
  mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()

18 files changed:
MAINTAINERS
fs/hugetlbfs/inode.c
include/linux/hugetlb.h
include/linux/memcontrol.h
include/linux/mm.h
include/linux/sched.h
include/linux/sched/mm.h
kernel/kcov.c
mm/hugetlb.c
mm/memcontrol.c
mm/memory-failure.c
mm/mmap.c
mm/mmu_notifier.c
mm/oom_kill.c
mm/userfaultfd.c
mm/workingset.c
tools/testing/selftests/vm/mremap_test.c
tools/testing/selftests/vm/run_vmtests.sh

index d21963b..5e8c2f6 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10547,6 +10547,7 @@ M:      Andrey Ryabinin <ryabinin.a.a@gmail.com>
 R:     Alexander Potapenko <glider@google.com>
 R:     Andrey Konovalov <andreyknvl@gmail.com>
 R:     Dmitry Vyukov <dvyukov@google.com>
+R:     Vincenzo Frascino <vincenzo.frascino@arm.com>
 L:     kasan-dev@googlegroups.com
 S:     Maintained
 F:     Documentation/dev-tools/kasan.rst
index 99c7477..dd3a088 100644 (file)
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
        info.flags = 0;
        info.length = len;
        info.low_limit = current->mm->mmap_base;
-       info.high_limit = TASK_SIZE;
+       info.high_limit = arch_get_mmap_end(addr);
        info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        return vm_unmapped_area(&info);
@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        info.low_limit = max(PAGE_SIZE, mmap_min_addr);
-       info.high_limit = current->mm->mmap_base;
+       info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
        info.align_mask = PAGE_MASK & ~huge_page_mask(h);
        info.align_offset = 0;
        addr = vm_unmapped_area(&info);
@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
                VM_BUG_ON(addr != -ENOMEM);
                info.flags = 0;
                info.low_limit = current->mm->mmap_base;
-               info.high_limit = TASK_SIZE;
+               info.high_limit = arch_get_mmap_end(addr);
                addr = vm_unmapped_area(&info);
        }
 
@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        struct hstate *h = hstate_file(file);
+       const unsigned long mmap_end = arch_get_mmap_end(addr);
 
        if (len & ~huge_page_mask(h))
                return -EINVAL;
@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        if (addr) {
                addr = ALIGN(addr, huge_page_size(h));
                vma = find_vma(mm, addr);
-               if (TASK_SIZE - len >= addr &&
+               if (mmap_end - len >= addr &&
                    (!vma || addr + len <= vm_start_gap(vma)))
                        return addr;
        }
index 53c1b60..ac2a1d7 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -169,6 +169,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                long freed);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags);
 void putback_active_hugepage(struct page *page);
 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
@@ -378,6 +379,11 @@ static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
        return 0;
 }
 
+static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+       return 0;
+}
+
 static inline void putback_active_hugepage(struct page *page)
 {
 }
index a68dce3..89b1472 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1012,6 +1012,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 }
 
 void mem_cgroup_flush_stats(void);
+void mem_cgroup_flush_stats_delayed(void);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                              int val);
@@ -1455,6 +1456,10 @@ static inline void mem_cgroup_flush_stats(void)
 {
 }
 
+static inline void mem_cgroup_flush_stats_delayed(void)
+{
+}
+
 static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
                                            enum node_stat_item idx, int val)
 {
index e34edb7..9f44254 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3197,6 +3197,14 @@ extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+#else
+static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+       return 0;
+}
+#endif
 
 #ifndef arch_memory_failure
 static inline int arch_memory_failure(unsigned long pfn, int flags)
index d5e3c00..a8911b1 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1443,6 +1443,7 @@ struct task_struct {
        int                             pagefault_disabled;
 #ifdef CONFIG_MMU
        struct task_struct              *oom_reaper_list;
+       struct timer_list               oom_reaper_timer;
 #endif
 #ifdef CONFIG_VMAP_STACK
        struct vm_struct                *stack_vm_area;
index a80356e..1ad1f4b 100644 (file)
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -136,6 +136,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 #endif /* CONFIG_MEMCG */
 
 #ifdef CONFIG_MMU
+#ifndef arch_get_mmap_end
+#define arch_get_mmap_end(addr)        (TASK_SIZE)
+#endif
+
+#ifndef arch_get_mmap_base
+#define arch_get_mmap_base(addr, base) (base)
+#endif
+
 extern void arch_pick_mmap_layout(struct mm_struct *mm,
                                  struct rlimit *rlim_stack);
 extern unsigned long
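
A minimal userspace sketch (not kernel code; assumes a 64-bit build, and
TASK_SIZE_52BIT plus the hint rule are purely illustrative) of how these
#ifndef defaults work: generic code always consults the hooks, and an
architecture that supports a larger address space overrides them, much as
arm64 does for its 52-bit VA configurations.

#include <stdio.h>

#define TASK_SIZE       (1UL << 47)     /* illustrative default limit */
#define TASK_SIZE_52BIT (1UL << 52)     /* hypothetical extended limit */

/* An "arch" opts in by defining the hook before the generic defaults:
 * here, a hint above the default limit unlocks the extended range. */
#define arch_get_mmap_end(addr) \
        ((addr) > TASK_SIZE ? TASK_SIZE_52BIT : TASK_SIZE)

/* Generic fallbacks, as added to sched/mm.h above. */
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr)        (TASK_SIZE)
#endif

#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif

int main(void)
{
        printf("high_limit(low hint)  = %#lx\n",
               arch_get_mmap_end(0x10000UL));
        printf("high_limit(high hint) = %#lx\n",
               arch_get_mmap_end(TASK_SIZE + 0x1000UL));
        return 0;
}

This is why the hugetlbfs hunks above replace the hard-coded TASK_SIZE and
mmap_base limits: with the defaults, behavior is unchanged, but an
overriding architecture now gets the same "high address" handling for
hugetlb mappings as for normal mmap().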
index 475524b..b3732b2 100644 (file)
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -475,8 +475,11 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
        vma->vm_flags |= VM_DONTEXPAND;
        for (off = 0; off < size; off += PAGE_SIZE) {
                page = vmalloc_to_page(kcov->area + off);
-               if (vm_insert_page(vma, vma->vm_start + off, page))
-                       WARN_ONCE(1, "vm_insert_page() failed");
+               res = vm_insert_page(vma, vma->vm_start + off, page);
+               if (res) {
+                       pr_warn_once("kcov: vm_insert_page() failed\n");
+                       return res;
+               }
        }
        return 0;
 exit:
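
The shape of the kcov fix, as a hedged userspace sketch (map_one_page() is
a hypothetical stand-in for vm_insert_page()): warn once on the first
failure and propagate the error, instead of warning and continuing with a
partially mapped region.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL

/* Hypothetical per-page mapper; returns 0 or a negative errno value. */
static int map_one_page(size_t off)
{
        return off >= (1UL << 20) ? -12 /* -ENOMEM */ : 0;
}

static int map_region(size_t size)
{
        static int warned;
        size_t off;
        int res;

        for (off = 0; off < size; off += PAGE_SIZE) {
                res = map_one_page(off);
                if (res) {
                        if (!warned) {  /* pr_warn_once() analogue */
                                fprintf(stderr, "map_one_page() failed: %d\n",
                                        res);
                                warned = 1;
                        }
                        return res;     /* propagate, don't press on */
                }
        }
        return 0;
}

int main(void)
{
        return map_region(2UL << 20) ? 1 : 0;
}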
index f8ca7cc..3fc7217 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6785,6 +6785,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
        return ret;
 }
 
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+       int ret;
+
+       spin_lock_irq(&hugetlb_lock);
+       ret = __get_huge_page_for_hwpoison(pfn, flags);
+       spin_unlock_irq(&hugetlb_lock);
+       return ret;
+}
+
 void putback_active_hugepage(struct page *page)
 {
        spin_lock_irq(&hugetlb_lock);
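
get_huge_page_for_hwpoison() follows the common kernel locked-wrapper
idiom: the double-underscore worker assumes the lock is held, and the
plain-named wrapper takes it. A userspace sketch of the same shape with a
pthread mutex (all names hypothetical):

#include <pthread.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static int table_state;

/* Caller must hold table_lock (hence the __ prefix). */
static int __table_query(int key)
{
        return table_state == key;
}

/* Takes the lock and delegates; callers that already hold the lock
 * (as hugetlb code does with hugetlb_lock) use the __ variant. */
static int table_query(int key)
{
        int ret;

        pthread_mutex_lock(&table_lock);
        ret = __table_query(key);
        pthread_mutex_unlock(&table_lock);
        return ret;
}

int main(void)
{
        table_state = 42;
        return table_query(42) ? 0 : 1;
}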
index 725f767..598fece 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -587,6 +587,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
 static DEFINE_SPINLOCK(stats_flush_lock);
 static DEFINE_PER_CPU(unsigned int, stats_updates);
 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+static u64 flush_next_time;
+
+#define FLUSH_TIME (2UL*HZ)
 
 /*
  * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
@@ -637,6 +640,7 @@ static void __mem_cgroup_flush_stats(void)
        if (!spin_trylock_irqsave(&stats_flush_lock, flag))
                return;
 
+       flush_next_time = jiffies_64 + 2*FLUSH_TIME;
        cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
        atomic_set(&stats_flush_threshold, 0);
        spin_unlock_irqrestore(&stats_flush_lock, flag);
@@ -648,10 +652,16 @@ void mem_cgroup_flush_stats(void)
                __mem_cgroup_flush_stats();
 }
 
+void mem_cgroup_flush_stats_delayed(void)
+{
+       if (time_after64(jiffies_64, flush_next_time))
+               mem_cgroup_flush_stats();
+}
+
 static void flush_memcg_stats_dwork(struct work_struct *w)
 {
        __mem_cgroup_flush_stats();
-       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }
 
 /**
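
The memcg change above lets the hot refault path flush only when the
periodic flusher looks overdue: every flush pushes flush_next_time two
periods ahead, and mem_cgroup_flush_stats_delayed() compares against it.
A userspace sketch of that ratelimit with a monotonic clock (the 2 s
period mirrors FLUSH_TIME; everything else is illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define FLUSH_PERIOD_NS 2000000000ULL   /* 2 s, like FLUSH_TIME (2UL*HZ) */

static _Atomic unsigned long long flush_next_ns;

static unsigned long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void expensive_flush(void)
{
        /* Push the deadline out two periods so racing callers back off. */
        atomic_store(&flush_next_ns, now_ns() + 2 * FLUSH_PERIOD_NS);
        puts("flushed");
}

/* Hot path: only pay for a flush if the periodic worker appears late. */
static void maybe_flush_delayed(void)
{
        if (now_ns() > atomic_load(&flush_next_ns))
                expensive_flush();
}

int main(void)
{
        maybe_flush_delayed();          /* deadline is 0: flushes */
        maybe_flush_delayed();          /* inside the window: skipped */
        return 0;
}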
index dcb6bb9..27760c1 100644 (file)
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1498,50 +1498,113 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
        return 0;
 }
 
-static int memory_failure_hugetlb(unsigned long pfn, int flags)
+/*
+ * Called from hugetlb code with hugetlb_lock held.
+ *
+ * Return values:
+ *   0             - free hugepage
+ *   1             - in-use hugepage
+ *   2             - not a hugepage
+ *   -EBUSY        - the hugepage is busy (try to retry)
+ *   -EHWPOISON    - the hugepage is already hwpoisoned
+ */
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+       struct page *page = pfn_to_page(pfn);
+       struct page *head = compound_head(page);
+       int ret = 2;    /* fallback to normal page handling */
+       bool count_increased = false;
+
+       if (!PageHeadHuge(head))
+               goto out;
+
+       if (flags & MF_COUNT_INCREASED) {
+               ret = 1;
+               count_increased = true;
+       } else if (HPageFreed(head) || HPageMigratable(head)) {
+               ret = get_page_unless_zero(head);
+               if (ret)
+                       count_increased = true;
+       } else {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       if (TestSetPageHWPoison(head)) {
+               ret = -EHWPOISON;
+               goto out;
+       }
+
+       return ret;
+out:
+       if (count_increased)
+               put_page(head);
+       return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Taking a refcount on a hugetlb page needs extra care about races
+ * with basic operations like hugepage allocation/free/demotion.
+ * So some of the prechecks for hwpoison (pinning, and testing/setting
+ * PageHWPoison) should be done under a single hugetlb_lock section.
+ */
+static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
 {
-       struct page *p = pfn_to_page(pfn);
-       struct page *head = compound_head(p);
        int res;
+       struct page *p = pfn_to_page(pfn);
+       struct page *head;
        unsigned long page_flags;
+       bool retry = true;
 
-       if (TestSetPageHWPoison(head)) {
-               pr_err("Memory failure: %#lx: already hardware poisoned\n",
-                      pfn);
-               res = -EHWPOISON;
-               if (flags & MF_ACTION_REQUIRED)
+       *hugetlb = 1;
+retry:
+       res = get_huge_page_for_hwpoison(pfn, flags);
+       if (res == 2) { /* fallback to normal page handling */
+               *hugetlb = 0;
+               return 0;
+       } else if (res == -EHWPOISON) {
+               pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+               if (flags & MF_ACTION_REQUIRED) {
+                       head = compound_head(p);
                        res = kill_accessing_process(current, page_to_pfn(head), flags);
+               }
+               return res;
+       } else if (res == -EBUSY) {
+               if (retry) {
+                       retry = false;
+                       goto retry;
+               }
+               action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
                return res;
        }
 
+       head = compound_head(p);
+       lock_page(head);
+
+       if (hwpoison_filter(p)) {
+               ClearPageHWPoison(head);
+               res = -EOPNOTSUPP;
+               goto out;
+       }
+
        num_poisoned_pages_inc();
 
-       if (!(flags & MF_COUNT_INCREASED)) {
-               res = get_hwpoison_page(p, flags);
-               if (!res) {
-                       lock_page(head);
-                       if (hwpoison_filter(p)) {
-                               if (TestClearPageHWPoison(head))
-                                       num_poisoned_pages_dec();
-                               unlock_page(head);
-                               return -EOPNOTSUPP;
-                       }
-                       unlock_page(head);
-                       res = MF_FAILED;
-                       if (__page_handle_poison(p)) {
-                               page_ref_inc(p);
-                               res = MF_RECOVERED;
-                       }
-                       action_result(pfn, MF_MSG_FREE_HUGE, res);
-                       return res == MF_RECOVERED ? 0 : -EBUSY;
-               } else if (res < 0) {
-                       action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
-                       return -EBUSY;
+       /*
+        * Handling a free hugepage.  The possible race with hugepage
+        * allocation or demotion can be prevented by the PageHWPoison flag.
+        */
+       if (res == 0) {
+               unlock_page(head);
+               res = MF_FAILED;
+               if (__page_handle_poison(p)) {
+                       page_ref_inc(p);
+                       res = MF_RECOVERED;
                }
+               action_result(pfn, MF_MSG_FREE_HUGE, res);
+               return res == MF_RECOVERED ? 0 : -EBUSY;
        }
 
-       lock_page(head);
-
        /*
         * The page could have changed compound pages due to a race window.
         * If this happens, just bail out.
@@ -1554,14 +1617,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 
        page_flags = head->flags;
 
-       if (hwpoison_filter(p)) {
-               if (TestClearPageHWPoison(head))
-                       num_poisoned_pages_dec();
-               put_page(p);
-               res = -EOPNOTSUPP;
-               goto out;
-       }
-
        /*
         * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
         * simply disable it. In order to make it work properly, we need
@@ -1588,6 +1643,12 @@ out:
        unlock_page(head);
        return res;
 }
+#else
+static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+       return 0;
+}
+#endif
 
 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                struct dev_pagemap *pgmap)
@@ -1712,6 +1773,7 @@ int memory_failure(unsigned long pfn, int flags)
        int res = 0;
        unsigned long page_flags;
        bool retry = true;
+       int hugetlb = 0;
 
        if (!sysctl_memory_failure_recovery)
                panic("Memory failure on page %lx", pfn);
@@ -1739,10 +1801,9 @@ int memory_failure(unsigned long pfn, int flags)
        }
 
 try_again:
-       if (PageHuge(p)) {
-               res = memory_failure_hugetlb(pfn, flags);
+       res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
+       if (hugetlb)
                goto unlock_mutex;
-       }
 
        if (TestSetPageHWPoison(p)) {
                pr_err("Memory failure: %#lx: already hardware poisoned\n",
@@ -1799,6 +1860,19 @@ try_again:
        }
 
        if (PageTransHuge(hpage)) {
+               /*
+                * Bail out before SetPageHasHWPoisoned() if hpage is
+                * huge_zero_page, although PG_has_hwpoisoned is not
+                * checked in set_huge_zero_page().
+                *
+                * TODO: Handle memory failure of huge_zero_page thoroughly.
+                */
+               if (is_huge_zero_page(hpage)) {
+                       action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
+                       res = -EBUSY;
+                       goto unlock_mutex;
+               }
+
                /*
                 * The flag must be set after the refcount is bumped
                 * otherwise it may race with THP split.
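
try_memory_failure_hugetlb() above retries exactly once on -EBUSY: the
busy window (a racing hugepage free or demotion) is expected to be short,
and a second failure is treated as a real error. A self-contained sketch
of that retry-once shape (grab_resource() is a hypothetical stand-in):

#include <errno.h>
#include <stdbool.h>

/* Hypothetical acquisition that is busy on the first attempt only. */
static int grab_resource(unsigned long id)
{
        static int calls;

        (void)id;
        return calls++ == 0 ? -EBUSY : 0;
}

/* Retry exactly once rather than looping indefinitely: transient
 * states should clear quickly, persistent ones are reported. */
static int grab_with_one_retry(unsigned long id)
{
        bool retry = true;
        int res;

retry:
        res = grab_resource(id);
        if (res == -EBUSY && retry) {
                retry = false;
                goto retry;
        }
        return res;
}

int main(void)
{
        return grab_with_one_retry(0);  /* succeeds on the retry */
}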
index 3aa839f..313b57d 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2117,14 +2117,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
        return addr;
 }
 
-#ifndef arch_get_mmap_end
-#define arch_get_mmap_end(addr)        (TASK_SIZE)
-#endif
-
-#ifndef arch_get_mmap_base
-#define arch_get_mmap_base(addr, base) (base)
-#endif
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
index 459d195..f45ff1b 100644 (file)
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked(
 }
 EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
 
+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+                         unsigned long seq)
+{
+       bool ret;
+
+       spin_lock(&subscriptions->lock);
+       ret = subscriptions->invalidate_seq != seq;
+       spin_unlock(&subscriptions->lock);
+       return ret;
+}
+
 /**
  * mmu_interval_notifier_remove - Remove an interval notifier
  * @interval_sub: Interval subscription to unregister
@@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
        if (seq)
                wait_event(subscriptions->wq,
-                          READ_ONCE(subscriptions->invalidate_seq) != seq);
+                          mmu_interval_seq_released(subscriptions, seq));
 
        /* pairs with mmgrab in mmu_interval_notifier_insert() */
        mmdrop(mm);
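
The mmu_notifier fix moves the wait_event() predicate from a lockless
READ_ONCE() to a read under subscriptions->lock, so the remover cannot
observe the new sequence and return (tearing down its state) while
mn_itree_inv_end() is still inside its locked region. A pthread sketch of
the same discipline, with illustrative names:

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static unsigned long invalidate_seq;

/* Writer: bumps the sequence and wakes waiters under the lock, so any
 * waiter that sees the new value knows this critical section is done. */
static void inv_end(void)
{
        pthread_mutex_lock(&lock);
        invalidate_seq++;
        pthread_cond_broadcast(&wq);
        pthread_mutex_unlock(&lock);
}

/* Waiter: the predicate is evaluated under the writer's lock. A lockless
 * read could let the waiter return, and free shared state, while the
 * writer is still mid-update. */
static void wait_seq_released(unsigned long seq)
{
        pthread_mutex_lock(&lock);
        while (invalidate_seq == seq)
                pthread_cond_wait(&wq, &lock);
        pthread_mutex_unlock(&lock);
}

static void *writer(void *arg)
{
        (void)arg;
        usleep(1000);
        inv_end();
        return NULL;
}

int main(void)
{
        pthread_t t;
        unsigned long seq = invalidate_seq;

        pthread_create(&t, NULL, writer, NULL);
        wait_seq_released(seq);
        pthread_join(t, NULL);
        return 0;
}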
index 7ec3819..49d7df3 100644 (file)
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -632,7 +632,7 @@ done:
         */
        set_bit(MMF_OOM_SKIP, &mm->flags);
 
-       /* Drop a reference taken by wake_oom_reaper */
+       /* Drop a reference taken by queue_oom_reaper */
        put_task_struct(tsk);
 }
 
@@ -644,12 +644,12 @@ static int oom_reaper(void *unused)
                struct task_struct *tsk = NULL;
 
                wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-               spin_lock(&oom_reaper_lock);
+               spin_lock_irq(&oom_reaper_lock);
                if (oom_reaper_list != NULL) {
                        tsk = oom_reaper_list;
                        oom_reaper_list = tsk->oom_reaper_list;
                }
-               spin_unlock(&oom_reaper_lock);
+               spin_unlock_irq(&oom_reaper_lock);
 
                if (tsk)
                        oom_reap_task(tsk);
@@ -658,22 +658,48 @@ static int oom_reaper(void *unused)
        return 0;
 }
 
-static void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct timer_list *timer)
 {
-       /* mm is already queued? */
-       if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
-               return;
+       struct task_struct *tsk = container_of(timer, struct task_struct,
+                       oom_reaper_timer);
+       struct mm_struct *mm = tsk->signal->oom_mm;
+       unsigned long flags;
 
-       get_task_struct(tsk);
+       /* The victim managed to terminate on its own - see exit_mmap */
+       if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+               put_task_struct(tsk);
+               return;
+       }
 
-       spin_lock(&oom_reaper_lock);
+       spin_lock_irqsave(&oom_reaper_lock, flags);
        tsk->oom_reaper_list = oom_reaper_list;
        oom_reaper_list = tsk;
-       spin_unlock(&oom_reaper_lock);
+       spin_unlock_irqrestore(&oom_reaper_lock, flags);
        trace_wake_reaper(tsk->pid);
        wake_up(&oom_reaper_wait);
 }
 
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom_reaper.
+ * The timer's timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too short, the oom_reaper can
+ * get in the way and release resources needed by the process exit path,
+ * e.g. the futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
+{
+       /* mm is already queued? */
+       if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+               return;
+
+       get_task_struct(tsk);
+       timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+       tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+       add_timer(&tsk->oom_reaper_timer);
+}
+
 static int __init oom_init(void)
 {
        oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -681,7 +707,7 @@ static int __init oom_init(void)
 }
 subsys_initcall(oom_init)
 #else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk)
 {
 }
 #endif /* CONFIG_MMU */
@@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
        rcu_read_unlock();
 
        if (can_oom_reap)
-               wake_oom_reaper(victim);
+               queue_oom_reaper(victim);
 
        mmdrop(mm);
        put_task_struct(victim);
@@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
        task_lock(victim);
        if (task_will_free_mem(victim)) {
                mark_oom_victim(victim);
-               wake_oom_reaper(victim);
+               queue_oom_reaper(victim);
                task_unlock(victim);
                put_task_struct(victim);
                return;
@@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc)
         */
        if (task_will_free_mem(current)) {
                mark_oom_victim(current);
-               wake_oom_reaper(current);
+               queue_oom_reaper(current);
                return true;
        }
 
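queue_oom_reaper() above replaces the immediate queue-and-wake with a
timer: the victim gets OOM_REAPER_DELAY to exit on its own, and only when
the timer fires does wake_oom_reaper() push it onto the reaper list (now
under spin_lock_irqsave(), since the handler runs in interrupt context).
A userspace sketch of the delay-then-hand-off pattern with a POSIX timer
(all names hypothetical; link with -lrt on older glibc):

#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  reaper_wait = PTHREAD_COND_INITIALIZER;
static int pending;                     /* stand-in for oom_reaper_list */

/* Fires after the delay: move the victim onto the list and wake the
 * worker, mirroring wake_oom_reaper() above. */
static void timer_fired(union sigval sv)
{
        (void)sv;
        pthread_mutex_lock(&list_lock);
        pending = 1;
        pthread_cond_signal(&reaper_wait);
        pthread_mutex_unlock(&list_lock);
}

#define REAP_DELAY_SEC 2                /* like OOM_REAPER_DELAY (2*HZ) */

static void queue_reaper(timer_t *t)
{
        struct sigevent sev = {
                .sigev_notify = SIGEV_THREAD,
                .sigev_notify_function = timer_fired,
        };
        struct itimerspec its = { .it_value.tv_sec = REAP_DELAY_SEC };

        timer_create(CLOCK_MONOTONIC, &sev, t);
        timer_settime(*t, 0, &its, NULL);
}

int main(void)
{
        timer_t t;

        queue_reaper(&t);
        pthread_mutex_lock(&list_lock);
        while (!pending)                /* worker: wait for a victim */
                pthread_cond_wait(&reaper_wait, &list_lock);
        pthread_mutex_unlock(&list_lock);
        puts("reaping after grace period");
        return 0;
}
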
index 0cb8e5e..e9bb6db 100644 (file)
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -72,12 +72,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
        _dst_pte = pte_mkdirty(_dst_pte);
        if (page_in_cache && !vm_shared)
                writable = false;
-       if (writable) {
-               if (wp_copy)
-                       _dst_pte = pte_mkuffd_wp(_dst_pte);
-               else
-                       _dst_pte = pte_mkwrite(_dst_pte);
-       }
+
+       /*
+        * Always mark a PTE as write-protected when needed, regardless of
+        * VM_WRITE, which the user might change.
+        */
+       if (wp_copy)
+               _dst_pte = pte_mkuffd_wp(_dst_pte);
+       else if (writable)
+               _dst_pte = pte_mkwrite(_dst_pte);
 
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
 
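The userfaultfd hunk changes which condition gates the uffd-wp bit:
previously a write-protect copy into a VMA without VM_WRITE set no marker
at all, so a later mprotect(PROT_WRITE) would silently lose the
protection. A small sketch of the before/after decision (the flag bits
are made up):

#include <assert.h>
#include <stdbool.h>

#define PTE_WRITE   0x1
#define PTE_UFFD_WP 0x2

/* Old logic: uffd-wp only considered when the PTE was to be writable. */
static unsigned int pte_flags_old(bool writable, bool wp_copy)
{
        unsigned int f = 0;

        if (writable)
                f |= wp_copy ? PTE_UFFD_WP : PTE_WRITE;
        return f;
}

/* New logic: the write-protect marker wins regardless of writability. */
static unsigned int pte_flags_new(bool writable, bool wp_copy)
{
        unsigned int f = 0;

        if (wp_copy)
                f |= PTE_UFFD_WP;
        else if (writable)
                f |= PTE_WRITE;
        return f;
}

int main(void)
{
        /* Read-only VMA + wp request: the old code dropped the marker. */
        assert(pte_flags_old(false, true) == 0);
        assert(pte_flags_new(false, true) == PTE_UFFD_WP);
        return 0;
}
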
index 8a3828a..592569a 100644 (file)
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -355,7 +355,7 @@ void workingset_refault(struct folio *folio, void *shadow)
 
        mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
 
-       mem_cgroup_flush_stats();
+       mem_cgroup_flush_stats_delayed();
        /*
         * Compare the distance to the existing workingset size. We
         * don't activate pages that couldn't stay resident even if
index 7c0b061..db02701 100644 (file)
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/vm/mremap_test.c
@@ -6,9 +6,11 @@
 
 #include <errno.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <time.h>
+#include <stdbool.h>
 
 #include "../kselftest.h"
 
@@ -63,6 +65,59 @@ enum {
        .expect_failure = should_fail                           \
 }
 
+/*
+ * Returns false if the requested remap region overlaps with an
+ * existing mapping (e.g. text, stack); otherwise returns true.
+ */
+static bool is_remap_region_valid(void *addr, unsigned long long size)
+{
+       void *remap_addr = NULL;
+       bool ret = true;
+
+       /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */
+       remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE,
+                                        MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+                                        -1, 0);
+
+       if (remap_addr == MAP_FAILED) {
+               if (errno == EEXIST)
+                       ret = false;
+       } else {
+               munmap(remap_addr, size);
+       }
+
+       return ret;
+}
+
+/* Returns the mmap_min_addr sysctl tunable from procfs */
+static unsigned long long get_mmap_min_addr(void)
+{
+       FILE *fp;
+       int n_matched;
+       static unsigned long long addr;
+
+       if (addr)
+               return addr;
+
+       fp = fopen("/proc/sys/vm/mmap_min_addr", "r");
+       if (fp == NULL) {
+               ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n",
+                       strerror(errno));
+               exit(KSFT_SKIP);
+       }
+
+       n_matched = fscanf(fp, "%llu", &addr);
+       if (n_matched != 1) {
+               ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n",
+                       strerror(errno));
+               fclose(fp);
+               exit(KSFT_SKIP);
+       }
+
+       fclose(fp);
+       return addr;
+}
+
 /*
  * Returns the start address of the mapping on success, else returns
  * NULL on failure.
@@ -71,11 +126,18 @@ static void *get_source_mapping(struct config c)
 {
        unsigned long long addr = 0ULL;
        void *src_addr = NULL;
+       unsigned long long mmap_min_addr;
+
+       mmap_min_addr = get_mmap_min_addr();
+
 retry:
        addr += c.src_alignment;
+       if (addr < mmap_min_addr)
+               goto retry;
+
        src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
-                       MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
-                       -1, 0);
+                                       MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+                                       -1, 0);
        if (src_addr == MAP_FAILED) {
                if (errno == EPERM || errno == EEXIST)
                        goto retry;
@@ -90,8 +152,10 @@ retry:
         * alignment in the tests.
         */
        if (((unsigned long long) src_addr & (c.src_alignment - 1)) ||
-                       !((unsigned long long) src_addr & c.src_alignment))
+                       !((unsigned long long) src_addr & c.src_alignment)) {
+               munmap(src_addr, c.region_size);
                goto retry;
+       }
 
        if (!src_addr)
                goto error;
@@ -140,9 +204,20 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
        if (!((unsigned long long) addr & c.dest_alignment))
                addr = (void *) ((unsigned long long) addr | c.dest_alignment);
 
+       /* Don't destroy existing mappings unless expected to overlap */
+       while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
+               /* Check for unsigned overflow */
+               if (addr + c.dest_alignment < addr) {
+                       ksft_print_msg("Couldn't find a valid region to remap to\n");
+                       ret = -1;
+                       goto out;
+               }
+               addr += c.dest_alignment;
+       }
+
        clock_gettime(CLOCK_MONOTONIC, &t_start);
        dest_addr = mremap(src_addr, c.region_size, c.region_size,
-                       MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
+                                         MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
        clock_gettime(CLOCK_MONOTONIC, &t_end);
 
        if (dest_addr == MAP_FAILED) {
@@ -193,7 +268,7 @@ static void run_mremap_test_case(struct test test_case, int *failures,
 
        if (remap_time < 0) {
                if (test_case.expect_failure)
-                       ksft_test_result_pass("%s\n\tExpected mremap failure\n",
+                       ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
                                              test_case.name);
                else {
                        ksft_test_result_fail("%s\n", test_case.name);
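
is_remap_region_valid() above leans on MAP_FIXED_NOREPLACE (Linux 4.17+),
which fails with EEXIST on overlap instead of silently replacing an
existing mapping the way MAP_FIXED would. A standalone probe in the same
spirit:

#define _GNU_SOURCE
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/mman.h>

/* Probe whether [addr, addr+size) is free. Like the selftest, only
 * EEXIST is treated as "something already lives there". */
static bool region_is_free(void *addr, size_t size)
{
        void *p = mmap(addr, size, PROT_NONE,
                       MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_PRIVATE,
                       -1, 0);

        if (p == MAP_FAILED)
                return errno != EEXIST;
        munmap(p, size);                /* just a probe: undo it */
        return true;
}

int main(void)
{
        char stack_var;

        /* The page holding a stack variable is certainly mapped. */
        void *stack_page = (void *)((unsigned long)&stack_var & ~4095UL);

        printf("stack page free? %d\n", region_is_free(stack_page, 4096));
        return 0;
}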
index 3b265f1..352ba00 100755 (executable)
--- a/tools/testing/selftests/vm/run_vmtests.sh
+++ b/tools/testing/selftests/vm/run_vmtests.sh
@@ -291,11 +291,16 @@ echo "-------------------"
 echo "running mremap_test"
 echo "-------------------"
 ./mremap_test
-if [ $? -ne 0 ]; then
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+       echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+	echo "[SKIP]"
+	exitcode=$ksft_skip
+else
        echo "[FAIL]"
        exitcode=1
-else
-       echo "[PASS]"
 fi
 
 echo "-----------------"