crypto - shash: reduce minimum alignment of shash_desc structure

[linux-2.6-microblaze.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index eaa227a..7a2c89b 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
  #include <trace/events/oom.h>
  #include <linux/prefetch.h>
  #include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
  #include <linux/migrate.h>
  #include <linux/hugetlb.h>
  #include <linux/sched/rt.h>
@@ -70,6 +71,7 @@
  #include <linux/psi.h>
  #include <linux/padata.h>
  #include <linux/khugepaged.h>
+#include <linux/buffer_head.h>
  
  #include <asm/sections.h>
  #include <asm/tlbflush.h>
@@ -165,53 +167,26 @@ unsigned long totalcma_pages __read_mostly;
  
  int percpu_pagelist_fraction;
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
-DEFINE_STATIC_KEY_TRUE(init_on_alloc);
-#else
  DEFINE_STATIC_KEY_FALSE(init_on_alloc);
-#endif
  EXPORT_SYMBOL(init_on_alloc);
  
-#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
-DEFINE_STATIC_KEY_TRUE(init_on_free);
-#else
  DEFINE_STATIC_KEY_FALSE(init_on_free);
-#endif
  EXPORT_SYMBOL(init_on_free);
  
+static bool _init_on_alloc_enabled_early __read_mostly
+                               = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
  static int __init early_init_on_alloc(char *buf)
  {
-       int ret;
-       bool bool_result;
  
-       ret = kstrtobool(buf, &bool_result);
-       if (ret)
-               return ret;
-       if (bool_result && page_poisoning_enabled())
-               pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
-       if (bool_result)
-               static_branch_enable(&init_on_alloc);
-       else
-               static_branch_disable(&init_on_alloc);
-       return 0;
+       return kstrtobool(buf, &_init_on_alloc_enabled_early);
  }
  early_param("init_on_alloc", early_init_on_alloc);
  
+static bool _init_on_free_enabled_early __read_mostly
+                               = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
  static int __init early_init_on_free(char *buf)
  {
-       int ret;
-       bool bool_result;
-
-       ret = kstrtobool(buf, &bool_result);
-       if (ret)
-               return ret;
-       if (bool_result && page_poisoning_enabled())
-               pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
-       if (bool_result)
-               static_branch_enable(&init_on_free);
-       else
-               static_branch_disable(&init_on_free);
-       return 0;
+       return kstrtobool(buf, &_init_on_free_enabled_early);
  }
  early_param("init_on_free", early_init_on_free);
  
@@ -495,14 +470,6 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
         return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
  }
  
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
  static __always_inline
  unsigned long __get_pfnblock_flags_mask(struct page *page,
                                         unsigned long pfn,
@@ -521,6 +488,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page,
         return (word >> bitidx) & mask;
  }
  
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
  unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
                                         unsigned long mask)
  {
@@ -728,19 +703,6 @@ static int __init early_debug_pagealloc(char *buf)
  }
  early_param("debug_pagealloc", early_debug_pagealloc);
  
-void init_debug_pagealloc(void)
-{
-       if (!debug_pagealloc_enabled())
-               return;
-
-       static_branch_enable(&_debug_pagealloc_enabled);
-
-       if (!debug_guardpage_minorder())
-               return;
-
-       static_branch_enable(&_debug_guardpage_enabled);
-}
-
  static int __init debug_guardpage_minorder_setup(char *buf)
  {
         unsigned long res;
@@ -792,6 +754,53 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
                                 unsigned int order, int migratetype) {}
  #endif
  
+/*
+ * Enable static keys related to various memory debugging and hardening options.
+ * Some override others, and depend on early params that are evaluated in the
+ * order of appearance. So we need to first gather the full picture of what was
+ * enabled, and then make decisions.
+ */
+void init_mem_debugging_and_hardening(void)
+{
+       if (_init_on_alloc_enabled_early) {
+               if (page_poisoning_enabled())
+                       pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
+                               "will take precedence over init_on_alloc\n");
+               else
+                       static_branch_enable(&init_on_alloc);
+       }
+       if (_init_on_free_enabled_early) {
+               if (page_poisoning_enabled())
+                       pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
+                               "will take precedence over init_on_free\n");
+               else
+                       static_branch_enable(&init_on_free);
+       }
+
+#ifdef CONFIG_PAGE_POISONING
+       /*
+        * Page poisoning is debug page alloc for some arches. If
+        * either of those options are enabled, enable poisoning.
+        */
+       if (page_poisoning_enabled() ||
+            (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+             debug_pagealloc_enabled()))
+               static_branch_enable(&_page_poisoning_enabled);
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       if (!debug_pagealloc_enabled())
+               return;
+
+       static_branch_enable(&_debug_pagealloc_enabled);
+
+       if (!debug_guardpage_minorder())
+               return;
+
+       static_branch_enable(&_debug_guardpage_enabled);
+#endif
+}
+
  static inline void set_buddy_order(struct page *page, unsigned int order)
  {
         set_page_private(page, order);
@@ -994,7 +1003,7 @@ static inline void __free_one_page(struct page *page,
         struct page *buddy;
         bool to_tail;
  
-       max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
+       max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
  
         VM_BUG_ON(!zone_is_initialized(zone));
         VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
@@ -1007,7 +1016,7 @@ static inline void __free_one_page(struct page *page,
         VM_BUG_ON_PAGE(bad_range(zone, page), page);
  
  continue_merging:
-       while (order < max_order - 1) {
+       while (order < max_order) {
                 if (compaction_capture(capc, page, order, migratetype)) {
                         __mod_zone_freepage_state(zone, -(1 << order),
                                                                 migratetype);
@@ -1033,7 +1042,7 @@ continue_merging:
                 pfn = combined_pfn;
                 order++;
         }
-       if (max_order < MAX_ORDER) {
+       if (order < MAX_ORDER - 1) {
                 /* If we are here, it means order is >= pageblock_order.
                  * We want to prevent merge between freepages on isolate
                  * pageblock and normal pageblock. Without this, pageblock
@@ -1054,7 +1063,7 @@ continue_merging:
                                                 is_migrate_isolate(buddy_mt)))
                                 goto done_merging;
                 }
-               max_order++;
+               max_order = order + 1;
                 goto continue_merging;
         }
  
@@ -1092,7 +1101,7 @@ static inline bool page_expected_state(struct page *page,
         if (unlikely((unsigned long)page->mapping |
                         page_ref_count(page) |
  #ifdef CONFIG_MEMCG
-                       (unsigned long)page->mem_cgroup |
+                       (unsigned long)page_memcg(page) |
  #endif
                         (page->flags & check_flags)))
                 return false;
@@ -1117,7 +1126,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
                         bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
         }
  #ifdef CONFIG_MEMCG
-       if (unlikely(page->mem_cgroup))
+       if (unlikely(page_memcg(page)))
                 bad_reason = "page still charged to cgroup";
  #endif
         return bad_reason;
@@ -1195,8 +1204,10 @@ static void kernel_init_free_pages(struct page *page, int numpages)
  
         /* s390's use of memset() could override KASAN redzones. */
         kasan_disable_current();
-       for (i = 0; i < numpages; i++)
+       for (i = 0; i < numpages; i++) {
+               page_kasan_tag_reset(page + i);
                 clear_highpage(page + i);
+       }
         kasan_enable_current();
  }
  
@@ -1214,7 +1225,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
                  * Do not let hwpoison pages hit pcplists/buddy
                  * Untie memcg state and reset page's owner
                  */
-               if (memcg_kmem_enabled() && PageKmemcg(page))
+               if (memcg_kmem_enabled() && PageMemcgKmem(page))
                         __memcg_kmem_uncharge_page(page, order);
                 reset_page_owner(page, order);
                 return false;
@@ -1244,7 +1255,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
         }
         if (PageMappingFlags(page))
                 page->mapping = NULL;
-       if (memcg_kmem_enabled() && PageKmemcg(page))
+       if (memcg_kmem_enabled() && PageMemcgKmem(page))
                 __memcg_kmem_uncharge_page(page, order);
         if (check_free)
                 bad += check_free_page(page);
@@ -1264,7 +1275,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
         if (want_init_on_free())
                 kernel_init_free_pages(page, 1 << order);
  
-       kernel_poison_pages(page, 1 << order, 0);
+       kernel_poison_pages(page, 1 << order);
+
         /*
          * arch_free_page() can make the page's contents inaccessible.  s390
          * does this.  So nothing which can access the page's contents should
@@ -1272,8 +1284,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
          */
         arch_free_page(page, order);
  
-       if (debug_pagealloc_enabled_static())
-               kernel_map_pages(page, 1 << order, 0);
+       debug_pagealloc_unmap_pages(page, 1 << order);
  
         kasan_free_nondeferred_pages(page, order);
  
@@ -1344,7 +1355,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
  {
         int migratetype = 0;
         int batch_free = 0;
-       int prefetch_nr = 0;
+       int prefetch_nr = READ_ONCE(pcp->batch);
         bool isolated_pageblocks;
         struct page *page, *tmp;
         LIST_HEAD(head);
@@ -1395,8 +1406,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                          * avoid excessive prefetching due to large count, only
                          * prefetch buddy for the first pcp->batch nr of pages.
                          */
-                       if (prefetch_nr++ < pcp->batch)
+                       if (prefetch_nr) {
                                 prefetch_buddy(page);
+                               prefetch_nr--;
+                       }
                 } while (--count && --batch_free && !list_empty(list));
         }
  
@@ -1558,14 +1571,23 @@ void __free_pages_core(struct page *page, unsigned int order)
  
  #ifdef CONFIG_NEED_MULTIPLE_NODES
  
-static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+/*
+ * During memory init memblocks map pfns to nids. The search is expensive and
+ * this caches recent lookups. The implementation of __early_pfn_to_nid
+ * treats start/end as pfns.
+ */
+struct mminit_pfnnid_cache {
+       unsigned long last_start;
+       unsigned long last_end;
+       int last_nid;
+};
  
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
  
  /*
   * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
   */
-int __meminit __early_pfn_to_nid(unsigned long pfn,
+static int __meminit __early_pfn_to_nid(unsigned long pfn,
                                         struct mminit_pfnnid_cache *state)
  {
         unsigned long start_pfn, end_pfn;
@@ -1583,7 +1605,6 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
  
         return nid;
  }
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
  
  int __meminit early_pfn_to_nid(unsigned long pfn)
  {
@@ -2103,6 +2124,8 @@ void __init page_alloc_init_late(void)
         files_maxfiles_init();
  #endif
  
+       buffer_init();
+
         /* Discard memblock private memory */
         memblock_discard();
  
@@ -2207,12 +2230,6 @@ static inline int check_new_page(struct page *page)
         return 1;
  }
  
-static inline bool free_pages_prezeroed(void)
-{
-       return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
-               page_poisoning_enabled()) || want_init_on_free();
-}
-
  #ifdef CONFIG_DEBUG_VM
  /*
   * With DEBUG_VM enabled, order-0 pages are checked for expected state when
@@ -2270,11 +2287,13 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
         set_page_refcounted(page);
  
         arch_alloc_page(page, order);
-       if (debug_pagealloc_enabled_static())
-               kernel_map_pages(page, 1 << order, 1);
+       debug_pagealloc_map_pages(page, 1 << order);
         kasan_alloc_pages(page, order);
-       kernel_poison_pages(page, 1 << order, 1);
+       kernel_unpoison_pages(page, 1 << order);
         set_page_owner(page, order, gfp_flags);
+
+       if (!want_init_on_free() && want_init_on_alloc(gfp_flags))
+               kernel_init_free_pages(page, 1 << order);
  }
  
  static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -2282,9 +2301,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
  {
         post_alloc_hook(page, order, gfp_flags);
  
-       if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
-               kernel_init_free_pages(page, 1 << order);
-
         if (order && (gfp_flags & __GFP_COMP))
                 prep_compound_page(page, order);
  
@@ -2470,12 +2486,12 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
         return false;
  }
  
-static inline void boost_watermark(struct zone *zone)
+static inline bool boost_watermark(struct zone *zone)
  {
         unsigned long max_boost;
  
         if (!watermark_boost_factor)
-               return;
+               return false;
         /*
          * Don't bother in zones that are unlikely to produce results.
          * On small machines, including kdump capture kernels running
@@ -2483,7 +2499,7 @@ static inline void boost_watermark(struct zone *zone)
          * memory situation immediately.
          */
         if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
-               return;
+               return false;
  
         max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
                         watermark_boost_factor, 10000);
@@ -2497,12 +2513,14 @@ static inline void boost_watermark(struct zone *zone)
          * boosted watermark resulting in a hang.
          */
         if (!max_boost)
-               return;
+               return false;
  
         max_boost = max(pageblock_nr_pages, max_boost);
  
         zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
                 max_boost);
+
+       return true;
  }
  
  /*
@@ -2540,8 +2558,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
          * likelihood of future fallbacks. Wake kswapd now as the node
          * may be balanced overall and kswapd will not wake naturally.
          */
-       boost_watermark(zone);
-       if (alloc_flags & ALLOC_KSWAPD)
+       if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
                 set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
  
         /* We are not allowed to try stealing from the whole block */
@@ -3017,13 +3034,16 @@ static void drain_local_pages_wq(struct work_struct *work)
  }
  
  /*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
- *
- * When zone parameter is non-NULL, spill just the single zone's pages.
+ * The implementation of drain_all_pages(), exposing an extra parameter to
+ * drain on all cpus.
   *
- * Note that this can be extremely slow as the draining happens in a workqueue.
+ * drain_all_pages() is optimized to only execute on cpus where pcplists are
+ * not empty. The check for non-emptiness can however race with a free to
+ * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
+ * that need the guarantee that every CPU has drained can disable the
+ * optimizing racy check.
   */
-void drain_all_pages(struct zone *zone)
+static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
  {
         int cpu;
  
@@ -3062,7 +3082,13 @@ void drain_all_pages(struct zone *zone)
                 struct zone *z;
                 bool has_pcps = false;
  
-               if (zone) {
+               if (force_all_cpus) {
+                       /*
+                        * The pcp.count check is racy, some callers need a
+                        * guarantee that no cpu is missed.
+                        */
+                       has_pcps = true;
+               } else if (zone) {
                         pcp = per_cpu_ptr(zone->pageset, cpu);
                         if (pcp->pcp.count)
                                 has_pcps = true;
@@ -3095,6 +3121,18 @@ void drain_all_pages(struct zone *zone)
         mutex_unlock(&pcpu_drain_mutex);
  }
  
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * When zone parameter is non-NULL, spill just the single zone's pages.
+ *
+ * Note that this can be extremely slow as the draining happens in a workqueue.
+ */
+void drain_all_pages(struct zone *zone)
+{
+       __drain_all_pages(zone, false);
+}
+
  #ifdef CONFIG_HIBERNATION
  
  /*
@@ -3190,10 +3228,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
         pcp = &this_cpu_ptr(zone->pageset)->pcp;
         list_add(&page->lru, &pcp->lists[migratetype]);
         pcp->count++;
-       if (pcp->count >= pcp->high) {
-               unsigned long batch = READ_ONCE(pcp->batch);
-               free_pcppages_bulk(zone, batch, pcp);
-       }
+       if (pcp->count >= READ_ONCE(pcp->high))
+               free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
  }
  
  /*
@@ -3378,7 +3414,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
         do {
                 if (list_empty(list)) {
                         pcp->count += rmqueue_bulk(zone, 0,
-                                       pcp->batch, list,
+                                       READ_ONCE(pcp->batch), list,
                                         migratetype, alloc_flags);
                         if (unlikely(list_empty(list)))
                                 return NULL;
@@ -4264,10 +4300,8 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
  static struct lockdep_map __fs_reclaim_map =
         STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
  
-static bool __need_fs_reclaim(gfp_t gfp_mask)
+static bool __need_reclaim(gfp_t gfp_mask)
  {
-       gfp_mask = current_gfp_context(gfp_mask);
-
         /* no reclaim without waiting on it */
         if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
                 return false;
@@ -4276,10 +4310,6 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
         if (current->flags & PF_MEMALLOC)
                 return false;
  
-       /* We're only interested __GFP_FS allocations for now */
-       if (!(gfp_mask & __GFP_FS))
-               return false;
-
         if (gfp_mask & __GFP_NOLOCKDEP)
                 return false;
  
@@ -4298,15 +4328,29 @@ void __fs_reclaim_release(void)
  
  void fs_reclaim_acquire(gfp_t gfp_mask)
  {
-       if (__need_fs_reclaim(gfp_mask))
-               __fs_reclaim_acquire();
+       gfp_mask = current_gfp_context(gfp_mask);
+
+       if (__need_reclaim(gfp_mask)) {
+               if (gfp_mask & __GFP_FS)
+                       __fs_reclaim_acquire();
+
+#ifdef CONFIG_MMU_NOTIFIER
+               lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+               lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+#endif
+
+       }
  }
  EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
  
  void fs_reclaim_release(gfp_t gfp_mask)
  {
-       if (__need_fs_reclaim(gfp_mask))
-               __fs_reclaim_release();
+       gfp_mask = current_gfp_context(gfp_mask);
+
+       if (__need_reclaim(gfp_mask)) {
+               if (gfp_mask & __GFP_FS)
+                       __fs_reclaim_release();
+       }
  }
  EXPORT_SYMBOL_GPL(fs_reclaim_release);
  #endif
@@ -5007,6 +5051,26 @@ static inline void free_the_page(struct page *page, unsigned int order)
                 __free_pages_ok(page, order, FPI_NONE);
  }
  
+/**
+ * __free_pages - Free pages allocated with alloc_pages().
+ * @page: The page pointer returned from alloc_pages().
+ * @order: The order of the allocation.
+ *
+ * This function can free multi-page allocations that are not compound
+ * pages.  It does not check that the @order passed in matches that of
+ * the allocation, so it is easy to leak memory.  Freeing more memory
+ * than was allocated will probably emit a warning.
+ *
+ * If the last reference to this page is speculative, it will be released
+ * by put_page() which only frees the first page of a non-compound
+ * allocation.  To prevent the remaining pages from being leaked, we free
+ * the subsequent pages here.  If you want to use the page's reference
+ * count to decide when to free the allocation, you should allocate a
+ * compound page, and use put_page() instead of __free_pages().
+ *
+ * Context: May be called in interrupt context or while holding a normal
+ * spinlock, but not in NMI context or while holding a raw spinlock.
+ */
  void __free_pages(struct page *page, unsigned int order)
  {
         if (put_page_testzero(page))
@@ -5465,7 +5529,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                 global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
                 global_node_page_state(NR_FILE_MAPPED),
                 global_node_page_state(NR_SHMEM),
-               global_zone_page_state(NR_PAGETABLE),
+               global_node_page_state(NR_PAGETABLE),
                 global_zone_page_state(NR_BOUNCE),
                 global_zone_page_state(NR_FREE_PAGES),
                 free_pcp,
@@ -5497,6 +5561,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
  #ifdef CONFIG_SHADOW_CALL_STACK
                         " shadow_call_stack:%lukB"
  #endif
+                       " pagetables:%lukB"
                         " all_unreclaimable? %s"
                         "\n",
                         pgdat->node_id,
@@ -5522,6 +5587,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
  #ifdef CONFIG_SHADOW_CALL_STACK
                         node_page_state(pgdat, NR_KERNEL_SCS_KB),
  #endif
+                       K(node_page_state(pgdat, NR_PAGETABLE)),
                         pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
                                 "yes" : "no");
         }
@@ -5553,7 +5619,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                         " present:%lukB"
                         " managed:%lukB"
                         " mlocked:%lukB"
-                       " pagetables:%lukB"
                         " bounce:%lukB"
                         " free_pcp:%lukB"
                         " local_pcp:%ukB"
@@ -5574,7 +5639,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                         K(zone->present_pages),
                         K(zone_managed_pages(zone)),
                         K(zone_page_state(zone, NR_MLOCK)),
-                       K(zone_page_state(zone, NR_PAGETABLE)),
                         K(zone_page_state(zone, NR_BOUNCE)),
                         K(free_pcp),
                         K(this_cpu_read(zone->pageset->pcp.count)),
@@ -5904,7 +5968,10 @@ static void build_zonelists(pg_data_t *pgdat)
   * not check if the processor is online before following the pageset pointer.
   * Other parts of the kernel may not check if the zone is available.
   */
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static void pageset_init(struct per_cpu_pageset *p);
+/* These effectively disable the pcplists in the boot pageset completely */
+#define BOOT_PAGESET_HIGH      0
+#define BOOT_PAGESET_BATCH     1
  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
  static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
  
@@ -5972,7 +6039,7 @@ build_all_zonelists_init(void)
          * (a chicken-egg dilemma).
          */
         for_each_possible_cpu(cpu)
-               setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+               pageset_init(&per_cpu(boot_pageset, cpu));
  
         mminit_verify_zonelist();
         cpuset_init_current_mems_allowed();
@@ -6255,13 +6322,16 @@ static int zone_batchsize(struct zone *zone)
  }
  
  /*
- * pcp->high and pcp->batch values are related and dependent on one another:
- * ->batch must never be higher then ->high.
- * The following function updates them in a safe manner without read side
- * locking.
+ * pcp->high and pcp->batch values are related and generally batch is lower
+ * than high. They are also related to pcp->count such that count is lower
+ * than high, and as soon as it reaches high, the pcplist is flushed.
   *
- * Any new users of pcp->batch and pcp->high should ensure they can cope with
- * those fields changing asynchronously (acording to the above rule).
+ * However, guaranteeing these relations at all times would require e.g. write
+ * barriers here but also careful usage of read barriers at the read side, and
+ * thus be prone to error and bad for performance. Thus the update only prevents
+ * store tearing. Any new users of pcp->batch and pcp->high should ensure they
+ * can cope with those fields changing asynchronously, and fully trust only the
+ * pcp->count field on the local CPU with interrupts disabled.
   *
   * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
   * outside of boot time (or some other assurance that no concurrent updaters
@@ -6270,21 +6340,8 @@ static int zone_batchsize(struct zone *zone)
  static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
                 unsigned long batch)
  {
-       /* start with a fail safe value for batch */
-       pcp->batch = 1;
-       smp_wmb();
-
-       /* Update high, then batch, in order */
-       pcp->high = high;
-       smp_wmb();
-
-       pcp->batch = batch;
-}
-
-/* a companion to pageset_set_high() */
-static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
-{
-       pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
+       WRITE_ONCE(pcp->batch, batch);
+       WRITE_ONCE(pcp->high, high);
  }
  
  static void pageset_init(struct per_cpu_pageset *p)
@@ -6297,53 +6354,70 @@ static void pageset_init(struct per_cpu_pageset *p)
         pcp = &p->pcp;
         for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
                 INIT_LIST_HEAD(&pcp->lists[migratetype]);
+
+       /*
+        * Set batch and high values safe for a boot pageset. A true percpu
+        * pageset's initialization will update them subsequently. Here we don't
+        * need to be as careful as pageset_update() as nobody can access the
+        * pageset yet.
+        */
+       pcp->high = BOOT_PAGESET_HIGH;
+       pcp->batch = BOOT_PAGESET_BATCH;
  }
  
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
+               unsigned long batch)
  {
-       pageset_init(p);
-       pageset_set_batch(p, batch);
+       struct per_cpu_pageset *p;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               p = per_cpu_ptr(zone->pageset, cpu);
+               pageset_update(&p->pcp, high, batch);
+       }
  }
  
  /*
- * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
+ * Calculate and set new high and batch values for all per-cpu pagesets of a
+ * zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
   */
-static void pageset_set_high(struct per_cpu_pageset *p,
-                               unsigned long high)
+static void zone_set_pageset_high_and_batch(struct zone *zone)
  {
-       unsigned long batch = max(1UL, high / 4);
-       if ((high / 4) > (PAGE_SHIFT * 8))
-               batch = PAGE_SHIFT * 8;
+       unsigned long new_high, new_batch;
  
-       pageset_update(&p->pcp, high, batch);
-}
+       if (percpu_pagelist_fraction) {
+               new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
+               new_batch = max(1UL, new_high / 4);
+               if ((new_high / 4) > (PAGE_SHIFT * 8))
+                       new_batch = PAGE_SHIFT * 8;
+       } else {
+               new_batch = zone_batchsize(zone);
+               new_high = 6 * new_batch;
+               new_batch = max(1UL, 1 * new_batch);
+       }
  
-static void pageset_set_high_and_batch(struct zone *zone,
-                                      struct per_cpu_pageset *pcp)
-{
-       if (percpu_pagelist_fraction)
-               pageset_set_high(pcp,
-                       (zone_managed_pages(zone) /
-                               percpu_pagelist_fraction));
-       else
-               pageset_set_batch(pcp, zone_batchsize(zone));
-}
+       if (zone->pageset_high == new_high &&
+           zone->pageset_batch == new_batch)
+               return;
  
-static void __meminit zone_pageset_init(struct zone *zone, int cpu)
-{
-       struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+       zone->pageset_high = new_high;
+       zone->pageset_batch = new_batch;
  
-       pageset_init(pcp);
-       pageset_set_high_and_batch(zone, pcp);
+       __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
  }
  
  void __meminit setup_zone_pageset(struct zone *zone)
  {
+       struct per_cpu_pageset *p;
         int cpu;
+
         zone->pageset = alloc_percpu(struct per_cpu_pageset);
-       for_each_possible_cpu(cpu)
-               zone_pageset_init(zone, cpu);
+       for_each_possible_cpu(cpu) {
+               p = per_cpu_ptr(zone->pageset, cpu);
+               pageset_init(p);
+       }
+
+       zone_set_pageset_high_and_batch(zone);
  }
  
  /*
@@ -6386,6 +6460,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
          * offset of a (static) per cpu variable into the per cpu area.
          */
         zone->pageset = &boot_pageset;
+       zone->pageset_high = BOOT_PAGESET_HIGH;
+       zone->pageset_batch = BOOT_PAGESET_BATCH;
  
         if (populated_zone(zone))
                 printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
@@ -6796,7 +6872,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
         init_waitqueue_head(&pgdat->pfmemalloc_wait);
  
         pgdat_page_ext_init(pgdat);
-       spin_lock_init(&pgdat->lru_lock);
         lruvec_init(&pgdat->__lruvec);
  }
  
@@ -7598,6 +7673,11 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
                  * alias for the memset().
                  */
                 direct_map_addr = page_address(page);
+               /*
+                * Perform a kasan-unchecked memset() since this memory
+                * has not been initialized.
+                */
+               direct_map_addr = kasan_reset_tag(direct_map_addr);
                 if ((unsigned int)poison <= 0xFF)
                         memset(direct_map_addr, poison, PAGE_SIZE);
  
@@ -7791,31 +7871,24 @@ static void calculate_totalreserve_pages(void)
  static void setup_per_zone_lowmem_reserve(void)
  {
         struct pglist_data *pgdat;
-       enum zone_type j, idx;
+       enum zone_type i, j;
  
         for_each_online_pgdat(pgdat) {
-               for (j = 0; j < MAX_NR_ZONES; j++) {
-                       struct zone *zone = pgdat->node_zones + j;
-                       unsigned long managed_pages = zone_managed_pages(zone);
-
-                       zone->lowmem_reserve[j] = 0;
-
-                       idx = j;
-                       while (idx) {
-                               struct zone *lower_zone;
-
-                               idx--;
-                               lower_zone = pgdat->node_zones + idx;
-
-                               if (!sysctl_lowmem_reserve_ratio[idx] ||
-                                   !zone_managed_pages(lower_zone)) {
-                                       lower_zone->lowmem_reserve[j] = 0;
-                                       continue;
+               for (i = 0; i < MAX_NR_ZONES - 1; i++) {
+                       struct zone *zone = &pgdat->node_zones[i];
+                       int ratio = sysctl_lowmem_reserve_ratio[i];
+                       bool clear = !ratio || !zone_managed_pages(zone);
+                       unsigned long managed_pages = 0;
+
+                       for (j = i + 1; j < MAX_NR_ZONES; j++) {
+                               if (clear) {
+                                       zone->lowmem_reserve[j] = 0;
                                 } else {
-                                       lower_zone->lowmem_reserve[j] =
-                                               managed_pages / sysctl_lowmem_reserve_ratio[idx];
+                                       struct zone *upper_zone = &pgdat->node_zones[j];
+
+                                       managed_pages += zone_managed_pages(upper_zone);
+                                       zone->lowmem_reserve[j] = managed_pages / ratio;
                                 }
-                               managed_pages += zone_managed_pages(lower_zone);
                         }
                 }
         }
@@ -8077,15 +8150,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
         return 0;
  }
  
-static void __zone_pcp_update(struct zone *zone)
-{
-       unsigned int cpu;
-
-       for_each_possible_cpu(cpu)
-               pageset_set_high_and_batch(zone,
-                               per_cpu_ptr(zone->pageset, cpu));
-}
-
  /*
   * percpu_pagelist_fraction - changes the pcp->high for each zone on each
   * cpu.  It is the fraction of total pages in each zone that a hot per cpu
@@ -8118,7 +8182,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
                 goto out;
  
         for_each_populated_zone(zone)
-               __zone_pcp_update(zone);
+               zone_set_pageset_high_and_batch(zone);
  out:
         mutex_unlock(&pcp_batch_high_lock);
         return ret;
@@ -8517,6 +8581,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
         if (ret)
                 return ret;
  
+       drain_all_pages(cc.zone);
+
         /*
          * In case of -EBUSY, we'd like to know which page causes problem.
          * So, just fall through. test_pages_isolated() has a tracepoint
@@ -8725,7 +8791,28 @@ EXPORT_SYMBOL(free_contig_range);
  void __meminit zone_pcp_update(struct zone *zone)
  {
         mutex_lock(&pcp_batch_high_lock);
-       __zone_pcp_update(zone);
+       zone_set_pageset_high_and_batch(zone);
+       mutex_unlock(&pcp_batch_high_lock);
+}
+
+/*
+ * Effectively disable pcplists for the zone by setting the high limit to 0
+ * and draining all cpus. A concurrent page freeing on another CPU that's about
+ * to put the page on pcplist will either finish before the drain and the page
+ * will be drained, or observe the new high limit and skip the pcplist.
+ *
+ * Must be paired with a call to zone_pcp_enable().
+ */
+void zone_pcp_disable(struct zone *zone)
+{
+       mutex_lock(&pcp_batch_high_lock);
+       __zone_set_pageset_high_and_batch(zone, 0, 1);
+       __drain_all_pages(zone, true);
+}
+
+void zone_pcp_enable(struct zone *zone)
+{
+       __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
         mutex_unlock(&pcp_batch_high_lock);
  }