diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 95546f3..733732e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -52,6 +52,7 @@
 #include <linux/psi.h>
 #include <linux/khugepaged.h>
 #include <linux/delayacct.h>
+#include <linux/cacheinfo.h>
 #include <asm/div64.h>
 #include "internal.h"
 #include "shuffle.h"
@@ -1078,6 +1079,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
        int bad = 0;
        bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
        bool init = want_init_on_free();
+       bool compound = PageCompound(page);
 
        VM_BUG_ON_PAGE(PageTail(page), page);
 
@@ -1096,16 +1098,15 @@ static __always_inline bool free_pages_prepare(struct page *page,
                return false;
        }
 
+       VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
        /*
         * Check tail pages before head page information is cleared to
         * avoid checking PageCompound for order-0 pages.
         */
        if (unlikely(order)) {
-               bool compound = PageCompound(page);
                int i;
 
-               VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
                if (compound)
                        page[1].flags &= ~PAGE_FLAGS_SECOND;
                for (i = 1; i < (1 << order); i++) {
@@ -2156,6 +2157,40 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
        return i;
 }
 
+/*
+ * Called from the vmstat counter updater to decay the PCP high.
+ * Return whether there is additional work to do.
+ */
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+{
+       int high_min, to_drain, batch;
+       int todo = 0;
+
+       high_min = READ_ONCE(pcp->high_min);
+       batch = READ_ONCE(pcp->batch);
+       /*
+        * Decrease pcp->high periodically to try to free possible
+        * idle PCP pages.  To control latency, avoid freeing too many
+        * pages at once; this also caps how far pcp->high is decreased
+        * in one step.
+        */
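+       /*
+        * Illustrative example (assuming the default
+        * CONFIG_PCP_BATCH_SCALE_MAX of 5): with high_min = 64,
+        * pcp->high = 1024, pcp->count = 512 and batch = 64, pcp->high
+        * decays to max3(512 - 2048, 1024 - 128, 64) = 896; to_drain is
+        * then negative, so nothing is freed on this pass.
+        */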
+       if (pcp->high > high_min) {
+               pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+                                pcp->high - (pcp->high >> 3), high_min);
+               if (pcp->high > high_min)
+                       todo++;
+       }
+
+       to_drain = pcp->count - pcp->high;
+       if (to_drain > 0) {
+               spin_lock(&pcp->lock);
+               free_pcppages_bulk(zone, to_drain, pcp, 0);
+               spin_unlock(&pcp->lock);
+               todo++;
+       }
+
+       return todo;
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Called from the vmstat counter updater to drain pagesets of this
@@ -2317,14 +2352,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
        return true;
 }
 
-static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
 {
        int min_nr_free, max_nr_free;
-       int batch = READ_ONCE(pcp->batch);
 
-       /* Free everything if batch freeing high-order pages. */
+       /* Free as much as possible if batch freeing high-order pages. */
        if (unlikely(free_high))
-               return pcp->count;
+               return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
 
        /* Check for PCP disabled or boot pageset */
        if (unlikely(high < batch))
@@ -2335,59 +2369,107 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
        max_nr_free = high - batch;
 
        /*
-        * Double the number of pages freed each time there is subsequent
-        * freeing of pages without any allocation.
+        * Increase the batch number to the number of consecutively
+        * freed pages to reduce zone lock contention.
         */
-       batch <<= pcp->free_factor;
-       if (batch < max_nr_free)
-               pcp->free_factor++;
-       batch = clamp(batch, min_nr_free, max_nr_free);
+       batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);
 
        return batch;
 }
 
 static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
-                      bool free_high)
+                      int batch, bool free_high)
 {
-       int high = READ_ONCE(pcp->high);
+       int high, high_min, high_max;
+
+       high_min = READ_ONCE(pcp->high_min);
+       high_max = READ_ONCE(pcp->high_max);
+       high = pcp->high = clamp(pcp->high, high_min, high_max);
 
-       if (unlikely(!high || free_high))
+       if (unlikely(!high))
                return 0;
 
-       if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
-               return high;
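+       /*
+        * A burst of high-order frees shrinks pcp->high (by up to
+        * batch << CONFIG_PCP_BATCH_SCALE_MAX, floored at high_min) and
+        * returns 0 so that the caller drains the PCP; nr_pcp_free()
+        * caps each such drain at batch << CONFIG_PCP_BATCH_SCALE_MAX
+        * pages.
+        */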
+       if (unlikely(free_high)) {
+               pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
+                               high_min);
+               return 0;
+       }
 
        /*
         * If reclaim is active, limit the number of pages that can be
         * stored on pcp lists
         */
-       return min(READ_ONCE(pcp->batch) << 2, high);
+       if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
+               int free_count = max_t(int, pcp->free_count, batch);
+
+               pcp->high = max(high - free_count, high_min);
+               return min(batch << 2, pcp->high);
+       }
+
+       if (high_min == high_max)
+               return high;
+
+       if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
+               int free_count = max_t(int, pcp->free_count, batch);
+
+               pcp->high = max(high - free_count, high_min);
+               high = max(pcp->count, high_min);
+       } else if (pcp->count >= high) {
+               int need_high = pcp->free_count + batch;
+
+               /* pcp->high should be large enough to hold batch freed pages */
+               if (pcp->high < need_high)
+                       pcp->high = clamp(need_high, high_min, high_max);
+       }
+
+       return high;
 }
 
 static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
                                   struct page *page, int migratetype,
                                   unsigned int order)
 {
-       int high;
+       int high, batch;
        int pindex;
-       bool free_high;
+       bool free_high = false;
 
+       /*
+        * On freeing, reduce the number of pages that are batch allocated.
+        * See nr_pcp_alloc() where alloc_factor is increased for subsequent
+        * allocations.
+        */
+       pcp->alloc_factor >>= 1;
        __count_vm_events(PGFREE, 1 << order);
        pindex = order_to_pindex(migratetype, order);
        list_add(&page->pcp_list, &pcp->lists[pindex]);
        pcp->count += 1 << order;
 
+       batch = READ_ONCE(pcp->batch);
        /*
         * As high-order pages other than THPs stored on PCP can contribute
         * to fragmentation, limit the number stored when PCP is heavily
         * freeing without allocation. The remainder after bulk freeing
         * stops will be drained from vmstat refresh context.
         */
-       free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
-
-       high = nr_pcp_high(pcp, zone, free_high);
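+       /*
+        * free_high is only set for order-1..PAGE_ALLOC_COSTLY_ORDER
+        * frees when the previous free was also high-order, at least one
+        * batch of pages has been freed consecutively, and (if
+        * PCPF_FREE_HIGH_BATCH is set because the CPU's data cache slice
+        * is large) the PCP already holds at least one batch of pages.
+        */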
+       if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
+               free_high = (pcp->free_count >= batch &&
+                            (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
+                            (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
+                             pcp->count >= READ_ONCE(batch)));
+               pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
+       } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
+               pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
+       }
+       if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
+               pcp->free_count += (1 << order);
+       high = nr_pcp_high(pcp, zone, batch, free_high);
        if (pcp->count >= high) {
-               free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex);
+               free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
+                                  pcp, pindex);
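+               /*
+                * Now that pages have been returned to the buddy
+                * allocator, clear ZONE_BELOW_HIGH once the zone is back
+                * above its high watermark so that nr_pcp_alloc() may
+                * grow pcp->high again.
+                */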
+               if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
+                   zone_watermark_ok(zone, 0, high_wmark_pages(zone),
+                                     ZONE_MOVABLE, 0))
+                       clear_bit(ZONE_BELOW_HIGH, &zone->flags);
        }
 }
 
@@ -2671,6 +2753,56 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
        return page;
 }
 
+static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
+{
+       int high, base_batch, batch, max_nr_alloc;
+       int high_max, high_min;
+
+       base_batch = READ_ONCE(pcp->batch);
+       high_min = READ_ONCE(pcp->high_min);
+       high_max = READ_ONCE(pcp->high_max);
+       high = pcp->high = clamp(pcp->high, high_min, high_max);
+
+       /* Check for PCP disabled or boot pageset */
+       if (unlikely(high < base_batch))
+               return 1;
+
+       if (order)
+               batch = base_batch;
+       else
+               batch = (base_batch << pcp->alloc_factor);
+
+       /*
+        * If pcp->high were larger, we could avoid allocating from the
+        * zone.
+        */
+       if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags))
+               high = pcp->high = min(high + batch, high_max);
+
+       if (!order) {
+               max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
+               /*
+                * Double the number of pages allocated each time there is
+                * subsequent allocation of order-0 pages without any freeing.
+                */
+               if (batch <= max_nr_alloc &&
+                   pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
+                       pcp->alloc_factor++;
+               batch = min(batch, max_nr_alloc);
+       }
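+       /*
+        * Illustrative example (assuming the default
+        * CONFIG_PCP_BATCH_SCALE_MAX of 5 and base_batch = 64):
+        * consecutive order-0 refills grow as 64, 128, 256, ... up to
+        * 2048 pages at a time, always capped by max_nr_alloc.
+        */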
+
+       /*
+        * Scale batch relative to order if batch implies free pages
+        * can be stored on the PCP. Batch can be 1 for small zones or
+        * for boot pagesets which should never store free pages as
+        * the pages may belong to arbitrary zones.
+        */
+       if (batch > 1)
+               batch = max(batch >> order, 2);
+
+       return batch;
+}
+
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -2683,18 +2815,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 
        do {
                if (list_empty(list)) {
-                       int batch = READ_ONCE(pcp->batch);
+                       int batch = nr_pcp_alloc(pcp, zone, order);
                        int alloced;
 
-                       /*
-                        * Scale batch relative to order if batch implies
-                        * free pages can be stored on the PCP. Batch can
-                        * be 1 for small zones or for boot pagesets which
-                        * should never store free pages as the pages may
-                        * belong to arbitrary zones.
-                        */
-                       if (batch > 1)
-                               batch = max(batch >> order, 2);
                        alloced = rmqueue_bulk(zone, order,
                                        batch, list,
                                        migratetype, alloc_flags);
@@ -2735,7 +2858,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
-        * See nr_pcp_free() where free_factor is increased for subsequent
-        * frees.
+        * See free_unref_page_commit() where free_count is increased for
+        * subsequent frees.
         */
-       pcp->free_factor >>= 1;
+       pcp->free_count >>= 1;
        list = &pcp->lists[order_to_pindex(migratetype, order)];
        page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
        pcp_spin_unlock(pcp);
@@ -3115,6 +3238,25 @@ retry:
                        }
                }
 
+               /*
+                * Detect whether the number of free pages is below the
+                * high watermark.  If so, we will decrease pcp->high and
+                * free PCP pages in the free path to reduce the risk of
+                * premature page reclaim.  Detection is done here to
+                * avoid doing it in the hotter free path.
+                */
+               if (test_bit(ZONE_BELOW_HIGH, &zone->flags))
+                       goto check_alloc_wmark;
+
+               mark = high_wmark_pages(zone);
+               if (zone_watermark_fast(zone, order, mark,
+                                       ac->highest_zoneidx, alloc_flags,
+                                       gfp_mask))
+                       goto try_this_zone;
+               else
+                       set_bit(ZONE_BELOW_HIGH, &zone->flags);
+
+check_alloc_wmark:
                mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
                if (!zone_watermark_fast(zone, order, mark,
                                       ac->highest_zoneidx, alloc_flags,
@@ -4456,12 +4598,8 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask)
 {
        struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
-                       preferred_nid, nodemask);
-       struct folio *folio = (struct folio *)page;
-
-       if (folio && order > 1)
-               folio_prep_large_rmappable(folio);
-       return folio;
+                                       preferred_nid, nodemask);
+       return page_rmappable_folio(page);
 }
 EXPORT_SYMBOL(__folio_alloc);
 
@@ -4878,8 +5016,11 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
        int min_val = INT_MAX;
        int best_node = NUMA_NO_NODE;
 
-       /* Use the local node if we haven't already */
-       if (!node_isset(node, *used_node_mask)) {
+       /*
+        * Use the local node if we haven't already, but skip a memoryless
+        * local node and fall back to other nodes instead.
+        */
+       if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) {
                node_set(node, *used_node_mask);
                return node;
        }
@@ -5255,14 +5396,15 @@ static int zone_batchsize(struct zone *zone)
 }
 
 static int percpu_pagelist_high_fraction;
-static int zone_highsize(struct zone *zone, int batch, int cpu_online)
+static int zone_highsize(struct zone *zone, int batch, int cpu_online,
+                        int high_fraction)
 {
 #ifdef CONFIG_MMU
        int high;
        int nr_split_cpus;
        unsigned long total_pages;
 
-       if (!percpu_pagelist_high_fraction) {
+       if (!high_fraction) {
                /*
                 * By default, the high value of the pcp is based on the zone
                 * low watermark so that if they are full then background
@@ -5275,15 +5417,15 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
                 * value is based on a fraction of the managed pages in the
                 * zone.
                 */
-               total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+               total_pages = zone_managed_pages(zone) / high_fraction;
        }
 
        /*
         * Split the high value across all online CPUs local to the zone. Note
         * that early in boot that CPUs may not be online yet and that during
         * CPU hotplug that the cpumask is not yet updated when a CPU is being
-        * onlined. For memory nodes that have no CPUs, split pcp->high across
-        * all online CPUs to mitigate the risk that reclaim is triggered
+        * onlined. For memory nodes that have no CPUs, split the high value
+        * across all online CPUs to mitigate the risk that reclaim is triggered
         * prematurely due to pages stored on pcp lists.
         */
        nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
@@ -5311,19 +5453,21 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
  * However, guaranteeing these relations at all times would require e.g. write
  * barriers here but also careful usage of read barriers at the read side, and
  * thus be prone to error and bad for performance. Thus the update only prevents
- * store tearing. Any new users of pcp->batch and pcp->high should ensure they
- * can cope with those fields changing asynchronously, and fully trust only the
- * pcp->count field on the local CPU with interrupts disabled.
+ * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max
+ * should ensure they can cope with those fields changing asynchronously, and
+ * fully trust only the pcp->count field on the local CPU with interrupts
+ * disabled.
  *
  * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
  * outside of boot time (or some other assurance that no concurrent updaters
  * exist).
  */
-static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
-               unsigned long batch)
+static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min,
+                          unsigned long high_max, unsigned long batch)
 {
        WRITE_ONCE(pcp->batch, batch);
-       WRITE_ONCE(pcp->high, high);
+       WRITE_ONCE(pcp->high_min, high_min);
+       WRITE_ONCE(pcp->high_max, high_max);
 }
 
 static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
@@ -5343,20 +5487,21 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
         * need to be as careful as pageset_update() as nobody can access the
         * pageset yet.
         */
-       pcp->high = BOOT_PAGESET_HIGH;
+       pcp->high_min = BOOT_PAGESET_HIGH;
+       pcp->high_max = BOOT_PAGESET_HIGH;
        pcp->batch = BOOT_PAGESET_BATCH;
-       pcp->free_factor = 0;
+       pcp->free_count = 0;
 }
 
-static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
-               unsigned long batch)
+static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
+                                             unsigned long high_max, unsigned long batch)
 {
        struct per_cpu_pages *pcp;
        int cpu;
 
        for_each_possible_cpu(cpu) {
                pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-               pageset_update(pcp, high, batch);
+               pageset_update(pcp, high_min, high_max, batch);
        }
 }
 
@@ -5366,19 +5511,34 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h
  */
 static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
 {
-       int new_high, new_batch;
+       int new_high_min, new_high_max, new_batch;
 
        new_batch = max(1, zone_batchsize(zone));
-       new_high = zone_highsize(zone, new_batch, cpu_online);
+       if (percpu_pagelist_high_fraction) {
+               new_high_min = zone_highsize(zone, new_batch, cpu_online,
+                                            percpu_pagelist_high_fraction);
+               /*
+                * PCP high is tuned manually; disable auto-tuning by
+                * setting high_min and high_max to the manual value.
+                */
+               new_high_max = new_high_min;
+       } else {
+               new_high_min = zone_highsize(zone, new_batch, cpu_online, 0);
+               new_high_max = zone_highsize(zone, new_batch, cpu_online,
+                                            MIN_PERCPU_PAGELIST_HIGH_FRACTION);
+       }
 
-       if (zone->pageset_high == new_high &&
+       if (zone->pageset_high_min == new_high_min &&
+           zone->pageset_high_max == new_high_max &&
            zone->pageset_batch == new_batch)
                return;
 
-       zone->pageset_high = new_high;
+       zone->pageset_high_min = new_high_min;
+       zone->pageset_high_max = new_high_max;
        zone->pageset_batch = new_batch;
 
-       __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
+       __zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max,
+                                         new_batch);
 }
 
 void __meminit setup_zone_pageset(struct zone *zone)
@@ -5413,6 +5573,39 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
        mutex_unlock(&pcp_batch_high_lock);
 }
 
+static void zone_pcp_update_cacheinfo(struct zone *zone)
+{
+       int cpu;
+       struct per_cpu_pages *pcp;
+       struct cpu_cacheinfo *cci;
+
+       for_each_online_cpu(cpu) {
+               pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+               cci = get_cpu_cacheinfo(cpu);
+               /*
+                * If the CPU's data cache slice is large enough, "pcp->batch"
+                * pages can be preserved in the PCP before it is drained
+                * during consecutive high-order page freeing without
+                * allocation.  This reduces zone lock contention without
+                * hurting the sharing of cache-hot pages.
+                */
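+               /*
+                * E.g. (illustrative numbers): with 4KB pages, a 1MiB
+                * per-CPU data cache slice is 256 pages, so a zone with
+                * pcp->batch = 63 (256 > 3 * 63) gets PCPF_FREE_HIGH_BATCH
+                * set.
+                */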
+               spin_lock(&pcp->lock);
+               if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+                       pcp->flags |= PCPF_FREE_HIGH_BATCH;
+               else
+                       pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+               spin_unlock(&pcp->lock);
+       }
+}
+
+void setup_pcp_cacheinfo(void)
+{
+       struct zone *zone;
+
+       for_each_populated_zone(zone)
+               zone_pcp_update_cacheinfo(zone);
+}
+
 /*
  * Allocate per cpu pagesets and initialize them.
  * Before this call only boot pagesets were available.
@@ -5454,7 +5647,8 @@ __meminit void zone_pcp_init(struct zone *zone)
         */
        zone->per_cpu_pageset = &boot_pageset;
        zone->per_cpu_zonestats = &boot_zonestats;
-       zone->pageset_high = BOOT_PAGESET_HIGH;
+       zone->pageset_high_min = BOOT_PAGESET_HIGH;
+       zone->pageset_high_max = BOOT_PAGESET_HIGH;
        zone->pageset_batch = BOOT_PAGESET_BATCH;
 
        if (populated_zone(zone))
@@ -6356,13 +6550,14 @@ EXPORT_SYMBOL(free_contig_range);
 void zone_pcp_disable(struct zone *zone)
 {
        mutex_lock(&pcp_batch_high_lock);
-       __zone_set_pageset_high_and_batch(zone, 0, 1);
+       __zone_set_pageset_high_and_batch(zone, 0, 0, 1);
        __drain_all_pages(zone, true);
 }
 
 void zone_pcp_enable(struct zone *zone)
 {
-       __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
+       __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+               zone->pageset_high_max, zone->pageset_batch);
        mutex_unlock(&pcp_batch_high_lock);
 }
 
@@ -6462,28 +6657,24 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page,
                                   int migratetype)
 {
        unsigned long size = 1 << high;
-       struct page *current_buddy, *next_page;
+       struct page *current_buddy;
 
        while (high > low) {
                high--;
                size >>= 1;
 
                if (target >= &page[size]) {
-                       next_page = page + size;
                        current_buddy = page;
+                       page = page + size;
                } else {
-                       next_page = page;
                        current_buddy = page + size;
                }
 
                if (set_page_guard(zone, current_buddy, high, migratetype))
                        continue;
 
-               if (current_buddy != target) {
-                       add_to_free_list(current_buddy, zone, high, migratetype);
-                       set_buddy_order(current_buddy, high);
-                       page = next_page;
-               }
+               add_to_free_list(current_buddy, zone, high, migratetype);
+               set_buddy_order(current_buddy, high);
        }
 }