mm: proactive compaction
diff --git a/mm/compaction.c b/mm/compaction.c
index 8637560..544a988 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,24 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
 #define pageblock_start_pfn(pfn)       block_start_pfn(pfn, pageblock_order)
 #define pageblock_end_pfn(pfn)         block_end_pfn(pfn, pageblock_order)
 
+/*
+ * Fragmentation score check interval for proactive compaction purposes.
+ */
+static const int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
+
+/*
+ * Page order with respect to which proactive compaction
+ * calculates external fragmentation, which is used as
+ * the "fragmentation score" of a node/zone.
+ */
+#if defined CONFIG_TRANSPARENT_HUGEPAGE
+#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER
+#elif defined HUGETLB_PAGE_ORDER
+#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER
+#else
+#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
+#endif
+
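On typical 64-bit configurations the three branches above agree, since HPAGE_PMD_ORDER is itself defined as PMD_SHIFT - PAGE_SHIFT. A minimal sketch of the arithmetic, assuming 4 KiB base pages and 2 MiB PMD mappings (illustrative values, not taken from this patch):

    /* Illustrative: PAGE_SHIFT = 12 (4 KiB), PMD_SHIFT = 21 (2 MiB)    */
    /* COMPACTION_HPAGE_ORDER = 21 - 12 = 9, i.e. 2^9 = 512 base pages  */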
 static unsigned long release_freepages(struct list_head *freelist)
 {
        struct page *page, *next;
@@ -1857,6 +1875,76 @@ static inline bool is_via_compact_memory(int order)
        return order == -1;
 }
 
+static bool kswapd_is_running(pg_data_t *pgdat)
+{
+       return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+}
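Checking the kswapd task state lets proactive compaction back off while reclaim is active, so the two background threads do not compete for the same zones; kswapd_is_running() is consulted both before starting a proactive run and, in __compact_finished() below, while one is in flight.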
+
+/*
+ * A zone's fragmentation score is the external fragmentation with respect
+ * to the COMPACTION_HPAGE_ORDER, scaled by the zone's size. It returns a
+ * value in the range [0, 100].
+ *
+ * The scaling factor ensures that proactive compaction focuses on larger
+ * zones like ZONE_NORMAL, rather than smaller, specialized zones like
+ * ZONE_DMA32. For smaller zones, the score value remains close to zero,
+ * and thus never exceeds the high threshold for proactive compaction.
+ */
+static int fragmentation_score_zone(struct zone *zone)
+{
+       unsigned long score;
+
+       score = zone->present_pages *
+                       extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+       return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
+}
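To make the scaling concrete, here is a userspace sketch of the same arithmetic with hypothetical zone sizes (the helper name and all numbers are illustrative, not part of the patch):

    /* Mirrors fragmentation_score_zone(): extfrag scaled by the zone's
     * share of the node's pages (sizes below in 4 KiB pages).
     */
    static unsigned long example_zone_score(unsigned long zone_pages,
                                            unsigned long node_pages,
                                            unsigned long extfrag)
    {
            return (zone_pages * extfrag) / (node_pages + 1);
    }

    /*
     * 12 GiB ZONE_NORMAL on a 16 GiB node, extfrag 40:
     *   example_zone_score(12UL << 18, 16UL << 18, 40)  ->  29
     * 16 MiB ZONE_DMA on the same node, extfrag 100:
     *   example_zone_score(4096,      16UL << 18, 100)  ->   0
     * The fully fragmented but tiny ZONE_DMA still scores zero, which is
     * exactly the bias toward large zones described above.
     */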
+
+/*
+ * The per-node proactive (background) compaction process is started by its
+ * corresponding kcompactd thread when the node's fragmentation score
+ * exceeds the high threshold. The compaction process remains active until
+ * the node's score falls below the low threshold, or one of the back-off
+ * conditions is met.
+ */
+static int fragmentation_score_node(pg_data_t *pgdat)
+{
+       unsigned long score = 0;
+       int zoneid;
+
+       for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+               struct zone *zone;
+
+               zone = &pgdat->node_zones[zoneid];
+               score += fragmentation_score_zone(zone);
+       }
+
+       return score;
+}
+
+static int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+{
+       int wmark_low;
+
+       /*
+        * Cap the low watermark to avoid excessive compaction
+        * activity in case a user sets the proactiveness tunable
+        * close to 100 (maximum).
+        */
+       wmark_low = max(100 - sysctl_compaction_proactiveness, 5);
+       return low ? wmark_low : min(wmark_low + 10, 100);
+}
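With the default proactiveness of 20, for example, a proactive run starts once the node's score exceeds 90 and each zone is then compacted until its score falls to 80; the gap between the two watermarks provides hysteresis. Some illustrative values (computed from the formula above, not quoted from the patch):

    /*
     *  proactiveness   wmark_low   wmark_high
     *              0         100          100   (proactive compaction off)
     *             20          80           90   (default)
     *            100           5           15   (cap keeps the floor at 5)
     */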
+
+static bool should_proactive_compact_node(pg_data_t *pgdat)
+{
+       int wmark_high;
+
+       if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
+               return false;
+
+       wmark_high = fragmentation_score_wmark(pgdat, false);
+       return fragmentation_score_node(pgdat) > wmark_high;
+}
+
 static enum compact_result __compact_finished(struct compact_control *cc)
 {
        unsigned int order;
@@ -1883,6 +1971,25 @@ static enum compact_result __compact_finished(struct compact_control *cc)
                        return COMPACT_PARTIAL_SKIPPED;
        }
 
+       if (cc->proactive_compaction) {
+               int score, wmark_low;
+               pg_data_t *pgdat;
+
+               pgdat = cc->zone->zone_pgdat;
+               if (kswapd_is_running(pgdat))
+                       return COMPACT_PARTIAL_SKIPPED;
+
+               score = fragmentation_score_zone(cc->zone);
+               wmark_low = fragmentation_score_wmark(pgdat, true);
+
+               if (score > wmark_low)
+                       ret = COMPACT_CONTINUE;
+               else
+                       ret = COMPACT_SUCCESS;
+
+               goto out;
+       }
+
        if (is_via_compact_memory(cc->order))
                return COMPACT_CONTINUE;
 
@@ -1941,6 +2048,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
                }
        }
 
+out:
        if (cc->contended || fatal_signal_pending(current))
                ret = COMPACT_CONTENDED;
 
@@ -2421,6 +2529,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
        return rc;
 }
 
+/*
+ * Compact all zones within a node until each zone's fragmentation score
+ * falls within the proactive compaction thresholds (as determined by the
+ * proactiveness tunable).
+ *
+ * It is possible that the function returns before reaching score targets
+ * due to various back-off conditions, such as contention on per-node or
+ * per-zone locks.
+ */
+static void proactive_compact_node(pg_data_t *pgdat)
+{
+       int zoneid;
+       struct zone *zone;
+       struct compact_control cc = {
+               .order = -1,
+               .mode = MIGRATE_SYNC_LIGHT,
+               .ignore_skip_hint = true,
+               .whole_zone = true,
+               .gfp_mask = GFP_KERNEL,
+               .proactive_compaction = true,
+       };
+
+       for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+               zone = &pgdat->node_zones[zoneid];
+               if (!populated_zone(zone))
+                       continue;
+
+               cc.zone = zone;
+
+               compact_zone(&cc, NULL);
+
+               VM_BUG_ON(!list_empty(&cc.freepages));
+               VM_BUG_ON(!list_empty(&cc.migratepages));
+       }
+}
 
 /* Compact all zones within a node */
 static void compact_node(int nid)
@@ -2467,6 +2610,13 @@ static void compact_nodes(void)
 /* The written value is actually unused, all memory is compacted */
 int sysctl_compact_memory;
 
+/*
+ * Tunable for proactive compaction. It determines how
+ * aggressively the kernel should compact memory in the
+ * background. It takes values in the range [0, 100].
+ */
+int __read_mostly sysctl_compaction_proactiveness = 20;
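Together with the corresponding sysctl table entry (not part of this file), this exposes the knob as /proc/sys/vm/compaction_proactiveness; writing 0 disables proactive compaction entirely, since should_proactive_compact_node() above short-circuits on a zero value.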
+
 /*
  * This is the entry point for compacting all nodes via
  * /proc/sys/vm/compact_memory
@@ -2646,6 +2796,7 @@ static int kcompactd(void *p)
 {
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
+       unsigned int proactive_defer = 0;
 
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
@@ -2661,12 +2812,34 @@ static int kcompactd(void *p)
                unsigned long pflags;
 
                trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
-               wait_event_freezable(pgdat->kcompactd_wait,
-                               kcompactd_work_requested(pgdat));
+               if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
+                       kcompactd_work_requested(pgdat),
+                       msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+
+                       psi_memstall_enter(&pflags);
+                       kcompactd_do_work(pgdat);
+                       psi_memstall_leave(&pflags);
+                       continue;
+               }
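Replacing the open-ended wait with a timed one means kcompactd now wakes at least every HPAGE_FRAG_CHECK_INTERVAL_MSEC (500 ms) even when no work has been queued: a non-zero return indicates a regular work request, while a zero return signals a timeout and falls through to the proactive path below.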
 
-               psi_memstall_enter(&pflags);
-               kcompactd_do_work(pgdat);
-               psi_memstall_leave(&pflags);
+               /* kcompactd wait timeout */
+               if (should_proactive_compact_node(pgdat)) {
+                       unsigned int prev_score, score;
+
+                       if (proactive_defer) {
+                               proactive_defer--;
+                               continue;
+                       }
+                       prev_score = fragmentation_score_node(pgdat);
+                       proactive_compact_node(pgdat);
+                       score = fragmentation_score_node(pgdat);
+                       /*
+                        * Defer proactive compaction if the fragmentation
+                        * score did not go down, i.e. no progress was made.
+                        */
+                       proactive_defer = score < prev_score ?
+                                       0 : 1 << COMPACT_MAX_DEFER_SHIFT;
+               }
        }
 
        return 0;
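If a run makes no progress, the defer counter set above skips subsequent score checks. Assuming COMPACT_MAX_DEFER_SHIFT keeps its usual value of 6 (from include/linux/compaction.h), the back-off works out to roughly half a minute:

    /*
     * Illustrative back-off arithmetic (assumes COMPACT_MAX_DEFER_SHIFT == 6):
     *   proactive_defer = 1 << 6 = 64 skipped timeout wakeups
     *   64 * HPAGE_FRAG_CHECK_INTERVAL_MSEC = 64 * 500 ms ~= 32 s
     * before an unproductive proactive run is retried.
     */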