/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
- /* Can cgroups be reclaimed below their normal consumption range? */
- unsigned int may_thrash:1;
+ /*
+ * Cgroups are not reclaimed below their configured memory.low,
+ * unless we threaten to OOM. If any cgroups are skipped due to
+ * memory.low and nothing was reclaimed, go back for memory.low.
+ */
+ unsigned int memcg_low_reclaim:1;
+ unsigned int memcg_low_skipped:1;
unsigned int hibernation_mode:1;
* Anonymous pages are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them
*/
- if (!page_is_file_cache(page)) {
+ if (!page_is_file_cache(page) ||
+ (PageAnon(page) && !PageSwapBacked(page))) {
*dirty = false;
*writeback = false;
return;
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
- bool lazyfree = false;
- int ret = SWAP_SUCCESS;
cond_resched();
sc->nr_scanned++;
if (unlikely(!page_evictable(page)))
- goto cull_mlocked;
+ goto activate_locked;
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
/* Double the slab pressure for mapped and swapcache pages */
- if (page_mapped(page) || PageSwapCache(page))
+ if ((page_mapped(page) || PageSwapCache(page)) &&
+ !(PageAnon(page) && !PageSwapBacked(page)))
sc->nr_scanned++;
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
+ * Lazyfree page could be freed directly
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
+ if (PageAnon(page) && PageSwapBacked(page) &&
+ !PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
if (!add_to_swap(page, page_list))
goto activate_locked;
- lazyfree = true;
may_enter_fs = 1;
/* Adding to swap updated mapping */
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page) && mapping) {
- switch (ret = try_to_unmap(page, lazyfree ?
- (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
- (ttu_flags | TTU_BATCH_FLUSH))) {
- case SWAP_FAIL:
+ if (page_mapped(page)) {
+ if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
nr_unmap_fail++;
goto activate_locked;
- case SWAP_AGAIN:
- goto keep_locked;
- case SWAP_MLOCK:
- goto cull_mlocked;
- case SWAP_LZFREE:
- goto lazyfree;
- case SWAP_SUCCESS:
- ; /* try to free the page below */
}
}
}
}
-lazyfree:
- if (!mapping || !__remove_mapping(mapping, page, true))
- goto keep_locked;
+ if (PageAnon(page) && !PageSwapBacked(page)) {
+ /* follow __remove_mapping for reference */
+ if (!page_ref_freeze(page, 1))
+ goto keep_locked;
+ if (PageDirty(page)) {
+ page_ref_unfreeze(page, 1);
+ goto keep_locked;
+ }
+ count_vm_event(PGLAZYFREED);
+ } else if (!mapping || !__remove_mapping(mapping, page, true))
+ goto keep_locked;
/*
* At this point, we have no other references and there is
* no way to pick any more up (removed from LRU, removed
*/
__ClearPageLocked(page);
free_it:
- if (ret == SWAP_LZFREE)
- count_vm_event(PGLAZYFREED);
-
nr_reclaimed++;
/*
list_add(&page->lru, &free_pages);
continue;
-cull_mlocked:
- if (PageSwapCache(page))
- try_to_free_swap(page);
- unlock_page(page);
- list_add(&page->lru, &ret_pages);
- continue;
-
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && mem_cgroup_swap_full(page))
+ if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
+ PageMlocked(page)))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
- SetPageActive(page);
- pgactivate++;
+ if (!PageMlocked(page)) {
+ SetPageActive(page);
+ pgactivate++;
+ }
keep_locked:
unlock_page(page);
keep:
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
+ TTU_IGNORE_ACCESS, NULL, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
+ nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
&stat, false);
spin_lock_irq(&pgdat->lru_lock);
* Both inactive lists should also be large enough that each inactive
* page has a chance to be referenced again before it is reclaimed.
*
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
* The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
* on this LRU, maintained by the pageout code. A zone->inactive_ratio
* of 3 means 3:1 or 25% of the pages are kept on the inactive list.
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct scan_control *sc, bool trace)
+ struct mem_cgroup *memcg,
+ struct scan_control *sc, bool actual_reclaim)
{
- unsigned long inactive_ratio;
- unsigned long inactive, active;
- enum lru_list inactive_lru = file * LRU_FILE;
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ enum lru_list inactive_lru = file * LRU_FILE;
+ unsigned long inactive, active;
+ unsigned long inactive_ratio;
+ unsigned long refaults;
unsigned long gb;
/*
inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
+ if (memcg)
+ refaults = mem_cgroup_read_stat(memcg,
+ MEMCG_WORKINGSET_ACTIVATE);
else
- inactive_ratio = 1;
+ refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
- if (trace)
- trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
- sc->reclaim_idx,
- lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
- lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
- inactive_ratio, file);
+ /*
+ * When refaults are being observed, it means a new workingset
+ * is being established. Disable active list protection to get
+ * rid of the stale workingset quickly.
+ */
+ if (file && actual_reclaim && lruvec->refaults != refaults) {
+ inactive_ratio = 0;
+ } else {
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+ }
+
+ if (actual_reclaim)
+ trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+ lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+ lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+ inactive_ratio, file);
return inactive * inactive_ratio < active;
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct lruvec *lruvec, struct scan_control *sc)
+ struct lruvec *lruvec, struct mem_cgroup *memcg,
+ struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru),
+ memcg, sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, sc, false) &&
+ if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, sc);
+ lruvec, memcg, sc);
}
}
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, sc, true))
+ if (inactive_list_is_low(lruvec, false, memcg, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
unsigned long scanned;
if (mem_cgroup_low(root, memcg)) {
- if (!sc->may_thrash)
+ if (!sc->memcg_low_reclaim) {
+ sc->memcg_low_skipped = 1;
continue;
- mem_cgroup_events(memcg, MEMCG_LOW, 1);
+ }
+ mem_cgroup_event(memcg, MEMCG_LOW);
}
reclaimed = sc->nr_reclaimed;
sc->gfp_mask = orig_mask;
}
+static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
+ do {
+ unsigned long refaults;
+ struct lruvec *lruvec;
+
+ if (memcg)
+ refaults = mem_cgroup_read_stat(memcg,
+ MEMCG_WORKINGSET_ACTIVATE);
+ else
+ refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+ lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ lruvec->refaults = refaults;
+ } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+}
+
/*
* This is the main entry point to direct page reclaim.
*
struct scan_control *sc)
{
int initial_priority = sc->priority;
+ pg_data_t *last_pgdat;
+ struct zoneref *z;
+ struct zone *zone;
retry:
delayacct_freepages_start();
sc->may_writepage = 1;
} while (--sc->priority >= 0);
+ last_pgdat = NULL;
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
+ sc->nodemask) {
+ if (zone->zone_pgdat == last_pgdat)
+ continue;
+ last_pgdat = zone->zone_pgdat;
+ snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+ }
+
delayacct_freepages_end();
if (sc->nr_reclaimed)
return 1;
/* Untapped cgroup reserves? Don't OOM, retry. */
- if (!sc->may_thrash) {
+ if (sc->memcg_low_skipped) {
sc->priority = initial_priority;
- sc->may_thrash = 1;
+ sc->memcg_low_reclaim = 1;
+ sc->memcg_low_skipped = 0;
goto retry;
}
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
int nid;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
.reclaim_idx = MAX_NR_ZONES - 1,
.target_mem_cgroup = memcg,
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, sc, true))
+ if (inactive_list_is_low(lruvec, false, memcg, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
+/*
+ * Returns true if there is an eligible zone balanced for the request order
+ * and classzone_idx
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
- unsigned long mark = high_wmark_pages(zone);
+ int i;
+ unsigned long mark = -1;
+ struct zone *zone;
- if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
- return false;
+ for (i = 0; i <= classzone_idx; i++) {
+ zone = pgdat->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ mark = high_wmark_pages(zone);
+ if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+ return true;
+ }
/*
- * If any eligible zone is balanced then the node is not considered
- * to be congested or dirty
+ * If a node has no populated zone within classzone_idx, it does not
+ * need balancing by definition. This can happen if a zone-restricted
+ * allocation tries to wake a remote kswapd.
*/
- clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
- clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
- clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
+ if (mark == -1)
+ return true;
- return true;
+ return false;
+}
+
+/* Clear pgdat state for congested, dirty or under writeback. */
+static void clear_pgdat_congested(pg_data_t *pgdat)
+{
+ clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+ clear_bit(PGDAT_DIRTY, &pgdat->flags);
+ clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
/*
*/
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
- int i;
-
/*
* The throttled processes are normally woken up in balance_pgdat() as
* soon as allow_direct_reclaim() is true. But there is a potential
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
- for (i = 0; i <= classzone_idx; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
- if (!zone_balanced(zone, order, classzone_idx))
- return false;
+ if (pgdat_balanced(pgdat, order, classzone_idx)) {
+ clear_pgdat_congested(pgdat);
+ return true;
}
- return true;
+ return false;
}
/*
}
/*
- * Only reclaim if there are no eligible zones. Check from
- * high to low zone as allocations prefer higher zones.
- * Scanning from low to high zone would allow congestion to be
- * cleared during a very small window when a small low
- * zone was balanced even under extreme pressure when the
- * overall node may be congested. Note that sc.reclaim_idx
- * is not used as buffer_heads_over_limit may have adjusted
- * it.
+ * Only reclaim if there are no eligible zones. Note that
+ * sc.reclaim_idx is not used as buffer_heads_over_limit may
+ * have adjusted it.
*/
- for (i = classzone_idx; i >= 0; i--) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
- if (zone_balanced(zone, sc.order, classzone_idx))
- goto out;
- }
+ if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+ goto out;
/*
* Do some background aging of the anon list, to give
pgdat->kswapd_failures++;
out:
+ snapshot_refaults(NULL, pgdat);
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
return sc.order;
}
+/*
+ * pgdat->kswapd_classzone_idx is the highest zone index that a recent
+ * allocation request woke kswapd for. When kswapd has not woken recently,
+ * the value is MAX_NR_ZONES which is not a valid index. This compares a
+ * given classzone and returns it or the highest classzone index kswapd
+ * was recently woke for.
+ */
+static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
+ enum zone_type classzone_idx)
+{
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ return classzone_idx;
+
+ return max(pgdat->kswapd_classzone_idx, classzone_idx);
+}
+
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
unsigned int classzone_idx)
{
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- /* Try to sleep for a short interval */
+ /*
+ * Try to sleep for a short interval. Note that kcompactd will only be
+ * woken if it is possible to sleep for a short interval. This is
+ * deliberate on the assumption that if reclaim cannot keep an
+ * eligible zone balanced that it's also unlikely that compaction will
+ * succeed.
+ */
if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
/*
* Compaction records what page blocks it recently failed to
* the previous request that slept prematurely.
*/
if (remaining) {
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
}
*/
static int kswapd(void *p)
{
- unsigned int alloc_order, reclaim_order, classzone_idx;
+ unsigned int alloc_order, reclaim_order;
+ unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- pgdat->kswapd_order = alloc_order = reclaim_order = 0;
- pgdat->kswapd_classzone_idx = classzone_idx = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
for ( ; ; ) {
bool ret;
+ alloc_order = reclaim_order = pgdat->kswapd_order;
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+
kswapd_try_sleep:
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
classzone_idx);
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = pgdat->kswapd_classzone_idx;
+ classzone_idx = kswapd_classzone_idx(pgdat, 0);
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
ret = try_to_freeze();
if (kthread_should_stop())
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
-
- alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = pgdat->kswapd_classzone_idx;
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
- int z;
if (!managed_zone(zone))
return;
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return;
- /* Only wake kswapd if all zones are unbalanced */
- for (z = 0; z <= classzone_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
-
- if (zone_balanced(zone, order, classzone_idx))
- return;
- }
+ if (pgdat_balanced(pgdat, order, classzone_idx))
+ return;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),