mm: thp: make deferred split shrinker memcg aware

author Yang Shi <yang.shi@linux.alibaba.com>

Mon, 23 Sep 2019 22:38:15 +0000 (15:38 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 24 Sep 2019 22:54:11 +0000 (15:54 -0700)
author Yang Shi <yang.shi@linux.alibaba.com>
Mon, 23 Sep 2019 22:38:15 +0000 (15:38 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 24 Sep 2019 22:54:11 +0000 (15:54 -0700)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h

index 45ede62..61c9ffd 100644 (file)
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -267,6 +267,15 @@ static inline bool thp_migration_supported(void)
         return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
  }
  
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+       /*
+        * The deferred list (global or memcg) lives in the second tail
+        * page, because ->lru in tail pages is occupied by compound_head.
+        */
+       return &page[2].deferred_list;
+}
+
  #else /* CONFIG_TRANSPARENT_HUGEPAGE */
  #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
  #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index a3c0a63..9b60863 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -330,6 +330,10 @@ struct mem_cgroup {
         struct list_head event_list;
         spinlock_t event_list_lock;
  
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       struct deferred_split deferred_split_queue;
+#endif
+
         struct mem_cgroup_per_node *nodeinfo[0];
         /* WARNING: nodeinfo must be the last member here */
  };
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index 0b739f3..5183e0d 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -138,6 +138,7 @@ struct page {
                 struct {        /* Second tail page of compound page */
                         unsigned long _compound_pad_1;  /* compound_head */
                         unsigned long _compound_pad_2;
+                       /* Links into the global or memcg deferred split queue */
                         struct list_head deferred_list;
                 };
                 struct {        /* Page table pages */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index c642c03..73fc517 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -496,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
         return pmd;
  }
  
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
  {
-       /* ->lru in the tail pages is occupied by compound_head. */
-       return &page[2].deferred_list;
+       struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+       struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+       if (memcg)
+               return &memcg->deferred_split_queue;
+       else
+               return &pgdat->deferred_split_queue;
  }
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+       struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+       return &pgdat->deferred_split_queue;
+}
+#endif
  
  void prep_transhuge_page(struct page *page)
  {
@@ -2691,7 +2705,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
  {
         struct page *head = compound_head(page);
         struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
-       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+       struct deferred_split *ds_queue = get_deferred_split_queue(page);
         struct anon_vma *anon_vma = NULL;
         struct address_space *mapping = NULL;
         int count, mapcount, extra_pins, ret;
@@ -2827,8 +2841,7 @@ out:
  
  void free_transhuge_page(struct page *page)
  {
-       struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
-       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+       struct deferred_split *ds_queue = get_deferred_split_queue(page);
         unsigned long flags;
  
         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -2842,17 +2855,37 @@ void free_transhuge_page(struct page *page)
  
  void deferred_split_huge_page(struct page *page)
  {
-       struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
-       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+       struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+       struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
         unsigned long flags;
  
         VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  
+       /*
+        * try_to_unmap() in the page reclaim path can also reach here,
+        * and racing with it could corrupt the deferred split queue.
+        * Besides, if page reclaim is already handling this page, there
+        * is no need for the shrinker to handle it again.
+        *
+        * Check PageSwapCache to determine if the page is being
+        * handled by page reclaim since THP swap would add the page into
+        * swap cache before calling try_to_unmap().
+        */
+       if (PageSwapCache(page))
+               return;
+
         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
         if (list_empty(page_deferred_list(page))) {
                 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                 list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
                 ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+               if (memcg)
+                       memcg_set_shrinker_bit(memcg, page_to_nid(page),
+                                              deferred_split_shrinker.id);
+#endif
         }
         spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
  }
@@ -2862,6 +2895,11 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
  {
         struct pglist_data *pgdata = NODE_DATA(sc->nid);
         struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+       if (sc->memcg)
+               ds_queue = &sc->memcg->deferred_split_queue;
+#endif
         return READ_ONCE(ds_queue->split_queue_len);
  }
  
@@ -2875,6 +2913,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
         struct page *page;
         int split = 0;
  
+#ifdef CONFIG_MEMCG
+       if (sc->memcg)
+               ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
         spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
         /* Take pin on all head pages to avoid freeing them under us */
         list_for_each_safe(pos, next, &ds_queue->split_queue) {
@@ -2921,7 +2964,8 @@ static struct shrinker deferred_split_shrinker = {
         .count_objects = deferred_split_count,
         .scan_objects = deferred_split_scan,
         .seeks = DEFAULT_SEEKS,
-       .flags = SHRINKER_NUMA_AWARE,
+       .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+                SHRINKER_NONSLAB,
  };
  
  #ifdef CONFIG_DEBUG_FS
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index a385a7c..2156ef7 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5070,6 +5070,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
                 memcg->cgwb_frn[i].done =
                         __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+       INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+       memcg->deferred_split_queue.split_queue_len = 0;
  #endif
         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
         return memcg;
@@ -5449,6 +5454,14 @@ static int mem_cgroup_move_account(struct page *page,
                 __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
         }
  
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (compound && !list_empty(page_deferred_list(page))) {
+               spin_lock(&from->deferred_split_queue.split_queue_lock);
+               list_del_init(page_deferred_list(page));
+               from->deferred_split_queue.split_queue_len--;
+               spin_unlock(&from->deferred_split_queue.split_queue_lock);
+       }
+#endif
         /*
          * It is safe to change page->mem_cgroup here because the page
          * is referenced, charged, and isolated - we can't race with
@@ -5457,6 +5470,17 @@ static int mem_cgroup_move_account(struct page *page,
  
         /* caller should have done css_get */
         page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (compound && list_empty(page_deferred_list(page))) {
+               spin_lock(&to->deferred_split_queue.split_queue_lock);
+               list_add_tail(page_deferred_list(page),
+                             &to->deferred_split_queue.split_queue);
+               to->deferred_split_queue.split_queue_len++;
+               spin_unlock(&to->deferred_split_queue.split_queue_lock);
+       }
+#endif
+
         spin_unlock_irqrestore(&from->move_lock, flags);
  
         ret = 0;
author	Yang Shi <yang.shi@linux.alibaba.com>
	Mon, 23 Sep 2019 22:38:15 +0000 (15:38 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 24 Sep 2019 22:54:11 +0000 (15:54 -0700)
include/linux/huge_mm.h		patch \| blob \| history
include/linux/memcontrol.h		patch \| blob \| history
include/linux/mm_types.h		patch \| blob \| history
mm/huge_memory.c		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history