khugepaged: introduce 'max_ptes_shared' tunable
author Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Wed, 3 Jun 2020 23:00:30 +0000 (16:00 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 4 Jun 2020 03:09:46 +0000 (20:09 -0700)
'max_ptes_shared' specifies how many pages can be shared across multiple
processes.  Exceeding the number would block the collapse::

/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared

A higher value may increase memory footprint for some workloads.

By default, at least half of the pages have to be unshared.

[colin.king@canonical.com: fix several spelling mistakes]
Link: http://lkml.kernel.org/r/20200420084241.65433-1-colin.king@canonical.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Link: http://lkml.kernel.org/r/20200416160026.16538-9-kirill.shutemov@linux.intel.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Documentation/admin-guide/mm/transhuge.rst
include/trace/events/huge_memory.h
mm/khugepaged.c
tools/testing/selftests/vm/khugepaged.c

index 2f31de8..6a233e4 100644 (file)
@@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being
 collapsed, resulting fewer pages being collapsed into
 THPs, and lower memory access performance.
 
+``max_ptes_shared`` specifies how many pages can be shared across multiple
+processes. Exceeding the number would block the collapse::
+
+       /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared
+
+A higher value may increase memory footprint for some workloads.
+
 Boot parameter
 ==============
 
index 70e32ff..4fdb14a 100644 (file)
@@ -12,6 +12,8 @@
        EM( SCAN_SUCCEED,               "succeeded")                    \
        EM( SCAN_PMD_NULL,              "pmd_null")                     \
        EM( SCAN_EXCEED_NONE_PTE,       "exceed_none_pte")              \
+       EM( SCAN_EXCEED_SWAP_PTE,       "exceed_swap_pte")              \
+       EM( SCAN_EXCEED_SHARED_PTE,     "exceed_shared_pte")            \
        EM( SCAN_PTE_NON_PRESENT,       "pte_non_present")              \
        EM( SCAN_PTE_UFFD_WP,           "pte_uffd_wp")                  \
        EM( SCAN_PAGE_RO,               "no_writable_page")             \
@@ -31,7 +33,6 @@
        EM( SCAN_DEL_PAGE_LRU,          "could_not_delete_page_from_lru")\
        EM( SCAN_ALLOC_HUGE_PAGE_FAIL,  "alloc_huge_page_failed")       \
        EM( SCAN_CGROUP_CHARGE_FAIL,    "ccgroup_charge_failed")        \
-       EM( SCAN_EXCEED_SWAP_PTE,       "exceed_swap_pte")              \
        EM( SCAN_TRUNCATED,             "truncated")                    \
        EMe(SCAN_PAGE_HAS_PRIVATE,      "page_has_private")             \
 
index 1bae663..e7897a7 100644 (file)
@@ -28,6 +28,8 @@ enum scan_result {
        SCAN_SUCCEED,
        SCAN_PMD_NULL,
        SCAN_EXCEED_NONE_PTE,
+       SCAN_EXCEED_SWAP_PTE,
+       SCAN_EXCEED_SHARED_PTE,
        SCAN_PTE_NON_PRESENT,
        SCAN_PTE_UFFD_WP,
        SCAN_PAGE_RO,
@@ -47,7 +49,6 @@ enum scan_result {
        SCAN_DEL_PAGE_LRU,
        SCAN_ALLOC_HUGE_PAGE_FAIL,
        SCAN_CGROUP_CHARGE_FAIL,
-       SCAN_EXCEED_SWAP_PTE,
        SCAN_TRUNCATED,
        SCAN_PAGE_HAS_PRIVATE,
 };
@@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  */
 static unsigned int khugepaged_max_ptes_none __read_mostly;
 static unsigned int khugepaged_max_ptes_swap __read_mostly;
+static unsigned int khugepaged_max_ptes_shared __read_mostly;
 
 #define MM_SLOTS_HASH_BITS 10
 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
        __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
               khugepaged_max_ptes_swap_store);
 
+static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
+                                            struct kobj_attribute *attr,
+                                            char *buf)
+{
+       return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
+}
+
+static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
+                                             struct kobj_attribute *attr,
+                                             const char *buf, size_t count)
+{
+       int err;
+       unsigned long max_ptes_shared;
+
+       err  = kstrtoul(buf, 10, &max_ptes_shared);
+       if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+               return -EINVAL;
+
+       khugepaged_max_ptes_shared = max_ptes_shared;
+
+       return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_shared_attr =
+       __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
+              khugepaged_max_ptes_shared_store);
+
 static struct attribute *khugepaged_attr[] = {
        &khugepaged_defrag_attr.attr,
        &khugepaged_max_ptes_none_attr.attr,
+       &khugepaged_max_ptes_swap_attr.attr,
+       &khugepaged_max_ptes_shared_attr.attr,
        &pages_to_scan_attr.attr,
        &pages_collapsed_attr.attr,
        &full_scans_attr.attr,
        &scan_sleep_millisecs_attr.attr,
        &alloc_sleep_millisecs_attr.attr,
-       &khugepaged_max_ptes_swap_attr.attr,
        NULL,
 };
 
@@ -359,6 +389,7 @@ int __init khugepaged_init(void)
        khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
        khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
        khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
+       khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
 
        return 0;
 }
@@ -557,7 +588,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 {
        struct page *page = NULL;
        pte_t *_pte;
-       int none_or_zero = 0, result = 0, referenced = 0;
+       int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
        bool writable = false;
 
        for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
@@ -585,6 +616,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
                VM_BUG_ON_PAGE(!PageAnon(page), page);
 
+               if (page_mapcount(page) > 1 &&
+                               ++shared > khugepaged_max_ptes_shared) {
+                       result = SCAN_EXCEED_SHARED_PTE;
+                       goto out;
+               }
+
                if (PageCompound(page)) {
                        struct page *p;
                        page = compound_head(page);
@@ -1168,7 +1205,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
        pmd_t *pmd;
        pte_t *pte, *_pte;
-       int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
+       int ret = 0, result = 0, referenced = 0;
+       int none_or_zero = 0, shared = 0;
        struct page *page = NULL;
        unsigned long _address;
        spinlock_t *ptl;
@@ -1240,6 +1278,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                        goto out_unmap;
                }
 
+               if (page_mapcount(page) > 1 &&
+                               ++shared > khugepaged_max_ptes_shared) {
+                       result = SCAN_EXCEED_SHARED_PTE;
+                       goto out_unmap;
+               }
+
                page = compound_head(page);
 
                /*
index ef67a8a..51b89ce 100644 (file)
@@ -78,6 +78,7 @@ struct khugepaged_settings {
        unsigned int scan_sleep_millisecs;
        unsigned int max_ptes_none;
        unsigned int max_ptes_swap;
+       unsigned int max_ptes_shared;
        unsigned long pages_to_scan;
 };
 
@@ -277,6 +278,7 @@ static void write_settings(struct settings *settings)
                        khugepaged->scan_sleep_millisecs);
        write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
        write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
+       write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
        write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
 }
 
@@ -313,6 +315,7 @@ static void save_settings(void)
                        read_num("khugepaged/scan_sleep_millisecs"),
                .max_ptes_none = read_num("khugepaged/max_ptes_none"),
                .max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
+               .max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
                .pages_to_scan = read_num("khugepaged/pages_to_scan"),
        };
        success("OK");
@@ -896,12 +899,90 @@ static void collapse_fork_compound(void)
                        fail("Fail");
                fill_memory(p, 0, page_size);
 
+               write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
                if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
                        fail("Timeout");
                else if (check_huge(p))
                        success("OK");
                else
                        fail("Fail");
+               write_num("khugepaged/max_ptes_shared",
+                               default_settings.khugepaged.max_ptes_shared);
+
+               validate_memory(p, 0, hpage_pmd_size);
+               munmap(p, hpage_pmd_size);
+               exit(exit_status);
+       }
+
+       wait(&wstatus);
+       exit_status += WEXITSTATUS(wstatus);
+
+       printf("Check if parent still has huge page...");
+       if (check_huge(p))
+               success("OK");
+       else
+               fail("Fail");
+       validate_memory(p, 0, hpage_pmd_size);
+       munmap(p, hpage_pmd_size);
+}
+
+static void collapse_max_ptes_shared()
+{
+       int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
+       int wstatus;
+       void *p;
+
+       p = alloc_mapping();
+
+       printf("Allocate huge page...");
+       madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+       fill_memory(p, 0, hpage_pmd_size);
+       if (check_huge(p))
+               success("OK");
+       else
+               fail("Fail");
+
+       printf("Share huge page over fork()...");
+       if (!fork()) {
+               /* Do not touch settings on child exit */
+               skip_settings_restore = true;
+               exit_status = 0;
+
+               if (check_huge(p))
+                       success("OK");
+               else
+                       fail("Fail");
+
+               printf("Trigger CoW on page %d of %d...",
+                               hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+               fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
+               if (!check_huge(p))
+                       success("OK");
+               else
+                       fail("Fail");
+
+               if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p))
+                       fail("Timeout");
+               else if (!check_huge(p))
+                       success("OK");
+               else
+                       fail("Fail");
+
+               printf("Trigger CoW on page %d of %d...",
+                               hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+               fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
+               if (!check_huge(p))
+                       success("OK");
+               else
+                       fail("Fail");
+
+
+               if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
+                       fail("Timeout");
+               else if (check_huge(p))
+                       success("OK");
+               else
+                       fail("Fail");
 
                validate_memory(p, 0, hpage_pmd_size);
                munmap(p, hpage_pmd_size);
@@ -930,6 +1011,7 @@ int main(void)
 
        default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
        default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
+       default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
        default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
 
        save_settings();
@@ -947,6 +1029,7 @@ int main(void)
        collapse_compound_extreme();
        collapse_fork();
        collapse_fork_compound();
+       collapse_max_ptes_shared();
 
        restore_settings(0);
 }