mm/hugetlb_vmemmap.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Optimize vmemmap pages associated with HugeTLB
 *
 * Copyright (c) 2020, Bytedance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)     "HugeTLB: " fmt

#include <linux/memory.h>
#include "hugetlb_vmemmap.h"

/*
 * There are a lot of struct page structures associated with each HugeTLB page.
 * For tail pages, the value of compound_head is the same, so the contents of
 * the tail struct pages are identical. We remap the virtual addresses of all
 * the vmemmap pages that hold only tail struct pages to the first vmemmap
 * page, which holds the head struct page, and then free the now-unused page
 * frames. Therefore, we only need to reserve one page as the vmemmap area.
 */
#define RESERVE_VMEMMAP_NR              1U
#define RESERVE_VMEMMAP_SIZE            (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
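/*
 * For example, assuming 4 KiB base pages and a 64-byte struct page (both are
 * config dependent), a 2 MiB HugeTLB page has 512 struct pages spanning 8
 * vmemmap pages: the first one is kept as the reused page and the remaining
 * 7 can be remapped to it and returned to the buddy allocator.
 */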

enum vmemmap_optimize_mode {
        VMEMMAP_OPTIMIZE_OFF,
        VMEMMAP_OPTIMIZE_ON,
};

DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
                        hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static enum vmemmap_optimize_mode vmemmap_optimize_mode =
        IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);

static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
{
        if (vmemmap_optimize_mode == to)
                return;

        if (to == VMEMMAP_OPTIMIZE_OFF)
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        else
                static_branch_inc(&hugetlb_optimize_vmemmap_key);
        WRITE_ONCE(vmemmap_optimize_mode, to);
}

static int __init hugetlb_vmemmap_early_param(char *buf)
{
        bool enable;
        enum vmemmap_optimize_mode mode;

        if (kstrtobool(buf, &enable))
                return -EINVAL;

        mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
        vmemmap_optimize_mode_switch(mode);

        return 0;
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
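/*
 * Usage example: booting with "hugetlb_free_vmemmap=on" (or "off") on the
 * kernel command line selects the initial mode; kstrtobool() also accepts
 * the usual "1"/"0" and "y"/"n" spellings.
 */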

/*
 * Previously discarded vmemmap pages will have been allocated and remapped
 * once this function returns zero.
 */
int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
{
        int ret;
        unsigned long vmemmap_addr = (unsigned long)head;
        unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;

        if (!HPageVmemmapOptimized(head))
                return 0;

        vmemmap_addr    += RESERVE_VMEMMAP_SIZE;
        vmemmap_pages   = hugetlb_optimize_vmemmap_pages(h);
        vmemmap_end     = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
        vmemmap_reuse   = vmemmap_addr - PAGE_SIZE;

        /*
         * The pages which the vmemmap virtual address range [@vmemmap_addr,
         * @vmemmap_end) is mapped to were freed to the buddy allocator, and
         * the range was remapped to the page which @vmemmap_reuse is mapped
         * to. When a HugeTLB page is freed back to the buddy allocator, the
         * previously discarded vmemmap pages must be allocated and remapped.
         */
        ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
                                  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
        if (!ret) {
                ClearHPageVmemmapOptimized(head);
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        }

        return ret;
}

static unsigned int vmemmap_optimizable_pages(struct hstate *h,
                                              struct page *head)
{
        if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
                return 0;

        if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
                pmd_t *pmdp, pmd;
                struct page *vmemmap_page;
                unsigned long vaddr = (unsigned long)head;

                /*
                 * Only the vmemmap page's vmemmap page can be self-hosted.
                 * Walk the page tables to find the page that backs this
                 * vmemmap page.
                 */
                pmdp = pmd_off_k(vaddr);
                /*
                 * The READ_ONCE() is used to stabilize *pmdp in a register or
                 * on the stack so that it stops changing under us. The only
                 * concurrent operation that can change it is
                 * split_vmemmap_huge_pmd() (*pmdp will be stable after that
                 * operation completes).
                 */
                pmd = READ_ONCE(*pmdp);
                if (pmd_leaf(pmd))
                        vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
                else
                        vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
                /*
                 * Because of the HugeTLB alignment requirements, and because
                 * the vmemmap pages sit at the start of the hotplugged memory
                 * region in the memory_hotplug.memmap_on_memory case, checking
                 * whether any one of this HugeTLB page's vmemmap pages is
                 * marked VmemmapSelfHosted is sufficient.
                 *
                 * [                  hotplugged memory                  ]
                 * [        section        ][...][        section        ]
                 * [ vmemmap ][              usable memory               ]
                 *   ^   |     |                                        |
                 *   +---+     |                                        |
                 *     ^       |                                        |
                 *     +-------+                                        |
                 *          ^                                           |
                 *          +-------------------------------------------+
                 */
                if (PageVmemmapSelfHosted(vmemmap_page))
                        return 0;
        }

        return hugetlb_optimize_vmemmap_pages(h);
}

void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
        unsigned long vmemmap_addr = (unsigned long)head;
        unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;

        vmemmap_pages = vmemmap_optimizable_pages(h, head);
        if (!vmemmap_pages)
                return;

        static_branch_inc(&hugetlb_optimize_vmemmap_key);

        vmemmap_addr    += RESERVE_VMEMMAP_SIZE;
        vmemmap_end     = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
        vmemmap_reuse   = vmemmap_addr - PAGE_SIZE;

        /*
         * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
         * to the page which @vmemmap_reuse is mapped to, then free the pages
         * which the range [@vmemmap_addr, @vmemmap_end) is mapped to.
         */
        if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        else
                SetHPageVmemmapOptimized(head);
}

void __init hugetlb_vmemmap_init(struct hstate *h)
{
        unsigned int nr_pages = pages_per_huge_page(h);
        unsigned int vmemmap_pages;

        /*
         * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
         * pages that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is
         * enabled, so add a BUILD_BUG_ON to catch invalid usage of the tail
         * struct pages.
         */
        BUILD_BUG_ON(__NR_USED_SUBPAGE >=
                     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
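        /*
         * Illustration (assuming 4 KiB pages and a 64-byte struct page): the
         * bound above is 4096 / 64 = 64, i.e. HugeTLB metadata may only live
         * in struct pages backed by the single reserved vmemmap page.
         */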

        if (!is_power_of_2(sizeof(struct page))) {
                pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
                static_branch_disable(&hugetlb_optimize_vmemmap_key);
                return;
        }

        vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
        /*
         * The first vmemmap page, which holds the head struct page, is not
         * freed to the buddy allocator; the other vmemmap pages, which hold
         * only tail struct pages, are remapped to it and can therefore be
         * freed.
         *
         * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
         * on some architectures (e.g. aarch64). See Documentation/arm64/
         * hugetlbpage.rst for more details.
         */
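        /*
         * Illustration of the aarch64 case (assuming 64 KiB base pages and a
         * 64-byte struct page): a 2 MiB contiguous-PTE HugeTLB page has only
         * 32 struct pages (2 KiB of vmemmap), so @vmemmap_pages computes to 0
         * and nothing can be optimized for that hstate.
         */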
        if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
                h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;

        pr_info("can optimize %d vmemmap pages for %s\n",
                h->optimize_vmemmap_pages, h->name);
}

#ifdef CONFIG_PROC_SYSCTL
static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write,
                                            void *buffer, size_t *length,
                                            loff_t *ppos)
{
        int ret;
        enum vmemmap_optimize_mode mode;
        static DEFINE_MUTEX(sysctl_mutex);

        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        mutex_lock(&sysctl_mutex);
        mode = vmemmap_optimize_mode;
        table->data = &mode;
        ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (write && !ret)
                vmemmap_optimize_mode_switch(mode);
        mutex_unlock(&sysctl_mutex);

        return ret;
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
        {
                .procname       = "hugetlb_optimize_vmemmap",
                .maxlen         = sizeof(enum vmemmap_optimize_mode),
                .mode           = 0644,
                .proc_handler   = hugetlb_optimize_vmemmap_handler,
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE,
        },
        { }
};
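/*
 * Usage example: once this table is registered, the mode can be flipped at
 * runtime, e.g. "echo 1 > /proc/sys/vm/hugetlb_optimize_vmemmap". The change
 * only affects HugeTLB pages allocated afterwards; already-optimized pages
 * stay optimized until they are freed.
 */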

static __init int hugetlb_vmemmap_sysctls_init(void)
{
        /*
         * If "struct page" crosses page boundaries, the vmemmap pages cannot
         * be optimized.
         */
        if (is_power_of_2(sizeof(struct page)))
                register_sysctl_init("vm", hugetlb_vmemmap_sysctls);

        return 0;
}
late_initcall(hugetlb_vmemmap_sysctls_init);
#endif /* CONFIG_PROC_SYSCTL */