mm/hugetlb_vmemmap.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * HugeTLB Vmemmap Optimization (HVO)
   4  *
   5  * Copyright (c) 2020, ByteDance. All rights reserved.
   6  *
   7  *     Author: Muchun Song <songmuchun@bytedance.com>
   8  *
   9  * See Documentation/mm/vmemmap_dedup.rst
  10  */
  11 #define pr_fmt(fmt)     "HugeTLB: " fmt
  12
  13 #include <linux/pgtable.h>
  14 #include <linux/moduleparam.h>
  15 #include <linux/bootmem_info.h>
  16 #include <linux/mmdebug.h>
  17 #include <asm/pgalloc.h>
  18 #include <asm/tlbflush.h>
  19 #include "hugetlb_vmemmap.h"
  20
  21 /**
  22  * struct vmemmap_remap_walk - walk vmemmap page table
  23  *
  24  * @remap_pte:          called for each lowest-level entry (PTE).
  25  * @nr_walked:          the number of walked pte.
  26  * @reuse_page:         the page which is reused for the tail vmemmap pages.
  27  * @reuse_addr:         the virtual address of the @reuse_page page.
  28  * @vmemmap_pages:      the list head of the vmemmap pages that can be freed
  29  *                      or is mapped from.
  30  * @flags:              used to modify behavior in vmemmap page table walking
  31  *                      operations.
  32  */
  33 struct vmemmap_remap_walk {
  34         void                    (*remap_pte)(pte_t *pte, unsigned long addr,
  35                                              struct vmemmap_remap_walk *walk);
  36         unsigned long           nr_walked;
  37         struct page             *reuse_page;
  38         unsigned long           reuse_addr;
  39         struct list_head        *vmemmap_pages;
  40
  41 /* Skip the TLB flush when we split the PMD */
  42 #define VMEMMAP_SPLIT_NO_TLB_FLUSH      BIT(0)
  43 /* Skip the TLB flush when we remap the PTE */
  44 #define VMEMMAP_REMAP_NO_TLB_FLUSH      BIT(1)
  45         unsigned long           flags;
  46 };
  47
  48 static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush)
  49 {
  50         pmd_t __pmd;
  51         int i;
  52         unsigned long addr = start;
  53         struct page *head;
  54         pte_t *pgtable;
  55
  56         spin_lock(&init_mm.page_table_lock);
  57         head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
  58         spin_unlock(&init_mm.page_table_lock);
  59
  60         if (!head)
  61                 return 0;
  62
  63         pgtable = pte_alloc_one_kernel(&init_mm);
  64         if (!pgtable)
  65                 return -ENOMEM;
  66
  67         pmd_populate_kernel(&init_mm, &__pmd, pgtable);
  68
  69         for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
  70                 pte_t entry, *pte;
  71                 pgprot_t pgprot = PAGE_KERNEL;
  72
  73                 entry = mk_pte(head + i, pgprot);
  74                 pte = pte_offset_kernel(&__pmd, addr);
  75                 set_pte_at(&init_mm, addr, pte, entry);
  76         }
  77
  78         spin_lock(&init_mm.page_table_lock);
  79         if (likely(pmd_leaf(*pmd))) {
  80                 /*
  81                  * Higher order allocations from buddy allocator must be able to
  82                  * be treated as indepdenent small pages (as they can be freed
  83                  * individually).
  84                  */
  85                 if (!PageReserved(head))
  86                         split_page(head, get_order(PMD_SIZE));
  87
  88                 /* Make pte visible before pmd. See comment in pmd_install(). */
  89                 smp_wmb();
  90                 pmd_populate_kernel(&init_mm, pmd, pgtable);
  91                 if (flush)
  92                         flush_tlb_kernel_range(start, start + PMD_SIZE);
  93         } else {
  94                 pte_free_kernel(&init_mm, pgtable);
  95         }
  96         spin_unlock(&init_mm.page_table_lock);
  97
  98         return 0;
  99 }
 100
 101 static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
 102                               unsigned long end,
 103                               struct vmemmap_remap_walk *walk)
 104 {
 105         pte_t *pte = pte_offset_kernel(pmd, addr);
 106
 107         /*
 108          * The reuse_page is found 'first' in table walk before we start
 109          * remapping (which is calling @walk->remap_pte).
 110          */
 111         if (!walk->reuse_page) {
 112                 walk->reuse_page = pte_page(ptep_get(pte));
 113                 /*
 114                  * Because the reuse address is part of the range that we are
 115                  * walking, skip the reuse address range.
 116                  */
 117                 addr += PAGE_SIZE;
 118                 pte++;
 119                 walk->nr_walked++;
 120         }
 121
 122         for (; addr != end; addr += PAGE_SIZE, pte++) {
 123                 walk->remap_pte(pte, addr, walk);
 124                 walk->nr_walked++;
 125         }
 126 }
 127
 128 static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
 129                              unsigned long end,
 130                              struct vmemmap_remap_walk *walk)
 131 {
 132         pmd_t *pmd;
 133         unsigned long next;
 134
 135         pmd = pmd_offset(pud, addr);
 136         do {
 137                 int ret;
 138
 139                 ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK,
 140                                 !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH));
 141                 if (ret)
 142                         return ret;
 143
 144                 next = pmd_addr_end(addr, end);
 145
 146                 /*
 147                  * We are only splitting, not remapping the hugetlb vmemmap
 148                  * pages.
 149                  */
 150                 if (!walk->remap_pte)
 151                         continue;
 152
 153                 vmemmap_pte_range(pmd, addr, next, walk);
 154         } while (pmd++, addr = next, addr != end);
 155
 156         return 0;
 157 }
 158
 159 static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
 160                              unsigned long end,
 161                              struct vmemmap_remap_walk *walk)
 162 {
 163         pud_t *pud;
 164         unsigned long next;
 165
 166         pud = pud_offset(p4d, addr);
 167         do {
 168                 int ret;
 169
 170                 next = pud_addr_end(addr, end);
 171                 ret = vmemmap_pmd_range(pud, addr, next, walk);
 172                 if (ret)
 173                         return ret;
 174         } while (pud++, addr = next, addr != end);
 175
 176         return 0;
 177 }
 178
 179 static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
 180                              unsigned long end,
 181                              struct vmemmap_remap_walk *walk)
 182 {
 183         p4d_t *p4d;
 184         unsigned long next;
 185
 186         p4d = p4d_offset(pgd, addr);
 187         do {
 188                 int ret;
 189
 190                 next = p4d_addr_end(addr, end);
 191                 ret = vmemmap_pud_range(p4d, addr, next, walk);
 192                 if (ret)
 193                         return ret;
 194         } while (p4d++, addr = next, addr != end);
 195
 196         return 0;
 197 }
 198
 199 static int vmemmap_remap_range(unsigned long start, unsigned long end,
 200                                struct vmemmap_remap_walk *walk)
 201 {
 202         unsigned long addr = start;
 203         unsigned long next;
 204         pgd_t *pgd;
 205
 206         VM_BUG_ON(!PAGE_ALIGNED(start));
 207         VM_BUG_ON(!PAGE_ALIGNED(end));
 208
 209         pgd = pgd_offset_k(addr);
 210         do {
 211                 int ret;
 212
 213                 next = pgd_addr_end(addr, end);
 214                 ret = vmemmap_p4d_range(pgd, addr, next, walk);
 215                 if (ret)
 216                         return ret;
 217         } while (pgd++, addr = next, addr != end);
 218
 219         if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
 220                 flush_tlb_kernel_range(start, end);
 221
 222         return 0;
 223 }
 224
 225 /*
 226  * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 227  * allocator or buddy allocator. If the PG_reserved flag is set, it means
 228  * that it allocated from the memblock allocator, just free it via the
 229  * free_bootmem_page(). Otherwise, use __free_page().
 230  */
 231 static inline void free_vmemmap_page(struct page *page)
 232 {
 233         if (PageReserved(page))
 234                 free_bootmem_page(page);
 235         else
 236                 __free_page(page);
 237 }
 238
 239 /* Free a list of the vmemmap pages */
 240 static void free_vmemmap_page_list(struct list_head *list)
 241 {
 242         struct page *page, *next;
 243
 244         list_for_each_entry_safe(page, next, list, lru)
 245                 free_vmemmap_page(page);
 246 }
 247
 248 static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
 249                               struct vmemmap_remap_walk *walk)
 250 {
 251         /*
 252          * Remap the tail pages as read-only to catch illegal write operation
 253          * to the tail pages.
 254          */
 255         pgprot_t pgprot = PAGE_KERNEL_RO;
 256         struct page *page = pte_page(ptep_get(pte));
 257         pte_t entry;
 258
 259         /* Remapping the head page requires r/w */
 260         if (unlikely(addr == walk->reuse_addr)) {
 261                 pgprot = PAGE_KERNEL;
 262                 list_del(&walk->reuse_page->lru);
 263
 264                 /*
 265                  * Makes sure that preceding stores to the page contents from
 266                  * vmemmap_remap_free() become visible before the set_pte_at()
 267                  * write.
 268                  */
 269                 smp_wmb();
 270         }
 271
 272         entry = mk_pte(walk->reuse_page, pgprot);
 273         list_add(&page->lru, walk->vmemmap_pages);
 274         set_pte_at(&init_mm, addr, pte, entry);
 275 }
 276
 277 /*
 278  * How many struct page structs need to be reset. When we reuse the head
 279  * struct page, the special metadata (e.g. page->flags or page->mapping)
 280  * cannot copy to the tail struct page structs. The invalid value will be
 281  * checked in the free_tail_page_prepare(). In order to avoid the message
 282  * of "corrupted mapping in tail page". We need to reset at least 3 (one
 283  * head struct page struct and two tail struct page structs) struct page
 284  * structs.
 285  */
 286 #define NR_RESET_STRUCT_PAGE            3
 287
 288 static inline void reset_struct_pages(struct page *start)
 289 {
 290         struct page *from = start + NR_RESET_STRUCT_PAGE;
 291
 292         BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
 293         memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
 294 }
 295
 296 static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
 297                                 struct vmemmap_remap_walk *walk)
 298 {
 299         pgprot_t pgprot = PAGE_KERNEL;
 300         struct page *page;
 301         void *to;
 302
 303         BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
 304
 305         page = list_first_entry(walk->vmemmap_pages, struct page, lru);
 306         list_del(&page->lru);
 307         to = page_to_virt(page);
 308         copy_page(to, (void *)walk->reuse_addr);
 309         reset_struct_pages(to);
 310
 311         /*
 312          * Makes sure that preceding stores to the page contents become visible
 313          * before the set_pte_at() write.
 314          */
 315         smp_wmb();
 316         set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
 317 }
 318
 319 /**
 320  * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
 321  *                      backing PMDs of the directmap into PTEs
 322  * @start:     start address of the vmemmap virtual address range that we want
 323  *             to remap.
 324  * @end:       end address of the vmemmap virtual address range that we want to
 325  *             remap.
 326  * @reuse:     reuse address.
 327  *
 328  * Return: %0 on success, negative error code otherwise.
 329  */
 330 static int vmemmap_remap_split(unsigned long start, unsigned long end,
 331                                 unsigned long reuse)
 332 {
 333         int ret;
 334         struct vmemmap_remap_walk walk = {
 335                 .remap_pte      = NULL,
 336                 .flags          = VMEMMAP_SPLIT_NO_TLB_FLUSH,
 337         };
 338
 339         /* See the comment in the vmemmap_remap_free(). */
 340         BUG_ON(start - reuse != PAGE_SIZE);
 341
 342         mmap_read_lock(&init_mm);
 343         ret = vmemmap_remap_range(reuse, end, &walk);
 344         mmap_read_unlock(&init_mm);
 345
 346         return ret;
 347 }
 348
 349 /**
 350  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 351  *                      to the page which @reuse is mapped to, then free vmemmap
 352  *                      which the range are mapped to.
 353  * @start:      start address of the vmemmap virtual address range that we want
 354  *              to remap.
 355  * @end:        end address of the vmemmap virtual address range that we want to
 356  *              remap.
 357  * @reuse:      reuse address.
 358  * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is callers
 359  *              responsibility to free pages.
 360  * @flags:      modifications to vmemmap_remap_walk flags
 361  *
 362  * Return: %0 on success, negative error code otherwise.
 363  */
 364 static int vmemmap_remap_free(unsigned long start, unsigned long end,
 365                               unsigned long reuse,
 366                               struct list_head *vmemmap_pages,
 367                               unsigned long flags)
 368 {
 369         int ret;
 370         struct vmemmap_remap_walk walk = {
 371                 .remap_pte      = vmemmap_remap_pte,
 372                 .reuse_addr     = reuse,
 373                 .vmemmap_pages  = vmemmap_pages,
 374                 .flags          = flags,
 375         };
 376         int nid = page_to_nid((struct page *)reuse);
 377         gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
 378
 379         /*
 380          * Allocate a new head vmemmap page to avoid breaking a contiguous
 381          * block of struct page memory when freeing it back to page allocator
 382          * in free_vmemmap_page_list(). This will allow the likely contiguous
 383          * struct page backing memory to be kept contiguous and allowing for
 384          * more allocations of hugepages. Fallback to the currently
 385          * mapped head page in case should it fail to allocate.
 386          */
 387         walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
 388         if (walk.reuse_page) {
 389                 copy_page(page_to_virt(walk.reuse_page),
 390                           (void *)walk.reuse_addr);
 391                 list_add(&walk.reuse_page->lru, vmemmap_pages);
 392         }
 393
 394         /*
 395          * In order to make remapping routine most efficient for the huge pages,
 396          * the routine of vmemmap page table walking has the following rules
 397          * (see more details from the vmemmap_pte_range()):
 398          *
 399          * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
 400          *   should be continuous.
 401          * - The @reuse address is part of the range [@reuse, @end) that we are
 402          *   walking which is passed to vmemmap_remap_range().
 403          * - The @reuse address is the first in the complete range.
 404          *
 405          * So we need to make sure that @start and @reuse meet the above rules.
 406          */
 407         BUG_ON(start - reuse != PAGE_SIZE);
 408
 409         mmap_read_lock(&init_mm);
 410         ret = vmemmap_remap_range(reuse, end, &walk);
 411         if (ret && walk.nr_walked) {
 412                 end = reuse + walk.nr_walked * PAGE_SIZE;
 413                 /*
 414                  * vmemmap_pages contains pages from the previous
 415                  * vmemmap_remap_range call which failed.  These
 416                  * are pages which were removed from the vmemmap.
 417                  * They will be restored in the following call.
 418                  */
 419                 walk = (struct vmemmap_remap_walk) {
 420                         .remap_pte      = vmemmap_restore_pte,
 421                         .reuse_addr     = reuse,
 422                         .vmemmap_pages  = vmemmap_pages,
 423                         .flags          = 0,
 424                 };
 425
 426                 vmemmap_remap_range(reuse, end, &walk);
 427         }
 428         mmap_read_unlock(&init_mm);
 429
 430         return ret;
 431 }
 432
 433 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
 434                                    struct list_head *list)
 435 {
 436         gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
 437         unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
 438         int nid = page_to_nid((struct page *)start);
 439         struct page *page, *next;
 440
 441         while (nr_pages--) {
 442                 page = alloc_pages_node(nid, gfp_mask, 0);
 443                 if (!page)
 444                         goto out;
 445                 list_add(&page->lru, list);
 446         }
 447
 448         return 0;
 449 out:
 450         list_for_each_entry_safe(page, next, list, lru)
 451                 __free_page(page);
 452         return -ENOMEM;
 453 }
 454
 455 /**
 456  * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
 457  *                       to the page which is from the @vmemmap_pages
 458  *                       respectively.
 459  * @start:      start address of the vmemmap virtual address range that we want
 460  *              to remap.
 461  * @end:        end address of the vmemmap virtual address range that we want to
 462  *              remap.
 463  * @reuse:      reuse address.
 464  * @flags:      modifications to vmemmap_remap_walk flags
 465  *
 466  * Return: %0 on success, negative error code otherwise.
 467  */
 468 static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
 469                                unsigned long reuse, unsigned long flags)
 470 {
 471         LIST_HEAD(vmemmap_pages);
 472         struct vmemmap_remap_walk walk = {
 473                 .remap_pte      = vmemmap_restore_pte,
 474                 .reuse_addr     = reuse,
 475                 .vmemmap_pages  = &vmemmap_pages,
 476                 .flags          = flags,
 477         };
 478
 479         /* See the comment in the vmemmap_remap_free(). */
 480         BUG_ON(start - reuse != PAGE_SIZE);
 481
 482         if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
 483                 return -ENOMEM;
 484
 485         mmap_read_lock(&init_mm);
 486         vmemmap_remap_range(reuse, end, &walk);
 487         mmap_read_unlock(&init_mm);
 488
 489         return 0;
 490 }
 491
 492 DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
 493 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
 494
 495 static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
 496 core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
 497
 498 static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags)
 499 {
 500         int ret;
 501         struct page *head = &folio->page;
 502         unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
 503         unsigned long vmemmap_reuse;
 504
 505         VM_WARN_ON_ONCE(!PageHuge(head));
 506         if (!folio_test_hugetlb_vmemmap_optimized(folio))
 507                 return 0;
 508
 509         vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
 510         vmemmap_reuse   = vmemmap_start;
 511         vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
 512
 513         /*
 514          * The pages which the vmemmap virtual address range [@vmemmap_start,
 515          * @vmemmap_end) are mapped to are freed to the buddy allocator, and
 516          * the range is mapped to the page which @vmemmap_reuse is mapped to.
 517          * When a HugeTLB page is freed to the buddy allocator, previously
 518          * discarded vmemmap pages must be allocated and remapping.
 519          */
 520         ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
 521         if (!ret) {
 522                 folio_clear_hugetlb_vmemmap_optimized(folio);
 523                 static_branch_dec(&hugetlb_optimize_vmemmap_key);
 524         }
 525
 526         return ret;
 527 }
 528
 529 /**
 530  * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 531  *                              hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 532  *                              will be reallocated and remapped.
 533  * @h:          struct hstate.
 534  * @folio:     the folio whose vmemmap pages will be restored.
 535  *
 536  * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 537  * negative error code otherwise.
 538  */
 539 int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
 540 {
 541         return __hugetlb_vmemmap_restore_folio(h, folio, 0);
 542 }
 543
 544 /**
 545  * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 546  * @h:                  hstate.
 547  * @folio_list:         list of folios.
 548  * @non_hvo_folios:     Output list of folios for which vmemmap exists.
 549  *
 550  * Return: number of folios for which vmemmap was restored, or an error code
 551  *              if an error was encountered restoring vmemmap for a folio.
 552  *              Folios that have vmemmap are moved to the non_hvo_folios
 553  *              list.  Processing of entries stops when the first error is
 554  *              encountered. The folio that experienced the error and all
 555  *              non-processed folios will remain on folio_list.
 556  */
 557 long hugetlb_vmemmap_restore_folios(const struct hstate *h,
 558                                         struct list_head *folio_list,
 559                                         struct list_head *non_hvo_folios)
 560 {
 561         struct folio *folio, *t_folio;
 562         long restored = 0;
 563         long ret = 0;
 564
 565         list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
 566                 if (folio_test_hugetlb_vmemmap_optimized(folio)) {
 567                         ret = __hugetlb_vmemmap_restore_folio(h, folio,
 568                                                 VMEMMAP_REMAP_NO_TLB_FLUSH);
 569                         if (ret)
 570                                 break;
 571                         restored++;
 572                 }
 573
 574                 /* Add non-optimized folios to output list */
 575                 list_move(&folio->lru, non_hvo_folios);
 576         }
 577
 578         if (restored)
 579                 flush_tlb_all();
 580         if (!ret)
 581                 ret = restored;
 582         return ret;
 583 }
 584
 585 /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
 586 static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
 587 {
 588         if (HPageVmemmapOptimized((struct page *)head))
 589                 return false;
 590
 591         if (!READ_ONCE(vmemmap_optimize_enabled))
 592                 return false;
 593
 594         if (!hugetlb_vmemmap_optimizable(h))
 595                 return false;
 596
 597         if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
 598                 pmd_t *pmdp, pmd;
 599                 struct page *vmemmap_page;
 600                 unsigned long vaddr = (unsigned long)head;
 601
 602                 /*
 603                  * Only the vmemmap page's vmemmap page can be self-hosted.
 604                  * Walking the page tables to find the backing page of the
 605                  * vmemmap page.
 606                  */
 607                 pmdp = pmd_off_k(vaddr);
 608                 /*
 609                  * The READ_ONCE() is used to stabilize *pmdp in a register or
 610                  * on the stack so that it will stop changing under the code.
 611                  * The only concurrent operation where it can be changed is
 612                  * split_vmemmap_huge_pmd() (*pmdp will be stable after this
 613                  * operation).
 614                  */
 615                 pmd = READ_ONCE(*pmdp);
 616                 if (pmd_leaf(pmd))
 617                         vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
 618                 else
 619                         vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
 620                 /*
 621                  * Due to HugeTLB alignment requirements and the vmemmap pages
 622                  * being at the start of the hotplugged memory region in
 623                  * memory_hotplug.memmap_on_memory case. Checking any vmemmap
 624                  * page's vmemmap page if it is marked as VmemmapSelfHosted is
 625                  * sufficient.
 626                  *
 627                  * [                  hotplugged memory                  ]
 628                  * [        section        ][...][        section        ]
 629                  * [ vmemmap ][              usable memory               ]
 630                  *   ^   |     |                                        |
 631                  *   +---+     |                                        |
 632                  *     ^       |                                        |
 633                  *     +-------+                                        |
 634                  *          ^                                           |
 635                  *          +-------------------------------------------+
 636                  */
 637                 if (PageVmemmapSelfHosted(vmemmap_page))
 638                         return false;
 639         }
 640
 641         return true;
 642 }
 643
 644 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 645                                         struct folio *folio,
 646                                         struct list_head *vmemmap_pages,
 647                                         unsigned long flags)
 648 {
 649         int ret = 0;
 650         struct page *head = &folio->page;
 651         unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
 652         unsigned long vmemmap_reuse;
 653
 654         VM_WARN_ON_ONCE(!PageHuge(head));
 655         if (!vmemmap_should_optimize(h, head))
 656                 return ret;
 657
 658         static_branch_inc(&hugetlb_optimize_vmemmap_key);
 659         /*
 660          * Very Subtle
 661          * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
 662          * immediately after remapping.  As a result, subsequent accesses
 663          * and modifications to struct pages associated with the hugetlb
 664          * page could be to the OLD struct pages.  Set the vmemmap optimized
 665          * flag here so that it is copied to the new head page.  This keeps
 666          * the old and new struct pages in sync.
 667          * If there is an error during optimization, we will immediately FLUSH
 668          * the TLB and clear the flag below.
 669          */
 670         folio_set_hugetlb_vmemmap_optimized(folio);
 671
 672         vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
 673         vmemmap_reuse   = vmemmap_start;
 674         vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
 675
 676         /*
 677          * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
 678          * to the page which @vmemmap_reuse is mapped to.  Add pages previously
 679          * mapping the range to vmemmap_pages list so that they can be freed by
 680          * the caller.
 681          */
 682         ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
 683                                                         vmemmap_pages, flags);
 684         if (ret) {
 685                 static_branch_dec(&hugetlb_optimize_vmemmap_key);
 686                 folio_clear_hugetlb_vmemmap_optimized(folio);
 687         }
 688
 689         return ret;
 690 }
 691
 692 /**
 693  * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 694  * @h:          struct hstate.
 695  * @folio:     the folio whose vmemmap pages will be optimized.
 696  *
 697  * This function only tries to optimize @folio's vmemmap pages and does not
 698  * guarantee that the optimization will succeed after it returns. The caller
 699  * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 700  * vmemmap pages have been optimized.
 701  */
 702 void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
 703 {
 704         LIST_HEAD(vmemmap_pages);
 705
 706         __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
 707         free_vmemmap_page_list(&vmemmap_pages);
 708 }
 709
 710 static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head)
 711 {
 712         unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
 713         unsigned long vmemmap_reuse;
 714
 715         if (!vmemmap_should_optimize(h, head))
 716                 return 0;
 717
 718         vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
 719         vmemmap_reuse   = vmemmap_start;
 720         vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
 721
 722         /*
 723          * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
 724          * @vmemmap_end]
 725          */
 726         return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
 727 }
 728
 729 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
 730 {
 731         struct folio *folio;
 732         LIST_HEAD(vmemmap_pages);
 733
 734         list_for_each_entry(folio, folio_list, lru) {
 735                 int ret = hugetlb_vmemmap_split(h, &folio->page);
 736
 737                 /*
 738                  * Spliting the PMD requires allocating a page, thus lets fail
 739                  * early once we encounter the first OOM. No point in retrying
 740                  * as it can be dynamically done on remap with the memory
 741                  * we get back from the vmemmap deduplication.
 742                  */
 743                 if (ret == -ENOMEM)
 744                         break;
 745         }
 746
 747         flush_tlb_all();
 748
 749         list_for_each_entry(folio, folio_list, lru) {
 750                 int ret = __hugetlb_vmemmap_optimize_folio(h, folio,
 751                                                 &vmemmap_pages,
 752                                                 VMEMMAP_REMAP_NO_TLB_FLUSH);
 753
 754                 /*
 755                  * Pages to be freed may have been accumulated.  If we
 756                  * encounter an ENOMEM,  free what we have and try again.
 757                  * This can occur in the case that both spliting fails
 758                  * halfway and head page allocation also failed. In this
 759                  * case __hugetlb_vmemmap_optimize_folio() would free memory
 760                  * allowing more vmemmap remaps to occur.
 761                  */
 762                 if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
 763                         flush_tlb_all();
 764                         free_vmemmap_page_list(&vmemmap_pages);
 765                         INIT_LIST_HEAD(&vmemmap_pages);
 766                         __hugetlb_vmemmap_optimize_folio(h, folio,
 767                                                 &vmemmap_pages,
 768                                                 VMEMMAP_REMAP_NO_TLB_FLUSH);
 769                 }
 770         }
 771
 772         flush_tlb_all();
 773         free_vmemmap_page_list(&vmemmap_pages);
 774 }
 775
 776 static struct ctl_table hugetlb_vmemmap_sysctls[] = {
 777         {
 778                 .procname       = "hugetlb_optimize_vmemmap",
 779                 .data           = &vmemmap_optimize_enabled,
 780                 .maxlen         = sizeof(vmemmap_optimize_enabled),
 781                 .mode           = 0644,
 782                 .proc_handler   = proc_dobool,
 783         },
 784         { }
 785 };
 786
 787 static int __init hugetlb_vmemmap_init(void)
 788 {
 789         const struct hstate *h;
 790
 791         /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
 792         BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
 793
 794         for_each_hstate(h) {
 795                 if (hugetlb_vmemmap_optimizable(h)) {
 796                         register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
 797                         break;
 798                 }
 799         }
 800         return 0;
 801 }
 802 late_initcall(hugetlb_vmemmap_init);