// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked so far.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			(when remapping) or that are mapped in (when restoring).
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long		flags;
};
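
/*
 * Split the huge PMD mapping the vmemmap page @head: allocate a PTE page,
 * fill it with PTEs covering the same PMD_SIZE range and install it in place
 * of the PMD under init_mm.page_table_lock. A buddy-allocated head page is
 * split into individual small pages so that they can be freed one by one
 * later on.
 */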
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from buddy allocator must be able to
		 * be treated as independent small pages (as they can be freed
		 * individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}
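
/*
 * pmd_entry callback of the vmemmap page walk: refuse self-hosted vmemmap
 * (memmap_on_memory) and, when the PMD is a leaf, split it into PTEs so the
 * pte_entry callback can remap the individual vmemmap pages.
 */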
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
	 * page associated with the first vmemmap page is self-hosted is
	 * sufficient.
	 *
	 * [                  hotplugged memory                  ]
	 * [        section        ][...][        section        ]
	 * [ vmemmap ][              usable memory               ]
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}
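
/*
 * pte_entry callback of the vmemmap page walk: the first PTE visited maps
 * the reuse page; every subsequent PTE is handed to ->remap_pte().
 */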
static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in the page table walking, before
	 * the remapping starts.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};
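
/*
 * Walk the vmemmap range [@start, @end) with vmemmap_remap_ops under the
 * init_mm mmap read lock and flush the kernel TLB afterwards, unless the
 * caller suppressed the flush via VMEMMAP_REMAP_NO_TLB_FLUSH.
 */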
static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; just free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page))
		free_bootmem_page(page);
	else
		__free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}
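
/*
 * remap_pte callback used when freeing vmemmap: point the PTE at the shared
 * @reuse_page and queue the page it used to map on @vmemmap_pages so that
 * the caller can free it.
 */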
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid value will
 * be checked in free_tail_page_prepare(). To avoid the message of
 * "corrupted mapping in tail page", we need to reset at least 3 (one head
 * struct page struct and two tail struct page structs) struct page structs.
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}
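
/*
 * remap_pte callback used when restoring vmemmap: take a fresh page from
 * @vmemmap_pages, copy the shared reuse page into it, reset the first
 * NR_RESET_STRUCT_PAGE struct pages and point the PTE at the new page.
 */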
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
 *                       backing PMDs of the directmap into PTEs
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is the
 *		caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This will allow the likely
	 * contiguous struct page backing memory to be kept contiguous,
	 * allowing for more allocations of hugepages. Fall back to the
	 * currently mapped head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
	}

	/*
	 * To make the remapping routine most efficient for huge pages, the
	 * vmemmap page table walking has the following rules (see more
	 * details in vmemmap_pte_entry()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These are pages
		 * which were removed from the vmemmap. They will be restored
		 * in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       to the pages which are taken from @vmemmap_pages
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
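
/*
 * Both the restore and optimize paths below use the same address arithmetic:
 * @vmemmap_reuse is the address of the first vmemmap page of the folio (it
 * stays mapped), and [@vmemmap_start, @vmemmap_end) is the rest of the
 * folio's vmemmap, starting HUGETLB_VMEMMAP_RESERVE_SIZE past the reuse
 * page, which is what actually gets remapped. For illustration (assuming
 * 4 KiB base pages and a 64-byte struct page): a 2 MiB HugeTLB page is
 * described by 512 struct pages, i.e. 8 vmemmap pages; one is kept as the
 * reuse page and the remaining 7 are remapped to it and freed. See
 * Documentation/mm/vmemmap_dedup.rst for the full picture.
 */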
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			struct hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio,
							      VMEMMAP_REMAP_NO_TLB_FLUSH);
			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

/* Return true iff the vmemmap of a HugeTLB folio should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;
	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;
	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}
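
/*
 * Remap @folio's vmemmap to its reuse page and collect the now-unused
 * vmemmap pages on @vmemmap_pages; the caller is responsible for freeing
 * them, which allows the freeing to be batched across folios.
 */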
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);
	/*
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to the vmemmap_pages list so that they can be freed
	 * by the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
	free_vmemmap_page_list(&vmemmap_pages);
}

static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range
	 * [@vmemmap_start, @vmemmap_end).
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}
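
/*
 * Optimize vmemmap for every folio on @folio_list: first split all vmemmap
 * PMDs, then remap each folio's vmemmap, with the TLB flushes batched so
 * that the whole list only needs a few global flushes.
 */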
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, thus let's
		 * fail early once we encounter the first OOM. There is no
		 * point in retrying, as it can be dynamically done on remap
		 * with the memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
						       VMEMMAP_REMAP_NO_TLB_FLUSH);

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur in the case that both splitting fails
		 * halfway and head page allocation also failed. In this
		 * case __hugetlb_vmemmap_optimize_folio() would free memory
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
							 VMEMMAP_REMAP_NO_TLB_FLUSH);
		}
	}

	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
	{ }
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}

	return 0;
}
late_initcall(hugetlb_vmemmap_init);