mm/vmalloc.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  *  linux/mm/vmalloc.c
   4  *
   5  *  Copyright (C) 1993  Linus Torvalds
   6  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   7  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
   8  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
   9  *  Numa awareness, Christoph Lameter, SGI, June 2005
  10  */
  11
  12 #include <linux/vmalloc.h>
  13 #include <linux/mm.h>
  14 #include <linux/module.h>
  15 #include <linux/highmem.h>
  16 #include <linux/sched/signal.h>
  17 #include <linux/slab.h>
  18 #include <linux/spinlock.h>
  19 #include <linux/interrupt.h>
  20 #include <linux/proc_fs.h>
  21 #include <linux/seq_file.h>
  22 #include <linux/set_memory.h>
  23 #include <linux/debugobjects.h>
  24 #include <linux/kallsyms.h>
  25 #include <linux/list.h>
  26 #include <linux/notifier.h>
  27 #include <linux/rbtree.h>
  28 #include <linux/radix-tree.h>
  29 #include <linux/rcupdate.h>
  30 #include <linux/pfn.h>
  31 #include <linux/kmemleak.h>
  32 #include <linux/atomic.h>
  33 #include <linux/compiler.h>
  34 #include <linux/llist.h>
  35 #include <linux/bitops.h>
  36 #include <linux/rbtree_augmented.h>
  37 #include <linux/overflow.h>
  38
  39 #include <linux/uaccess.h>
  40 #include <asm/tlbflush.h>
  41 #include <asm/shmparam.h>
  42
  43 #include "internal.h"
  44 #include "pgalloc-track.h"
  45
  46 bool is_vmalloc_addr(const void *x)
  47 {
  48         unsigned long addr = (unsigned long)x;
  49
  50         return addr >= VMALLOC_START && addr < VMALLOC_END;
  51 }
  52 EXPORT_SYMBOL(is_vmalloc_addr);
  53
  54 struct vfree_deferred {
  55         struct llist_head list;
  56         struct work_struct wq;
  57 };
  58 static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
  59
  60 static void __vunmap(const void *, int);
  61
  62 static void free_work(struct work_struct *w)
  63 {
  64         struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
  65         struct llist_node *t, *llnode;
  66
  67         llist_for_each_safe(llnode, t, llist_del_all(&p->list))
  68                 __vunmap((void *)llnode, 1);
  69 }
  70
  71 /*** Page table manipulation functions ***/
  72
  73 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
  74                              pgtbl_mod_mask *mask)
  75 {
  76         pte_t *pte;
  77
  78         pte = pte_offset_kernel(pmd, addr);
  79         do {
  80                 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
  81                 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
  82         } while (pte++, addr += PAGE_SIZE, addr != end);
  83         *mask |= PGTBL_PTE_MODIFIED;
  84 }
  85
  86 static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  87                              pgtbl_mod_mask *mask)
  88 {
  89         pmd_t *pmd;
  90         unsigned long next;
  91         int cleared;
  92
  93         pmd = pmd_offset(pud, addr);
  94         do {
  95                 next = pmd_addr_end(addr, end);
  96
  97                 cleared = pmd_clear_huge(pmd);
  98                 if (cleared || pmd_bad(*pmd))
  99                         *mask |= PGTBL_PMD_MODIFIED;
 100
 101                 if (cleared)
 102                         continue;
 103                 if (pmd_none_or_clear_bad(pmd))
 104                         continue;
 105                 vunmap_pte_range(pmd, addr, next, mask);
 106         } while (pmd++, addr = next, addr != end);
 107 }
 108
 109 static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 110                              pgtbl_mod_mask *mask)
 111 {
 112         pud_t *pud;
 113         unsigned long next;
 114         int cleared;
 115
 116         pud = pud_offset(p4d, addr);
 117         do {
 118                 next = pud_addr_end(addr, end);
 119
 120                 cleared = pud_clear_huge(pud);
 121                 if (cleared || pud_bad(*pud))
 122                         *mask |= PGTBL_PUD_MODIFIED;
 123
 124                 if (cleared)
 125                         continue;
 126                 if (pud_none_or_clear_bad(pud))
 127                         continue;
 128                 vunmap_pmd_range(pud, addr, next, mask);
 129         } while (pud++, addr = next, addr != end);
 130 }
 131
 132 static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 133                              pgtbl_mod_mask *mask)
 134 {
 135         p4d_t *p4d;
 136         unsigned long next;
 137         int cleared;
 138
 139         p4d = p4d_offset(pgd, addr);
 140         do {
 141                 next = p4d_addr_end(addr, end);
 142
 143                 cleared = p4d_clear_huge(p4d);
 144                 if (cleared || p4d_bad(*p4d))
 145                         *mask |= PGTBL_P4D_MODIFIED;
 146
 147                 if (cleared)
 148                         continue;
 149                 if (p4d_none_or_clear_bad(p4d))
 150                         continue;
 151                 vunmap_pud_range(p4d, addr, next, mask);
 152         } while (p4d++, addr = next, addr != end);
 153 }
 154
 155 /**
 156  * unmap_kernel_range_noflush - unmap kernel VM area
 157  * @start: start of the VM area to unmap
 158  * @size: size of the VM area to unmap
 159  *
 160  * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify
 161  * should have been allocated using get_vm_area() and its friends.
 162  *
 163  * NOTE:
 164  * This function does NOT do any cache flushing.  The caller is responsible
 165  * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
 166  * function and flush_tlb_kernel_range() after.
 167  */
 168 void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
 169 {
 170         unsigned long end = start + size;
 171         unsigned long next;
 172         pgd_t *pgd;
 173         unsigned long addr = start;
 174         pgtbl_mod_mask mask = 0;
 175
 176         BUG_ON(addr >= end);
 177         start = addr;
 178         pgd = pgd_offset_k(addr);
 179         do {
 180                 next = pgd_addr_end(addr, end);
 181                 if (pgd_bad(*pgd))
 182                         mask |= PGTBL_PGD_MODIFIED;
 183                 if (pgd_none_or_clear_bad(pgd))
 184                         continue;
 185                 vunmap_p4d_range(pgd, addr, next, &mask);
 186         } while (pgd++, addr = next, addr != end);
 187
 188         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
 189                 arch_sync_kernel_mappings(start, end);
 190 }
 191
 192 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
 193                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 194                 pgtbl_mod_mask *mask)
 195 {
 196         pte_t *pte;
 197
 198         /*
 199          * nr is a running index into the array which helps higher level
 200          * callers keep track of where we're up to.
 201          */
 202
 203         pte = pte_alloc_kernel_track(pmd, addr, mask);
 204         if (!pte)
 205                 return -ENOMEM;
 206         do {
 207                 struct page *page = pages[*nr];
 208
 209                 if (WARN_ON(!pte_none(*pte)))
 210                         return -EBUSY;
 211                 if (WARN_ON(!page))
 212                         return -ENOMEM;
 213                 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
 214                 (*nr)++;
 215         } while (pte++, addr += PAGE_SIZE, addr != end);
 216         *mask |= PGTBL_PTE_MODIFIED;
 217         return 0;
 218 }
 219
 220 static int vmap_pmd_range(pud_t *pud, unsigned long addr,
 221                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 222                 pgtbl_mod_mask *mask)
 223 {
 224         pmd_t *pmd;
 225         unsigned long next;
 226
 227         pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
 228         if (!pmd)
 229                 return -ENOMEM;
 230         do {
 231                 next = pmd_addr_end(addr, end);
 232                 if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
 233                         return -ENOMEM;
 234         } while (pmd++, addr = next, addr != end);
 235         return 0;
 236 }
 237
 238 static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
 239                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 240                 pgtbl_mod_mask *mask)
 241 {
 242         pud_t *pud;
 243         unsigned long next;
 244
 245         pud = pud_alloc_track(&init_mm, p4d, addr, mask);
 246         if (!pud)
 247                 return -ENOMEM;
 248         do {
 249                 next = pud_addr_end(addr, end);
 250                 if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
 251                         return -ENOMEM;
 252         } while (pud++, addr = next, addr != end);
 253         return 0;
 254 }
 255
 256 static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
 257                 unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 258                 pgtbl_mod_mask *mask)
 259 {
 260         p4d_t *p4d;
 261         unsigned long next;
 262
 263         p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
 264         if (!p4d)
 265                 return -ENOMEM;
 266         do {
 267                 next = p4d_addr_end(addr, end);
 268                 if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
 269                         return -ENOMEM;
 270         } while (p4d++, addr = next, addr != end);
 271         return 0;
 272 }
 273
 274 /**
 275  * map_kernel_range_noflush - map kernel VM area with the specified pages
 276  * @addr: start of the VM area to map
 277  * @size: size of the VM area to map
 278  * @prot: page protection flags to use
 279  * @pages: pages to map
 280  *
 281  * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size specify should
 282  * have been allocated using get_vm_area() and its friends.
 283  *
 284  * NOTE:
 285  * This function does NOT do any cache flushing.  The caller is responsible for
 286  * calling flush_cache_vmap() on to-be-mapped areas before calling this
 287  * function.
 288  *
 289  * RETURNS:
 290  * 0 on success, -errno on failure.
 291  */
 292 int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 293                              pgprot_t prot, struct page **pages)
 294 {
 295         unsigned long start = addr;
 296         unsigned long end = addr + size;
 297         unsigned long next;
 298         pgd_t *pgd;
 299         int err = 0;
 300         int nr = 0;
 301         pgtbl_mod_mask mask = 0;
 302
 303         BUG_ON(addr >= end);
 304         pgd = pgd_offset_k(addr);
 305         do {
 306                 next = pgd_addr_end(addr, end);
 307                 if (pgd_bad(*pgd))
 308                         mask |= PGTBL_PGD_MODIFIED;
 309                 err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
 310                 if (err)
 311                         return err;
 312         } while (pgd++, addr = next, addr != end);
 313
 314         if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
 315                 arch_sync_kernel_mappings(start, end);
 316
 317         return 0;
 318 }
 319
 320 int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
 321                 struct page **pages)
 322 {
 323         int ret;
 324
 325         ret = map_kernel_range_noflush(start, size, prot, pages);
 326         flush_cache_vmap(start, start + size);
 327         return ret;
 328 }
 329
 330 int is_vmalloc_or_module_addr(const void *x)
 331 {
 332         /*
 333          * ARM, x86-64 and sparc64 put modules in a special place,
 334          * and fall back on vmalloc() if that fails. Others
 335          * just put it in the vmalloc space.
 336          */
 337 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
 338         unsigned long addr = (unsigned long)x;
 339         if (addr >= MODULES_VADDR && addr < MODULES_END)
 340                 return 1;
 341 #endif
 342         return is_vmalloc_addr(x);
 343 }
 344
 345 /*
 346  * Walk a vmap address to the struct page it maps.
 347  */
 348 struct page *vmalloc_to_page(const void *vmalloc_addr)
 349 {
 350         unsigned long addr = (unsigned long) vmalloc_addr;
 351         struct page *page = NULL;
 352         pgd_t *pgd = pgd_offset_k(addr);
 353         p4d_t *p4d;
 354         pud_t *pud;
 355         pmd_t *pmd;
 356         pte_t *ptep, pte;
 357
 358         /*
 359          * XXX we might need to change this if we add VIRTUAL_BUG_ON for
 360          * architectures that do not vmalloc module space
 361          */
 362         VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
 363
 364         if (pgd_none(*pgd))
 365                 return NULL;
 366         p4d = p4d_offset(pgd, addr);
 367         if (p4d_none(*p4d))
 368                 return NULL;
 369         pud = pud_offset(p4d, addr);
 370
 371         /*
 372          * Don't dereference bad PUD or PMD (below) entries. This will also
 373          * identify huge mappings, which we may encounter on architectures
 374          * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
 375          * identified as vmalloc addresses by is_vmalloc_addr(), but are
 376          * not [unambiguously] associated with a struct page, so there is
 377          * no correct value to return for them.
 378          */
 379         WARN_ON_ONCE(pud_bad(*pud));
 380         if (pud_none(*pud) || pud_bad(*pud))
 381                 return NULL;
 382         pmd = pmd_offset(pud, addr);
 383         WARN_ON_ONCE(pmd_bad(*pmd));
 384         if (pmd_none(*pmd) || pmd_bad(*pmd))
 385                 return NULL;
 386
 387         ptep = pte_offset_map(pmd, addr);
 388         pte = *ptep;
 389         if (pte_present(pte))
 390                 page = pte_page(pte);
 391         pte_unmap(ptep);
 392         return page;
 393 }
 394 EXPORT_SYMBOL(vmalloc_to_page);
 395
 396 /*
 397  * Map a vmalloc()-space virtual address to the physical page frame number.
 398  */
 399 unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
 400 {
 401         return page_to_pfn(vmalloc_to_page(vmalloc_addr));
 402 }
 403 EXPORT_SYMBOL(vmalloc_to_pfn);
 404
 405
 406 /*** Global kva allocator ***/
 407
 408 #define DEBUG_AUGMENT_PROPAGATE_CHECK 0
 409 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
 410
 411
 412 static DEFINE_SPINLOCK(vmap_area_lock);
 413 static DEFINE_SPINLOCK(free_vmap_area_lock);
 414 /* Export for kexec only */
 415 LIST_HEAD(vmap_area_list);
 416 static LLIST_HEAD(vmap_purge_list);
 417 static struct rb_root vmap_area_root = RB_ROOT;
 418 static bool vmap_initialized __read_mostly;
 419
 420 /*
 421  * This kmem_cache is used for vmap_area objects. Instead of
 422  * allocating from slab we reuse an object from this cache to
 423  * make things faster. Especially in "no edge" splitting of
 424  * free block.
 425  */
 426 static struct kmem_cache *vmap_area_cachep;
 427
 428 /*
 429  * This linked list is used in pair with free_vmap_area_root.
 430  * It gives O(1) access to prev/next to perform fast coalescing.
 431  */
 432 static LIST_HEAD(free_vmap_area_list);
 433
 434 /*
 435  * This augment red-black tree represents the free vmap space.
 436  * All vmap_area objects in this tree are sorted by va->va_start
 437  * address. It is used for allocation and merging when a vmap
 438  * object is released.
 439  *
 440  * Each vmap_area node contains a maximum available free block
 441  * of its sub-tree, right or left. Therefore it is possible to
 442  * find a lowest match of free area.
 443  */
 444 static struct rb_root free_vmap_area_root = RB_ROOT;
 445
 446 /*
 447  * Preload a CPU with one object for "no edge" split case. The
 448  * aim is to get rid of allocations from the atomic context, thus
 449  * to use more permissive allocation masks.
 450  */
 451 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
 452
 453 static __always_inline unsigned long
 454 va_size(struct vmap_area *va)
 455 {
 456         return (va->va_end - va->va_start);
 457 }
 458
 459 static __always_inline unsigned long
 460 get_subtree_max_size(struct rb_node *node)
 461 {
 462         struct vmap_area *va;
 463
 464         va = rb_entry_safe(node, struct vmap_area, rb_node);
 465         return va ? va->subtree_max_size : 0;
 466 }
 467
 468 /*
 469  * Gets called when remove the node and rotate.
 470  */
 471 static __always_inline unsigned long
 472 compute_subtree_max_size(struct vmap_area *va)
 473 {
 474         return max3(va_size(va),
 475                 get_subtree_max_size(va->rb_node.rb_left),
 476                 get_subtree_max_size(va->rb_node.rb_right));
 477 }
 478
 479 RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
 480         struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
 481
 482 static void purge_vmap_area_lazy(void);
 483 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
 484 static unsigned long lazy_max_pages(void);
 485
 486 static atomic_long_t nr_vmalloc_pages;
 487
 488 unsigned long vmalloc_nr_pages(void)
 489 {
 490         return atomic_long_read(&nr_vmalloc_pages);
 491 }
 492
 493 static struct vmap_area *__find_vmap_area(unsigned long addr)
 494 {
 495         struct rb_node *n = vmap_area_root.rb_node;
 496
 497         while (n) {
 498                 struct vmap_area *va;
 499
 500                 va = rb_entry(n, struct vmap_area, rb_node);
 501                 if (addr < va->va_start)
 502                         n = n->rb_left;
 503                 else if (addr >= va->va_end)
 504                         n = n->rb_right;
 505                 else
 506                         return va;
 507         }
 508
 509         return NULL;
 510 }
 511
 512 /*
 513  * This function returns back addresses of parent node
 514  * and its left or right link for further processing.
 515  */
 516 static __always_inline struct rb_node **
 517 find_va_links(struct vmap_area *va,
 518         struct rb_root *root, struct rb_node *from,
 519         struct rb_node **parent)
 520 {
 521         struct vmap_area *tmp_va;
 522         struct rb_node **link;
 523
 524         if (root) {
 525                 link = &root->rb_node;
 526                 if (unlikely(!*link)) {
 527                         *parent = NULL;
 528                         return link;
 529                 }
 530         } else {
 531                 link = &from;
 532         }
 533
 534         /*
 535          * Go to the bottom of the tree. When we hit the last point
 536          * we end up with parent rb_node and correct direction, i name
 537          * it link, where the new va->rb_node will be attached to.
 538          */
 539         do {
 540                 tmp_va = rb_entry(*link, struct vmap_area, rb_node);
 541
 542                 /*
 543                  * During the traversal we also do some sanity check.
 544                  * Trigger the BUG() if there are sides(left/right)
 545                  * or full overlaps.
 546                  */
 547                 if (va->va_start < tmp_va->va_end &&
 548                                 va->va_end <= tmp_va->va_start)
 549                         link = &(*link)->rb_left;
 550                 else if (va->va_end > tmp_va->va_start &&
 551                                 va->va_start >= tmp_va->va_end)
 552                         link = &(*link)->rb_right;
 553                 else
 554                         BUG();
 555         } while (*link);
 556
 557         *parent = &tmp_va->rb_node;
 558         return link;
 559 }
 560
 561 static __always_inline struct list_head *
 562 get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
 563 {
 564         struct list_head *list;
 565
 566         if (unlikely(!parent))
 567                 /*
 568                  * The red-black tree where we try to find VA neighbors
 569                  * before merging or inserting is empty, i.e. it means
 570                  * there is no free vmap space. Normally it does not
 571                  * happen but we handle this case anyway.
 572                  */
 573                 return NULL;
 574
 575         list = &rb_entry(parent, struct vmap_area, rb_node)->list;
 576         return (&parent->rb_right == link ? list->next : list);
 577 }
 578
 579 static __always_inline void
 580 link_va(struct vmap_area *va, struct rb_root *root,
 581         struct rb_node *parent, struct rb_node **link, struct list_head *head)
 582 {
 583         /*
 584          * VA is still not in the list, but we can
 585          * identify its future previous list_head node.
 586          */
 587         if (likely(parent)) {
 588                 head = &rb_entry(parent, struct vmap_area, rb_node)->list;
 589                 if (&parent->rb_right != link)
 590                         head = head->prev;
 591         }
 592
 593         /* Insert to the rb-tree */
 594         rb_link_node(&va->rb_node, parent, link);
 595         if (root == &free_vmap_area_root) {
 596                 /*
 597                  * Some explanation here. Just perform simple insertion
 598                  * to the tree. We do not set va->subtree_max_size to
 599                  * its current size before calling rb_insert_augmented().
 600                  * It is because of we populate the tree from the bottom
 601                  * to parent levels when the node _is_ in the tree.
 602                  *
 603                  * Therefore we set subtree_max_size to zero after insertion,
 604                  * to let __augment_tree_propagate_from() puts everything to
 605                  * the correct order later on.
 606                  */
 607                 rb_insert_augmented(&va->rb_node,
 608                         root, &free_vmap_area_rb_augment_cb);
 609                 va->subtree_max_size = 0;
 610         } else {
 611                 rb_insert_color(&va->rb_node, root);
 612         }
 613
 614         /* Address-sort this list */
 615         list_add(&va->list, head);
 616 }
 617
 618 static __always_inline void
 619 unlink_va(struct vmap_area *va, struct rb_root *root)
 620 {
 621         if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
 622                 return;
 623
 624         if (root == &free_vmap_area_root)
 625                 rb_erase_augmented(&va->rb_node,
 626                         root, &free_vmap_area_rb_augment_cb);
 627         else
 628                 rb_erase(&va->rb_node, root);
 629
 630         list_del(&va->list);
 631         RB_CLEAR_NODE(&va->rb_node);
 632 }
 633
 634 #if DEBUG_AUGMENT_PROPAGATE_CHECK
 635 static void
 636 augment_tree_propagate_check(struct rb_node *n)
 637 {
 638         struct vmap_area *va;
 639         struct rb_node *node;
 640         unsigned long size;
 641         bool found = false;
 642
 643         if (n == NULL)
 644                 return;
 645
 646         va = rb_entry(n, struct vmap_area, rb_node);
 647         size = va->subtree_max_size;
 648         node = n;
 649
 650         while (node) {
 651                 va = rb_entry(node, struct vmap_area, rb_node);
 652
 653                 if (get_subtree_max_size(node->rb_left) == size) {
 654                         node = node->rb_left;
 655                 } else {
 656                         if (va_size(va) == size) {
 657                                 found = true;
 658                                 break;
 659                         }
 660
 661                         node = node->rb_right;
 662                 }
 663         }
 664
 665         if (!found) {
 666                 va = rb_entry(n, struct vmap_area, rb_node);
 667                 pr_emerg("tree is corrupted: %lu, %lu\n",
 668                         va_size(va), va->subtree_max_size);
 669         }
 670
 671         augment_tree_propagate_check(n->rb_left);
 672         augment_tree_propagate_check(n->rb_right);
 673 }
 674 #endif
 675
 676 /*
 677  * This function populates subtree_max_size from bottom to upper
 678  * levels starting from VA point. The propagation must be done
 679  * when VA size is modified by changing its va_start/va_end. Or
 680  * in case of newly inserting of VA to the tree.
 681  *
 682  * It means that __augment_tree_propagate_from() must be called:
 683  * - After VA has been inserted to the tree(free path);
 684  * - After VA has been shrunk(allocation path);
 685  * - After VA has been increased(merging path).
 686  *
 687  * Please note that, it does not mean that upper parent nodes
 688  * and their subtree_max_size are recalculated all the time up
 689  * to the root node.
 690  *
 691  *       4--8
 692  *        /\
 693  *       /  \
 694  *      /    \
 695  *    2--2  8--8
 696  *
 697  * For example if we modify the node 4, shrinking it to 2, then
 698  * no any modification is required. If we shrink the node 2 to 1
 699  * its subtree_max_size is updated only, and set to 1. If we shrink
 700  * the node 8 to 6, then its subtree_max_size is set to 6 and parent
 701  * node becomes 4--6.
 702  */
 703 static __always_inline void
 704 augment_tree_propagate_from(struct vmap_area *va)
 705 {
 706         struct rb_node *node = &va->rb_node;
 707         unsigned long new_va_sub_max_size;
 708
 709         while (node) {
 710                 va = rb_entry(node, struct vmap_area, rb_node);
 711                 new_va_sub_max_size = compute_subtree_max_size(va);
 712
 713                 /*
 714                  * If the newly calculated maximum available size of the
 715                  * subtree is equal to the current one, then it means that
 716                  * the tree is propagated correctly. So we have to stop at
 717                  * this point to save cycles.
 718                  */
 719                 if (va->subtree_max_size == new_va_sub_max_size)
 720                         break;
 721
 722                 va->subtree_max_size = new_va_sub_max_size;
 723                 node = rb_parent(&va->rb_node);
 724         }
 725
 726 #if DEBUG_AUGMENT_PROPAGATE_CHECK
 727         augment_tree_propagate_check(free_vmap_area_root.rb_node);
 728 #endif
 729 }
 730
 731 static void
 732 insert_vmap_area(struct vmap_area *va,
 733         struct rb_root *root, struct list_head *head)
 734 {
 735         struct rb_node **link;
 736         struct rb_node *parent;
 737
 738         link = find_va_links(va, root, NULL, &parent);
 739         link_va(va, root, parent, link, head);
 740 }
 741
 742 static void
 743 insert_vmap_area_augment(struct vmap_area *va,
 744         struct rb_node *from, struct rb_root *root,
 745         struct list_head *head)
 746 {
 747         struct rb_node **link;
 748         struct rb_node *parent;
 749
 750         if (from)
 751                 link = find_va_links(va, NULL, from, &parent);
 752         else
 753                 link = find_va_links(va, root, NULL, &parent);
 754
 755         link_va(va, root, parent, link, head);
 756         augment_tree_propagate_from(va);
 757 }
 758
 759 /*
 760  * Merge de-allocated chunk of VA memory with previous
 761  * and next free blocks. If coalesce is not done a new
 762  * free area is inserted. If VA has been merged, it is
 763  * freed.
 764  */
 765 static __always_inline struct vmap_area *
 766 merge_or_add_vmap_area(struct vmap_area *va,
 767         struct rb_root *root, struct list_head *head)
 768 {
 769         struct vmap_area *sibling;
 770         struct list_head *next;
 771         struct rb_node **link;
 772         struct rb_node *parent;
 773         bool merged = false;
 774
 775         /*
 776          * Find a place in the tree where VA potentially will be
 777          * inserted, unless it is merged with its sibling/siblings.
 778          */
 779         link = find_va_links(va, root, NULL, &parent);
 780
 781         /*
 782          * Get next node of VA to check if merging can be done.
 783          */
 784         next = get_va_next_sibling(parent, link);
 785         if (unlikely(next == NULL))
 786                 goto insert;
 787
 788         /*
 789          * start            end
 790          * |                |
 791          * |<------VA------>|<-----Next----->|
 792          *                  |                |
 793          *                  start            end
 794          */
 795         if (next != head) {
 796                 sibling = list_entry(next, struct vmap_area, list);
 797                 if (sibling->va_start == va->va_end) {
 798                         sibling->va_start = va->va_start;
 799
 800                         /* Check and update the tree if needed. */
 801                         augment_tree_propagate_from(sibling);
 802
 803                         /* Free vmap_area object. */
 804                         kmem_cache_free(vmap_area_cachep, va);
 805
 806                         /* Point to the new merged area. */
 807                         va = sibling;
 808                         merged = true;
 809                 }
 810         }
 811
 812         /*
 813          * start            end
 814          * |                |
 815          * |<-----Prev----->|<------VA------>|
 816          *                  |                |
 817          *                  start            end
 818          */
 819         if (next->prev != head) {
 820                 sibling = list_entry(next->prev, struct vmap_area, list);
 821                 if (sibling->va_end == va->va_start) {
 822                         sibling->va_end = va->va_end;
 823
 824                         /* Check and update the tree if needed. */
 825                         augment_tree_propagate_from(sibling);
 826
 827                         if (merged)
 828                                 unlink_va(va, root);
 829
 830                         /* Free vmap_area object. */
 831                         kmem_cache_free(vmap_area_cachep, va);
 832
 833                         /* Point to the new merged area. */
 834                         va = sibling;
 835                         merged = true;
 836                 }
 837         }
 838
 839 insert:
 840         if (!merged) {
 841                 link_va(va, root, parent, link, head);
 842                 augment_tree_propagate_from(va);
 843         }
 844
 845         return va;
 846 }
 847
 848 static __always_inline bool
 849 is_within_this_va(struct vmap_area *va, unsigned long size,
 850         unsigned long align, unsigned long vstart)
 851 {
 852         unsigned long nva_start_addr;
 853
 854         if (va->va_start > vstart)
 855                 nva_start_addr = ALIGN(va->va_start, align);
 856         else
 857                 nva_start_addr = ALIGN(vstart, align);
 858
 859         /* Can be overflowed due to big size or alignment. */
 860         if (nva_start_addr + size < nva_start_addr ||
 861                         nva_start_addr < vstart)
 862                 return false;
 863
 864         return (nva_start_addr + size <= va->va_end);
 865 }
 866
/*
 * Find the first free block (lowest start address) in the free tree
 * that can satisfy a request of "size" bytes, aligned to "align" and
 * placed at or above "vstart".
 *
 * The walk is steered by get_subtree_max_size(): a sub-tree is entered
 * only when it reports a free block at least "size + align - 1" long.
 *
 * Returns the matching vmap_area, or NULL when no block fits.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
        unsigned long align, unsigned long vstart)
{
        struct vmap_area *va;
        struct rb_node *node;
        unsigned long length;

        /* Start from the root. */
        node = free_vmap_area_root.rb_node;

        /* Adjust the search size for alignment overhead. */
        length = size + align - 1;

        while (node) {
                va = rb_entry(node, struct vmap_area, rb_node);

                /*
                 * Lower addresses live on the left. Go there first as long
                 * as the left sub-tree can hold the request and this node
                 * still starts above "vstart".
                 */
                if (get_subtree_max_size(node->rb_left) >= length &&
                                vstart < va->va_start) {
                        node = node->rb_left;
                } else {
                        /* Nothing better on the left: try this node itself. */
                        if (is_within_this_va(va, size, align, vstart))
                                return va;

                        /*
                         * Does not make sense to go deeper towards the right
                         * sub-tree if it does not have a free block that is
                         * equal or bigger to the requested search length.
                         */
                        if (get_subtree_max_size(node->rb_right) >= length) {
                                node = node->rb_right;
                                continue;
                        }

                        /*
                         * OK. We roll back and find the first right sub-tree,
                         * that will satisfy the search criteria. It can happen
                         * only once due to "vstart" restriction.
                         *
                         * If the climb runs out of parents, "node" becomes
                         * NULL, which also ends the outer loop.
                         */
                        while ((node = rb_parent(node))) {
                                va = rb_entry(node, struct vmap_area, rb_node);
                                if (is_within_this_va(va, size, align, vstart))
                                        return va;

                                if (get_subtree_max_size(node->rb_right) >= length &&
                                                vstart <= va->va_start) {
                                        node = node->rb_right;
                                        break;
                                }
                        }
                }
        }

        return NULL;
}
 927
 928 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
 929 #include <linux/random.h>
 930
 931 static struct vmap_area *
 932 find_vmap_lowest_linear_match(unsigned long size,
 933         unsigned long align, unsigned long vstart)
 934 {
 935         struct vmap_area *va;
 936
 937         list_for_each_entry(va, &free_vmap_area_list, list) {
 938                 if (!is_within_this_va(va, size, align, vstart))
 939                         continue;
 940
 941                 return va;
 942         }
 943
 944         return NULL;
 945 }
 946
 947 static void
 948 find_vmap_lowest_match_check(unsigned long size)
 949 {
 950         struct vmap_area *va_1, *va_2;
 951         unsigned long vstart;
 952         unsigned int rnd;
 953
 954         get_random_bytes(&rnd, sizeof(rnd));
 955         vstart = VMALLOC_START + rnd;
 956
 957         va_1 = find_vmap_lowest_match(size, 1, vstart);
 958         va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
 959
 960         if (va_1 != va_2)
 961                 pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
 962                         va_1, va_2, vstart);
 963 }
 964 #endif
 965
/*
 * How a requested range [start, start + size) sits inside a free area,
 * as decided by classify_va_fit_type(). NOTHING_FIT means the range is
 * not contained in the area at all; the other values say which edges of
 * the area the range touches.
 */
enum fit_type {
        NOTHING_FIT = 0,
        FL_FIT_TYPE = 1,        /* full fit */
        LE_FIT_TYPE = 2,        /* left edge fit */
        RE_FIT_TYPE = 3,        /* right edge fit */
        NE_FIT_TYPE = 4         /* no edge fit */
};
 973
 974 static __always_inline enum fit_type
 975 classify_va_fit_type(struct vmap_area *va,
 976         unsigned long nva_start_addr, unsigned long size)
 977 {
 978         enum fit_type type;
 979
 980         /* Check if it is within VA. */
 981         if (nva_start_addr < va->va_start ||
 982                         nva_start_addr + size > va->va_end)
 983                 return NOTHING_FIT;
 984
 985         /* Now classify. */
 986         if (va->va_start == nva_start_addr) {
 987                 if (va->va_end == nva_start_addr + size)
 988                         type = FL_FIT_TYPE;
 989                 else
 990                         type = LE_FIT_TYPE;
 991         } else if (va->va_end == nva_start_addr + size) {
 992                 type = RE_FIT_TYPE;
 993         } else {
 994                 type = NE_FIT_TYPE;
 995         }
 996
 997         return type;
 998 }
 999
1000 static __always_inline int
1001 adjust_va_to_fit_type(struct vmap_area *va,
1002         unsigned long nva_start_addr, unsigned long size,
1003         enum fit_type type)
1004 {
1005         struct vmap_area *lva = NULL;
1006
1007         if (type == FL_FIT_TYPE) {
1008                 /*
1009                  * No need to split VA, it fully fits.
1010                  *
1011                  * |               |
1012                  * V      NVA      V
1013                  * |---------------|
1014                  */
1015                 unlink_va(va, &free_vmap_area_root);
1016                 kmem_cache_free(vmap_area_cachep, va);
1017         } else if (type == LE_FIT_TYPE) {
1018                 /*
1019                  * Split left edge of fit VA.
1020                  *
1021                  * |       |
1022                  * V  NVA  V   R
1023                  * |-------|-------|
1024                  */
1025                 va->va_start += size;
1026         } else if (type == RE_FIT_TYPE) {
1027                 /*
1028                  * Split right edge of fit VA.
1029                  *
1030                  *         |       |
1031                  *     L   V  NVA  V
1032                  * |-------|-------|
1033                  */
1034                 va->va_end = nva_start_addr;
1035         } else if (type == NE_FIT_TYPE) {
1036                 /*
1037                  * Split no edge of fit VA.
1038                  *
1039                  *     |       |
1040                  *   L V  NVA  V R
1041                  * |---|-------|---|
1042                  */
1043                 lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
1044                 if (unlikely(!lva)) {
1045                         /*
1046                          * For percpu allocator we do not do any pre-allocation
1047                          * and leave it as it is. The reason is it most likely
1048                          * never ends up with NE_FIT_TYPE splitting. In case of
1049                          * percpu allocations offsets and sizes are aligned to
1050                          * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
1051                          * are its main fitting cases.
1052                          *
1053                          * There are a few exceptions though, as an example it is
1054                          * a first allocation (early boot up) when we have "one"
1055                          * big free space that has to be split.
1056                          *
1057                          * Also we can hit this path in case of regular "vmap"
1058                          * allocations, if "this" current CPU was not preloaded.
1059                          * See the comment in alloc_vmap_area() why. If so, then
1060                          * GFP_NOWAIT is used instead to get an extra object for
1061                          * split purpose. That is rare and most time does not
1062                          * occur.
1063                          *
1064                          * What happens if an allocation gets failed. Basically,
1065                          * an "overflow" path is triggered to purge lazily freed
1066                          * areas to free some memory, then, the "retry" path is
1067                          * triggered to repeat one more time. See more details
1068                          * in alloc_vmap_area() function.
1069                          */
1070                         lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
1071                         if (!lva)
1072                                 return -1;
1073                 }
1074
1075                 /*
1076                  * Build the remainder.
1077                  */
1078                 lva->va_start = va->va_start;
1079                 lva->va_end = nva_start_addr;
1080
1081                 /*
1082                  * Shrink this VA to remaining size.
1083                  */
1084                 va->va_start = nva_start_addr + size;
1085         } else {
1086                 return -1;
1087         }
1088
1089         if (type != FL_FIT_TYPE) {
1090                 augment_tree_propagate_from(va);
1091
1092                 if (lva)        /* type == NE_FIT_TYPE */
1093                         insert_vmap_area_augment(lva, &va->rb_node,
1094                                 &free_vmap_area_root, &free_vmap_area_list);
1095         }
1096
1097         return 0;
1098 }
1099
1100 /*
1101  * Returns a start address of the newly allocated area, if success.
1102  * Otherwise a vend is returned that indicates failure.
1103  */
1104 static __always_inline unsigned long
1105 __alloc_vmap_area(unsigned long size, unsigned long align,
1106         unsigned long vstart, unsigned long vend)
1107 {
1108         unsigned long nva_start_addr;
1109         struct vmap_area *va;
1110         enum fit_type type;
1111         int ret;
1112
1113         va = find_vmap_lowest_match(size, align, vstart);
1114         if (unlikely(!va))
1115                 return vend;
1116
1117         if (va->va_start > vstart)
1118                 nva_start_addr = ALIGN(va->va_start, align);
1119         else
1120                 nva_start_addr = ALIGN(vstart, align);
1121
1122         /* Check the "vend" restriction. */
1123         if (nva_start_addr + size > vend)
1124                 return vend;
1125
1126         /* Classify what we have found. */
1127         type = classify_va_fit_type(va, nva_start_addr, size);
1128         if (WARN_ON_ONCE(type == NOTHING_FIT))
1129                 return vend;
1130
1131         /* Update the free vmap_area. */
1132         ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
1133         if (ret)
1134                 return vend;
1135
1136 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
1137         find_vmap_lowest_match_check(size);
1138 #endif
1139
1140         return nva_start_addr;
1141 }
1142
/*
 * Free a region of KVA allocated by alloc_vmap_area
 *
 * The area goes straight back to the free tree/list: nothing is unmapped
 * or flushed here and vmap_lazy_nr is not touched. The two locks are
 * taken one after the other, never nested.
 */
static void free_vmap_area(struct vmap_area *va)
{
        /*
         * Remove from the busy tree/list.
         */
        spin_lock(&vmap_area_lock);
        unlink_va(va, &vmap_area_root);
        spin_unlock(&vmap_area_lock);

        /*
         * Insert/Merge it back to the free tree/list.
         */
        spin_lock(&free_vmap_area_lock);
        merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
        spin_unlock(&free_vmap_area_lock);
}
1162
/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 *
 * @size must be non-zero and a multiple of the page size, @align must be a
 * power of two; violations hit BUG_ON(). May sleep.
 *
 * Returns the new vmap_area, already inserted into the busy tree/list, or:
 *   ERR_PTR(-EBUSY)  - vmap is not initialized yet, or no address space
 *                      could be found even after purging,
 *   ERR_PTR(-ENOMEM) - the vmap_area object could not be allocated,
 *   ERR_PTR(ret)     - kasan_populate_vmalloc() failed with "ret".
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
                                unsigned long align,
                                unsigned long vstart, unsigned long vend,
                                int node, gfp_t gfp_mask)
{
        struct vmap_area *va, *pva;
        unsigned long addr;
        int purged = 0;
        int ret;

        BUG_ON(!size);
        BUG_ON(offset_in_page(size));
        BUG_ON(!is_power_of_2(align));

        if (unlikely(!vmap_initialized))
                return ERR_PTR(-EBUSY);

        might_sleep();
        gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

        va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
        if (unlikely(!va))
                return ERR_PTR(-ENOMEM);

        /*
         * Only scan the relevant parts containing pointers to other objects
         * to avoid false negatives.
         */
        kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
        /*
         * Preload this CPU with one extra vmap_area object. It is used
         * when fit type of free area is NE_FIT_TYPE. Please note, it
         * does not guarantee that an allocation occurs on a CPU that
         * is preloaded, instead we minimize the case when it is not.
         * It can happen because of cpu migration, because there is a
         * race until the below spinlock is taken.
         *
         * The preload is done in non-atomic context, thus it allows us
         * to use more permissive allocation masks to be more stable under
         * low memory condition and high memory pressure. In rare case,
         * if not preloaded, GFP_NOWAIT is used.
         *
         * Set "pva" to NULL here, because of "retry" path.
         */
        pva = NULL;

        if (!this_cpu_read(ne_fit_preload_node))
                /*
                 * Even if it fails we do not really care about that.
                 * Just proceed as it is. If needed "overflow" path
                 * will refill the cache we allocate from.
                 */
                pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

        spin_lock(&free_vmap_area_lock);

        /*
         * Install the preload object only if this CPU's slot is still
         * empty. Otherwise the object just allocated is surplus and is
         * given back.
         */
        if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
                kmem_cache_free(vmap_area_cachep, pva);

        /*
         * If an allocation fails, the "vend" address is
         * returned. Therefore trigger the overflow path.
         */
        addr = __alloc_vmap_area(size, align, vstart, vend);
        spin_unlock(&free_vmap_area_lock);

        if (unlikely(addr == vend))
                goto overflow;

        va->va_start = addr;
        va->va_end = addr + size;
        va->vm = NULL;


        /* Publish the new area in the busy tree/list. */
        spin_lock(&vmap_area_lock);
        insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
        spin_unlock(&vmap_area_lock);

        BUG_ON(!IS_ALIGNED(va->va_start, align));
        BUG_ON(va->va_start < vstart);
        BUG_ON(va->va_end > vend);

        ret = kasan_populate_vmalloc(addr, size);
        if (ret) {
                /* Undo: move the range from the busy tree back to free. */
                free_vmap_area(va);
                return ERR_PTR(ret);
        }

        return va;

overflow:
        /* First failure: purge the lazily freed areas and try again. */
        if (!purged) {
                purge_vmap_area_lazy();
                purged = 1;
                goto retry;
        }

        /*
         * Still no room. If blocking is allowed, ask the registered purge
         * notifiers to release something. When they report progress, start
         * over, with the lazy purge armed again.
         */
        if (gfpflags_allow_blocking(gfp_mask)) {
                unsigned long freed = 0;
                blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
                if (freed > 0) {
                        purged = 0;
                        goto retry;
                }
        }

        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
                pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
                        size);

        kmem_cache_free(vmap_area_cachep, va);
        return ERR_PTR(-EBUSY);
}
1282
/*
 * Add @nb to vmap_notify_list, the blocking notifier chain that
 * alloc_vmap_area() calls when it cannot find address space.
 * Returns the result of blocking_notifier_chain_register().
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
1288
/*
 * Remove @nb from vmap_notify_list.
 * Returns the result of blocking_notifier_chain_unregister().
 */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
1294
1295 /*
1296  * lazy_max_pages is the maximum amount of virtual address space we gather up
1297  * before attempting to purge with a TLB flush.
1298  *
1299  * There is a tradeoff here: a larger number will cover more kernel page tables
1300  * and take slightly longer to purge, but it will linearly reduce the number of
1301  * global TLB flushes that must be performed. It would seem natural to scale
1302  * this number up linearly with the number of CPUs (because vmapping activity
1303  * could also scale linearly with the number of CPUs), however it is likely
1304  * that in practice, workloads might be constrained in other ways that mean
1305  * vmap activity will not scale linearly with CPUs. Also, I want to be
1306  * conservative and not introduce a big latency on huge systems, so go with
1307  * a less aggressive log scale. It will still be an improvement over the old
1308  * code, and it will be simple to change the scale factor if we find that it
1309  * becomes a problem on bigger systems.
1310  */
1311 static unsigned long lazy_max_pages(void)
1312 {
1313         unsigned int log;
1314
1315         log = fls(num_online_cpus());
1316
1317         return log * (32UL * 1024 * 1024 / PAGE_SIZE);
1318 }
1319
/*
 * Page count of the areas queued on vmap_purge_list: increased by
 * free_vmap_area_noflush(), decreased as __purge_vmap_area_lazy() puts
 * areas back. set_iounmap_nonlazy() overwrites it to force a purge.
 */
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
1321
/*
 * Serialize vmap purging.  There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
1327 static DEFINE_MUTEX(vmap_purge_lock);
1328
1329 /* for per-CPU blocks */
1330 static void purge_fragmented_blocks_allcpus(void);
1331
/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 *
 * It sets vmap_lazy_nr just above lazy_max_pages(), so the next
 * free_vmap_area_noflush() sees the limit exceeded and tries to purge.
 */
void set_iounmap_nonlazy(void)
{
        atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
1340
/*
 * Purges all lazily-freed vmap areas.
 *
 * @start/@end give the initial TLB flush range; it is widened to cover
 * every area found on vmap_purge_list. The caller must hold
 * vmap_purge_lock.
 *
 * Returns false when the purge list was empty (nothing is flushed then),
 * true otherwise.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
        unsigned long resched_threshold;
        struct llist_node *valist;
        struct vmap_area *va;
        struct vmap_area *n_va;

        lockdep_assert_held(&vmap_purge_lock);

        /* Detach the whole list at once; later frees start a new one. */
        valist = llist_del_all(&vmap_purge_list);
        if (unlikely(valist == NULL))
                return false;

        /*
         * TODO: to calculate a flush range without looping.
         * The list can be up to lazy_max_pages() elements.
         */
        llist_for_each_entry(va, valist, purge_list) {
                if (va->va_start < start)
                        start = va->va_start;
                if (va->va_end > end)
                        end = va->va_end;
        }

        /* One flush for the whole range, before any area is reused. */
        flush_tlb_kernel_range(start, end);
        resched_threshold = lazy_max_pages() << 1;

        spin_lock(&free_vmap_area_lock);
        llist_for_each_entry_safe(va, n_va, valist, purge_list) {
                /*
                 * Remember size and bounds now: after the merge below "va"
                 * may point to a different, bigger area.
                 */
                unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
                unsigned long orig_start = va->va_start;
                unsigned long orig_end = va->va_end;

                /*
                 * Finally insert or merge lazily-freed area. It is
                 * detached and there is no need to "unlink" it from
                 * anything.
                 */
                va = merge_or_add_vmap_area(va, &free_vmap_area_root,
                                            &free_vmap_area_list);

                /*
                 * KASAN gets both the original range and the range of the
                 * area as it is after merging.
                 */
                if (is_vmalloc_or_module_addr((void *)orig_start))
                        kasan_release_vmalloc(orig_start, orig_end,
                                              va->va_start, va->va_end);

                atomic_long_sub(nr, &vmap_lazy_nr);

                /*
                 * Offer to reschedule only once the backlog is below twice
                 * lazy_max_pages().
                 */
                if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
                        cond_resched_lock(&free_vmap_area_lock);
        }
        spin_unlock(&free_vmap_area_lock);
        return true;
}
1397
1398 /*
1399  * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
1400  * is already purging.
1401  */
1402 static void try_purge_vmap_area_lazy(void)
1403 {
1404         if (mutex_trylock(&vmap_purge_lock)) {
1405                 __purge_vmap_area_lazy(ULONG_MAX, 0);
1406                 mutex_unlock(&vmap_purge_lock);
1407         }
1408 }
1409
/*
 * Kick off a purge of the outstanding lazy areas.
 *
 * Unlike try_purge_vmap_area_lazy() this waits for vmap_purge_lock, and
 * it purges the fragmented per-CPU blocks of every CPU first.
 */
static void purge_vmap_area_lazy(void)
{
        mutex_lock(&vmap_purge_lock);
        purge_fragmented_blocks_allcpus();
        /* ULONG_MAX/0 is an empty range; the purge widens it as needed. */
        __purge_vmap_area_lazy(ULONG_MAX, 0);
        mutex_unlock(&vmap_purge_lock);
}
1420
/*
 * Free a vmap area, caller ensuring that the area has been unmapped
 * and flush_cache_vunmap had been called for the correct range
 * previously.
 *
 * The area is not returned to the free tree here. It is unlinked from the
 * busy tree and queued on vmap_purge_list; a purge is attempted only once
 * the queued page count exceeds lazy_max_pages().
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
        unsigned long nr_lazy;

        spin_lock(&vmap_area_lock);
        unlink_va(va, &vmap_area_root);
        spin_unlock(&vmap_area_lock);

        /* The size must be read before "va" is put on the purge list. */
        nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
                                PAGE_SHIFT, &vmap_lazy_nr);

        /* After this point, we may free va at any time */
        llist_add(&va->purge_list, &vmap_purge_list);

        if (unlikely(nr_lazy > lazy_max_pages()))
                try_purge_vmap_area_lazy();
}
1443
/*
 * Free and unmap a vmap area
 *
 * Flushes the cache for the range, unmaps it and hands the area to the
 * lazy free path. The TLB is flushed right away only when debug_pagealloc
 * is enabled.
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
        flush_cache_vunmap(va->va_start, va->va_end);
        unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(va->va_start, va->va_end);

        free_vmap_area_noflush(va);
}
1456
/*
 * Locked wrapper around __find_vmap_area(): performs the lookup for @addr
 * under vmap_area_lock and returns whatever it found.
 *
 * NOTE(review): the lock is dropped before returning, so nothing in this
 * function keeps the returned area alive; presumably callers guarantee
 * that - confirm.
 */
static struct vmap_area *find_vmap_area(unsigned long addr)
{
        struct vmap_area *va;

        spin_lock(&vmap_area_lock);
        va = __find_vmap_area(addr);
        spin_unlock(&vmap_area_lock);

        return va;
}
1467
1468 /*** Per cpu kva allocator ***/
1469
1470 /*
1471  * vmap space is limited especially on 32 bit architectures. Ensure there is
1472  * room for at least 16 percpu vmap blocks per CPU.
1473  */
1474 /*
1475  * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
1476  * to #define VMALLOC_SPACE             (VMALLOC_END-VMALLOC_START). Guess
1477  * instead (we just need a rough idea)
1478  */
/* Rough guess of the vmalloc space size: 128MB on 32 bit, 128GB otherwise. */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE           (128UL*1024*1024)
#else
#define VMALLOC_SPACE           (128UL*1024*1024*1024)
#endif

/* The guessed vmalloc space expressed in pages. */
#define VMALLOC_PAGES           (VMALLOC_SPACE / PAGE_SIZE)
/* Largest request, in pages, served by vb_alloc()/vb_free(). */
#define VMAP_MAX_ALLOC          BITS_PER_LONG   /* 256K with 4K pages */
/* Upper and lower bound for the number of pages in one vmap block. */
#define VMAP_BBMAP_BITS_MAX     1024    /* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN     (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)          ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)          ((x) > (y) ? (x) : (y)) /* can't use max() */
/*
 * Pages per vmap block: the guessed space divided by the CPU count
 * (rounded up to a power of two) and by 16, clamped to the bounds above.
 */
#define VMAP_BBMAP_BITS         \
                VMAP_MIN(VMAP_BBMAP_BITS_MAX,   \
                VMAP_MAX(VMAP_BBMAP_BITS_MIN,   \
                        VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

/* Size of one vmap block in bytes. */
#define VMAP_BLOCK_SIZE         (VMAP_BBMAP_BITS * PAGE_SIZE)
1497
/*
 * Per-CPU queue of vmap blocks that still have pages to hand out.
 */
struct vmap_block_queue {
        spinlock_t lock;                /* taken around add/del on "free" */
        struct list_head free;          /* blocks, linked via free_list */
};
1502
/*
 * One block of VMAP_BBMAP_BITS pages from which vb_alloc() hands out
 * small ranges.
 */
struct vmap_block {
        spinlock_t lock;                /* held while the counters below change */
        struct vmap_area *va;           /* address range backing this block */
        /* pages not handed out yet / pages given back through vb_free() */
        unsigned long free, dirty;
        unsigned long dirty_min, dirty_max; /*< dirty range */
        struct list_head free_list;     /* entry in vmap_block_queue.free */
        struct rcu_head rcu_head;       /* for kfree_rcu() in free_vmap_block() */
        struct list_head purge;         /* entry in a local purge list */
};
1512
1513 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
1514 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
1515
1516 /*
1517  * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
1518  * in the free path. Could get rid of this if we change the API to return a
1519  * "cookie" from alloc, to be passed to free. But no big deal yet.
1520  */
1521 static DEFINE_SPINLOCK(vmap_block_tree_lock);
1522 static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
1523
1524 /*
1525  * We should probably have a fallback mechanism to allocate virtual memory
1526  * out of partially filled vmap blocks. However vmap block sizing should be
1527  * fairly reasonable according to the vmalloc size, so it shouldn't be a
1528  * big problem.
1529  */
1530
1531 static unsigned long addr_to_vb_idx(unsigned long addr)
1532 {
1533         addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
1534         addr /= VMAP_BLOCK_SIZE;
1535         return addr;
1536 }
1537
1538 static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
1539 {
1540         unsigned long addr;
1541
1542         addr = va_start + (pages_off << PAGE_SHIFT);
1543         BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
1544         return (void *)addr;
1545 }
1546
/**
 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
 *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
 * @order:    how many 2^order pages should be occupied in newly allocated block
 * @gfp_mask: flags for the page level allocator
 *
 * The occupied pages are the first ones of the block, so the returned
 * address is the start of the block. The new block is indexed in
 * vmap_block_tree and queued on the current CPU's vmap_block_queue.
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        struct vmap_area *va;
        unsigned long vb_idx;
        int node, err;
        void *vaddr;

        node = numa_node_id();

        vb = kmalloc_node(sizeof(struct vmap_block),
                        gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!vb))
                return ERR_PTR(-ENOMEM);

        /* Blocks are VMAP_BLOCK_SIZE long and aligned to VMAP_BLOCK_SIZE. */
        va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
                                        VMALLOC_START, VMALLOC_END,
                                        node, gfp_mask);
        if (IS_ERR(va)) {
                kfree(vb);
                return ERR_CAST(va);
        }

        /* Preload here so the insert under the spinlock below cannot fail. */
        err = radix_tree_preload(gfp_mask);
        if (unlikely(err)) {
                kfree(vb);
                free_vmap_area(va);
                return ERR_PTR(err);
        }

        vaddr = vmap_block_vaddr(va->va_start, 0);
        spin_lock_init(&vb->lock);
        vb->va = va;
        /* At least something should be left free */
        BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
        vb->free = VMAP_BBMAP_BITS - (1UL << order);
        vb->dirty = 0;
        /* Empty dirty range: min above max until vb_free() widens it. */
        vb->dirty_min = VMAP_BBMAP_BITS;
        vb->dirty_max = 0;
        INIT_LIST_HEAD(&vb->free_list);

        vb_idx = addr_to_vb_idx(va->va_start);
        spin_lock(&vmap_block_tree_lock);
        err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
        spin_unlock(&vmap_block_tree_lock);
        BUG_ON(err);
        radix_tree_preload_end();

        /* Make the rest of the block available on this CPU's queue. */
        vbq = &get_cpu_var(vmap_block_queue);
        spin_lock(&vbq->lock);
        list_add_tail_rcu(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);
        put_cpu_var(vmap_block_queue);

        return vaddr;
}
1612
/*
 * Tear down a vmap block: remove it from vmap_block_tree, pass its address
 * range to the lazy free path and free the vmap_block itself via
 * kfree_rcu(). BUGs if the tree held a different block at that index.
 */
static void free_vmap_block(struct vmap_block *vb)
{
        struct vmap_block *tmp;
        unsigned long vb_idx;

        vb_idx = addr_to_vb_idx(vb->va->va_start);
        spin_lock(&vmap_block_tree_lock);
        tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
        spin_unlock(&vmap_block_tree_lock);
        BUG_ON(tmp != vb);

        free_vmap_area_noflush(vb->va);
        kfree_rcu(vb, rcu_head);
}
1627
/*
 * Release the blocks on @cpu's queue in which every page is either still
 * free or already dirty, skipping blocks that are completely dirty.
 * Matching blocks are marked fully dirty, taken off the queue and then
 * freed with free_vmap_block().
 */
static void purge_fragmented_blocks(int cpu)
{
        LIST_HEAD(purge);
        struct vmap_block *vb;
        struct vmap_block *n_vb;
        struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);

        rcu_read_lock();
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {

                /* Cheap check without the lock; repeated under it below. */
                if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
                        continue;

                spin_lock(&vb->lock);
                if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
                        vb->free = 0; /* prevent further allocs after releasing lock */
                        vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
                        vb->dirty_min = 0;
                        vb->dirty_max = VMAP_BBMAP_BITS;
                        spin_lock(&vbq->lock);
                        list_del_rcu(&vb->free_list);
                        spin_unlock(&vbq->lock);
                        spin_unlock(&vb->lock);
                        /* Collected here, freed after the RCU section. */
                        list_add_tail(&vb->purge, &purge);
                } else
                        spin_unlock(&vb->lock);
        }
        rcu_read_unlock();

        list_for_each_entry_safe(vb, n_vb, &purge, purge) {
                list_del(&vb->purge);
                free_vmap_block(vb);
        }
}
1662
/*
 * Run purge_fragmented_blocks() for every possible CPU.
 */
static void purge_fragmented_blocks_allcpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                purge_fragmented_blocks(cpu);
}
1670
/*
 * Allocate @size bytes of address space from a per-CPU vmap block.
 *
 * @size must be a multiple of the page size and no bigger than
 * PAGE_SIZE * VMAP_MAX_ALLOC (BUG otherwise). The request is rounded up
 * to 2^order pages, order being get_order(size).
 *
 * Returns NULL (with a warning) for a zero @size. Otherwise returns an
 * address inside an existing block of this CPU's queue or, when no block
 * has enough room, whatever new_vmap_block() returns, which can be an
 * ERR_PTR().
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        void *vaddr = NULL;
        unsigned int order;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
        if (WARN_ON(size == 0)) {
                /*
                 * Allocating 0 bytes isn't what caller wants since
                 * get_order(0) returns funny result. Just warn and terminate
                 * early.
                 */
                return NULL;
        }
        order = get_order(size);

        rcu_read_lock();
        vbq = &get_cpu_var(vmap_block_queue);
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long pages_off;

                spin_lock(&vb->lock);
                if (vb->free < (1UL << order)) {
                        spin_unlock(&vb->lock);
                        continue;
                }

                /*
                 * Pages are handed out front to back: the next unused
                 * page sits right after the ones already given out.
                 */
                pages_off = VMAP_BBMAP_BITS - vb->free;
                vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
                vb->free -= 1UL << order;
                /* A block with nothing left is taken off the queue. */
                if (vb->free == 0) {
                        spin_lock(&vbq->lock);
                        list_del_rcu(&vb->free_list);
                        spin_unlock(&vbq->lock);
                }

                spin_unlock(&vb->lock);
                break;
        }

        put_cpu_var(vmap_block_queue);
        rcu_read_unlock();

        /* Allocate new block if nothing was found */
        if (!vaddr)
                vaddr = new_vmap_block(order, gfp_mask);

        return vaddr;
}
1723
/*
 * Give back a range that belongs to a vmap block.
 *
 * @addr is the start of the range, @size its length in bytes; @size must
 * be a multiple of the page size and no bigger than
 * PAGE_SIZE * VMAP_MAX_ALLOC (BUG otherwise). The block is looked up in
 * vmap_block_tree and must exist.
 *
 * The range is unmapped and accounted as dirty. Once every page of the
 * block is dirty, the whole block is released with free_vmap_block().
 */
static void vb_free(unsigned long addr, unsigned long size)
{
        unsigned long offset;
        unsigned long vb_idx;
        unsigned int order;
        struct vmap_block *vb;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

        flush_cache_vunmap(addr, addr + size);

        order = get_order(size);

        /* Page offset of the range inside its block. */
        offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;

        vb_idx = addr_to_vb_idx(addr);
        rcu_read_lock();
        vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
        rcu_read_unlock();
        BUG_ON(!vb);

        unmap_kernel_range_noflush(addr, size);

        /* The TLB is flushed right away only with debug_pagealloc. */
        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(addr, addr + size);

        spin_lock(&vb->lock);

        /* Expand dirty range */
        vb->dirty_min = min(vb->dirty_min, offset);
        vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

        vb->dirty += 1UL << order;
        if (vb->dirty == VMAP_BBMAP_BITS) {
                /* Everything is dirty, so nothing may still be free. */
                BUG_ON(vb->free);
                spin_unlock(&vb->lock);
                free_vmap_block(vb);
        } else
                spin_unlock(&vb->lock);
}
1765
1766 static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
1767 {
1768         int cpu;
1769
1770         if (unlikely(!vmap_initialized))
1771                 return;
1772
1773         might_sleep();
1774
1775         for_each_possible_cpu(cpu) {
1776                 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
1777                 struct vmap_block *vb;
1778
1779                 rcu_read_lock();
1780                 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1781                         spin_lock(&vb->lock);
1782                         if (vb->dirty) {
1783                                 unsigned long va_start = vb->va->va_start;
1784                                 unsigned long s, e;
1785
1786                                 s = va_start + (vb->dirty_min << PAGE_SHIFT);
1787                                 e = va_start + (vb->dirty_max << PAGE_SHIFT);
1788
1789                                 start = min(s, start);
1790                                 end   = max(e, end);
1791
1792                                 flush = 1;
1793                         }
1794                         spin_unlock(&vb->lock);
1795                 }
1796                 rcu_read_unlock();
1797         }
1798
1799         mutex_lock(&vmap_purge_lock);
1800         purge_fragmented_blocks_allcpus();
1801         if (!__purge_vmap_area_lazy(start, end) && flush)
1802                 flush_tlb_kernel_range(start, end);
1803         mutex_unlock(&vmap_purge_lock);
1804 }
1805
1806 /**
1807  * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
1808  *
1809  * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
1810  * to amortize TLB flushing overheads. What this means is that any page you
1811  * have now, may, in a former life, have been mapped into kernel virtual
1812  * address by the vmap layer and so there might be some CPUs with TLB entries
1813  * still referencing that page (additional to the regular 1:1 kernel mapping).
1814  *
1815  * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
1816  * be sure that none of the pages we have control over will have any aliases
1817  * from the vmap layer.
1818  */
1819 void vm_unmap_aliases(void)
1820 {
1821         unsigned long start = ULONG_MAX, end = 0;
1822         int flush = 0;
1823
1824         _vm_unmap_aliases(start, end, flush);
1825 }
1826 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
1827
1828 /**
1829  * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
1830  * @mem: the pointer returned by vm_map_ram
1831  * @count: the count passed to that vm_map_ram call (cannot unmap partial)
1832  */
1833 void vm_unmap_ram(const void *mem, unsigned int count)
1834 {
1835         unsigned long size = (unsigned long)count << PAGE_SHIFT;
1836         unsigned long addr = (unsigned long)mem;
1837         struct vmap_area *va;
1838
1839         might_sleep();
1840         BUG_ON(!addr);
1841         BUG_ON(addr < VMALLOC_START);
1842         BUG_ON(addr > VMALLOC_END);
1843         BUG_ON(!PAGE_ALIGNED(addr));
1844
1845         kasan_poison_vmalloc(mem, size);
1846
1847         if (likely(count <= VMAP_MAX_ALLOC)) {
1848                 debug_check_no_locks_freed(mem, size);
1849                 vb_free(addr, size);
1850                 return;
1851         }
1852
1853         va = find_vmap_area(addr);
1854         BUG_ON(!va);
1855         debug_check_no_locks_freed((void *)va->va_start,
1856                                     (va->va_end - va->va_start));
1857         free_unmap_vmap_area(va);
1858 }
1859 EXPORT_SYMBOL(vm_unmap_ram);
1860
1861 /**
1862  * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
1863  * @pages: an array of pointers to the pages to be mapped
1864  * @count: number of pages
1865  * @node: prefer to allocate data structures on this node
1866  *
1867  * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
1868  * faster than vmap so it's good.  But if you mix long-life and short-life
1869  * objects with vm_map_ram(), it could consume lots of address space through
1870  * fragmentation (especially on a 32bit machine).  You could see failures in
1871  * the end.  Please use this function for short-lived objects.
1872  *
1873  * Returns: a pointer to the address that has been mapped, or %NULL on failure
1874  */
1875 void *vm_map_ram(struct page **pages, unsigned int count, int node)
1876 {
1877         unsigned long size = (unsigned long)count << PAGE_SHIFT;
1878         unsigned long addr;
1879         void *mem;
1880
1881         if (likely(count <= VMAP_MAX_ALLOC)) {
1882                 mem = vb_alloc(size, GFP_KERNEL);
1883                 if (IS_ERR(mem))
1884                         return NULL;
1885                 addr = (unsigned long)mem;
1886         } else {
1887                 struct vmap_area *va;
1888                 va = alloc_vmap_area(size, PAGE_SIZE,
1889                                 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
1890                 if (IS_ERR(va))
1891                         return NULL;
1892
1893                 addr = va->va_start;
1894                 mem = (void *)addr;
1895         }
1896
1897         kasan_unpoison_vmalloc(mem, size);
1898
1899         if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
1900                 vm_unmap_ram(mem, count);
1901                 return NULL;
1902         }
1903         return mem;
1904 }
1905 EXPORT_SYMBOL(vm_map_ram);
1906
1907 static struct vm_struct *vmlist __initdata;
1908
1909 /**
1910  * vm_area_add_early - add vmap area early during boot
1911  * @vm: vm_struct to add
1912  *
1913  * This function is used to add fixed kernel vm area to vmlist before
1914  * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
1915  * should contain proper values and the other fields should be zero.
1916  *
1917  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1918  */
1919 void __init vm_area_add_early(struct vm_struct *vm)
1920 {
1921         struct vm_struct *tmp, **p;
1922
1923         BUG_ON(vmap_initialized);
1924         for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1925                 if (tmp->addr >= vm->addr) {
1926                         BUG_ON(tmp->addr < vm->addr + vm->size);
1927                         break;
1928                 } else
1929                         BUG_ON(tmp->addr + tmp->size > vm->addr);
1930         }
1931         vm->next = *p;
1932         *p = vm;
1933 }
1934
1935 /**
1936  * vm_area_register_early - register vmap area early during boot
1937  * @vm: vm_struct to register
1938  * @align: requested alignment
1939  *
1940  * This function is used to register kernel vm area before
1941  * vmalloc_init() is called.  @vm->size and @vm->flags should contain
1942  * proper values on entry and other fields should be zero.  On return,
1943  * vm->addr contains the allocated address.
1944  *
1945  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1946  */
1947 void __init vm_area_register_early(struct vm_struct *vm, size_t align)
1948 {
1949         static size_t vm_init_off __initdata;
1950         unsigned long addr;
1951
1952         addr = ALIGN(VMALLOC_START + vm_init_off, align);
1953         vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
1954
1955         vm->addr = (void *)addr;
1956
1957         vm_area_add_early(vm);
1958 }
1959
1960 static void vmap_init_free_space(void)
1961 {
1962         unsigned long vmap_start = 1;
1963         const unsigned long vmap_end = ULONG_MAX;
1964         struct vmap_area *busy, *free;
1965
1966         /*
1967          *     B     F     B     B     B     F
1968          * -|-----|.....|-----|-----|-----|.....|-
1969          *  |           The KVA space           |
1970          *  |<--------------------------------->|
1971          */
1972         list_for_each_entry(busy, &vmap_area_list, list) {
1973                 if (busy->va_start - vmap_start > 0) {
1974                         free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1975                         if (!WARN_ON_ONCE(!free)) {
1976                                 free->va_start = vmap_start;
1977                                 free->va_end = busy->va_start;
1978
1979                                 insert_vmap_area_augment(free, NULL,
1980                                         &free_vmap_area_root,
1981                                                 &free_vmap_area_list);
1982                         }
1983                 }
1984
1985                 vmap_start = busy->va_end;
1986         }
1987
1988         if (vmap_end - vmap_start > 0) {
1989                 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
1990                 if (!WARN_ON_ONCE(!free)) {
1991                         free->va_start = vmap_start;
1992                         free->va_end = vmap_end;
1993
1994                         insert_vmap_area_augment(free, NULL,
1995                                 &free_vmap_area_root,
1996                                         &free_vmap_area_list);
1997                 }
1998         }
1999 }
2000
2001 void __init vmalloc_init(void)
2002 {
2003         struct vmap_area *va;
2004         struct vm_struct *tmp;
2005         int i;
2006
2007         /*
2008          * Create the cache for vmap_area objects.
2009          */
2010         vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
2011
2012         for_each_possible_cpu(i) {
2013                 struct vmap_block_queue *vbq;
2014                 struct vfree_deferred *p;
2015
2016                 vbq = &per_cpu(vmap_block_queue, i);
2017                 spin_lock_init(&vbq->lock);
2018                 INIT_LIST_HEAD(&vbq->free);
2019                 p = &per_cpu(vfree_deferred, i);
2020                 init_llist_head(&p->list);
2021                 INIT_WORK(&p->wq, free_work);
2022         }
2023
2024         /* Import existing vmlist entries. */
2025         for (tmp = vmlist; tmp; tmp = tmp->next) {
2026                 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2027                 if (WARN_ON_ONCE(!va))
2028                         continue;
2029
2030                 va->va_start = (unsigned long)tmp->addr;
2031                 va->va_end = va->va_start + tmp->size;
2032                 va->vm = tmp;
2033                 insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
2034         }
2035
2036         /*
2037          * Now we can initialize a free vmap space.
2038          */
2039         vmap_init_free_space();
2040         vmap_initialized = true;
2041 }
2042
/**
 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
 * @addr: start of the VM area to unmap
 * @size: size of the VM area to unmap
 *
 * Like unmap_kernel_range_noflush(), but the vcache is flushed before the
 * range is unmapped and the TLB is flushed afterwards.
 */
void unmap_kernel_range(unsigned long addr, unsigned long size)
{
        flush_cache_vunmap(addr, addr + size);
        unmap_kernel_range_noflush(addr, size);
        flush_tlb_kernel_range(addr, addr + size);
}
2059
2060 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2061         struct vmap_area *va, unsigned long flags, const void *caller)
2062 {
2063         vm->flags = flags;
2064         vm->addr = (void *)va->va_start;
2065         vm->size = va->va_end - va->va_start;
2066         vm->caller = caller;
2067         va->vm = vm;
2068 }
2069
2070 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2071                               unsigned long flags, const void *caller)
2072 {
2073         spin_lock(&vmap_area_lock);
2074         setup_vmalloc_vm_locked(vm, va, flags, caller);
2075         spin_unlock(&vmap_area_lock);
2076 }
2077
2078 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
2079 {
2080         /*
2081          * Before removing VM_UNINITIALIZED,
2082          * we should make sure that vm has proper values.
2083          * Pair with smp_rmb() in show_numa_info().
2084          */
2085         smp_wmb();
2086         vm->flags &= ~VM_UNINITIALIZED;
2087 }
2088
2089 static struct vm_struct *__get_vm_area_node(unsigned long size,
2090                 unsigned long align, unsigned long flags, unsigned long start,
2091                 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
2092 {
2093         struct vmap_area *va;
2094         struct vm_struct *area;
2095         unsigned long requested_size = size;
2096
2097         BUG_ON(in_interrupt());
2098         size = PAGE_ALIGN(size);
2099         if (unlikely(!size))
2100                 return NULL;
2101
2102         if (flags & VM_IOREMAP)
2103                 align = 1ul << clamp_t(int, get_count_order_long(size),
2104                                        PAGE_SHIFT, IOREMAP_MAX_ORDER);
2105
2106         area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
2107         if (unlikely(!area))
2108                 return NULL;
2109
2110         if (!(flags & VM_NO_GUARD))
2111                 size += PAGE_SIZE;
2112
2113         va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
2114         if (IS_ERR(va)) {
2115                 kfree(area);
2116                 return NULL;
2117         }
2118
2119         kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
2120
2121         setup_vmalloc_vm(area, va, flags, caller);
2122
2123         return area;
2124 }
2125
2126 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
2127                                        unsigned long start, unsigned long end,
2128                                        const void *caller)
2129 {
2130         return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
2131                                   GFP_KERNEL, caller);
2132 }
2133
2134 /**
2135  * get_vm_area - reserve a contiguous kernel virtual area
2136  * @size:        size of the area
2137  * @flags:       %VM_IOREMAP for I/O mappings or VM_ALLOC
2138  *
2139  * Search an area of @size in the kernel virtual mapping area,
2140  * and reserved it for out purposes.  Returns the area descriptor
2141  * on success or %NULL on failure.
2142  *
2143  * Return: the area descriptor on success or %NULL on failure.
2144  */
2145 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
2146 {
2147         return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
2148                                   NUMA_NO_NODE, GFP_KERNEL,
2149                                   __builtin_return_address(0));
2150 }
2151
2152 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
2153                                 const void *caller)
2154 {
2155         return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
2156                                   NUMA_NO_NODE, GFP_KERNEL, caller);
2157 }
2158
2159 /**
2160  * find_vm_area - find a continuous kernel virtual area
2161  * @addr:         base address
2162  *
2163  * Search for the kernel VM area starting at @addr, and return it.
2164  * It is up to the caller to do all required locking to keep the returned
2165  * pointer valid.
2166  *
2167  * Return: pointer to the found area or %NULL on faulure
2168  */
2169 struct vm_struct *find_vm_area(const void *addr)
2170 {
2171         struct vmap_area *va;
2172
2173         va = find_vmap_area((unsigned long)addr);
2174         if (!va)
2175                 return NULL;
2176
2177         return va->vm;
2178 }
2179
2180 /**
2181  * remove_vm_area - find and remove a continuous kernel virtual area
2182  * @addr:           base address
2183  *
2184  * Search for the kernel VM area starting at @addr, and remove it.
2185  * This function returns the found VM area, but using it is NOT safe
2186  * on SMP machines, except for its size or flags.
2187  *
2188  * Return: pointer to the found area or %NULL on faulure
2189  */
2190 struct vm_struct *remove_vm_area(const void *addr)
2191 {
2192         struct vmap_area *va;
2193
2194         might_sleep();
2195
2196         spin_lock(&vmap_area_lock);
2197         va = __find_vmap_area((unsigned long)addr);
2198         if (va && va->vm) {
2199                 struct vm_struct *vm = va->vm;
2200
2201                 va->vm = NULL;
2202                 spin_unlock(&vmap_area_lock);
2203
2204                 kasan_free_shadow(vm);
2205                 free_unmap_vmap_area(va);
2206
2207                 return vm;
2208         }
2209
2210         spin_unlock(&vmap_area_lock);
2211         return NULL;
2212 }
2213
2214 static inline void set_area_direct_map(const struct vm_struct *area,
2215                                        int (*set_direct_map)(struct page *page))
2216 {
2217         int i;
2218
2219         for (i = 0; i < area->nr_pages; i++)
2220                 if (page_address(area->pages[i]))
2221                         set_direct_map(area->pages[i]);
2222 }
2223
2224 /* Handle removing and resetting vm mappings related to the vm_struct. */
2225 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
2226 {
2227         unsigned long start = ULONG_MAX, end = 0;
2228         int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
2229         int flush_dmap = 0;
2230         int i;
2231
2232         remove_vm_area(area->addr);
2233
2234         /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
2235         if (!flush_reset)
2236                 return;
2237
2238         /*
2239          * If not deallocating pages, just do the flush of the VM area and
2240          * return.
2241          */
2242         if (!deallocate_pages) {
2243                 vm_unmap_aliases();
2244                 return;
2245         }
2246
2247         /*
2248          * If execution gets here, flush the vm mapping and reset the direct
2249          * map. Find the start and end range of the direct mappings to make sure
2250          * the vm_unmap_aliases() flush includes the direct map.
2251          */
2252         for (i = 0; i < area->nr_pages; i++) {
2253                 unsigned long addr = (unsigned long)page_address(area->pages[i]);
2254                 if (addr) {
2255                         start = min(addr, start);
2256                         end = max(addr + PAGE_SIZE, end);
2257                         flush_dmap = 1;
2258                 }
2259         }
2260
2261         /*
2262          * Set direct map to something invalid so that it won't be cached if
2263          * there are any accesses after the TLB flush, then flush the TLB and
2264          * reset the direct map permissions to the default.
2265          */
2266         set_area_direct_map(area, set_direct_map_invalid_noflush);
2267         _vm_unmap_aliases(start, end, flush_dmap);
2268         set_area_direct_map(area, set_direct_map_default_noflush);
2269 }
2270
2271 static void __vunmap(const void *addr, int deallocate_pages)
2272 {
2273         struct vm_struct *area;
2274
2275         if (!addr)
2276                 return;
2277
2278         if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
2279                         addr))
2280                 return;
2281
2282         area = find_vm_area(addr);
2283         if (unlikely(!area)) {
2284                 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
2285                                 addr);
2286                 return;
2287         }
2288
2289         debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
2290         debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
2291
2292         kasan_poison_vmalloc(area->addr, area->size);
2293
2294         vm_remove_mappings(area, deallocate_pages);
2295
2296         if (deallocate_pages) {
2297                 int i;
2298
2299                 for (i = 0; i < area->nr_pages; i++) {
2300                         struct page *page = area->pages[i];
2301
2302                         BUG_ON(!page);
2303                         __free_pages(page, 0);
2304                 }
2305                 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
2306
2307                 kvfree(area->pages);
2308         }
2309
2310         kfree(area);
2311         return;
2312 }
2313
2314 static inline void __vfree_deferred(const void *addr)
2315 {
2316         /*
2317          * Use raw_cpu_ptr() because this can be called from preemptible
2318          * context. Preemption is absolutely fine here, because the llist_add()
2319          * implementation is lockless, so it works even if we are adding to
2320          * another cpu's list. schedule_work() should be fine with this too.
2321          */
2322         struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
2323
2324         if (llist_add((struct llist_node *)addr, &p->list))
2325                 schedule_work(&p->wq);
2326 }
2327
/**
 * vfree_atomic - release memory allocated by vmalloc()
 * @addr:         memory base address
 *
 * Works like vfree() but may be called in any atomic context except NMIs:
 * the actual freeing is always deferred.  A %NULL @addr is ignored.
 */
void vfree_atomic(const void *addr)
{
        BUG_ON(in_nmi());

        kmemleak_free(addr);

        if (addr)
                __vfree_deferred(addr);
}
2345
/*
 * __vfree - free a vmalloc area, deferring the work in interrupt context
 * @addr: base address of the area
 *
 * In interrupt context the area is queued with __vfree_deferred(); otherwise
 * it is torn down synchronously, including its pages.
 */
static void __vfree(const void *addr)
{
        if (unlikely(in_interrupt())) {
                __vfree_deferred(addr);
                return;
        }

        __vunmap(addr, 1);
}
2353
/**
 * vfree - release memory allocated by vmalloc()
 * @addr:  memory base address
 *
 * Free the virtually continuous memory area starting at @addr, as
 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
 * NULL, no operation is performed.
 *
 * Must not be called in NMI context (strictly speaking, only if we don't
 * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 * conventions for vfree() arch-dependent would be a really bad idea)
 *
 * May sleep if called *not* from interrupt context.
 *
 * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
 */
void vfree(const void *addr)
{
        BUG_ON(in_nmi());

        kmemleak_free(addr);

        might_sleep_if(!in_interrupt());

        if (addr)
                __vfree(addr);
}
EXPORT_SYMBOL(vfree);
2384
/**
 * vunmap - release virtual mapping obtained by vmap()
 * @addr:   memory base address
 *
 * Free the virtually contiguous memory area starting at @addr, which was
 * created from the page array passed to vmap().  The pages themselves are
 * not freed.  A %NULL @addr is ignored.
 *
 * Must not be called in interrupt context.
 */
void vunmap(const void *addr)
{
        BUG_ON(in_interrupt());
        might_sleep();

        if (!addr)
                return;

        __vunmap(addr, 0);
}
EXPORT_SYMBOL(vunmap);
2402
2403 /**
2404  * vmap - map an array of pages into virtually contiguous space
2405  * @pages: array of page pointers
2406  * @count: number of pages to map
2407  * @flags: vm_area->flags
2408  * @prot: page protection for the mapping
2409  *
2410  * Maps @count pages from @pages into contiguous kernel virtual
2411  * space.
2412  *
2413  * Return: the address of the area or %NULL on failure
2414  */
2415 void *vmap(struct page **pages, unsigned int count,
2416            unsigned long flags, pgprot_t prot)
2417 {
2418         struct vm_struct *area;
2419         unsigned long size;             /* In bytes */
2420
2421         might_sleep();
2422
2423         if (count > totalram_pages())
2424                 return NULL;
2425
2426         size = (unsigned long)count << PAGE_SHIFT;
2427         area = get_vm_area_caller(size, flags, __builtin_return_address(0));
2428         if (!area)
2429                 return NULL;
2430
2431         if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
2432                         pages) < 0) {
2433                 vunmap(area->addr);
2434                 return NULL;
2435         }
2436
2437         return area->addr;
2438 }
2439 EXPORT_SYMBOL(vmap);
2440
2441 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
2442                                  pgprot_t prot, int node)
2443 {
2444         struct page **pages;
2445         unsigned int nr_pages, array_size, i;
2446         const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
2447         const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
2448         const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
2449                                         0 :
2450                                         __GFP_HIGHMEM;
2451
2452         nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
2453         array_size = (nr_pages * sizeof(struct page *));
2454
2455         /* Please note that the recursion is strictly bounded. */
2456         if (array_size > PAGE_SIZE) {
2457                 pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
2458                                 node, area->caller);
2459         } else {
2460                 pages = kmalloc_node(array_size, nested_gfp, node);
2461         }
2462
2463         if (!pages) {
2464                 remove_vm_area(area->addr);
2465                 kfree(area);
2466                 return NULL;
2467         }
2468
2469         area->pages = pages;
2470         area->nr_pages = nr_pages;
2471
2472         for (i = 0; i < area->nr_pages; i++) {
2473                 struct page *page;
2474
2475                 if (node == NUMA_NO_NODE)
2476                         page = alloc_page(alloc_mask|highmem_mask);
2477                 else
2478                         page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
2479
2480                 if (unlikely(!page)) {
2481                         /* Successfully allocated i pages, free them in __vunmap() */
2482                         area->nr_pages = i;
2483                         atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
2484                         goto fail;
2485                 }
2486                 area->pages[i] = page;
2487                 if (gfpflags_allow_blocking(gfp_mask))
2488                         cond_resched();
2489         }
2490         atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
2491
2492         if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
2493                         prot, pages) < 0)
2494                 goto fail;
2495
2496         return area->addr;
2497
2498 fail:
2499         warn_alloc(gfp_mask, NULL,
2500                           "vmalloc: allocation failure, allocated %ld of %ld bytes",
2501                           (area->nr_pages*PAGE_SIZE), area->size);
2502         __vfree(area->addr);
2503         return NULL;
2504 }
2505
2506 /**
2507  * __vmalloc_node_range - allocate virtually contiguous memory
2508  * @size:                 allocation size
2509  * @align:                desired alignment
2510  * @start:                vm area range start
2511  * @end:                  vm area range end
2512  * @gfp_mask:             flags for the page level allocator
2513  * @prot:                 protection mask for the allocated pages
2514  * @vm_flags:             additional vm area flags (e.g. %VM_NO_GUARD)
2515  * @node:                 node to use for allocation or NUMA_NO_NODE
2516  * @caller:               caller's return address
2517  *
2518  * Allocate enough pages to cover @size from the page level
2519  * allocator with @gfp_mask flags.  Map them into contiguous
2520  * kernel virtual space, using a pagetable protection of @prot.
2521  *
2522  * Return: the address of the area or %NULL on failure
2523  */
2524 void *__vmalloc_node_range(unsigned long size, unsigned long align,
2525                         unsigned long start, unsigned long end, gfp_t gfp_mask,
2526                         pgprot_t prot, unsigned long vm_flags, int node,
2527                         const void *caller)
2528 {
2529         struct vm_struct *area;
2530         void *addr;
2531         unsigned long real_size = size;
2532
2533         size = PAGE_ALIGN(size);
2534         if (!size || (size >> PAGE_SHIFT) > totalram_pages())
2535                 goto fail;
2536
2537         area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
2538                                 vm_flags, start, end, node, gfp_mask, caller);
2539         if (!area)
2540                 goto fail;
2541
2542         addr = __vmalloc_area_node(area, gfp_mask, prot, node);
2543         if (!addr)
2544                 return NULL;
2545
2546         /*
2547          * In this function, newly allocated vm_struct has VM_UNINITIALIZED
2548          * flag. It means that vm_struct is not fully initialized.
2549          * Now, it is fully initialized, so remove this flag here.
2550          */
2551         clear_vm_uninitialized_flag(area);
2552
2553         kmemleak_vmalloc(area, size, gfp_mask);
2554
2555         return addr;
2556
2557 fail:
2558         warn_alloc(gfp_mask, NULL,
2559                           "vmalloc: allocation failure: %lu bytes", real_size);
2560         return NULL;
2561 }
2562
2563 /**
2564  * __vmalloc_node - allocate virtually contiguous memory
2565  * @size:           allocation size
2566  * @align:          desired alignment
2567  * @gfp_mask:       flags for the page level allocator
2568  * @node:           node to use for allocation or NUMA_NO_NODE
2569  * @caller:         caller's return address
2570  *
2571  * Allocate enough pages to cover @size from the page level allocator with
2572  * @gfp_mask flags.  Map them into contiguous kernel virtual space.
2573  *
2574  * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
2575  * and __GFP_NOFAIL are not supported
2576  *
2577  * Any use of gfp flags outside of GFP_KERNEL should be consulted
2578  * with mm people.
2579  *
2580  * Return: pointer to the allocated memory or %NULL on error
2581  */
2582 void *__vmalloc_node(unsigned long size, unsigned long align,
2583                             gfp_t gfp_mask, int node, const void *caller)
2584 {
2585         return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
2586                                 gfp_mask, PAGE_KERNEL, 0, node, caller);
2587 }
2588 /*
2589  * This is only for performance analysis of vmalloc and stress purpose.
2590  * It is required by vmalloc test module, therefore do not use it other
2591  * than that.
2592  */
2593 #ifdef CONFIG_TEST_VMALLOC_MODULE
2594 EXPORT_SYMBOL_GPL(__vmalloc_node);
2595 #endif
2596
2597 void *__vmalloc(unsigned long size, gfp_t gfp_mask)
2598 {
2599         return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
2600                                 __builtin_return_address(0));
2601 }
2602 EXPORT_SYMBOL(__vmalloc);
2603
2604 /**
2605  * vmalloc - allocate virtually contiguous memory
2606  * @size:    allocation size
2607  *
2608  * Allocate enough pages to cover @size from the page level
2609  * allocator and map them into contiguous kernel virtual space.
2610  *
2611  * For tight control over page level allocator and protection flags
2612  * use __vmalloc() instead.
2613  *
2614  * Return: pointer to the allocated memory or %NULL on error
2615  */
2616 void *vmalloc(unsigned long size)
2617 {
2618         return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
2619                                 __builtin_return_address(0));
2620 }
2621 EXPORT_SYMBOL(vmalloc);
2622
2623 /**
2624  * vzalloc - allocate virtually contiguous memory with zero fill
2625  * @size:    allocation size
2626  *
2627  * Allocate enough pages to cover @size from the page level
2628  * allocator and map them into contiguous kernel virtual space.
2629  * The memory allocated is set to zero.
2630  *
2631  * For tight control over page level allocator and protection flags
2632  * use __vmalloc() instead.
2633  *
2634  * Return: pointer to the allocated memory or %NULL on error
2635  */
2636 void *vzalloc(unsigned long size)
2637 {
2638         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
2639                                 __builtin_return_address(0));
2640 }
2641 EXPORT_SYMBOL(vzalloc);
2642
2643 /**
2644  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
2645  * @size: allocation size
2646  *
2647  * The resulting memory area is zeroed so it can be mapped to userspace
2648  * without leaking data.
2649  *
2650  * Return: pointer to the allocated memory or %NULL on error
2651  */
2652 void *vmalloc_user(unsigned long size)
2653 {
2654         return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
2655                                     GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
2656                                     VM_USERMAP, NUMA_NO_NODE,
2657                                     __builtin_return_address(0));
2658 }
2659 EXPORT_SYMBOL(vmalloc_user);
2660
2661 /**
2662  * vmalloc_node - allocate memory on a specific node
2663  * @size:         allocation size
2664  * @node:         numa node
2665  *
2666  * Allocate enough pages to cover @size from the page level
2667  * allocator and map them into contiguous kernel virtual space.
2668  *
2669  * For tight control over page level allocator and protection flags
2670  * use __vmalloc() instead.
2671  *
2672  * Return: pointer to the allocated memory or %NULL on error
2673  */
2674 void *vmalloc_node(unsigned long size, int node)
2675 {
2676         return __vmalloc_node(size, 1, GFP_KERNEL, node,
2677                         __builtin_return_address(0));
2678 }
2679 EXPORT_SYMBOL(vmalloc_node);
2680
2681 /**
2682  * vzalloc_node - allocate memory on a specific node with zero fill
2683  * @size:       allocation size
2684  * @node:       numa node
2685  *
2686  * Allocate enough pages to cover @size from the page level
2687  * allocator and map them into contiguous kernel virtual space.
2688  * The memory allocated is set to zero.
2689  *
2690  * Return: pointer to the allocated memory or %NULL on error
2691  */
2692 void *vzalloc_node(unsigned long size, int node)
2693 {
2694         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
2695                                 __builtin_return_address(0));
2696 }
2697 EXPORT_SYMBOL(vzalloc_node);
2698
/*
 * GFP_VMALLOC32 - gfp mask used by vmalloc_32() and vmalloc_32_user().
 *
 * Every variant is fully parenthesized so that the expansion behaves as a
 * single operand whatever operator the user places next to it.
 */
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
/*
 * 64b systems should always have either DMA or DMA32 zones. For others
 * GFP_DMA32 should do the right thing and use the normal zone.
 */
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif
2710
2711 /**
2712  * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
2713  * @size:       allocation size
2714  *
2715  * Allocate enough 32bit PA addressable pages to cover @size from the
2716  * page level allocator and map them into contiguous kernel virtual space.
2717  *
2718  * Return: pointer to the allocated memory or %NULL on error
2719  */
2720 void *vmalloc_32(unsigned long size)
2721 {
2722         return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
2723                         __builtin_return_address(0));
2724 }
2725 EXPORT_SYMBOL(vmalloc_32);
2726
2727 /**
2728  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
2729  * @size:            allocation size
2730  *
2731  * The resulting memory area is 32bit addressable and zeroed so it can be
2732  * mapped to userspace without leaking data.
2733  *
2734  * Return: pointer to the allocated memory or %NULL on error
2735  */
2736 void *vmalloc_32_user(unsigned long size)
2737 {
2738         return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
2739                                     GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
2740                                     VM_USERMAP, NUMA_NO_NODE,
2741                                     __builtin_return_address(0));
2742 }
2743 EXPORT_SYMBOL(vmalloc_32_user);
2744
2745 /*
2746  * Small helper routine: copy contents from addr to buf.
2747  * If a page is not present, fill the corresponding part of buf with zeroes.
2748  */
2749
2750 static int aligned_vread(char *buf, char *addr, unsigned long count)
2751 {
2752         struct page *p;
2753         int copied = 0;
2754
2755         while (count) {
2756                 unsigned long offset, length;
2757
2758                 offset = offset_in_page(addr);
2759                 length = PAGE_SIZE - offset;
2760                 if (length > count)
2761                         length = count;
2762                 p = vmalloc_to_page(addr);
2763                 /*
2764                  * To do safe access to this _mapped_ area, we need
2765                  * lock. But adding lock here means that we need to add
2766                  * overhead of vmalloc()/vfree() calles for this _debug_
2767                  * interface, rarely used. Instead of that, we'll use
2768                  * kmap() and get small overhead in this access function.
2769                  */
2770                 if (p) {
2771                         /*
2772                          * we can expect USER0 is not used (see vread/vwrite's
2773                          * function description)
2774                          */
2775                         void *map = kmap_atomic(p);
2776                         memcpy(buf, map + offset, length);
2777                         kunmap_atomic(map);
2778                 } else
2779                         memset(buf, 0, length);
2780
2781                 addr += length;
2782                 buf += length;
2783                 copied += length;
2784                 count -= length;
2785         }
2786         return copied;
2787 }
2788
2789 static int aligned_vwrite(char *buf, char *addr, unsigned long count)
2790 {
2791         struct page *p;
2792         int copied = 0;
2793
2794         while (count) {
2795                 unsigned long offset, length;
2796
2797                 offset = offset_in_page(addr);
2798                 length = PAGE_SIZE - offset;
2799                 if (length > count)
2800                         length = count;
2801                 p = vmalloc_to_page(addr);
2802                 /*
2803                  * To do safe access to this _mapped_ area, we need
2804                  * lock. But adding lock here means that we need to add
2805                  * overhead of vmalloc()/vfree() calles for this _debug_
2806                  * interface, rarely used. Instead of that, we'll use
2807                  * kmap() and get small overhead in this access function.
2808                  */
2809                 if (p) {
2810                         /*
2811                          * we can expect USER0 is not used (see vread/vwrite's
2812                          * function description)
2813                          */
2814                         void *map = kmap_atomic(p);
2815                         memcpy(map + offset, buf, length);
2816                         kunmap_atomic(map);
2817                 }
2818                 addr += length;
2819                 buf += length;
2820                 copied += length;
2821                 count -= length;
2822         }
2823         return copied;
2824 }
2825
2826 /**
2827  * vread() - read vmalloc area in a safe way.
2828  * @buf:     buffer for reading data
2829  * @addr:    vm address.
2830  * @count:   number of bytes to be read.
2831  *
2832  * This function checks that addr is a valid vmalloc'ed area, and
2833  * copy data from that area to a given buffer. If the given memory range
2834  * of [addr...addr+count) includes some valid address, data is copied to
2835  * proper area of @buf. If there are memory holes, they'll be zero-filled.
2836  * IOREMAP area is treated as memory hole and no copy is done.
2837  *
2838  * If [addr...addr+count) doesn't intersect any live vm_struct
2839  * area, returns 0. @buf should be a kernel buffer.
2840  *
2841  * Note: In usual ops, vread() is never necessary because the caller
2842  * should know vmalloc() area is valid and can use memcpy().
2843  * This is for routines which have to access vmalloc area without
2844  * any information, as /dev/kmem.
2845  *
2846  * Return: number of bytes for which addr and buf should be increased
2847  * (same number as @count) or %0 if [addr...addr+count) doesn't
2848  * include any intersection with valid vmalloc area
2849  */
2850 long vread(char *buf, char *addr, unsigned long count)
2851 {
2852         struct vmap_area *va;
2853         struct vm_struct *vm;
2854         char *vaddr, *buf_start = buf;
2855         unsigned long buflen = count;
2856         unsigned long n;
2857
2858         /* Don't allow overflow */
2859         if ((unsigned long) addr + count < count)
2860                 count = -(unsigned long) addr;
2861
2862         spin_lock(&vmap_area_lock);
2863         list_for_each_entry(va, &vmap_area_list, list) {
2864                 if (!count)
2865                         break;
2866
2867                 if (!va->vm)
2868                         continue;
2869
2870                 vm = va->vm;
2871                 vaddr = (char *) vm->addr;
2872                 if (addr >= vaddr + get_vm_area_size(vm))
2873                         continue;
2874                 while (addr < vaddr) {
2875                         if (count == 0)
2876                                 goto finished;
2877                         *buf = '\0';
2878                         buf++;
2879                         addr++;
2880                         count--;
2881                 }
2882                 n = vaddr + get_vm_area_size(vm) - addr;
2883                 if (n > count)
2884                         n = count;
2885                 if (!(vm->flags & VM_IOREMAP))
2886                         aligned_vread(buf, addr, n);
2887                 else /* IOREMAP area is treated as memory hole */
2888                         memset(buf, 0, n);
2889                 buf += n;
2890                 addr += n;
2891                 count -= n;
2892         }
2893 finished:
2894         spin_unlock(&vmap_area_lock);
2895
2896         if (buf == buf_start)
2897                 return 0;
2898         /* zero-fill memory holes */
2899         if (buf != buf_start + buflen)
2900                 memset(buf, 0, buflen - (buf - buf_start));
2901
2902         return buflen;
2903 }
2904
2905 /**
2906  * vwrite() - write vmalloc area in a safe way.
2907  * @buf:      buffer for source data
2908  * @addr:     vm address.
2909  * @count:    number of bytes to be written.
2910  *
2911  * This function checks that addr is a valid vmalloc'ed area, and
2912  * copy data from a buffer to the given addr. If specified range of
2913  * [addr...addr+count) includes some valid address, data is copied from
2914  * proper area of @buf. If there are memory holes, no copy to hole.
2915  * IOREMAP area is treated as memory hole and no copy is done.
2916  *
2917  * If [addr...addr+count) doesn't intersect any live vm_struct
2918  * area, returns 0. @buf should be a kernel buffer.
2919  *
2920  * Note: In usual ops, vwrite() is never necessary because the caller
2921  * should know vmalloc() area is valid and can use memcpy().
2922  * This is for routines which have to access vmalloc area without
2923  * any information, as /dev/kmem.
2924  *
2925  * Return: number of bytes for which addr and buf should be
2926  * increased (same number as @count) or %0 if [addr...addr+count)
2927  * doesn't include any intersection with valid vmalloc area
2928  */
2929 long vwrite(char *buf, char *addr, unsigned long count)
2930 {
2931         struct vmap_area *va;
2932         struct vm_struct *vm;
2933         char *vaddr;
2934         unsigned long n, buflen;
2935         int copied = 0;
2936
2937         /* Don't allow overflow */
2938         if ((unsigned long) addr + count < count)
2939                 count = -(unsigned long) addr;
2940         buflen = count;
2941
2942         spin_lock(&vmap_area_lock);
2943         list_for_each_entry(va, &vmap_area_list, list) {
2944                 if (!count)
2945                         break;
2946
2947                 if (!va->vm)
2948                         continue;
2949
2950                 vm = va->vm;
2951                 vaddr = (char *) vm->addr;
2952                 if (addr >= vaddr + get_vm_area_size(vm))
2953                         continue;
2954                 while (addr < vaddr) {
2955                         if (count == 0)
2956                                 goto finished;
2957                         buf++;
2958                         addr++;
2959                         count--;
2960                 }
2961                 n = vaddr + get_vm_area_size(vm) - addr;
2962                 if (n > count)
2963                         n = count;
2964                 if (!(vm->flags & VM_IOREMAP)) {
2965                         aligned_vwrite(buf, addr, n);
2966                         copied++;
2967                 }
2968                 buf += n;
2969                 addr += n;
2970                 count -= n;
2971         }
2972 finished:
2973         spin_unlock(&vmap_area_lock);
2974         if (!copied)
2975                 return 0;
2976         return buflen;
2977 }
2978
2979 /**
2980  * remap_vmalloc_range_partial - map vmalloc pages to userspace
2981  * @vma:                vma to cover
2982  * @uaddr:              target user address to start at
2983  * @kaddr:              virtual address of vmalloc kernel memory
2984  * @pgoff:              offset from @kaddr to start at
2985  * @size:               size of map area
2986  *
2987  * Returns:     0 for success, -Exxx on failure
2988  *
2989  * This function checks that @kaddr is a valid vmalloc'ed area,
2990  * and that it is big enough to cover the range starting at
2991  * @uaddr in @vma. Will return failure if that criteria isn't
2992  * met.
2993  *
2994  * Similar to remap_pfn_range() (see mm/memory.c)
2995  */
2996 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2997                                 void *kaddr, unsigned long pgoff,
2998                                 unsigned long size)
2999 {
3000         struct vm_struct *area;
3001         unsigned long off;
3002         unsigned long end_index;
3003
3004         if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
3005                 return -EINVAL;
3006
3007         size = PAGE_ALIGN(size);
3008
3009         if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
3010                 return -EINVAL;
3011
3012         area = find_vm_area(kaddr);
3013         if (!area)
3014                 return -EINVAL;
3015
3016         if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
3017                 return -EINVAL;
3018
3019         if (check_add_overflow(size, off, &end_index) ||
3020             end_index > get_vm_area_size(area))
3021                 return -EINVAL;
3022         kaddr += off;
3023
3024         do {
3025                 struct page *page = vmalloc_to_page(kaddr);
3026                 int ret;
3027
3028                 ret = vm_insert_page(vma, uaddr, page);
3029                 if (ret)
3030                         return ret;
3031
3032                 uaddr += PAGE_SIZE;
3033                 kaddr += PAGE_SIZE;
3034                 size -= PAGE_SIZE;
3035         } while (size > 0);
3036
3037         vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
3038
3039         return 0;
3040 }
3041 EXPORT_SYMBOL(remap_vmalloc_range_partial);
3042
3043 /**
3044  * remap_vmalloc_range - map vmalloc pages to userspace
3045  * @vma:                vma to cover (map full range of vma)
3046  * @addr:               vmalloc memory
3047  * @pgoff:              number of pages into addr before first page to map
3048  *
3049  * Returns:     0 for success, -Exxx on failure
3050  *
3051  * This function checks that addr is a valid vmalloc'ed area, and
3052  * that it is big enough to cover the vma. Will return failure if
3053  * that criteria isn't met.
3054  *
3055  * Similar to remap_pfn_range() (see mm/memory.c)
3056  */
3057 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
3058                                                 unsigned long pgoff)
3059 {
3060         return remap_vmalloc_range_partial(vma, vma->vm_start,
3061                                            addr, pgoff,
3062                                            vma->vm_end - vma->vm_start);
3063 }
3064 EXPORT_SYMBOL(remap_vmalloc_range);
3065
3066 static int f(pte_t *pte, unsigned long addr, void *data)
3067 {
3068         pte_t ***p = data;
3069
3070         if (p) {
3071                 *(*p) = pte;
3072                 (*p)++;
3073         }
3074         return 0;
3075 }
3076
3077 /**
3078  * alloc_vm_area - allocate a range of kernel address space
3079  * @size:          size of the area
3080  * @ptes:          returns the PTEs for the address space
3081  *
3082  * Returns:     NULL on failure, vm_struct on success
3083  *
3084  * This function reserves a range of kernel address space, and
3085  * allocates pagetables to map that range.  No actual mappings
3086  * are created.
3087  *
3088  * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
3089  * allocated for the VM area are returned.
3090  */
3091 struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
3092 {
3093         struct vm_struct *area;
3094
3095         area = get_vm_area_caller(size, VM_IOREMAP,
3096                                 __builtin_return_address(0));
3097         if (area == NULL)
3098                 return NULL;
3099
3100         /*
3101          * This ensures that page tables are constructed for this region
3102          * of kernel virtual address space and mapped into init_mm.
3103          */
3104         if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
3105                                 size, f, ptes ? &ptes : NULL)) {
3106                 free_vm_area(area);
3107                 return NULL;
3108         }
3109
3110         return area;
3111 }
3112 EXPORT_SYMBOL_GPL(alloc_vm_area);
3113
3114 void free_vm_area(struct vm_struct *area)
3115 {
3116         struct vm_struct *ret;
3117         ret = remove_vm_area(area->addr);
3118         BUG_ON(ret != area);
3119         kfree(area);
3120 }
3121 EXPORT_SYMBOL_GPL(free_vm_area);
3122
3123 #ifdef CONFIG_SMP
3124 static struct vmap_area *node_to_va(struct rb_node *n)
3125 {
3126         return rb_entry_safe(n, struct vmap_area, rb_node);
3127 }
3128
3129 /**
3130  * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3131  * @addr: target address
3132  *
3133  * Returns: vmap_area if it is found. If there is no such area,
3134  *   the highest vmap_area below @addr (reverse order) is returned,
3135  *   i.e. va->va_start < addr && va->va_end < addr, or NULL
3136  *   if there are no areas before @addr.
3137  */
3138 static struct vmap_area *
3139 pvm_find_va_enclose_addr(unsigned long addr)
3140 {
3141         struct vmap_area *va, *tmp;
3142         struct rb_node *n;
3143
3144         n = free_vmap_area_root.rb_node;
3145         va = NULL;
3146
3147         while (n) {
3148                 tmp = rb_entry(n, struct vmap_area, rb_node);
3149                 if (tmp->va_start <= addr) {
3150                         va = tmp;
3151                         if (tmp->va_end >= addr)
3152                                 break;
3153
3154                         n = n->rb_right;
3155                 } else {
3156                         n = n->rb_left;
3157                 }
3158         }
3159
3160         return va;
3161 }
3162
3163 /**
3164  * pvm_determine_end_from_reverse - find the highest aligned address
3165  * of free block below VMALLOC_END
3166  * @va:
3167  *   in - the VA we start the search(reverse order);
3168  *   out - the VA with the highest aligned end address.
3169  *
3170  * Returns: determined end address within vmap_area
3171  */
3172 static unsigned long
3173 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
3174 {
3175         unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3176         unsigned long addr;
3177
3178         if (likely(*va)) {
3179                 list_for_each_entry_from_reverse((*va),
3180                                 &free_vmap_area_list, list) {
3181                         addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
3182                         if ((*va)->va_start < addr)
3183                                 return addr;
3184                 }
3185         }
3186
3187         return 0;
3188 }
3189
3190 /**
3191  * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
3192  * @offsets: array containing offset of each area
3193  * @sizes: array containing size of each area
3194  * @nr_vms: the number of areas to allocate
3195  * @align: alignment, all entries in @offsets and @sizes must be aligned to this
3196  *
3197  * Returns: kmalloc'd vm_struct pointer array pointing to allocated
3198  *          vm_structs on success, %NULL on failure
3199  *
3200  * Percpu allocator wants to use congruent vm areas so that it can
3201  * maintain the offsets among percpu areas.  This function allocates
3202  * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
3203  * be scattered pretty far, distance between two areas easily going up
3204  * to gigabytes.  To avoid interacting with regular vmallocs, these
3205  * areas are allocated from top.
3206  *
3207  * Despite its complicated look, this allocator is rather simple. It
3208  * does everything top-down and scans free blocks from the end looking
3209  * for matching base. While scanning, if any of the areas do not fit the
3210  * base address is pulled down to fit the area. Scanning is repeated till
3211  * all the areas fit and then all necessary data structures are inserted
3212  * and the result is returned.
3213  */
3214 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
3215                                      const size_t *sizes, int nr_vms,
3216                                      size_t align)
3217 {
             /*
              * Outline of the function body:
              *  1. validate @offsets/@sizes (BUG_ON on misalignment or
              *     overlap) and find the area with the highest offset;
              *  2. allocate the result array plus one vmap_area and one
              *     vm_struct per area;
              *  3. under free_vmap_area_lock, search from the top for a
              *     base such that every [base + offsets[i],
              *     base + offsets[i] + sizes[i]) lies inside a free block,
              *     then take those ranges out of the free tree;
              *  4. populate the KASAN shadow and, under vmap_area_lock,
              *     insert the areas into the busy tree/list.
              * If no base is found, lazily freed areas are purged once and
              * the search is retried; a second failure returns NULL.
              */
3218         const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
3219         const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
3220         struct vmap_area **vas, *va;
3221         struct vm_struct **vms;
             /*
              * last_area: index of the area with the highest offset.
              * term_area: area at which the current validation round
              * started; the search is over when the walk returns to it.
              */
3222         int area, area2, last_area, term_area;
3223         unsigned long base, start, size, end, last_end, orig_start, orig_end;
             /* Set once purge_vmap_area_lazy() has been tried. */
3224         bool purged = false;
3225         enum fit_type type;
3226
3227         /* verify parameters and allocate data structures */
3228         BUG_ON(offset_in_page(align) || !is_power_of_2(align));
3229         for (last_area = 0, area = 0; area < nr_vms; area++) {
3230                 start = offsets[area];
3231                 end = start + sizes[area];
3232
3233                 /* is everything aligned properly? */
3234                 BUG_ON(!IS_ALIGNED(offsets[area], align));
3235                 BUG_ON(!IS_ALIGNED(sizes[area], align));
3236
3237                 /* detect the area with the highest address */
3238                 if (start > offsets[last_area])
3239                         last_area = area;
3240
                     /* no area may overlap any area that follows it */
3241                 for (area2 = area + 1; area2 < nr_vms; area2++) {
3242                         unsigned long start2 = offsets[area2];
3243                         unsigned long end2 = start2 + sizes[area2];
3244
3245                         BUG_ON(start2 < end && start < end2);
3246                 }
3247         }
             /* End offset of the highest area. */
3248         last_end = offsets[last_area] + sizes[last_area];
3249
             /* The aligned vmalloc range is smaller than that end offset. */
3250         if (vmalloc_end - vmalloc_start < last_end) {
3251                 WARN_ON(true);
3252                 return NULL;
3253         }
3254
             /*
              * Both arrays are zeroed, so the error paths can tell
              * populated slots from empty ones.
              */
3255         vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
3256         vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
3257         if (!vas || !vms)
3258                 goto err_free2;
3259
             /*
              * One vmap_area and one vm_struct per area, allocated with
              * GFP_KERNEL before free_vmap_area_lock is taken.
              */
3260         for (area = 0; area < nr_vms; area++) {
3261                 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
3262                 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
3263                 if (!vas[area] || !vms[area])
3264                         goto err_free;
3265         }
3266 retry:
3267         spin_lock(&free_vmap_area_lock);
3268
3269         /* start scanning - we scan from the top, begin with the last area */
3270         area = term_area = last_area;
3271         start = offsets[area];
3272         end = start + sizes[area];
3273
             /*
              * First candidate: place the end of the highest area at the
              * aligned end of the topmost usable free block.
              */
3274         va = pvm_find_va_enclose_addr(vmalloc_end);
3275         base = pvm_determine_end_from_reverse(&va, align) - end;
3276
3277         while (true) {
3278                 /*
3279                  * base might have underflowed, add last_end before
3280                  * comparing.
3281                  */
3282                 if (base + last_end < vmalloc_start + last_end)
3283                         goto overflow;
3284
3285                 /*
3286                  * Fitting base has not been found.
3287                  */
3288                 if (va == NULL)
3289                         goto overflow;
3290
3291                 /*
3292                  * If required width exceeds current VA block, move
3293                  * base downwards and then recheck.
3294                  */
3295                 if (base + end > va->va_end) {
3296                         base = pvm_determine_end_from_reverse(&va, align) - end;
                             /* base changed: restart the round at this area */
3297                         term_area = area;
3298                         continue;
3299                 }
3300
3301                 /*
3302                  * If this VA does not fit, move base downwards and recheck.
3303                  */
3304                 if (base + start < va->va_start) {
                             /* try the free block that precedes this one */
3305                         va = node_to_va(rb_prev(&va->rb_node));
3306                         base = pvm_determine_end_from_reverse(&va, align) - end;
                             /* base changed: restart the round at this area */
3307                         term_area = area;
3308                         continue;
3309                 }
3310
3311                 /*
3312                  * This area fits, move on to the previous one.  If
3313                  * the previous one is the terminal one, we're done.
3314                  */
                     /* previous index, wrapping from 0 to nr_vms - 1 */
3315                 area = (area + nr_vms - 1) % nr_vms;
3316                 if (area == term_area)
3317                         break;
3318
3319                 start = offsets[area];
3320                 end = start + sizes[area];
3321                 va = pvm_find_va_enclose_addr(base + end);
3322         }
3323
3324         /* we've found a fitting base, insert all va's */
3325         for (area = 0; area < nr_vms; area++) {
3326                 int ret;
3327
3328                 start = base + offsets[area];
3329                 size = sizes[area];
3330
3331                 va = pvm_find_va_enclose_addr(start);
3332                 if (WARN_ON_ONCE(va == NULL))
3333                         /* It is a BUG(), but trigger recovery instead. */
3334                         goto recovery;
3335
3336                 type = classify_va_fit_type(va, start, size);
3337                 if (WARN_ON_ONCE(type == NOTHING_FIT))
3338                         /* It is a BUG(), but trigger recovery instead. */
3339                         goto recovery;
3340
                     /*
                      * NOTE(review): presumably this takes [start, start + size)
                      * out of the free block @va - confirm against
                      * adjust_va_to_fit_type(). A non-zero return is a failure.
                      */
3341                 ret = adjust_va_to_fit_type(va, start, size, type);
3342                 if (unlikely(ret))
3343                         goto recovery;
3344
3345                 /* Allocated area. */
3346                 va = vas[area];
3347                 va->va_start = start;
3348                 va->va_end = start + size;
3349         }
3350
3351         spin_unlock(&free_vmap_area_lock);
3352
3353         /* populate the kasan shadow space */
3354         for (area = 0; area < nr_vms; area++) {
3355                 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
3356                         goto err_free_shadow;
3357
3358                 kasan_unpoison_vmalloc((void *)vas[area]->va_start,
3359                                        sizes[area]);
3360         }
3361
3362         /* insert all vm's */
3363         spin_lock(&vmap_area_lock);
3364         for (area = 0; area < nr_vms; area++) {
3365                 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
3366
                     /* this function itself is recorded as the caller */
3367                 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
3368                                  pcpu_get_vm_areas);
3369         }
3370         spin_unlock(&vmap_area_lock);
3371
             /*
              * Only the pointer array is freed here; the vmap_areas were
              * inserted into the busy tree above.
              */
3372         kfree(vas);
3373         return vms;
3374
3375 recovery:
3376         /*
3377          * Remove previously allocated areas. There is no
3378          * need in removing these areas from the busy tree,
3379          * because they are inserted only on the final step
3380          * and when pcpu_get_vm_areas() is success.
3381          */
             /*
              * @area is the index that failed; walk back over the ones
              * already set up. Each vas[] slot handed to
              * merge_or_add_vmap_area() is cleared so that it is neither
              * reused nor freed later.
              */
3382         while (area--) {
3383                 orig_start = vas[area]->va_start;
3384                 orig_end = vas[area]->va_end;
3385                 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
3386                                             &free_vmap_area_list);
3387                 kasan_release_vmalloc(orig_start, orig_end,
3388                                       va->va_start, va->va_end);
3389                 vas[area] = NULL;
3390         }
3391
3392 overflow:
3393         spin_unlock(&free_vmap_area_lock);
             /*
              * Purge lazily freed areas once and retry; a second failure
              * falls through to err_free.
              */
3394         if (!purged) {
3395                 purge_vmap_area_lazy();
3396                 purged = true;
3397
3398                 /* Before "retry", check if we recover. */
                     /* re-allocate the vas[] slots that recovery cleared */
3399                 for (area = 0; area < nr_vms; area++) {
3400                         if (vas[area])
3401                                 continue;
3402
3403                         vas[area] = kmem_cache_zalloc(
3404                                 vmap_area_cachep, GFP_KERNEL);
3405                         if (!vas[area])
3406                                 goto err_free;
3407                 }
3408
3409                 goto retry;
3410         }
3411
3412 err_free:
             /*
              * vas[area] may be NULL here (allocation failure or recovery);
              * vms[area] may be NULL too, which kfree() is handed as-is.
              */
3413         for (area = 0; area < nr_vms; area++) {
3414                 if (vas[area])
3415                         kmem_cache_free(vmap_area_cachep, vas[area]);
3416
3417                 kfree(vms[area]);
3418         }
3419 err_free2:
3420         kfree(vas);
3421         kfree(vms);
3422         return NULL;
3423
3424 err_free_shadow:
             /*
              * Reached with free_vmap_area_lock dropped and every vas[]
              * entry already taken out of the free tree: give all of them
              * back.
              */
3425         spin_lock(&free_vmap_area_lock);
3426         /*
3427          * We release all the vmalloc shadows, even the ones for regions that
3428          * hadn't been successfully added. This relies on kasan_release_vmalloc
3429          * being able to tolerate this case.
3430          */
3431         for (area = 0; area < nr_vms; area++) {
3432                 orig_start = vas[area]->va_start;
3433                 orig_end = vas[area]->va_end;
3434                 va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
3435                                             &free_vmap_area_list);
3436                 kasan_release_vmalloc(orig_start, orig_end,
3437                                       va->va_start, va->va_end);
3438                 vas[area] = NULL;
3439                 kfree(vms[area]);
3440         }
3441         spin_unlock(&free_vmap_area_lock);
3442         kfree(vas);
3443         kfree(vms);
3444         return NULL;
3445 }
3446
/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Hand each of the @nr_vms entries of @vms to free_vm_area(), then
 * kfree() the @vms pointer array itself.
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
        int idx = 0;

        /* Release the individual areas first ... */
        while (idx < nr_vms) {
                free_vm_area(vms[idx]);
                idx++;
        }

        /* ... and only then the array that referenced them. */
        kfree(vms);
}
3462 #endif  /* CONFIG_SMP */
3463
3464 #ifdef CONFIG_PROC_FS
3465 static void *s_start(struct seq_file *m, loff_t *pos)
3466         __acquires(&vmap_purge_lock)
3467         __acquires(&vmap_area_lock)
3468 {
3469         mutex_lock(&vmap_purge_lock);
3470         spin_lock(&vmap_area_lock);
3471
3472         return seq_list_start(&vmap_area_list, *pos);
3473 }
3474
3475 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3476 {
3477         return seq_list_next(p, &vmap_area_list, pos);
3478 }
3479
3480 static void s_stop(struct seq_file *m, void *p)
3481         __releases(&vmap_purge_lock)
3482         __releases(&vmap_area_lock)
3483 {
3484         mutex_unlock(&vmap_purge_lock);
3485         spin_unlock(&vmap_area_lock);
3486 }
3487
3488 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
3489 {
3490         if (IS_ENABLED(CONFIG_NUMA)) {
3491                 unsigned int nr, *counters = m->private;
3492
3493                 if (!counters)
3494                         return;
3495
3496                 if (v->flags & VM_UNINITIALIZED)
3497                         return;
3498                 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
3499                 smp_rmb();
3500
3501                 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
3502
3503                 for (nr = 0; nr < v->nr_pages; nr++)
3504                         counters[page_to_nid(v->pages[nr])]++;
3505
3506                 for_each_node_state(nr, N_HIGH_MEMORY)
3507                         if (counters[nr])
3508                                 seq_printf(m, " N%u=%u", nr, counters[nr]);
3509         }
3510 }
3511
3512 static void show_purge_info(struct seq_file *m)
3513 {
3514         struct llist_node *head;
3515         struct vmap_area *va;
3516
3517         head = READ_ONCE(vmap_purge_list.first);
3518         if (head == NULL)
3519                 return;
3520
3521         llist_for_each_entry(va, head, purge_list) {
3522                 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
3523                         (void *)va->va_start, (void *)va->va_end,
3524                         va->va_end - va->va_start);
3525         }
3526 }
3527
3528 static int s_show(struct seq_file *m, void *p)
3529 {
3530         struct vmap_area *va;
3531         struct vm_struct *v;
3532
3533         va = list_entry(p, struct vmap_area, list);
3534
3535         /*
3536          * s_show can encounter race with remove_vm_area, !vm on behalf
3537          * of vmap area is being tear down or vm_map_ram allocation.
3538          */
3539         if (!va->vm) {
3540                 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
3541                         (void *)va->va_start, (void *)va->va_end,
3542                         va->va_end - va->va_start);
3543
3544                 return 0;
3545         }
3546
3547         v = va->vm;
3548
3549         seq_printf(m, "0x%pK-0x%pK %7ld",
3550                 v->addr, v->addr + v->size, v->size);
3551
3552         if (v->caller)
3553                 seq_printf(m, " %pS", v->caller);
3554
3555         if (v->nr_pages)
3556                 seq_printf(m, " pages=%d", v->nr_pages);
3557
3558         if (v->phys_addr)
3559                 seq_printf(m, " phys=%pa", &v->phys_addr);
3560
3561         if (v->flags & VM_IOREMAP)
3562                 seq_puts(m, " ioremap");
3563
3564         if (v->flags & VM_ALLOC)
3565                 seq_puts(m, " vmalloc");
3566
3567         if (v->flags & VM_MAP)
3568                 seq_puts(m, " vmap");
3569
3570         if (v->flags & VM_USERMAP)
3571                 seq_puts(m, " user");
3572
3573         if (v->flags & VM_DMA_COHERENT)
3574                 seq_puts(m, " dma-coherent");
3575
3576         if (is_vmalloc_addr(v->pages))
3577                 seq_puts(m, " vpages");
3578
3579         show_numa_info(m, v);
3580         seq_putc(m, '\n');
3581
3582         /*
3583          * As a final step, dump "unpurged" areas. Note,
3584          * that entire "/proc/vmallocinfo" output will not
3585          * be address sorted, because the purge list is not
3586          * sorted.
3587          */
3588         if (list_is_last(&va->list, &vmap_area_list))
3589                 show_purge_info(m);
3590
3591         return 0;
3592 }
3593
3594 static const struct seq_operations vmalloc_op = {
3595         .start = s_start,
3596         .next = s_next,
3597         .stop = s_stop,
3598         .show = s_show,
3599 };
3600
3601 static int __init proc_vmalloc_init(void)
3602 {
3603         if (IS_ENABLED(CONFIG_NUMA))
3604                 proc_create_seq_private("vmallocinfo", 0400, NULL,
3605                                 &vmalloc_op,
3606                                 nr_node_ids * sizeof(unsigned int), NULL);
3607         else
3608                 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
3609         return 0;
3610 }
3611 module_init(proc_vmalloc_init);
3612
3613 #endif