X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=mm%2Fvmalloc.c;h=9c539f0730a510738cebabb878e6ab4ca5083241;hb=aef511fb91b6efb2d355c2704cf979f3202d310a;hp=d5f2a84e488ad8e066815607f79bbd077d317bc0;hpb=d0cc7ecacba8a5b6bbdd5aa6ba3d1bc2fe59b580;p=linux-2.6-microblaze.git diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d5f2a84e488a..9c539f0730a5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -34,7 +34,7 @@ #include #include #include - +#include #include #include #include @@ -42,6 +42,19 @@ #include "internal.h" #include "pgalloc-track.h" +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +static bool __ro_after_init vmap_allow_huge = true; + +static int __init set_nohugevmalloc(char *str) +{ + vmap_allow_huge = false; + return 0; +} +early_param("nohugevmalloc", set_nohugevmalloc); +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +static const bool vmap_allow_huge = false; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ + bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)x; @@ -68,6 +81,218 @@ static void free_work(struct work_struct *w) } /*** Page table manipulation functions ***/ +static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) +{ + pte_t *pte; + u64 pfn; + + pfn = phys_addr >> PAGE_SHIFT; + pte = pte_alloc_kernel_track(pmd, addr, mask); + if (!pte) + return -ENOMEM; + do { + BUG_ON(!pte_none(*pte)); + set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; + return 0; +} + +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PMD_SHIFT) + return 0; + + if (!arch_vmap_pmd_supported(prot)) + return 0; + + if ((end - addr) != PMD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PMD_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, PMD_SIZE)) + return 0; + + if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) + return 0; + + return pmd_set_huge(pmd, phys_addr, prot); +} + +static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + + if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_PMD_MODIFIED; + continue; + } + + if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask)) + return -ENOMEM; + } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < PUD_SHIFT) + return 0; + + if (!arch_vmap_pud_supported(prot)) + return 0; + + if ((end - addr) != PUD_SIZE) + return 0; + + if (!IS_ALIGNED(addr, PUD_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, PUD_SIZE)) + return 0; + + if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) + return 0; + + return pud_set_huge(pud, phys_addr, prot); +} + +static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc_track(&init_mm, p4d, addr, mask); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + + if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_PUD_MODIFIED; + continue; + } + + if (vmap_pmd_range(pud, addr, next, phys_addr, prot, + max_page_shift, mask)) + return -ENOMEM; + } while (pud++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + if (max_page_shift < P4D_SHIFT) + return 0; + + if (!arch_vmap_p4d_supported(prot)) + return 0; + + if ((end - addr) != P4D_SIZE) + return 0; + + if (!IS_ALIGNED(addr, P4D_SIZE)) + return 0; + + if (!IS_ALIGNED(phys_addr, P4D_SIZE)) + return 0; + + if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) + return 0; + + return p4d_set_huge(p4d, phys_addr, prot); +} + +static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift, pgtbl_mod_mask *mask) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + + if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, + max_page_shift)) { + *mask |= PGTBL_P4D_MODIFIED; + continue; + } + + if (vmap_pud_range(p4d, addr, next, phys_addr, prot, + max_page_shift, mask)) + return -ENOMEM; + } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); + return 0; +} + +static int vmap_range_noflush(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pgd_t *pgd; + unsigned long start; + unsigned long next; + int err; + pgtbl_mod_mask mask = 0; + + might_sleep(); + BUG_ON(addr >= end); + + start = addr; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, + max_page_shift, &mask); + if (err) + break; + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return err; +} + +int vmap_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + int err; + + err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift); + flush_cache_vmap(addr, end); + + return err; +} static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgtbl_mod_mask *mask) @@ -153,22 +378,20 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, } while (p4d++, addr = next, addr != end); } -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @start: start of the VM area to unmap - * @size: size of the VM area to unmap +/* + * vunmap_range_noflush is similar to vunmap_range, but does not + * flush caches or TLBs. * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify - * should have been allocated using get_vm_area() and its friends. + * The caller is responsible for calling flush_cache_vmap() before calling + * this function, and flush_tlb_kernel_range after it has returned + * successfully (and before the addresses are expected to cause a page fault + * or be re-mapped for something else, if TLB flushes are being delayed or + * coalesced). * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible - * for calling flush_cache_vunmap() on to-be-mapped areas before calling this - * function and flush_tlb_kernel_range() after. + * This is an internal function only. Do not use outside mm/. */ -void unmap_kernel_range_noflush(unsigned long start, unsigned long size) +void vunmap_range_noflush(unsigned long start, unsigned long end) { - unsigned long end = start + size; unsigned long next; pgd_t *pgd; unsigned long addr = start; @@ -189,7 +412,23 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size) arch_sync_kernel_mappings(start, end); } -static int vmap_pte_range(pmd_t *pmd, unsigned long addr, +/** + * vunmap_range - unmap kernel virtual addresses + * @addr: start of the VM area to unmap + * @end: end of the VM area to unmap (non-inclusive) + * + * Clears any present PTEs in the virtual address range, flushes TLBs and + * caches. Any subsequent access to the address before it has been re-mapped + * is a kernel bug. + */ +void vunmap_range(unsigned long addr, unsigned long end) +{ + flush_cache_vunmap(addr, end); + vunmap_range_noflush(addr, end); + flush_tlb_kernel_range(addr, end); +} + +static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -217,7 +456,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -static int vmap_pmd_range(pud_t *pud, unsigned long addr, +static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -229,13 +468,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } -static int vmap_pud_range(p4d_t *p4d, unsigned long addr, +static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -247,13 +486,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } -static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, +static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) { @@ -265,37 +504,18 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) + if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should - * have been allocated using get_vm_area() and its friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible for - * calling flush_cache_vmap() on to-be-mapped areas before calling this - * function. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { unsigned long start = addr; - unsigned long end = addr + size; - unsigned long next; pgd_t *pgd; + unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; @@ -306,7 +526,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); @@ -317,14 +537,61 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; } -int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, - struct page **pages) +/* + * vmap_pages_range_noflush is similar to vmap_pages_range, but does not + * flush caches. + * + * The caller is responsible for calling flush_cache_vmap() after this + * function returns successfully and before the addresses are accessed. + * + * This is an internal function only. Do not use outside mm/. + */ +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) { - int ret; + unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + + WARN_ON(page_shift < PAGE_SHIFT); + + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || + page_shift == PAGE_SHIFT) + return vmap_small_pages_range_noflush(addr, end, prot, pages); + + for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +/** + * vmap_pages_range - map pages to a kernel virtual address + * @addr: start of the VM area to map + * @end: end of the VM area to map (non-inclusive) + * @prot: page protection flags to use + * @pages: pages to map (always PAGE_SIZE pages) + * @page_shift: maximum shift that the pages may be mapped with, @pages must + * be aligned and contiguous up to at least this shift. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int err; - ret = map_kernel_range_noflush(start, size, prot, pages); - flush_cache_vmap(start, start + size); - return ret; + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; } int is_vmalloc_or_module_addr(const void *x) @@ -343,7 +610,9 @@ int is_vmalloc_or_module_addr(const void *x) } /* - * Walk a vmap address to the struct page it maps. + * Walk a vmap address to the struct page it maps. Huge vmap mappings will + * return the tail page that corresponds to the base page address, which + * matches small vmap mappings. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { @@ -363,25 +632,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pgd_none(*pgd)) return NULL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return NULL; /* XXX: no allowance for huge pgd */ + if (WARN_ON_ONCE(pgd_bad(*pgd))) + return NULL; + p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) return NULL; - pud = pud_offset(p4d, addr); + if (p4d_leaf(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL; - /* - * Don't dereference bad PUD or PMD (below) entries. This will also - * identify huge mappings, which we may encounter on architectures - * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be - * identified as vmalloc addresses by is_vmalloc_addr(), but are - * not [unambiguously] associated with a struct page, so there is - * no correct value to return for them. - */ - WARN_ON_ONCE(pud_bad(*pud)); - if (pud_none(*pud) || pud_bad(*pud)) + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; + if (pud_leaf(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pud_bad(*pud))) return NULL; + pmd = pmd_offset(pud, addr); - WARN_ON_ONCE(pmd_bad(*pmd)); - if (pmd_none(*pmd) || pmd_bad(*pmd)) + if (pmd_none(*pmd)) + return NULL; + if (pmd_leaf(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pmd_bad(*pmd))) return NULL; ptep = pte_offset_map(pmd, addr); @@ -389,6 +666,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) if (pte_present(pte)) page = pte_page(pte); pte_unmap(ptep); + return page; } EXPORT_SYMBOL(vmalloc_to_page); @@ -1152,6 +1430,29 @@ static void free_vmap_area(struct vmap_area *va) spin_unlock(&free_vmap_area_lock); } +static inline void +preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) +{ + struct vmap_area *va = NULL; + + /* + * Preload this CPU with one extra vmap_area object. It is used + * when fit type of free area is NE_FIT_TYPE. It guarantees that + * a CPU that does an allocation is preloaded. + * + * We do it in non-atomic context, thus it allows us to use more + * permissive allocation masks to be more stable under low memory + * condition and high memory pressure. + */ + if (!this_cpu_read(ne_fit_preload_node)) + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + + spin_lock(lock); + + if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) + kmem_cache_free(vmap_area_cachep, va); +} + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. @@ -1161,7 +1462,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long vstart, unsigned long vend, int node, gfp_t gfp_mask) { - struct vmap_area *va, *pva; + struct vmap_area *va; unsigned long addr; int purged = 0; int ret; @@ -1187,43 +1488,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: - /* - * Preload this CPU with one extra vmap_area object. It is used - * when fit type of free area is NE_FIT_TYPE. Please note, it - * does not guarantee that an allocation occurs on a CPU that - * is preloaded, instead we minimize the case when it is not. - * It can happen because of cpu migration, because there is a - * race until the below spinlock is taken. - * - * The preload is done in non-atomic context, thus it allows us - * to use more permissive allocation masks to be more stable under - * low memory condition and high memory pressure. In rare case, - * if not preloaded, GFP_NOWAIT is used. - * - * Set "pva" to NULL here, because of "retry" path. - */ - pva = NULL; - - if (!this_cpu_read(ne_fit_preload_node)) - /* - * Even if it fails we do not really care about that. - * Just proceed as it is. If needed "overflow" path - * will refill the cache we allocate from. - */ - pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - - spin_lock(&free_vmap_area_lock); - - if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) - kmem_cache_free(vmap_area_cachep, pva); + preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); + addr = __alloc_vmap_area(size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ - addr = __alloc_vmap_area(size, align, vstart, vend); - spin_unlock(&free_vmap_area_lock); - if (unlikely(addr == vend)) goto overflow; @@ -1231,7 +1503,6 @@ retry: va->va_end = addr + size; va->vm = NULL; - spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); @@ -1448,7 +1719,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); + vunmap_range_noflush(va->va_start, va->va_end); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -1726,7 +1997,7 @@ static void vb_free(unsigned long addr, unsigned long size) offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); - unmap_kernel_range_noflush(addr, size); + vunmap_range_noflush(addr, addr + size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); @@ -1762,7 +2033,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { spin_lock(&vb->lock); - if (vb->dirty) { + if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) { unsigned long va_start = vb->va->va_start; unsigned long s, e; @@ -1879,16 +2150,36 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) kasan_unpoison_vmalloc(mem, size); - if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { + if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, + pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + return mem; } EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; +static inline unsigned int vm_area_page_order(struct vm_struct *vm) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + return vm->page_order; +#else + return 0; +#endif +} + +static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + vm->page_order = order; +#else + BUG_ON(order != 0); +#endif +} + /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -2023,23 +2314,6 @@ void __init vmalloc_init(void) vmap_initialized = true; } -/** - * unmap_kernel_range - unmap kernel VM area and flush cache and TLB - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Similar to unmap_kernel_range_noflush() but flushes vcache before - * the unmapping and tlb after. - */ -void unmap_kernel_range(unsigned long addr, unsigned long size) -{ - unsigned long end = addr + size; - - flush_cache_vunmap(addr, end); - unmap_kernel_range_noflush(addr, size); - flush_tlb_kernel_range(addr, end); -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2199,6 +2473,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, { int i; + /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); @@ -2208,6 +2483,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { unsigned long start = ULONG_MAX, end = 0; + unsigned int page_order = vm_area_page_order(area); int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; @@ -2232,11 +2508,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) * map. Find the start and end range of the direct mappings to make sure * the vm_unmap_aliases() flush includes the direct map. */ - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { + unsigned long page_size; + + page_size = PAGE_SIZE << page_order; start = min(addr, start); - end = max(addr + PAGE_SIZE, end); + end = max(addr + page_size, end); flush_dmap = 1; } } @@ -2277,13 +2556,14 @@ static void __vunmap(const void *addr, int deallocate_pages) vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { + unsigned int page_order = vm_area_page_order(area); int i; - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page = area->pages[i]; BUG_ON(!page); - __free_pages(page, 0); + __free_pages(page, page_order); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2402,6 +2682,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; + unsigned long addr; unsigned long size; /* In bytes */ might_sleep(); @@ -2414,8 +2695,9 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), - pages) < 0) { + addr = (unsigned long)area->addr; + if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), + pages, PAGE_SHIFT) < 0) { vunmap(area->addr); return NULL; } @@ -2474,15 +2756,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, unsigned int page_shift, + int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)area->addr; + unsigned long size = get_vm_area_size(area); unsigned long array_size; - unsigned int i; + unsigned int nr_small_pages = size >> PAGE_SHIFT; + unsigned int page_order; struct page **pages; + unsigned int i; - array_size = (unsigned long)nr_pages * sizeof(struct page *); + array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -2497,42 +2783,60 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (!pages) { free_vm_area(area); + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "page array size %lu allocation failed", + nr_small_pages * PAGE_SIZE, array_size); return NULL; } area->pages = pages; - area->nr_pages = nr_pages; + area->nr_pages = nr_small_pages; + set_vm_area_page_order(area, page_shift - PAGE_SHIFT); - for (i = 0; i < area->nr_pages; i++) { - struct page *page; + page_order = vm_area_page_order(area); - if (node == NUMA_NO_NODE) - page = alloc_page(gfp_mask); - else - page = alloc_pages_node(node, gfp_mask, 0); + /* + * Careful, we allocate and map page_order pages, but tracking is done + * per PAGE_SIZE page so as to keep the vm_struct APIs independent of + * the physical/mapped size. + */ + for (i = 0; i < area->nr_pages; i += 1U << page_order) { + struct page *page; + int p; + /* Compound pages required for remap_vmalloc_page */ + page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; atomic_long_add(area->nr_pages, &nr_vmalloc_pages); + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "page order %u allocation failed", + area->nr_pages * PAGE_SIZE, page_order); goto fail; } - area->pages[i] = page; + + for (p = 0; p < (1U << page_order); p++) + area->pages[i + p] = page + p; + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), - prot, pages) < 0) + if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "failed to map pages", + area->nr_pages * PAGE_SIZE); goto fail; + } return area->addr; fail: - warn_alloc(gfp_mask, NULL, - "vmalloc: allocation failure, allocated %ld of %ld bytes", - (area->nr_pages*PAGE_SIZE), area->size); __vfree(area->addr); return NULL; } @@ -2563,19 +2867,54 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, struct vm_struct *area; void *addr; unsigned long real_size = size; + unsigned long real_align = align; + unsigned int shift = PAGE_SHIFT; - size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > totalram_pages()) - goto fail; + if (WARN_ON_ONCE(!size)) + return NULL; + + if ((size >> PAGE_SHIFT) > totalram_pages()) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "exceeds total pages", real_size); + return NULL; + } + + if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) && + arch_vmap_pmd_supported(prot)) { + unsigned long size_per_node; - area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | + /* + * Try huge pages. Only try for PAGE_KERNEL allocations, + * others like modules don't yet expect huge pages in + * their allocations due to apply_to_page_range not + * supporting them. + */ + + size_per_node = size; + if (node == NUMA_NO_NODE) + size_per_node /= num_online_nodes(); + if (size_per_node >= PMD_SIZE) { + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); + } + } + +again: + size = PAGE_ALIGN(size); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); - if (!area) + if (!area) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "vm_struct allocation failed", real_size); goto fail; + } - addr = __vmalloc_area_node(area, gfp_mask, prot, node); + addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) - return NULL; + goto fail; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -2589,8 +2928,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, NULL, - "vmalloc: allocation failure: %lu bytes", real_size); + if (shift > PAGE_SHIFT) { + shift = PAGE_SHIFT; + align = real_align; + size = real_size; + goto again; + } + return NULL; } @@ -2739,7 +3083,7 @@ EXPORT_SYMBOL(vzalloc_node); * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ -#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL +#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** @@ -2894,7 +3238,10 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; spin_lock(&vmap_area_lock); - list_for_each_entry(va, &vmap_area_list, list) { + va = __find_vmap_area((unsigned long)addr); + if (!va) + goto finished; + list_for_each_entry_from(va, &vmap_area_list, list) { if (!count) break; @@ -3072,7 +3419,6 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, return 0; } -EXPORT_SYMBOL(remap_vmalloc_range_partial); /** * remap_vmalloc_range - map vmalloc pages to userspace