Merge branch 'work.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
[linux-2.6-microblaze.git] / mm/vmalloc.c
index 5a2b55c..b482d24 100644
@@ -7,6 +7,7 @@
  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
  *  Numa awareness, Christoph Lameter, SGI, June 2005
+ *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
  */
 
 #include <linux/vmalloc.h>
@@ -25,7 +26,7 @@
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/rbtree.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
 #include <linux/rcupdate.h>
 #include <linux/pfn.h>
 #include <linux/kmemleak.h>
@@ -41,6 +42,7 @@
 #include <asm/shmparam.h>
 
 #include "internal.h"
+#include "pgalloc-track.h"
 
 bool is_vmalloc_addr(const void *x)
 {
@@ -173,7 +175,6 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
        pgtbl_mod_mask mask = 0;
 
        BUG_ON(addr >= end);
-       start = addr;
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
@@ -511,6 +512,10 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
 /*
  * This function returns back addresses of parent node
  * and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case any further
+ * steps regarding the insertion of a conflicting, overlapping
+ * range have to be declined and treated as a bug.
  */
 static __always_inline struct rb_node **
 find_va_links(struct vmap_area *va,
@@ -549,8 +554,12 @@ find_va_links(struct vmap_area *va,
                else if (va->va_end > tmp_va->va_start &&
                                va->va_start >= tmp_va->va_end)
                        link = &(*link)->rb_right;
-               else
-                       BUG();
+               else {
+                       WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+                               va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+                       return NULL;
+               }
        } while (*link);
 
        *parent = &tmp_va->rb_node;
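
For readers new to the rb-tree "link pointer" idiom that find_va_links() is built around, the following is a minimal standalone sketch of the same descend-or-refuse pattern. The struct range node type and the descend() helper are hypothetical, simplified for illustration, and not code from this file:

	#include <linux/rbtree.h>

	/* Hypothetical, simplified node type for illustration only. */
	struct range {
		unsigned long start, end;	/* [start, end) */
		struct rb_node rb;
	};

	static struct rb_node **descend(struct range *new, struct rb_root *root,
					struct rb_node **parent)
	{
		struct rb_node **link = &root->rb_node;
		struct range *tmp = NULL;

		while (*link) {
			tmp = rb_entry(*link, struct range, rb);
			if (new->end <= tmp->start)
				link = &(*link)->rb_left;	/* entirely below */
			else if (new->start >= tmp->end)
				link = &(*link)->rb_right;	/* entirely above */
			else
				return NULL;	/* overlap: refuse to insert */
		}

		*parent = tmp ? &tmp->rb : NULL;
		return link;		/* where rb_link_node() hooks in */
	}

The returned link is the address of the NULL child pointer that rb_link_node() will later patch, which is why a NULL return can unambiguously signal the overlap case.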
@@ -632,43 +641,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
 
 #if DEBUG_AUGMENT_PROPAGATE_CHECK
 static void
-augment_tree_propagate_check(struct rb_node *n)
+augment_tree_propagate_check(void)
 {
        struct vmap_area *va;
-       struct rb_node *node;
-       unsigned long size;
-       bool found = false;
-
-       if (n == NULL)
-               return;
+       unsigned long computed_size;
 
-       va = rb_entry(n, struct vmap_area, rb_node);
-       size = va->subtree_max_size;
-       node = n;
-
-       while (node) {
-               va = rb_entry(node, struct vmap_area, rb_node);
-
-               if (get_subtree_max_size(node->rb_left) == size) {
-                       node = node->rb_left;
-               } else {
-                       if (va_size(va) == size) {
-                               found = true;
-                               break;
-                       }
-
-                       node = node->rb_right;
-               }
-       }
-
-       if (!found) {
-               va = rb_entry(n, struct vmap_area, rb_node);
-               pr_emerg("tree is corrupted: %lu, %lu\n",
-                       va_size(va), va->subtree_max_size);
+       list_for_each_entry(va, &free_vmap_area_list, list) {
+               computed_size = compute_subtree_max_size(va);
+               if (computed_size != va->subtree_max_size)
+                       pr_emerg("tree is corrupted: %lu, %lu\n",
+                               va_size(va), va->subtree_max_size);
        }
-
-       augment_tree_propagate_check(n->rb_left);
-       augment_tree_propagate_check(n->rb_right);
 }
 #endif
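
The rewritten check recomputes each node's augmented value from scratch via compute_subtree_max_size(). For reference, the helpers it relies on are defined earlier in this file roughly as below (paraphrased, not part of this hunk); va_size() is simply va_end - va_start:

	static __always_inline unsigned long
	get_subtree_max_size(struct rb_node *node)
	{
		struct vmap_area *va;

		va = rb_entry_safe(node, struct vmap_area, rb_node);
		return va ? va->subtree_max_size : 0;
	}

	/*
	 * The invariant the debug loop above verifies for every free area:
	 * a node's cached subtree_max_size equals the largest va_size() in
	 * its subtree, i.e. the max of its own size and both children's.
	 */
	static __always_inline unsigned long
	compute_subtree_max_size(struct vmap_area *va)
	{
		return max3(va_size(va),
			get_subtree_max_size(va->rb_node.rb_left),
			get_subtree_max_size(va->rb_node.rb_right));
	}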
 
@@ -702,28 +685,15 @@ augment_tree_propagate_check(struct rb_node *n)
 static __always_inline void
 augment_tree_propagate_from(struct vmap_area *va)
 {
-       struct rb_node *node = &va->rb_node;
-       unsigned long new_va_sub_max_size;
-
-       while (node) {
-               va = rb_entry(node, struct vmap_area, rb_node);
-               new_va_sub_max_size = compute_subtree_max_size(va);
-
-               /*
-                * If the newly calculated maximum available size of the
-                * subtree is equal to the current one, then it means that
-                * the tree is propagated correctly. So we have to stop at
-                * this point to save cycles.
-                */
-               if (va->subtree_max_size == new_va_sub_max_size)
-                       break;
-
-               va->subtree_max_size = new_va_sub_max_size;
-               node = rb_parent(&va->rb_node);
-       }
+       /*
+        * Populate the tree from the bottom towards the root until
+        * the calculated maximum available size of a checked node
+        * is equal to its current one.
+        */
+       free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
 
 #if DEBUG_AUGMENT_PROPAGATE_CHECK
-       augment_tree_propagate_check(free_vmap_area_root.rb_node);
+       augment_tree_propagate_check();
 #endif
 }
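
free_vmap_area_rb_augment_cb_propagate() is not open-coded: it is generated by the RB_DECLARE_CALLBACKS_MAX() template from <linux/rbtree_augmented.h>, which this file instantiates roughly as:

	RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
		struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

The generated _propagate(node, stop) callback walks rb_parent() links recomputing subtree_max_size and stops early once a node's cached value is already correct, which is exactly the hand-rolled loop deleted above. Passing NULL as the stop node lets it propagate all the way to the root.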
 
@@ -735,7 +705,8 @@ insert_vmap_area(struct vmap_area *va,
        struct rb_node *parent;
 
        link = find_va_links(va, root, NULL, &parent);
-       link_va(va, root, parent, link, head);
+       if (link)
+               link_va(va, root, parent, link, head);
 }
 
 static void
@@ -751,8 +722,10 @@ insert_vmap_area_augment(struct vmap_area *va,
        else
                link = find_va_links(va, root, NULL, &parent);
 
-       link_va(va, root, parent, link, head);
-       augment_tree_propagate_from(va);
+       if (link) {
+               link_va(va, root, parent, link, head);
+               augment_tree_propagate_from(va);
+       }
 }
 
 /*
@@ -760,6 +733,11 @@ insert_vmap_area_augment(struct vmap_area *va,
  * and next free blocks. If coalesce is not done a new
  * free area is inserted. If VA has been merged, it is
  * freed.
+ *
+ * Please note that it can return NULL in case of
+ * overlapping ranges, followed by a WARN() report.
+ * Despite this being buggy behaviour, the system can
+ * stay alive and keep going.
  */
 static __always_inline struct vmap_area *
 merge_or_add_vmap_area(struct vmap_area *va,
@@ -776,6 +754,8 @@ merge_or_add_vmap_area(struct vmap_area *va,
         * inserted, unless it is merged with its sibling/siblings.
         */
        link = find_va_links(va, root, NULL, &parent);
+       if (!link)
+               return NULL;
 
        /*
         * Get next node of VA to check if merging can be done.
@@ -796,9 +776,6 @@ merge_or_add_vmap_area(struct vmap_area *va,
                if (sibling->va_start == va->va_end) {
                        sibling->va_start = va->va_start;
 
-                       /* Check and update the tree if needed. */
-                       augment_tree_propagate_from(sibling);
-
                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);
 
@@ -818,14 +795,18 @@ merge_or_add_vmap_area(struct vmap_area *va,
        if (next->prev != head) {
                sibling = list_entry(next->prev, struct vmap_area, list);
                if (sibling->va_end == va->va_start) {
-                       sibling->va_end = va->va_end;
-
-                       /* Check and update the tree if needed. */
-                       augment_tree_propagate_from(sibling);
-
+                       /*
+                        * If both neighbors are coalesced, it is important
+                        * to unlink the "next" node first, followed by merging
+                        * with the "previous" one. Otherwise the tree might not
+                        * be fully populated if a sibling's augmented value is
+                        * "normalized" because of rotation operations.
+                        */
                        if (merged)
                                unlink_va(va, root);
 
+                       sibling->va_end = va->va_end;
+
                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);
 
@@ -836,11 +817,13 @@ merge_or_add_vmap_area(struct vmap_area *va,
        }
 
 insert:
-       if (!merged) {
+       if (!merged)
                link_va(va, root, parent, link, head);
-               augment_tree_propagate_from(va);
-       }
 
+       /*
+        * The last step is to check and update the tree.
+        */
+       augment_tree_propagate_from(va);
        return va;
 }
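
A concrete walk-through of the ordering constraint described in the comment above, using hypothetical addresses:

	/*
	 * Worked example (hypothetical addresses): the free tree holds
	 * prev = [4, 8) and next = [12, 16), and va = [8, 12) is freed.
	 *
	 * 1) next->va_start == va->va_end, so "next" absorbs va and
	 *    becomes [8, 16); va's object is freed, va = next and
	 *    "merged" is set.
	 * 2) prev->va_end == va->va_start, so the areas coalesce again.
	 *    Because "merged" is set, the surviving node [8, 16) is
	 *    unlinked from the tree *before* prev is grown to [4, 16):
	 *    unlink_va() may rotate the tree and refresh augmented
	 *    values, so unlinking first ensures the final
	 *    augment_tree_propagate_from() runs on a consistent tree.
	 */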
 
@@ -1381,6 +1364,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
                va = merge_or_add_vmap_area(va, &free_vmap_area_root,
                                            &free_vmap_area_list);
 
+               if (!va)
+                       continue;
+
                if (is_vmalloc_or_module_addr((void *)orig_start))
                        kasan_release_vmalloc(orig_start, orig_end,
                                              va->va_start, va->va_end);
@@ -1513,12 +1499,11 @@ struct vmap_block {
 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
 
 /*
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
  * in the free path. Could get rid of this if we change the API to return a
  * "cookie" from alloc, to be passed to free. But no big deal yet.
  */
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+static DEFINE_XARRAY(vmap_blocks);
 
 /*
  * We should probably have a fallback mechanism to allocate virtual memory
@@ -1575,13 +1560,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
                return ERR_CAST(va);
        }
 
-       err = radix_tree_preload(gfp_mask);
-       if (unlikely(err)) {
-               kfree(vb);
-               free_vmap_area(va);
-               return ERR_PTR(err);
-       }
-
        vaddr = vmap_block_vaddr(va->va_start, 0);
        spin_lock_init(&vb->lock);
        vb->va = va;
@@ -1594,11 +1572,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
        INIT_LIST_HEAD(&vb->free_list);
 
        vb_idx = addr_to_vb_idx(va->va_start);
-       spin_lock(&vmap_block_tree_lock);
-       err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
-       spin_unlock(&vmap_block_tree_lock);
-       BUG_ON(err);
-       radix_tree_preload_end();
+       err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+       if (err) {
+               kfree(vb);
+               free_vmap_area(va);
+               return ERR_PTR(err);
+       }
 
        vbq = &get_cpu_var(vmap_block_queue);
        spin_lock(&vbq->lock);
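
The conversion collapses the preload/lock/insert/unlock sequence into a single call. A self-contained sketch of the XArray calls involved (the example_xa array and example_store() helper are hypothetical, not code from this file):

	#include <linux/xarray.h>

	static DEFINE_XARRAY(example_xa);

	/*
	 * xa_insert() takes the internal xa_lock itself and fails with
	 * -EBUSY if the index is already occupied or -ENOMEM if allocation
	 * fails, so the radix_tree_preload()/preload_end() dance and the
	 * external spinlock are no longer needed.
	 */
	static int example_store(unsigned long idx, void *entry, gfp_t gfp)
	{
		int err = xa_insert(&example_xa, idx, entry, gfp);

		if (err)
			return err;		/* nothing was stored */

		/* Lookups such as vb_free()'s are lock-free. */
		WARN_ON(xa_load(&example_xa, idx) != entry);

		/* Removal returns the erased entry, as free_vmap_block() checks. */
		WARN_ON(xa_erase(&example_xa, idx) != entry);
		return 0;
	}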
@@ -1612,12 +1591,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 static void free_vmap_block(struct vmap_block *vb)
 {
        struct vmap_block *tmp;
-       unsigned long vb_idx;
 
-       vb_idx = addr_to_vb_idx(vb->va->va_start);
-       spin_lock(&vmap_block_tree_lock);
-       tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
-       spin_unlock(&vmap_block_tree_lock);
+       tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
        BUG_ON(tmp != vb);
 
        free_vmap_area_noflush(vb->va);
@@ -1723,7 +1698,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 static void vb_free(unsigned long addr, unsigned long size)
 {
        unsigned long offset;
-       unsigned long vb_idx;
        unsigned int order;
        struct vmap_block *vb;
 
@@ -1733,14 +1707,8 @@ static void vb_free(unsigned long addr, unsigned long size)
        flush_cache_vunmap(addr, addr + size);
 
        order = get_order(size);
-
        offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
-
-       vb_idx = addr_to_vb_idx(addr);
-       rcu_read_lock();
-       vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
-       rcu_read_unlock();
-       BUG_ON(!vb);
+       vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
 
        unmap_kernel_range_noflush(addr, size);
 
@@ -3383,8 +3351,9 @@ recovery:
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
                                            &free_vmap_area_list);
-               kasan_release_vmalloc(orig_start, orig_end,
-                                     va->va_start, va->va_end);
+               if (va)
+                       kasan_release_vmalloc(orig_start, orig_end,
+                               va->va_start, va->va_end);
                vas[area] = NULL;
        }
 
@@ -3432,8 +3401,9 @@ err_free_shadow:
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
                                            &free_vmap_area_list);
-               kasan_release_vmalloc(orig_start, orig_end,
-                                     va->va_start, va->va_end);
+               if (va)
+                       kasan_release_vmalloc(orig_start, orig_end,
+                               va->va_start, va->va_end);
                vas[area] = NULL;
                kfree(vms[area]);
        }