X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=mm%2Fslub.c;h=3d2025f7163b295c45c2b26a8c5260414b2fc238;hb=2c3ef25c4a60cc18cf3f05a74c78220748f25684;hp=b5788040d92e629628739fbc618c23610c2d58ca;hpb=9f101ee89465e0b2c11b477f5b55e03039b2c308;p=linux-2.6-microblaze.git diff --git a/mm/slub.c b/mm/slub.c index b5788040d92e..3d2025f7163b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -46,13 +46,21 @@ /* * Lock order: * 1. slab_mutex (Global Mutex) - * 2. node->list_lock - * 3. slab_lock(page) (Only on some arches and for debugging) + * 2. node->list_lock (Spinlock) + * 3. kmem_cache->cpu_slab->lock (Local lock) + * 4. slab_lock(page) (Only on some arches or for debugging) + * 5. object_map_lock (Only for debugging) * * slab_mutex * * The role of the slab_mutex is to protect the list of all the slabs * and to synchronize major metadata changes to slab cache structures. + * Also synchronizes memory hotplug callbacks. + * + * slab_lock + * + * The slab_lock is a wrapper around the page lock, thus it is a bit + * spinlock. * * The slab_lock is only used for debugging and on arches that do not * have the ability to do a cmpxchg_double. It only protects: @@ -61,6 +69,8 @@ * C. page->objects -> Number of objects in page * D. page->frozen -> frozen state * + * Frozen slabs + * * If a slab is frozen then it is exempt from list management. It is not * on any list except per cpu partial list. The processor that froze the * slab is the one who can perform list operations on the page. Other @@ -68,6 +78,8 @@ * froze the slab is the only one that can retrieve the objects from the * page's freelist. * + * list_lock + * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or * removed from the lists nor make the number of partial slabs be modified. @@ -79,10 +91,36 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * Interrupts are disabled during allocation and deallocation in order to - * make the slab allocator safe to use in the context of an irq. In addition - * interrupts are disabled to ensure that the processor does not change - * while handling per_cpu slabs, due to kernel preemption. + * + * cpu_slab->lock local lock + * + * This locks protect slowpath manipulation of all kmem_cache_cpu fields + * except the stat counters. This is a percpu structure manipulated only by + * the local cpu, so the lock protects against being preempted or interrupted + * by an irq. Fast path operations rely on lockless operations instead. + * On PREEMPT_RT, the local lock does not actually disable irqs (and thus + * prevent the lockless operations), so fastpath operations also need to take + * the lock and are no longer lockless. + * + * lockless fastpaths + * + * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) + * are fully lockless when satisfied from the percpu slab (and when + * cmpxchg_double is possible to use, otherwise slab_lock is taken). + * They also don't disable preemption or migration or irqs. They rely on + * the transaction id (tid) field to detect being preempted or moved to + * another cpu. + * + * irq, preemption, migration considerations + * + * Interrupts are disabled as part of list_lock or local_lock operations, or + * around the slab_lock operation, in order to make the slab allocator safe + * to use in the context of an irq. + * + * In addition, preemption (or migration on PREEMPT_RT) is disabled in the + * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the + * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer + * doesn't have to be revalidated in each section protected by the local lock. * * SLUB assigns one slab for allocation to each processor. * Allocations only occur from these slabs called cpu slabs. @@ -118,6 +156,26 @@ * the fast path and disables lockless freelists. */ +/* + * We could simply use migrate_disable()/enable() but as long as it's a + * function call even on !PREEMPT_RT, use inline preempt_disable() there. + */ +#ifndef CONFIG_PREEMPT_RT +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#else +#define slub_get_cpu_ptr(var) \ +({ \ + migrate_disable(); \ + this_cpu_ptr(var); \ +}) +#define slub_put_cpu_ptr(var) \ +do { \ + (void)(var); \ + migrate_enable(); \ +} while (0) +#endif + #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); @@ -359,25 +417,44 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) /* * Per slab locking using the pagelock */ -static __always_inline void slab_lock(struct page *page) +static __always_inline void __slab_lock(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); bit_spin_lock(PG_locked, &page->flags); } -static __always_inline void slab_unlock(struct page *page) +static __always_inline void __slab_unlock(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); __bit_spin_unlock(PG_locked, &page->flags); } -/* Interrupts must be disabled (for the fallback code to work right) */ +static __always_inline void slab_lock(struct page *page, unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_save(*flags); + __slab_lock(page); +} + +static __always_inline void slab_unlock(struct page *page, unsigned long *flags) +{ + __slab_unlock(page); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +/* + * Interrupts must be disabled (for the fallback code to work right), typically + * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different + * so we disable interrupts as part of slab_[un]lock(). + */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { - VM_BUG_ON(!irqs_disabled()); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { @@ -388,15 +465,18 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page } else #endif { - slab_lock(page); + /* init to 0 to prevent spurious warnings */ + unsigned long flags = 0; + + slab_lock(page, &flags); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; - slab_unlock(page); + slab_unlock(page, &flags); return true; } - slab_unlock(page); + slab_unlock(page, &flags); } cpu_relax(); @@ -427,16 +507,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, unsigned long flags; local_irq_save(flags); - slab_lock(page); + __slab_lock(page); if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; page->counters = counters_new; - slab_unlock(page); + __slab_unlock(page); local_irq_restore(flags); return true; } - slab_unlock(page); + __slab_unlock(page); local_irq_restore(flags); } @@ -452,7 +532,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_SPINLOCK(object_map_lock); +static DEFINE_RAW_SPINLOCK(object_map_lock); static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, struct page *page) @@ -497,7 +577,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) { VM_BUG_ON(!irqs_disabled()); - spin_lock(&object_map_lock); + raw_spin_lock(&object_map_lock); __fill_map(object_map, s, page); @@ -507,7 +587,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) static void put_map(unsigned long *map) __releases(&object_map_lock) { VM_BUG_ON(map != object_map); - spin_unlock(&object_map_lock); + raw_spin_unlock(&object_map_lock); } static inline unsigned int size_from_object(struct kmem_cache *s) @@ -1269,11 +1349,11 @@ static noinline int free_debug_processing( struct kmem_cache_node *n = get_node(s, page_to_nid(page)); void *object = head; int cnt = 0; - unsigned long flags; + unsigned long flags, flags2; int ret = 0; spin_lock_irqsave(&n->list_lock, flags); - slab_lock(page); + slab_lock(page, &flags2); if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!check_slab(s, page)) @@ -1306,7 +1386,7 @@ out: slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", bulk_cnt, cnt); - slab_unlock(page); + slab_unlock(page, &flags2); spin_unlock_irqrestore(&n->list_lock, flags); if (!ret) slab_fix(s, "Object at 0x%p not freed", object); @@ -2003,18 +2083,24 @@ static inline void *acquire_slab(struct kmem_cache *s, return freelist; } +#ifdef CONFIG_SLUB_CPU_PARTIAL static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); +#else +static inline void put_cpu_partial(struct kmem_cache *s, struct page *page, + int drain) { } +#endif static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct page **ret_page, gfp_t flags) + struct page **ret_page, gfp_t gfpflags) { struct page *page, *page2; void *object = NULL; unsigned int available = 0; + unsigned long flags; int objects; /* @@ -2026,11 +2112,11 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!n || !n->nr_partial) return NULL; - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, page2, &n->partial, slab_list) { void *t; - if (!pfmemalloc_match(page, flags)) + if (!pfmemalloc_match(page, gfpflags)) continue; t = acquire_slab(s, n, page, object == NULL, &objects); @@ -2051,7 +2137,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, break; } - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); return object; } @@ -2202,16 +2288,23 @@ static inline void note_cmpxchg_failure(const char *n, static void init_kmem_cache_cpus(struct kmem_cache *s) { int cpu; + struct kmem_cache_cpu *c; - for_each_possible_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(s->cpu_slab, cpu); + local_lock_init(&c->lock); + c->tid = init_tid(cpu); + } } /* - * Remove the cpu slab + * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist, + * unfreezes the slabs and puts it on the proper list. + * Assumes the slab has been already safely taken away from kmem_cache_cpu + * by the caller. */ static void deactivate_slab(struct kmem_cache *s, struct page *page, - void *freelist, struct kmem_cache_cpu *c) + void *freelist) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -2219,6 +2312,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, enum slab_modes l = M_NONE, m = M_NONE; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; + unsigned long flags = 0; struct page new; struct page old; @@ -2294,7 +2388,7 @@ redo: * that acquire_slab() will see a slab page that * is frozen */ - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } } else { m = M_FULL; @@ -2305,7 +2399,7 @@ redo: * slabs from diagnostic functions will not see * any frozen slabs. */ - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } } @@ -2322,14 +2416,14 @@ redo: } l = m; - if (!__cmpxchg_double_slab(s, page, + if (!cmpxchg_double_slab(s, page, old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) goto redo; if (lock) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); if (m == M_PARTIAL) stat(s, tail); @@ -2340,38 +2434,29 @@ redo: discard_slab(s, page); stat(s, FREE_SLAB); } - - c->page = NULL; - c->freelist = NULL; } -/* - * Unfreeze all the cpu partial slabs. - * - * This function must be called with interrupts disabled - * for the cpu using c (or some other guarantee must be there - * to guarantee no concurrent accesses). - */ -static void unfreeze_partials(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ #ifdef CONFIG_SLUB_CPU_PARTIAL +static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page) +{ struct kmem_cache_node *n = NULL, *n2 = NULL; struct page *page, *discard_page = NULL; + unsigned long flags = 0; - while ((page = slub_percpu_partial(c))) { + while (partial_page) { struct page new; struct page old; - slub_set_percpu_partial(c, page); + page = partial_page; + partial_page = page->next; n2 = get_node(s, page_to_nid(page)); if (n != n2) { if (n) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); n = n2; - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } do { @@ -2400,7 +2485,7 @@ static void unfreeze_partials(struct kmem_cache *s, } if (n) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); while (discard_page) { page = discard_page; @@ -2410,7 +2495,35 @@ static void unfreeze_partials(struct kmem_cache *s, discard_slab(s, page); stat(s, FREE_SLAB); } -#endif /* CONFIG_SLUB_CPU_PARTIAL */ +} + +/* + * Unfreeze all the cpu partial slabs. + */ +static void unfreeze_partials(struct kmem_cache *s) +{ + struct page *partial_page; + unsigned long flags; + + local_lock_irqsave(&s->cpu_slab->lock, flags); + partial_page = this_cpu_read(s->cpu_slab->partial); + this_cpu_write(s->cpu_slab->partial, NULL); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (partial_page) + __unfreeze_partials(s, partial_page); +} + +static void unfreeze_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) +{ + struct page *partial_page; + + partial_page = slub_percpu_partial(c); + c->partial = NULL; + + if (partial_page) + __unfreeze_partials(s, partial_page); } /* @@ -2422,90 +2535,170 @@ static void unfreeze_partials(struct kmem_cache *s, */ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { -#ifdef CONFIG_SLUB_CPU_PARTIAL struct page *oldpage; - int pages; - int pobjects; + struct page *page_to_unfreeze = NULL; + unsigned long flags; + int pages = 0; + int pobjects = 0; - preempt_disable(); - do { - pages = 0; - pobjects = 0; - oldpage = this_cpu_read(s->cpu_slab->partial); + local_lock_irqsave(&s->cpu_slab->lock, flags); + + oldpage = this_cpu_read(s->cpu_slab->partial); - if (oldpage) { + if (oldpage) { + if (drain && oldpage->pobjects > slub_cpu_partial(s)) { + /* + * Partial array is full. Move the existing set to the + * per node partial list. Postpone the actual unfreezing + * outside of the critical section. + */ + page_to_unfreeze = oldpage; + oldpage = NULL; + } else { pobjects = oldpage->pobjects; pages = oldpage->pages; - if (drain && pobjects > slub_cpu_partial(s)) { - unsigned long flags; - /* - * partial array is full. Move the existing - * set to the per node partial list. - */ - local_irq_save(flags); - unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); - local_irq_restore(flags); - oldpage = NULL; - pobjects = 0; - pages = 0; - stat(s, CPU_PARTIAL_DRAIN); - } } + } - pages++; - pobjects += page->objects - page->inuse; + pages++; + pobjects += page->objects - page->inuse; - page->pages = pages; - page->pobjects = pobjects; - page->next = oldpage; + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; - } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) - != oldpage); - preempt_enable(); -#endif /* CONFIG_SLUB_CPU_PARTIAL */ + this_cpu_write(s->cpu_slab->partial, page); + + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (page_to_unfreeze) { + __unfreeze_partials(s, page_to_unfreeze); + stat(s, CPU_PARTIAL_DRAIN); + } } +#else /* CONFIG_SLUB_CPU_PARTIAL */ + +static inline void unfreeze_partials(struct kmem_cache *s) { } +static inline void unfreeze_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } + +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c->page, c->freelist, c); + unsigned long flags; + struct page *page; + void *freelist; + local_lock_irqsave(&s->cpu_slab->lock, flags); + + page = c->page; + freelist = c->freelist; + + c->page = NULL; + c->freelist = NULL; c->tid = next_tid(c->tid); + + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (page) { + deactivate_slab(s, page, freelist); + stat(s, CPUSLAB_FLUSH); + } } +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + void *freelist = c->freelist; + struct page *page = c->page; + + c->page = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + + if (page) { + deactivate_slab(s, page, freelist); + stat(s, CPUSLAB_FLUSH); + } + + unfreeze_partials_cpu(s, c); +} + +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + /* * Flush cpu slab. * - * Called from IPI handler with interrupts disabled. + * Called from CPU work handler with migration disabled. */ -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) +static void flush_cpu_slab(struct work_struct *w) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct kmem_cache *s; + struct kmem_cache_cpu *c; + struct slub_flush_work *sfw; + + sfw = container_of(w, struct slub_flush_work, work); + + s = sfw->s; + c = this_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); - unfreeze_partials(s, c); + unfreeze_partials(s); } -static void flush_cpu_slab(void *d) +static bool has_cpu_slab(int cpu, struct kmem_cache *s) { - struct kmem_cache *s = d; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - __flush_cpu_slab(s, smp_processor_id()); + return c->page || slub_percpu_partial(c); } -static bool has_cpu_slab(int cpu, void *info) +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); + +static void flush_all_cpus_locked(struct kmem_cache *s) { - struct kmem_cache *s = info; - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct slub_flush_work *sfw; + unsigned int cpu; - return c->page || slub_percpu_partial(c); + lockdep_assert_cpus_held(); + mutex_lock(&flush_lock); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (!has_cpu_slab(cpu, s)) { + sfw->skip = true; + continue; + } + INIT_WORK(&sfw->work, flush_cpu_slab); + sfw->skip = false; + sfw->s = s; + schedule_work_on(cpu, &sfw->work); + } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (sfw->skip) + continue; + flush_work(&sfw->work); + } + + mutex_unlock(&flush_lock); } static void flush_all(struct kmem_cache *s) { - on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); + cpus_read_lock(); + flush_all_cpus_locked(s); + cpus_read_unlock(); } /* @@ -2515,14 +2708,10 @@ static void flush_all(struct kmem_cache *s) static int slub_cpu_dead(unsigned int cpu) { struct kmem_cache *s; - unsigned long flags; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - local_irq_save(flags); + list_for_each_entry(s, &slab_caches, list) __flush_cpu_slab(s, cpu); - local_irq_restore(flags); - } mutex_unlock(&slab_mutex); return 0; } @@ -2633,8 +2822,6 @@ static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags) * The page is still frozen if the return value is not NULL. * * If this function returns NULL then the page has been unfrozen. - * - * This function must be called with interrupt disabled. */ static inline void *get_freelist(struct kmem_cache *s, struct page *page) { @@ -2642,6 +2829,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) unsigned long counters; void *freelist; + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); + do { freelist = page->freelist; counters = page->counters; @@ -2725,10 +2914,10 @@ redo: if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags))) goto deactivate_slab; - /* must check again c->page in case IRQ handler changed it */ - local_irq_save(flags); + /* must check again c->page in case we got preempted and it changed */ + local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(page != c->page)) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } freelist = c->freelist; @@ -2739,7 +2928,7 @@ redo: if (!freelist) { c->page = NULL; - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -2748,7 +2937,7 @@ redo: load_freelist: - lockdep_assert_irqs_disabled(); + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); /* * freelist is pointing to the list of objects to be used. @@ -2758,55 +2947,52 @@ load_freelist: VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); return freelist; deactivate_slab: - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (page != c->page) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } - deactivate_slab(s, page, c->freelist, c); - local_irq_restore(flags); + freelist = c->freelist; + c->page = NULL; + c->freelist = NULL; + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + deactivate_slab(s, page, freelist); new_slab: if (slub_percpu_partial(c)) { - local_irq_save(flags); + local_lock_irqsave(&s->cpu_slab->lock, flags); if (unlikely(c->page)) { - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); goto reread_page; } - if (unlikely(!slub_percpu_partial(c))) - goto new_objects; /* stolen by an IRQ handler */ + if (unlikely(!slub_percpu_partial(c))) { + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + /* we were preempted and partial list got empty */ + goto new_objects; + } page = c->page = slub_percpu_partial(c); slub_set_percpu_partial(c, page); - local_irq_restore(flags); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); stat(s, CPU_PARTIAL_ALLOC); goto redo; } - local_irq_save(flags); - if (unlikely(c->page)) { - local_irq_restore(flags); - goto reread_page; - } - new_objects: - lockdep_assert_irqs_disabled(); - freelist = get_partial(s, gfpflags, node, &page); - local_irq_restore(flags); if (freelist) goto check_new_page; - put_cpu_ptr(s->cpu_slab); + slub_put_cpu_ptr(s->cpu_slab); page = new_slab(s, gfpflags, node); - c = get_cpu_ptr(s->cpu_slab); + c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!page)) { slab_out_of_memory(s, gfpflags, node); @@ -2844,22 +3030,32 @@ check_new_page: */ goto return_single; - local_irq_save(flags); - if (unlikely(c->page)) - flush_slab(s, c); +retry_load_page: + + local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(c->page)) { + void *flush_freelist = c->freelist; + struct page *flush_page = c->page; + + c->page = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + deactivate_slab(s, flush_page, flush_freelist); + + stat(s, CPUSLAB_FLUSH); + + goto retry_load_page; + } c->page = page; goto load_freelist; return_single: - local_irq_save(flags); - if (unlikely(c->page)) - flush_slab(s, c); - c->page = page; - - deactivate_slab(s, page, get_freepointer(s, freelist), c); - local_irq_restore(flags); + deactivate_slab(s, page, get_freepointer(s, freelist)); return freelist; } @@ -2879,12 +3075,12 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * cpu before disabling preemption. Need to reload cpu area * pointer. */ - c = get_cpu_ptr(s->cpu_slab); + c = slub_get_cpu_ptr(s->cpu_slab); #endif p = ___slab_alloc(s, gfpflags, node, addr, c); #ifdef CONFIG_PREEMPT_COUNT - put_cpu_ptr(s->cpu_slab); + slub_put_cpu_ptr(s->cpu_slab); #endif return p; } @@ -2964,7 +3160,15 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !page || !node_match(page, node))) { + /* + * We cannot use the lockless fastpath on PREEMPT_RT because if a + * slowpath has taken the local_lock_irqsave(), it is not protected + * against a fast path operation in an irq handler. So we need to take + * the slow path which uses local_lock. It is still relatively fast if + * there is a suitable cpu freelist. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) || + unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { void *next_object = get_freepointer_safe(s, object); @@ -3224,6 +3428,7 @@ redo: barrier(); if (likely(page == c->page)) { +#ifndef CONFIG_PREEMPT_RT void **freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); @@ -3236,6 +3441,31 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } +#else /* CONFIG_PREEMPT_RT */ + /* + * We cannot use the lockless fastpath on PREEMPT_RT because if + * a slowpath has taken the local_lock_irqsave(), it is not + * protected against a fast path operation in an irq handler. So + * we need to take the local_lock. We shouldn't simply defer to + * __slab_free() as that wouldn't use the cpu freelist at all. + */ + void **freelist; + + local_lock(&s->cpu_slab->lock); + c = this_cpu_ptr(s->cpu_slab); + if (unlikely(page != c->page)) { + local_unlock(&s->cpu_slab->lock); + goto redo; + } + tid = c->tid; + freelist = c->freelist; + + set_freepointer(s, tail_obj, freelist); + c->freelist = head; + c->tid = next_tid(tid); + + local_unlock(&s->cpu_slab->lock); +#endif stat(s, FREE_FASTPATH); } else __slab_free(s, page, head, tail_obj, cnt, addr); @@ -3413,8 +3643,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * IRQs, which protects against PREEMPT and interrupts * handlers invoking normal fastpath. */ - c = get_cpu_ptr(s->cpu_slab); - local_irq_disable(); + c = slub_get_cpu_ptr(s->cpu_slab); + local_lock_irq(&s->cpu_slab->lock); for (i = 0; i < size; i++) { void *object = kfence_alloc(s, s->object_size, flags); @@ -3435,7 +3665,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, */ c->tid = next_tid(c->tid); - local_irq_enable(); + local_unlock_irq(&s->cpu_slab->lock); /* * Invoking slow path likely have side-effect @@ -3449,7 +3679,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); - local_irq_disable(); + local_lock_irq(&s->cpu_slab->lock); continue; /* goto for-loop */ } @@ -3458,8 +3688,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, maybe_wipe_obj_freeptr(s, p[i]); } c->tid = next_tid(c->tid); - local_irq_enable(); - put_cpu_ptr(s->cpu_slab); + local_unlock_irq(&s->cpu_slab->lock); + slub_put_cpu_ptr(s->cpu_slab); /* * memcg and kmem_cache debug support and memory initialization. @@ -3469,7 +3699,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s)); return i; error: - put_cpu_ptr(s->cpu_slab); + slub_put_cpu_ptr(s->cpu_slab); slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; @@ -3983,11 +4213,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); + unsigned long flags; unsigned long *map; void *p; slab_err(s, page, text, s->name); - slab_lock(page); + slab_lock(page, &flags); map = get_map(s, page); for_each_object(p, s, addr, page->objects) { @@ -3998,7 +4229,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } put_map(map); - slab_unlock(page); + slab_unlock(page, &flags); #endif } @@ -4048,7 +4279,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) int node; struct kmem_cache_node *n; - flush_all(s); + flush_all_cpus_locked(s); /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { free_partial(s, n); @@ -4324,7 +4555,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s) +static int __kmem_cache_do_shrink(struct kmem_cache *s) { int node; int i; @@ -4336,7 +4567,6 @@ int __kmem_cache_shrink(struct kmem_cache *s) unsigned long flags; int ret = 0; - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); for (i = 0; i < SHRINK_PROMOTE_MAX; i++) @@ -4386,13 +4616,21 @@ int __kmem_cache_shrink(struct kmem_cache *s) return ret; } +int __kmem_cache_shrink(struct kmem_cache *s) +{ + flush_all(s); + return __kmem_cache_do_shrink(s); +} + static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s); + list_for_each_entry(s, &slab_caches, list) { + flush_all_cpus_locked(s); + __kmem_cache_do_shrink(s); + } mutex_unlock(&slab_mutex); return 0; @@ -4723,8 +4961,9 @@ static void validate_slab(struct kmem_cache *s, struct page *page, { void *p; void *addr = page_address(page); + unsigned long flags; - slab_lock(page); + slab_lock(page, &flags); if (!check_slab(s, page) || !on_freelist(s, page, NULL)) goto unlock; @@ -4739,7 +4978,7 @@ static void validate_slab(struct kmem_cache *s, struct page *page, break; } unlock: - slab_unlock(page); + slab_unlock(page, &flags); } static int validate_slab_node(struct kmem_cache *s,