Merge tag 'nfsd-5.15-1' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux

[linux-2.6-microblaze.git] / mm / slub.c
diff --git a/mm/slub.c b/mm/slub.c

index c4a9b89..3d2025f 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -46,13 +46,21 @@
  /*
   * Lock order:
   *   1. slab_mutex (Global Mutex)
- *   2. node->list_lock
- *   3. slab_lock(page) (Only on some arches and for debugging)
+ *   2. node->list_lock (Spinlock)
+ *   3. kmem_cache->cpu_slab->lock (Local lock)
+ *   4. slab_lock(page) (Only on some arches or for debugging)
+ *   5. object_map_lock (Only for debugging)
   *
   *   slab_mutex
   *
   *   The role of the slab_mutex is to protect the list of all the slabs
   *   and to synchronize major metadata changes to slab cache structures.
+ *   Also synchronizes memory hotplug callbacks.
+ *
+ *   slab_lock
+ *
+ *   The slab_lock is a wrapper around the page lock, thus it is a bit
+ *   spinlock.
   *
   *   The slab_lock is only used for debugging and on arches that do not
   *   have the ability to do a cmpxchg_double. It only protects:
@@ -61,6 +69,8 @@
   *     C. page->objects        -> Number of objects in page
   *     D. page->frozen         -> frozen state
   *
+ *   Frozen slabs
+ *
   *   If a slab is frozen then it is exempt from list management. It is not
   *   on any list except per cpu partial list. The processor that froze the
   *   slab is the one who can perform list operations on the page. Other
@@ -68,6 +78,8 @@
   *   froze the slab is the only one that can retrieve the objects from the
   *   page's freelist.
   *
+ *   list_lock
+ *
   *   The list_lock protects the partial and full list on each node and
   *   the partial slab counter. If taken then no new slabs may be added or
   *   removed from the lists nor make the number of partial slabs be modified.
@@ -79,10 +91,36 @@
   *   slabs, operations can continue without any centralized lock. F.e.
   *   allocating a long series of objects that fill up slabs does not require
   *   the list lock.
- *   Interrupts are disabled during allocation and deallocation in order to
- *   make the slab allocator safe to use in the context of an irq. In addition
- *   interrupts are disabled to ensure that the processor does not change
- *   while handling per_cpu slabs, due to kernel preemption.
+ *
+ *   cpu_slab->lock local lock
+ *
+ *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
+ *   except the stat counters. This is a percpu structure manipulated only by
+ *   the local cpu, so the lock protects against being preempted or interrupted
+ *   by an irq. Fast path operations rely on lockless operations instead.
+ *   On PREEMPT_RT, the local lock does not actually disable irqs (and thus
+ *   does not exclude the lockless operations), so fastpath operations also
+ *   need to take the lock and are no longer lockless.
+ *
+ *   lockless fastpaths
+ *
+ *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
+ *   are fully lockless when satisfied from the percpu slab (and when
+ *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
+ *   They also don't disable preemption or migration or irqs. They rely on
+ *   the transaction id (tid) field to detect being preempted or moved to
+ *   another cpu.
+ *
+ *   irq, preemption, migration considerations
+ *
+ *   Interrupts are disabled as part of list_lock or local_lock operations, or
+ *   around the slab_lock operation, in order to make the slab allocator safe
+ *   to use in the context of an irq.
+ *
+ *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
+ *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
+ *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
+ *   doesn't have to be revalidated in each section protected by the local lock.
   *
   * SLUB assigns one slab for allocation to each processor.
   * Allocations only occur from these slabs called cpu slabs.
@@ -118,6 +156,26 @@
   *                     the fast path and disables lockless freelists.
   */
  
+/*
+ * We could simply use migrate_disable()/enable() but as long as it's a
+ * function call even on !PREEMPT_RT, use inline preempt_disable() there.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define slub_get_cpu_ptr(var)  get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var)  put_cpu_ptr(var)
+#else
+#define slub_get_cpu_ptr(var)          \
+({                                     \
+       migrate_disable();              \
+       this_cpu_ptr(var);              \
+})
+#define slub_put_cpu_ptr(var)          \
+do {                                   \
+       (void)(var);                    \
+       migrate_enable();               \
+} while (0)
+#endif
+
  #ifdef CONFIG_SLUB_DEBUG
  #ifdef CONFIG_SLUB_DEBUG_ON
  DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
@@ -359,25 +417,44 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
  /*
   * Per slab locking using the pagelock
   */
-static __always_inline void slab_lock(struct page *page)
+static __always_inline void __slab_lock(struct page *page)
  {
         VM_BUG_ON_PAGE(PageTail(page), page);
         bit_spin_lock(PG_locked, &page->flags);
  }
  
-static __always_inline void slab_unlock(struct page *page)
+static __always_inline void __slab_unlock(struct page *page)
  {
         VM_BUG_ON_PAGE(PageTail(page), page);
         __bit_spin_unlock(PG_locked, &page->flags);
  }
  
-/* Interrupts must be disabled (for the fallback code to work right) */
+static __always_inline void slab_lock(struct page *page, unsigned long *flags)
+{
+       if (IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_save(*flags);
+       __slab_lock(page);
+}
+
+static __always_inline void slab_unlock(struct page *page, unsigned long *flags)
+{
+       __slab_unlock(page);
+       if (IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_restore(*flags);
+}
+
+/*
+ * Interrupts must be disabled (for the fallback code to work right), typically
+ * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
+ * so we disable interrupts as part of slab_[un]lock().
+ */
  static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
                 void *freelist_old, unsigned long counters_old,
                 void *freelist_new, unsigned long counters_new,
                 const char *n)
  {
-       VM_BUG_ON(!irqs_disabled());
+       if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               lockdep_assert_irqs_disabled();
  #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
      defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
         if (s->flags & __CMPXCHG_DOUBLE) {
@@ -388,15 +465,18 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
         } else
  #endif
         {
-               slab_lock(page);
+               /* init to 0 to prevent spurious warnings */
+               unsigned long flags = 0;
+
+               slab_lock(page, &flags);
                 if (page->freelist == freelist_old &&
                                         page->counters == counters_old) {
                         page->freelist = freelist_new;
                         page->counters = counters_new;
-                       slab_unlock(page);
+                       slab_unlock(page, &flags);
                         return true;
                 }
-               slab_unlock(page);
+               slab_unlock(page, &flags);
         }
  
         cpu_relax();
@@ -427,16 +507,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
                 unsigned long flags;
  
                 local_irq_save(flags);
-               slab_lock(page);
+               __slab_lock(page);
                 if (page->freelist == freelist_old &&
                                         page->counters == counters_old) {
                         page->freelist = freelist_new;
                         page->counters = counters_new;
-                       slab_unlock(page);
+                       __slab_unlock(page);
                         local_irq_restore(flags);
                         return true;
                 }
-               slab_unlock(page);
+               __slab_unlock(page);
                 local_irq_restore(flags);
         }
  
@@ -452,7 +532,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
  
  #ifdef CONFIG_SLUB_DEBUG
  static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
-static DEFINE_SPINLOCK(object_map_lock);
+static DEFINE_RAW_SPINLOCK(object_map_lock);
  
  static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
                        struct page *page)
@@ -497,7 +577,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
  {
         VM_BUG_ON(!irqs_disabled());
  
-       spin_lock(&object_map_lock);
+       raw_spin_lock(&object_map_lock);
  
         __fill_map(object_map, s, page);
  
@@ -507,7 +587,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
  static void put_map(unsigned long *map) __releases(&object_map_lock)
  {
         VM_BUG_ON(map != object_map);
-       spin_unlock(&object_map_lock);
+       raw_spin_unlock(&object_map_lock);
  }
  
  static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -1269,11 +1349,11 @@ static noinline int free_debug_processing(
         struct kmem_cache_node *n = get_node(s, page_to_nid(page));
         void *object = head;
         int cnt = 0;
-       unsigned long flags;
+       unsigned long flags, flags2;
         int ret = 0;
  
         spin_lock_irqsave(&n->list_lock, flags);
-       slab_lock(page);
+       slab_lock(page, &flags2);
  
         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
                 if (!check_slab(s, page))
@@ -1306,7 +1386,7 @@ out:
                 slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
                          bulk_cnt, cnt);
  
-       slab_unlock(page);
+       slab_unlock(page, &flags2);
         spin_unlock_irqrestore(&n->list_lock, flags);
         if (!ret)
                 slab_fix(s, "Object at 0x%p not freed", object);
@@ -2003,7 +2083,12 @@ static inline void *acquire_slab(struct kmem_cache *s,
         return freelist;
  }
  
+#ifdef CONFIG_SLUB_CPU_PARTIAL
  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+#else
+static inline void put_cpu_partial(struct kmem_cache *s, struct page *page,
+                                  int drain) { }
+#endif
  static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
  
  /*
@@ -2203,9 +2288,13 @@ static inline void note_cmpxchg_failure(const char *n,
  static void init_kmem_cache_cpus(struct kmem_cache *s)
  {
         int cpu;
+       struct kmem_cache_cpu *c;
  
-       for_each_possible_cpu(cpu)
-               per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+       for_each_possible_cpu(cpu) {
+               c = per_cpu_ptr(s->cpu_slab, cpu);
+               local_lock_init(&c->lock);
+               c->tid = init_tid(cpu);
+       }
  }
  
  /*
@@ -2416,10 +2505,10 @@ static void unfreeze_partials(struct kmem_cache *s)
         struct page *partial_page;
         unsigned long flags;
  
-       local_irq_save(flags);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
         partial_page = this_cpu_read(s->cpu_slab->partial);
         this_cpu_write(s->cpu_slab->partial, NULL);
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
  
         if (partial_page)
                 __unfreeze_partials(s, partial_page);
@@ -2437,14 +2526,6 @@ static void unfreeze_partials_cpu(struct kmem_cache *s,
                 __unfreeze_partials(s, partial_page);
  }
  
-#else  /* CONFIG_SLUB_CPU_PARTIAL */
-
-static inline void unfreeze_partials(struct kmem_cache *s) { }
-static inline void unfreeze_partials_cpu(struct kmem_cache *s,
-                                 struct kmem_cache_cpu *c) { }
-
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
-
  /*
   * Put a page that was just frozen (in __slab_free|get_partial_node) into a
   * partial page slot if available.
@@ -2454,79 +2535,118 @@ static inline void unfreeze_partials_cpu(struct kmem_cache *s,
   */
  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
  {
-#ifdef CONFIG_SLUB_CPU_PARTIAL
         struct page *oldpage;
-       int pages;
-       int pobjects;
+       struct page *page_to_unfreeze = NULL;
+       unsigned long flags;
+       int pages = 0;
+       int pobjects = 0;
  
-       preempt_disable();
-       do {
-               pages = 0;
-               pobjects = 0;
-               oldpage = this_cpu_read(s->cpu_slab->partial);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
+
+       oldpage = this_cpu_read(s->cpu_slab->partial);
  
-               if (oldpage) {
+       if (oldpage) {
+               if (drain && oldpage->pobjects > slub_cpu_partial(s)) {
+                       /*
+                        * Partial array is full. Move the existing set to the
+                        * per node partial list. Postpone the actual unfreezing
+                        * outside of the critical section.
+                        */
+                       page_to_unfreeze = oldpage;
+                       oldpage = NULL;
+               } else {
                         pobjects = oldpage->pobjects;
                         pages = oldpage->pages;
-                       if (drain && pobjects > slub_cpu_partial(s)) {
-                               /*
-                                * partial array is full. Move the existing
-                                * set to the per node partial list.
-                                */
-                               unfreeze_partials(s);
-                               oldpage = NULL;
-                               pobjects = 0;
-                               pages = 0;
-                               stat(s, CPU_PARTIAL_DRAIN);
-                       }
                 }
+       }
  
-               pages++;
-               pobjects += page->objects - page->inuse;
+       pages++;
+       pobjects += page->objects - page->inuse;
  
-               page->pages = pages;
-               page->pobjects = pobjects;
-               page->next = oldpage;
+       page->pages = pages;
+       page->pobjects = pobjects;
+       page->next = oldpage;
  
-       } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
-                                                               != oldpage);
-       preempt_enable();
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
+       this_cpu_write(s->cpu_slab->partial, page);
+
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+       if (page_to_unfreeze) {
+               __unfreeze_partials(s, page_to_unfreeze);
+               stat(s, CPU_PARTIAL_DRAIN);
+       }
  }
  
+#else  /* CONFIG_SLUB_CPU_PARTIAL */
+
+static inline void unfreeze_partials(struct kmem_cache *s) { }
+static inline void unfreeze_partials_cpu(struct kmem_cache *s,
+                                 struct kmem_cache_cpu *c) { }
+
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
+
  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
  {
-       void *freelist = c->freelist;
-       struct page *page = c->page;
+       unsigned long flags;
+       struct page *page;
+       void *freelist;
+
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
+
+       page = c->page;
+       freelist = c->freelist;
  
         c->page = NULL;
         c->freelist = NULL;
         c->tid = next_tid(c->tid);
  
-       deactivate_slab(s, page, freelist);
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
  
-       stat(s, CPUSLAB_FLUSH);
+       if (page) {
+               deactivate_slab(s, page, freelist);
+               stat(s, CPUSLAB_FLUSH);
+       }
  }
  
  static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
  {
         struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+       void *freelist = c->freelist;
+       struct page *page = c->page;
  
-       if (c->page)
-               flush_slab(s, c);
+       c->page = NULL;
+       c->freelist = NULL;
+       c->tid = next_tid(c->tid);
+
+       if (page) {
+               deactivate_slab(s, page, freelist);
+               stat(s, CPUSLAB_FLUSH);
+       }
  
         unfreeze_partials_cpu(s, c);
  }
  
+struct slub_flush_work {
+       struct work_struct work;
+       struct kmem_cache *s;
+       bool skip;
+};
+
  /*
   * Flush cpu slab.
   *
- * Called from IPI handler with interrupts disabled.
+ * Called from CPU work handler with migration disabled.
   */
-static void flush_cpu_slab(void *d)
+static void flush_cpu_slab(struct work_struct *w)
  {
-       struct kmem_cache *s = d;
-       struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
+       struct kmem_cache *s;
+       struct kmem_cache_cpu *c;
+       struct slub_flush_work *sfw;
+
+       sfw = container_of(w, struct slub_flush_work, work);
+
+       s = sfw->s;
+       c = this_cpu_ptr(s->cpu_slab);
  
         if (c->page)
                 flush_slab(s, c);
@@ -2534,17 +2654,51 @@ static void flush_cpu_slab(void *d)
         unfreeze_partials(s);
  }
  
-static bool has_cpu_slab(int cpu, void *info)
+static bool has_cpu_slab(int cpu, struct kmem_cache *s)
  {
-       struct kmem_cache *s = info;
         struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
  
         return c->page || slub_percpu_partial(c);
  }
  
+static DEFINE_MUTEX(flush_lock);
+static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
+
+static void flush_all_cpus_locked(struct kmem_cache *s)
+{
+       struct slub_flush_work *sfw;
+       unsigned int cpu;
+
+       lockdep_assert_cpus_held();
+       mutex_lock(&flush_lock);
+
+       for_each_online_cpu(cpu) {
+               sfw = &per_cpu(slub_flush, cpu);
+               if (!has_cpu_slab(cpu, s)) {
+                       sfw->skip = true;
+                       continue;
+               }
+               INIT_WORK(&sfw->work, flush_cpu_slab);
+               sfw->skip = false;
+               sfw->s = s;
+               schedule_work_on(cpu, &sfw->work);
+       }
+
+       for_each_online_cpu(cpu) {
+               sfw = &per_cpu(slub_flush, cpu);
+               if (sfw->skip)
+                       continue;
+               flush_work(&sfw->work);
+       }
+
+       mutex_unlock(&flush_lock);
+}
+
  static void flush_all(struct kmem_cache *s)
  {
-       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
+       cpus_read_lock();
+       flush_all_cpus_locked(s);
+       cpus_read_unlock();
  }
  
  /*
@@ -2668,8 +2822,6 @@ static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags)
   * The page is still frozen if the return value is not NULL.
   *
   * If this function returns NULL then the page has been unfrozen.
- *
- * This function must be called with interrupt disabled.
   */
  static inline void *get_freelist(struct kmem_cache *s, struct page *page)
  {
@@ -2677,6 +2829,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
         unsigned long counters;
         void *freelist;
  
+       lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+
         do {
                 freelist = page->freelist;
                 counters = page->counters;
@@ -2760,10 +2914,10 @@ redo:
         if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags)))
                 goto deactivate_slab;
  
-       /* must check again c->page in case IRQ handler changed it */
-       local_irq_save(flags);
+       /* must check again c->page in case we got preempted and it changed */
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
         if (unlikely(page != c->page)) {
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                 goto reread_page;
         }
         freelist = c->freelist;
@@ -2774,7 +2928,7 @@ redo:
  
         if (!freelist) {
                 c->page = NULL;
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                 stat(s, DEACTIVATE_BYPASS);
                 goto new_slab;
         }
@@ -2783,7 +2937,7 @@ redo:
  
  load_freelist:
  
-       lockdep_assert_irqs_disabled();
+       lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
  
         /*
          * freelist is pointing to the list of objects to be used.
@@ -2793,38 +2947,39 @@ load_freelist:
         VM_BUG_ON(!c->page->frozen);
         c->freelist = get_freepointer(s, freelist);
         c->tid = next_tid(c->tid);
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
         return freelist;
  
  deactivate_slab:
  
-       local_irq_save(flags);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
         if (page != c->page) {
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                 goto reread_page;
         }
         freelist = c->freelist;
         c->page = NULL;
         c->freelist = NULL;
-       local_irq_restore(flags);
+       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
         deactivate_slab(s, page, freelist);
  
  new_slab:
  
         if (slub_percpu_partial(c)) {
-               local_irq_save(flags);
+               local_lock_irqsave(&s->cpu_slab->lock, flags);
                 if (unlikely(c->page)) {
-                       local_irq_restore(flags);
+                       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                         goto reread_page;
                 }
                 if (unlikely(!slub_percpu_partial(c))) {
-                       local_irq_restore(flags);
-                       goto new_objects; /* stolen by an IRQ handler */
+                       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+                       /* we were preempted and partial list got empty */
+                       goto new_objects;
                 }
  
                 page = c->page = slub_percpu_partial(c);
                 slub_set_percpu_partial(c, page);
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
                 stat(s, CPU_PARTIAL_ALLOC);
                 goto redo;
         }
@@ -2835,9 +2990,9 @@ new_objects:
         if (freelist)
                 goto check_new_page;
  
-       put_cpu_ptr(s->cpu_slab);
+       slub_put_cpu_ptr(s->cpu_slab);
         page = new_slab(s, gfpflags, node);
-       c = get_cpu_ptr(s->cpu_slab);
+       c = slub_get_cpu_ptr(s->cpu_slab);
  
         if (unlikely(!page)) {
                 slab_out_of_memory(s, gfpflags, node);
@@ -2877,7 +3032,7 @@ check_new_page:
  
  retry_load_page:
  
-       local_irq_save(flags);
+       local_lock_irqsave(&s->cpu_slab->lock, flags);
         if (unlikely(c->page)) {
                 void *flush_freelist = c->freelist;
                 struct page *flush_page = c->page;
@@ -2886,7 +3041,7 @@ retry_load_page:
                 c->freelist = NULL;
                 c->tid = next_tid(c->tid);
  
-               local_irq_restore(flags);
+               local_unlock_irqrestore(&s->cpu_slab->lock, flags);
  
                 deactivate_slab(s, flush_page, flush_freelist);
  
@@ -2920,12 +3075,12 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
          * cpu before disabling preemption. Need to reload cpu area
          * pointer.
          */
-       c = get_cpu_ptr(s->cpu_slab);
+       c = slub_get_cpu_ptr(s->cpu_slab);
  #endif
  
         p = ___slab_alloc(s, gfpflags, node, addr, c);
  #ifdef CONFIG_PREEMPT_COUNT
-       put_cpu_ptr(s->cpu_slab);
+       slub_put_cpu_ptr(s->cpu_slab);
  #endif
         return p;
  }
@@ -3005,7 +3160,15 @@ redo:
  
         object = c->freelist;
         page = c->page;
-       if (unlikely(!object || !page || !node_match(page, node))) {
+       /*
+        * We cannot use the lockless fastpath on PREEMPT_RT because if a
+        * slowpath has taken the local_lock_irqsave(), it is not protected
+        * against a fast path operation in an irq handler. So we need to take
+        * the slow path which uses local_lock. It is still relatively fast if
+        * there is a suitable cpu freelist.
+        */
+       if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+           unlikely(!object || !page || !node_match(page, node))) {
                 object = __slab_alloc(s, gfpflags, node, addr, c);
         } else {
                 void *next_object = get_freepointer_safe(s, object);
@@ -3265,6 +3428,7 @@ redo:
         barrier();
  
         if (likely(page == c->page)) {
+#ifndef CONFIG_PREEMPT_RT
                 void **freelist = READ_ONCE(c->freelist);
  
                 set_freepointer(s, tail_obj, freelist);
@@ -3277,6 +3441,31 @@ redo:
                         note_cmpxchg_failure("slab_free", s, tid);
                         goto redo;
                 }
+#else /* CONFIG_PREEMPT_RT */
+               /*
+                * We cannot use the lockless fastpath on PREEMPT_RT because if
+                * a slowpath has taken the local_lock_irqsave(), it is not
+                * protected against a fast path operation in an irq handler. So
+                * we need to take the local_lock. We shouldn't simply defer to
+                * __slab_free() as that wouldn't use the cpu freelist at all.
+                */
+               void **freelist;
+
+               local_lock(&s->cpu_slab->lock);
+               c = this_cpu_ptr(s->cpu_slab);
+               if (unlikely(page != c->page)) {
+                       local_unlock(&s->cpu_slab->lock);
+                       goto redo;
+               }
+               tid = c->tid;
+               freelist = c->freelist;
+
+               set_freepointer(s, tail_obj, freelist);
+               c->freelist = head;
+               c->tid = next_tid(tid);
+
+               local_unlock(&s->cpu_slab->lock);
+#endif
                 stat(s, FREE_FASTPATH);
         } else
                 __slab_free(s, page, head, tail_obj, cnt, addr);
@@ -3454,8 +3643,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
          * IRQs, which protects against PREEMPT and interrupts
          * handlers invoking normal fastpath.
          */
-       c = get_cpu_ptr(s->cpu_slab);
-       local_irq_disable();
+       c = slub_get_cpu_ptr(s->cpu_slab);
+       local_lock_irq(&s->cpu_slab->lock);
  
         for (i = 0; i < size; i++) {
                 void *object = kfence_alloc(s, s->object_size, flags);
@@ -3476,7 +3665,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                          */
                         c->tid = next_tid(c->tid);
  
-                       local_irq_enable();
+                       local_unlock_irq(&s->cpu_slab->lock);
  
                         /*
                          * Invoking slow path likely have side-effect
@@ -3490,7 +3679,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                         c = this_cpu_ptr(s->cpu_slab);
                         maybe_wipe_obj_freeptr(s, p[i]);
  
-                       local_irq_disable();
+                       local_lock_irq(&s->cpu_slab->lock);
  
                         continue; /* goto for-loop */
                 }
@@ -3499,8 +3688,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                 maybe_wipe_obj_freeptr(s, p[i]);
         }
         c->tid = next_tid(c->tid);
-       local_irq_enable();
-       put_cpu_ptr(s->cpu_slab);
+       local_unlock_irq(&s->cpu_slab->lock);
+       slub_put_cpu_ptr(s->cpu_slab);
  
         /*
          * memcg and kmem_cache debug support and memory initialization.
@@ -3510,7 +3699,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
                                 slab_want_init_on_alloc(flags, s));
         return i;
  error:
-       put_cpu_ptr(s->cpu_slab);
+       slub_put_cpu_ptr(s->cpu_slab);
         slab_post_alloc_hook(s, objcg, flags, i, p, false);
         __kmem_cache_free_bulk(s, i, p);
         return 0;
@@ -4024,11 +4213,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
  {
  #ifdef CONFIG_SLUB_DEBUG
         void *addr = page_address(page);
+       unsigned long flags;
         unsigned long *map;
         void *p;
  
         slab_err(s, page, text, s->name);
-       slab_lock(page);
+       slab_lock(page, &flags);
  
         map = get_map(s, page);
         for_each_object(p, s, addr, page->objects) {
@@ -4039,7 +4229,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
                 }
         }
         put_map(map);
-       slab_unlock(page);
+       slab_unlock(page, &flags);
  #endif
  }
  
@@ -4089,7 +4279,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
         int node;
         struct kmem_cache_node *n;
  
-       flush_all(s);
+       flush_all_cpus_locked(s);
         /* Attempt to free all objects */
         for_each_kmem_cache_node(s, node, n) {
                 free_partial(s, n);
@@ -4365,7 +4555,7 @@ EXPORT_SYMBOL(kfree);
   * being allocated from last increasing the chance that the last objects
   * are freed in them.
   */
-int __kmem_cache_shrink(struct kmem_cache *s)
+static int __kmem_cache_do_shrink(struct kmem_cache *s)
  {
         int node;
         int i;
@@ -4377,7 +4567,6 @@ int __kmem_cache_shrink(struct kmem_cache *s)
         unsigned long flags;
         int ret = 0;
  
-       flush_all(s);
         for_each_kmem_cache_node(s, node, n) {
                 INIT_LIST_HEAD(&discard);
                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
@@ -4427,13 +4616,21 @@ int __kmem_cache_shrink(struct kmem_cache *s)
         return ret;
  }
  
+int __kmem_cache_shrink(struct kmem_cache *s)
+{
+       flush_all(s);
+       return __kmem_cache_do_shrink(s);
+}
+
  static int slab_mem_going_offline_callback(void *arg)
  {
         struct kmem_cache *s;
  
         mutex_lock(&slab_mutex);
-       list_for_each_entry(s, &slab_caches, list)
-               __kmem_cache_shrink(s);
+       list_for_each_entry(s, &slab_caches, list) {
+               flush_all_cpus_locked(s);
+               __kmem_cache_do_shrink(s);
+       }
         mutex_unlock(&slab_mutex);
  
         return 0;
@@ -4764,8 +4961,9 @@ static void validate_slab(struct kmem_cache *s, struct page *page,
  {
         void *p;
         void *addr = page_address(page);
+       unsigned long flags;
  
-       slab_lock(page);
+       slab_lock(page, &flags);
  
         if (!check_slab(s, page) || !on_freelist(s, page, NULL))
                 goto unlock;
@@ -4780,7 +4978,7 @@ static void validate_slab(struct kmem_cache *s, struct page *page,
                         break;
         }
  unlock:
-       slab_unlock(page);
+       slab_unlock(page, &flags);
  }
  
  static int validate_slab_node(struct kmem_cache *s,