kfence: await for allocation using wait_event
[linux-2.6-microblaze.git] mm/kfence/core.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * KFENCE guarded object allocator and fault handling.
4  *
5  * Copyright (C) 2020, Google LLC.
6  */
7
8 #define pr_fmt(fmt) "kfence: " fmt
9
10 #include <linux/atomic.h>
11 #include <linux/bug.h>
12 #include <linux/debugfs.h>
13 #include <linux/irq_work.h>
14 #include <linux/kcsan-checks.h>
15 #include <linux/kfence.h>
16 #include <linux/kmemleak.h>
17 #include <linux/list.h>
18 #include <linux/lockdep.h>
19 #include <linux/memblock.h>
20 #include <linux/moduleparam.h>
21 #include <linux/random.h>
22 #include <linux/rcupdate.h>
23 #include <linux/seq_file.h>
24 #include <linux/slab.h>
25 #include <linux/spinlock.h>
26 #include <linux/string.h>
27
28 #include <asm/kfence.h>
29
30 #include "kfence.h"
31
32 /* Disables KFENCE on the first warning assuming an irrecoverable error. */
33 #define KFENCE_WARN_ON(cond)                                                   \
34         ({                                                                     \
35                 const bool __cond = WARN_ON(cond);                             \
36                 if (unlikely(__cond))                                          \
37                         WRITE_ONCE(kfence_enabled, false);                     \
38                 __cond;                                                        \
39         })
40
41 /* === Data ================================================================= */
42
43 static bool kfence_enabled __read_mostly;
44
45 static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
46
47 #ifdef MODULE_PARAM_PREFIX
48 #undef MODULE_PARAM_PREFIX
49 #endif
50 #define MODULE_PARAM_PREFIX "kfence."
51
52 static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
53 {
54         unsigned long num;
55         int ret = kstrtoul(val, 0, &num);
56
57         if (ret < 0)
58                 return ret;
59
60         if (!num) /* Using 0 to indicate KFENCE is disabled. */
61                 WRITE_ONCE(kfence_enabled, false);
62         else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
63                 return -EINVAL; /* Cannot (re-)enable KFENCE on-the-fly. */
64
65         *((unsigned long *)kp->arg) = num;
66         return 0;
67 }
68
69 static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
70 {
71         if (!READ_ONCE(kfence_enabled))
72                 return sprintf(buffer, "0\n");
73
74         return param_get_ulong(buffer, kp);
75 }
76
77 static const struct kernel_param_ops sample_interval_param_ops = {
78         .set = param_set_sample_interval,
79         .get = param_get_sample_interval,
80 };
81 module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
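/*
 * With MODULE_PARAM_PREFIX "kfence.", the interval is set via the
 * "kfence.sample_interval" boot parameter or, at runtime, through
 * /sys/module/kfence/parameters/sample_interval; writing 0 disables KFENCE,
 * and re-enabling it after boot is rejected with -EINVAL above.
 */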
82
83 /* The pool of pages used for guard pages and objects. */
84 char *__kfence_pool __ro_after_init;
85 EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
86
87 /*
88  * Per-object metadata, with one-to-one mapping of object metadata to
89  * backing pages (in __kfence_pool).
90  */
91 static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
92 struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
93
94 /* Freelist with available objects. */
95 static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
96 static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
97
98 #ifdef CONFIG_KFENCE_STATIC_KEYS
99 /* The static key to set up a KFENCE allocation. */
100 DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
101 #endif
102
103 /* Gates the allocation, ensuring only one succeeds in a given period. */
104 atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
105
106 /* Statistics counters for debugfs. */
107 enum kfence_counter_id {
108         KFENCE_COUNTER_ALLOCATED,
109         KFENCE_COUNTER_ALLOCS,
110         KFENCE_COUNTER_FREES,
111         KFENCE_COUNTER_ZOMBIES,
112         KFENCE_COUNTER_BUGS,
113         KFENCE_COUNTER_COUNT,
114 };
115 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
116 static const char *const counter_names[] = {
117         [KFENCE_COUNTER_ALLOCATED]      = "currently allocated",
118         [KFENCE_COUNTER_ALLOCS]         = "total allocations",
119         [KFENCE_COUNTER_FREES]          = "total frees",
120         [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
121         [KFENCE_COUNTER_BUGS]           = "total bugs",
122 };
123 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
124
125 /* === Internals ============================================================ */
126
127 static bool kfence_protect(unsigned long addr)
128 {
129         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
130 }
131
132 static bool kfence_unprotect(unsigned long addr)
133 {
134         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
135 }
136
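/*
 * Pool layout (set up in kfence_init_pool() below): two leading guard pages,
 * then alternating object and guard pages, so object i's data page starts at
 * __kfence_pool + (i + 1) * 2 * PAGE_SIZE. addr_to_metadata() and
 * metadata_to_pageaddr() invert this mapping in either direction.
 *
 *   +-------+-------+-------+-------+-------+-------+---
 *   | guard | guard | obj 0 | guard | obj 1 | guard | ...
 *   +-------+-------+-------+-------+-------+-------+---
 */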
137 static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
138 {
139         long index;
140
141         /* The checks do not affect performance; only called from slow-paths. */
142
143         if (!is_kfence_address((void *)addr))
144                 return NULL;
145
146         /*
147          * May be an invalid index if called with an address at the edge of
148          * __kfence_pool, in which case we would report an "invalid access"
149          * error.
150          */
151         index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1;
152         if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS)
153                 return NULL;
154
155         return &kfence_metadata[index];
156 }
157
158 static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
159 {
160         unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
161         unsigned long pageaddr = (unsigned long)&__kfence_pool[offset];
162
163         /* The checks do not affect performance; only called from slow-paths. */
164
165         /* Only call with a pointer into kfence_metadata. */
166         if (KFENCE_WARN_ON(meta < kfence_metadata ||
167                            meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
168                 return 0;
169
170         /*
171          * This metadata object only ever maps to 1 page; verify that the stored
172          * address is in the expected range.
173          */
174         if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
175                 return 0;
176
177         return pageaddr;
178 }
179
180 /*
181  * Update the object's metadata state, including updating the alloc/free stacks
182  * depending on the state transition.
183  */
184 static noinline void metadata_update_state(struct kfence_metadata *meta,
185                                            enum kfence_object_state next)
186 {
187         struct kfence_track *track =
188                 next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
189
190         lockdep_assert_held(&meta->lock);
191
192         /*
193          * Skip over 1 (this) function; noinline ensures we do not accidentally
194          * skip over the caller by never inlining.
195          */
196         track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
197         track->pid = task_pid_nr(current);
198
199         /*
200          * Pairs with READ_ONCE() in
201          *      kfence_shutdown_cache(),
202          *      kfence_handle_page_fault().
203          */
204         WRITE_ONCE(meta->state, next);
205 }
206
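/*
 * KFENCE_CANARY_PATTERN (see kfence.h) mixes the low address bits into a fixed
 * pattern, so every canary byte is position-dependent rather than a single
 * constant value, which makes shifted or copied-in corruptions easier to spot.
 */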
207 /* Write canary byte to @addr. */
208 static inline bool set_canary_byte(u8 *addr)
209 {
210         *addr = KFENCE_CANARY_PATTERN(addr);
211         return true;
212 }
213
214 /* Check canary byte at @addr. */
215 static inline bool check_canary_byte(u8 *addr)
216 {
217         if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
218                 return true;
219
220         atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
221         kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr),
222                             KFENCE_ERROR_CORRUPTION);
223         return false;
224 }
225
226 /* __always_inline this to ensure we won't do an indirect call to fn. */
227 static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
228 {
229         const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
230         unsigned long addr;
231
232         lockdep_assert_held(&meta->lock);
233
234         /*
235          * We'll iterate over each canary byte per-side until fn() returns
236          * false. However, we'll still iterate over the canary bytes to the
237          * right of the object even if there was an error in the canary bytes to
238          * the left of the object. Specifically, if check_canary_byte()
239          * generates an error, showing both sides might give more clues as to
240          * what the error is about when displaying which bytes were corrupted.
241          */
242
243         /* Apply to left of object. */
244         for (addr = pageaddr; addr < meta->addr; addr++) {
245                 if (!fn((u8 *)addr))
246                         break;
247         }
248
249         /* Apply to right of object. */
250         for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) {
251                 if (!fn((u8 *)addr))
252                         break;
253         }
254 }
255
256 static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
257 {
258         struct kfence_metadata *meta = NULL;
259         unsigned long flags;
260         struct page *page;
261         void *addr;
262
263         /* Try to obtain a free object. */
264         raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
265         if (!list_empty(&kfence_freelist)) {
266                 meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
267                 list_del_init(&meta->list);
268         }
269         raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
270         if (!meta)
271                 return NULL;
272
273         if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
274                 /*
275                  * This is extremely unlikely -- we are reporting on a
276                  * use-after-free, which locked meta->lock, and the reporting
277                  * code via printk calls kmalloc() which ends up in
278                  * kfence_alloc() and tries to grab the same object that we're
279                  * reporting on. While it has never been observed, lockdep does
280                  * report that there is a possibility of deadlock. Fix it by
281                  * using trylock and bailing out gracefully.
282                  */
283                 raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
284                 /* Put the object back on the freelist. */
285                 list_add_tail(&meta->list, &kfence_freelist);
286                 raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
287
288                 return NULL;
289         }
290
291         meta->addr = metadata_to_pageaddr(meta);
292         /* Unprotect if we're reusing this page. */
293         if (meta->state == KFENCE_OBJECT_FREED)
294                 kfence_unprotect(meta->addr);
295
296         /*
297          * Note: for allocations made before RNG initialization, prandom_u32_max()
298          * will always return zero. We still benefit from enabling KFENCE as early as
299          * possible, even when the RNG is not yet available, as this will allow
300          * KFENCE to detect bugs due to earlier allocations. The only downside
301          * is that the out-of-bounds accesses detected are deterministic for
302          * such allocations.
303          */
304         if (prandom_u32_max(2)) {
305                 /* Allocate on the "right" side, re-calculate address. */
306                 meta->addr += PAGE_SIZE - size;
307                 meta->addr = ALIGN_DOWN(meta->addr, cache->align);
308         }
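        /*
         * The chosen placement decides which guard page an out-of-bounds
         * access hits directly: right-aligned objects fault on overflows past
         * the end, left-aligned objects fault on underflows; canary bytes
         * cover the unprotected remainder of the page on the other side.
         */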
309
310         addr = (void *)meta->addr;
311
312         /* Update remaining metadata. */
313         metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
314         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
315         WRITE_ONCE(meta->cache, cache);
316         meta->size = size;
317         for_each_canary(meta, set_canary_byte);
318
319         /* Set required struct page fields. */
320         page = virt_to_page(meta->addr);
321         page->slab_cache = cache;
322         if (IS_ENABLED(CONFIG_SLUB))
323                 page->objects = 1;
324         if (IS_ENABLED(CONFIG_SLAB))
325                 page->s_mem = addr;
326
327         raw_spin_unlock_irqrestore(&meta->lock, flags);
328
329         /* Memory initialization. */
330
331         /*
332          * We check slab_want_init_on_alloc() ourselves, rather than letting
333          * SL*B do the initialization, as otherwise we might overwrite KFENCE's
334          * redzone.
335          */
336         if (unlikely(slab_want_init_on_alloc(gfp, cache)))
337                 memzero_explicit(addr, size);
338         if (cache->ctor)
339                 cache->ctor(addr);
340
341         if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS))
342                 kfence_protect(meta->addr); /* Random "faults" by protecting the object. */
343
344         atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
345         atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
346
347         return addr;
348 }
349
350 static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
351 {
352         struct kcsan_scoped_access assert_page_exclusive;
353         unsigned long flags;
354
355         raw_spin_lock_irqsave(&meta->lock, flags);
356
357         if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
358                 /* Invalid or double-free, bail out. */
359                 atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
360                 kfence_report_error((unsigned long)addr, false, NULL, meta,
361                                     KFENCE_ERROR_INVALID_FREE);
362                 raw_spin_unlock_irqrestore(&meta->lock, flags);
363                 return;
364         }
365
366         /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
367         kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
368                                   KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
369                                   &assert_page_exclusive);
370
371         if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
372                 kfence_unprotect((unsigned long)addr); /* To check canary bytes. */
373
374         /* Restore page protection if there was an OOB access. */
375         if (meta->unprotected_page) {
376                 memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE);
377                 kfence_protect(meta->unprotected_page);
378                 meta->unprotected_page = 0;
379         }
380
381         /* Check canary bytes for memory corruption. */
382         for_each_canary(meta, check_canary_byte);
383
384         /*
385          * Clear memory if init-on-free is set. While we protect the page, the
386          * data is still there, and after a use-after-free is detected, we
387          * unprotect the page, so the data is still accessible.
388          */
389         if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
390                 memzero_explicit(addr, meta->size);
391
392         /* Mark the object as freed. */
393         metadata_update_state(meta, KFENCE_OBJECT_FREED);
394
395         raw_spin_unlock_irqrestore(&meta->lock, flags);
396
397         /* Protect to detect use-after-frees. */
398         kfence_protect((unsigned long)addr);
399
400         kcsan_end_scoped_access(&assert_page_exclusive);
401         if (!zombie) {
402                 /* Add it to the tail of the freelist for reuse. */
403                 raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
404                 KFENCE_WARN_ON(!list_empty(&meta->list));
405                 list_add_tail(&meta->list, &kfence_freelist);
406                 raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
407
408                 atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
409                 atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
410         } else {
411                 /* See kfence_shutdown_cache(). */
412                 atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
413         }
414 }
415
416 static void rcu_guarded_free(struct rcu_head *h)
417 {
418         struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head);
419
420         kfence_guarded_free((void *)meta->addr, meta, false);
421 }
422
423 static bool __init kfence_init_pool(void)
424 {
425         unsigned long addr = (unsigned long)__kfence_pool;
426         struct page *pages;
427         int i;
428
429         if (!__kfence_pool)
430                 return false;
431
432         if (!arch_kfence_init_pool())
433                 goto err;
434
435         pages = virt_to_page(addr);
436
437         /*
438          * Set up object pages: they must have PG_slab set, to avoid freeing
439          * these as real pages.
440          *
441          * We also want to avoid inserting kfence_free() in the kfree()
442          * fast-path in SLUB, and therefore need to ensure kfree() correctly
443          * enters __slab_free() slow-path.
444          */
445         for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
446                 if (!i || (i % 2))
447                         continue;
448
449                 /* Verify we do not have a compound head page. */
450                 if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
451                         goto err;
452
453                 __SetPageSlab(&pages[i]);
454         }
455
456         /*
457          * Protect the first 2 pages. The first page is mostly unnecessary, and
458          * merely serves as an extended guard page. However, adding one
459          * additional page in the beginning gives us an even number of pages,
460          * which simplifies the mapping of address to metadata index.
461          */
462         for (i = 0; i < 2; i++) {
463                 if (unlikely(!kfence_protect(addr)))
464                         goto err;
465
466                 addr += PAGE_SIZE;
467         }
468
469         for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
470                 struct kfence_metadata *meta = &kfence_metadata[i];
471
472                 /* Initialize metadata. */
473                 INIT_LIST_HEAD(&meta->list);
474                 raw_spin_lock_init(&meta->lock);
475                 meta->state = KFENCE_OBJECT_UNUSED;
476                 meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
477                 list_add_tail(&meta->list, &kfence_freelist);
478
479                 /* Protect the right redzone. */
480                 if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
481                         goto err;
482
483                 addr += 2 * PAGE_SIZE;
484         }
485
486         /*
487          * The pool is live and will never be deallocated from this point on.
488          * Remove the pool object from the kmemleak object tree, as it would
489          * otherwise overlap with allocations returned by kfence_alloc(), which
490          * are registered with kmemleak through the slab post-alloc hook.
491          */
492         kmemleak_free(__kfence_pool);
493
494         return true;
495
496 err:
497         /*
498          * Only release unprotected pages, and do not try to go back and change
499          * page attributes due to risk of failing to do so as well. If changing
500          * page attributes for some pages fails, it is very likely that it also
501          * fails for the first page, and therefore expect addr==__kfence_pool in
502          * most failure cases.
503          */
504         memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
505         __kfence_pool = NULL;
506         return false;
507 }
508
509 /* === DebugFS Interface ==================================================== */
510
511 static int stats_show(struct seq_file *seq, void *v)
512 {
513         int i;
514
515         seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
516         for (i = 0; i < KFENCE_COUNTER_COUNT; i++)
517                 seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i]));
518
519         return 0;
520 }
521 DEFINE_SHOW_ATTRIBUTE(stats);
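/*
 * Example /sys/kernel/debug/kfence/stats output (counter values illustrative):
 *
 *	enabled: 1
 *	currently allocated: 12
 *	total allocations: 1045
 *	total frees: 1033
 *	zombie allocations: 0
 *	total bugs: 0
 */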
522
523 /*
524  * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
525  * start_object() and next_object() return the object index + 1, because NULL is used
526  * to stop iteration.
527  */
528 static void *start_object(struct seq_file *seq, loff_t *pos)
529 {
530         if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
531                 return (void *)((long)*pos + 1);
532         return NULL;
533 }
534
535 static void stop_object(struct seq_file *seq, void *v)
536 {
537 }
538
539 static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
540 {
541         ++*pos;
542         if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
543                 return (void *)((long)*pos + 1);
544         return NULL;
545 }
546
547 static int show_object(struct seq_file *seq, void *v)
548 {
549         struct kfence_metadata *meta = &kfence_metadata[(long)v - 1];
550         unsigned long flags;
551
552         raw_spin_lock_irqsave(&meta->lock, flags);
553         kfence_print_object(seq, meta);
554         raw_spin_unlock_irqrestore(&meta->lock, flags);
555         seq_puts(seq, "---------------------------------\n");
556
557         return 0;
558 }
559
560 static const struct seq_operations object_seqops = {
561         .start = start_object,
562         .next = next_object,
563         .stop = stop_object,
564         .show = show_object,
565 };
566
567 static int open_objects(struct inode *inode, struct file *file)
568 {
569         return seq_open(file, &object_seqops);
570 }
571
572 static const struct file_operations objects_fops = {
573         .open = open_objects,
574         .read = seq_read,
575         .llseek = seq_lseek,
            .release = seq_release, /* pair with seq_open() to free the seq_file on close */
576 };
577
578 static int __init kfence_debugfs_init(void)
579 {
580         struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL);
581
582         debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
583         debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
584         return 0;
585 }
586
587 late_initcall(kfence_debugfs_init);
588
589 /* === Allocation Gate Timer ================================================ */
590
591 #ifdef CONFIG_KFENCE_STATIC_KEYS
592 /* Wait queue to wake up allocation-gate timer task. */
593 static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
594
595 static void wake_up_kfence_timer(struct irq_work *work)
596 {
597         wake_up(&allocation_wait);
598 }
599 static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
600 #endif
601
602 /*
603  * Set up delayed work, which will enable and disable the static key. We need to
604  * use a work queue (rather than a simple timer), since enabling and disabling a
605  * static key cannot be done from an interrupt.
606  *
607  * Note: Toggling a static branch currently causes IPIs, and here we'll end up
608  * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
609  * more aggressive sampling intervals), we could get away with a variant that
610  * avoids IPIs, at the cost of not immediately capturing allocations if the
611  * instructions remain cached.
612  */
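/*
 * Allocation gate protocol: toggle_allocation_gate() resets
 * kfence_allocation_gate to 0 and enables the static key; the first
 * __kfence_alloc() to observe the open gate increments it back to 1 (closing
 * it for all other allocations), takes the sample, and queues
 * wake_up_kfence_timer_work so this worker can disable the static key and
 * re-arm itself for the next sample interval.
 */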
613 static struct delayed_work kfence_timer;
614 static void toggle_allocation_gate(struct work_struct *work)
615 {
616         if (!READ_ONCE(kfence_enabled))
617                 return;
618
619         atomic_set(&kfence_allocation_gate, 0);
620 #ifdef CONFIG_KFENCE_STATIC_KEYS
621         /* Enable static key, and await allocation to happen. */
622         static_branch_enable(&kfence_allocation_key);
623
624         wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), HZ);
625
626         /* Disable static key and reset timer. */
627         static_branch_disable(&kfence_allocation_key);
628 #endif
629         schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
630 }
631 static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
632
633 /* === Public interface ===================================================== */
634
635 void __init kfence_alloc_pool(void)
636 {
637         if (!kfence_sample_interval)
638                 return;
639
640         __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
641
642         if (!__kfence_pool)
643                 pr_err("failed to allocate pool\n");
644 }
645
646 void __init kfence_init(void)
647 {
648         /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
649         if (!kfence_sample_interval)
650                 return;
651
652         if (!kfence_init_pool()) {
653                 pr_err("%s failed\n", __func__);
654                 return;
655         }
656
657         WRITE_ONCE(kfence_enabled, true);
658         schedule_delayed_work(&kfence_timer, 0);
659         pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
660                 CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
661                 (void *)(__kfence_pool + KFENCE_POOL_SIZE));
662 }
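/*
 * With the default CONFIG_KFENCE_NUM_OBJECTS=255 this logs, for example
 * (addresses elided):
 *	kfence: initialized - using 2097152 bytes for 255 objects at 0x...-0x...
 */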
663
664 void kfence_shutdown_cache(struct kmem_cache *s)
665 {
666         unsigned long flags;
667         struct kfence_metadata *meta;
668         int i;
669
670         for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
671                 bool in_use;
672
673                 meta = &kfence_metadata[i];
674
675                 /*
676                  * If we observe some inconsistent cache and state pair where we
677                  * should have returned false here, cache destruction is racing
678                  * with either kmem_cache_alloc() or kmem_cache_free(). Taking
679                  * the lock will not help, as different critical section
680                  * serialization will have the same outcome.
681                  */
682                 if (READ_ONCE(meta->cache) != s ||
683                     READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
684                         continue;
685
686                 raw_spin_lock_irqsave(&meta->lock, flags);
687                 in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
688                 raw_spin_unlock_irqrestore(&meta->lock, flags);
689
690                 if (in_use) {
691                         /*
692                          * This cache still has allocations: do not release
693                          * them back into the freelist, so that they can still
694                          * safely be used; this retains the kernel's default
695                          * behaviour of keeping such allocations alive (the
696                          * cache is leaked). However, they effectively become "zombie
697                          * allocations" as the KFENCE objects are the only ones
698                          * still in use and the owning cache is being destroyed.
699                          *
700                          * We mark them freed, so that any subsequent use shows
701                          * more useful error messages that will include stack
702                          * traces of the user of the object, the original
703                          * allocation, and the caller of shutdown_cache().
704                          */
705                         kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
706                 }
707         }
708
709         for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
710                 meta = &kfence_metadata[i];
711
712                 /* See above. */
713                 if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
714                         continue;
715
716                 raw_spin_lock_irqsave(&meta->lock, flags);
717                 if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
718                         meta->cache = NULL;
719                 raw_spin_unlock_irqrestore(&meta->lock, flags);
720         }
721 }
722
723 void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
724 {
725         /*
726          * allocation_gate only needs to become non-zero, so it doesn't make
727          * sense to continue writing to it and pay the associated contention
728          * cost, in case we have a large number of concurrent allocations.
729          */
730         if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
731                 return NULL;
732 #ifdef CONFIG_KFENCE_STATIC_KEYS
733         /*
734          * waitqueue_active() is fully ordered after the update of
735          * kfence_allocation_gate per atomic_inc_return().
736          */
737         if (waitqueue_active(&allocation_wait)) {
738                 /*
739                  * Calling wake_up() here may deadlock when allocations happen
740                  * from within timer code. Use an irq_work to defer it.
741                  */
742                 irq_work_queue(&wake_up_kfence_timer_work);
743         }
744 #endif
745
746         if (!READ_ONCE(kfence_enabled))
747                 return NULL;
748
749         if (size > PAGE_SIZE)
750                 return NULL;
751
752         return kfence_guarded_alloc(s, size, flags);
753 }
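/*
 * Illustrative caller-side sketch (the real entry point is kfence_alloc() in
 * <linux/kfence.h>, called from the slab allocators' allocation paths): try
 *
 *	void *obj = kfence_alloc(cache, size, gfp);
 *
 * first; if it returns non-NULL the object is backed by the KFENCE pool,
 * otherwise fall through to the regular slab fast path.
 */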
754
755 size_t kfence_ksize(const void *addr)
756 {
757         const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
758
759         /*
760          * Read locklessly -- if there is a race with __kfence_alloc(), this is
761          * either a use-after-free or invalid access.
762          */
763         return meta ? meta->size : 0;
764 }
765
766 void *kfence_object_start(const void *addr)
767 {
768         const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
769
770         /*
771          * Read locklessly -- if there is a race with __kfence_alloc(), this is
772          * either a use-after-free or invalid access.
773          */
774         return meta ? (void *)meta->addr : NULL;
775 }
776
777 void __kfence_free(void *addr)
778 {
779         struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
780
781         /*
782          * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
783          * the object, as the object page may be recycled for other-typed
784          * objects once it has been freed. meta->cache may be NULL if the cache
785          * was destroyed.
786          */
787         if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
788                 call_rcu(&meta->rcu_head, rcu_guarded_free);
789         else
790                 kfence_guarded_free(addr, meta, false);
791 }
792
793 bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
794 {
795         const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
796         struct kfence_metadata *to_report = NULL;
797         enum kfence_error_type error_type;
798         unsigned long flags;
799
800         if (!is_kfence_address((void *)addr))
801                 return false;
802
803         if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
804                 return kfence_unprotect(addr); /* ... unprotect and proceed. */
805
806         atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
807
808         if (page_index % 2) {
809                 /* This is a redzone, report a buffer overflow. */
810                 struct kfence_metadata *meta;
811                 int distance = 0;
812
813                 meta = addr_to_metadata(addr - PAGE_SIZE);
814                 if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
815                         to_report = meta;
816                         /* Data race ok; distance calculation approximate. */
817                         distance = addr - data_race(meta->addr + meta->size);
818                 }
819
820                 meta = addr_to_metadata(addr + PAGE_SIZE);
821                 if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
822                         /* Data race ok; distance calculation approximate. */
823                         if (!to_report || distance > data_race(meta->addr) - addr)
824                                 to_report = meta;
825                 }
826
827                 if (!to_report)
828                         goto out;
829
830                 raw_spin_lock_irqsave(&to_report->lock, flags);
831                 to_report->unprotected_page = addr;
832                 error_type = KFENCE_ERROR_OOB;
833
834                 /*
835                  * If the object was freed before we took the lock, we can still
836                  * report this as an OOB -- the report will simply show the
837                  * stacktrace of the free as well.
838                  */
839         } else {
840                 to_report = addr_to_metadata(addr);
841                 if (!to_report)
842                         goto out;
843
844                 raw_spin_lock_irqsave(&to_report->lock, flags);
845                 error_type = KFENCE_ERROR_UAF;
846                 /*
847                  * We may race with __kfence_alloc(), and it is possible that a
848                  * freed object may be reallocated. We simply report this as a
849                  * use-after-free, with the stack trace showing the place where
850                  * the object was re-allocated.
851                  */
852         }
853
854 out:
855         if (to_report) {
856                 kfence_report_error(addr, is_write, regs, to_report, error_type);
857                 raw_spin_unlock_irqrestore(&to_report->lock, flags);
858         } else {
859                 /* This may be a UAF or OOB access, but we can't be sure. */
860                 kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
861         }
862
863         return kfence_unprotect(addr); /* Unprotect and let access proceed. */
864 }
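/*
 * Illustrative arch-side sketch (not copied from any particular architecture):
 * a page fault handler that supports KFENCE checks faulting kernel addresses
 * roughly like
 *
 *	if (kfence_handle_page_fault(addr, is_write, regs))
 *		return;
 *
 * i.e. if this returns true, the report has been printed and the page has been
 * unprotected, so the faulting access can be retried and will now succeed.
 */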