mm/z3fold: remove unused function handle_to_z3fold_header()
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * z3fold.c
4  *
5  * Author: Vitaly Wool <vitaly.wool@konsulko.com>
6  * Copyright (C) 2016, Sony Mobile Communications Inc.
7  *
8  * This implementation is based on zbud written by Seth Jennings.
9  *
10  * z3fold is a special purpose allocator for storing compressed pages. It
11  * can store up to three compressed pages per page, which improves the
12  * compression ratio of zbud while retaining its main concepts (e.g. always
13  * storing an integral number of objects per page) and simplicity.
14  * It still has simple and deterministic reclaim properties that make it
15  * preferable to a higher density approach (with no requirement on an
16  * integral number of objects per page) when reclaim is used.
17  *
18  * As in zbud, pages are divided into "chunks".  The size of the chunks is
19  * fixed at compile time and is determined by NCHUNKS_ORDER below.
20  *
21  * z3fold doesn't export any API and is meant to be used via zpool API.
22  */
23
24 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
25
26 #include <linux/atomic.h>
27 #include <linux/sched.h>
28 #include <linux/cpumask.h>
29 #include <linux/list.h>
30 #include <linux/mm.h>
31 #include <linux/module.h>
32 #include <linux/page-flags.h>
33 #include <linux/migrate.h>
34 #include <linux/node.h>
35 #include <linux/compaction.h>
36 #include <linux/percpu.h>
37 #include <linux/mount.h>
38 #include <linux/pseudo_fs.h>
39 #include <linux/fs.h>
40 #include <linux/preempt.h>
41 #include <linux/workqueue.h>
42 #include <linux/slab.h>
43 #include <linux/spinlock.h>
44 #include <linux/zpool.h>
45 #include <linux/magic.h>
46 #include <linux/kmemleak.h>
47
48 /*
49  * NCHUNKS_ORDER determines the internal allocation granularity, effectively
50  * adjusting internal fragmentation.  It also determines the number of
51  * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
52  * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
53  * at the beginning of an allocated page are occupied by the z3fold header,
54  * so NCHUNKS works out to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), which
55  * is the maximum number of free chunks in a z3fold page; there will likewise
56  * be 63 (or 62, respectively) freelists per pool.
57  */
58 #define NCHUNKS_ORDER   6
59
60 #define CHUNK_SHIFT     (PAGE_SHIFT - NCHUNKS_ORDER)
61 #define CHUNK_SIZE      (1 << CHUNK_SHIFT)
62 #define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
63 #define ZHDR_CHUNKS     (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
64 #define TOTAL_CHUNKS    (PAGE_SIZE >> CHUNK_SHIFT)
65 #define NCHUNKS         (TOTAL_CHUNKS - ZHDR_CHUNKS)
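/*
 * Worked example (illustrative, assuming a 4 KiB page): CHUNK_SHIFT is
 * PAGE_SHIFT - NCHUNKS_ORDER = 12 - 6 = 6, so CHUNK_SIZE is 64 bytes and
 * TOTAL_CHUNKS is 64.  As noted above, the z3fold header occupies one chunk
 * (two with CONFIG_DEBUG_SPINLOCK=y), leaving NCHUNKS = 63 (respectively 62)
 * chunks available for objects.
 */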
66
67 #define BUDDY_MASK      (0x3)
68 #define BUDDY_SHIFT     2
69 #define SLOTS_ALIGN     (0x40)
70
71 /*****************
72  * Structures
73 *****************/
74 struct z3fold_pool;
75 struct z3fold_ops {
76         int (*evict)(struct z3fold_pool *pool, unsigned long handle);
77 };
78
79 enum buddy {
80         HEADLESS = 0,
81         FIRST,
82         MIDDLE,
83         LAST,
84         BUDDIES_MAX = LAST
85 };
86
87 struct z3fold_buddy_slots {
88         /*
89          * we are using BUDDY_MASK in handle_to_buddy etc. so there should
90          * be enough slots to hold all possible variants
91          */
92         unsigned long slot[BUDDY_MASK + 1];
93         unsigned long pool; /* back link */
94         rwlock_t lock;
95 };
96 #define HANDLE_FLAG_MASK        (0x03)
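/*
 * Handle layout (summary of the encoding implemented below): a non-headless
 * handle is the address of one of the slot[] entries above, and the value
 * stored in that slot is the address of the owning z3fold_header plus the
 * buddy index (plus the size in chunks shifted by BUDDY_SHIFT for the LAST
 * buddy).  A headless handle is simply the page address with the
 * PAGE_HEADLESS bit set.  The low HANDLE_FLAG_MASK bits of the pool back
 * link hold the flags from enum z3fold_handle_flags.
 */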
97
98 /*
99  * struct z3fold_header - z3fold page metadata occupying first chunks of each
100  *                      z3fold page, except for HEADLESS pages
101  * @buddy:              links the z3fold page into the relevant list in the
102  *                      pool
103  * @page_lock:          per-page lock
104  * @refcount:           reference count for the z3fold page
105  * @work:               work_struct for page layout optimization
106  * @slots:              pointer to the structure holding buddy slots
107  * @pool:               pointer to the containing pool
108  * @cpu:                CPU which this page "belongs" to
109  * @first_chunks:       the size of the first buddy in chunks, 0 if free
110  * @middle_chunks:      the size of the middle buddy in chunks, 0 if free
111  * @last_chunks:        the size of the last buddy in chunks, 0 if free
112  * @first_num:          the starting number (for the first handle)
113  * @mapped_count:       the number of objects currently mapped
114  */
115 struct z3fold_header {
116         struct list_head buddy;
117         spinlock_t page_lock;
118         struct kref refcount;
119         struct work_struct work;
120         struct z3fold_buddy_slots *slots;
121         struct z3fold_pool *pool;
122         short cpu;
123         unsigned short first_chunks;
124         unsigned short middle_chunks;
125         unsigned short last_chunks;
126         unsigned short start_middle;
127         unsigned short first_num:2;
128         unsigned short mapped_count:2;
129         unsigned short foreign_handles:2;
130 };
131
132 /**
133  * struct z3fold_pool - stores metadata for each z3fold pool
134  * @name:       pool name
135  * @lock:       protects pool unbuddied/lru lists
136  * @stale_lock: protects pool stale page list
137  * @unbuddied:  per-cpu array of lists tracking z3fold pages that contain at
138  *              most two buddies; the list each z3fold page is added to
139  *              depends on the size of its free region.
140  * @lru:        list tracking the z3fold pages in LRU order by most recently
141  *              added buddy.
142  * @stale:      list of pages marked for freeing
143  * @pages_nr:   number of z3fold pages in the pool.
144  * @c_handle:   cache for z3fold_buddy_slots allocation
145  * @ops:        pointer to a structure of user defined operations specified at
146  *              pool creation time.
147  * @compact_wq: workqueue for page layout background optimization
148  * @release_wq: workqueue for safe page release
149  * @work:       work_struct for safe page release
150  * @inode:      inode for z3fold pseudo filesystem
151  *
152  * This structure is allocated at pool creation time and maintains metadata
153  * pertaining to a particular z3fold pool.
154  */
155 struct z3fold_pool {
156         const char *name;
157         spinlock_t lock;
158         spinlock_t stale_lock;
159         struct list_head *unbuddied;
160         struct list_head lru;
161         struct list_head stale;
162         atomic64_t pages_nr;
163         struct kmem_cache *c_handle;
164         const struct z3fold_ops *ops;
165         struct zpool *zpool;
166         const struct zpool_ops *zpool_ops;
167         struct workqueue_struct *compact_wq;
168         struct workqueue_struct *release_wq;
169         struct work_struct work;
170         struct inode *inode;
171 };
172
173 /*
174  * Internal z3fold page flags
175  */
176 enum z3fold_page_flags {
177         PAGE_HEADLESS = 0,
178         MIDDLE_CHUNK_MAPPED,
179         NEEDS_COMPACTING,
180         PAGE_STALE,
181         PAGE_CLAIMED, /* by either reclaim or free */
182 };
183
184 /*
185  * handle flags, go under HANDLE_FLAG_MASK
186  */
187 enum z3fold_handle_flags {
188         HANDLES_NOFREE = 0,
189 };
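/*
 * HANDLES_NOFREE is set in the pool back link of the on-stack slots used by
 * z3fold_reclaim_page(), telling free_handle() that this z3fold_buddy_slots
 * structure must not be returned to the handle cache.
 */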
190
191 /*
192  * Forward declarations
193  */
194 static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
195 static void compact_page_work(struct work_struct *w);
196
197 /*****************
198  * Helpers
199 *****************/
200
201 /* Converts an allocation size in bytes to size in z3fold chunks */
202 static int size_to_chunks(size_t size)
203 {
204         return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
205 }
206
207 #define for_each_unbuddied_list(_iter, _begin) \
208         for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
209
210 static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
211                                                         gfp_t gfp)
212 {
213         struct z3fold_buddy_slots *slots;
214
215         slots = kmem_cache_zalloc(pool->c_handle,
216                                  (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
217
218         if (slots) {
219                 /* It will be freed separately in free_handle(). */
220                 kmemleak_not_leak(slots);
221                 slots->pool = (unsigned long)pool;
222                 rwlock_init(&slots->lock);
223         }
224
225         return slots;
226 }
227
228 static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
229 {
230         return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
231 }
232
233 static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
234 {
235         return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
236 }
237
238 /* Lock a z3fold page */
239 static inline void z3fold_page_lock(struct z3fold_header *zhdr)
240 {
241         spin_lock(&zhdr->page_lock);
242 }
243
244 /* Try to lock a z3fold page */
245 static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
246 {
247         return spin_trylock(&zhdr->page_lock);
248 }
249
250 /* Unlock a z3fold page */
251 static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
252 {
253         spin_unlock(&zhdr->page_lock);
254 }
255
256 /* return locked z3fold page if it's not headless */
257 static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
258 {
259         struct z3fold_buddy_slots *slots;
260         struct z3fold_header *zhdr;
261         int locked = 0;
262
263         if (!(handle & (1 << PAGE_HEADLESS))) {
264                 slots = handle_to_slots(handle);
265                 do {
266                         unsigned long addr;
267
268                         read_lock(&slots->lock);
269                         addr = *(unsigned long *)handle;
270                         zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
271                         locked = z3fold_page_trylock(zhdr);
272                         read_unlock(&slots->lock);
273                         if (locked)
274                                 break;
275                         cpu_relax();
276                 } while (true);
277         } else {
278                 zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
279         }
280
281         return zhdr;
282 }
283
284 static inline void put_z3fold_header(struct z3fold_header *zhdr)
285 {
286         struct page *page = virt_to_page(zhdr);
287
288         if (!test_bit(PAGE_HEADLESS, &page->private))
289                 z3fold_page_unlock(zhdr);
290 }
291
292 static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
293 {
294         struct z3fold_buddy_slots *slots;
295         int i;
296         bool is_free;
297
298         if (handle & (1 << PAGE_HEADLESS))
299                 return;
300
301         if (WARN_ON(*(unsigned long *)handle == 0))
302                 return;
303
304         slots = handle_to_slots(handle);
305         write_lock(&slots->lock);
306         *(unsigned long *)handle = 0;
307
308         if (test_bit(HANDLES_NOFREE, &slots->pool)) {
309                 write_unlock(&slots->lock);
310                 return; /* simple case, nothing else to do */
311         }
312
313         if (zhdr->slots != slots)
314                 zhdr->foreign_handles--;
315
316         is_free = true;
317         for (i = 0; i <= BUDDY_MASK; i++) {
318                 if (slots->slot[i]) {
319                         is_free = false;
320                         break;
321                 }
322         }
323         write_unlock(&slots->lock);
324
325         if (is_free) {
326                 struct z3fold_pool *pool = slots_to_pool(slots);
327
328                 if (zhdr->slots == slots)
329                         zhdr->slots = NULL;
330                 kmem_cache_free(pool->c_handle, slots);
331         }
332 }
333
334 static int z3fold_init_fs_context(struct fs_context *fc)
335 {
336         return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
337 }
338
339 static struct file_system_type z3fold_fs = {
340         .name           = "z3fold",
341         .init_fs_context = z3fold_init_fs_context,
342         .kill_sb        = kill_anon_super,
343 };
344
345 static struct vfsmount *z3fold_mnt;
346 static int z3fold_mount(void)
347 {
348         int ret = 0;
349
350         z3fold_mnt = kern_mount(&z3fold_fs);
351         if (IS_ERR(z3fold_mnt))
352                 ret = PTR_ERR(z3fold_mnt);
353
354         return ret;
355 }
356
357 static void z3fold_unmount(void)
358 {
359         kern_unmount(z3fold_mnt);
360 }
361
362 static const struct address_space_operations z3fold_aops;
363 static int z3fold_register_migration(struct z3fold_pool *pool)
364 {
365         pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
366         if (IS_ERR(pool->inode)) {
367                 pool->inode = NULL;
368                 return 1;
369         }
370
371         pool->inode->i_mapping->private_data = pool;
372         pool->inode->i_mapping->a_ops = &z3fold_aops;
373         return 0;
374 }
375
376 static void z3fold_unregister_migration(struct z3fold_pool *pool)
377 {
378         if (pool->inode)
379                 iput(pool->inode);
380 }
381
382 /* Initializes the z3fold header of a newly allocated z3fold page */
383 static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
384                                         struct z3fold_pool *pool, gfp_t gfp)
385 {
386         struct z3fold_header *zhdr = page_address(page);
387         struct z3fold_buddy_slots *slots;
388
389         INIT_LIST_HEAD(&page->lru);
390         clear_bit(PAGE_HEADLESS, &page->private);
391         clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
392         clear_bit(NEEDS_COMPACTING, &page->private);
393         clear_bit(PAGE_STALE, &page->private);
394         clear_bit(PAGE_CLAIMED, &page->private);
395         if (headless)
396                 return zhdr;
397
398         slots = alloc_slots(pool, gfp);
399         if (!slots)
400                 return NULL;
401
402         memset(zhdr, 0, sizeof(*zhdr));
403         spin_lock_init(&zhdr->page_lock);
404         kref_init(&zhdr->refcount);
405         zhdr->cpu = -1;
406         zhdr->slots = slots;
407         zhdr->pool = pool;
408         INIT_LIST_HEAD(&zhdr->buddy);
409         INIT_WORK(&zhdr->work, compact_page_work);
410         return zhdr;
411 }
412
413 /* Resets the struct page fields and frees the page */
414 static void free_z3fold_page(struct page *page, bool headless)
415 {
416         if (!headless) {
417                 lock_page(page);
418                 __ClearPageMovable(page);
419                 unlock_page(page);
420         }
421         ClearPagePrivate(page);
422         __free_page(page);
423 }
424
425 /* Helper function to build the index */
426 static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
427 {
428         return (bud + zhdr->first_num) & BUDDY_MASK;
429 }
430
431 /*
432  * Encodes the handle of a particular buddy within a z3fold page
433  * Pool lock should be held as this function accesses first_num
434  */
435 static unsigned long __encode_handle(struct z3fold_header *zhdr,
436                                 struct z3fold_buddy_slots *slots,
437                                 enum buddy bud)
438 {
439         unsigned long h = (unsigned long)zhdr;
440         int idx = 0;
441
442         /*
443          * For a headless page, its handle is its pointer with the extra
444          * PAGE_HEADLESS bit set
445          */
446         if (bud == HEADLESS)
447                 return h | (1 << PAGE_HEADLESS);
448
449         /* otherwise, return pointer to encoded handle */
450         idx = __idx(zhdr, bud);
451         h += idx;
452         if (bud == LAST)
453                 h |= (zhdr->last_chunks << BUDDY_SHIFT);
454
455         write_lock(&slots->lock);
456         slots->slot[idx] = h;
457         write_unlock(&slots->lock);
458         return (unsigned long)&slots->slot[idx];
459 }
460
461 static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
462 {
463         return __encode_handle(zhdr, zhdr->slots, bud);
464 }
465
466 /* only for LAST bud, returns zero otherwise */
467 static unsigned short handle_to_chunks(unsigned long handle)
468 {
469         struct z3fold_buddy_slots *slots = handle_to_slots(handle);
470         unsigned long addr;
471
472         read_lock(&slots->lock);
473         addr = *(unsigned long *)handle;
474         read_unlock(&slots->lock);
475         return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
476 }
477
478 /*
479  * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
480  * but that doesn't matter, because the masking will result in the
481  * correct buddy number.
482  */
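/*
 * Example: with first_num == 2, __idx() places the LAST buddy (3) at slot
 * index (3 + 2) & BUDDY_MASK == 1; handle_to_buddy() then computes
 * (1 - 2) & BUDDY_MASK == 3, i.e. LAST again, since only the low two bits
 * of the stored address matter (the z3fold header is page-aligned).
 */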
483 static enum buddy handle_to_buddy(unsigned long handle)
484 {
485         struct z3fold_header *zhdr;
486         struct z3fold_buddy_slots *slots = handle_to_slots(handle);
487         unsigned long addr;
488
489         read_lock(&slots->lock);
490         WARN_ON(handle & (1 << PAGE_HEADLESS));
491         addr = *(unsigned long *)handle;
492         read_unlock(&slots->lock);
493         zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
494         return (addr - zhdr->first_num) & BUDDY_MASK;
495 }
496
497 static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
498 {
499         return zhdr->pool;
500 }
501
502 static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
503 {
504         struct page *page = virt_to_page(zhdr);
505         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
506
507         WARN_ON(!list_empty(&zhdr->buddy));
508         set_bit(PAGE_STALE, &page->private);
509         clear_bit(NEEDS_COMPACTING, &page->private);
510         spin_lock(&pool->lock);
511         if (!list_empty(&page->lru))
512                 list_del_init(&page->lru);
513         spin_unlock(&pool->lock);
514
515         if (locked)
516                 z3fold_page_unlock(zhdr);
517
518         spin_lock(&pool->stale_lock);
519         list_add(&zhdr->buddy, &pool->stale);
520         queue_work(pool->release_wq, &pool->work);
521         spin_unlock(&pool->stale_lock);
522 }
523
524 static void release_z3fold_page(struct kref *ref)
525 {
526         struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
527                                                 refcount);
528         __release_z3fold_page(zhdr, false);
529 }
530
531 static void release_z3fold_page_locked(struct kref *ref)
532 {
533         struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
534                                                 refcount);
535         WARN_ON(z3fold_page_trylock(zhdr));
536         __release_z3fold_page(zhdr, true);
537 }
538
539 static void release_z3fold_page_locked_list(struct kref *ref)
540 {
541         struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
542                                                refcount);
543         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
544
545         spin_lock(&pool->lock);
546         list_del_init(&zhdr->buddy);
547         spin_unlock(&pool->lock);
548
549         WARN_ON(z3fold_page_trylock(zhdr));
550         __release_z3fold_page(zhdr, true);
551 }
552
553 static void free_pages_work(struct work_struct *w)
554 {
555         struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
556
557         spin_lock(&pool->stale_lock);
558         while (!list_empty(&pool->stale)) {
559                 struct z3fold_header *zhdr = list_first_entry(&pool->stale,
560                                                 struct z3fold_header, buddy);
561                 struct page *page = virt_to_page(zhdr);
562
563                 list_del(&zhdr->buddy);
564                 if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
565                         continue;
566                 spin_unlock(&pool->stale_lock);
567                 cancel_work_sync(&zhdr->work);
568                 free_z3fold_page(page, false);
569                 cond_resched();
570                 spin_lock(&pool->stale_lock);
571         }
572         spin_unlock(&pool->stale_lock);
573 }
574
575 /*
576  * Returns the number of free chunks in a z3fold page.
577  * NB: can't be used with HEADLESS pages.
578  */
579 static int num_free_chunks(struct z3fold_header *zhdr)
580 {
581         int nfree;
582         /*
583          * If there is a middle object, pick up the bigger free space
584          * either before or after it. Otherwise just subtract the number
585          * of chunks occupied by the first and the last objects.
586          */
587         if (zhdr->middle_chunks != 0) {
588                 int nfree_before = zhdr->first_chunks ?
589                         0 : zhdr->start_middle - ZHDR_CHUNKS;
590                 int nfree_after = zhdr->last_chunks ?
591                         0 : TOTAL_CHUNKS -
592                                 (zhdr->start_middle + zhdr->middle_chunks);
593                 nfree = max(nfree_before, nfree_after);
594         } else
595                 nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
596         return nfree;
597 }
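/*
 * Illustrative example: with ZHDR_CHUNKS == 1, first_chunks == 0,
 * start_middle == 20, middle_chunks == 10 and last_chunks == 5, the gap in
 * front of the middle object is 20 - 1 = 19 chunks, while the space behind
 * it is not counted because the last object is in use, so num_free_chunks()
 * returns 19.
 */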
598
599 /* Add to the appropriate unbuddied list */
600 static inline void add_to_unbuddied(struct z3fold_pool *pool,
601                                 struct z3fold_header *zhdr)
602 {
603         if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
604                         zhdr->middle_chunks == 0) {
605                 struct list_head *unbuddied;
606                 int freechunks = num_free_chunks(zhdr);
607
608                 migrate_disable();
609                 unbuddied = this_cpu_ptr(pool->unbuddied);
610                 spin_lock(&pool->lock);
611                 list_add(&zhdr->buddy, &unbuddied[freechunks]);
612                 spin_unlock(&pool->lock);
613                 zhdr->cpu = smp_processor_id();
614                 migrate_enable();
615         }
616 }
617
618 static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
619 {
620         enum buddy bud = HEADLESS;
621
622         if (zhdr->middle_chunks) {
623                 if (!zhdr->first_chunks &&
624                     chunks <= zhdr->start_middle - ZHDR_CHUNKS)
625                         bud = FIRST;
626                 else if (!zhdr->last_chunks)
627                         bud = LAST;
628         } else {
629                 if (!zhdr->first_chunks)
630                         bud = FIRST;
631                 else if (!zhdr->last_chunks)
632                         bud = LAST;
633                 else
634                         bud = MIDDLE;
635         }
636
637         return bud;
638 }
639
640 static inline void *mchunk_memmove(struct z3fold_header *zhdr,
641                                 unsigned short dst_chunk)
642 {
643         void *beg = zhdr;
644         return memmove(beg + (dst_chunk << CHUNK_SHIFT),
645                        beg + (zhdr->start_middle << CHUNK_SHIFT),
646                        zhdr->middle_chunks << CHUNK_SHIFT);
647 }
648
649 static inline bool buddy_single(struct z3fold_header *zhdr)
650 {
651         return !((zhdr->first_chunks && zhdr->middle_chunks) ||
652                         (zhdr->first_chunks && zhdr->last_chunks) ||
653                         (zhdr->middle_chunks && zhdr->last_chunks));
654 }
655
656 static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
657 {
658         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
659         void *p = zhdr;
660         unsigned long old_handle = 0;
661         size_t sz = 0;
662         struct z3fold_header *new_zhdr = NULL;
663         int first_idx = __idx(zhdr, FIRST);
664         int middle_idx = __idx(zhdr, MIDDLE);
665         int last_idx = __idx(zhdr, LAST);
666         unsigned short *moved_chunks = NULL;
667
668         /*
669          * No need to protect slots here -- all the slots are "local" and
670          * the page lock is already taken
671          */
672         if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
673                 p += ZHDR_SIZE_ALIGNED;
674                 sz = zhdr->first_chunks << CHUNK_SHIFT;
675                 old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
676                 moved_chunks = &zhdr->first_chunks;
677         } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
678                 p += zhdr->start_middle << CHUNK_SHIFT;
679                 sz = zhdr->middle_chunks << CHUNK_SHIFT;
680                 old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
681                 moved_chunks = &zhdr->middle_chunks;
682         } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
683                 p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
684                 sz = zhdr->last_chunks << CHUNK_SHIFT;
685                 old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
686                 moved_chunks = &zhdr->last_chunks;
687         }
688
689         if (sz > 0) {
690                 enum buddy new_bud = HEADLESS;
691                 short chunks = size_to_chunks(sz);
692                 void *q;
693
694                 new_zhdr = __z3fold_alloc(pool, sz, false);
695                 if (!new_zhdr)
696                         return NULL;
697
698                 if (WARN_ON(new_zhdr == zhdr))
699                         goto out_fail;
700
701                 new_bud = get_free_buddy(new_zhdr, chunks);
702                 q = new_zhdr;
703                 switch (new_bud) {
704                 case FIRST:
705                         new_zhdr->first_chunks = chunks;
706                         q += ZHDR_SIZE_ALIGNED;
707                         break;
708                 case MIDDLE:
709                         new_zhdr->middle_chunks = chunks;
710                         new_zhdr->start_middle =
711                                 new_zhdr->first_chunks + ZHDR_CHUNKS;
712                         q += new_zhdr->start_middle << CHUNK_SHIFT;
713                         break;
714                 case LAST:
715                         new_zhdr->last_chunks = chunks;
716                         q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
717                         break;
718                 default:
719                         goto out_fail;
720                 }
721                 new_zhdr->foreign_handles++;
722                 memcpy(q, p, sz);
723                 write_lock(&zhdr->slots->lock);
724                 *(unsigned long *)old_handle = (unsigned long)new_zhdr +
725                         __idx(new_zhdr, new_bud);
726                 if (new_bud == LAST)
727                         *(unsigned long *)old_handle |=
728                                         (new_zhdr->last_chunks << BUDDY_SHIFT);
729                 write_unlock(&zhdr->slots->lock);
730                 add_to_unbuddied(pool, new_zhdr);
731                 z3fold_page_unlock(new_zhdr);
732
733                 *moved_chunks = 0;
734         }
735
736         return new_zhdr;
737
738 out_fail:
739         if (new_zhdr) {
740                 if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
741                         atomic64_dec(&pool->pages_nr);
742                 else {
743                         add_to_unbuddied(pool, new_zhdr);
744                         z3fold_page_unlock(new_zhdr);
745                 }
746         }
747         return NULL;
748
749 }
750
751 #define BIG_CHUNK_GAP   3
752 /* Has to be called with lock held */
753 static int z3fold_compact_page(struct z3fold_header *zhdr)
754 {
755         struct page *page = virt_to_page(zhdr);
756
757         if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
758                 return 0; /* can't move middle chunk, it's used */
759
760         if (unlikely(PageIsolated(page)))
761                 return 0;
762
763         if (zhdr->middle_chunks == 0)
764                 return 0; /* nothing to compact */
765
766         if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
767                 /* move to the beginning */
768                 mchunk_memmove(zhdr, ZHDR_CHUNKS);
769                 zhdr->first_chunks = zhdr->middle_chunks;
770                 zhdr->middle_chunks = 0;
771                 zhdr->start_middle = 0;
772                 zhdr->first_num++;
773                 return 1;
774         }
775
776         /*
777          * moving data is expensive, so let's only do that if
778          * there's substantial gain (at least BIG_CHUNK_GAP chunks)
779          */
780         if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
781             zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
782                         BIG_CHUNK_GAP) {
783                 mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
784                 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
785                 return 1;
786         } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
787                    TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
788                                         + zhdr->middle_chunks) >=
789                         BIG_CHUNK_GAP) {
790                 unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
791                         zhdr->middle_chunks;
792                 mchunk_memmove(zhdr, new_start);
793                 zhdr->start_middle = new_start;
794                 return 1;
795         }
796
797         return 0;
798 }
799
800 static void do_compact_page(struct z3fold_header *zhdr, bool locked)
801 {
802         struct z3fold_pool *pool = zhdr_to_pool(zhdr);
803         struct page *page;
804
805         page = virt_to_page(zhdr);
806         if (locked)
807                 WARN_ON(z3fold_page_trylock(zhdr));
808         else
809                 z3fold_page_lock(zhdr);
810         if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
811                 z3fold_page_unlock(zhdr);
812                 return;
813         }
814         spin_lock(&pool->lock);
815         list_del_init(&zhdr->buddy);
816         spin_unlock(&pool->lock);
817
818         if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
819                 atomic64_dec(&pool->pages_nr);
820                 return;
821         }
822
823         if (test_bit(PAGE_STALE, &page->private) ||
824             test_and_set_bit(PAGE_CLAIMED, &page->private)) {
825                 z3fold_page_unlock(zhdr);
826                 return;
827         }
828
829         if (!zhdr->foreign_handles && buddy_single(zhdr) &&
830             zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
831                 if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
832                         atomic64_dec(&pool->pages_nr);
833                 else {
834                         clear_bit(PAGE_CLAIMED, &page->private);
835                         z3fold_page_unlock(zhdr);
836                 }
837                 return;
838         }
839
840         z3fold_compact_page(zhdr);
841         add_to_unbuddied(pool, zhdr);
842         clear_bit(PAGE_CLAIMED, &page->private);
843         z3fold_page_unlock(zhdr);
844 }
845
846 static void compact_page_work(struct work_struct *w)
847 {
848         struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
849                                                 work);
850
851         do_compact_page(zhdr, false);
852 }
853
854 /* returns _locked_ z3fold page header or NULL */
855 static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
856                                                 size_t size, bool can_sleep)
857 {
858         struct z3fold_header *zhdr = NULL;
859         struct page *page;
860         struct list_head *unbuddied;
861         int chunks = size_to_chunks(size), i;
862
863 lookup:
864         migrate_disable();
865         /* First, try to find an unbuddied z3fold page. */
866         unbuddied = this_cpu_ptr(pool->unbuddied);
867         for_each_unbuddied_list(i, chunks) {
868                 struct list_head *l = &unbuddied[i];
869
870                 zhdr = list_first_entry_or_null(READ_ONCE(l),
871                                         struct z3fold_header, buddy);
872
873                 if (!zhdr)
874                         continue;
875
876                 /* Re-check under lock. */
877                 spin_lock(&pool->lock);
878                 l = &unbuddied[i];
879                 if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
880                                                 struct z3fold_header, buddy)) ||
881                     !z3fold_page_trylock(zhdr)) {
882                         spin_unlock(&pool->lock);
883                         zhdr = NULL;
884                         migrate_enable();
885                         if (can_sleep)
886                                 cond_resched();
887                         goto lookup;
888                 }
889                 list_del_init(&zhdr->buddy);
890                 zhdr->cpu = -1;
891                 spin_unlock(&pool->lock);
892
893                 page = virt_to_page(zhdr);
894                 if (test_bit(NEEDS_COMPACTING, &page->private) ||
895                     test_bit(PAGE_CLAIMED, &page->private)) {
896                         z3fold_page_unlock(zhdr);
897                         zhdr = NULL;
898                         migrate_enable();
899                         if (can_sleep)
900                                 cond_resched();
901                         goto lookup;
902                 }
903
904                 /*
905                  * This page could not be removed from its unbuddied list
906                  * while the pool lock was held, and we have since taken the
907                  * page lock, so kref_put() could not have been called before
908                  * we got here; it is thus safe to just call kref_get().
909                  */
910                 kref_get(&zhdr->refcount);
911                 break;
912         }
913         migrate_enable();
914
915         if (!zhdr) {
916                 int cpu;
917
918                 /* look for _exact_ match on other cpus' lists */
919                 for_each_online_cpu(cpu) {
920                         struct list_head *l;
921
922                         unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
923                         spin_lock(&pool->lock);
924                         l = &unbuddied[chunks];
925
926                         zhdr = list_first_entry_or_null(READ_ONCE(l),
927                                                 struct z3fold_header, buddy);
928
929                         if (!zhdr || !z3fold_page_trylock(zhdr)) {
930                                 spin_unlock(&pool->lock);
931                                 zhdr = NULL;
932                                 continue;
933                         }
934                         list_del_init(&zhdr->buddy);
935                         zhdr->cpu = -1;
936                         spin_unlock(&pool->lock);
937
938                         page = virt_to_page(zhdr);
939                         if (test_bit(NEEDS_COMPACTING, &page->private) ||
940                             test_bit(PAGE_CLAIMED, &page->private)) {
941                                 z3fold_page_unlock(zhdr);
942                                 zhdr = NULL;
943                                 if (can_sleep)
944                                         cond_resched();
945                                 continue;
946                         }
947                         kref_get(&zhdr->refcount);
948                         break;
949                 }
950         }
951
952         if (zhdr && !zhdr->slots)
953                 zhdr->slots = alloc_slots(pool,
954                                         can_sleep ? GFP_NOIO : GFP_ATOMIC);
955         return zhdr;
956 }
957
958 /*
959  * API Functions
960  */
961
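/*
 * Illustrative call sequence (a sketch only -- these functions are static
 * and are normally reached through the zpool API, as noted in the header
 * comment; "my_ops" and "size" are placeholders, and z3fold_unmap() is the
 * counterpart of z3fold_map() defined later in this file):
 *
 *	unsigned long handle;
 *	struct z3fold_pool *pool;
 *	void *addr;
 *
 *	pool = z3fold_create_pool("example", GFP_KERNEL, &my_ops);
 *	if (!pool)
 *		return -ENOMEM;
 *	if (z3fold_alloc(pool, size, GFP_KERNEL, &handle) == 0) {
 *		addr = z3fold_map(pool, handle);
 *		... work on the mapped object ...
 *		z3fold_unmap(pool, handle);
 *		z3fold_free(pool, handle);
 *	}
 *	z3fold_destroy_pool(pool);
 */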
962 /**
963  * z3fold_create_pool() - create a new z3fold pool
964  * @name:       pool name
965  * @gfp:        gfp flags when allocating the z3fold pool structure
966  * @ops:        user-defined operations for the z3fold pool
967  *
968  * Return: pointer to the new z3fold pool or NULL if the metadata allocation
969  * failed.
970  */
971 static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
972                 const struct z3fold_ops *ops)
973 {
974         struct z3fold_pool *pool = NULL;
975         int i, cpu;
976
977         pool = kzalloc(sizeof(struct z3fold_pool), gfp);
978         if (!pool)
979                 goto out;
980         pool->c_handle = kmem_cache_create("z3fold_handle",
981                                 sizeof(struct z3fold_buddy_slots),
982                                 SLOTS_ALIGN, 0, NULL);
983         if (!pool->c_handle)
984                 goto out_c;
985         spin_lock_init(&pool->lock);
986         spin_lock_init(&pool->stale_lock);
987         pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
988                                          __alignof__(struct list_head));
989         if (!pool->unbuddied)
990                 goto out_pool;
991         for_each_possible_cpu(cpu) {
992                 struct list_head *unbuddied =
993                                 per_cpu_ptr(pool->unbuddied, cpu);
994                 for_each_unbuddied_list(i, 0)
995                         INIT_LIST_HEAD(&unbuddied[i]);
996         }
997         INIT_LIST_HEAD(&pool->lru);
998         INIT_LIST_HEAD(&pool->stale);
999         atomic64_set(&pool->pages_nr, 0);
1000         pool->name = name;
1001         pool->compact_wq = create_singlethread_workqueue(pool->name);
1002         if (!pool->compact_wq)
1003                 goto out_unbuddied;
1004         pool->release_wq = create_singlethread_workqueue(pool->name);
1005         if (!pool->release_wq)
1006                 goto out_wq;
1007         if (z3fold_register_migration(pool))
1008                 goto out_rwq;
1009         INIT_WORK(&pool->work, free_pages_work);
1010         pool->ops = ops;
1011         return pool;
1012
1013 out_rwq:
1014         destroy_workqueue(pool->release_wq);
1015 out_wq:
1016         destroy_workqueue(pool->compact_wq);
1017 out_unbuddied:
1018         free_percpu(pool->unbuddied);
1019 out_pool:
1020         kmem_cache_destroy(pool->c_handle);
1021 out_c:
1022         kfree(pool);
1023 out:
1024         return NULL;
1025 }
1026
1027 /**
1028  * z3fold_destroy_pool() - destroys an existing z3fold pool
1029  * @pool:       the z3fold pool to be destroyed
1030  *
1031  * The pool should be emptied before this function is called.
1032  */
1033 static void z3fold_destroy_pool(struct z3fold_pool *pool)
1034 {
1035         kmem_cache_destroy(pool->c_handle);
1036
1037         /*
1038          * We need to destroy pool->compact_wq before pool->release_wq,
1039          * as any pending work on pool->compact_wq will call
1040          * queue_work(pool->release_wq, &pool->work).
1041          *
1042          * There are still outstanding pages until both workqueues are drained,
1043          * so we cannot unregister migration until then.
1044          */
1045
1046         destroy_workqueue(pool->compact_wq);
1047         destroy_workqueue(pool->release_wq);
1048         z3fold_unregister_migration(pool);
1049         kfree(pool);
1050 }
1051
1052 /**
1053  * z3fold_alloc() - allocates a region of a given size
1054  * @pool:       z3fold pool from which to allocate
1055  * @size:       size in bytes of the desired allocation
1056  * @gfp:        gfp flags used if the pool needs to grow
1057  * @handle:     handle of the new allocation
1058  *
1059  * This function will attempt to find a free region in the pool large enough to
1060  * satisfy the allocation request.  A search of the unbuddied lists is
1061  * performed first. If no suitable free region is found, then a new page is
1062  * allocated and added to the pool to satisfy the request.
1063  *
1064  * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
1065  * as z3fold pool pages.
1066  *
1067  * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
1068  * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
1069  * a new page.
1070  */
1071 static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
1072                         unsigned long *handle)
1073 {
1074         int chunks = size_to_chunks(size);
1075         struct z3fold_header *zhdr = NULL;
1076         struct page *page = NULL;
1077         enum buddy bud;
1078         bool can_sleep = gfpflags_allow_blocking(gfp);
1079
1080         if (!size)
1081                 return -EINVAL;
1082
1083         if (size > PAGE_SIZE)
1084                 return -ENOSPC;
1085
1086         if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
1087                 bud = HEADLESS;
1088         else {
1089 retry:
1090                 zhdr = __z3fold_alloc(pool, size, can_sleep);
1091                 if (zhdr) {
1092                         bud = get_free_buddy(zhdr, chunks);
1093                         if (bud == HEADLESS) {
1094                                 if (kref_put(&zhdr->refcount,
1095                                              release_z3fold_page_locked))
1096                                         atomic64_dec(&pool->pages_nr);
1097                                 else
1098                                         z3fold_page_unlock(zhdr);
1099                                 pr_err("No free chunks in unbuddied\n");
1100                                 WARN_ON(1);
1101                                 goto retry;
1102                         }
1103                         page = virt_to_page(zhdr);
1104                         goto found;
1105                 }
1106                 bud = FIRST;
1107         }
1108
1109         page = NULL;
1110         if (can_sleep) {
1111                 spin_lock(&pool->stale_lock);
1112                 zhdr = list_first_entry_or_null(&pool->stale,
1113                                                 struct z3fold_header, buddy);
1114                 /*
1115                  * Before allocating a page, let's see if we can take one from
1116                  * the stale pages list. cancel_work_sync() can sleep so we
1117                  * limit this case to the contexts where we can sleep
1118                  */
1119                 if (zhdr) {
1120                         list_del(&zhdr->buddy);
1121                         spin_unlock(&pool->stale_lock);
1122                         cancel_work_sync(&zhdr->work);
1123                         page = virt_to_page(zhdr);
1124                 } else {
1125                         spin_unlock(&pool->stale_lock);
1126                 }
1127         }
1128         if (!page)
1129                 page = alloc_page(gfp);
1130
1131         if (!page)
1132                 return -ENOMEM;
1133
1134         zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
1135         if (!zhdr) {
1136                 __free_page(page);
1137                 return -ENOMEM;
1138         }
1139         atomic64_inc(&pool->pages_nr);
1140
1141         if (bud == HEADLESS) {
1142                 set_bit(PAGE_HEADLESS, &page->private);
1143                 goto headless;
1144         }
1145         if (can_sleep) {
1146                 lock_page(page);
1147                 __SetPageMovable(page, pool->inode->i_mapping);
1148                 unlock_page(page);
1149         } else {
1150                 if (trylock_page(page)) {
1151                         __SetPageMovable(page, pool->inode->i_mapping);
1152                         unlock_page(page);
1153                 }
1154         }
1155         z3fold_page_lock(zhdr);
1156
1157 found:
1158         if (bud == FIRST)
1159                 zhdr->first_chunks = chunks;
1160         else if (bud == LAST)
1161                 zhdr->last_chunks = chunks;
1162         else {
1163                 zhdr->middle_chunks = chunks;
1164                 zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
1165         }
1166         add_to_unbuddied(pool, zhdr);
1167
1168 headless:
1169         spin_lock(&pool->lock);
1170         /* Add/move z3fold page to beginning of LRU */
1171         if (!list_empty(&page->lru))
1172                 list_del(&page->lru);
1173
1174         list_add(&page->lru, &pool->lru);
1175
1176         *handle = encode_handle(zhdr, bud);
1177         spin_unlock(&pool->lock);
1178         if (bud != HEADLESS)
1179                 z3fold_page_unlock(zhdr);
1180
1181         return 0;
1182 }
1183
1184 /**
1185  * z3fold_free() - frees the allocation associated with the given handle
1186  * @pool:       pool in which the allocation resided
1187  * @handle:     handle associated with the allocation returned by z3fold_alloc()
1188  *
1189  * In the case that the z3fold page in which the allocation resides is under
1190  * reclaim, as indicated by the PG_reclaim flag being set, this function
1191  * only sets the first|last_chunks to 0.  The page is actually freed
1192  * once both buddies are evicted (see z3fold_reclaim_page() below).
1193  */
1194 static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
1195 {
1196         struct z3fold_header *zhdr;
1197         struct page *page;
1198         enum buddy bud;
1199         bool page_claimed;
1200
1201         zhdr = get_z3fold_header(handle);
1202         page = virt_to_page(zhdr);
1203         page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
1204
1205         if (test_bit(PAGE_HEADLESS, &page->private)) {
1206                 /* if a headless page is under reclaim, just leave.
1207                  * NB: we use test_and_set_bit for a reason: if the bit
1208                  * has not been set before, we release this page
1209                  * immediately so we don't care about its value any more.
1210                  */
1211                 if (!page_claimed) {
1212                         spin_lock(&pool->lock);
1213                         list_del(&page->lru);
1214                         spin_unlock(&pool->lock);
1215                         put_z3fold_header(zhdr);
1216                         free_z3fold_page(page, true);
1217                         atomic64_dec(&pool->pages_nr);
1218                 }
1219                 return;
1220         }
1221
1222         /* Non-headless case */
1223         bud = handle_to_buddy(handle);
1224
1225         switch (bud) {
1226         case FIRST:
1227                 zhdr->first_chunks = 0;
1228                 break;
1229         case MIDDLE:
1230                 zhdr->middle_chunks = 0;
1231                 break;
1232         case LAST:
1233                 zhdr->last_chunks = 0;
1234                 break;
1235         default:
1236                 pr_err("%s: unknown bud %d\n", __func__, bud);
1237                 WARN_ON(1);
1238                 put_z3fold_header(zhdr);
1239                 return;
1240         }
1241
1242         if (!page_claimed)
1243                 free_handle(handle, zhdr);
1244         if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
1245                 atomic64_dec(&pool->pages_nr);
1246                 return;
1247         }
1248         if (page_claimed) {
1249                 /* the page has not been claimed by us */
1250                 z3fold_page_unlock(zhdr);
1251                 return;
1252         }
1253         if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
1254                 put_z3fold_header(zhdr);
1255                 clear_bit(PAGE_CLAIMED, &page->private);
1256                 return;
1257         }
1258         if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
1259                 spin_lock(&pool->lock);
1260                 list_del_init(&zhdr->buddy);
1261                 spin_unlock(&pool->lock);
1262                 zhdr->cpu = -1;
1263                 kref_get(&zhdr->refcount);
1264                 clear_bit(PAGE_CLAIMED, &page->private);
1265                 do_compact_page(zhdr, true);
1266                 return;
1267         }
1268         kref_get(&zhdr->refcount);
1269         clear_bit(PAGE_CLAIMED, &page->private);
1270         queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
1271         put_z3fold_header(zhdr);
1272 }
1273
1274 /**
1275  * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
1276  * @pool:       pool from which a page will attempt to be evicted
1277  * @retries:    number of pages on the LRU list for which eviction will
1278  *              be attempted before failing
1279  *
1280  * z3fold reclaim is different from normal system reclaim in that it is done
1281  * from the bottom, up. This is because only the bottom layer, z3fold, has
1282  * information on how the allocations are organized within each z3fold page.
1283  * This has the potential to create interesting locking situations between
1284  * z3fold and the user, however.
1285  *
1286  * To avoid these, this is how z3fold_reclaim_page() should be called:
1287  *
1288  * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
1289  * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
1290  * call the user-defined eviction handler with the pool and handle as
1291  * arguments.
1292  *
1293  * If the handle can not be evicted, the eviction handler should return
1294  * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
1295  * appropriate list and try the next z3fold page on the LRU up to
1296  * a user defined number of retries.
1297  *
1298  * If the handle is successfully evicted, the eviction handler should
1299  * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
1300  * contains logic to delay freeing the page if the page is under reclaim,
1301  * as indicated by the setting of the PG_reclaim flag on the underlying page.
1302  *
1303  * If all buddies in the z3fold page are successfully evicted, then the
1304  * z3fold page can be freed.
1305  *
1306  * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
1307  * no pages to evict or an eviction handler is not registered, -EAGAIN if
1308  * the retry limit was hit.
1309  */
1310 static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
1311 {
1312         int i, ret = -1;
1313         struct z3fold_header *zhdr = NULL;
1314         struct page *page = NULL;
1315         struct list_head *pos;
1316         unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
1317         struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
1318
1319         rwlock_init(&slots.lock);
1320         slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
1321
1322         spin_lock(&pool->lock);
1323         if (!pool->ops || !pool->ops->evict || retries == 0) {
1324                 spin_unlock(&pool->lock);
1325                 return -EINVAL;
1326         }
1327         for (i = 0; i < retries; i++) {
1328                 if (list_empty(&pool->lru)) {
1329                         spin_unlock(&pool->lock);
1330                         return -EINVAL;
1331                 }
1332                 list_for_each_prev(pos, &pool->lru) {
1333                         page = list_entry(pos, struct page, lru);
1334
1335                         zhdr = page_address(page);
1336                         if (test_bit(PAGE_HEADLESS, &page->private)) {
1337                                 /*
1338                                  * For non-headless pages, we wait to do this
1339                                  * until we have the page lock to avoid racing
1340                                  * with __z3fold_alloc(). Headless pages don't
1341                                  * have a lock (and __z3fold_alloc() will never
1342                                  * see them), but we still need to test and set
1343                                  * PAGE_CLAIMED to avoid racing with
1344                                  * z3fold_free(), so just do it now before
1345                                  * leaving the loop.
1346                                  */
1347                                 if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1348                                         continue;
1349
1350                                 break;
1351                         }
1352
1353                         if (kref_get_unless_zero(&zhdr->refcount) == 0) {
1354                                 zhdr = NULL;
1355                                 break;
1356                         }
1357                         if (!z3fold_page_trylock(zhdr)) {
1358                                 if (kref_put(&zhdr->refcount,
1359                                                 release_z3fold_page))
1360                                         atomic64_dec(&pool->pages_nr);
1361                                 zhdr = NULL;
1362                                 continue; /* can't evict at this point */
1363                         }
1364
1365                         /* test_and_set_bit is of course atomic, but we still
1366                          * need to do it under page lock, otherwise checking
1367                          * that bit in __z3fold_alloc wouldn't make sense
1368                          */
1369                         if (zhdr->foreign_handles ||
1370                             test_and_set_bit(PAGE_CLAIMED, &page->private)) {
1371                                 if (kref_put(&zhdr->refcount,
1372                                                 release_z3fold_page))
1373                                         atomic64_dec(&pool->pages_nr);
1374                                 else
1375                                         z3fold_page_unlock(zhdr);
1376                                 zhdr = NULL;
1377                                 continue; /* can't evict such page */
1378                         }
1379                         list_del_init(&zhdr->buddy);
1380                         zhdr->cpu = -1;
1381                         break;
1382                 }
1383
1384                 if (!zhdr)
1385                         break;
1386
1387                 list_del_init(&page->lru);
1388                 spin_unlock(&pool->lock);
1389
1390                 if (!test_bit(PAGE_HEADLESS, &page->private)) {
1391                         /*
1392                          * We need to encode the handles before unlocking, and
1393                          * use our local slots structure because z3fold_free
1394                          * can zero out zhdr->slots and we can't do much
1395                          * about that
1396                          */
1397                         first_handle = 0;
1398                         last_handle = 0;
1399                         middle_handle = 0;
1400                         memset(slots.slot, 0, sizeof(slots.slot));
1401                         if (zhdr->first_chunks)
1402                                 first_handle = __encode_handle(zhdr, &slots,
1403                                                                 FIRST);
1404                         if (zhdr->middle_chunks)
1405                                 middle_handle = __encode_handle(zhdr, &slots,
1406                                                                 MIDDLE);
1407                         if (zhdr->last_chunks)
1408                                 last_handle = __encode_handle(zhdr, &slots,
1409                                                                 LAST);
1410                         /*
1411                          * it's safe to unlock here because we hold a
1412                          * reference to this page
1413                          */
1414                         z3fold_page_unlock(zhdr);
1415                 } else {
1416                         first_handle = encode_handle(zhdr, HEADLESS);
1417                         last_handle = middle_handle = 0;
1418                 }
1419                 /* Issue the eviction callback(s) */
1420                 if (middle_handle) {
1421                         ret = pool->ops->evict(pool, middle_handle);
1422                         if (ret)
1423                                 goto next;
1424                 }
1425                 if (first_handle) {
1426                         ret = pool->ops->evict(pool, first_handle);
1427                         if (ret)
1428                                 goto next;
1429                 }
1430                 if (last_handle) {
1431                         ret = pool->ops->evict(pool, last_handle);
1432                         if (ret)
1433                                 goto next;
1434                 }
1435 next:
1436                 if (test_bit(PAGE_HEADLESS, &page->private)) {
1437                         if (ret == 0) {
1438                                 free_z3fold_page(page, true);
1439                                 atomic64_dec(&pool->pages_nr);
1440                                 return 0;
1441                         }
1442                         spin_lock(&pool->lock);
1443                         list_add(&page->lru, &pool->lru);
1444                         spin_unlock(&pool->lock);
1445                         clear_bit(PAGE_CLAIMED, &page->private);
1446                 } else {
1447                         struct z3fold_buddy_slots *slots = zhdr->slots;
1448                         z3fold_page_lock(zhdr);
1449                         if (kref_put(&zhdr->refcount,
1450                                         release_z3fold_page_locked)) {
1451                                 kmem_cache_free(pool->c_handle, slots);
1452                                 atomic64_dec(&pool->pages_nr);
1453                                 return 0;
1454                         }
1455                         /*
1456                          * If we are here, the page is still not completely
1457                          * free. Take the global pool lock then, so that we
1458                          * can add it back to the LRU list.
1459                          */
1460                         spin_lock(&pool->lock);
1461                         list_add(&page->lru, &pool->lru);
1462                         spin_unlock(&pool->lock);
1463                         z3fold_page_unlock(zhdr);
1464                         clear_bit(PAGE_CLAIMED, &page->private);
1465                 }
1466
1467                 /* We started off locked so we need to lock the pool back */
1468                 spin_lock(&pool->lock);
1469         }
1470         spin_unlock(&pool->lock);
1471         return -EAGAIN;
1472 }
1473
1474 /**
1475  * z3fold_map() - maps the allocation associated with the given handle
1476  * @pool:       pool in which the allocation resides
1477  * @handle:     handle associated with the allocation to be mapped
1478  *
1479  * Extracts the buddy number from handle and constructs the pointer to the
1480  * correct starting chunk within the page.
1481  *
1482  * Returns: a pointer to the mapped allocation
1483  */
1484 static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
1485 {
1486         struct z3fold_header *zhdr;
1487         struct page *page;
1488         void *addr;
1489         enum buddy buddy;
1490
1491         zhdr = get_z3fold_header(handle);
1492         addr = zhdr;
1493         page = virt_to_page(zhdr);
1494
1495         if (test_bit(PAGE_HEADLESS, &page->private))
1496                 goto out;
1497
1498         buddy = handle_to_buddy(handle);
1499         switch (buddy) {
1500         case FIRST:
1501                 addr += ZHDR_SIZE_ALIGNED;
1502                 break;
1503         case MIDDLE:
1504                 addr += zhdr->start_middle << CHUNK_SHIFT;
1505                 set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1506                 break;
1507         case LAST:
1508                 addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
1509                 break;
1510         default:
1511                 pr_err("unknown buddy id %d\n", buddy);
1512                 WARN_ON(1);
1513                 addr = NULL;
1514                 break;
1515         }
1516
1517         if (addr)
1518                 zhdr->mapped_count++;
1519 out:
1520         put_z3fold_header(zhdr);
1521         return addr;
1522 }
1523
1524 /**
1525  * z3fold_unmap() - unmaps the allocation associated with the given handle
1526  * @pool:       pool in which the allocation resides
1527  * @handle:     handle associated with the allocation to be unmapped
1528  */
1529 static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
1530 {
1531         struct z3fold_header *zhdr;
1532         struct page *page;
1533         enum buddy buddy;
1534
1535         zhdr = get_z3fold_header(handle);
1536         page = virt_to_page(zhdr);
1537
1538         if (test_bit(PAGE_HEADLESS, &page->private))
1539                 return;
1540
1541         buddy = handle_to_buddy(handle);
1542         if (buddy == MIDDLE)
1543                 clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1544         zhdr->mapped_count--;
1545         put_z3fold_header(zhdr);
1546 }
1547
1548 /**
1549  * z3fold_get_pool_size() - gets the z3fold pool size in pages
1550  * @pool:       pool whose size is being queried
1551  *
1552  * Returns: size in pages of the given pool.
1553  */
1554 static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
1555 {
1556         return atomic64_read(&pool->pages_nr);
1557 }
1558
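/*
 * z3fold_page_isolate() - page migration callback; tries to isolate a z3fold
 * page for migration. Headless pages, pages that are stale or pending
 * compaction, pages that are mapped or hold foreign handles, and pages that
 * are already claimed are refused; otherwise the page is taken off the buddy
 * and LRU lists and an extra reference is held while migration is in flight.
 */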
1559 static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1560 {
1561         struct z3fold_header *zhdr;
1562         struct z3fold_pool *pool;
1563
1564         VM_BUG_ON_PAGE(!PageMovable(page), page);
1565         VM_BUG_ON_PAGE(PageIsolated(page), page);
1566
1567         if (test_bit(PAGE_HEADLESS, &page->private))
1568                 return false;
1569
1570         zhdr = page_address(page);
1571         z3fold_page_lock(zhdr);
1572         if (test_bit(NEEDS_COMPACTING, &page->private) ||
1573             test_bit(PAGE_STALE, &page->private))
1574                 goto out;
1575
1576         if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
1577                 goto out;
1578
1579         if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1580                 goto out;
1581         pool = zhdr_to_pool(zhdr);
1582         spin_lock(&pool->lock);
1583         if (!list_empty(&zhdr->buddy))
1584                 list_del_init(&zhdr->buddy);
1585         if (!list_empty(&page->lru))
1586                 list_del_init(&page->lru);
1587         spin_unlock(&pool->lock);
1588
1589         kref_get(&zhdr->refcount);
1590         z3fold_page_unlock(zhdr);
1591         return true;
1592
1593 out:
1594         z3fold_page_unlock(zhdr);
1595         return false;
1596 }
1597
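/*
 * z3fold_page_migrate() - page migration callback; copies the contents of an
 * isolated z3fold page into @newpage, re-initializes the new header's lock,
 * work item and buddy list, re-encodes the handles, marks the new page for
 * compaction and releases the old one. Returns -EBUSY or -EAGAIN if the page
 * cannot be migrated at this point.
 */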
1598 static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
1599                                struct page *page, enum migrate_mode mode)
1600 {
1601         struct z3fold_header *zhdr, *new_zhdr;
1602         struct z3fold_pool *pool;
1603         struct address_space *new_mapping;
1604
1605         VM_BUG_ON_PAGE(!PageMovable(page), page);
1606         VM_BUG_ON_PAGE(!PageIsolated(page), page);
1607         VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
1608         VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
1609
1610         zhdr = page_address(page);
1611         pool = zhdr_to_pool(zhdr);
1612
1613         if (!z3fold_page_trylock(zhdr))
1614                 return -EAGAIN;
1615         if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
1616                 z3fold_page_unlock(zhdr);
1617                 clear_bit(PAGE_CLAIMED, &page->private);
1618                 return -EBUSY;
1619         }
1620         if (work_pending(&zhdr->work)) {
1621                 z3fold_page_unlock(zhdr);
1622                 return -EAGAIN;
1623         }
1624         new_zhdr = page_address(newpage);
1625         memcpy(new_zhdr, zhdr, PAGE_SIZE);
1626         newpage->private = page->private;
1627         page->private = 0;
1628         z3fold_page_unlock(zhdr);
1629         spin_lock_init(&new_zhdr->page_lock);
1630         INIT_WORK(&new_zhdr->work, compact_page_work);
1631         /*
1632          * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
1633          * so we only have to reinitialize it.
1634          */
1635         INIT_LIST_HEAD(&new_zhdr->buddy);
1636         new_mapping = page_mapping(page);
1637         __ClearPageMovable(page);
1638         ClearPagePrivate(page);
1639
1640         get_page(newpage);
1641         z3fold_page_lock(new_zhdr);
1642         if (new_zhdr->first_chunks)
1643                 encode_handle(new_zhdr, FIRST);
1644         if (new_zhdr->last_chunks)
1645                 encode_handle(new_zhdr, LAST);
1646         if (new_zhdr->middle_chunks)
1647                 encode_handle(new_zhdr, MIDDLE);
1648         set_bit(NEEDS_COMPACTING, &newpage->private);
1649         new_zhdr->cpu = smp_processor_id();
1650         spin_lock(&pool->lock);
1651         list_add(&newpage->lru, &pool->lru);
1652         spin_unlock(&pool->lock);
1653         __SetPageMovable(newpage, new_mapping);
1654         z3fold_page_unlock(new_zhdr);
1655
1656         queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
1657
1658         page_mapcount_reset(page);
1659         clear_bit(PAGE_CLAIMED, &page->private);
1660         put_page(page);
1661         return 0;
1662 }
1663
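/*
 * z3fold_page_putback() - page migration callback; puts a page whose
 * migration failed back onto the pool's LRU list and drops the reference
 * taken in z3fold_page_isolate().
 */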
1664 static void z3fold_page_putback(struct page *page)
1665 {
1666         struct z3fold_header *zhdr;
1667         struct z3fold_pool *pool;
1668
1669         zhdr = page_address(page);
1670         pool = zhdr_to_pool(zhdr);
1671
1672         z3fold_page_lock(zhdr);
1673         if (!list_empty(&zhdr->buddy))
1674                 list_del_init(&zhdr->buddy);
1675         INIT_LIST_HEAD(&page->lru);
1676         if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
1677                 atomic64_dec(&pool->pages_nr);
1678                 return;
1679         }
1680         spin_lock(&pool->lock);
1681         list_add(&page->lru, &pool->lru);
1682         spin_unlock(&pool->lock);
1683         clear_bit(PAGE_CLAIMED, &page->private);
1684         z3fold_page_unlock(zhdr);
1685 }
1686
1687 static const struct address_space_operations z3fold_aops = {
1688         .isolate_page = z3fold_page_isolate,
1689         .migratepage = z3fold_page_migrate,
1690         .putback_page = z3fold_page_putback,
1691 };
1692
1693 /*****************
1694  * zpool
1695  ****************/
1696
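/*
 * The functions below are thin wrappers adapting the generic zpool API to
 * the z3fold pool API above and forwarding eviction callbacks to zpool.
 */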
1697 static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
1698 {
1699         if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
1700                 return pool->zpool_ops->evict(pool->zpool, handle);
1701         else
1702                 return -ENOENT;
1703 }
1704
1705 static const struct z3fold_ops z3fold_zpool_ops = {
1706         .evict =        z3fold_zpool_evict
1707 };
1708
1709 static void *z3fold_zpool_create(const char *name, gfp_t gfp,
1710                                const struct zpool_ops *zpool_ops,
1711                                struct zpool *zpool)
1712 {
1713         struct z3fold_pool *pool;
1714
1715         pool = z3fold_create_pool(name, gfp,
1716                                 zpool_ops ? &z3fold_zpool_ops : NULL);
1717         if (pool) {
1718                 pool->zpool = zpool;
1719                 pool->zpool_ops = zpool_ops;
1720         }
1721         return pool;
1722 }
1723
1724 static void z3fold_zpool_destroy(void *pool)
1725 {
1726         z3fold_destroy_pool(pool);
1727 }
1728
1729 static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
1730                         unsigned long *handle)
1731 {
1732         return z3fold_alloc(pool, size, gfp, handle);
1733 }
1734 static void z3fold_zpool_free(void *pool, unsigned long handle)
1735 {
1736         z3fold_free(pool, handle);
1737 }
1738
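/*
 * zpool shrink callback: tries to reclaim up to @pages pages, one at a time,
 * via z3fold_reclaim_page() and reports the number actually reclaimed back
 * through @reclaimed.
 */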
1739 static int z3fold_zpool_shrink(void *pool, unsigned int pages,
1740                         unsigned int *reclaimed)
1741 {
1742         unsigned int total = 0;
1743         int ret = -EINVAL;
1744
1745         while (total < pages) {
1746                 ret = z3fold_reclaim_page(pool, 8);
1747                 if (ret < 0)
1748                         break;
1749                 total++;
1750         }
1751
1752         if (reclaimed)
1753                 *reclaimed = total;
1754
1755         return ret;
1756 }
1757
1758 static void *z3fold_zpool_map(void *pool, unsigned long handle,
1759                         enum zpool_mapmode mm)
1760 {
1761         return z3fold_map(pool, handle);
1762 }
1763 static void z3fold_zpool_unmap(void *pool, unsigned long handle)
1764 {
1765         z3fold_unmap(pool, handle);
1766 }
1767
1768 static u64 z3fold_zpool_total_size(void *pool)
1769 {
1770         return z3fold_get_pool_size(pool) * PAGE_SIZE;
1771 }
1772
1773 static struct zpool_driver z3fold_zpool_driver = {
1774         .type =         "z3fold",
1775         .sleep_mapped = true,
1776         .owner =        THIS_MODULE,
1777         .create =       z3fold_zpool_create,
1778         .destroy =      z3fold_zpool_destroy,
1779         .malloc =       z3fold_zpool_malloc,
1780         .free =         z3fold_zpool_free,
1781         .shrink =       z3fold_zpool_shrink,
1782         .map =          z3fold_zpool_map,
1783         .unmap =        z3fold_zpool_unmap,
1784         .total_size =   z3fold_zpool_total_size,
1785 };
1786
1787 MODULE_ALIAS("zpool-z3fold");
1788
1789 static int __init init_z3fold(void)
1790 {
1791         int ret;
1792
1793         /*
1794          * Make sure the z3fold header is not larger than the page size and
1795          * there is still space left for at least one buddy.
1796          */
1797         BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
1798         ret = z3fold_mount();
1799         if (ret)
1800                 return ret;
1801
1802         zpool_register_driver(&z3fold_zpool_driver);
1803
1804         return 0;
1805 }
1806
1807 static void __exit exit_z3fold(void)
1808 {
1809         z3fold_unmount();
1810         zpool_unregister_driver(&z3fold_zpool_driver);
1811 }
1812
1813 module_init(init_z3fold);
1814 module_exit(exit_z3fold);
1815
1816 MODULE_LICENSE("GPL");
1817 MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
1818 MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");