1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * zswap.c - zswap driver file
4  *
5  * zswap is a cache that takes pages that are in the process
6  * of being swapped out and attempts to compress and store them in a
7  * RAM-based memory pool.  This can result in a significant I/O reduction on
8  * the swap device and, in the case where decompressing from RAM is faster
9  * than reading from the swap device, can also improve workload performance.
10  *
11  * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
12 */
13
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15
16 #include <linux/module.h>
17 #include <linux/cpu.h>
18 #include <linux/highmem.h>
19 #include <linux/slab.h>
20 #include <linux/spinlock.h>
21 #include <linux/types.h>
22 #include <linux/atomic.h>
23 #include <linux/rbtree.h>
24 #include <linux/swap.h>
25 #include <linux/crypto.h>
26 #include <linux/scatterlist.h>
27 #include <linux/mempolicy.h>
28 #include <linux/mempool.h>
29 #include <linux/zpool.h>
30 #include <crypto/acompress.h>
31 #include <linux/zswap.h>
32 #include <linux/mm_types.h>
33 #include <linux/page-flags.h>
34 #include <linux/swapops.h>
35 #include <linux/writeback.h>
36 #include <linux/pagemap.h>
37 #include <linux/workqueue.h>
38
39 #include "swap.h"
40 #include "internal.h"
41
42 /*********************************
43 * statistics
44 **********************************/
45 /* Total bytes used by the compressed storage */
46 u64 zswap_pool_total_size;
47 /* The number of compressed pages currently stored in zswap */
48 atomic_t zswap_stored_pages = ATOMIC_INIT(0);
49 /* The number of same-value filled pages currently stored in zswap */
50 static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
51
52 /*
53  * The statistics below are not protected from concurrent access for
54  * performance reasons so they may not be 100% accurate.  However,
55  * they do provide useful information on roughly how many times a
56  * certain event is occurring.
57 */
58
59 /* Pool limit was hit (see zswap_max_pool_percent) */
60 static u64 zswap_pool_limit_hit;
61 /* Pages written back when pool limit was reached */
62 static u64 zswap_written_back_pages;
63 /* Store failed due to a reclaim failure after pool limit was reached */
64 static u64 zswap_reject_reclaim_fail;
65 /* Store failed due to compression algorithm failure */
66 static u64 zswap_reject_compress_fail;
67 /* Compressed page was too big for the allocator to (optimally) store */
68 static u64 zswap_reject_compress_poor;
69 /* Store failed because underlying allocator could not get memory */
70 static u64 zswap_reject_alloc_fail;
71 /* Store failed because the entry metadata could not be allocated (rare) */
72 static u64 zswap_reject_kmemcache_fail;
73 /* Duplicate store was encountered (rare) */
74 static u64 zswap_duplicate_entry;
75
76 /* Shrinker work queue */
77 static struct workqueue_struct *shrink_wq;
78 /* Pool limit was hit, we need to calm down */
79 static bool zswap_pool_reached_full;
80
81 /*********************************
82 * tunables
83 **********************************/
84
85 #define ZSWAP_PARAM_UNSET ""
86
87 static int zswap_setup(void);
88
89 /* Enable/disable zswap */
90 static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
91 static int zswap_enabled_param_set(const char *,
92                                    const struct kernel_param *);
93 static const struct kernel_param_ops zswap_enabled_param_ops = {
94         .set =          zswap_enabled_param_set,
95         .get =          param_get_bool,
96 };
97 module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
98
99 /* Crypto compressor to use */
100 static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
101 static int zswap_compressor_param_set(const char *,
102                                       const struct kernel_param *);
103 static const struct kernel_param_ops zswap_compressor_param_ops = {
104         .set =          zswap_compressor_param_set,
105         .get =          param_get_charp,
106         .free =         param_free_charp,
107 };
108 module_param_cb(compressor, &zswap_compressor_param_ops,
109                 &zswap_compressor, 0644);
110
111 /* Compressed storage zpool to use */
112 static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
113 static int zswap_zpool_param_set(const char *, const struct kernel_param *);
114 static const struct kernel_param_ops zswap_zpool_param_ops = {
115         .set =          zswap_zpool_param_set,
116         .get =          param_get_charp,
117         .free =         param_free_charp,
118 };
119 module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
120
121 /* The maximum percentage of memory that the compressed pool can occupy */
122 static unsigned int zswap_max_pool_percent = 20;
123 module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
124
125 /* The threshold for accepting new pages after the max_pool_percent was hit */
126 static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
127 module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
128                    uint, 0644);
129
130 /*
131  * Enable/disable handling same-value filled pages (enabled by default).
132  * If disabled, every page is considered non-same-value filled.
133  */
134 static bool zswap_same_filled_pages_enabled = true;
135 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
136                    bool, 0644);
137
138 /* Enable/disable handling non-same-value filled pages (enabled by default) */
139 static bool zswap_non_same_filled_pages_enabled = true;
140 module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
141                    bool, 0644);
142
143 static bool zswap_exclusive_loads_enabled = IS_ENABLED(
144                 CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
145 module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
146
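/*
 * Illustrative usage sketch (not part of the driver): the tunables above are
 * standard module parameters, so they are normally exposed under
 * /sys/module/zswap/parameters/ and can be changed at runtime, e.g.:
 *
 *   echo Y    > /sys/module/zswap/parameters/enabled
 *   echo zstd > /sys/module/zswap/parameters/compressor
 *   echo 25   > /sys/module/zswap/parameters/max_pool_percent
 *
 * Which compressor and zpool names are accepted depends on the kernel
 * configuration.
 */
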
147 /* Number of zpools in zswap_pool (empirically determined for scalability) */
148 #define ZSWAP_NR_ZPOOLS 32
149
150 /*********************************
151 * data structures
152 **********************************/
153
154 struct crypto_acomp_ctx {
155         struct crypto_acomp *acomp;
156         struct acomp_req *req;
157         struct crypto_wait wait;
158         u8 *dstmem;
159         struct mutex *mutex;
160 };
161
162 /*
163  * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
164  * The only case where lru_lock is not acquired while holding tree.lock is
165  * when a zswap_entry is taken off the lru for writeback; in that case it
166  * needs to be verified that it's still valid in the tree.
167  */
168 struct zswap_pool {
169         struct zpool *zpools[ZSWAP_NR_ZPOOLS];
170         struct crypto_acomp_ctx __percpu *acomp_ctx;
171         struct kref kref;
172         struct list_head list;
173         struct work_struct release_work;
174         struct work_struct shrink_work;
175         struct hlist_node node;
176         char tfm_name[CRYPTO_MAX_ALG_NAME];
177         struct list_head lru;
178         spinlock_t lru_lock;
179 };
180
181 /*
182  * struct zswap_entry
183  *
184  * This structure contains the metadata for tracking a single compressed
185  * page within zswap.
186  *
187  * rbnode - links the entry into red-black tree for the appropriate swap type
188  * swpentry - associated swap entry, the offset indexes into the red-black tree
189  * refcount - the number of outstanding references to the entry. This is needed
190  *            to protect against premature freeing of the entry by
191  *            concurrent calls to load, invalidate, and writeback.  The lock
192  *            for the zswap_tree structure that contains the entry must
193  *            be held while changing the refcount.  Since the lock must
194  *            be held, there is no reason to also make refcount atomic.
195  * length - the length in bytes of the compressed page data.  Needed during
196  *          decompression. For a same-value filled page, length is 0, and both
197  *          pool and lru are invalid and must be ignored.
198  * pool - the zswap_pool the entry's data is in
199  * handle - zpool allocation handle that stores the compressed page data
200  * value - the value of a same-value filled page (every word of the page holds it)
201  * objcg - the obj_cgroup that the compressed memory is charged to
202  * lru - handle to the pool's lru used to evict pages.
203  */
204 struct zswap_entry {
205         struct rb_node rbnode;
206         swp_entry_t swpentry;
207         int refcount;
208         unsigned int length;
209         struct zswap_pool *pool;
210         union {
211                 unsigned long handle;
212                 unsigned long value;
213         };
214         struct obj_cgroup *objcg;
215         struct list_head lru;
216 };
217
218 /*
219  * The tree lock in the zswap_tree struct protects a few things:
220  * - the rbtree
221  * - the refcount field of each entry in the tree
222  */
223 struct zswap_tree {
224         struct rb_root rbroot;
225         spinlock_t lock;
226 };
227
228 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
229
230 /* RCU-protected iteration */
231 static LIST_HEAD(zswap_pools);
232 /* protects zswap_pools list modification */
233 static DEFINE_SPINLOCK(zswap_pools_lock);
234 /* pool counter to provide unique names to zpool */
235 static atomic_t zswap_pools_count = ATOMIC_INIT(0);
236
237 enum zswap_init_type {
238         ZSWAP_UNINIT,
239         ZSWAP_INIT_SUCCEED,
240         ZSWAP_INIT_FAILED
241 };
242
243 static enum zswap_init_type zswap_init_state;
244
245 /* used to ensure the integrity of initialization */
246 static DEFINE_MUTEX(zswap_init_lock);
247
248 /* init completed, but couldn't create the initial pool */
249 static bool zswap_has_pool;
250
251 /*********************************
252 * helpers and fwd declarations
253 **********************************/
254
255 #define zswap_pool_debug(msg, p)                                \
256         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
257                  zpool_get_type((p)->zpools[0]))
258
259 static int zswap_writeback_entry(struct zswap_entry *entry,
260                                  struct zswap_tree *tree);
261 static int zswap_pool_get(struct zswap_pool *pool);
262 static void zswap_pool_put(struct zswap_pool *pool);
263
264 static bool zswap_is_full(void)
265 {
266         return totalram_pages() * zswap_max_pool_percent / 100 <
267                         DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
268 }
269
270 static bool zswap_can_accept(void)
271 {
272         return totalram_pages() * zswap_accept_thr_percent / 100 *
273                                 zswap_max_pool_percent / 100 >
274                         DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
275 }
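
/*
 * Worked example (assuming 4 KiB pages): on a machine with 16 GiB of RAM,
 * totalram_pages() is roughly 4194304.  With max_pool_percent = 20,
 * zswap_is_full() reports full once the compressed pool exceeds about
 * 4194304 * 20/100 = 838860 pages (~3.2 GiB).  With accept_thr_percent = 90,
 * new stores are accepted again only after the pool shrinks below roughly
 * 4194304 * 90/100 * 20/100 = 754974 pages (~2.9 GiB).
 */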
276
277 static void zswap_update_total_size(void)
278 {
279         struct zswap_pool *pool;
280         u64 total = 0;
281         int i;
282
283         rcu_read_lock();
284
285         list_for_each_entry_rcu(pool, &zswap_pools, list)
286                 for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
287                         total += zpool_get_total_size(pool->zpools[i]);
288
289         rcu_read_unlock();
290
291         zswap_pool_total_size = total;
292 }
293
294 /*********************************
295 * zswap entry functions
296 **********************************/
297 static struct kmem_cache *zswap_entry_cache;
298
299 static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
300 {
301         struct zswap_entry *entry;
302         entry = kmem_cache_alloc(zswap_entry_cache, gfp);
303         if (!entry)
304                 return NULL;
305         entry->refcount = 1;
306         RB_CLEAR_NODE(&entry->rbnode);
307         return entry;
308 }
309
310 static void zswap_entry_cache_free(struct zswap_entry *entry)
311 {
312         kmem_cache_free(zswap_entry_cache, entry);
313 }
314
315 /*********************************
316 * rbtree functions
317 **********************************/
318 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
319 {
320         struct rb_node *node = root->rb_node;
321         struct zswap_entry *entry;
322         pgoff_t entry_offset;
323
324         while (node) {
325                 entry = rb_entry(node, struct zswap_entry, rbnode);
326                 entry_offset = swp_offset(entry->swpentry);
327                 if (entry_offset > offset)
328                         node = node->rb_left;
329                 else if (entry_offset < offset)
330                         node = node->rb_right;
331                 else
332                         return entry;
333         }
334         return NULL;
335 }
336
337 /*
338  * In the case that an entry with the same offset is found, a pointer to
339  * the existing entry is stored in dupentry and the function returns -EEXIST
340  */
341 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
342                         struct zswap_entry **dupentry)
343 {
344         struct rb_node **link = &root->rb_node, *parent = NULL;
345         struct zswap_entry *myentry;
346         pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
347
348         while (*link) {
349                 parent = *link;
350                 myentry = rb_entry(parent, struct zswap_entry, rbnode);
351                 myentry_offset = swp_offset(myentry->swpentry);
352                 if (myentry_offset > entry_offset)
353                         link = &(*link)->rb_left;
354                 else if (myentry_offset < entry_offset)
355                         link = &(*link)->rb_right;
356                 else {
357                         *dupentry = myentry;
358                         return -EEXIST;
359                 }
360         }
361         rb_link_node(&entry->rbnode, parent, link);
362         rb_insert_color(&entry->rbnode, root);
363         return 0;
364 }
365
366 static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
367 {
368         if (!RB_EMPTY_NODE(&entry->rbnode)) {
369                 rb_erase(&entry->rbnode, root);
370                 RB_CLEAR_NODE(&entry->rbnode);
371                 return true;
372         }
373         return false;
374 }
375
376 static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
377 {
378         int i = 0;
379
380         if (ZSWAP_NR_ZPOOLS > 1)
381                 i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
382
383         return entry->pool->zpools[i];
384 }
385
386 /*
387  * Carries out the common pattern of freeing an entry's zpool allocation,
388  * freeing the entry itself, and decrementing the number of stored pages.
389  */
390 static void zswap_free_entry(struct zswap_entry *entry)
391 {
392         if (entry->objcg) {
393                 obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
394                 obj_cgroup_put(entry->objcg);
395         }
396         if (!entry->length)
397                 atomic_dec(&zswap_same_filled_pages);
398         else {
399                 spin_lock(&entry->pool->lru_lock);
400                 list_del(&entry->lru);
401                 spin_unlock(&entry->pool->lru_lock);
402                 zpool_free(zswap_find_zpool(entry), entry->handle);
403                 zswap_pool_put(entry->pool);
404         }
405         zswap_entry_cache_free(entry);
406         atomic_dec(&zswap_stored_pages);
407         zswap_update_total_size();
408 }
409
410 /* caller must hold the tree lock */
411 static void zswap_entry_get(struct zswap_entry *entry)
412 {
413         entry->refcount++;
414 }
415
416 /* caller must hold the tree lock
417  * remove from the tree and free it, if nobody references the entry
418  */
419 static void zswap_entry_put(struct zswap_tree *tree,
420                         struct zswap_entry *entry)
421 {
422         int refcount = --entry->refcount;
423
424         WARN_ON_ONCE(refcount < 0);
425         if (refcount == 0) {
426                 WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode));
427                 zswap_free_entry(entry);
428         }
429 }
430
431 /* caller must hold the tree lock */
432 static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
433                                 pgoff_t offset)
434 {
435         struct zswap_entry *entry;
436
437         entry = zswap_rb_search(root, offset);
438         if (entry)
439                 zswap_entry_get(entry);
440
441         return entry;
442 }
443
444 /*********************************
445 * per-cpu code
446 **********************************/
447 static DEFINE_PER_CPU(u8 *, zswap_dstmem);
448 /*
449  * If users dynamically change the zpool type and compressor at runtime, i.e.
450  * while zswap is running, zswap can have more than one zpool on one cpu, but
451  * they are sharing dstmem. So we need this mutex to be per-cpu.
452  */
453 static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
454
455 static int zswap_dstmem_prepare(unsigned int cpu)
456 {
457         struct mutex *mutex;
458         u8 *dst;
459
460         dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
461         if (!dst)
462                 return -ENOMEM;
463
464         mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
465         if (!mutex) {
466                 kfree(dst);
467                 return -ENOMEM;
468         }
469
470         mutex_init(mutex);
471         per_cpu(zswap_dstmem, cpu) = dst;
472         per_cpu(zswap_mutex, cpu) = mutex;
473         return 0;
474 }
475
476 static int zswap_dstmem_dead(unsigned int cpu)
477 {
478         struct mutex *mutex;
479         u8 *dst;
480
481         mutex = per_cpu(zswap_mutex, cpu);
482         kfree(mutex);
483         per_cpu(zswap_mutex, cpu) = NULL;
484
485         dst = per_cpu(zswap_dstmem, cpu);
486         kfree(dst);
487         per_cpu(zswap_dstmem, cpu) = NULL;
488
489         return 0;
490 }
491
492 static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
493 {
494         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
495         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
496         struct crypto_acomp *acomp;
497         struct acomp_req *req;
498
499         acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
500         if (IS_ERR(acomp)) {
501                 pr_err("could not alloc crypto acomp %s : %ld\n",
502                                 pool->tfm_name, PTR_ERR(acomp));
503                 return PTR_ERR(acomp);
504         }
505         acomp_ctx->acomp = acomp;
506
507         req = acomp_request_alloc(acomp_ctx->acomp);
508         if (!req) {
509                 pr_err("could not alloc crypto acomp_request %s\n",
510                        pool->tfm_name);
511                 crypto_free_acomp(acomp_ctx->acomp);
512                 return -ENOMEM;
513         }
514         acomp_ctx->req = req;
515
516         crypto_init_wait(&acomp_ctx->wait);
517         /*
518          * If the acomp backend is an async zip driver, crypto_req_done() will
519          * wake up crypto_wait_req(); if the backend is scomp, the callback
520          * won't be called and crypto_wait_req() will return without blocking.
521          */
522         acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
523                                    crypto_req_done, &acomp_ctx->wait);
524
525         acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
526         acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
527
528         return 0;
529 }
530
531 static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
532 {
533         struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
534         struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
535
536         if (!IS_ERR_OR_NULL(acomp_ctx)) {
537                 if (!IS_ERR_OR_NULL(acomp_ctx->req))
538                         acomp_request_free(acomp_ctx->req);
539                 if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
540                         crypto_free_acomp(acomp_ctx->acomp);
541         }
542
543         return 0;
544 }
545
546 /*********************************
547 * pool functions
548 **********************************/
549
550 static struct zswap_pool *__zswap_pool_current(void)
551 {
552         struct zswap_pool *pool;
553
554         pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
555         WARN_ONCE(!pool && zswap_has_pool,
556                   "%s: no page storage pool!\n", __func__);
557
558         return pool;
559 }
560
561 static struct zswap_pool *zswap_pool_current(void)
562 {
563         assert_spin_locked(&zswap_pools_lock);
564
565         return __zswap_pool_current();
566 }
567
568 static struct zswap_pool *zswap_pool_current_get(void)
569 {
570         struct zswap_pool *pool;
571
572         rcu_read_lock();
573
574         pool = __zswap_pool_current();
575         if (!zswap_pool_get(pool))
576                 pool = NULL;
577
578         rcu_read_unlock();
579
580         return pool;
581 }
582
583 static struct zswap_pool *zswap_pool_last_get(void)
584 {
585         struct zswap_pool *pool, *last = NULL;
586
587         rcu_read_lock();
588
589         list_for_each_entry_rcu(pool, &zswap_pools, list)
590                 last = pool;
591         WARN_ONCE(!last && zswap_has_pool,
592                   "%s: no page storage pool!\n", __func__);
593         if (!zswap_pool_get(last))
594                 last = NULL;
595
596         rcu_read_unlock();
597
598         return last;
599 }
600
601 /* type and compressor must be null-terminated */
602 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
603 {
604         struct zswap_pool *pool;
605
606         assert_spin_locked(&zswap_pools_lock);
607
608         list_for_each_entry_rcu(pool, &zswap_pools, list) {
609                 if (strcmp(pool->tfm_name, compressor))
610                         continue;
611                 /* all zpools share the same type */
612                 if (strcmp(zpool_get_type(pool->zpools[0]), type))
613                         continue;
614                 /* if we can't get it, it's about to be destroyed */
615                 if (!zswap_pool_get(pool))
616                         continue;
617                 return pool;
618         }
619
620         return NULL;
621 }
622
623 /*
624  * If the entry is still valid in the tree, drop the initial ref and remove it
625  * from the tree. This function must be called with an additional ref held,
626  * otherwise it may race with another invalidation freeing the entry.
627  */
628 static void zswap_invalidate_entry(struct zswap_tree *tree,
629                                    struct zswap_entry *entry)
630 {
631         if (zswap_rb_erase(&tree->rbroot, entry))
632                 zswap_entry_put(tree, entry);
633 }
634
635 static int zswap_reclaim_entry(struct zswap_pool *pool)
636 {
637         struct zswap_entry *entry;
638         struct zswap_tree *tree;
639         pgoff_t swpoffset;
640         int ret;
641
642         /* Get an entry off the LRU */
643         spin_lock(&pool->lru_lock);
644         if (list_empty(&pool->lru)) {
645                 spin_unlock(&pool->lru_lock);
646                 return -EINVAL;
647         }
648         entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
649         list_del_init(&entry->lru);
650         /*
651          * Once the lru lock is dropped, the entry might get freed. The
652          * swpoffset is copied to the stack, and entry isn't deref'd again
653          * until the entry is verified to still be alive in the tree.
654          */
655         swpoffset = swp_offset(entry->swpentry);
656         tree = zswap_trees[swp_type(entry->swpentry)];
657         spin_unlock(&pool->lru_lock);
658
659         /* Check for invalidate() race */
660         spin_lock(&tree->lock);
661         if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
662                 ret = -EAGAIN;
663                 goto unlock;
664         }
665         /* Hold a reference to prevent a free during writeback */
666         zswap_entry_get(entry);
667         spin_unlock(&tree->lock);
668
669         ret = zswap_writeback_entry(entry, tree);
670
671         spin_lock(&tree->lock);
672         if (ret) {
673                 /* Writeback failed, put entry back on LRU */
674                 spin_lock(&pool->lru_lock);
675                 list_move(&entry->lru, &pool->lru);
676                 spin_unlock(&pool->lru_lock);
677                 goto put_unlock;
678         }
679
680         /*
681          * Writeback started successfully, the page now belongs to the
682          * swapcache. Drop the entry from zswap - unless invalidate already
683          * took it out while we had the tree->lock released for IO.
684          */
685         zswap_invalidate_entry(tree, entry);
686
687 put_unlock:
688         /* Drop local reference */
689         zswap_entry_put(tree, entry);
690 unlock:
691         spin_unlock(&tree->lock);
692         return ret ? -EAGAIN : 0;
693 }
694
695 static void shrink_worker(struct work_struct *w)
696 {
697         struct zswap_pool *pool = container_of(w, typeof(*pool),
698                                                 shrink_work);
699         int ret, failures = 0;
700
701         do {
702                 ret = zswap_reclaim_entry(pool);
703                 if (ret) {
704                         zswap_reject_reclaim_fail++;
705                         if (ret != -EAGAIN)
706                                 break;
707                         if (++failures == MAX_RECLAIM_RETRIES)
708                                 break;
709                 }
710                 cond_resched();
711         } while (!zswap_can_accept());
712         zswap_pool_put(pool);
713 }
714
715 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
716 {
717         int i;
718         struct zswap_pool *pool;
719         char name[38]; /* 'zswap' + 32 char (max) num + \0 */
720         gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
721         int ret;
722
723         if (!zswap_has_pool) {
724                 /* if either is unset, pool initialization failed, and we
725                  * need both params to be set correctly before trying to
726                  * create a pool.
727                  */
728                 if (!strcmp(type, ZSWAP_PARAM_UNSET))
729                         return NULL;
730                 if (!strcmp(compressor, ZSWAP_PARAM_UNSET))
731                         return NULL;
732         }
733
734         pool = kzalloc(sizeof(*pool), GFP_KERNEL);
735         if (!pool)
736                 return NULL;
737
738         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
739                 /* unique name for each pool specifically required by zsmalloc */
740                 snprintf(name, 38, "zswap%x",
741                          atomic_inc_return(&zswap_pools_count));
742
743                 pool->zpools[i] = zpool_create_pool(type, name, gfp);
744                 if (!pool->zpools[i]) {
745                         pr_err("%s zpool not available\n", type);
746                         goto error;
747                 }
748         }
749         pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
750
751         strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
752
753         pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
754         if (!pool->acomp_ctx) {
755                 pr_err("percpu alloc failed\n");
756                 goto error;
757         }
758
759         ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
760                                        &pool->node);
761         if (ret)
762                 goto error;
763         pr_debug("using %s compressor\n", pool->tfm_name);
764
765         /* being the current pool takes 1 ref; this func expects the
766          * caller to always add the new pool as the current pool
767          */
768         kref_init(&pool->kref);
769         INIT_LIST_HEAD(&pool->list);
770         INIT_LIST_HEAD(&pool->lru);
771         spin_lock_init(&pool->lru_lock);
772         INIT_WORK(&pool->shrink_work, shrink_worker);
773
774         zswap_pool_debug("created", pool);
775
776         return pool;
777
778 error:
779         if (pool->acomp_ctx)
780                 free_percpu(pool->acomp_ctx);
781         while (i--)
782                 zpool_destroy_pool(pool->zpools[i]);
783         kfree(pool);
784         return NULL;
785 }
786
787 static struct zswap_pool *__zswap_pool_create_fallback(void)
788 {
789         bool has_comp, has_zpool;
790
791         has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
792         if (!has_comp && strcmp(zswap_compressor,
793                                 CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
794                 pr_err("compressor %s not available, using default %s\n",
795                        zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
796                 param_free_charp(&zswap_compressor);
797                 zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
798                 has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
799         }
800         if (!has_comp) {
801                 pr_err("default compressor %s not available\n",
802                        zswap_compressor);
803                 param_free_charp(&zswap_compressor);
804                 zswap_compressor = ZSWAP_PARAM_UNSET;
805         }
806
807         has_zpool = zpool_has_pool(zswap_zpool_type);
808         if (!has_zpool && strcmp(zswap_zpool_type,
809                                  CONFIG_ZSWAP_ZPOOL_DEFAULT)) {
810                 pr_err("zpool %s not available, using default %s\n",
811                        zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT);
812                 param_free_charp(&zswap_zpool_type);
813                 zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
814                 has_zpool = zpool_has_pool(zswap_zpool_type);
815         }
816         if (!has_zpool) {
817                 pr_err("default zpool %s not available\n",
818                        zswap_zpool_type);
819                 param_free_charp(&zswap_zpool_type);
820                 zswap_zpool_type = ZSWAP_PARAM_UNSET;
821         }
822
823         if (!has_comp || !has_zpool)
824                 return NULL;
825
826         return zswap_pool_create(zswap_zpool_type, zswap_compressor);
827 }
828
829 static void zswap_pool_destroy(struct zswap_pool *pool)
830 {
831         int i;
832
833         zswap_pool_debug("destroying", pool);
834
835         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
836         free_percpu(pool->acomp_ctx);
837         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
838                 zpool_destroy_pool(pool->zpools[i]);
839         kfree(pool);
840 }
841
842 static int __must_check zswap_pool_get(struct zswap_pool *pool)
843 {
844         if (!pool)
845                 return 0;
846
847         return kref_get_unless_zero(&pool->kref);
848 }
849
850 static void __zswap_pool_release(struct work_struct *work)
851 {
852         struct zswap_pool *pool = container_of(work, typeof(*pool),
853                                                 release_work);
854
855         synchronize_rcu();
856
857         /* nobody should have been able to get a kref... */
858         WARN_ON(kref_get_unless_zero(&pool->kref));
859
860         /* pool is now off zswap_pools list and has no references. */
861         zswap_pool_destroy(pool);
862 }
863
864 static void __zswap_pool_empty(struct kref *kref)
865 {
866         struct zswap_pool *pool;
867
868         pool = container_of(kref, typeof(*pool), kref);
869
870         spin_lock(&zswap_pools_lock);
871
872         WARN_ON(pool == zswap_pool_current());
873
874         list_del_rcu(&pool->list);
875
876         INIT_WORK(&pool->release_work, __zswap_pool_release);
877         schedule_work(&pool->release_work);
878
879         spin_unlock(&zswap_pools_lock);
880 }
881
882 static void zswap_pool_put(struct zswap_pool *pool)
883 {
884         kref_put(&pool->kref, __zswap_pool_empty);
885 }
886
887 /*********************************
888 * param callbacks
889 **********************************/
890
891 static bool zswap_pool_changed(const char *s, const struct kernel_param *kp)
892 {
893         /* no change required */
894         if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
895                 return false;
896         return true;
897 }
898
899 /* val must be a null-terminated string */
900 static int __zswap_param_set(const char *val, const struct kernel_param *kp,
901                              char *type, char *compressor)
902 {
903         struct zswap_pool *pool, *put_pool = NULL;
904         char *s = strstrip((char *)val);
905         int ret = 0;
906         bool new_pool = false;
907
908         mutex_lock(&zswap_init_lock);
909         switch (zswap_init_state) {
910         case ZSWAP_UNINIT:
911                 /* if this is load-time (pre-init) param setting,
912                  * don't create a pool; that's done during init.
913                  */
914                 ret = param_set_charp(s, kp);
915                 break;
916         case ZSWAP_INIT_SUCCEED:
917                 new_pool = zswap_pool_changed(s, kp);
918                 break;
919         case ZSWAP_INIT_FAILED:
920                 pr_err("can't set param, initialization failed\n");
921                 ret = -ENODEV;
922         }
923         mutex_unlock(&zswap_init_lock);
924
925         /* no need to create a new pool, return directly */
926         if (!new_pool)
927                 return ret;
928
929         if (!type) {
930                 if (!zpool_has_pool(s)) {
931                         pr_err("zpool %s not available\n", s);
932                         return -ENOENT;
933                 }
934                 type = s;
935         } else if (!compressor) {
936                 if (!crypto_has_acomp(s, 0, 0)) {
937                         pr_err("compressor %s not available\n", s);
938                         return -ENOENT;
939                 }
940                 compressor = s;
941         } else {
942                 WARN_ON(1);
943                 return -EINVAL;
944         }
945
946         spin_lock(&zswap_pools_lock);
947
948         pool = zswap_pool_find_get(type, compressor);
949         if (pool) {
950                 zswap_pool_debug("using existing", pool);
951                 WARN_ON(pool == zswap_pool_current());
952                 list_del_rcu(&pool->list);
953         }
954
955         spin_unlock(&zswap_pools_lock);
956
957         if (!pool)
958                 pool = zswap_pool_create(type, compressor);
959
960         if (pool)
961                 ret = param_set_charp(s, kp);
962         else
963                 ret = -EINVAL;
964
965         spin_lock(&zswap_pools_lock);
966
967         if (!ret) {
968                 put_pool = zswap_pool_current();
969                 list_add_rcu(&pool->list, &zswap_pools);
970                 zswap_has_pool = true;
971         } else if (pool) {
972                 /* add the possibly pre-existing pool to the end of the pools
973                  * list; if it's new (and empty) then it'll be removed and
974                  * destroyed by the put after we drop the lock
975                  */
976                 list_add_tail_rcu(&pool->list, &zswap_pools);
977                 put_pool = pool;
978         }
979
980         spin_unlock(&zswap_pools_lock);
981
982         if (!zswap_has_pool && !pool) {
983                 /* if initial pool creation failed, and this pool creation also
984                  * failed, maybe both compressor and zpool params were bad.
985                  * Allow changing this param, so pool creation will succeed
986                  * when the other param is changed. We already verified this
987                  * param is ok in the zpool_has_pool() or crypto_has_acomp()
988                  * checks above.
989                  */
990                 ret = param_set_charp(s, kp);
991         }
992
993         /* drop the ref from either the old current pool,
994          * or the new pool we failed to add
995          */
996         if (put_pool)
997                 zswap_pool_put(put_pool);
998
999         return ret;
1000 }
1001
1002 static int zswap_compressor_param_set(const char *val,
1003                                       const struct kernel_param *kp)
1004 {
1005         return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
1006 }
1007
1008 static int zswap_zpool_param_set(const char *val,
1009                                  const struct kernel_param *kp)
1010 {
1011         return __zswap_param_set(val, kp, NULL, zswap_compressor);
1012 }
1013
1014 static int zswap_enabled_param_set(const char *val,
1015                                    const struct kernel_param *kp)
1016 {
1017         int ret = -ENODEV;
1018
1019         /* if this is load-time (pre-init) param setting, only set param. */
1020         if (system_state != SYSTEM_RUNNING)
1021                 return param_set_bool(val, kp);
1022
1023         mutex_lock(&zswap_init_lock);
1024         switch (zswap_init_state) {
1025         case ZSWAP_UNINIT:
1026                 if (zswap_setup())
1027                         break;
1028                 fallthrough;
1029         case ZSWAP_INIT_SUCCEED:
1030                 if (!zswap_has_pool)
1031                         pr_err("can't enable, no pool configured\n");
1032                 else
1033                         ret = param_set_bool(val, kp);
1034                 break;
1035         case ZSWAP_INIT_FAILED:
1036                 pr_err("can't enable, initialization failed\n");
1037         }
1038         mutex_unlock(&zswap_init_lock);
1039
1040         return ret;
1041 }
1042
1043 /*********************************
1044 * writeback code
1045 **********************************/
1046 /*
1047  * Attempts to free an entry by adding a page to the swap cache,
1048  * decompressing the entry data into the page, and issuing a
1049  * bio write to write the page back to the swap device.
1050  *
1051  * This can be thought of as a "resumed writeback" of the page
1052  * to the swap device.  We are basically resuming the same swap
1053  * writeback path that was intercepted by zswap_store()
1054  * in the first place.  After the page has been decompressed into
1055  * the swap cache, the compressed version stored by zswap can be
1056  * freed.
1057  */
1058 static int zswap_writeback_entry(struct zswap_entry *entry,
1059                                  struct zswap_tree *tree)
1060 {
1061         swp_entry_t swpentry = entry->swpentry;
1062         struct page *page;
1063         struct mempolicy *mpol;
1064         struct scatterlist input, output;
1065         struct crypto_acomp_ctx *acomp_ctx;
1066         struct zpool *pool = zswap_find_zpool(entry);
1067         bool page_was_allocated;
1068         u8 *src, *tmp = NULL;
1069         unsigned int dlen;
1070         int ret;
1071         struct writeback_control wbc = {
1072                 .sync_mode = WB_SYNC_NONE,
1073         };
1074
1075         if (!zpool_can_sleep_mapped(pool)) {
1076                 tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
1077                 if (!tmp)
1078                         return -ENOMEM;
1079         }
1080
1081         /* try to allocate swap cache page */
1082         mpol = get_task_policy(current);
1083         page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
1084                                 NO_INTERLEAVE_INDEX, &page_was_allocated);
1085         if (!page) {
1086                 ret = -ENOMEM;
1087                 goto fail;
1088         }
1089
1090         /* Found an existing page, we raced with load/swapin */
1091         if (!page_was_allocated) {
1092                 put_page(page);
1093                 ret = -EEXIST;
1094                 goto fail;
1095         }
1096
1097         /*
1098          * Page is locked, and the swapcache is now secured against
1099          * concurrent swapping to and from the slot. Verify that the
1100          * swap entry hasn't been invalidated and recycled behind our
1101          * backs (our zswap_entry reference doesn't prevent that), to
1102          * avoid overwriting a new swap page with old compressed data.
1103          */
1104         spin_lock(&tree->lock);
1105         if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
1106                 spin_unlock(&tree->lock);
1107                 delete_from_swap_cache(page_folio(page));
1108                 ret = -ENOMEM;
1109                 goto fail;
1110         }
1111         spin_unlock(&tree->lock);
1112
1113         /* decompress */
1114         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1115         dlen = PAGE_SIZE;
1116
1117         src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
1118         if (!zpool_can_sleep_mapped(pool)) {
1119                 memcpy(tmp, src, entry->length);
1120                 src = tmp;
1121                 zpool_unmap_handle(pool, entry->handle);
1122         }
1123
1124         mutex_lock(acomp_ctx->mutex);
1125         sg_init_one(&input, src, entry->length);
1126         sg_init_table(&output, 1);
1127         sg_set_page(&output, page, PAGE_SIZE, 0);
1128         acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1129         ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
1130         dlen = acomp_ctx->req->dlen;
1131         mutex_unlock(acomp_ctx->mutex);
1132
1133         if (!zpool_can_sleep_mapped(pool))
1134                 kfree(tmp);
1135         else
1136                 zpool_unmap_handle(pool, entry->handle);
1137
1138         BUG_ON(ret);
1139         BUG_ON(dlen != PAGE_SIZE);
1140
1141         /* page is up to date */
1142         SetPageUptodate(page);
1143
1144         /* move it to the tail of the inactive list after end_writeback */
1145         SetPageReclaim(page);
1146
1147         /* start writeback */
1148         __swap_writepage(page, &wbc);
1149         put_page(page);
1150         zswap_written_back_pages++;
1151
1152         return ret;
1153
1154 fail:
1155         if (!zpool_can_sleep_mapped(pool))
1156                 kfree(tmp);
1157
1158         /*
1159          * If we get here because the page is already in swapcache, a
1160          * load may be happening concurrently. It is safe and okay to
1161          * not free the entry. It is also okay to return !0.
1162          */
1163         return ret;
1164 }
1165
1166 static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
1167 {
1168         unsigned long *page;
1169         unsigned long val;
1170         unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
1171
1172         page = (unsigned long *)ptr;
1173         val = page[0];
1174
1175         if (val != page[last_pos])
1176                 return 0;
1177
1178         for (pos = 1; pos < last_pos; pos++) {
1179                 if (val != page[pos])
1180                         return 0;
1181         }
1182
1183         *value = val;
1184
1185         return 1;
1186 }
1187
1188 static void zswap_fill_page(void *ptr, unsigned long value)
1189 {
1190         unsigned long *page;
1191
1192         page = (unsigned long *)ptr;
1193         memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
1194 }
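
/*
 * Illustrative example: a page in which every machine word equals, say,
 * 0x0000000000000000 or 0x1122334455667788 is detected by
 * zswap_is_page_same_filled() as same-value filled.  zswap_store() then
 * records only entry->value and sets entry->length to 0, so no compression
 * or zpool allocation takes place; on load, zswap_fill_page() reconstructs
 * the page from that value with memset_l().
 */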
1195
1196 bool zswap_store(struct folio *folio)
1197 {
1198         swp_entry_t swp = folio->swap;
1199         int type = swp_type(swp);
1200         pgoff_t offset = swp_offset(swp);
1201         struct page *page = &folio->page;
1202         struct zswap_tree *tree = zswap_trees[type];
1203         struct zswap_entry *entry, *dupentry;
1204         struct scatterlist input, output;
1205         struct crypto_acomp_ctx *acomp_ctx;
1206         struct obj_cgroup *objcg = NULL;
1207         struct zswap_pool *pool;
1208         struct zpool *zpool;
1209         unsigned int dlen = PAGE_SIZE;
1210         unsigned long handle, value;
1211         char *buf;
1212         u8 *src, *dst;
1213         gfp_t gfp;
1214         int ret;
1215
1216         VM_WARN_ON_ONCE(!folio_test_locked(folio));
1217         VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
1218
1219         /* Large folios aren't supported */
1220         if (folio_test_large(folio))
1221                 return false;
1222
1223         if (!zswap_enabled || !tree)
1224                 return false;
1225
1226         /*
1227          * If this is a duplicate, it must be removed before attempting to store
1228          * it; otherwise, if the store fails, the old page won't be removed from
1229          * the tree, and it might be written back overwriting the new data.
1230          */
1231         spin_lock(&tree->lock);
1232         dupentry = zswap_rb_search(&tree->rbroot, offset);
1233         if (dupentry) {
1234                 zswap_duplicate_entry++;
1235                 zswap_invalidate_entry(tree, dupentry);
1236         }
1237         spin_unlock(&tree->lock);
1238
1239         /*
1240          * XXX: zswap reclaim does not work with cgroups yet. Without a
1241          * cgroup-aware entry LRU, we will push out entries system-wide based on
1242          * local cgroup limits.
1243          */
1244         objcg = get_obj_cgroup_from_folio(folio);
1245         if (objcg && !obj_cgroup_may_zswap(objcg))
1246                 goto reject;
1247
1248         /* reclaim space if needed */
1249         if (zswap_is_full()) {
1250                 zswap_pool_limit_hit++;
1251                 zswap_pool_reached_full = true;
1252                 goto shrink;
1253         }
1254
1255         if (zswap_pool_reached_full) {
1256                 if (!zswap_can_accept())
1257                         goto shrink;
1258                 else
1259                         zswap_pool_reached_full = false;
1260         }
1261
1262         /* allocate entry */
1263         entry = zswap_entry_cache_alloc(GFP_KERNEL);
1264         if (!entry) {
1265                 zswap_reject_kmemcache_fail++;
1266                 goto reject;
1267         }
1268
1269         if (zswap_same_filled_pages_enabled) {
1270                 src = kmap_atomic(page);
1271                 if (zswap_is_page_same_filled(src, &value)) {
1272                         kunmap_atomic(src);
1273                         entry->swpentry = swp_entry(type, offset);
1274                         entry->length = 0;
1275                         entry->value = value;
1276                         atomic_inc(&zswap_same_filled_pages);
1277                         goto insert_entry;
1278                 }
1279                 kunmap_atomic(src);
1280         }
1281
1282         if (!zswap_non_same_filled_pages_enabled)
1283                 goto freepage;
1284
1285         /* if entry is successfully added, it keeps the reference */
1286         entry->pool = zswap_pool_current_get();
1287         if (!entry->pool)
1288                 goto freepage;
1289
1290         /* compress */
1291         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1292
1293         mutex_lock(acomp_ctx->mutex);
1294
1295         dst = acomp_ctx->dstmem;
1296         sg_init_table(&input, 1);
1297         sg_set_page(&input, page, PAGE_SIZE, 0);
1298
1299         /* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
1300         sg_init_one(&output, dst, PAGE_SIZE * 2);
1301         acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
1302         /*
1303          * It may look a little silly that we send an asynchronous request and
1304          * then wait for its completion synchronously; this makes the process
1305          * synchronous in practice.
1306          * Theoretically, acomp allows users to send multiple requests on one
1307          * acomp instance and have them completed simultaneously, but zswap
1308          * stores and loads page by page, so a single thread doing zswap has no
1309          * way to send the second page before the first page is done.
1310          * However, different threads running on different CPUs have their own
1311          * acomp instances, so multiple threads can do (de)compression in
1312          * parallel.
1313          */
1314         ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
1315         dlen = acomp_ctx->req->dlen;
1316
1317         if (ret) {
1318                 zswap_reject_compress_fail++;
1319                 goto put_dstmem;
1320         }
1321
1322         /* store */
1323         zpool = zswap_find_zpool(entry);
1324         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
1325         if (zpool_malloc_support_movable(zpool))
1326                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
1327         ret = zpool_malloc(zpool, dlen, gfp, &handle);
1328         if (ret == -ENOSPC) {
1329                 zswap_reject_compress_poor++;
1330                 goto put_dstmem;
1331         }
1332         if (ret) {
1333                 zswap_reject_alloc_fail++;
1334                 goto put_dstmem;
1335         }
1336         buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
1337         memcpy(buf, dst, dlen);
1338         zpool_unmap_handle(zpool, handle);
1339         mutex_unlock(acomp_ctx->mutex);
1340
1341         /* populate entry */
1342         entry->swpentry = swp_entry(type, offset);
1343         entry->handle = handle;
1344         entry->length = dlen;
1345
1346 insert_entry:
1347         entry->objcg = objcg;
1348         if (objcg) {
1349                 obj_cgroup_charge_zswap(objcg, entry->length);
1350                 /* Account before objcg ref is moved to tree */
1351                 count_objcg_event(objcg, ZSWPOUT);
1352         }
1353
1354         /* map */
1355         spin_lock(&tree->lock);
1356         /*
1357          * A duplicate entry should have been removed at the beginning of this
1358          * function. Since the swap entry should be pinned, if a duplicate is
1359          * found again here it means that something went wrong in the swap
1360          * cache.
1361          */
1362         while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
1363                 WARN_ON(1);
1364                 zswap_duplicate_entry++;
1365                 zswap_invalidate_entry(tree, dupentry);
1366         }
1367         if (entry->length) {
1368                 spin_lock(&entry->pool->lru_lock);
1369                 list_add(&entry->lru, &entry->pool->lru);
1370                 spin_unlock(&entry->pool->lru_lock);
1371         }
1372         spin_unlock(&tree->lock);
1373
1374         /* update stats */
1375         atomic_inc(&zswap_stored_pages);
1376         zswap_update_total_size();
1377         count_vm_event(ZSWPOUT);
1378
1379         return true;
1380
1381 put_dstmem:
1382         mutex_unlock(acomp_ctx->mutex);
1383         zswap_pool_put(entry->pool);
1384 freepage:
1385         zswap_entry_cache_free(entry);
1386 reject:
1387         if (objcg)
1388                 obj_cgroup_put(objcg);
1389         return false;
1390
1391 shrink:
1392         pool = zswap_pool_last_get();
1393         if (pool && !queue_work(shrink_wq, &pool->shrink_work))
1394                 zswap_pool_put(pool);
1395         goto reject;
1396 }
1397
1398 bool zswap_load(struct folio *folio)
1399 {
1400         swp_entry_t swp = folio->swap;
1401         int type = swp_type(swp);
1402         pgoff_t offset = swp_offset(swp);
1403         struct page *page = &folio->page;
1404         struct zswap_tree *tree = zswap_trees[type];
1405         struct zswap_entry *entry;
1406         struct scatterlist input, output;
1407         struct crypto_acomp_ctx *acomp_ctx;
1408         u8 *src, *dst, *tmp;
1409         struct zpool *zpool;
1410         unsigned int dlen;
1411         bool ret;
1412
1413         VM_WARN_ON_ONCE(!folio_test_locked(folio));
1414
1415         /* find */
1416         spin_lock(&tree->lock);
1417         entry = zswap_entry_find_get(&tree->rbroot, offset);
1418         if (!entry) {
1419                 spin_unlock(&tree->lock);
1420                 return false;
1421         }
1422         spin_unlock(&tree->lock);
1423
1424         if (!entry->length) {
1425                 dst = kmap_atomic(page);
1426                 zswap_fill_page(dst, entry->value);
1427                 kunmap_atomic(dst);
1428                 ret = true;
1429                 goto stats;
1430         }
1431
1432         zpool = zswap_find_zpool(entry);
1433         if (!zpool_can_sleep_mapped(zpool)) {
1434                 tmp = kmalloc(entry->length, GFP_KERNEL);
1435                 if (!tmp) {
1436                         ret = false;
1437                         goto freeentry;
1438                 }
1439         }
1440
1441         /* decompress */
1442         dlen = PAGE_SIZE;
1443         src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
1444
1445         if (!zpool_can_sleep_mapped(zpool)) {
1446                 memcpy(tmp, src, entry->length);
1447                 src = tmp;
1448                 zpool_unmap_handle(zpool, entry->handle);
1449         }
1450
1451         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
1452         mutex_lock(acomp_ctx->mutex);
1453         sg_init_one(&input, src, entry->length);
1454         sg_init_table(&output, 1);
1455         sg_set_page(&output, page, PAGE_SIZE, 0);
1456         acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
1457         if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait))
1458                 WARN_ON(1);
1459         mutex_unlock(acomp_ctx->mutex);
1460
1461         if (zpool_can_sleep_mapped(zpool))
1462                 zpool_unmap_handle(zpool, entry->handle);
1463         else
1464                 kfree(tmp);
1465
1466         ret = true;
1467 stats:
1468         count_vm_event(ZSWPIN);
1469         if (entry->objcg)
1470                 count_objcg_event(entry->objcg, ZSWPIN);
1471 freeentry:
1472         spin_lock(&tree->lock);
1473         if (ret && zswap_exclusive_loads_enabled) {
1474                 zswap_invalidate_entry(tree, entry);
1475                 folio_mark_dirty(folio);
1476         } else if (entry->length) {
1477                 spin_lock(&entry->pool->lru_lock);
1478                 list_move(&entry->lru, &entry->pool->lru);
1479                 spin_unlock(&entry->pool->lru_lock);
1480         }
1481         zswap_entry_put(tree, entry);
1482         spin_unlock(&tree->lock);
1483
1484         return ret;
1485 }
1486
1487 void zswap_invalidate(int type, pgoff_t offset)
1488 {
1489         struct zswap_tree *tree = zswap_trees[type];
1490         struct zswap_entry *entry;
1491
1492         /* find */
1493         spin_lock(&tree->lock);
1494         entry = zswap_rb_search(&tree->rbroot, offset);
1495         if (!entry) {
1496                 /* entry was written back */
1497                 spin_unlock(&tree->lock);
1498                 return;
1499         }
1500         zswap_invalidate_entry(tree, entry);
1501         spin_unlock(&tree->lock);
1502 }
1503
1504 void zswap_swapon(int type)
1505 {
1506         struct zswap_tree *tree;
1507
1508         tree = kzalloc(sizeof(*tree), GFP_KERNEL);
1509         if (!tree) {
1510                 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
1511                 return;
1512         }
1513
1514         tree->rbroot = RB_ROOT;
1515         spin_lock_init(&tree->lock);
1516         zswap_trees[type] = tree;
1517 }
1518
1519 void zswap_swapoff(int type)
1520 {
1521         struct zswap_tree *tree = zswap_trees[type];
1522         struct zswap_entry *entry, *n;
1523
1524         if (!tree)
1525                 return;
1526
1527         /* walk the tree and free everything */
1528         spin_lock(&tree->lock);
1529         rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
1530                 zswap_free_entry(entry);
1531         tree->rbroot = RB_ROOT;
1532         spin_unlock(&tree->lock);
1533         kfree(tree);
1534         zswap_trees[type] = NULL;
1535 }
1536
1537 /*********************************
1538 * debugfs functions
1539 **********************************/
1540 #ifdef CONFIG_DEBUG_FS
1541 #include <linux/debugfs.h>
1542
1543 static struct dentry *zswap_debugfs_root;
1544
1545 static int zswap_debugfs_init(void)
1546 {
1547         if (!debugfs_initialized())
1548                 return -ENODEV;
1549
1550         zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
1551
1552         debugfs_create_u64("pool_limit_hit", 0444,
1553                            zswap_debugfs_root, &zswap_pool_limit_hit);
1554         debugfs_create_u64("reject_reclaim_fail", 0444,
1555                            zswap_debugfs_root, &zswap_reject_reclaim_fail);
1556         debugfs_create_u64("reject_alloc_fail", 0444,
1557                            zswap_debugfs_root, &zswap_reject_alloc_fail);
1558         debugfs_create_u64("reject_kmemcache_fail", 0444,
1559                            zswap_debugfs_root, &zswap_reject_kmemcache_fail);
1560         debugfs_create_u64("reject_compress_fail", 0444,
1561                            zswap_debugfs_root, &zswap_reject_compress_fail);
1562         debugfs_create_u64("reject_compress_poor", 0444,
1563                            zswap_debugfs_root, &zswap_reject_compress_poor);
1564         debugfs_create_u64("written_back_pages", 0444,
1565                            zswap_debugfs_root, &zswap_written_back_pages);
1566         debugfs_create_u64("duplicate_entry", 0444,
1567                            zswap_debugfs_root, &zswap_duplicate_entry);
1568         debugfs_create_u64("pool_total_size", 0444,
1569                            zswap_debugfs_root, &zswap_pool_total_size);
1570         debugfs_create_atomic_t("stored_pages", 0444,
1571                                 zswap_debugfs_root, &zswap_stored_pages);
1572         debugfs_create_atomic_t("same_filled_pages", 0444,
1573                                 zswap_debugfs_root, &zswap_same_filled_pages);
1574
1575         return 0;
1576 }
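
/*
 * Example (assuming debugfs is mounted at its usual location): the counters
 * registered above can be inspected at runtime with, e.g.:
 *
 *   grep . /sys/kernel/debug/zswap/*
 *
 * which prints pool_total_size, stored_pages, the reject counters, etc.
 */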
1577 #else
1578 static int zswap_debugfs_init(void)
1579 {
1580         return 0;
1581 }
1582 #endif
1583
1584 /*********************************
1585 * module init and exit
1586 **********************************/
1587 static int zswap_setup(void)
1588 {
1589         struct zswap_pool *pool;
1590         int ret;
1591
1592         zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
1593         if (!zswap_entry_cache) {
1594                 pr_err("entry cache creation failed\n");
1595                 goto cache_fail;
1596         }
1597
1598         ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
1599                                 zswap_dstmem_prepare, zswap_dstmem_dead);
1600         if (ret) {
1601                 pr_err("dstmem alloc failed\n");
1602                 goto dstmem_fail;
1603         }
1604
1605         ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
1606                                       "mm/zswap_pool:prepare",
1607                                       zswap_cpu_comp_prepare,
1608                                       zswap_cpu_comp_dead);
1609         if (ret)
1610                 goto hp_fail;
1611
1612         pool = __zswap_pool_create_fallback();
1613         if (pool) {
1614                 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
1615                         zpool_get_type(pool->zpools[0]));
1616                 list_add(&pool->list, &zswap_pools);
1617                 zswap_has_pool = true;
1618         } else {
1619                 pr_err("pool creation failed\n");
1620                 zswap_enabled = false;
1621         }
1622
1623         shrink_wq = create_workqueue("zswap-shrink");
1624         if (!shrink_wq)
1625                 goto fallback_fail;
1626
1627         if (zswap_debugfs_init())
1628                 pr_warn("debugfs initialization failed\n");
1629         zswap_init_state = ZSWAP_INIT_SUCCEED;
1630         return 0;
1631
1632 fallback_fail:
1633         if (pool)
1634                 zswap_pool_destroy(pool);
1635 hp_fail:
1636         cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
1637 dstmem_fail:
1638         kmem_cache_destroy(zswap_entry_cache);
1639 cache_fail:
1640         /* if built-in, we aren't unloaded on failure; don't allow use */
1641         zswap_init_state = ZSWAP_INIT_FAILED;
1642         zswap_enabled = false;
1643         return -ENOMEM;
1644 }
1645
1646 static int __init zswap_init(void)
1647 {
1648         if (!zswap_enabled)
1649                 return 0;
1650         return zswap_setup();
1651 }
1652 /* must be late so crypto has time to come up */
1653 late_initcall(zswap_init);
1654
1655 MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
1656 MODULE_DESCRIPTION("Compressed cache for swap pages");