Merge tag 'erofs-for-5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs
author     Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 26 Apr 2021 20:28:12 +0000 (13:28 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 26 Apr 2021 20:28:12 +0000 (13:28 -0700)
Pull erofs updates from Gao Xiang:
 "In this cycle, we would like to introduce a new feature called big
  pcluster, so that EROFS can compress file data into more than one fs
  block and a different pcluster size can be selected for each
  (sub-)file by design.

  The current EROFS test results on my laptop are [1]:

    Testscript: erofs-openbenchmark
    Testdata: enwik9 (1000000000 bytes)
     ________________________________________________________________
    |  file system  |   size    | seq read | rand read | rand9m read |
    |_______________|___________|_ MiB/s __|__ MiB/s __|___ MiB/s ___|
    |___erofs_4k____|_556879872_|_ 781.4 __|__ 55.3 ___|___ 25.3  ___|
    |___erofs_16k___|_452509696_|_ 864.8 __|_ 123.2 ___|___ 20.8  ___|
    |___erofs_32k___|_415223808_|_ 899.8 __|_ 105.8 _*_|___ 16.8 ____|
    |___erofs_64k___|_393814016_|_ 906.6 __|__ 66.6 _*_|___ 11.8 ____|
    |__squashfs_8k__|_556191744_|_  64.9 __|__ 19.3 ___|____ 9.1 ____|
    |__squashfs_16k_|_502661120_|_  98.9 __|__ 38.0 ___|____ 9.8 ____|
    |__squashfs_32k_|_458784768_|_ 115.4 __|__ 71.6 _*_|___ 10.0 ____|
    |_squashfs_128k_|_398204928_|_ 257.2 __|_ 253.8 _*_|___ 10.9 ____|
    |____ext4_4k____|____()_____|_ 786.6 __|__ 28.6 ___|___ 27.8 ____|

  This has been verified, but I'd like to mark it as experimental for a
  while. It matches the erofs-utils dev branch, and I'll also release a
  new userspace version for it later.

  Apart from that, several improvements are also included, e.g.
  completing a missing case for inplace I/O, optimizing the endio
  decompression logic for non-atomic contexts and supporting an
  adjustable sliding window size. In addition to those, there are some
  cleanups as always.

  Summary:

   - avoid memory failure when applying rolling decompression

   - optimize endio decompression logic for non-atomic contexts

   - complete a missing case which can be safely selected for inplace
     I/O, thus further decreasing memory footprint

   - strictly check for unsupported on-disk inode i_format

   - support adjustable lz4 sliding window size to decrease runtime
     memory footprint

   - support on-disk compression configurations

   - support big pcluster decompression

   - several code cleanups / spelling correction"
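
As a concrete illustration of the new pcluster sizing: the on-disk
max_pclusterblks field bounds how many fs blocks one pcluster may span,
and z_erofs_load_lz4_config() (see the decompressor.c hunk below)
rejects anything above Z_EROFS_PCLUSTER_MAX_SIZE. A minimal userspace
sketch of those bounds, assuming 4KiB fs blocks:

    #include <stdio.h>

    #define EROFS_BLKSIZ              4096u  /* assumption: 4KiB fs blocks */
    #define Z_EROFS_PCLUSTER_MAX_SIZE (1024u * 1024u)

    /* mirrors the max_pclusterblks checks in z_erofs_load_lz4_config() */
    static int check_pclusterblks(unsigned int blks)
    {
            if (!blks)
                    return 1;   /* reserved on-disk value: treat as 1 block */
            if (blks > Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ)
                    return -1;  /* invalid: pcluster would exceed 1MiB (256 blocks) */
            return blks;        /* >= 2 means big pcluster is in use */
    }

    int main(void)
    {
            printf("%d %d %d\n", check_pclusterblks(0),
                   check_pclusterblks(4), check_pclusterblks(512));
            return 0;   /* prints "1 4 -1" */
    }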

* tag 'erofs-for-5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs: (21 commits)
  erofs: enable big pcluster feature
  erofs: support decompress big pcluster for lz4 backend
  erofs: support parsing big pcluster compact indexes
  erofs: support parsing big pcluster compress indexes
  erofs: adjust per-CPU buffers according to max_pclusterblks
  erofs: add big physical cluster definition
  erofs: fix up inplace I/O pointer for big pcluster
  erofs: introduce physical cluster slab pools
  erofs: introduce multipage per-CPU buffers
  erofs: reserve physical_clusterbits[]
  erofs: Clean up spelling mistakes found in fs/erofs
  erofs: add on-disk compression configurations
  erofs: introduce on-disk lz4 fs configurations
  erofs: support adjusted lz4 history window size
  erofs: introduce erofs_sb_has_xxx() helpers
  erofs: add unsupported inode i_format check
  erofs: don't use erofs_map_blocks() any more
  erofs: complete a missing case for inplace I/O
  erofs: use sync decompression for atomic contexts only
  erofs: use workqueue decompression for atomic contexts only
  ...

13 files changed:
fs/erofs/Kconfig
fs/erofs/Makefile
fs/erofs/data.c
fs/erofs/decompressor.c
fs/erofs/erofs_fs.h
fs/erofs/inode.c
fs/erofs/internal.h
fs/erofs/pcpubuf.c [new file with mode: 0644]
fs/erofs/super.c
fs/erofs/utils.c
fs/erofs/zdata.c
fs/erofs/zdata.h
fs/erofs/zmap.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 74b0aaa..858b333 100644
@@ -76,17 +76,3 @@ config EROFS_FS_ZIP
 
          If you don't want to enable compression feature, say N.
 
-config EROFS_FS_CLUSTER_PAGE_LIMIT
-       int "EROFS Cluster Pages Hard Limit"
-       depends on EROFS_FS_ZIP
-       range 1 256
-       default "1"
-       help
-         Indicates maximum # of pages of a compressed
-         physical cluster.
-
-         For example, if files in a image were compressed
-         into 8k-unit, hard limit should not be configured
-         less than 2. Otherwise, the image will be refused
-         to mount on this kernel.
-
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index af15953..1f9aced 100644
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 1249e74..ebac756 100644
@@ -109,21 +109,6 @@ err_out:
        return err;
 }
 
-int erofs_map_blocks(struct inode *inode,
-                    struct erofs_map_blocks *map, int flags)
-{
-       if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
-               int err = z_erofs_map_blocks_iter(inode, map, flags);
-
-               if (map->mpage) {
-                       put_page(map->mpage);
-                       map->mpage = NULL;
-               }
-               return err;
-       }
-       return erofs_map_blocks_flatmode(inode, map, flags);
-}
-
 static inline struct bio *erofs_read_raw_page(struct bio *bio,
                                              struct address_space *mapping,
                                              struct page *page,
@@ -159,7 +144,7 @@ submit_bio_retry:
                erofs_blk_t blknr;
                unsigned int blkoff;
 
-               err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+               err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW);
                if (err)
                        goto err_out;
 
@@ -318,7 +303,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
                        return 0;
        }
 
-       if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW))
+       if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW))
                return erofs_blknr(map.m_pa);
 
        return 0;
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 1cb1ffd..88e33ad 100644
@@ -28,6 +28,42 @@ struct z_erofs_decompressor {
        char *name;
 };
 
+int z_erofs_load_lz4_config(struct super_block *sb,
+                           struct erofs_super_block *dsb,
+                           struct z_erofs_lz4_cfgs *lz4, int size)
+{
+       struct erofs_sb_info *sbi = EROFS_SB(sb);
+       u16 distance;
+
+       if (lz4) {
+               if (size < sizeof(struct z_erofs_lz4_cfgs)) {
+                       erofs_err(sb, "invalid lz4 cfgs, size=%u", size);
+                       return -EINVAL;
+               }
+               distance = le16_to_cpu(lz4->max_distance);
+
+               sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks);
+               if (!sbi->lz4.max_pclusterblks) {
+                       sbi->lz4.max_pclusterblks = 1;  /* reserved case */
+               } else if (sbi->lz4.max_pclusterblks >
+                          Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) {
+                       erofs_err(sb, "too large lz4 pclusterblks %u",
+                                 sbi->lz4.max_pclusterblks);
+                       return -EINVAL;
+               } else if (sbi->lz4.max_pclusterblks >= 2) {
+                       erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!");
+               }
+       } else {
+               distance = le16_to_cpu(dsb->u1.lz4_max_distance);
+               sbi->lz4.max_pclusterblks = 1;
+       }
+
+       sbi->lz4.max_distance_pages = distance ?
+                                       DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
+                                       LZ4_MAX_DISTANCE_PAGES;
+       return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+}
+
 static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
                                         struct list_head *pagepool)
 {
@@ -36,6 +72,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
        struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
        unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
                                           BITS_PER_LONG)] = { 0 };
+       unsigned int lz4_max_distance_pages =
+                               EROFS_SB(rq->sb)->lz4.max_distance_pages;
        void *kaddr = NULL;
        unsigned int i, j, top;
 
@@ -44,14 +82,14 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
                struct page *const page = rq->out[i];
                struct page *victim;
 
-               if (j >= LZ4_MAX_DISTANCE_PAGES)
+               if (j >= lz4_max_distance_pages)
                        j = 0;
 
                /* 'valid' bounced can only be tested after a complete round */
                if (test_bit(j, bounced)) {
-                       DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
-                       DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
-                       availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
+                       DBG_BUGON(i < lz4_max_distance_pages);
+                       DBG_BUGON(top >= lz4_max_distance_pages);
+                       availables[top++] = rq->out[i - lz4_max_distance_pages];
                }
 
                if (page) {
@@ -73,9 +111,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
                        victim = availables[--top];
                        get_page(victim);
                } else {
-                       victim = erofs_allocpage(pagepool, GFP_KERNEL);
-                       if (!victim)
-                               return -ENOMEM;
+                       victim = erofs_allocpage(pagepool,
+                                                GFP_KERNEL | __GFP_NOFAIL);
                        set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
                }
                rq->out[i] = victim;
@@ -83,96 +120,123 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
        return kaddr ? 1 : 0;
 }
 
-static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
-                                      u8 *src, unsigned int pageofs_in)
+static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+                       void *inpage, unsigned int *inputmargin, int *maptype,
+                       bool support_0padding)
 {
-       /*
-        * if in-place decompression is ongoing, those decompressed
-        * pages should be copied in order to avoid being overlapped.
-        */
-       struct page **in = rq->in;
-       u8 *const tmp = erofs_get_pcpubuf(0);
-       u8 *tmpp = tmp;
-       unsigned int inlen = rq->inputsize - pageofs_in;
-       unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
-
-       while (tmpp < tmp + inlen) {
-               if (!src)
-                       src = kmap_atomic(*in);
-               memcpy(tmpp, src + pageofs_in, count);
-               kunmap_atomic(src);
-               src = NULL;
-               tmpp += count;
-               pageofs_in = 0;
-               count = PAGE_SIZE;
+       unsigned int nrpages_in, nrpages_out;
+       unsigned int ofull, oend, inputsize, total, i, j;
+       struct page **in;
+       void *src, *tmp;
+
+       inputsize = rq->inputsize;
+       nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT;
+       oend = rq->pageofs_out + rq->outputsize;
+       ofull = PAGE_ALIGN(oend);
+       nrpages_out = ofull >> PAGE_SHIFT;
+
+       if (rq->inplace_io) {
+               if (rq->partial_decoding || !support_0padding ||
+                   ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize))
+                       goto docopy;
+
+               for (i = 0; i < nrpages_in; ++i) {
+                       DBG_BUGON(rq->in[i] == NULL);
+                       for (j = 0; j < nrpages_out - nrpages_in + i; ++j)
+                               if (rq->out[j] == rq->in[i])
+                                       goto docopy;
+               }
+       }
+
+       if (nrpages_in <= 1) {
+               *maptype = 0;
+               return inpage;
+       }
+       kunmap_atomic(inpage);
+       might_sleep();
+       src = erofs_vm_map_ram(rq->in, nrpages_in);
+       if (!src)
+               return ERR_PTR(-ENOMEM);
+       *maptype = 1;
+       return src;
+
+docopy:
+       /* Or copy compressed data which can be overlapped to per-CPU buffer */
+       in = rq->in;
+       src = erofs_get_pcpubuf(nrpages_in);
+       if (!src) {
+               DBG_BUGON(1);
+               kunmap_atomic(inpage);
+               return ERR_PTR(-EFAULT);
+       }
+
+       tmp = src;
+       total = rq->inputsize;
+       while (total) {
+               unsigned int page_copycnt =
+                       min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
+
+               if (!inpage)
+                       inpage = kmap_atomic(*in);
+               memcpy(tmp, inpage + *inputmargin, page_copycnt);
+               kunmap_atomic(inpage);
+               inpage = NULL;
+               tmp += page_copycnt;
+               total -= page_copycnt;
                ++in;
+               *inputmargin = 0;
        }
-       return tmp;
+       *maptype = 2;
+       return src;
 }
 
 static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
 {
-       unsigned int inputmargin, inlen;
-       u8 *src;
-       bool copied, support_0padding;
-       int ret;
-
-       if (rq->inputsize > PAGE_SIZE)
-               return -EOPNOTSUPP;
+       unsigned int inputmargin;
+       u8 *headpage, *src;
+       bool support_0padding;
+       int ret, maptype;
 
-       src = kmap_atomic(*rq->in);
+       DBG_BUGON(*rq->in == NULL);
+       headpage = kmap_atomic(*rq->in);
        inputmargin = 0;
        support_0padding = false;
 
        /* decompression inplace is only safe when 0padding is enabled */
-       if (EROFS_SB(rq->sb)->feature_incompat &
-           EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) {
+       if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) {
                support_0padding = true;
 
-               while (!src[inputmargin & ~PAGE_MASK])
+               while (!headpage[inputmargin & ~PAGE_MASK])
                        if (!(++inputmargin & ~PAGE_MASK))
                                break;
 
                if (inputmargin >= rq->inputsize) {
-                       kunmap_atomic(src);
+                       kunmap_atomic(headpage);
                        return -EIO;
                }
        }
 
-       copied = false;
-       inlen = rq->inputsize - inputmargin;
-       if (rq->inplace_io) {
-               const uint oend = (rq->pageofs_out +
-                                  rq->outputsize) & ~PAGE_MASK;
-               const uint nr = PAGE_ALIGN(rq->pageofs_out +
-                                          rq->outputsize) >> PAGE_SHIFT;
-
-               if (rq->partial_decoding || !support_0padding ||
-                   rq->out[nr - 1] != rq->in[0] ||
-                   rq->inputsize - oend <
-                     LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
-                       src = generic_copy_inplace_data(rq, src, inputmargin);
-                       inputmargin = 0;
-                       copied = true;
-               }
-       }
+       rq->inputsize -= inputmargin;
+       src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
+                                       support_0padding);
+       if (IS_ERR(src))
+               return PTR_ERR(src);
 
        /* legacy format could compress extra data in a pcluster. */
        if (rq->partial_decoding || !support_0padding)
                ret = LZ4_decompress_safe_partial(src + inputmargin, out,
-                                                 inlen, rq->outputsize,
-                                                 rq->outputsize);
+                               rq->inputsize, rq->outputsize, rq->outputsize);
        else
                ret = LZ4_decompress_safe(src + inputmargin, out,
-                                         inlen, rq->outputsize);
+                                         rq->inputsize, rq->outputsize);
 
        if (ret != rq->outputsize) {
                erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
-                         ret, inlen, inputmargin, rq->outputsize);
+                         ret, rq->inputsize, inputmargin, rq->outputsize);
 
                WARN_ON(1);
                print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
-                              16, 1, src + inputmargin, inlen, true);
+                              16, 1, src + inputmargin, rq->inputsize, true);
                print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
                               16, 1, out, rq->outputsize, true);
 
@@ -181,10 +245,16 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
                ret = -EIO;
        }
 
-       if (copied)
-               erofs_put_pcpubuf(src);
-       else
+       if (maptype == 0) {
                kunmap_atomic(src);
+       } else if (maptype == 1) {
+               vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT);
+       } else if (maptype == 2) {
+               erofs_put_pcpubuf(src);
+       } else {
+               DBG_BUGON(1);
+               return -EFAULT;
+       }
        return ret;
 }
 
@@ -234,57 +304,51 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
        const struct z_erofs_decompressor *alg = decompressors + rq->alg;
        unsigned int dst_maptype;
        void *dst;
-       int ret, i;
+       int ret;
 
-       if (nrpages_out == 1 && !rq->inplace_io) {
-               DBG_BUGON(!*rq->out);
-               dst = kmap_atomic(*rq->out);
-               dst_maptype = 0;
-               goto dstmap_out;
-       }
+       /* two optimized fast paths only for non bigpcluster cases yet */
+       if (rq->inputsize <= PAGE_SIZE) {
+               if (nrpages_out == 1 && !rq->inplace_io) {
+                       DBG_BUGON(!*rq->out);
+                       dst = kmap_atomic(*rq->out);
+                       dst_maptype = 0;
+                       goto dstmap_out;
+               }
 
-       /*
-        * For the case of small output size (especially much less
-        * than PAGE_SIZE), memcpy the decompressed data rather than
-        * compressed data is preferred.
-        */
-       if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
-               dst = erofs_get_pcpubuf(0);
-               if (IS_ERR(dst))
-                       return PTR_ERR(dst);
-
-               rq->inplace_io = false;
-               ret = alg->decompress(rq, dst);
-               if (!ret)
-                       copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
-                                         rq->outputsize);
-
-               erofs_put_pcpubuf(dst);
-               return ret;
+               /*
+                * For the case of small output size (especially much less
+                * than PAGE_SIZE), memcpy the decompressed data rather than
+                * compressed data is preferred.
+                */
+               if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
+                       dst = erofs_get_pcpubuf(1);
+                       if (IS_ERR(dst))
+                               return PTR_ERR(dst);
+
+                       rq->inplace_io = false;
+                       ret = alg->decompress(rq, dst);
+                       if (!ret)
+                               copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
+                                                 rq->outputsize);
+
+                       erofs_put_pcpubuf(dst);
+                       return ret;
+               }
        }
 
+       /* general decoding path which can be used for all cases */
        ret = alg->prepare_destpages(rq, pagepool);
-       if (ret < 0) {
+       if (ret < 0)
                return ret;
-       } else if (ret) {
+       if (ret) {
                dst = page_address(*rq->out);
                dst_maptype = 1;
                goto dstmap_out;
        }
 
-       i = 0;
-       while (1) {
-               dst = vm_map_ram(rq->out, nrpages_out, -1);
-
-               /* retry two more times (totally 3 times) */
-               if (dst || ++i >= 3)
-                       break;
-               vm_unmap_aliases();
-       }
-
+       dst = erofs_vm_map_ram(rq->out, nrpages_out);
        if (!dst)
                return -ENOMEM;
-
        dst_maptype = 2;
 
 dstmap_out:
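
The rewrite above also centralizes when inplace I/O is actually safe:
partial decoding, missing 0padding, an input page reused as an output
page, or too little tail margin all force the per-CPU-buffer copy
fallback (maptype 2). A userspace sketch of the margin arithmetic,
assuming 4KiB pages and the lz4 margin formula
LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) == (srcsize >> 8) + 32:

    #include <stdio.h>

    #define PAGE_SIZE 4096u /* assumption: 4KiB pages */
    #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32u)

    /* mirrors the rq->inplace_io check in z_erofs_handle_inplace_io():
     * the page-aligned output must keep enough tail slack so in-place
     * decompression never overwrites unread compressed bytes */
    static int inplace_is_safe(unsigned int pageofs_out,
                               unsigned int outputsize, unsigned int inputsize)
    {
            unsigned int oend = pageofs_out + outputsize;
            unsigned int ofull = (oend + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);

            return ofull - oend >= LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize);
    }

    int main(void)
    {
            /* 20000B of output in 5 pages leaves 480B of slack, above the
             * (8192 >> 8) + 32 = 64B margin -> prints 1 (safe) */
            printf("%d\n", inplace_is_safe(0, 20000, 8192));
            return 0;
    }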
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 9ad1615..8739d3a 100644
  * be incompatible with this kernel version.
  */
 #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING    0x00000001
-#define EROFS_ALL_FEATURE_INCOMPAT             EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
+#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS      0x00000002
+#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER    0x00000002
+#define EROFS_ALL_FEATURE_INCOMPAT             \
+       (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
+        EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
+        EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER)
 
-/* 128-byte erofs on-disk super block */
+#define EROFS_SB_EXTSLOT_SIZE  16
+
+/* erofs on-disk super block (currently 128 bytes) */
 struct erofs_super_block {
        __le32 magic;           /* file system magic number */
        __le32 checksum;        /* crc32c(super_block) */
        __le32 feature_compat;
        __u8 blkszbits;         /* support block_size == PAGE_SIZE only */
-       __u8 reserved;
+       __u8 sb_extslots;       /* superblock size = 128 + sb_extslots * 16 */
 
        __le16 root_nid;        /* nid of root directory */
        __le64 inos;            /* total valid ino # (== f_files - f_favail) */
@@ -39,7 +46,13 @@ struct erofs_super_block {
        __u8 uuid[16];          /* 128-bit uuid for volume */
        __u8 volume_name[16];   /* volume name */
        __le32 feature_incompat;
-       __u8 reserved2[44];
+       union {
+               /* bitmap for available compression algorithms */
+               __le16 available_compr_algs;
+               /* customized sliding window size instead of 64k by default */
+               __le16 lz4_max_distance;
+       } __packed u1;
+       __u8 reserved2[42];
 };
 
 /*
@@ -75,6 +88,9 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
 #define EROFS_I_VERSION_BIT             0
 #define EROFS_I_DATALAYOUT_BIT          1
 
+#define EROFS_I_ALL    \
+       ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1)
+
 /* 32-byte reduced form of an ondisk inode */
 struct erofs_inode_compact {
        __le16 i_format;        /* inode format hints */
@@ -189,20 +205,33 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
                                 e->e_name_len + le16_to_cpu(e->e_value_size));
 }
 
+/* maximum supported size of a physical compression cluster */
+#define Z_EROFS_PCLUSTER_MAX_SIZE      (1024 * 1024)
+
 /* available compression algorithm types (for h_algorithmtype) */
 enum {
        Z_EROFS_COMPRESSION_LZ4 = 0,
        Z_EROFS_COMPRESSION_MAX
 };
+#define Z_EROFS_ALL_COMPR_ALGS         (1 << (Z_EROFS_COMPRESSION_MAX - 1))
+
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lz4_cfgs {
+       __le16 max_distance;
+       __le16 max_pclusterblks;
+       u8 reserved[10];
+} __packed;
 
 /*
  * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
  *  e.g. for 4k logical cluster size,      4B        if compacted 2B is off;
  *                                  (4B) + 2B + (4B) if compacted 2B is on.
+ * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
+ * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
  */
-#define Z_EROFS_ADVISE_COMPACTED_2B_BIT         0
-
-#define Z_EROFS_ADVISE_COMPACTED_2B     (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
+#define Z_EROFS_ADVISE_COMPACTED_2B            0x0001
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_1          0x0002
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_2          0x0004
 
 struct z_erofs_map_header {
        __le32  h_reserved1;
@@ -214,9 +243,7 @@ struct z_erofs_map_header {
        __u8    h_algorithmtype;
        /*
         * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
-        * bit 3-4 : (physical - logical) cluster bits of head 1:
-        *       For example, if logical clustersize = 4096, 1 for 8192.
-        * bit 5-7 : (physical - logical) cluster bits of head 2.
+        * bit 3-7 : reserved.
         */
        __u8    h_clusterbits;
 };
@@ -259,6 +286,13 @@ enum {
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS        2
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT         0
 
+/*
+ * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
+ * compressed block count of a compressed extent (in logical clusters, aka.
+ * block count of a pcluster).
+ */
+#define Z_EROFS_VLE_DI_D0_CBLKCNT              (1 << 11)
+
 struct z_erofs_vle_decompressed_index {
        __le16 di_advise;
        /* where to decompress in the head cluster */
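
The z_erofs_lz4_cfgs record above is what lets an image shrink the lz4
sliding window: max_distance feeds the rolling decompressor's page
budget, computed in z_erofs_load_lz4_config(). A userspace sketch of
that conversion, assuming 4KiB pages (so the 64KiB lz4 default maps to
17 pages):

    #include <stdio.h>

    #define PAGE_SIZE 4096u            /* assumption: 4KiB pages */
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
    #define LZ4_MAX_DISTANCE_PAGES 17u /* DIV_ROUND_UP(65535, PAGE_SIZE) + 1 */

    /* mirrors the distance -> pages conversion in z_erofs_load_lz4_config() */
    static unsigned int max_distance_pages(unsigned int distance)
    {
            return distance ? DIV_ROUND_UP(distance, PAGE_SIZE) + 1
                            : LZ4_MAX_DISTANCE_PAGES;
    }

    int main(void)
    {
            /* 0 keeps the 64KiB default (17 pages); a 4KiB window needs 2 */
            printf("%u %u\n", max_distance_pages(0), max_distance_pages(4096));
            return 0;
    }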
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 119fdce..7ed2d73 100644
@@ -44,6 +44,13 @@ static struct page *erofs_read_inode(struct inode *inode,
        dic = page_address(page) + *ofs;
        ifmt = le16_to_cpu(dic->i_format);
 
+       if (ifmt & ~EROFS_I_ALL) {
+               erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+                         ifmt, vi->nid);
+               err = -EOPNOTSUPP;
+               goto err_out;
+       }
+
        vi->datalayout = erofs_inode_datalayout(ifmt);
        if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
                erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 351dae5..f92e3e3 100644
@@ -50,6 +50,8 @@ struct erofs_fs_context {
 #ifdef CONFIG_EROFS_FS_ZIP
        /* current strategy of how to use managed cache */
        unsigned char cache_strategy;
+       /* strategy of sync decompression (false - auto, true - force on) */
+       bool readahead_sync_decompress;
 
        /* threshold for decompression synchronously */
        unsigned int max_sync_decompress_pages;
@@ -57,6 +59,14 @@ struct erofs_fs_context {
        unsigned int mount_opt;
 };
 
+/* all filesystem-wide lz4 configurations */
+struct erofs_sb_lz4_info {
+       /* # of pages needed for EROFS lz4 rolling decompression */
+       u16 max_distance_pages;
+       /* maximum possible blocks for pclusters in the filesystem */
+       u16 max_pclusterblks;
+};
+
 struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
        /* list for all registered superblocks, mainly for shrinker */
@@ -67,9 +77,12 @@ struct erofs_sb_info {
        struct xarray managed_pslots;
 
        unsigned int shrinker_run_no;
+       u16 available_compr_algs;
 
        /* pseudo inode to manage cached pages */
        struct inode *managed_cache;
+
+       struct erofs_sb_lz4_info lz4;
 #endif /* CONFIG_EROFS_FS_ZIP */
        u32 blocks;
        u32 meta_blkaddr;
@@ -80,6 +93,7 @@ struct erofs_sb_info {
        /* inode slot unit size in bit shift */
        unsigned char islotbits;
 
+       u32 sb_size;                    /* total superblock size */
        u32 build_time_nsec;
        u64 build_time;
 
@@ -182,12 +196,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
        return v;
 }
 #endif /* !CONFIG_SMP */
-
-/* hard limit of pages per compressed cluster */
-#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
-#define EROFS_PCPUBUF_NR_PAGES          Z_EROFS_CLUSTER_MAX_PAGES
-#else
-#define EROFS_PCPUBUF_NR_PAGES          0
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
 /* we strictly follow PAGE_SIZE and no buffer head yet */
@@ -216,6 +224,17 @@ static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
        return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
 }
 
+#define EROFS_FEATURE_FUNCS(name, compat, feature) \
+static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
+{ \
+       return sbi->feature_##compat & EROFS_FEATURE_##feature; \
+}
+
+EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
+EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
+EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
+
 /* atomic flag definitions */
 #define EROFS_I_EA_INITED_BIT  0
 #define EROFS_I_Z_INITED_BIT   1
@@ -244,7 +263,6 @@ struct erofs_inode {
                        unsigned short z_advise;
                        unsigned char  z_algorithmtype[2];
                        unsigned char  z_logical_clusterbits;
-                       unsigned char  z_physical_clusterbits[2];
                };
 #endif /* CONFIG_EROFS_FS_ZIP */
        };
@@ -287,7 +305,7 @@ extern const struct address_space_operations erofs_raw_access_aops;
 extern const struct address_space_operations z_erofs_aops;
 
 /*
- * Logical to physical block mapping, used by erofs_map_blocks()
+ * Logical to physical block mapping
  *
  * Different with other file systems, it is used for 2 access modes:
  *
@@ -334,7 +352,7 @@ struct erofs_map_blocks {
        struct page *mpage;
 };
 
-/* Flags used by erofs_map_blocks() */
+/* Flags used by erofs_map_blocks_flatmode() */
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* zmap.c */
@@ -356,8 +374,6 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
 /* data.c */
 struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
 
-int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-
 /* inode.c */
 static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
 {
@@ -386,23 +402,30 @@ int erofs_namei(struct inode *dir, struct qstr *name,
 /* dir.c */
 extern const struct file_operations erofs_dir_fops;
 
-/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
-
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-void *erofs_get_pcpubuf(unsigned int pagenr);
-#define erofs_put_pcpubuf(buf) do { \
-       (void)&(buf);   \
-       preempt_enable();       \
-} while (0)
-#else
-static inline void *erofs_get_pcpubuf(unsigned int pagenr)
+static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count)
 {
-       return ERR_PTR(-EOPNOTSUPP);
+       int retried = 0;
+
+       while (1) {
+               void *p = vm_map_ram(pages, count, -1);
+
+               /* retry two more times (totally 3 times) */
+               if (p || ++retried >= 3)
+                       return p;
+               vm_unmap_aliases();
+       }
+       return NULL;
 }
 
-#define erofs_put_pcpubuf(buf) do {} while (0)
-#endif
+/* pcpubuf.c */
+void *erofs_get_pcpubuf(unsigned int requiredpages);
+void erofs_put_pcpubuf(void *ptr);
+int erofs_pcpubuf_growsize(unsigned int nrpages);
+void erofs_pcpubuf_init(void);
+void erofs_pcpubuf_exit(void);
+
+/* utils.c / zdata.c */
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
 #ifdef CONFIG_EROFS_FS_ZIP
 int erofs_workgroup_put(struct erofs_workgroup *grp);
@@ -421,6 +444,9 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
                                       struct erofs_workgroup *egrp);
 int erofs_try_to_free_cached_page(struct address_space *mapping,
                                  struct page *page);
+int z_erofs_load_lz4_config(struct super_block *sb,
+                           struct erofs_super_block *dsb,
+                           struct z_erofs_lz4_cfgs *lz4, int len);
 #else
 static inline void erofs_shrinker_register(struct super_block *sb) {}
 static inline void erofs_shrinker_unregister(struct super_block *sb) {}
@@ -428,6 +454,16 @@ static inline int erofs_init_shrinker(void) { return 0; }
 static inline void erofs_exit_shrinker(void) {}
 static inline int z_erofs_init_zip_subsystem(void) { return 0; }
 static inline void z_erofs_exit_zip_subsystem(void) {}
+static inline int z_erofs_load_lz4_config(struct super_block *sb,
+                                 struct erofs_super_block *dsb,
+                                 struct z_erofs_lz4_cfgs *lz4, int len)
+{
+       if (lz4 || dsb->u1.lz4_max_distance) {
+               erofs_err(sb, "lz4 algorithm isn't enabled");
+               return -EINVAL;
+       }
+       return 0;
+}
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
 #define EFSCORRUPTED    EUCLEAN         /* Filesystem is corrupted */
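
Each EROFS_FEATURE_FUNCS() line above stamps out a by-name feature
predicate; for instance, the big_pcluster invocation expands to:

    static inline bool erofs_sb_has_big_pcluster(struct erofs_sb_info *sbi)
    {
            return sbi->feature_incompat & EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER;
    }

which is what allows super.c below to write erofs_sb_has_sb_chksum(sbi)
and erofs_sb_has_compr_cfgs(sbi) instead of open-coding bitmask tests.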
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
new file mode 100644
index 0000000..6c88557
--- /dev/null
+++ b/fs/erofs/pcpubuf.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) Gao Xiang <xiang@kernel.org>
+ *
+ * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
+ * per-CPU virtual memory (in pages) in advance to store such inplace I/O
+ * data if inplace decompression is failed (due to unmet inplace margin for
+ * example).
+ */
+#include "internal.h"
+
+struct erofs_pcpubuf {
+       raw_spinlock_t lock;
+       void *ptr;
+       struct page **pages;
+       unsigned int nrpages;
+};
+
+static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
+
+void *erofs_get_pcpubuf(unsigned int requiredpages)
+       __acquires(pcb->lock)
+{
+       struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
+
+       raw_spin_lock(&pcb->lock);
+       /* check if the per-CPU buffer is too small */
+       if (requiredpages > pcb->nrpages) {
+               raw_spin_unlock(&pcb->lock);
+               put_cpu_var(erofs_pcb);
+               /* (for sparse checker) pretend pcb->lock is still taken */
+               __acquire(pcb->lock);
+               return NULL;
+       }
+       return pcb->ptr;
+}
+
+void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
+{
+       struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
+
+       DBG_BUGON(pcb->ptr != ptr);
+       raw_spin_unlock(&pcb->lock);
+       put_cpu_var(erofs_pcb);
+}
+
+/* the next step: support per-CPU page buffers hotplug */
+int erofs_pcpubuf_growsize(unsigned int nrpages)
+{
+       static DEFINE_MUTEX(pcb_resize_mutex);
+       static unsigned int pcb_nrpages;
+       LIST_HEAD(pagepool);
+       int delta, cpu, ret, i;
+
+       mutex_lock(&pcb_resize_mutex);
+       delta = nrpages - pcb_nrpages;
+       ret = 0;
+       /* avoid shrinking pcpubuf, since no idea how many fses rely on */
+       if (delta <= 0)
+               goto out;
+
+       for_each_possible_cpu(cpu) {
+               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+               struct page **pages, **oldpages;
+               void *ptr, *old_ptr;
+
+               pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
+               if (!pages) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               for (i = 0; i < nrpages; ++i) {
+                       pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
+                       if (!pages[i]) {
+                               ret = -ENOMEM;
+                               oldpages = pages;
+                               goto free_pagearray;
+                       }
+               }
+               ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
+               if (!ptr) {
+                       ret = -ENOMEM;
+                       oldpages = pages;
+                       goto free_pagearray;
+               }
+               raw_spin_lock(&pcb->lock);
+               old_ptr = pcb->ptr;
+               pcb->ptr = ptr;
+               oldpages = pcb->pages;
+               pcb->pages = pages;
+               i = pcb->nrpages;
+               pcb->nrpages = nrpages;
+               raw_spin_unlock(&pcb->lock);
+
+               if (!oldpages) {
+                       DBG_BUGON(old_ptr);
+                       continue;
+               }
+
+               if (old_ptr)
+                       vunmap(old_ptr);
+free_pagearray:
+               while (i)
+                       list_add(&oldpages[--i]->lru, &pagepool);
+               kfree(oldpages);
+               if (ret)
+                       break;
+       }
+       pcb_nrpages = nrpages;
+       put_pages_list(&pagepool);
+out:
+       mutex_unlock(&pcb_resize_mutex);
+       return ret;
+}
+
+void erofs_pcpubuf_init(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+               raw_spin_lock_init(&pcb->lock);
+       }
+}
+
+void erofs_pcpubuf_exit(void)
+{
+       int cpu, i;
+
+       for_each_possible_cpu(cpu) {
+               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+               if (pcb->ptr) {
+                       vunmap(pcb->ptr);
+                       pcb->ptr = NULL;
+               }
+               if (!pcb->pages)
+                       continue;
+
+               for (i = 0; i < pcb->nrpages; ++i)
+                       if (pcb->pages[i])
+                               put_page(pcb->pages[i]);
+               kfree(pcb->pages);
+               pcb->pages = NULL;
+       }
+}
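
One subtlety of the new per-CPU buffer API above: erofs_get_pcpubuf()
returns with pcb->lock held and preemption disabled, so callers must
stay atomic until erofs_put_pcpubuf(). A sketch of the expected calling
pattern (hypothetical caller, modelled on the docopy path in
decompressor.c):

    /* hypothetical illustration, not part of the patch */
    static int copy_through_pcpubuf(struct page **in, unsigned int nrpages)
    {
            void *buf = erofs_get_pcpubuf(nrpages);

            if (!buf)       /* buffer still too small: erofs_pcpubuf_growsize()
                             * should have reserved enough pages at mount time */
                    return -EFAULT;

            /* ...copy the compressed pages into buf and decode from it;
             * no sleeping here since the per-CPU lock is held... */

            erofs_put_pcpubuf(buf); /* unlocks, re-enables preemption */
            return 0;
    }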
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index d5a6b9b..bbf3bbd 100644
@@ -122,6 +122,136 @@ static bool check_layout_compatibility(struct super_block *sb,
        return true;
 }
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* read variable-sized metadata, offset will be aligned by 4-byte */
+static void *erofs_read_metadata(struct super_block *sb, struct page **pagep,
+                                erofs_off_t *offset, int *lengthp)
+{
+       struct page *page = *pagep;
+       u8 *buffer, *ptr;
+       int len, i, cnt;
+       erofs_blk_t blk;
+
+       *offset = round_up(*offset, 4);
+       blk = erofs_blknr(*offset);
+
+       if (!page || page->index != blk) {
+               if (page) {
+                       unlock_page(page);
+                       put_page(page);
+               }
+               page = erofs_get_meta_page(sb, blk);
+               if (IS_ERR(page))
+                       goto err_nullpage;
+       }
+
+       ptr = kmap(page);
+       len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]);
+       if (!len)
+               len = U16_MAX + 1;
+       buffer = kmalloc(len, GFP_KERNEL);
+       if (!buffer) {
+               buffer = ERR_PTR(-ENOMEM);
+               goto out;
+       }
+       *offset += sizeof(__le16);
+       *lengthp = len;
+
+       for (i = 0; i < len; i += cnt) {
+               cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i);
+               blk = erofs_blknr(*offset);
+
+               if (!page || page->index != blk) {
+                       if (page) {
+                               kunmap(page);
+                               unlock_page(page);
+                               put_page(page);
+                       }
+                       page = erofs_get_meta_page(sb, blk);
+                       if (IS_ERR(page)) {
+                               kfree(buffer);
+                               goto err_nullpage;
+                       }
+                       ptr = kmap(page);
+               }
+               memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt);
+               *offset += cnt;
+       }
+out:
+       kunmap(page);
+       *pagep = page;
+       return buffer;
+err_nullpage:
+       *pagep = NULL;
+       return page;
+}
+
+static int erofs_load_compr_cfgs(struct super_block *sb,
+                                struct erofs_super_block *dsb)
+{
+       struct erofs_sb_info *sbi;
+       struct page *page;
+       unsigned int algs, alg;
+       erofs_off_t offset;
+       int size, ret;
+
+       sbi = EROFS_SB(sb);
+       sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
+
+       if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
+               erofs_err(sb, "try to load compressed fs with unsupported algorithms %x",
+                         sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
+               return -EINVAL;
+       }
+
+       offset = EROFS_SUPER_OFFSET + sbi->sb_size;
+       page = NULL;
+       alg = 0;
+       ret = 0;
+
+       for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+               void *data;
+
+               if (!(algs & 1))
+                       continue;
+
+               data = erofs_read_metadata(sb, &page, &offset, &size);
+               if (IS_ERR(data)) {
+                       ret = PTR_ERR(data);
+                       goto err;
+               }
+
+               switch (alg) {
+               case Z_EROFS_COMPRESSION_LZ4:
+                       ret = z_erofs_load_lz4_config(sb, dsb, data, size);
+                       break;
+               default:
+                       DBG_BUGON(1);
+                       ret = -EFAULT;
+               }
+               kfree(data);
+               if (ret)
+                       goto err;
+       }
+err:
+       if (page) {
+               unlock_page(page);
+               put_page(page);
+       }
+       return ret;
+}
+#else
+static int erofs_load_compr_cfgs(struct super_block *sb,
+                                struct erofs_super_block *dsb)
+{
+       if (dsb->u1.available_compr_algs) {
+               erofs_err(sb, "try to load compressed fs when compression is disabled");
+               return -EINVAL;
+       }
+       return 0;
+}
+#endif
+
 static int erofs_read_superblock(struct super_block *sb)
 {
        struct erofs_sb_info *sbi;
@@ -149,7 +279,7 @@ static int erofs_read_superblock(struct super_block *sb)
        }
 
        sbi->feature_compat = le32_to_cpu(dsb->feature_compat);
-       if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) {
+       if (erofs_sb_has_sb_chksum(sbi)) {
                ret = erofs_superblock_csum_verify(sb, data);
                if (ret)
                        goto out;
@@ -166,6 +296,12 @@ static int erofs_read_superblock(struct super_block *sb)
        if (!check_layout_compatibility(sb, dsb))
                goto out;
 
+       sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
+       if (sbi->sb_size > EROFS_BLKSIZ) {
+               erofs_err(sb, "invalid sb_extslots %u (more than a fs block)",
+                         sbi->sb_size);
+               goto out;
+       }
        sbi->blocks = le32_to_cpu(dsb->blocks);
        sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -187,7 +323,12 @@ static int erofs_read_superblock(struct super_block *sb)
                ret = -EFSCORRUPTED;
                goto out;
        }
-       ret = 0;
+
+       /* parse on-disk compression configurations */
+       if (erofs_sb_has_compr_cfgs(sbi))
+               ret = erofs_load_compr_cfgs(sb, dsb);
+       else
+               ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
 out:
        kunmap(page);
        put_page(page);
@@ -200,6 +341,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx)
 #ifdef CONFIG_EROFS_FS_ZIP
        ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
        ctx->max_sync_decompress_pages = 3;
+       ctx->readahead_sync_decompress = false;
 #endif
 #ifdef CONFIG_EROFS_FS_XATTR
        set_opt(ctx, XATTR_USER);
@@ -513,6 +655,7 @@ static int __init erofs_module_init(void)
        if (err)
                goto shrinker_err;
 
+       erofs_pcpubuf_init();
        err = z_erofs_init_zip_subsystem();
        if (err)
                goto zip_err;
@@ -542,6 +685,7 @@ static void __exit erofs_module_exit(void)
        /* Ensure all RCU free inodes are safe before cache is destroyed. */
        rcu_barrier();
        kmem_cache_destroy(erofs_inode_cachep);
+       erofs_pcpubuf_exit();
 }
 
 /* get filesystem statistics */
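
For orientation, the metadata that erofs_load_compr_cfgs() walks above
sits directly after the (possibly extended) superblock; each algorithm
bit set in available_compr_algs contributes one 4-byte-aligned,
length-prefixed record. A rough sketch of that on-disk layout (only lz4
records exist so far):

    EROFS_SUPER_OFFSET + sb_size
     |
     v
     +--------+--------------------------+--------+--------------------+
     | __le16 | z_erofs_lz4_cfgs payload | __le16 | next algorithm's   |
     | length | (14 bytes)               | length | payload, if any    |
     +--------+--------------------------+--------+--------------------+

A record may cross a block boundary, which is why erofs_read_metadata()
re-grabs the meta page whenever erofs_blknr(*offset) changes.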
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index de9986d..6758c5b 100644
@@ -21,18 +21,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
        return page;
 }
 
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-static struct {
-       u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
-} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
-
-void *erofs_get_pcpubuf(unsigned int pagenr)
-{
-       preempt_disable();
-       return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
-}
-#endif
-
 #ifdef CONFIG_EROFS_FS_ZIP
 /* global shrink count (for all mounted EROFS instances) */
 static atomic_long_t erofs_global_shrink_cnt;
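
The zdata.c diff below replaces the single fixed-size pcluster slab with
size-classed pools; z_erofs_alloc_pcluster() picks the smallest class
whose maxpages covers the request. A userspace sketch of that selection
(assuming 4KiB pages, so Z_EROFS_PCLUSTER_MAX_PAGES is 256):

    #include <stdio.h>

    /* size classes matching pcluster_pool[] below; 256 assumes
     * Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE with 4KiB pages */
    static const unsigned int maxpages[] = { 1, 4, 16, 64, 128, 256 };

    /* mirrors z_erofs_alloc_pcluster(): pick the smallest class that fits */
    static int pick_slab(unsigned int nrpages)
    {
            unsigned int i;

            for (i = 0; i < sizeof(maxpages) / sizeof(*maxpages); ++i)
                    if (nrpages <= maxpages[i])
                            return maxpages[i];
            return -1;      /* larger than any class: -EINVAL in the kernel */
    }

    int main(void)
    {
            /* a 9-page pcluster comes from the 16-page slab; 300 is rejected */
            printf("%d %d\n", pick_slab(9), pick_slab(300));
            return 0;
    }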
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 3851e1a..78e4b59 100644
 
 #include <trace/events/erofs.h>
 
+/*
+ * since pclustersize is variable for big pcluster feature, introduce slab
+ * pools implementation for different pcluster sizes.
+ */
+struct z_erofs_pcluster_slab {
+       struct kmem_cache *slab;
+       unsigned int maxpages;
+       char name[48];
+};
+
+#define _PCLP(n) { .maxpages = n }
+
+static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
+       _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
+       _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+};
+
+static void z_erofs_destroy_pcluster_pool(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+               if (!pcluster_pool[i].slab)
+                       continue;
+               kmem_cache_destroy(pcluster_pool[i].slab);
+               pcluster_pool[i].slab = NULL;
+       }
+}
+
+static int z_erofs_create_pcluster_pool(void)
+{
+       struct z_erofs_pcluster_slab *pcs;
+       struct z_erofs_pcluster *a;
+       unsigned int size;
+
+       for (pcs = pcluster_pool;
+            pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
+               size = struct_size(a, compressed_pages, pcs->maxpages);
+
+               sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
+               pcs->slab = kmem_cache_create(pcs->name, size, 0,
+                                             SLAB_RECLAIM_ACCOUNT, NULL);
+               if (pcs->slab)
+                       continue;
+
+               z_erofs_destroy_pcluster_pool();
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+               struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+               struct z_erofs_pcluster *pcl;
+
+               if (nrpages > pcs->maxpages)
+                       continue;
+
+               pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+               if (!pcl)
+                       return ERR_PTR(-ENOMEM);
+               pcl->pclusterpages = nrpages;
+               return pcl;
+       }
+       return ERR_PTR(-EINVAL);
+}
+
+static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+               struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+
+               if (pcl->pclusterpages > pcs->maxpages)
+                       continue;
+
+               kmem_cache_free(pcs->slab, pcl);
+               return;
+       }
+       DBG_BUGON(1);
+}
+
 /*
  * a compressed_pages[] placeholder in order to avoid
  * being filled with file pages for in-place decompression.
@@ -37,12 +124,11 @@ typedef tagptr1_t compressed_page_t;
        tagptr_fold(compressed_page_t, page, 1)
 
 static struct workqueue_struct *z_erofs_workqueue __read_mostly;
-static struct kmem_cache *pcluster_cachep __read_mostly;
 
 void z_erofs_exit_zip_subsystem(void)
 {
        destroy_workqueue(z_erofs_workqueue);
-       kmem_cache_destroy(pcluster_cachep);
+       z_erofs_destroy_pcluster_pool();
 }
 
 static inline int z_erofs_init_workqueue(void)
@@ -59,32 +145,16 @@ static inline int z_erofs_init_workqueue(void)
        return z_erofs_workqueue ? 0 : -ENOMEM;
 }
 
-static void z_erofs_pcluster_init_once(void *ptr)
-{
-       struct z_erofs_pcluster *pcl = ptr;
-       struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
-       unsigned int i;
-
-       mutex_init(&cl->lock);
-       cl->nr_pages = 0;
-       cl->vcnt = 0;
-       for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
-               pcl->compressed_pages[i] = NULL;
-}
-
 int __init z_erofs_init_zip_subsystem(void)
 {
-       pcluster_cachep = kmem_cache_create("erofs_compress",
-                                           Z_EROFS_WORKGROUP_SIZE, 0,
-                                           SLAB_RECLAIM_ACCOUNT,
-                                           z_erofs_pcluster_init_once);
-       if (pcluster_cachep) {
-               if (!z_erofs_init_workqueue())
-                       return 0;
-
-               kmem_cache_destroy(pcluster_cachep);
-       }
-       return -ENOMEM;
+       int err = z_erofs_create_pcluster_pool();
+
+       if (err)
+               return err;
+       err = z_erofs_init_workqueue();
+       if (err)
+               z_erofs_destroy_pcluster_pool();
+       return err;
 }
 
 enum z_erofs_collectmode {
@@ -104,6 +174,12 @@ enum z_erofs_collectmode {
         * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
         */
        COLLECT_PRIMARY_HOOKED,
+       /*
+        * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it
+        * could be dispatched into bypass queue later due to uptodated managed
+        * pages. All related online pages cannot be reused for inplace I/O (or
+        * pagevec) since it can be directly decoded without I/O submission.
+        */
        COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
        /*
         * The current collection has been linked with the owned chain, and
@@ -128,7 +204,8 @@ struct z_erofs_collector {
 
        struct z_erofs_pcluster *pcl, *tailpcl;
        struct z_erofs_collection *cl;
-       struct page **compressedpages;
+       /* a pointer used to pick up inplace I/O pages */
+       struct page **icpage_ptr;
        z_erofs_next_pcluster_t owned_head;
 
        enum z_erofs_collectmode mode;
@@ -162,18 +239,19 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
                                     enum z_erofs_cache_alloctype type,
                                     struct list_head *pagepool)
 {
-       const struct z_erofs_pcluster *pcl = clt->pcl;
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
-       struct page **pages = clt->compressedpages;
-       pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
+       struct z_erofs_pcluster *pcl = clt->pcl;
        bool standalone = true;
        gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
                        __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+       struct page **pages;
+       pgoff_t index;
 
        if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
                return;
 
-       for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
+       pages = pcl->compressed_pages;
+       index = pcl->obj.index;
+       for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) {
                struct page *page;
                compressed_page_t t;
                struct page *newpage = NULL;
@@ -186,21 +264,25 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
 
                if (page) {
                        t = tag_compressed_page_justfound(page);
-               } else if (type == DELAYEDALLOC) {
-                       t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
-               } else if (type == TRYALLOC) {
-                       newpage = erofs_allocpage(pagepool, gfp);
-                       if (!newpage)
-                               goto dontalloc;
-
-                       set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
-                       t = tag_compressed_page_justfound(newpage);
-               } else {        /* DONTALLOC */
-dontalloc:
-                       if (standalone)
-                               clt->compressedpages = pages;
+               } else {
+                       /* I/O is needed, no possible to decompress directly */
                        standalone = false;
-                       continue;
+                       switch (type) {
+                       case DELAYEDALLOC:
+                               t = tagptr_init(compressed_page_t,
+                                               PAGE_UNALLOCATED);
+                               break;
+                       case TRYALLOC:
+                               newpage = erofs_allocpage(pagepool, gfp);
+                               if (!newpage)
+                                       continue;
+                               set_page_private(newpage,
+                                                Z_EROFS_PREALLOCATED_PAGE);
+                               t = tag_compressed_page_justfound(newpage);
+                               break;
+                       default:        /* DONTALLOC */
+                               continue;
+                       }
                }
 
                if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
@@ -214,7 +296,11 @@ dontalloc:
                }
        }
 
-       if (standalone)         /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
+       /*
+        * don't do inplace I/O if all compressed pages are available in
+        * managed cache since it can be moved to the bypass queue instead.
+        */
+       if (standalone)
                clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
 }
 
@@ -225,14 +311,13 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
        struct z_erofs_pcluster *const pcl =
                container_of(grp, struct z_erofs_pcluster, obj);
        struct address_space *const mapping = MNGD_MAPPING(sbi);
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
        int i;
 
        /*
         * refcount of workgroup is now freezed as 1,
         * therefore no need to worry about available decompression users.
         */
-       for (i = 0; i < clusterpages; ++i) {
+       for (i = 0; i < pcl->pclusterpages; ++i) {
                struct page *page = pcl->compressed_pages[i];
 
                if (!page)
@@ -257,13 +342,12 @@ int erofs_try_to_free_cached_page(struct address_space *mapping,
                                  struct page *page)
 {
        struct z_erofs_pcluster *const pcl = (void *)page_private(page);
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
        int ret = 0;    /* 0 - busy */
 
        if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
                unsigned int i;
 
-               for (i = 0; i < clusterpages; ++i) {
+               for (i = 0; i < pcl->pclusterpages; ++i) {
                        if (pcl->compressed_pages[i] == page) {
                                WRITE_ONCE(pcl->compressed_pages[i], NULL);
                                ret = 1;
@@ -279,16 +363,14 @@ int erofs_try_to_free_cached_page(struct address_space *mapping,
 }
 
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
-static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
-                                         struct page *page)
+static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+                                  struct page *page)
 {
        struct z_erofs_pcluster *const pcl = clt->pcl;
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
 
-       while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
-               if (!cmpxchg(clt->compressedpages++, NULL, page))
+       while (clt->icpage_ptr > pcl->compressed_pages)
+               if (!cmpxchg(--clt->icpage_ptr, NULL, page))
                        return true;
-       }
        return false;
 }
 
@@ -399,10 +481,10 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
        struct erofs_workgroup *grp;
        int err;
 
-       /* no available workgroup, let's allocate one */
-       pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
-       if (!pcl)
-               return -ENOMEM;
+       /* no available pcluster, let's allocate one */
+       pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
+       if (IS_ERR(pcl))
+               return PTR_ERR(pcl);
 
        atomic_set(&pcl->obj.refcount, 1);
        pcl->obj.index = map->m_pa >> PAGE_SHIFT;
@@ -416,25 +498,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
        else
                pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
 
-       pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
-       pcl->clusterbits -= PAGE_SHIFT;
-
        /* new pclusters should be claimed as type 1, primary and followed */
        pcl->next = clt->owned_head;
        clt->mode = COLLECT_PRIMARY_FOLLOWED;
 
        cl = z_erofs_primarycollection(pcl);
-
-       /* must be cleaned before freeing to slab */
-       DBG_BUGON(cl->nr_pages);
-       DBG_BUGON(cl->vcnt);
-
        cl->pageofs = map->m_la & ~PAGE_MASK;
 
        /*
         * lock all primary followed works before they become visible to
         * others; mutex_trylock *never* fails for a new pcluster.
         */
+       mutex_init(&cl->lock);
        DBG_BUGON(!mutex_trylock(&cl->lock));
 
        grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
@@ -458,7 +533,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
 
 err_out:
        mutex_unlock(&cl->lock);
-       kmem_cache_free(pcluster_cachep, pcl);
+       z_erofs_free_pcluster(pcl);
        return err;
 }
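
z_erofs_alloc_pcluster()/z_erofs_free_pcluster() come from the slab-pool patch earlier in this series: one pool per pcluster size class instead of a single fixed-size cache. A rough userspace model of the idea (the pool sizes below are an assumption for illustration, and malloc()/calloc() stand in for per-size kmem caches):

    #include <stdlib.h>

    struct pcluster_model {
            unsigned short pclusterpages;   /* physical cluster size in pages */
            void *compressed_pages[];       /* flexible array, one slot per page */
    };

    /* illustrative size classes; the smallest pool that fits is used */
    static const unsigned int pool_pages[] = { 1, 2, 4, 8 };

    static struct pcluster_model *alloc_pcluster(unsigned int nrpages)
    {
            for (unsigned int i = 0; i < sizeof(pool_pages) / sizeof(*pool_pages); ++i) {
                    struct pcluster_model *pcl;

                    if (nrpages > pool_pages[i])
                            continue;
                    pcl = calloc(1, sizeof(*pcl) + pool_pages[i] *
                                 sizeof(pcl->compressed_pages[0]));
                    if (pcl)
                            pcl->pclusterpages = nrpages;
                    return pcl;
            }
            return NULL;    /* larger than any pool: rejected */
    }
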
 
@@ -502,9 +577,8 @@ out:
        z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
                                  clt->cl->pagevec, clt->cl->vcnt);
 
-       clt->compressedpages = clt->pcl->compressed_pages;
-       if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
-               clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
+       /* since file-backed online pages are traversed in reverse order */
+       clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages;
        return 0;
 }
 
@@ -517,9 +591,8 @@ static void z_erofs_rcu_callback(struct rcu_head *head)
        struct z_erofs_collection *const cl =
                container_of(head, struct z_erofs_collection, rcu);
 
-       kmem_cache_free(pcluster_cachep,
-                       container_of(cl, struct z_erofs_pcluster,
-                                    primary_collection));
+       z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster,
+                                          primary_collection));
 }
 
 void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
@@ -706,9 +779,12 @@ err_out:
        goto out;
 }
 
+static void z_erofs_decompressqueue_work(struct work_struct *work);
 static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
                                       bool sync, int bios)
 {
+       struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
+
        /* wake up the caller thread for sync decompression */
        if (sync) {
                unsigned long flags;
@@ -720,8 +796,15 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
                return;
        }
 
-       if (!atomic_add_return(bios, &io->pending_bios))
+       if (atomic_add_return(bios, &io->pending_bios))
+               return;
+       /* Use workqueue and sync decompression for atomic contexts only */
+       if (in_atomic() || irqs_disabled()) {
                queue_work(z_erofs_workqueue, &io->u.work);
+               sbi->ctx.readahead_sync_decompress = true;
+               return;
+       }
+       z_erofs_decompressqueue_work(&io->u.work);
 }
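
The kickoff now makes a three-way choice instead of unconditionally queueing work. A compact sketch of the policy (in_atomic_ctx stands in for in_atomic() || irqs_disabled(), which has no userspace equivalent):

    #include <stdbool.h>

    enum kickoff { KICK_NONE, KICK_WORKQUEUE, KICK_INLINE };

    static enum kickoff pick_kickoff(int pending_after_add, bool in_atomic_ctx)
    {
            if (pending_after_add)
                    return KICK_NONE;       /* bios still in flight, wait */
            if (in_atomic_ctx)
                    return KICK_WORKQUEUE;  /* cannot decompress here, defer */
            return KICK_INLINE;             /* decompress in the current context */
    }
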
 
 static bool z_erofs_page_is_invalidated(struct page *page)
@@ -761,9 +844,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
                                       struct list_head *pagepool)
 {
        struct erofs_sb_info *const sbi = EROFS_SB(sb);
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
        struct z_erofs_pagevec_ctor ctor;
-       unsigned int i, outputsize, llen, nr_pages;
+       unsigned int i, inputsize, outputsize, llen, nr_pages;
        struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
        struct page **pages, **compressed_pages, *page;
 
@@ -843,7 +925,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
        overlapped = false;
        compressed_pages = pcl->compressed_pages;
 
-       for (i = 0; i < clusterpages; ++i) {
+       for (i = 0; i < pcl->pclusterpages; ++i) {
                unsigned int pagenr;
 
                page = compressed_pages[i];
@@ -896,12 +978,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
                partial = true;
        }
 
+       inputsize = pcl->pclusterpages * PAGE_SIZE;
        err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
                                        .sb = sb,
                                        .in = compressed_pages,
                                        .out = pages,
                                        .pageofs_out = cl->pageofs,
-                                       .inputsize = PAGE_SIZE,
+                                       .inputsize = inputsize,
                                        .outputsize = outputsize,
                                        .alg = pcl->algorithmformat,
                                        .inplace_io = overlapped,
@@ -909,8 +992,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
                                 }, pagepool);
 
 out:
-       /* must handle all compressed pages before endding pages */
-       for (i = 0; i < clusterpages; ++i) {
+       /* must handle all compressed pages before ending pages */
+       for (i = 0; i < pcl->pclusterpages; ++i) {
                page = compressed_pages[i];
 
                if (erofs_page_is_managed(sbi, page))
@@ -1213,7 +1296,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
                pcl = container_of(owned_head, struct z_erofs_pcluster, next);
 
                cur = pcl->obj.index;
-               end = cur + BIT(pcl->clusterbits);
+               end = cur + pcl->pclusterpages;
 
                /* close the main owned chain at first */
                owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
@@ -1333,7 +1416,8 @@ static void z_erofs_readahead(struct readahead_control *rac)
        struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
 
        unsigned int nr_pages = readahead_count(rac);
-       bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages);
+       bool sync = (sbi->ctx.readahead_sync_decompress &&
+                       nr_pages <= sbi->ctx.max_sync_decompress_pages);
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
        struct page *page, *head = NULL;
        LIST_HEAD(pagepool);
index b503b35..942ee69 100644
@@ -10,6 +10,7 @@
 #include "internal.h"
 #include "zpvec.h"
 
+#define Z_EROFS_PCLUSTER_MAX_PAGES     (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
 #define Z_EROFS_NR_INLINE_PAGEVECS      3
 
 /*
@@ -59,16 +60,17 @@ struct z_erofs_pcluster {
        /* A: point to next chained pcluster or TAILs */
        z_erofs_next_pcluster_t next;
 
-       /* A: compressed pages (including multi-usage pages) */
-       struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
-
        /* A: lower limit of decompressed length and if full length or not */
        unsigned int length;
 
+       /* I: physical cluster size in pages */
+       unsigned short pclusterpages;
+
        /* I: compression algorithm format */
        unsigned char algorithmformat;
-       /* I: bit shift of physical cluster size */
-       unsigned char clusterbits;
+
+       /* A: compressed pages (can be cached or inplaced pages) */
+       struct page *compressed_pages[];
 };
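
Turning compressed_pages[] into a flexible array member makes the allocation size a function of pclusterpages. Assuming Z_EROFS_PCLUSTER_MAX_SIZE is 1MiB (as defined elsewhere in the series) and 4KiB pages, Z_EROFS_PCLUSTER_MAX_PAGES comes out to 256; a small sizing example:

    #include <stdio.h>

    struct pcluster_model {
            unsigned short pclusterpages;
            void *compressed_pages[];       /* flexible array member */
    };

    int main(void)
    {
            unsigned int nrpages = 4;       /* e.g. a 16KiB pcluster on 4KiB pages */
            size_t bytes = sizeof(struct pcluster_model) +
                           nrpages * sizeof(void *);

            printf("per-pcluster allocation: %zu bytes\n", bytes);
            return 0;
    }
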
 
 #define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
@@ -82,8 +84,6 @@ struct z_erofs_pcluster {
 
 #define Z_EROFS_PCLUSTER_NIL            (NULL)
 
-#define Z_EROFS_WORKGROUP_SIZE  sizeof(struct z_erofs_pcluster)
-
 struct z_erofs_decompressqueue {
        struct super_block *sb;
        atomic_t pending_bios;
index 14d2de3..e62d813 100644
 int z_erofs_fill_inode(struct inode *inode)
 {
        struct erofs_inode *const vi = EROFS_I(inode);
+       struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
 
-       if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
+       if (!erofs_sb_has_big_pcluster(sbi) &&
+           vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
                vi->z_advise = 0;
                vi->z_algorithmtype[0] = 0;
                vi->z_algorithmtype[1] = 0;
                vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
-               vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits;
-               vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits;
                set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
        }
-
        inode->i_mapping->a_ops = &z_erofs_aops;
        return 0;
 }
@@ -52,7 +51,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
        if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
                goto out_unlock;
 
-       DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
+       DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+                 vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
 
        pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
                    vi->xattr_isize, 8);
@@ -77,18 +77,22 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
        }
 
        vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
-       vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits +
-                                       ((h->h_clusterbits >> 3) & 3);
-
-       if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) {
-               erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel",
-                         vi->z_physical_clusterbits[0], vi->nid);
-               err = -EOPNOTSUPP;
+       if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+           vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+                           Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+               erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
+                         vi->nid);
+               err = -EFSCORRUPTED;
+               goto unmap_done;
+       }
+       if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
+           !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+           !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+               erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
+                         vi->nid);
+               err = -EFSCORRUPTED;
                goto unmap_done;
        }
-
-       vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
-                                       ((h->h_clusterbits >> 5) & 7);
        /* paired with smp_mb() at the beginning of the function */
        smp_mb();
        set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
@@ -111,7 +115,7 @@ struct z_erofs_maprecorder {
        u8  type;
        u16 clusterofs;
        u16 delta[2];
-       erofs_blk_t pblk;
+       erofs_blk_t pblk, compressedlcs;
 };
 
 static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
@@ -174,6 +178,15 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
        case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
                m->clusterofs = 1 << vi->z_logical_clusterbits;
                m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
+               if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+                       if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+                               DBG_BUGON(1);
+                               return -EFSCORRUPTED;
+                       }
+                       m->compressedlcs = m->delta[0] &
+                               ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+                       m->delta[0] = 1;
+               }
                m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
                break;
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
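
The new branch implements the CBLKCNT convention: in a NONHEAD lcluster, a set high bit turns delta[0] into the pcluster's size in lclusters rather than a lookback distance. A decode sketch (D0_CBLKCNT is assumed to mirror Z_EROFS_VLE_DI_D0_CBLKCNT as bit 11):

    #include <stdint.h>

    #define D0_CBLKCNT      (1u << 11)      /* assumed flag bit in delta[0] */

    /* returns the lookback distance and, when flagged, the pcluster size */
    static uint16_t decode_delta0(uint16_t d0, unsigned int *compressedlcs)
    {
            if (d0 & D0_CBLKCNT) {
                    *compressedlcs = d0 & ~D0_CBLKCNT;
                    return 1;       /* acts as a distance-1 NONHEAD entry */
            }
            return d0;              /* ordinary lookback distance */
    }
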
@@ -210,6 +223,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
        unsigned int vcnt, base, lo, encodebits, nblk;
        int i;
        u8 *in, type;
+       bool big_pcluster;
 
        if (1 << amortizedshift == 4)
                vcnt = 2;
@@ -218,6 +232,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
        else
                return -EOPNOTSUPP;
 
+       big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
        encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
        base = round_down(eofs, vcnt << amortizedshift);
        in = m->kaddr + base;
@@ -229,7 +244,15 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
        m->type = type;
        if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
                m->clusterofs = 1 << lclusterbits;
-               if (i + 1 != vcnt) {
+               if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+                       if (!big_pcluster) {
+                               DBG_BUGON(1);
+                               return -EFSCORRUPTED;
+                       }
+                       m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+                       m->delta[0] = 1;
+                       return 0;
+               } else if (i + 1 != (int)vcnt) {
                        m->delta[0] = lo;
                        return 0;
                }
@@ -242,22 +265,48 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
                                          in, encodebits * (i - 1), &type);
                if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
                        lo = 0;
+               else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT)
+                       lo = 1;
                m->delta[0] = lo + 1;
                return 0;
        }
        m->clusterofs = lo;
        m->delta[0] = 0;
        /* figure out blkaddr (pblk) for HEAD lclusters */
-       nblk = 1;
-       while (i > 0) {
-               --i;
-               lo = decode_compactedbits(lclusterbits, lomask,
-                                         in, encodebits * i, &type);
-               if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
-                       i -= lo;
-
-               if (i >= 0)
+       if (!big_pcluster) {
+               nblk = 1;
+               while (i > 0) {
+                       --i;
+                       lo = decode_compactedbits(lclusterbits, lomask,
+                                                 in, encodebits * i, &type);
+                       if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+                               i -= lo;
+
+                       if (i >= 0)
+                               ++nblk;
+               }
+       } else {
+               nblk = 0;
+               while (i > 0) {
+                       --i;
+                       lo = decode_compactedbits(lclusterbits, lomask,
+                                                 in, encodebits * i, &type);
+                       if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+                               if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+                                       --i;
+                                       nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+                                       continue;
+                               }
+                               /* big pclusters shouldn't have a plain d0 == 1 */
+                               if (lo <= 1) {
+                                       DBG_BUGON(1);
+                                       return -EFSCORRUPTED;
+                               }
+                               i -= lo - 2;
+                               continue;
+                       }
                        ++nblk;
+               }
        }
        in += (vcnt << amortizedshift) - sizeof(__le32);
        m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
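
For reference, a toy model of the big-pcluster branch of this walk, over pre-decoded (type, lo) pairs: each CBLKCNT entry contributes its stored block count, each plain HEAD contributes one block, and NONHEAD runs are jumped over (D0_CBLKCNT as in the sketch above):

    #define D0_CBLKCNT      (1u << 11)      /* assumed CBLKCNT flag bit */

    enum { T_HEAD, T_NONHEAD };
    struct lent { int type; unsigned int lo; };

    static unsigned int count_pblks(const struct lent *e, int i)
    {
            unsigned int nblk = 0;

            while (i > 0) {
                    --i;
                    if (e[i].type == T_NONHEAD) {
                            if (e[i].lo & D0_CBLKCNT) {
                                    nblk += e[i].lo & ~D0_CBLKCNT;
                                    --i;            /* skip the matching HEAD */
                                    continue;
                            }
                            /* lo <= 1 is invalid here (-EFSCORRUPTED in-kernel) */
                            if (e[i].lo <= 1)
                                    return 0;
                            i -= (int)e[i].lo - 2;  /* jump over the whole run */
                            continue;
                    }
                    ++nblk;                         /* plain one-block HEAD */
            }
            return nblk;
    }
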
@@ -381,6 +430,58 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
        return 0;
 }
 
+static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
+                                           unsigned int initial_lcn)
+{
+       struct erofs_inode *const vi = EROFS_I(m->inode);
+       struct erofs_map_blocks *const map = m->map;
+       const unsigned int lclusterbits = vi->z_logical_clusterbits;
+       unsigned long lcn;
+       int err;
+
+       DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
+                 m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
+       if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
+           !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+               map->m_plen = 1 << lclusterbits;
+               return 0;
+       }
+
+       lcn = m->lcn + 1;
+       if (m->compressedlcs)
+               goto out;
+       if (lcn == initial_lcn)
+               goto err_bonus_cblkcnt;
+
+       err = z_erofs_load_cluster_from_disk(m, lcn);
+       if (err)
+               return err;
+
+       switch (m->type) {
+       case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+               if (m->delta[0] != 1)
+                       goto err_bonus_cblkcnt;
+               if (m->compressedlcs)
+                       break;
+               fallthrough;
+       default:
+               erofs_err(m->inode->i_sb,
+                         "cannot find CBLKCNT @ lcn %lu of nid %llu",
+                         lcn, vi->nid);
+               DBG_BUGON(1);
+               return -EFSCORRUPTED;
+       }
+out:
+       map->m_plen = m->compressedlcs << lclusterbits;
+       return 0;
+err_bonus_cblkcnt:
+       erofs_err(m->inode->i_sb,
+                 "bogus CBLKCNT @ lcn %lu of nid %llu",
+                 lcn, vi->nid);
+       DBG_BUGON(1);
+       return -EFSCORRUPTED;
+}
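
A worked instance of the final computation: with 4KiB logical clusters (lclusterbits == 12, i.e. LOG_BLOCK_SIZE on a 4KiB-block image) and a CBLKCNT of 4 lclusters, the extent's compressed length comes out to 16KiB:

    #include <assert.h>

    int main(void)
    {
            const unsigned int lclusterbits = 12;   /* 4KiB logical clusters */
            const unsigned long compressedlcs = 4;  /* from the CBLKCNT field */

            /* map->m_plen = m->compressedlcs << lclusterbits */
            assert((compressedlcs << lclusterbits) == 16384);
            return 0;
    }
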
+
 int z_erofs_map_blocks_iter(struct inode *inode,
                            struct erofs_map_blocks *map,
                            int flags)
@@ -392,6 +493,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
        };
        int err = 0;
        unsigned int lclusterbits, endoff;
+       unsigned long initial_lcn;
        unsigned long long ofs, end;
 
        trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
@@ -410,10 +512,10 @@ int z_erofs_map_blocks_iter(struct inode *inode,
 
        lclusterbits = vi->z_logical_clusterbits;
        ofs = map->m_la;
-       m.lcn = ofs >> lclusterbits;
+       initial_lcn = ofs >> lclusterbits;
        endoff = ofs & ((1 << lclusterbits) - 1);
 
-       err = z_erofs_load_cluster_from_disk(&m, m.lcn);
+       err = z_erofs_load_cluster_from_disk(&m, initial_lcn);
        if (err)
                goto unmap_out;
 
@@ -443,7 +545,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
                m.delta[0] = 1;
                fallthrough;
        case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
-               /* get the correspoinding first chunk */
+               /* get the corresponding first chunk */
                err = z_erofs_extent_lookback(&m, m.delta[0]);
                if (err)
                        goto unmap_out;
@@ -457,10 +559,12 @@ int z_erofs_map_blocks_iter(struct inode *inode,
        }
 
        map->m_llen = end - map->m_la;
-       map->m_plen = 1 << lclusterbits;
        map->m_pa = blknr_to_addr(m.pblk);
        map->m_flags |= EROFS_MAP_MAPPED;
 
+       err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
+       if (err)
+               goto out;
 unmap_out:
        if (m.kaddr)
                kunmap_atomic(m.kaddr);