Merge tag 'erofs-for-5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs
author     Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 26 Apr 2021 20:28:12 +0000 (13:28 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 26 Apr 2021 20:28:12 +0000 (13:28 -0700)
Pull erofs updates from Gao Xiang:
 "In this cycle, we would like to introduce a new feature called big
  pcluster, so that EROFS can compress file data into more than one fs
  block and a different pcluster size can be selected for each
  (sub-)file by design.

  The current EROFS test results on my laptop are [1]:

    Testscript: erofs-openbenchmark
    Testdata: enwik9 (1000000000 bytes)
     ________________________________________________________________
    |  file system  |   size    | seq read | rand read | rand9m read |
    |_______________|___________|_ MiB/s __|__ MiB/s __|___ MiB/s ___|
    |___erofs_4k____|_556879872_|_ 781.4 __|__ 55.3 ___|___ 25.3  ___|
    |___erofs_16k___|_452509696_|_ 864.8 __|_ 123.2 ___|___ 20.8  ___|
    |___erofs_32k___|_415223808_|_ 899.8 __|_ 105.8 _*_|___ 16.8 ____|
    |___erofs_64k___|_393814016_|_ 906.6 __|__ 66.6 _*_|___ 11.8 ____|
    |__squashfs_8k__|_556191744_|_  64.9 __|__ 19.3 ___|____ 9.1 ____|
    |__squashfs_16k_|_502661120_|_  98.9 __|__ 38.0 ___|____ 9.8 ____|
    |__squashfs_32k_|_458784768_|_ 115.4 __|__ 71.6 _*_|___ 10.0 ____|
    |_squashfs_128k_|_398204928_|_ 257.2 __|_ 253.8 _*_|___ 10.9 ____|
    |____ext4_4k____|____()_____|_ 786.6 __|__ 28.6 ___|___ 27.8 ____|

  This has been verified, but I'd like to mark it as experimental for a
  while. It matches the erofs-utils dev branch, and I'll also release a
  new userspace version for it later.

  Apart from that, several improvements are also included, e.g.
  completing a missing case for inplace I/O, optimizing the endio
  decompression logic for non-atomic contexts and supporting an
  adjustable sliding window size. In addition to those, there are some
  cleanups as always.

  Summary:

   - avoid memory failure when applying rolling decompression

   - optimize endio decompression logic for non-atomic contexts

   - complete a missing case which can be safely selected for inplace
     I/O, thus further decreasing memory footprint

   - strictly check for unsupported on-disk inode i_format

   - support adjustable lz4 sliding window size to decrease runtime
     memory footprint

   - support on-disk compression configurations

   - support big pcluster decompression

   - several code cleanups / spelling correction"
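
As a concrete illustration of the new pcluster sizing: the on-disk
max_pclusterblks field bounds how many fs blocks one pcluster may span,
and z_erofs_load_lz4_config() (see the decompressor.c hunk below)
rejects anything above Z_EROFS_PCLUSTER_MAX_SIZE. A minimal userspace
sketch of those bounds, assuming 4KiB fs blocks:

    #include <stdio.h>

    #define EROFS_BLKSIZ              4096u  /* assumption: 4KiB fs blocks */
    #define Z_EROFS_PCLUSTER_MAX_SIZE (1024u * 1024u)

    /* mirrors the max_pclusterblks checks in z_erofs_load_lz4_config() */
    static int check_pclusterblks(unsigned int blks)
    {
            if (!blks)
                    return 1;   /* reserved on-disk value: treat as 1 block */
            if (blks > Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ)
                    return -1;  /* invalid: pcluster would exceed 1MiB (256 blocks) */
            return blks;        /* >= 2 means big pcluster is in use */
    }

    int main(void)
    {
            printf("%d %d %d\n", check_pclusterblks(0),
                   check_pclusterblks(4), check_pclusterblks(512));
            return 0;   /* prints "1 4 -1" */
    }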

* tag 'erofs-for-5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs: (21 commits)
  erofs: enable big pcluster feature
  erofs: support decompress big pcluster for lz4 backend
  erofs: support parsing big pcluster compact indexes
  erofs: support parsing big pcluster compress indexes
  erofs: adjust per-CPU buffers according to max_pclusterblks
  erofs: add big physical cluster definition
  erofs: fix up inplace I/O pointer for big pcluster
  erofs: introduce physical cluster slab pools
  erofs: introduce multipage per-CPU buffers
  erofs: reserve physical_clusterbits[]
  erofs: Clean up spelling mistakes found in fs/erofs
  erofs: add on-disk compression configurations
  erofs: introduce on-disk lz4 fs configurations
  erofs: support adjusted lz4 history window size
  erofs: introduce erofs_sb_has_xxx() helpers
  erofs: add unsupported inode i_format check
  erofs: don't use erofs_map_blocks() any more
  erofs: complete a missing case for inplace I/O
  erofs: use sync decompression for atomic contexts only
  erofs: use workqueue decompression for atomic contexts only
  ...

13 files changed:
fs/erofs/Kconfig
fs/erofs/Makefile
fs/erofs/data.c
fs/erofs/decompressor.c
fs/erofs/erofs_fs.h
fs/erofs/inode.c
fs/erofs/internal.h
fs/erofs/pcpubuf.c [new file with mode: 0644]
fs/erofs/super.c
fs/erofs/utils.c
fs/erofs/zdata.c
fs/erofs/zdata.h
fs/erofs/zmap.c

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 74b0aaa..858b333 100644
@@ -76,17 +76,3 @@ config EROFS_FS_ZIP
 
          If you don't want to enable compression feature, say N.
 
-config EROFS_FS_CLUSTER_PAGE_LIMIT
-       int "EROFS Cluster Pages Hard Limit"
-       depends on EROFS_FS_ZIP
-       range 1 256
-       default "1"
-       help
-         Indicates maximum # of pages of a compressed
-         physical cluster.
-
-         For example, if files in a image were compressed
-         into 8k-unit, hard limit should not be configured
-         less than 2. Otherwise, the image will be refused
-         to mount on this kernel.
-
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index af15953..1f9aced 100644
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 1249e74..ebac756 100644
@@ -109,21 +109,6 @@ err_out:
        return err;
 }
 
-int erofs_map_blocks(struct inode *inode,
-                    struct erofs_map_blocks *map, int flags)
-{
-       if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
-               int err = z_erofs_map_blocks_iter(inode, map, flags);
-
-               if (map->mpage) {
-                       put_page(map->mpage);
-                       map->mpage = NULL;
-               }
-               return err;
-       }
-       return erofs_map_blocks_flatmode(inode, map, flags);
-}
-
 static inline struct bio *erofs_read_raw_page(struct bio *bio,
                                              struct address_space *mapping,
                                              struct page *page,
@@ -159,7 +144,7 @@ submit_bio_retry:
                erofs_blk_t blknr;
                unsigned int blkoff;
 
-               err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+               err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW);
                if (err)
                        goto err_out;
 
@@ -318,7 +303,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
                        return 0;
        }
 
-       if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW))
+       if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW))
                return erofs_blknr(map.m_pa);
 
        return 0;
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 1cb1ffd..88e33ad 100644
@@ -28,6 +28,42 @@ struct z_erofs_decompressor {
        char *name;
 };
 
+int z_erofs_load_lz4_config(struct super_block *sb,
+                           struct erofs_super_block *dsb,
+                           struct z_erofs_lz4_cfgs *lz4, int size)
+{
+       struct erofs_sb_info *sbi = EROFS_SB(sb);
+       u16 distance;
+
+       if (lz4) {
+               if (size < sizeof(struct z_erofs_lz4_cfgs)) {
+                       erofs_err(sb, "invalid lz4 cfgs, size=%u", size);
+                       return -EINVAL;
+               }
+               distance = le16_to_cpu(lz4->max_distance);
+
+               sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks);
+               if (!sbi->lz4.max_pclusterblks) {
+                       sbi->lz4.max_pclusterblks = 1;  /* reserved case */
+               } else if (sbi->lz4.max_pclusterblks >
+                          Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) {
+                       erofs_err(sb, "too large lz4 pclusterblks %u",
+                                 sbi->lz4.max_pclusterblks);
+                       return -EINVAL;
+               } else if (sbi->lz4.max_pclusterblks >= 2) {
+                       erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!");
+               }
+       } else {
+               distance = le16_to_cpu(dsb->u1.lz4_max_distance);
+               sbi->lz4.max_pclusterblks = 1;
+       }
+
+       sbi->lz4.max_distance_pages = distance ?
+                                       DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
+                                       LZ4_MAX_DISTANCE_PAGES;
+       return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+}
+
 static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
                                         struct list_head *pagepool)
 {
@@ -36,6 +72,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
        struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
        unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
                                           BITS_PER_LONG)] = { 0 };
+       unsigned int lz4_max_distance_pages =
+                               EROFS_SB(rq->sb)->lz4.max_distance_pages;
        void *kaddr = NULL;
        unsigned int i, j, top;
 
@@ -44,14 +82,14 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
                struct page *const page = rq->out[i];
                struct page *victim;
 
-               if (j >= LZ4_MAX_DISTANCE_PAGES)
+               if (j >= lz4_max_distance_pages)
                        j = 0;
 
                /* 'valid' bounced can only be tested after a complete round */
                if (test_bit(j, bounced)) {
-                       DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
-                       DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
-                       availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
+                       DBG_BUGON(i < lz4_max_distance_pages);
+                       DBG_BUGON(top >= lz4_max_distance_pages);
+                       availables[top++] = rq->out[i - lz4_max_distance_pages];
                }
 
                if (page) {
@@ -73,9 +111,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
                        victim = availables[--top];
                        get_page(victim);
                } else {
-                       victim = erofs_allocpage(pagepool, GFP_KERNEL);
-                       if (!victim)
-                               return -ENOMEM;
+                       victim = erofs_allocpage(pagepool,
+                                                GFP_KERNEL | __GFP_NOFAIL);
                        set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
                }
                rq->out[i] = victim;
@@ -83,96 +120,123 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
        return kaddr ? 1 : 0;
 }
 
-static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
-                                      u8 *src, unsigned int pageofs_in)
+static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+                       void *inpage, unsigned int *inputmargin, int *maptype,
+                       bool support_0padding)
 {
-       /*
-        * if in-place decompression is ongoing, those decompressed
-        * pages should be copied in order to avoid being overlapped.
-        */
-       struct page **in = rq->in;
-       u8 *const tmp = erofs_get_pcpubuf(0);
-       u8 *tmpp = tmp;
-       unsigned int inlen = rq->inputsize - pageofs_in;
-       unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
-
-       while (tmpp < tmp + inlen) {
-               if (!src)
-                       src = kmap_atomic(*in);
-               memcpy(tmpp, src + pageofs_in, count);
-               kunmap_atomic(src);
-               src = NULL;
-               tmpp += count;
-               pageofs_in = 0;
-               count = PAGE_SIZE;
+       unsigned int nrpages_in, nrpages_out;
+       unsigned int ofull, oend, inputsize, total, i, j;
+       struct page **in;
+       void *src, *tmp;
+
+       inputsize = rq->inputsize;
+       nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT;
+       oend = rq->pageofs_out + rq->outputsize;
+       ofull = PAGE_ALIGN(oend);
+       nrpages_out = ofull >> PAGE_SHIFT;
+
+       if (rq->inplace_io) {
+               if (rq->partial_decoding || !support_0padding ||
+                   ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize))
+                       goto docopy;
+
+               for (i = 0; i < nrpages_in; ++i) {
+                       DBG_BUGON(rq->in[i] == NULL);
+                       for (j = 0; j < nrpages_out - nrpages_in + i; ++j)
+                               if (rq->out[j] == rq->in[i])
+                                       goto docopy;
+               }
+       }
+
+       if (nrpages_in <= 1) {
+               *maptype = 0;
+               return inpage;
+       }
+       kunmap_atomic(inpage);
+       might_sleep();
+       src = erofs_vm_map_ram(rq->in, nrpages_in);
+       if (!src)
+               return ERR_PTR(-ENOMEM);
+       *maptype = 1;
+       return src;
+
+docopy:
+       /* Or copy compressed data which can be overlapped to per-CPU buffer */
+       in = rq->in;
+       src = erofs_get_pcpubuf(nrpages_in);
+       if (!src) {
+               DBG_BUGON(1);
+               kunmap_atomic(inpage);
+               return ERR_PTR(-EFAULT);
+       }
+
+       tmp = src;
+       total = rq->inputsize;
+       while (total) {
+               unsigned int page_copycnt =
+                       min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
+
+               if (!inpage)
+                       inpage = kmap_atomic(*in);
+               memcpy(tmp, inpage + *inputmargin, page_copycnt);
+               kunmap_atomic(inpage);
+               inpage = NULL;
+               tmp += page_copycnt;
+               total -= page_copycnt;
                ++in;
+               *inputmargin = 0;
        }
-       return tmp;
+       *maptype = 2;
+       return src;
 }
 
 static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
 {
-       unsigned int inputmargin, inlen;
-       u8 *src;
-       bool copied, support_0padding;
-       int ret;
-
-       if (rq->inputsize > PAGE_SIZE)
-               return -EOPNOTSUPP;
+       unsigned int inputmargin;
+       u8 *headpage, *src;
+       bool support_0padding;
+       int ret, maptype;
 
-       src = kmap_atomic(*rq->in);
+       DBG_BUGON(*rq->in == NULL);
+       headpage = kmap_atomic(*rq->in);
        inputmargin = 0;
        support_0padding = false;
 
        /* decompression inplace is only safe when 0padding is enabled */
-       if (EROFS_SB(rq->sb)->feature_incompat &
-           EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) {
+       if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) {
                support_0padding = true;
 
-               while (!src[inputmargin & ~PAGE_MASK])
+               while (!headpage[inputmargin & ~PAGE_MASK])
                        if (!(++inputmargin & ~PAGE_MASK))
                                break;
 
                if (inputmargin >= rq->inputsize) {
-                       kunmap_atomic(src);
+                       kunmap_atomic(headpage);
                        return -EIO;
                }
        }
 
-       copied = false;
-       inlen = rq->inputsize - inputmargin;
-       if (rq->inplace_io) {
-               const uint oend = (rq->pageofs_out +
-                                  rq->outputsize) & ~PAGE_MASK;
-               const uint nr = PAGE_ALIGN(rq->pageofs_out +
-                                          rq->outputsize) >> PAGE_SHIFT;
-
-               if (rq->partial_decoding || !support_0padding ||
-                   rq->out[nr - 1] != rq->in[0] ||
-                   rq->inputsize - oend <
-                     LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
-                       src = generic_copy_inplace_data(rq, src, inputmargin);
-                       inputmargin = 0;
-                       copied = true;
-               }
-       }
+       rq->inputsize -= inputmargin;
+       src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
+                                       support_0padding);
+       if (IS_ERR(src))
+               return PTR_ERR(src);
 
        /* legacy format could compress extra data in a pcluster. */
        if (rq->partial_decoding || !support_0padding)
                ret = LZ4_decompress_safe_partial(src + inputmargin, out,
-                                                 inlen, rq->outputsize,
-                                                 rq->outputsize);
+                               rq->inputsize, rq->outputsize, rq->outputsize);
        else
                ret = LZ4_decompress_safe(src + inputmargin, out,
-                                         inlen, rq->outputsize);
+                                         rq->inputsize, rq->outputsize);
 
        if (ret != rq->outputsize) {
                erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
-                         ret, inlen, inputmargin, rq->outputsize);
+                         ret, rq->inputsize, inputmargin, rq->outputsize);
 
                WARN_ON(1);
                print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
-                              16, 1, src + inputmargin, inlen, true);
+                              16, 1, src + inputmargin, rq->inputsize, true);
                print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
                               16, 1, out, rq->outputsize, true);
 
@@ -181,10 +245,16 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
                ret = -EIO;
        }
 
-       if (copied)
-               erofs_put_pcpubuf(src);
-       else
+       if (maptype == 0) {
                kunmap_atomic(src);
+       } else if (maptype == 1) {
+               vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT);
+       } else if (maptype == 2) {
+               erofs_put_pcpubuf(src);
+       } else {
+               DBG_BUGON(1);
+               return -EFAULT;
+       }
        return ret;
 }
 
@@ -234,57 +304,51 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
        const struct z_erofs_decompressor *alg = decompressors + rq->alg;
        unsigned int dst_maptype;
        void *dst;
-       int ret, i;
+       int ret;
 
-       if (nrpages_out == 1 && !rq->inplace_io) {
-               DBG_BUGON(!*rq->out);
-               dst = kmap_atomic(*rq->out);
-               dst_maptype = 0;
-               goto dstmap_out;
-       }
+       /* two optimized fast paths only for non bigpcluster cases yet */
+       if (rq->inputsize <= PAGE_SIZE) {
+               if (nrpages_out == 1 && !rq->inplace_io) {
+                       DBG_BUGON(!*rq->out);
+                       dst = kmap_atomic(*rq->out);
+                       dst_maptype = 0;
+                       goto dstmap_out;
+               }
 
-       /*
-        * For the case of small output size (especially much less
-        * than PAGE_SIZE), memcpy the decompressed data rather than
-        * compressed data is preferred.
-        */
-       if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
-               dst = erofs_get_pcpubuf(0);
-               if (IS_ERR(dst))
-                       return PTR_ERR(dst);
-
-               rq->inplace_io = false;
-               ret = alg->decompress(rq, dst);
-               if (!ret)
-                       copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
-                                         rq->outputsize);
-
-               erofs_put_pcpubuf(dst);
-               return ret;
+               /*
+                * For the case of small output size (especially much less
+                * than PAGE_SIZE), memcpy the decompressed data rather than
+                * compressed data is preferred.
+                */
+               if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
+                       dst = erofs_get_pcpubuf(1);
+                       if (IS_ERR(dst))
+                               return PTR_ERR(dst);
+
+                       rq->inplace_io = false;
+                       ret = alg->decompress(rq, dst);
+                       if (!ret)
+                               copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
+                                                 rq->outputsize);
+
+                       erofs_put_pcpubuf(dst);
+                       return ret;
+               }
        }
 
+       /* general decoding path which can be used for all cases */
        ret = alg->prepare_destpages(rq, pagepool);
-       if (ret < 0) {
+       if (ret < 0)
                return ret;
-       } else if (ret) {
+       if (ret) {
                dst = page_address(*rq->out);
                dst_maptype = 1;
                goto dstmap_out;
        }
 
-       i = 0;
-       while (1) {
-               dst = vm_map_ram(rq->out, nrpages_out, -1);
-
-               /* retry two more times (totally 3 times) */
-               if (dst || ++i >= 3)
-                       break;
-               vm_unmap_aliases();
-       }
-
+       dst = erofs_vm_map_ram(rq->out, nrpages_out);
        if (!dst)
                return -ENOMEM;
-
        dst_maptype = 2;
 
 dstmap_out:
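
The rewrite above also centralizes when inplace I/O is actually safe:
partial decoding, missing 0padding, an input page reused as an output
page, or too little tail margin all force the per-CPU-buffer copy
fallback (maptype 2). A userspace sketch of the margin arithmetic,
assuming 4KiB pages and the lz4 margin formula
LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) == (srcsize >> 8) + 32:

    #include <stdio.h>

    #define PAGE_SIZE 4096u /* assumption: 4KiB pages */
    #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32u)

    /* mirrors the rq->inplace_io check in z_erofs_handle_inplace_io():
     * the page-aligned output must keep enough tail slack so in-place
     * decompression never overwrites unread compressed bytes */
    static int inplace_is_safe(unsigned int pageofs_out,
                               unsigned int outputsize, unsigned int inputsize)
    {
            unsigned int oend = pageofs_out + outputsize;
            unsigned int ofull = (oend + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);

            return ofull - oend >= LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize);
    }

    int main(void)
    {
            /* 20000B of output in 5 pages leaves 480B of slack, above the
             * (8192 >> 8) + 32 = 64B margin -> prints 1 (safe) */
            printf("%d\n", inplace_is_safe(0, 20000, 8192));
            return 0;
    }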
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 9ad1615..8739d3a 100644
  * be incompatible with this kernel version.
  */
 #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING    0x00000001
-#define EROFS_ALL_FEATURE_INCOMPAT             EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
+#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS      0x00000002
+#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER    0x00000002
+#define EROFS_ALL_FEATURE_INCOMPAT             \
+       (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
+        EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
+        EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER)
 
-/* 128-byte erofs on-disk super block */
+#define EROFS_SB_EXTSLOT_SIZE  16
+
+/* erofs on-disk super block (currently 128 bytes) */
 struct erofs_super_block {
        __le32 magic;           /* file system magic number */
        __le32 checksum;        /* crc32c(super_block) */
        __le32 feature_compat;
        __u8 blkszbits;         /* support block_size == PAGE_SIZE only */
-       __u8 reserved;
+       __u8 sb_extslots;       /* superblock size = 128 + sb_extslots * 16 */
 
        __le16 root_nid;        /* nid of root directory */
        __le64 inos;            /* total valid ino # (== f_files - f_favail) */
@@ -39,7 +46,13 @@ struct erofs_super_block {
        __u8 uuid[16];          /* 128-bit uuid for volume */
        __u8 volume_name[16];   /* volume name */
        __le32 feature_incompat;
-       __u8 reserved2[44];
+       union {
+               /* bitmap for available compression algorithms */
+               __le16 available_compr_algs;
+               /* customized sliding window size instead of 64k by default */
+               __le16 lz4_max_distance;
+       } __packed u1;
+       __u8 reserved2[42];
 };
 
 /*
@@ -75,6 +88,9 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
 #define EROFS_I_VERSION_BIT             0
 #define EROFS_I_DATALAYOUT_BIT          1
 
+#define EROFS_I_ALL    \
+       ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1)
+
 /* 32-byte reduced form of an ondisk inode */
 struct erofs_inode_compact {
        __le16 i_format;        /* inode format hints */
@@ -189,20 +205,33 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
                                 e->e_name_len + le16_to_cpu(e->e_value_size));
 }
 
+/* maximum supported size of a physical compression cluster */
+#define Z_EROFS_PCLUSTER_MAX_SIZE      (1024 * 1024)
+
 /* available compression algorithm types (for h_algorithmtype) */
 enum {
        Z_EROFS_COMPRESSION_LZ4 = 0,
        Z_EROFS_COMPRESSION_MAX
 };
+#define Z_EROFS_ALL_COMPR_ALGS         (1 << (Z_EROFS_COMPRESSION_MAX - 1))
+
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lz4_cfgs {
+       __le16 max_distance;
+       __le16 max_pclusterblks;
+       u8 reserved[10];
+} __packed;
 
 /*
  * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
  *  e.g. for 4k logical cluster size,      4B        if compacted 2B is off;
  *                                  (4B) + 2B + (4B) if compacted 2B is on.
+ * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
+ * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
  */
-#define Z_EROFS_ADVISE_COMPACTED_2B_BIT         0
-
-#define Z_EROFS_ADVISE_COMPACTED_2B     (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
+#define Z_EROFS_ADVISE_COMPACTED_2B            0x0001
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_1          0x0002
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_2          0x0004
 
 struct z_erofs_map_header {
        __le32  h_reserved1;
@@ -214,9 +243,7 @@ struct z_erofs_map_header {
        __u8    h_algorithmtype;
        /*
         * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
-        * bit 3-4 : (physical - logical) cluster bits of head 1:
-        *       For example, if logical clustersize = 4096, 1 for 8192.
-        * bit 5-7 : (physical - logical) cluster bits of head 2.
+        * bit 3-7 : reserved.
         */
        __u8    h_clusterbits;
 };
@@ -259,6 +286,13 @@ enum {
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS        2
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT         0
 
+/*
+ * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
+ * compressed block count of a compressed extent (in logical clusters, aka.
+ * block count of a pcluster).
+ */
+#define Z_EROFS_VLE_DI_D0_CBLKCNT              (1 << 11)
+
 struct z_erofs_vle_decompressed_index {
        __le16 di_advise;
        /* where to decompress in the head cluster */
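
The z_erofs_lz4_cfgs record above is what lets an image shrink the lz4
sliding window: max_distance feeds the rolling decompressor's page
budget, computed in z_erofs_load_lz4_config(). A userspace sketch of
that conversion, assuming 4KiB pages (so the 64KiB lz4 default maps to
17 pages):

    #include <stdio.h>

    #define PAGE_SIZE 4096u            /* assumption: 4KiB pages */
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
    #define LZ4_MAX_DISTANCE_PAGES 17u /* DIV_ROUND_UP(65535, PAGE_SIZE) + 1 */

    /* mirrors the distance -> pages conversion in z_erofs_load_lz4_config() */
    static unsigned int max_distance_pages(unsigned int distance)
    {
            return distance ? DIV_ROUND_UP(distance, PAGE_SIZE) + 1
                            : LZ4_MAX_DISTANCE_PAGES;
    }

    int main(void)
    {
            /* 0 keeps the 64KiB default (17 pages); a 4KiB window needs 2 */
            printf("%u %u\n", max_distance_pages(0), max_distance_pages(4096));
            return 0;
    }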
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 119fdce..7ed2d73 100644
@@ -44,6 +44,13 @@ static struct page *erofs_read_inode(struct inode *inode,
        dic = page_address(page) + *ofs;
        ifmt = le16_to_cpu(dic->i_format);
 
+       if (ifmt & ~EROFS_I_ALL) {
+               erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+                         ifmt, vi->nid);
+               err = -EOPNOTSUPP;
+               goto err_out;
+       }
+
        vi->datalayout = erofs_inode_datalayout(ifmt);
        if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
                erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 351dae5..f92e3e3 100644
@@ -50,6 +50,8 @@ struct erofs_fs_context {
 #ifdef CONFIG_EROFS_FS_ZIP
        /* current strategy of how to use managed cache */
        unsigned char cache_strategy;
+       /* strategy of sync decompression (false - auto, true - force on) */
+       bool readahead_sync_decompress;
 
        /* threshold for decompression synchronously */
        unsigned int max_sync_decompress_pages;
@@ -57,6 +59,14 @@ struct erofs_fs_context {
        unsigned int mount_opt;
 };
 
+/* all filesystem-wide lz4 configurations */
+struct erofs_sb_lz4_info {
+       /* # of pages needed for EROFS lz4 rolling decompression */
+       u16 max_distance_pages;
+       /* maximum possible blocks for pclusters in the filesystem */
+       u16 max_pclusterblks;
+};
+
 struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
        /* list for all registered superblocks, mainly for shrinker */
@@ -67,9 +77,12 @@ struct erofs_sb_info {
        struct xarray managed_pslots;
 
        unsigned int shrinker_run_no;
+       u16 available_compr_algs;
 
        /* pseudo inode to manage cached pages */
        struct inode *managed_cache;
+
+       struct erofs_sb_lz4_info lz4;
 #endif /* CONFIG_EROFS_FS_ZIP */
        u32 blocks;
        u32 meta_blkaddr;
@@ -80,6 +93,7 @@ struct erofs_sb_info {
        /* inode slot unit size in bit shift */
        unsigned char islotbits;
 
+       u32 sb_size;                    /* total superblock size */
        u32 build_time_nsec;
        u64 build_time;
 
@@ -182,12 +196,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
        return v;
 }
 #endif /* !CONFIG_SMP */
-
-/* hard limit of pages per compressed cluster */
-#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
-#define EROFS_PCPUBUF_NR_PAGES          Z_EROFS_CLUSTER_MAX_PAGES
-#else
-#define EROFS_PCPUBUF_NR_PAGES          0
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
 /* we strictly follow PAGE_SIZE and no buffer head yet */
@@ -216,6 +224,17 @@ static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
        return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
 }
 
+#define EROFS_FEATURE_FUNCS(name, compat, feature) \
+static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
+{ \
+       return sbi->feature_##compat & EROFS_FEATURE_##feature; \
+}
+
+EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
+EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
+EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
+
 /* atomic flag definitions */
 #define EROFS_I_EA_INITED_BIT  0
 #define EROFS_I_Z_INITED_BIT   1
@@ -244,7 +263,6 @@ struct erofs_inode {
                        unsigned short z_advise;
                        unsigned char  z_algorithmtype[2];
                        unsigned char  z_logical_clusterbits;
-                       unsigned char  z_physical_clusterbits[2];
                };
 #endif /* CONFIG_EROFS_FS_ZIP */
        };
@@ -287,7 +305,7 @@ extern const struct address_space_operations erofs_raw_access_aops;
 extern const struct address_space_operations z_erofs_aops;
 
 /*
- * Logical to physical block mapping, used by erofs_map_blocks()
+ * Logical to physical block mapping
  *
  * Different with other file systems, it is used for 2 access modes:
  *
@@ -334,7 +352,7 @@ struct erofs_map_blocks {
        struct page *mpage;
 };
 
-/* Flags used by erofs_map_blocks() */
+/* Flags used by erofs_map_blocks_flatmode() */
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* zmap.c */
@@ -356,8 +374,6 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
 /* data.c */
 struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
 
-int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-
 /* inode.c */
 static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
 {
@@ -386,23 +402,30 @@ int erofs_namei(struct inode *dir, struct qstr *name,
 /* dir.c */
 extern const struct file_operations erofs_dir_fops;
 
-/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
-
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-void *erofs_get_pcpubuf(unsigned int pagenr);
-#define erofs_put_pcpubuf(buf) do { \
-       (void)&(buf);   \
-       preempt_enable();       \
-} while (0)
-#else
-static inline void *erofs_get_pcpubuf(unsigned int pagenr)
+static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count)
 {
-       return ERR_PTR(-EOPNOTSUPP);
+       int retried = 0;
+
+       while (1) {
+               void *p = vm_map_ram(pages, count, -1);
+
+               /* retry two more times (totally 3 times) */
+               if (p || ++retried >= 3)
+                       return p;
+               vm_unmap_aliases();
+       }
+       return NULL;
 }
 
-#define erofs_put_pcpubuf(buf) do {} while (0)
-#endif
+/* pcpubuf.c */
+void *erofs_get_pcpubuf(unsigned int requiredpages);
+void erofs_put_pcpubuf(void *ptr);
+int erofs_pcpubuf_growsize(unsigned int nrpages);
+void erofs_pcpubuf_init(void);
+void erofs_pcpubuf_exit(void);
+
+/* utils.c / zdata.c */
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
 #ifdef CONFIG_EROFS_FS_ZIP
 int erofs_workgroup_put(struct erofs_workgroup *grp);
@@ -421,6 +444,9 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
                                       struct erofs_workgroup *egrp);
 int erofs_try_to_free_cached_page(struct address_space *mapping,
                                  struct page *page);
+int z_erofs_load_lz4_config(struct super_block *sb,
+                           struct erofs_super_block *dsb,
+                           struct z_erofs_lz4_cfgs *lz4, int len);
 #else
 static inline void erofs_shrinker_register(struct super_block *sb) {}
 static inline void erofs_shrinker_unregister(struct super_block *sb) {}
@@ -428,6 +454,16 @@ static inline int erofs_init_shrinker(void) { return 0; }
 static inline void erofs_exit_shrinker(void) {}
 static inline int z_erofs_init_zip_subsystem(void) { return 0; }
 static inline void z_erofs_exit_zip_subsystem(void) {}
+static inline int z_erofs_load_lz4_config(struct super_block *sb,
+                                 struct erofs_super_block *dsb,
+                                 struct z_erofs_lz4_cfgs *lz4, int len)
+{
+       if (lz4 || dsb->u1.lz4_max_distance) {
+               erofs_err(sb, "lz4 algorithm isn't enabled");
+               return -EINVAL;
+       }
+       return 0;
+}
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
 #define EFSCORRUPTED    EUCLEAN         /* Filesystem is corrupted */
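
Each EROFS_FEATURE_FUNCS() line above stamps out a by-name feature
predicate; for instance, the big_pcluster invocation expands to:

    static inline bool erofs_sb_has_big_pcluster(struct erofs_sb_info *sbi)
    {
            return sbi->feature_incompat & EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER;
    }

which is what allows super.c below to write erofs_sb_has_sb_chksum(sbi)
and erofs_sb_has_compr_cfgs(sbi) instead of open-coding bitmask tests.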
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
new file mode 100644
index 0000000..6c88557
--- /dev/null
+++ b/fs/erofs/pcpubuf.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) Gao Xiang <xiang@kernel.org>
+ *
+ * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
+ * per-CPU virtual memory (in pages) in advance to store such inplace I/O
+ * data if inplace decompression is failed (due to unmet inplace margin for
+ * example).
+ */
+#include "internal.h"
+
+struct erofs_pcpubuf {
+       raw_spinlock_t lock;
+       void *ptr;
+       struct page **pages;
+       unsigned int nrpages;
+};
+
+static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
+
+void *erofs_get_pcpubuf(unsigned int requiredpages)
+       __acquires(pcb->lock)
+{
+       struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
+
+       raw_spin_lock(&pcb->lock);
+       /* check if the per-CPU buffer is too small */
+       if (requiredpages > pcb->nrpages) {
+               raw_spin_unlock(&pcb->lock);
+               put_cpu_var(erofs_pcb);
+               /* (for sparse checker) pretend pcb->lock is still taken */
+               __acquire(pcb->lock);
+               return NULL;
+       }
+       return pcb->ptr;
+}
+
+void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
+{
+       struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
+
+       DBG_BUGON(pcb->ptr != ptr);
+       raw_spin_unlock(&pcb->lock);
+       put_cpu_var(erofs_pcb);
+}
+
+/* the next step: support per-CPU page buffers hotplug */
+int erofs_pcpubuf_growsize(unsigned int nrpages)
+{
+       static DEFINE_MUTEX(pcb_resize_mutex);
+       static unsigned int pcb_nrpages;
+       LIST_HEAD(pagepool);
+       int delta, cpu, ret, i;
+
+       mutex_lock(&pcb_resize_mutex);
+       delta = nrpages - pcb_nrpages;
+       ret = 0;
+       /* avoid shrinking pcpubuf, since no idea how many fses rely on */
+       if (delta <= 0)
+               goto out;
+
+       for_each_possible_cpu(cpu) {
+               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+               struct page **pages, **oldpages;
+               void *ptr, *old_ptr;
+
+               pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
+               if (!pages) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               for (i = 0; i < nrpages; ++i) {
+                       pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
+                       if (!pages[i]) {
+                               ret = -ENOMEM;
+                               oldpages = pages;
+                               goto free_pagearray;
+                       }
+               }
+               ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
+               if (!ptr) {
+                       ret = -ENOMEM;
+                       oldpages = pages;
+                       goto free_pagearray;
+               }
+               raw_spin_lock(&pcb->lock);
+               old_ptr = pcb->ptr;
+               pcb->ptr = ptr;
+               oldpages = pcb->pages;
+               pcb->pages = pages;
+               i = pcb->nrpages;
+               pcb->nrpages = nrpages;
+               raw_spin_unlock(&pcb->lock);
+
+               if (!oldpages) {
+                       DBG_BUGON(old_ptr);
+                       continue;
+               }
+
+               if (old_ptr)
+                       vunmap(old_ptr);
+free_pagearray:
+               while (i)
+                       list_add(&oldpages[--i]->lru, &pagepool);
+               kfree(oldpages);
+               if (ret)
+                       break;
+       }
+       pcb_nrpages = nrpages;
+       put_pages_list(&pagepool);
+out:
+       mutex_unlock(&pcb_resize_mutex);
+       return ret;
+}
+
+void erofs_pcpubuf_init(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+               raw_spin_lock_init(&pcb->lock);
+       }
+}
+
+void erofs_pcpubuf_exit(void)
+{
+       int cpu, i;
+
+       for_each_possible_cpu(cpu) {
+               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+               if (pcb->ptr) {
+                       vunmap(pcb->ptr);
+                       pcb->ptr = NULL;
+               }
+               if (!pcb->pages)
+                       continue;
+
+               for (i = 0; i < pcb->nrpages; ++i)
+                       if (pcb->pages[i])
+                               put_page(pcb->pages[i]);
+               kfree(pcb->pages);
+               pcb->pages = NULL;
+       }
+}
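
One subtlety of the new per-CPU buffer API above: erofs_get_pcpubuf()
returns with pcb->lock held and preemption disabled, so callers must
stay atomic until erofs_put_pcpubuf(). A sketch of the expected calling
pattern (hypothetical caller, modelled on the docopy path in
decompressor.c):

    /* hypothetical illustration, not part of the patch */
    static int copy_through_pcpubuf(struct page **in, unsigned int nrpages)
    {
            void *buf = erofs_get_pcpubuf(nrpages);

            if (!buf)       /* buffer still too small: erofs_pcpubuf_growsize()
                             * should have reserved enough pages at mount time */
                    return -EFAULT;

            /* ...copy the compressed pages into buf and decode from it;
             * no sleeping here since the per-CPU lock is held... */

            erofs_put_pcpubuf(buf); /* unlocks, re-enables preemption */
            return 0;
    }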
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index d5a6b9b..bbf3bbd 100644
@@ -122,6 +122,136 @@ static bool check_layout_compatibility(struct super_block *sb,
        return true;
 }
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* read variable-sized metadata, offset will be aligned by 4-byte */
+static void *erofs_read_metadata(struct super_block *sb, struct page **pagep,
+                                erofs_off_t *offset, int *lengthp)
+{
+       struct page *page = *pagep;
+       u8 *buffer, *ptr;
+       int len, i, cnt;
+       erofs_blk_t blk;
+
+       *offset = round_up(*offset, 4);
+       blk = erofs_blknr(*offset);
+
+       if (!page || page->index != blk) {
+               if (page) {
+                       unlock_page(page);
+                       put_page(page);
+               }
+               page = erofs_get_meta_page(sb, blk);
+               if (IS_ERR(page))
+                       goto err_nullpage;
+       }
+
+       ptr = kmap(page);
+       len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]);
+       if (!len)
+               len = U16_MAX + 1;
+       buffer = kmalloc(len, GFP_KERNEL);
+       if (!buffer) {
+               buffer = ERR_PTR(-ENOMEM);
+               goto out;
+       }
+       *offset += sizeof(__le16);
+       *lengthp = len;
+
+       for (i = 0; i < len; i += cnt) {
+               cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i);
+               blk = erofs_blknr(*offset);
+
+               if (!page || page->index != blk) {
+                       if (page) {
+                               kunmap(page);
+                               unlock_page(page);
+                               put_page(page);
+                       }
+                       page = erofs_get_meta_page(sb, blk);
+                       if (IS_ERR(page)) {
+                               kfree(buffer);
+                               goto err_nullpage;
+                       }
+                       ptr = kmap(page);
+               }
+               memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt);
+               *offset += cnt;
+       }
+out:
+       kunmap(page);
+       *pagep = page;
+       return buffer;
+err_nullpage:
+       *pagep = NULL;
+       return page;
+}
+
+static int erofs_load_compr_cfgs(struct super_block *sb,
+                                struct erofs_super_block *dsb)
+{
+       struct erofs_sb_info *sbi;
+       struct page *page;
+       unsigned int algs, alg;
+       erofs_off_t offset;
+       int size, ret;
+
+       sbi = EROFS_SB(sb);
+       sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
+
+       if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
+               erofs_err(sb, "try to load compressed fs with unsupported algorithms %x",
+                         sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
+               return -EINVAL;
+       }
+
+       offset = EROFS_SUPER_OFFSET + sbi->sb_size;
+       page = NULL;
+       alg = 0;
+       ret = 0;
+
+       for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+               void *data;
+
+               if (!(algs & 1))
+                       continue;
+
+               data = erofs_read_metadata(sb, &page, &offset, &size);
+               if (IS_ERR(data)) {
+                       ret = PTR_ERR(data);
+                       goto err;
+               }
+
+               switch (alg) {
+               case Z_EROFS_COMPRESSION_LZ4:
+                       ret = z_erofs_load_lz4_config(sb, dsb, data, size);
+                       break;
+               default:
+                       DBG_BUGON(1);
+                       ret = -EFAULT;
+               }
+               kfree(data);
+               if (ret)
+                       goto err;
+       }
+err:
+       if (page) {
+               unlock_page(page);
+               put_page(page);
+       }
+       return ret;
+}
+#else
+static int erofs_load_compr_cfgs(struct super_block *sb,
+                                struct erofs_super_block *dsb)
+{
+       if (dsb->u1.available_compr_algs) {
+               erofs_err(sb, "try to load compressed fs when compression is disabled");
+               return -EINVAL;
+       }
+       return 0;
+}
+#endif
+
 static int erofs_read_superblock(struct super_block *sb)
 {
        struct erofs_sb_info *sbi;
@@ -149,7 +279,7 @@ static int erofs_read_superblock(struct super_block *sb)
        }
 
        sbi->feature_compat = le32_to_cpu(dsb->feature_compat);
-       if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) {
+       if (erofs_sb_has_sb_chksum(sbi)) {
                ret = erofs_superblock_csum_verify(sb, data);
                if (ret)
                        goto out;
@@ -166,6 +296,12 @@ static int erofs_read_superblock(struct super_block *sb)
        if (!check_layout_compatibility(sb, dsb))
                goto out;
 
+       sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
+       if (sbi->sb_size > EROFS_BLKSIZ) {
+               erofs_err(sb, "invalid sb_extslots %u (more than a fs block)",
+                         sbi->sb_size);
+               goto out;
+       }
        sbi->blocks = le32_to_cpu(dsb->blocks);
        sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -187,7 +323,12 @@ static int erofs_read_superblock(struct super_block *sb)
                ret = -EFSCORRUPTED;
                goto out;
        }
-       ret = 0;
+
+       /* parse on-disk compression configurations */
+       if (erofs_sb_has_compr_cfgs(sbi))
+               ret = erofs_load_compr_cfgs(sb, dsb);
+       else
+               ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
 out:
        kunmap(page);
        put_page(page);
@@ -200,6 +341,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx)
 #ifdef CONFIG_EROFS_FS_ZIP
        ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
        ctx->max_sync_decompress_pages = 3;
+       ctx->readahead_sync_decompress = false;
 #endif
 #ifdef CONFIG_EROFS_FS_XATTR
        set_opt(ctx, XATTR_USER);
@@ -513,6 +655,7 @@ static int __init erofs_module_init(void)
        if (err)
                goto shrinker_err;
 
+       erofs_pcpubuf_init();
        err = z_erofs_init_zip_subsystem();
        if (err)
                goto zip_err;
@@ -542,6 +685,7 @@ static void __exit erofs_module_exit(void)
        /* Ensure all RCU free inodes are safe before cache is destroyed. */
        rcu_barrier();
        kmem_cache_destroy(erofs_inode_cachep);
+       erofs_pcpubuf_exit();
 }
 
 /* get filesystem statistics */
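
For orientation, the metadata that erofs_load_compr_cfgs() walks above
sits directly after the (possibly extended) superblock; each algorithm
bit set in available_compr_algs contributes one 4-byte-aligned,
length-prefixed record. A rough sketch of that on-disk layout (only lz4
records exist so far):

    EROFS_SUPER_OFFSET + sb_size
     |
     v
     +--------+--------------------------+--------+--------------------+
     | __le16 | z_erofs_lz4_cfgs payload | __le16 | next algorithm's   |
     | length | (14 bytes)               | length | payload, if any    |
     +--------+--------------------------+--------+--------------------+

A record may cross a block boundary, which is why erofs_read_metadata()
re-grabs the meta page whenever erofs_blknr(*offset) changes.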
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index de9986d..6758c5b 100644
@@ -21,18 +21,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
        return page;
 }
 
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-static struct {
-       u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
-} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
-
-void *erofs_get_pcpubuf(unsigned int pagenr)
-{
-       preempt_disable();
-       return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
-}
-#endif
-
 #ifdef CONFIG_EROFS_FS_ZIP
 /* global shrink count (for all mounted EROFS instances) */
 static atomic_long_t erofs_global_shrink_cnt;
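
The zdata.c diff below replaces the single fixed-size pcluster slab with
size-classed pools; z_erofs_alloc_pcluster() picks the smallest class
whose maxpages covers the request. A userspace sketch of that selection
(assuming 4KiB pages, so Z_EROFS_PCLUSTER_MAX_PAGES is 256):

    #include <stdio.h>

    /* size classes matching pcluster_pool[] below; 256 assumes
     * Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE with 4KiB pages */
    static const unsigned int maxpages[] = { 1, 4, 16, 64, 128, 256 };

    /* mirrors z_erofs_alloc_pcluster(): pick the smallest class that fits */
    static int pick_slab(unsigned int nrpages)
    {
            unsigned int i;

            for (i = 0; i < sizeof(maxpages) / sizeof(*maxpages); ++i)
                    if (nrpages <= maxpages[i])
                            return maxpages[i];
            return -1;      /* larger than any class: -EINVAL in the kernel */
    }

    int main(void)
    {
            /* a 9-page pcluster comes from the 16-page slab; 300 is rejected */
            printf("%d %d\n", pick_slab(9), pick_slab(300));
            return 0;
    }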
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 3851e1a..78e4b59 100644
 
 #include <trace/events/erofs.h>
 
+/*
+ * since pclustersize is variable for big pcluster feature, introduce slab
+ * pools implementation for different pcluster sizes.
+ */
+struct z_erofs_pcluster_slab {
+       struct kmem_cache *slab;
+       unsigned int maxpages;
+       char name[48];
+};
+
+#define _PCLP(n) { .maxpages = n }
+
+static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
+       _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
+       _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+};
+
+static void z_erofs_destroy_pcluster_pool(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+               if (!pcluster_pool[i].slab)
+                       continue;
+               kmem_cache_destroy(pcluster_pool[i].slab);
+               pcluster_pool[i].slab = NULL;
+       }
+}
+
+static int z_erofs_create_pcluster_pool(void)
+{
+       struct z_erofs_pcluster_slab *pcs;
+       struct z_erofs_pcluster *a;
+       unsigned int size;
+
+       for (pcs = pcluster_pool;
+            pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
+               size = struct_size(a, compressed_pages, pcs->maxpages);
+
+               sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
+               pcs->slab = kmem_cache_create(pcs->name, size, 0,
+                                             SLAB_RECLAIM_ACCOUNT, NULL);
+               if (pcs->slab)
+                       continue;
+
+               z_erofs_destroy_pcluster_pool();
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+               struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+               struct z_erofs_pcluster *pcl;
+
+               if (nrpages > pcs->maxpages)
+                       continue;
+
+               pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+               if (!pcl)
+                       return ERR_PTR(-ENOMEM);
+               pcl->pclusterpages = nrpages;
+               return pcl;
+       }
+       return ERR_PTR(-EINVAL);
+}
+
+static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+               struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+
+               if (pcl->pclusterpages > pcs->maxpages)
+                       continue;
+
+               kmem_cache_free(pcs->slab, pcl);
+               return;
+       }
+       DBG_BUGON(1);
+}
+
 /*
  * a compressed_pages[] placeholder in order to avoid
  * being filled with file pages for in-place decompression.
@@ -37,12 +124,11 @@ typedef tagptr1_t compressed_page_t;
        tagptr_fold(compressed_page_t, page, 1)
 
 static struct workqueue_struct *z_erofs_workqueue __read_mostly;
-static struct kmem_cache *pcluster_cachep __read_mostly;
 
 void z_erofs_exit_zip_subsystem(void)
 {
        destroy_workqueue(z_erofs_workqueue);
-       kmem_cache_destroy(pcluster_cachep);
+       z_erofs_destroy_pcluster_pool();
 }
 
 static inline int z_erofs_init_workqueue(void)
@@ -59,32 +145,16 @@ static inline int z_erofs_init_workqueue(void)
        return z_erofs_workqueue ? 0 : -ENOMEM;
 }
 
-static void z_erofs_pcluster_init_once(void *ptr)
-{
-       struct z_erofs_pcluster *pcl = ptr;
-       struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
-       unsigned int i;
-
-       mutex_init(&cl->lock);
-       cl->nr_pages = 0;
-       cl->vcnt = 0;
-       for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
-               pcl->compressed_pages[i] = NULL;
-}
-
 int __init z_erofs_init_zip_subsystem(void)
 {
-       pcluster_cachep = kmem_cache_create("erofs_compress",
-                                           Z_EROFS_WORKGROUP_SIZE, 0,
-                                           SLAB_RECLAIM_ACCOUNT,
-                                           z_erofs_pcluster_init_once);
-       if (pcluster_cachep) {
-               if (!z_erofs_init_workqueue())
-                       return 0;
-
-               kmem_cache_destroy(pcluster_cachep);
-       }
-       return -ENOMEM;
+       int err = z_erofs_create_pcluster_pool();
+
+       if (err)
+               return err;
+       err = z_erofs_init_workqueue();
+       if (err)
+               z_erofs_destroy_pcluster_pool();
+       return err;
 }
 
 enum z_erofs_collectmode {
@@ -104,6 +174,12 @@ enum z_erofs_collectmode {
         * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
         */
        COLLECT_PRIMARY_HOOKED,
+       /*
+        * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it
+        * could be dispatched into bypass queue later due to uptodated managed
+        * pages. All related online pages cannot be reused for inplace I/O (or
+        * pagevec) since it can be directly decoded without I/O submission.
+        */
        COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
        /*
         * The current collection has been linked with the owned chain, and
@@ -128,7 +204,8 @@ struct z_erofs_collector {
 
        struct z_erofs_pcluster *pcl, *tailpcl;
        struct z_erofs_collection *cl;
-       struct page **compressedpages;
+       /* a pointer used to pick up inplace I/O pages */
+       struct page **icpage_ptr;
        z_erofs_next_pcluster_t owned_head;
 
        enum z_erofs_collectmode mode;
@@ -162,18 +239,19 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
                                     enum z_erofs_cache_alloctype type,
                                     struct list_head *pagepool)
 {
-       const struct z_erofs_pcluster *pcl = clt->pcl;
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
-       struct page **pages = clt->compressedpages;
-       pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
+       struct z_erofs_pcluster *pcl = clt->pcl;
        bool standalone = true;
        gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
                        __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+       struct page **pages;
+       pgoff_t index;
 
        if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
                return;
 
-       for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
+       pages = pcl->compressed_pages;
+       index = pcl->obj.index;
+       for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) {
                struct page *page;
                compressed_page_t t;
                struct page *newpage = NULL;
@@ -186,21 +264,25 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
 
                if (page) {
                        t = tag_compressed_page_justfound(page);
-               } else if (type == DELAYEDALLOC) {
-                       t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
-               } else if (type == TRYALLOC) {
-                       newpage = erofs_allocpage(pagepool, gfp);
-                       if (!newpage)
-                               goto dontalloc;
-
-                       set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
-                       t = tag_compressed_page_justfound(newpage);
-               } else {        /* DONTALLOC */
-dontalloc:
-                       if (standalone)
-                               clt->compressedpages = pages;
+               } else {
+                       /* I/O is needed, no possible to decompress directly */
                        standalone = false;
-                       continue;
+                       switch (type) {
+                       case DELAYEDALLOC:
+                               t = tagptr_init(compressed_page_t,
+                                               PAGE_UNALLOCATED);
+                               break;
+                       case TRYALLOC:
+                               newpage = erofs_allocpage(pagepool, gfp);
+                               if (!newpage)
+                                       continue;
+                               set_page_private(newpage,
+                                                Z_EROFS_PREALLOCATED_PAGE);
+                               t = tag_compressed_page_justfound(newpage);
+                               break;
+                       default:        /* DONTALLOC */
+                               continue;
+                       }
                }
 
                if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
@@ -214,7 +296,11 @@ dontalloc:
                }
        }
 
-       if (standalone)         /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
+       /*
+        * don't do inplace I/O if all compressed pages are available in
+        * managed cache since it can be moved to the bypass queue instead.
+        */
+       if (standalone)
                clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
 }
 
@@ -225,14 +311,13 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
        struct z_erofs_pcluster *const pcl =
                container_of(grp, struct z_erofs_pcluster, obj);
        struct address_space *const mapping = MNGD_MAPPING(sbi);
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
        int i;
 
        /*
         * refcount of workgroup is now freezed as 1,
         * therefore no need to worry about available decompression users.
         */
-       for (i = 0; i < clusterpages; ++i) {
+       for (i = 0; i < pcl->pclusterpages; ++i) {
                struct page *page = pcl->compressed_pages[i];
 
                if (!page)
@@ -257,13 +342,12 @@ int erofs_try_to_free_cached_page(struct address_space *mapping,
                                  struct page *page)
 {
        struct z_erofs_pcluster *const pcl = (void *)page_private(page);
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
        int ret = 0;    /* 0 - busy */
 
        if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
                unsigned int i;
 
-               for (i = 0; i < clusterpages; ++i) {
+               for (i = 0; i < pcl->pclusterpages; ++i) {
                        if (pcl->compressed_pages[i] == page) {
                                WRITE_ONCE(pcl->compressed_pages[i], NULL);
                                ret = 1;
@@ -279,16 +363,14 @@ int erofs_try_to_free_cached_page(struct address_space *mapping,
 }
 
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
-static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
-                                         struct page *page)
+static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+                                  struct page *page)
 {
        struct z_erofs_pcluster *const pcl = clt->pcl;
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
 
-       while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
-               if (!cmpxchg(clt->compressedpages++, NULL, page))
+       while (clt->icpage_ptr > pcl->compressed_pages)
+               if (!cmpxchg(--clt->icpage_ptr, NULL, page))
                        return true;
-       }
        return false;
 }
 
@@ -399,10 +481,10 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
        struct erofs_workgroup *grp;
        int err;
 
-       /* no available workgroup, let's allocate one */
-       pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
-       if (!pcl)
-               return -ENOMEM;
+       /* no available pcluster, let's allocate one */
+       pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
+       if (IS_ERR(pcl))
+               return PTR_ERR(pcl);
 
        atomic_set(&pcl->obj.refcount, 1);
        pcl->obj.index = map->m_pa >> PAGE_SHIFT;
@@ -416,25 +498,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
        else
                pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
 
-       pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
-       pcl->clusterbits -= PAGE_SHIFT;
-
        /* new pclusters should be claimed as type 1, primary and followed */
        pcl->next = clt->owned_head;
        clt->mode = COLLECT_PRIMARY_FOLLOWED;
 
        cl = z_erofs_primarycollection(pcl);
-
-       /* must be cleaned before freeing to slab */
-       DBG_BUGON(cl->nr_pages);
-       DBG_BUGON(cl->vcnt);
-
        cl->pageofs = map->m_la & ~PAGE_MASK;
 
        /*
         * lock all primary followed works before they become visible to
         * others; mutex_trylock *never* fails for a new pcluster.
         */
+       mutex_init(&cl->lock);
        DBG_BUGON(!mutex_trylock(&cl->lock));
 
        grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
@@ -458,7 +533,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
 
 err_out:
        mutex_unlock(&cl->lock);
-       kmem_cache_free(pcluster_cachep, pcl);
+       z_erofs_free_pcluster(pcl);
        return err;
 }
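
z_erofs_alloc_pcluster()/z_erofs_free_pcluster() come from the slab-pool patch earlier in this series: one pool per pcluster size class instead of a single fixed-size cache. A rough userspace model of the idea (the pool sizes below are an assumption for illustration, and malloc()/calloc() stand in for per-size kmem caches):

    #include <stdlib.h>

    struct pcluster_model {
            unsigned short pclusterpages;   /* physical cluster size in pages */
            void *compressed_pages[];       /* flexible array, one slot per page */
    };

    /* illustrative size classes; the smallest pool that fits is used */
    static const unsigned int pool_pages[] = { 1, 2, 4, 8 };

    static struct pcluster_model *alloc_pcluster(unsigned int nrpages)
    {
            for (unsigned int i = 0; i < sizeof(pool_pages) / sizeof(*pool_pages); ++i) {
                    struct pcluster_model *pcl;

                    if (nrpages > pool_pages[i])
                            continue;
                    pcl = calloc(1, sizeof(*pcl) + pool_pages[i] *
                                 sizeof(pcl->compressed_pages[0]));
                    if (pcl)
                            pcl->pclusterpages = nrpages;
                    return pcl;
            }
            return NULL;    /* larger than any pool: rejected */
    }
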
 
@@ -502,9 +577,8 @@ out:
        z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
                                  clt->cl->pagevec, clt->cl->vcnt);
 
-       clt->compressedpages = clt->pcl->compressed_pages;
-       if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
-               clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
+       /* since file-backed online pages are traversed in reverse order */
+       clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages;
        return 0;
 }
 
@@ -517,9 +591,8 @@ static void z_erofs_rcu_callback(struct rcu_head *head)
        struct z_erofs_collection *const cl =
                container_of(head, struct z_erofs_collection, rcu);
 
-       kmem_cache_free(pcluster_cachep,
-                       container_of(cl, struct z_erofs_pcluster,
-                                    primary_collection));
+       z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster,
+                                          primary_collection));
 }
 
 void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
@@ -706,9 +779,12 @@ err_out:
        goto out;
 }
 
+static void z_erofs_decompressqueue_work(struct work_struct *work);
 static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
                                       bool sync, int bios)
 {
+       struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
+
        /* wake up the caller thread for sync decompression */
        if (sync) {
                unsigned long flags;
@@ -720,8 +796,15 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
                return;
        }
 
-       if (!atomic_add_return(bios, &io->pending_bios))
+       if (atomic_add_return(bios, &io->pending_bios))
+               return;
+       /* Use workqueue and sync decompression for atomic contexts only */
+       if (in_atomic() || irqs_disabled()) {
                queue_work(z_erofs_workqueue, &io->u.work);
+               sbi->ctx.readahead_sync_decompress = true;
+               return;
+       }
+       z_erofs_decompressqueue_work(&io->u.work);
 }
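
The kickoff now makes a three-way choice instead of unconditionally queueing work. A compact sketch of the policy (in_atomic_ctx stands in for in_atomic() || irqs_disabled(), which has no userspace equivalent):

    #include <stdbool.h>

    enum kickoff { KICK_NONE, KICK_WORKQUEUE, KICK_INLINE };

    static enum kickoff pick_kickoff(int pending_after_add, bool in_atomic_ctx)
    {
            if (pending_after_add)
                    return KICK_NONE;       /* bios still in flight, wait */
            if (in_atomic_ctx)
                    return KICK_WORKQUEUE;  /* cannot decompress here, defer */
            return KICK_INLINE;             /* decompress in the current context */
    }
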
 
 static bool z_erofs_page_is_invalidated(struct page *page)
@@ -761,9 +844,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
                                       struct list_head *pagepool)
 {
        struct erofs_sb_info *const sbi = EROFS_SB(sb);
-       const unsigned int clusterpages = BIT(pcl->clusterbits);
        struct z_erofs_pagevec_ctor ctor;
-       unsigned int i, outputsize, llen, nr_pages;
+       unsigned int i, inputsize, outputsize, llen, nr_pages;
        struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
        struct page **pages, **compressed_pages, *page;
 
@@ -843,7 +925,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
        overlapped = false;
        compressed_pages = pcl->compressed_pages;
 
-       for (i = 0; i < clusterpages; ++i) {
+       for (i = 0; i < pcl->pclusterpages; ++i) {
                unsigned int pagenr;
 
                page = compressed_pages[i];
@@ -896,12 +978,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
                partial = true;
        }
 
+       inputsize = pcl->pclusterpages * PAGE_SIZE;
        err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
                                        .sb = sb,
                                        .in = compressed_pages,
                                        .out = pages,
                                        .pageofs_out = cl->pageofs,
-                                       .inputsize = PAGE_SIZE,
+                                       .inputsize = inputsize,
                                        .outputsize = outputsize,
                                        .alg = pcl->algorithmformat,
                                        .inplace_io = overlapped,
@@ -909,8 +992,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
                                 }, pagepool);
 
 out:
-       /* must handle all compressed pages before endding pages */
-       for (i = 0; i < clusterpages; ++i) {
+       /* must handle all compressed pages before ending pages */
+       for (i = 0; i < pcl->pclusterpages; ++i) {
                page = compressed_pages[i];
 
                if (erofs_page_is_managed(sbi, page))
@@ -1213,7 +1296,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
                pcl = container_of(owned_head, struct z_erofs_pcluster, next);
 
                cur = pcl->obj.index;
-               end = cur + BIT(pcl->clusterbits);
+               end = cur + pcl->pclusterpages;
 
                /* close the main owned chain at first */
                owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
@@ -1333,7 +1416,8 @@ static void z_erofs_readahead(struct readahead_control *rac)
        struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
 
        unsigned int nr_pages = readahead_count(rac);
-       bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages);
+       bool sync = (sbi->ctx.readahead_sync_decompress &&
+                       nr_pages <= sbi->ctx.max_sync_decompress_pages);
        struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
        struct page *page, *head = NULL;
        LIST_HEAD(pagepool);
index b503b35..942ee69 100644
@@ -10,6 +10,7 @@
 #include "internal.h"
 #include "zpvec.h"
 
+#define Z_EROFS_PCLUSTER_MAX_PAGES     (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
 #define Z_EROFS_NR_INLINE_PAGEVECS      3
 
 /*
@@ -59,16 +60,17 @@ struct z_erofs_pcluster {
        /* A: point to next chained pcluster or TAILs */
        z_erofs_next_pcluster_t next;
 
-       /* A: compressed pages (including multi-usage pages) */
-       struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
-
        /* A: lower limit of decompressed length and if full length or not */
        unsigned int length;
 
+       /* I: physical cluster size in pages */
+       unsigned short pclusterpages;
+
        /* I: compression algorithm format */
        unsigned char algorithmformat;
-       /* I: bit shift of physical cluster size */
-       unsigned char clusterbits;
+
+       /* A: compressed pages (can be cached or inplaced pages) */
+       struct page *compressed_pages[];
 };
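
Turning compressed_pages[] into a flexible array member makes the allocation size a function of pclusterpages. Assuming Z_EROFS_PCLUSTER_MAX_SIZE is 1MiB (as defined elsewhere in the series) and 4KiB pages, Z_EROFS_PCLUSTER_MAX_PAGES comes out to 256; a small sizing example:

    #include <stdio.h>

    struct pcluster_model {
            unsigned short pclusterpages;
            void *compressed_pages[];       /* flexible array member */
    };

    int main(void)
    {
            unsigned int nrpages = 4;       /* e.g. a 16KiB pcluster on 4KiB pages */
            size_t bytes = sizeof(struct pcluster_model) +
                           nrpages * sizeof(void *);

            printf("per-pcluster allocation: %zu bytes\n", bytes);
            return 0;
    }
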
 
 #define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
@@ -82,8 +84,6 @@ struct z_erofs_pcluster {
 
 #define Z_EROFS_PCLUSTER_NIL            (NULL)
 
-#define Z_EROFS_WORKGROUP_SIZE  sizeof(struct z_erofs_pcluster)
-
 struct z_erofs_decompressqueue {
        struct super_block *sb;
        atomic_t pending_bios;
index 14d2de3..e62d813 100644
 int z_erofs_fill_inode(struct inode *inode)
 {
        struct erofs_inode *const vi = EROFS_I(inode);
+       struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
 
-       if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
+       if (!erofs_sb_has_big_pcluster(sbi) &&
+           vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
                vi->z_advise = 0;
                vi->z_algorithmtype[0] = 0;
                vi->z_algorithmtype[1] = 0;
                vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
-               vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits;
-               vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits;
                set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
        }
-
        inode->i_mapping->a_ops = &z_erofs_aops;
        return 0;
 }
@@ -52,7 +51,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
        if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
                goto out_unlock;
 
-       DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
+       DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+                 vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
 
        pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
                    vi->xattr_isize, 8);
@@ -77,18 +77,22 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
        }
 
        vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
-       vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits +
-                                       ((h->h_clusterbits >> 3) & 3);
-
-       if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) {
-               erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel",
-                         vi->z_physical_clusterbits[0], vi->nid);
-               err = -EOPNOTSUPP;
+       if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+           vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+                           Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+               erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
+                         vi->nid);
+               err = -EFSCORRUPTED;
+               goto unmap_done;
+       }
+       if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
+           !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+           !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+               erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
+                         vi->nid);
+               err = -EFSCORRUPTED;
                goto unmap_done;
        }
-
-       vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
-                                       ((h->h_clusterbits >> 5) & 7);
        /* paired with smp_mb() at the beginning of the function */
        smp_mb();
        set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
@@ -111,7 +115,7 @@ struct z_erofs_maprecorder {
        u8  type;
        u16 clusterofs;
        u16 delta[2];
-       erofs_blk_t pblk;
+       erofs_blk_t pblk, compressedlcs;
 };
 
 static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
@@ -174,6 +178,15 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
        case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
                m->clusterofs = 1 << vi->z_logical_clusterbits;
                m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
+               if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+                       if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+                               DBG_BUGON(1);
+                               return -EFSCORRUPTED;
+                       }
+                       m->compressedlcs = m->delta[0] &
+                               ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+                       m->delta[0] = 1;
+               }
                m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
                break;
        case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
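
The new branch implements the CBLKCNT convention: in a NONHEAD lcluster, a set high bit turns delta[0] into the pcluster's size in lclusters rather than a lookback distance. A decode sketch (D0_CBLKCNT is assumed to mirror Z_EROFS_VLE_DI_D0_CBLKCNT as bit 11):

    #include <stdint.h>

    #define D0_CBLKCNT      (1u << 11)      /* assumed flag bit in delta[0] */

    /* returns the lookback distance and, when flagged, the pcluster size */
    static uint16_t decode_delta0(uint16_t d0, unsigned int *compressedlcs)
    {
            if (d0 & D0_CBLKCNT) {
                    *compressedlcs = d0 & ~D0_CBLKCNT;
                    return 1;       /* acts as a distance-1 NONHEAD entry */
            }
            return d0;              /* ordinary lookback distance */
    }
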
@@ -210,6 +223,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
        unsigned int vcnt, base, lo, encodebits, nblk;
        int i;
        u8 *in, type;
+       bool big_pcluster;
 
        if (1 << amortizedshift == 4)
                vcnt = 2;
@@ -218,6 +232,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
        else
                return -EOPNOTSUPP;
 
+       big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
        encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
        base = round_down(eofs, vcnt << amortizedshift);
        in = m->kaddr + base;
@@ -229,7 +244,15 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
        m->type = type;
        if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
                m->clusterofs = 1 << lclusterbits;
-               if (i + 1 != vcnt) {
+               if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+                       if (!big_pcluster) {
+                               DBG_BUGON(1);
+                               return -EFSCORRUPTED;
+                       }
+                       m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+                       m->delta[0] = 1;
+                       return 0;
+               } else if (i + 1 != (int)vcnt) {
                        m->delta[0] = lo;
                        return 0;
                }
@@ -242,22 +265,48 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
                                          in, encodebits * (i - 1), &type);
                if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
                        lo = 0;
+               else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT)
+                       lo = 1;
                m->delta[0] = lo + 1;
                return 0;
        }
        m->clusterofs = lo;
        m->delta[0] = 0;
        /* figure out blkaddr (pblk) for HEAD lclusters */
-       nblk = 1;
-       while (i > 0) {
-               --i;
-               lo = decode_compactedbits(lclusterbits, lomask,
-                                         in, encodebits * i, &type);
-               if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
-                       i -= lo;
-
-               if (i >= 0)
+       if (!big_pcluster) {
+               nblk = 1;
+               while (i > 0) {
+                       --i;
+                       lo = decode_compactedbits(lclusterbits, lomask,
+                                                 in, encodebits * i, &type);
+                       if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+                               i -= lo;
+
+                       if (i >= 0)
+                               ++nblk;
+               }
+       } else {
+               nblk = 0;
+               while (i > 0) {
+                       --i;
+                       lo = decode_compactedbits(lclusterbits, lomask,
+                                                 in, encodebits * i, &type);
+                       if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+                               if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+                                       --i;
+                                       nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+                                       continue;
+                               }
+                               /* big pclusters shouldn't have a plain d0 == 1 */
+                               if (lo <= 1) {
+                                       DBG_BUGON(1);
+                                       return -EFSCORRUPTED;
+                               }
+                               i -= lo - 2;
+                               continue;
+                       }
                        ++nblk;
+               }
        }
        in += (vcnt << amortizedshift) - sizeof(__le32);
        m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
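
For reference, a toy model of the big-pcluster branch of this walk, over pre-decoded (type, lo) pairs: each CBLKCNT entry contributes its stored block count, each plain HEAD contributes one block, and NONHEAD runs are jumped over (D0_CBLKCNT as in the sketch above):

    #define D0_CBLKCNT      (1u << 11)      /* assumed CBLKCNT flag bit */

    enum { T_HEAD, T_NONHEAD };
    struct lent { int type; unsigned int lo; };

    static unsigned int count_pblks(const struct lent *e, int i)
    {
            unsigned int nblk = 0;

            while (i > 0) {
                    --i;
                    if (e[i].type == T_NONHEAD) {
                            if (e[i].lo & D0_CBLKCNT) {
                                    nblk += e[i].lo & ~D0_CBLKCNT;
                                    --i;            /* skip the matching HEAD */
                                    continue;
                            }
                            /* lo <= 1 is invalid here (-EFSCORRUPTED in-kernel) */
                            if (e[i].lo <= 1)
                                    return 0;
                            i -= (int)e[i].lo - 2;  /* jump over the whole run */
                            continue;
                    }
                    ++nblk;                         /* plain one-block HEAD */
            }
            return nblk;
    }
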
@@ -381,6 +430,58 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
        return 0;
 }
 
+static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
+                                           unsigned int initial_lcn)
+{
+       struct erofs_inode *const vi = EROFS_I(m->inode);
+       struct erofs_map_blocks *const map = m->map;
+       const unsigned int lclusterbits = vi->z_logical_clusterbits;
+       unsigned long lcn;
+       int err;
+
+       DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
+                 m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD);
+       if (!(map->m_flags & EROFS_MAP_ZIPPED) ||
+           !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) {
+               map->m_plen = 1 << lclusterbits;
+               return 0;
+       }
+
+       lcn = m->lcn + 1;
+       if (m->compressedlcs)
+               goto out;
+       if (lcn == initial_lcn)
+               goto err_bonus_cblkcnt;
+
+       err = z_erofs_load_cluster_from_disk(m, lcn);
+       if (err)
+               return err;
+
+       switch (m->type) {
+       case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+               if (m->delta[0] != 1)
+                       goto err_bonus_cblkcnt;
+               if (m->compressedlcs)
+                       break;
+               fallthrough;
+       default:
+               erofs_err(m->inode->i_sb,
+                         "cannot find CBLKCNT @ lcn %lu of nid %llu",
+                         lcn, vi->nid);
+               DBG_BUGON(1);
+               return -EFSCORRUPTED;
+       }
+out:
+       map->m_plen = m->compressedlcs << lclusterbits;
+       return 0;
+err_bonus_cblkcnt:
+       erofs_err(m->inode->i_sb,
+                 "bogus CBLKCNT @ lcn %lu of nid %llu",
+                 lcn, vi->nid);
+       DBG_BUGON(1);
+       return -EFSCORRUPTED;
+}
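
A worked instance of the final computation: with 4KiB logical clusters (lclusterbits == 12, i.e. LOG_BLOCK_SIZE on a 4KiB-block image) and a CBLKCNT of 4 lclusters, the extent's compressed length comes out to 16KiB:

    #include <assert.h>

    int main(void)
    {
            const unsigned int lclusterbits = 12;   /* 4KiB logical clusters */
            const unsigned long compressedlcs = 4;  /* from the CBLKCNT field */

            /* map->m_plen = m->compressedlcs << lclusterbits */
            assert((compressedlcs << lclusterbits) == 16384);
            return 0;
    }
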
+
 int z_erofs_map_blocks_iter(struct inode *inode,
                            struct erofs_map_blocks *map,
                            int flags)
@@ -392,6 +493,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
        };
        int err = 0;
        unsigned int lclusterbits, endoff;
+       unsigned long initial_lcn;
        unsigned long long ofs, end;
 
        trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
@@ -410,10 +512,10 @@ int z_erofs_map_blocks_iter(struct inode *inode,
 
        lclusterbits = vi->z_logical_clusterbits;
        ofs = map->m_la;
-       m.lcn = ofs >> lclusterbits;
+       initial_lcn = ofs >> lclusterbits;
        endoff = ofs & ((1 << lclusterbits) - 1);
 
-       err = z_erofs_load_cluster_from_disk(&m, m.lcn);
+       err = z_erofs_load_cluster_from_disk(&m, initial_lcn);
        if (err)
                goto unmap_out;
 
@@ -443,7 +545,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
                m.delta[0] = 1;
                fallthrough;
        case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
-               /* get the correspoinding first chunk */
+               /* get the corresponding first chunk */
                err = z_erofs_extent_lookback(&m, m.delta[0]);
                if (err)
                        goto unmap_out;
@@ -457,10 +559,12 @@ int z_erofs_map_blocks_iter(struct inode *inode,
        }
 
        map->m_llen = end - map->m_la;
-       map->m_plen = 1 << lclusterbits;
        map->m_pa = blknr_to_addr(m.pblk);
        map->m_flags |= EROFS_MAP_MAPPED;
 
+       err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
+       if (err)
+               goto out;
 unmap_out:
        if (m.kaddr)
                kunmap_atomic(m.kaddr);