From: Linus Torvalds
Date: Thu, 2 Sep 2021 16:12:05 +0000 (-0700)
Subject: Merge tag 'erofs-for-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang...
X-Git-Tag: microblaze-v5.16~132
X-Git-Url: http://git.monstr.eu/?p=linux-2.6-microblaze.git;a=commitdiff_plain;h=412106c203b759fa7fbcc4f855a90ab18e681ccb;hp=89594c746b00d3755e0792a2407f0b557a30ef37

Merge tag 'erofs-for-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs

Pull erofs updates from Gao Xiang:
 "In this cycle, direct I/O and fsdax support for uncompressed files are
  now added in order to avoid double-caching for loop device and VM
  container use cases. All uncompressed cases have been converted to the
  iomap infrastructure, which is much simpler and cleaner.

  In addition, fiemap support is added for both compressed and
  uncompressed files through the iomap infrastructure as well, so end
  users can easily query how file data is distributed on disk.

  We've also added support for chunk-based uncompressed files for data
  deduplication, as the next step of the VM container use cases.

  Summary:

   - support direct I/O for all uncompressed files

   - support fsdax for non-tailpacking regular files

   - use iomap infrastructure for all uncompressed cases

   - support fiemap for both (un)compressed files

   - introduce chunk-based files for chunk deduplication

   - some cleanups"

* tag 'erofs-for-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs:
  erofs: fix double free of 'copied'
  erofs: support reading chunk-based uncompressed files
  erofs: introduce chunk-based file on-disk format
  erofs: add fiemap support with iomap
  erofs: add support for the full decompressed length
  erofs: remove the mapping parameter from erofs_try_to_free_cached_page()
  erofs: directly use wrapper erofs_page_is_managed() when shrinking
  erofs: convert all uncompressed cases to iomap
  erofs: dax support for non-tailpacking regular file
  erofs: iomap support for non-tailpacking DIO
---
diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index 832839fcf4c3..b97579b7d8fb 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -84,6 +84,9 @@ cache_strategy=%s Select a strategy for cached decompression from now on: It still does in-place I/O decompression for the rest compressed physical clusters. ========== ============================================= +dax={always,never} Use direct access (no page cache). See + Documentation/filesystems/dax.rst. +dax A legacy option which is an alias for ``dax=always``. =================== ========================================================= On-disk details @@ -153,13 +156,14 @@ may not. All metadatas can be now observed in two different spaces (views): Xattrs, extents, data inline are followed by the corresponding inode with proper alignment, and they could be optional for different data mappings. - _currently_ total 4 valid data mappings are supported: + _currently_ total 5 data layouts are supported: == ==================================================================== 0 flat file data without data inline (no extent); 1 fixed-sized output data compression (with non-compacted indexes); 2 flat file data with tail packing data inline (no extent); - 3 fixed-sized output data compression (with compacted indexes, v5.3+). + 3 fixed-sized output data compression (with compacted indexes, v5.3+); + 4 chunk-based file (v5.15+). 
== ==================================================================== The size of the optional xattrs is indicated by i_xattr_count in inode @@ -210,6 +214,17 @@ Note that apart from the offset of the first filename, nameoff0 also indicates the total number of directory entries in this block since it is no need to introduce another on-disk field at all. +Chunk-based file +---------------- +In order to support chunk-based data deduplication, a new inode data layout has +been supported since Linux v5.15: Files are split in equal-sized data chunks +with ``extents`` area of the inode metadata indicating how to get the chunk +data: these can be simply as a 4-byte block address array or in the 8-byte +chunk index form (see struct erofs_inode_chunk_index in erofs_fs.h for more +details.) + +By the way, chunk-based files are all uncompressed for now. + Data compression ---------------- EROFS implements LZ4 fixed-sized output compression which generates fixed-sized diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 906af0c1998c..14b747026742 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,6 +3,7 @@ config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select FS_IOMAP select LIBCRC32C help EROFS (Enhanced Read-Only File System) is a lightweight diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 3787a5fb0a42..9db829715652 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -2,35 +2,13 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" #include - +#include #include -static void erofs_readendio(struct bio *bio) -{ - struct bio_vec *bvec; - blk_status_t err = bio->bi_status; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - - /* page is already locked */ - DBG_BUGON(PageUptodate(page)); - - if (err) - SetPageError(page); - else - SetPageUptodate(page); - - unlock_page(page); - /* page could be reclaimed now */ - } - bio_put(bio); -} - struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) { struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; @@ -59,13 +37,6 @@ static int erofs_map_blocks_flatmode(struct inode *inode, nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); lastblk = nblocks - tailendpacking; - if (offset >= inode->i_size) { - /* leave out-of-bound access unmapped */ - map->m_flags = 0; - map->m_plen = 0; - goto out; - } - /* there is no hole in flatmode */ map->m_flags = EROFS_MAP_MAPPED; @@ -100,217 +71,273 @@ static int erofs_map_blocks_flatmode(struct inode *inode, goto err_out; } -out: map->m_llen = map->m_plen; - err_out: trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); return err; } -static inline struct bio *erofs_read_raw_page(struct bio *bio, - struct address_space *mapping, - struct page *page, - erofs_off_t *last_block, - unsigned int nblocks, - unsigned int *eblks, - bool ra) +static int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags) { - struct inode *const inode = mapping->host; - struct super_block *const sb = inode->i_sb; - erofs_off_t current_block = (erofs_off_t)page->index; - int err; - - DBG_BUGON(!nblocks); - - if (PageUptodate(page)) { - err = 0; - goto has_updated; - } + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_inode_chunk_index *idx; + struct page *page; + u64 chunknr; + unsigned int unit; + erofs_off_t pos; + int err = 0; - /* note that 
for readpage case, bio also equals to NULL */ - if (bio && - (*last_block + 1 != current_block || !*eblks)) { -submit_bio_retry: - submit_bio(bio); - bio = NULL; + if (map->m_la >= inode->i_size) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; } - if (!bio) { - struct erofs_map_blocks map = { - .m_la = blknr_to_addr(current_block), - }; - erofs_blk_t blknr; - unsigned int blkoff; - - err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW); - if (err) - goto err_out; - - /* zero out the holed page */ - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - - /* imply err = 0, see erofs_map_blocks */ - goto has_updated; - } - - /* for RAW access mode, m_plen must be equal to m_llen */ - DBG_BUGON(map.m_plen != map.m_llen); - - blknr = erofs_blknr(map.m_pa); - blkoff = erofs_blkoff(map.m_pa); - - /* deal with inline page */ - if (map.m_flags & EROFS_MAP_META) { - void *vsrc, *vto; - struct page *ipage; + if (vi->datalayout != EROFS_INODE_CHUNK_BASED) + return erofs_map_blocks_flatmode(inode, map, flags); - DBG_BUGON(map.m_plen > PAGE_SIZE); + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) + unit = sizeof(*idx); /* chunk index */ + else + unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ - ipage = erofs_get_meta_page(inode->i_sb, blknr); + chunknr = map->m_la >> vi->chunkbits; + pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + vi->xattr_isize, unit) + unit * chunknr; - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto err_out; - } + page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos)); + if (IS_ERR(page)) + return PTR_ERR(page); - vsrc = kmap_atomic(ipage); - vto = kmap_atomic(page); - memcpy(vto, vsrc + blkoff, map.m_plen); - memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); - kunmap_atomic(vto); - kunmap_atomic(vsrc); - flush_dcache_page(page); + map->m_la = chunknr << vi->chunkbits; + map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, + roundup(inode->i_size - map->m_la, EROFS_BLKSIZ)); - SetPageUptodate(page); - /* TODO: could we unlock the page earlier? 
*/ - unlock_page(ipage); - put_page(ipage); + /* handle block map */ + if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { + __le32 *blkaddr = page_address(page) + erofs_blkoff(pos); - /* imply err = 0, see erofs_map_blocks */ - goto has_updated; + if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { + map->m_flags = 0; + } else { + map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; } + goto out_unlock; + } + /* parse chunk indexes */ + idx = page_address(page) + erofs_blkoff(pos); + switch (le32_to_cpu(idx->blkaddr)) { + case EROFS_NULL_ADDR: + map->m_flags = 0; + break; + default: + /* only one device is supported for now */ + if (idx->device_id) { + erofs_err(sb, "invalid device id %u @ %llu for nid %llu", + le16_to_cpu(idx->device_id), + chunknr, vi->nid); + err = -EFSCORRUPTED; + goto out_unlock; + } + map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; + break; + } +out_unlock: + unlock_page(page); + put_page(page); +out: + map->m_llen = map->m_plen; + return err; +} - /* pa must be block-aligned for raw reading */ - DBG_BUGON(erofs_blkoff(map.m_pa)); +static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + struct erofs_map_blocks map; + + map.m_la = offset; + map.m_llen = length; + + ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (ret < 0) + return ret; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->dax_dev = EROFS_I_SB(inode)->dax_dev; + iomap->offset = map.m_la; + iomap->length = map.m_llen; + iomap->flags = 0; + iomap->private = NULL; + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + if (!iomap->length) + iomap->length = length; + return 0; + } - /* max # of continuous pages */ - if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) - nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); + if (map.m_flags & EROFS_MAP_META) { + struct page *ipage; + + iomap->type = IOMAP_INLINE; + ipage = erofs_get_meta_page(inode->i_sb, + erofs_blknr(map.m_pa)); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + iomap->inline_data = page_address(ipage) + + erofs_blkoff(map.m_pa); + iomap->private = ipage; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = map.m_pa; + } + return 0; +} - *eblks = bio_max_segs(nblocks); - bio = bio_alloc(GFP_NOIO, *eblks); +static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + struct page *ipage = iomap->private; - bio->bi_end_io = erofs_readendio; - bio_set_dev(bio, sb->s_bdev); - bio->bi_iter.bi_sector = (sector_t)blknr << - LOG_SECTORS_PER_BLOCK; - bio->bi_opf = REQ_OP_READ | (ra ? 
REQ_RAHEAD : 0); + if (ipage) { + DBG_BUGON(iomap->type != IOMAP_INLINE); + unlock_page(ipage); + put_page(ipage); + } else { + DBG_BUGON(iomap->type == IOMAP_INLINE); } + return written; +} - err = bio_add_page(bio, page, PAGE_SIZE, 0); - /* out of the extent or bio is full */ - if (err < PAGE_SIZE) - goto submit_bio_retry; - --*eblks; - *last_block = current_block; - return bio; +static const struct iomap_ops erofs_iomap_ops = { + .iomap_begin = erofs_iomap_begin, + .iomap_end = erofs_iomap_end, +}; -err_out: - /* for sync reading, set page error immediately */ - if (!ra) { - SetPageError(page); - ClearPageUptodate(page); +int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { +#ifdef CONFIG_EROFS_FS_ZIP + return iomap_fiemap(inode, fieinfo, start, len, + &z_erofs_iomap_report_ops); +#else + return -EOPNOTSUPP; +#endif } -has_updated: - unlock_page(page); - - /* if updated manually, continuous pages has a gap */ - if (bio) - submit_bio(bio); - return err ? ERR_PTR(err) : NULL; + return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops); } /* * since we dont have write or truncate flows, so no inode * locking needs to be held at the moment. */ -static int erofs_raw_access_readpage(struct file *file, struct page *page) +static int erofs_readpage(struct file *file, struct page *page) { - erofs_off_t last_block; - unsigned int eblks; - struct bio *bio; - - trace_erofs_readpage(page, true); + return iomap_readpage(page, &erofs_iomap_ops); +} - bio = erofs_read_raw_page(NULL, page->mapping, - page, &last_block, 1, &eblks, false); +static void erofs_readahead(struct readahead_control *rac) +{ + return iomap_readahead(rac, &erofs_iomap_ops); +} - if (IS_ERR(bio)) - return PTR_ERR(bio); +static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +{ + return iomap_bmap(mapping, block, &erofs_iomap_ops); +} - if (bio) - submit_bio(bio); +static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + loff_t align = iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; + else + blksize_mask = (1 << inode->i_blkbits) - 1; + + if (align & blksize_mask) + return -EINVAL; return 0; } -static void erofs_raw_access_readahead(struct readahead_control *rac) +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - erofs_off_t last_block; - unsigned int eblks; - struct bio *bio = NULL; - struct page *page; - - trace_erofs_readpages(rac->mapping->host, readahead_index(rac), - readahead_count(rac), true); - - while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - - bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block, - readahead_count(rac), &eblks, true); - - /* all the page errors are ignored when readahead */ - if (IS_ERR(bio)) { - pr_err("%s, readahead error at page %lu of nid %llu\n", - __func__, page->index, - EROFS_I(rac->mapping->host)->nid); - - bio = NULL; - } - - put_page(page); + /* no need taking (shared) inode lock since it's a ro filesystem */ + if (!iov_iter_count(to)) + return 0; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return dax_iomap_rw(iocb, to, &erofs_iomap_ops); +#endif + if (iocb->ki_flags & IOCB_DIRECT) { + int err = erofs_prepare_dio(iocb, to); + + 
if (!err) + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0); + if (err < 0) + return err; } + return filemap_read(iocb, to, 0); +} + +/* for uncompressed (aligned) files and raw access for other files */ +const struct address_space_operations erofs_raw_access_aops = { + .readpage = erofs_readpage, + .readahead = erofs_readahead, + .bmap = erofs_bmap, + .direct_IO = noop_direct_IO, +}; - if (bio) - submit_bio(bio); +#ifdef CONFIG_FS_DAX +static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops); } -static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { - struct inode *inode = mapping->host; - struct erofs_map_blocks map = { - .m_la = blknr_to_addr(block), - }; + return erofs_dax_huge_fault(vmf, PE_SIZE_PTE); +} - if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) { - erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE; +static const struct vm_operations_struct erofs_dax_vm_ops = { + .fault = erofs_dax_fault, + .huge_fault = erofs_dax_huge_fault, +}; - if (block >> LOG_SECTORS_PER_BLOCK >= blks) - return 0; - } +static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_readonly_mmap(file, vma); - if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW)) - return erofs_blknr(map.m_pa); + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + vma->vm_ops = &erofs_dax_vm_ops; + vma->vm_flags |= VM_HUGEPAGE; return 0; } - -/* for uncompressed (aligned) files and raw access for other files */ -const struct address_space_operations erofs_raw_access_aops = { - .readpage = erofs_raw_access_readpage, - .readahead = erofs_raw_access_readahead, - .bmap = erofs_bmap, +#else +#define erofs_file_mmap generic_file_readonly_mmap +#endif + +const struct file_operations erofs_file_fops = { + .llseek = generic_file_llseek, + .read_iter = erofs_file_read_iter, + .mmap = erofs_file_mmap, + .splice_read = generic_file_splice_read, }; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 0f8da74570b4..b0b23f41abc3 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -4,6 +4,7 @@ * * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #ifndef __EROFS_FS_H #define __EROFS_FS_H @@ -19,10 +20,12 @@ #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 +#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ - EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER) + EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -64,13 +67,16 @@ struct erofs_super_block { * inode, [xattrs], last_inline_data, ... | ... | no-holed data * 3 - inode compression D: * inode, [xattrs], map_header, extents ... | ... - * 4~7 - reserved + * 4 - inode chunk-based E: + * inode, [xattrs], chunk indexes ... | ... 
+ * 5~7 - reserved */ enum { EROFS_INODE_FLAT_PLAIN = 0, EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, EROFS_INODE_FLAT_INLINE = 2, EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_CHUNK_BASED = 4, EROFS_INODE_DATALAYOUT_MAX }; @@ -90,6 +96,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_I_ALL \ ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) +/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ +#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F +/* with chunk indexes or just a 4-byte blkaddr array */ +#define EROFS_CHUNK_FORMAT_INDEXES 0x0020 + +#define EROFS_CHUNK_FORMAT_ALL \ + (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) + +struct erofs_inode_chunk_info { + __le16 format; /* chunk blkbits, etc. */ + __le16 reserved; +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ @@ -107,6 +126,9 @@ struct erofs_inode_compact { /* for device files, used to indicate old/new device # */ __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; } i_u; __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; @@ -135,6 +157,9 @@ struct erofs_inode_extended { /* for device files, used to indicate old/new device # */ __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; } i_u; /* only used for 32-bit stat compatibility */ @@ -204,6 +229,19 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) e->e_name_len + le16_to_cpu(e->e_value_size)); } +/* represent a zeroed chunk (hole) */ +#define EROFS_NULL_ADDR -1 + +/* 4-byte block address array */ +#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) + +/* 8-byte inode chunk indexes */ +struct erofs_inode_chunk_index { + __le16 advise; /* always 0, don't care for now */ + __le16 device_id; /* back-end storage id, always 0 for now */ + __le32 blkaddr; /* start block address of this inode chunk */ +}; + /* maximum supported size of a physical compression cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) @@ -338,9 +376,14 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + /* keep in sync between 2 index structures for better extendibility */ + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != + sizeof(struct z_erofs_vle_decompressed_index)); BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index aa8a0d770ba3..31ac3a73b390 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" @@ -122,8 +123,11 @@ static struct page *erofs_read_inode(struct inode *inode, /* total blocks for compressed files */ if (erofs_inode_is_data_compressed(vi->datalayout)) nblks = le32_to_cpu(die->i_u.compressed_blocks); - + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + /* fill chunked inode summary info */ + vi->chunkformat = le16_to_cpu(die->i_u.c.format); kfree(copied); + copied = NULL; break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); @@ -160,6 +164,8 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_size = le32_to_cpu(dic->i_size); if (erofs_inode_is_data_compressed(vi->datalayout)) nblks = le32_to_cpu(dic->i_u.compressed_blocks); + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + vi->chunkformat = le16_to_cpu(dic->i_u.c.format); break; default: erofs_err(inode->i_sb, @@ -169,11 +175,26 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } + if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_ALL)) { + erofs_err(inode->i_sb, + "unsupported chunk format %x of nid %llu", + vi->chunkformat, vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + vi->chunkbits = LOG_BLOCK_SIZE + + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); + } inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; inode->i_atime.tv_sec = inode->i_ctime.tv_sec; inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + inode->i_flags &= ~S_DAX; + if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) && + vi->datalayout == EROFS_INODE_FLAT_PLAIN) + inode->i_flags |= S_DAX; if (!nblks) /* measure inode.i_blocks as generic filesystems */ inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; @@ -247,7 +268,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; - inode->i_fop = &generic_ro_fops; + if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_fop = &generic_ro_fops; + else + inode->i_fop = &erofs_file_fops; break; case S_IFDIR: inode->i_op = &erofs_dir_iops; @@ -358,6 +382,7 @@ const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, + .fiemap = erofs_fiemap, }; const struct inode_operations erofs_symlink_iops = { diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 543c2ff97d30..9524e155b38f 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #ifndef __EROFS_INTERNAL_H #define __EROFS_INTERNAL_H @@ -15,6 +16,7 @@ #include #include #include +#include #include "erofs_fs.h" /* redefine pr_fmt "erofs: " */ @@ -83,6 +85,7 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct dax_device *dax_dev; u32 blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -115,6 +118,8 @@ struct erofs_sb_info { /* Mount flags set via mount options or defaults */ #define EROFS_MOUNT_XATTR_USER 0x00000010 #define EROFS_MOUNT_POSIX_ACL 0x00000020 +#define EROFS_MOUNT_DAX_ALWAYS 0x00000040 +#define EROFS_MOUNT_DAX_NEVER 0x00000080 #define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option) @@ -257,6 +262,10 @@ struct erofs_inode { union { erofs_blk_t raw_blkaddr; + struct { + unsigned short chunkformat; + unsigned char chunkbits; + }; #ifdef CONFIG_EROFS_FS_ZIP struct { unsigned short z_advise; @@ -353,8 +362,15 @@ struct erofs_map_blocks { /* Flags used by erofs_map_blocks_flatmode() */ #define EROFS_GET_BLOCKS_RAW 0x0001 +/* + * Used to get the exact decompressed length, e.g. fiemap (consider lookback + * approach instead if possible since it's more metadata lightweight.) + */ +#define EROFS_GET_BLOCKS_FIEMAP 0x0002 /* zmap.c */ +extern const struct iomap_ops z_erofs_iomap_report_ops; + #ifdef CONFIG_EROFS_FS_ZIP int z_erofs_fill_inode(struct inode *inode); int z_erofs_map_blocks_iter(struct inode *inode, @@ -371,7 +387,10 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, #endif /* !CONFIG_EROFS_FS_ZIP */ /* data.c */ +extern const struct file_operations erofs_file_fops; struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); /* inode.c */ static inline unsigned long erofs_inode_hash(erofs_nid_t nid) @@ -441,8 +460,7 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct address_space *mapping, - struct page *page); +int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index a8271ce5e13f..8629e616028c 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -245,4 +245,5 @@ const struct inode_operations erofs_dir_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, + .fiemap = erofs_fiemap, }; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 8fc6c04b54f4..a8d49e8fc83a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "xattr.h" #define CREATE_TRACE_POINTS @@ -355,6 +356,8 @@ enum { Opt_user_xattr, Opt_acl, Opt_cache_strategy, + Opt_dax, + Opt_dax_enum, Opt_err }; @@ -365,14 +368,47 @@ static const struct constant_table erofs_param_cache_strategy[] = { {} }; +static const struct constant_table erofs_dax_param_enums[] = { + {"always", EROFS_MOUNT_DAX_ALWAYS}, + {"never", EROFS_MOUNT_DAX_NEVER}, + {} +}; + static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_flag_no("user_xattr", Opt_user_xattr), fsparam_flag_no("acl", Opt_acl), fsparam_enum("cache_strategy", 
Opt_cache_strategy, erofs_param_cache_strategy), + fsparam_flag("dax", Opt_dax), + fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), {} }; +static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) +{ +#ifdef CONFIG_FS_DAX + struct erofs_fs_context *ctx = fc->fs_private; + + switch (mode) { + case EROFS_MOUNT_DAX_ALWAYS: + warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + set_opt(ctx, DAX_ALWAYS); + clear_opt(ctx, DAX_NEVER); + return true; + case EROFS_MOUNT_DAX_NEVER: + set_opt(ctx, DAX_NEVER); + clear_opt(ctx, DAX_ALWAYS); + return true; + default: + DBG_BUGON(1); + return false; + } +#else + errorfc(fc, "dax options not supported"); + return false; +#endif +} + static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { @@ -412,6 +448,14 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "compression not supported, cache_strategy ignored"); #endif break; + case Opt_dax: + if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS)) + return -EINVAL; + break; + case Opt_dax_enum: + if (!erofs_fc_set_dax_mode(fc, result.uint_32)) + return -EINVAL; + break; default: return -ENOPARAM; } @@ -430,7 +474,7 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask) DBG_BUGON(mapping->a_ops != &managed_cache_aops); if (PagePrivate(page)) - ret = erofs_try_to_free_cached_page(mapping, page); + ret = erofs_try_to_free_cached_page(page); return ret; } @@ -496,10 +540,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; sb->s_fs_info = sbi; + sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); err = erofs_read_superblock(sb); if (err) return err; + if (test_opt(ctx, DAX_ALWAYS) && + !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) { + errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); + clear_opt(ctx, DAX_ALWAYS); + } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; @@ -609,6 +659,7 @@ static void erofs_kill_sb(struct super_block *sb) sbi = EROFS_SB(sb); if (!sbi) return; + fs_put_dax(sbi->dax_dev); kfree(sbi); sb->s_fs_info = NULL; } @@ -711,8 +762,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) static int erofs_show_options(struct seq_file *seq, struct dentry *root) { - struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); - struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx; + struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); + struct erofs_fs_context *ctx = &sbi->ctx; #ifdef CONFIG_EROFS_FS_XATTR if (test_opt(ctx, XATTR_USER)) @@ -734,6 +785,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND) seq_puts(seq, ",cache_strategy=readaround"); #endif + if (test_opt(ctx, DAX_ALWAYS)) + seq_puts(seq, ",dax=always"); + if (test_opt(ctx, DAX_NEVER)) + seq_puts(seq, ",dax=never"); return 0; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index cb4d0889eca9..11c7a1aaebad 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -309,7 +309,6 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); - struct address_space *const mapping = MNGD_MAPPING(sbi); int i; /* @@ -326,7 +325,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, if (!trylock_page(page)) return -EBUSY; - if (page->mapping != mapping) + if (!erofs_page_is_managed(sbi, page)) continue; /* barrier is implied in the following 'unlock_page' */ @@ -337,8 +336,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct address_space *mapping, - struct page *page) +int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); int ret = 0; /* 0 - busy */ diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index f68aea4baed7..9fb98d85a3ce 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -212,9 +212,34 @@ static unsigned int decode_compactedbits(unsigned int lobits, return lo; } +static int get_compacted_la_distance(unsigned int lclusterbits, + unsigned int encodebits, + unsigned int vcnt, u8 *in, int i) +{ + const unsigned int lomask = (1 << lclusterbits) - 1; + unsigned int lo, d1 = 0; + u8 type; + + DBG_BUGON(i >= vcnt); + + do { + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + + if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + return d1; + ++d1; + } while (++i < vcnt); + + /* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */ + if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT)) + d1 += lo - 1; + return d1; +} + static int unpack_compacted_index(struct z_erofs_maprecorder *m, unsigned int amortizedshift, - unsigned int eofs) + unsigned int eofs, bool lookahead) { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; @@ -243,6 +268,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; + + /* figure out lookahead_distance: delta[1] if needed */ + if (lookahead) + m->delta[1] = get_compacted_la_distance(lclusterbits, + encodebits, vcnt, in, i); if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { if (!big_pcluster) { 
DBG_BUGON(1); @@ -313,7 +343,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, } static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn) + unsigned long lcn, bool lookahead) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); @@ -364,11 +394,12 @@ out: err = z_erofs_reload_indexes(m, erofs_blknr(pos)); if (err) return err; - return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); + return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos), + lookahead); } static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned int lcn) + unsigned int lcn, bool lookahead) { const unsigned int datamode = EROFS_I(m->inode)->datalayout; @@ -376,7 +407,7 @@ static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, return legacy_load_cluster_from_disk(m, lcn); if (datamode == EROFS_INODE_FLAT_COMPRESSION) - return compacted_load_cluster_from_disk(m, lcn); + return compacted_load_cluster_from_disk(m, lcn, lookahead); return -EINVAL; } @@ -399,7 +430,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, /* load extent head logical cluster if needed */ lcn -= lookback_distance; - err = z_erofs_load_cluster_from_disk(m, lcn); + err = z_erofs_load_cluster_from_disk(m, lcn, false); if (err) return err; @@ -450,7 +481,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, if (m->compressedlcs) goto out; - err = z_erofs_load_cluster_from_disk(m, lcn); + err = z_erofs_load_cluster_from_disk(m, lcn, false); if (err) return err; @@ -498,6 +529,48 @@ err_bonus_cblkcnt: return -EFSCORRUPTED; } +static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) +{ + struct inode *inode = m->inode; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_map_blocks *map = m->map; + unsigned int lclusterbits = vi->z_logical_clusterbits; + u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; + int err; + + do { + /* handle the last EOF pcluster (no next HEAD lcluster) */ + if ((lcn << lclusterbits) >= inode->i_size) { + map->m_llen = inode->i_size - map->m_la; + return 0; + } + + err = z_erofs_load_cluster_from_disk(m, lcn, true); + if (err) + return err; + + if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + DBG_BUGON(!m->delta[1] && + m->clusterofs != 1 << lclusterbits); + } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) { + /* go on until the next HEAD lcluster */ + if (lcn != headlcn) + break; + m->delta[1] = 1; + } else { + erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + lcn += m->delta[1]; + } while (m->delta[1]); + + map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la; + return 0; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) @@ -531,7 +604,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, initial_lcn); + err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false); if (err) goto unmap_out; @@ -581,6 +654,12 @@ int z_erofs_map_blocks_iter(struct inode *inode, err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto out; + + if (flags & EROFS_GET_BLOCKS_FIEMAP) { + err = z_erofs_get_extent_decompressedlen(&m); + if (!err) + map->m_flags |= EROFS_MAP_FULL_MAPPED; + } unmap_out: if 
(m.kaddr) kunmap_atomic(m.kaddr); @@ -596,3 +675,41 @@ out: DBG_BUGON(err < 0 && err != -ENOMEM); return err; } + +static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + struct erofs_map_blocks map = { .m_la = offset }; + + ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP); + if (map.mpage) + put_page(map.mpage); + if (ret < 0) + return ret; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = map.m_la; + iomap->length = map.m_llen; + if (map.m_flags & EROFS_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->addr = map.m_pa; + } else { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + /* + * No strict rule how to describe extents for post EOF, yet + * we need do like below. Otherwise, iomap itself will get + * into an endless loop on post EOF. + */ + if (iomap->offset >= inode->i_size) + iomap->length = length + map.m_la - offset; + } + iomap->flags = 0; + return 0; +} + +const struct iomap_ops z_erofs_iomap_report_ops = { + .iomap_begin = z_erofs_iomap_begin_report, +};
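
A note on the chunk-based lookup arithmetic: erofs_map_blocks() in the data.c hunk above locates the per-chunk entry by aligning the end of the inode (iloc + inode_isize + xattr_isize) to the entry size and then indexing by chunk number. Below is a minimal userspace sketch of that calculation only, assuming 4KiB blocks; the iloc, inode_isize, xattr_isize and chunkformat values are made-up stand-ins for illustration, not values read from a real EROFS image.

/*
 * Illustrative sketch of the chunk-based entry position computed by
 * erofs_map_blocks(); all on-disk values below are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define LOG_BLOCK_SIZE			12	/* 4KiB blocks */
#define EROFS_CHUNK_FORMAT_BLKBITS_MASK	0x001F
#define EROFS_CHUNK_FORMAT_INDEXES	0x0020
#define EROFS_BLOCK_MAP_ENTRY_SIZE	4	/* __le32 block address */
#define EROFS_CHUNK_INDEX_SIZE		8	/* struct erofs_inode_chunk_index */

/* round x up to a multiple of a (a power of two), like the kernel's ALIGN() */
static uint64_t align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	/* hypothetical per-inode values normally read from the on-disk inode */
	uint64_t iloc = 4096;		/* byte offset of the inode in the meta area */
	unsigned int inode_isize = 64;	/* extended on-disk inode */
	unsigned int xattr_isize = 0;
	uint16_t chunkformat = 2 | EROFS_CHUNK_FORMAT_INDEXES; /* 16KiB chunks, 8-byte indexes */
	uint64_t m_la = 40960;		/* logical file offset being mapped */

	unsigned int chunkbits = LOG_BLOCK_SIZE +
				 (chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
	unsigned int unit = (chunkformat & EROFS_CHUNK_FORMAT_INDEXES) ?
			    EROFS_CHUNK_INDEX_SIZE : EROFS_BLOCK_MAP_ENTRY_SIZE;
	uint64_t chunknr = m_la >> chunkbits;

	/* byte position of this chunk's entry, as in erofs_map_blocks() */
	uint64_t pos = align_up(iloc + inode_isize + xattr_isize, unit) +
		       (uint64_t)unit * chunknr;

	printf("chunk size: %u bytes\n", 1U << chunkbits);
	printf("chunk #%llu, entry at byte %llu (block %llu, offset %llu)\n",
	       (unsigned long long)chunknr, (unsigned long long)pos,
	       (unsigned long long)(pos >> LOG_BLOCK_SIZE),
	       (unsigned long long)(pos & ((1 << LOG_BLOCK_SIZE) - 1)));
	return 0;
}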
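
The pull message mentions that fiemap support lets end users see how file data is distributed. A minimal consumer would simply issue the generic FS_IOC_FIEMAP ioctl, which is the standard Linux interface served by erofs_fiemap()/iomap_fiemap() above; nothing here is erofs-specific, and error handling is trimmed for brevity.

/* Minimal FS_IOC_FIEMAP example: print the extents of a file. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* room for up to 64 extents; a real tool would loop, advancing fm_start */
	unsigned int max = 64;
	struct fiemap *fm = calloc(1, sizeof(*fm) + max * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_extent_count = max;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}

	for (unsigned int i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu physical %llu length %llu flags 0x%x\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length, fe->fe_flags);
	}
	free(fm);
	close(fd);
	return 0;
}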