iomap: Add per-block dirty state tracking to improve performance
author	Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Mon, 10 Jul 2023 21:12:43 +0000 (14:12 -0700)
committer	Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Tue, 25 Jul 2023 05:25:56 +0000 (10:55 +0530)
When the filesystem block size is smaller than the folio size (either
with mapping_large_folio_support() or with blocksize < pagesize) and
the folio is uptodate in the pagecache, even a one-byte write can
cause the entire folio to be written to disk during writeback. This
happens because we currently have no mechanism to track per-block
dirty state within struct iomap_folio_state; we only track uptodate
state.
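
To put rough numbers on that (illustrative arithmetic, assuming a 64K
folio of 4k blocks): the folio holds 64K / 4K = 16 blocks, so a dirty
range of a single block can force all 16 blocks (the full 64K) to be
written back, i.e. up to ~16x write amplification in the worst case.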

This patch implements support for tracking per-block dirty state in
the iomap_folio_state->state bitmap. This should improve filesystem
write performance and reduce write amplification.
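
The bitmap layout can be modelled in a few lines of userspace C. This
is a minimal sketch of the idea only -- BLKS_PER_FOLIO, the helpers
and main() below are illustrative stand-ins, not the kernel
implementation:

#include <stdbool.h>
#include <stdio.h>

#define BLKS_PER_FOLIO 16	/* e.g. a 64K folio of 4k blocks */
#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* One bitmap, two regions: uptodate bits first, then dirty bits. */
static unsigned long state[(2 * BLKS_PER_FOLIO + BITS_PER_LONG - 1) /
			   BITS_PER_LONG];

static void set_bit_(int nr)
{
	state[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static bool test_bit_(int nr)
{
	return state[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG));
}

/* Uptodate bits occupy [0, BLKS_PER_FOLIO). */
static void set_uptodate(int blk)   { set_bit_(blk); }
/* Dirty bits occupy [BLKS_PER_FOLIO, 2 * BLKS_PER_FOLIO). */
static void set_dirty(int blk)      { set_bit_(blk + BLKS_PER_FOLIO); }
static bool blk_is_dirty(int blk)   { return test_bit_(blk + BLKS_PER_FOLIO); }

int main(void)
{
	int i;

	/* The folio is fully uptodate in the pagecache... */
	for (i = 0; i < BLKS_PER_FOLIO; i++)
		set_uptodate(i);

	/* ...then a small write lands in block 3. */
	set_dirty(3);

	/* Writeback only needs to touch the one dirty block. */
	for (i = 0; i < BLKS_PER_FOLIO; i++)
		if (blk_is_dirty(i))
			printf("writeback block %d\n", i);
	return 0;
}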

Performance testing of the below fio workload shows a ~16x performance
improvement on nvme with XFS (4k block size) on Power (64K page size):
the fio-reported write bandwidth improved from ~28 MBps to ~452 MBps.

1. <test_randwrite.fio>
[global]
ioengine=psync
rw=randwrite
overwrite=1
pre_read=1
direct=0
bs=4k
size=1G
dir=./
numjobs=8
fdatasync=1
runtime=60
iodepth=64
group_reporting=1

[fio-run]
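
(To reproduce, run the job file with fio from the filesystem under
test, e.g. "fio test_randwrite.fio"; the job writes to dir=./ so the
current directory selects the target filesystem.)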

2. Our internal performance team also reported that this patch
   improves their database workload performance by around 83% (with
   XFS on Power).

Reported-by: Aravinda Herle <araherle@in.ibm.com>
Reported-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
fs/gfs2/aops.c
fs/iomap/buffered-io.c
fs/xfs/xfs_aops.c
fs/zonefs/file.c
include/linux/iomap.h

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ae49256..9c4b26a 100644
@@ -747,7 +747,7 @@ static const struct address_space_operations gfs2_aops = {
        .writepages = gfs2_writepages,
        .read_folio = gfs2_read_folio,
        .readahead = gfs2_readahead,
-       .dirty_folio = filemap_dirty_folio,
+       .dirty_folio = iomap_dirty_folio,
        .release_folio = iomap_release_folio,
        .invalidate_folio = iomap_invalidate_folio,
        .bmap = gfs2_bmap,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 1ec4ef4..283fb96 100644
 
 typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
 /*
- * Structure allocated for each folio to track per-block uptodate state
+ * Structure allocated for each folio to track per-block uptodate, dirty state
  * and I/O completions.
  */
 struct iomap_folio_state {
        atomic_t                read_bytes_pending;
        atomic_t                write_bytes_pending;
        spinlock_t              state_lock;
+
+       /*
+        * Each block has two bits in this bitmap:
+        * Bits [0..blocks_per_folio) hold the uptodate status.
+        * Bits [blocks_per_folio..2*blocks_per_folio) hold the dirty status.
+        */
        unsigned long           state[];
 };
 
@@ -78,6 +84,61 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
                folio_mark_uptodate(folio);
 }
 
+static inline bool ifs_block_is_dirty(struct folio *folio,
+               struct iomap_folio_state *ifs, int block)
+{
+       struct inode *inode = folio->mapping->host;
+       unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+       return test_bit(block + blks_per_folio, ifs->state);
+}
+
+static void ifs_clear_range_dirty(struct folio *folio,
+               struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+       struct inode *inode = folio->mapping->host;
+       unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+       unsigned int first_blk = (off >> inode->i_blkbits);
+       unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+       unsigned int nr_blks = last_blk - first_blk + 1;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ifs->state_lock, flags);
+       bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
+       spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+       struct iomap_folio_state *ifs = folio->private;
+
+       if (ifs)
+               ifs_clear_range_dirty(folio, ifs, off, len);
+}
+
+static void ifs_set_range_dirty(struct folio *folio,
+               struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+       struct inode *inode = folio->mapping->host;
+       unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+       unsigned int first_blk = (off >> inode->i_blkbits);
+       unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+       unsigned int nr_blks = last_blk - first_blk + 1;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ifs->state_lock, flags);
+       bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
+       spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+       struct iomap_folio_state *ifs = folio->private;
+
+       if (ifs)
+               ifs_set_range_dirty(folio, ifs, off, len);
+}
+
 static struct iomap_folio_state *ifs_alloc(struct inode *inode,
                struct folio *folio, unsigned int flags)
 {
@@ -93,14 +154,24 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode,
        else
                gfp = GFP_NOFS | __GFP_NOFAIL;
 
-       ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(nr_blocks)),
-                     gfp);
-       if (ifs) {
-               spin_lock_init(&ifs->state_lock);
-               if (folio_test_uptodate(folio))
-                       bitmap_fill(ifs->state, nr_blocks);
-               folio_attach_private(folio, ifs);
-       }
+       /*
+        * ifs->state tracks two sets of state flags when the
+        * filesystem block size is smaller than the folio size.
+        * The first set tracks per-block uptodate state and the
+        * second set tracks per-block dirty state.
+        */
+       ifs = kzalloc(struct_size(ifs, state,
+                     BITS_TO_LONGS(2 * nr_blocks)), gfp);
+       if (!ifs)
+               return ifs;
+
+       spin_lock_init(&ifs->state_lock);
+       if (folio_test_uptodate(folio))
+               bitmap_set(ifs->state, 0, nr_blocks);
+       if (folio_test_dirty(folio))
+               bitmap_set(ifs->state, nr_blocks, nr_blocks);
+       folio_attach_private(folio, ifs);
+
        return ifs;
 }
 
@@ -519,6 +590,17 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
 
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+       struct inode *inode = mapping->host;
+       size_t len = folio_size(folio);
+
+       ifs_alloc(inode, folio, 0);
+       iomap_set_range_dirty(folio, 0, len);
+       return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -723,6 +805,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
        if (unlikely(copied < len && !folio_test_uptodate(folio)))
                return 0;
        iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
+       iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
        filemap_dirty_folio(inode->i_mapping, folio);
        return copied;
 }
@@ -892,6 +975,43 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+               struct folio *folio, loff_t start_byte, loff_t end_byte,
+               iomap_punch_t punch)
+{
+       unsigned int first_blk, last_blk, i;
+       loff_t last_byte;
+       u8 blkbits = inode->i_blkbits;
+       struct iomap_folio_state *ifs;
+       int ret = 0;
+
+       /*
+        * When we have per-block dirty tracking, there can be
+        * blocks within a folio which are marked uptodate
+        * but not dirty. In that case it is necessary to punch
+        * out such blocks to avoid leaking any delalloc blocks.
+        */
+       ifs = folio->private;
+       if (!ifs)
+               return ret;
+
+       last_byte = min_t(loff_t, end_byte - 1,
+                       folio_pos(folio) + folio_size(folio) - 1);
+       first_blk = offset_in_folio(folio, start_byte) >> blkbits;
+       last_blk = offset_in_folio(folio, last_byte) >> blkbits;
+       for (i = first_blk; i <= last_blk; i++) {
+               if (!ifs_block_is_dirty(folio, ifs, i)) {
+                       ret = punch(inode, folio_pos(folio) + (i << blkbits),
+                                   1 << blkbits);
+                       if (ret)
+                               return ret;
+               }
+       }
+
+       return ret;
+}
+
+
 static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
                loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
                iomap_punch_t punch)
@@ -909,6 +1029,12 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
                        return ret;
        }
 
+       /* Punch non-dirty blocks within folio */
+       ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
+                       end_byte, punch);
+       if (ret)
+               return ret;
+
        /*
         * Make sure the next punch start is correctly bound to
         * the end of this data range, not the end of the folio.
@@ -1639,7 +1765,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
                struct writeback_control *wbc, struct inode *inode,
                struct folio *folio, u64 end_pos)
 {
-       struct iomap_folio_state *ifs = ifs_alloc(inode, folio, 0);
+       struct iomap_folio_state *ifs = folio->private;
        struct iomap_ioend *ioend, *next;
        unsigned len = i_blocksize(inode);
        unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1647,6 +1773,13 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
        int error = 0, count = 0, i;
        LIST_HEAD(submit_list);
 
+       WARN_ON_ONCE(end_pos <= pos);
+
+       if (!ifs && nblocks > 1) {
+               ifs = ifs_alloc(inode, folio, 0);
+               iomap_set_range_dirty(folio, 0, end_pos - pos);
+       }
+
        WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
 
        /*
@@ -1655,7 +1788,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
         * invalid, grab a new one.
         */
        for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-               if (ifs && !ifs_block_is_uptodate(ifs, i))
+               if (ifs && !ifs_block_is_dirty(folio, ifs, i))
                        continue;
 
                error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1699,6 +1832,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
                }
        }
 
+       /*
+        * We can have dirty bits set past the end of file in the
+        * page_mkwrite path while mapping the last partial folio.
+        * Hence it's better to clear all the dirty bits in the folio here.
+        */
+       iomap_clear_range_dirty(folio, 0, folio_size(folio));
        folio_start_writeback(folio);
        folio_unlock(folio);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 451942f..2fca4b4 100644
@@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
        .read_folio             = xfs_vm_read_folio,
        .readahead              = xfs_vm_readahead,
        .writepages             = xfs_vm_writepages,
-       .dirty_folio            = filemap_dirty_folio,
+       .dirty_folio            = iomap_dirty_folio,
        .release_folio          = iomap_release_folio,
        .invalidate_folio       = iomap_invalidate_folio,
        .bmap                   = xfs_vm_bmap,
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 92c9aaa..bf9bf24 100644
@@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
        .read_folio             = zonefs_read_folio,
        .readahead              = zonefs_readahead,
        .writepages             = zonefs_writepages,
-       .dirty_folio            = filemap_dirty_folio,
+       .dirty_folio            = iomap_dirty_folio,
        .release_folio          = iomap_release_folio,
        .invalidate_folio       = iomap_invalidate_folio,
        .migrate_folio          = filemap_migrate_folio,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 80facb9..fdc6e64 100644
@@ -264,6 +264,7 @@ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
                const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,